/*
* Copyright 1995, 2003 Perforce Software. All rights reserved.
*
* This file is part of Perforce - the FAST SCM System.
*/
#include "validate.h"
/*
* ValidateCharSet
*/
// Destructor defined out of line; there is nothing to release here.
CharSetValid::~CharSetValid() {}
// A new validator starts on a character boundary: no continuation
// bytes are owed and no second-byte range check is pending.
CharSetUTF8Valid::CharSetUTF8Valid()
    : followcnt( 0 ),
      magic( 0 )
{
}
// Discard any partially consumed multi-byte character so that the next
// byte handed to Valid() is treated as the first byte of a character.
void
CharSetUTF8Valid::Reset()
{
    magic = 0;      // no pending range check on a second byte
    followcnt = 0;  // no continuation bytes outstanding
}
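/*
* What do these bits mean?
*
* 0x40 First byte of a multi-byte sequence, which includes trivial
* multi-byte sequences of length one (i.e. only this byte)
* 0x80 Continuation byte, i.e. a second or later byte of a multi-byte
* sequence
* 0x07 On a first byte: count of following bytes
* 0x38 On a first byte: selects the extra range check applied to the
* second byte (0 means no extra check)
* 0x08 lead 0xed, second byte must be 0x80 - 0x9f
* (0xa0 - 0xbf would encode a UTF-16 surrogate)
* 0x10 lead 0xf4, second byte must be 0x80 - 0x8f
* 0x20 lead 0xf0, second byte must be 0x90 - 0xbf
* 0x30 lead 0xe0, second byte must be 0xa0 - 0xbf
* 0x30 On a continuation byte: which sub-range the byte falls in
* 0x30 for 0x80 - 0x8f, 0x10 for 0x90 - 0x9f, 0 for 0xa0 - 0xbf
*/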
// Per-byte classification table, indexed by the byte value
// (Valid() looks up validmap[0xff & *buf]).  One row per 16 byte values.
unsigned char
CharSetUTF8Valid::validmap[256] = {
    // 0x00 - 0x7f: single-byte characters (lead byte, nothing follows)
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, // 0x00
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, // 0x10
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, // 0x20
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, // 0x30
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, // 0x40
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, // 0x50
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, // 0x60
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, // 0x70

    // 0x80 - 0x8f: continuation bytes, lowest sub-range
    0xb0, 0xb0, 0xb0, 0xb0, 0xb0, 0xb0, 0xb0, 0xb0, 0xb0, 0xb0, 0xb0, 0xb0, 0xb0, 0xb0, 0xb0, 0xb0, // 0x80

    // 0x90 - 0x9f: continuation bytes, middle sub-range
    0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, // 0x90

    // 0xa0 - 0xbf: continuation bytes, upper sub-range
    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, // 0xa0
    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, // 0xb0

    // 0xc0, 0xc1 illegal; 0xc2 - 0xdf lead a two-byte sequence
    0x00, 0x00, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, // 0xc0
    0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, // 0xd0

    // 0xe0 - 0xef lead a three-byte sequence ( 0xe0 and 0xed are magical )
    0x72, 0x42, 0x42, 0x42, 0x42, 0x42, 0x42, 0x42, 0x42, 0x42, 0x42, 0x42, 0x42, 0x4a, 0x42, 0x42, // 0xe0

    // 0xf0 - 0xf4 lead a four-byte sequence ( 0xf0 and 0xf4 are magical );
    // 0xf5 - 0xff illegal
    0x63, 0x43, 0x43, 0x43, 0x53, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00  // 0xf0
};
/*
* return values are...
*
* 0 not valid
* 1 valid
* 3 valid so far (following bytes needed to complete a multi-byte char)
*/
/*
 * CharSetUTF8Valid::Valid
 *
 * Feed the next len bytes of a byte stream to the validator.  The
 * members followcnt (continuation bytes still owed to the current
 * character) and magic (range check pending on the second byte) are
 * carried between calls, so a character may straddle two buffers.
 *
 * buf   bytes to check
 * len   number of bytes at buf; no byte is read when len <= 0
 * retp  optional out-pointer.  When 1 is returned it is set to the
 *       end of buf.  Otherwise it is left at the first byte of the
 *       last character that started inside this buf; it is not
 *       written at all if no character started during this call.
 *
 * Returns 0 (not valid), 1 (valid and complete) or 3 (valid so far,
 * the current character still needs more bytes).
 *
 * Whenever 0 is returned the carried state is cleared, so a later
 * call starts on a character boundary instead of treating its input
 * as the tail of the sequence that was just rejected.
 */
int
CharSetUTF8Valid::Valid( const char *buf, int len, const char **retp )
{
    for( ; len > 0; --len, ++buf )
    {
        const int chflags = validmap[0xff & *buf];

        if( !followcnt )
        {
            // Start of a new character: remember where it begins.
            if( retp )
                *retp = buf;

            // Continuation bytes and illegal values cannot lead.
            if( ( chflags & 0x40 ) != 0x40 )
                return 0;

            followcnt = chflags & 0x7;
            magic = chflags & 0x38;
            continue;
        }

        // Inside a multi-byte character: only a continuation byte fits.
        bool ok = ( chflags & 0x80 ) == 0x80;

        if( ok )
        {
            --followcnt;

            // Some lead bytes restrict the range of their second byte.
            switch( magic )
            {
            case 0x10: // lead is 0xf4: only 0x80 - 0x8f may follow
                ok = ( chflags & 0x20 ) == 0x20;
                break;
            case 0x20: // lead is 0xf0: 0x80 - 0x8f may not follow
                ok = ( chflags & 0x20 ) != 0x20;
                break;
            case 0x30: // lead is 0xe0: 0x80 - 0x9f may not follow
                ok = ( chflags & 0x10 ) != 0x10;
                break;
            case 0x08: // lead is 0xed: 0xa0 - 0xbf (UTF-16 surrogates) may not follow
                ok = ( chflags & 0x30 ) != 0x00;
                break;
            }

            // The restriction applies to the second byte only.
            magic = 0;
        }

        if( !ok )
        {
            // Drop the half-read character so the stale count and
            // range check cannot leak into the next call.
            followcnt = 0;
            magic = 0;
            return 0;
        }
    }

    // Ran out of input in the middle of a character.
    if( followcnt )
        return 3;

    if( retp )
        *retp = buf;

    return 1;
}