/*
* Copyright 2001 Perforce Software. All rights reserved.
*
*/
/*
* basecvt.cc - Character set conversion code base class
*
* This is separate from the other converters so that, unless you are
* really going to do character set conversions, you will not link
* those large tables into the server. The NT server needs
* UTF-8 to UTF-16 conversions.
*/
#include <stdhdrs.h>
#include <strbuf.h>
#include <debug.h>
#include "i18napi.h"
#include "charcvt.h"
#include "charman.h"
/*
 * Destructor - releases the scratch buffer that FastCvt() and
 * FastCvtQues() allocate and hand out.
 */
CharSetCvt::~CharSetCvt()
{
    delete [] this->fastbuf;
}
// Indexed by the number of bytes that follow a UTF-8 lead byte.
// CharSetCvtUTF816::Cvt accumulates the raw bytes of a sequence
// (shifting by 6 for each one) and then subtracts the matching entry,
// which removes the lead/continuation marker bits and leaves the code
// point, e.g. entry 1 is (0xc0 << 6) + 0x80.
unsigned long CharSetCvt::offsetsFromUTF8[6] =
{0x00000000UL, 0x00003080UL, 0x000E2080UL,
0x03C82080UL, 0xFA082080UL, 0x82082080UL};
// Indexed by the number of bytes that follow a UTF-8 lead byte.
// When STRICT_UTF8 is defined, CharSetCvtUTF816::Cvt rejects (NOMAPPING)
// a decoded value below the matching entry as an over-long sequence.
unsigned long CharSetCvt::minimumFromUTF8[6] =
{0x00000000UL, 0x00000080UL, 0x00000800UL,
0x00010000UL, 0x00200000UL, 0x04000000UL};
// Indexed by the first byte of a UTF-8 sequence; gives the number of
// bytes that follow it. Each row below covers 32 byte values:
// 0x00-0xbf -> 0, 0xc0-0xdf -> 1, 0xe0-0xef -> 2, 0xf0-0xf7 -> 3,
// 0xf8-0xfb -> 4, 0xfc-0xff -> 5.
// CharSetCvtUTF816::Cvt only consults the table for bytes with the high
// bit set and only accepts the values 1, 2 and 3; anything else is
// reported as NOMAPPING.
char CharSetCvt::bytesFromUTF8[256] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5};
int
CharSetCvt::LastErr()
{
return lasterr;
}
void
CharSetCvt::ResetErr()
{
lasterr = NONE;
}
/*
 * Clone - allocate a fresh base-class converter with new.
 * The returned object is not retained here, so the caller must
 * delete it.
 */
CharSetCvt *
CharSetCvt::Clone()
{
    CharSetCvt *duplicate = new CharSetCvt;
    return duplicate;
}
/*
 * ReverseCvt - return a converter for the opposite direction.
 * The base class Cvt() copies bytes unchanged, so the reverse of it
 * is simply whatever Clone() produces.
 */
CharSetCvt *
CharSetCvt::ReverseCvt()
{
    CharSetCvt *opposite = Clone();
    return opposite;
}
/*
 * Cvt - base class "conversion": a plain byte copy.
 *
 * Copies as many bytes as both the source span [*sourcestart, sourceend)
 * and the target span [*targetstart, targetend) allow, then advances
 * *sourcestart and *targetstart past the copied bytes.
 *
 * An empty or inverted span on either side copies nothing; the
 * pointers are left untouched and memcpy is never called with a zero,
 * negative or truncated length.
 *
 * Always returns 0 and never sets lasterr.
 */
int
CharSetCvt::Cvt(const char **sourcestart, const char *sourceend,
        char **targetstart, char *targetend)
{
    if (*sourcestart >= sourceend || *targetstart >= targetend)
        return 0;

    // Both differences are known to be positive here, so size_t holds
    // them without the truncation an int would risk on large buffers.
    size_t count = static_cast<size_t>(sourceend - *sourcestart);
    const size_t room = static_cast<size_t>(targetend - *targetstart);
    if (room < count)
        count = room;

    memcpy(*targetstart, *sourcestart, count);
    *sourcestart += count;
    *targetstart += count;
    return 0;
}
char *
CharSetCvt::CvtBuffer(const char *s, int slen, int *retlen)
{
const char *ss, *se, *lastsserr = NULL;
char *retbuf;
char *ts, *te;
int rlen;
rlen = slen;
if( rlen & 1 ) // force result length to be even
++rlen;
se = s + slen;
for (;;)
{
ResetErr();
ts = retbuf = new char[rlen+2];
te = ts + rlen;
ss = s;
Cvt(&ss, se, &ts, te);
if (ss == se)
break;
delete [] retbuf;
if (LastErr() == NOMAPPING)
return NULL;
if (LastErr() == PARTIALCHAR)
{
if (lastsserr == ss)
return NULL;
lastsserr = ss;
}
rlen <<= 1;
}
if (retlen)
*retlen = ts-retbuf;
*ts++ = '\0';
*ts = '\0';
return retbuf;
}
/*
 * FastCvt - convert slen bytes at s into the converter's own reusable
 * buffer (fastbuf).
 *
 * The buffer is grown (doubled) and the conversion restarted from the
 * beginning until Cvt() consumes the whole source.
 *
 * Returns fastbuf, terminated by two zero bytes, and stores the
 * converted length (excluding the terminator) in *retlen when retlen
 * is non-null.  The buffer belongs to this object: it is reused or
 * reallocated by later FastCvt()/FastCvtQues() calls and freed by the
 * destructor, so the caller must not free it.
 *
 * Returns NULL if Cvt() reports NOMAPPING, or reports a PARTIALCHAR
 * that is attributed to the source rather than to lack of output room.
 *
 * Whenever the buffer is replaced, fastbuf/fastsize are cleared before
 * the new allocation so that a failed allocation cannot leave a
 * dangling pointer for the destructor to delete a second time.
 */
const char *
CharSetCvt::FastCvt(const char *s, int slen, int *retlen)
{
    const char *lastsserr = NULL;

    if (slen + 2 > fastsize)
    {
        // make fast buffer big and even in length
        const int wanted = 2 * slen + 2;

        delete [] fastbuf;
        fastbuf = NULL;
        fastsize = 0;
        fastbuf = new char[wanted];
        fastsize = wanted;
    }

    const char *const se = s + slen;

    for (;;)
    {
        ResetErr();

        const char *ss = s;
        char *ts = fastbuf;
        // keep two bytes in reserve for the terminator
        char *const te = fastbuf + (fastsize - 2);

        Cvt(&ss, se, &ts, te);

        if (ss == se)
        {
            if (retlen)
                *retlen = ts - fastbuf;
            *ts++ = '\0';
            *ts = '\0';
            return fastbuf;
        }

        if (LastErr() == NOMAPPING)
            return NULL;

        if (LastErr() == PARTIALCHAR)
        {
            // More than 10 bytes of unused target space means the
            // partial character cannot be blamed on the target being
            // full, so the source itself must end in a partial
            // character.  The same holds if we stall at the same
            // source position twice.
            if (te - ts > 10 || lastsserr == ss)
                return NULL;
            lastsserr = ss;
        }

        // Out of room: double the buffer and start over.
        const int doubled = fastsize << 1;

        delete [] fastbuf;
        fastbuf = NULL;
        fastsize = 0;
        fastbuf = new char[doubled];
        fastsize = doubled;
    }
}
/*
 * FastCvtQues - like FastCvt(), but a source character that has no
 * mapping is replaced by a single '?' byte in the output instead of
 * failing the conversion.
 *
 * The unmappable character is skipped with the stepper returned by
 * FromCharStep(), and conversion resumes after it.
 *
 * Returns fastbuf, terminated by two zero bytes, and stores the
 * converted length (excluding the terminator) in *retlen when retlen
 * is non-null.  The buffer belongs to this object: it is reused or
 * reallocated by later FastCvt()/FastCvtQues() calls and freed by the
 * destructor, so the caller must not free it.
 *
 * Returns NULL when Cvt() reports a PARTIALCHAR that is attributed to
 * the source rather than to lack of output room.
 *
 * NOTE(review): lasterr is only cleared at the start of each full
 * attempt, not after a '?' substitution, so LastErr() may still read
 * NOMAPPING when this returns - confirm whether callers rely on that.
 *
 * Whenever the buffer is replaced, fastbuf/fastsize are cleared before
 * the new allocation so that a failed allocation cannot leave a
 * dangling pointer for the destructor to delete a second time.
 */
const char *
CharSetCvt::FastCvtQues(const char *s, int slen, int *retlen)
{
    const char *ss;
    const char *lastsserr = NULL;
    char *ts;

    if ( slen + 2 > fastsize )
    {
        // make fast buffer big and even in length
        const int wanted = 2 * slen + 2;

        delete [] fastbuf;
        fastbuf = NULL;
        fastsize = 0;
        fastbuf = new char[ wanted ];
        fastsize = wanted;
    }

    const char *const se = s + slen;

    for (;;)
    {
        ResetErr();

        ss = s;
        ts = fastbuf;
        // keep two bytes in reserve for the terminator
        char *const te = fastbuf + ( fastsize - 2 );

        bool finished = false;

        // Convert, substituting '?' for each unmappable character,
        // until the source is used up or something else stops us.
        for (;;)
        {
            Cvt( &ss, se, &ts, te );

            if ( ss >= se )
            {
                finished = true;
                break;
            }

            // Only substitute when there is room for the '?'.
            if ( ts == te || LastErr() != NOMAPPING )
                break;

            *ts++ = '?';

            CharStep *stepper = FromCharStep( const_cast<char *>( ss ) );
            ss = stepper->Next();
            delete stepper;

            if ( ss >= se )
            {
                finished = true;
                break;
            }
        }

        if ( finished )
            break;

        if ( LastErr() == PARTIALCHAR )
        {
            // More than 10 bytes of unused target space means the
            // partial character cannot be blamed on the target being
            // full, so the source itself must end in a partial
            // character.  The same holds if we stall at the same
            // source position twice.
            if ( te - ts > 10 || lastsserr == ss )
                return NULL;
            lastsserr = ss;
        }

        // Out of room: double the buffer and start over.
        const int doubled = fastsize << 1;

        delete [] fastbuf;
        fastbuf = NULL;
        fastsize = 0;
        fastbuf = new char[ doubled ];
        fastsize = doubled;
    }

    if ( retlen )
        *retlen = ts - fastbuf;
    *ts++ = '\0';
    *ts = '\0';
    return fastbuf;
}
// IgnoreBOM - the base class does no byte order mark handling, so
// this is deliberately a no-op.
void
CharSetCvt::IgnoreBOM()
{
}
/*
 * printmap - debug print of a three-stage mapping f -> t -> b,
 * each value as four hex digits.  A final value of 0xfffe is printed
 * as "unknown" instead.
 */
void
CharSetCvt::printmap(unsigned short f, unsigned short t, unsigned short b)
{
    if( b != 0xfffe )
    {
        p4debug.printf("%04x -> %04x -> %04x\n", f, t, b);
        return;
    }

    p4debug.printf("%04x -> %04x -> unknown\n", f, t);
}
// printmap - debug print of a single mapping f -> t, each value as
// four hex digits.
void
CharSetCvt::printmap(unsigned short f, unsigned short t)
{
p4debug.printf("%04x -> %04x\n", f, t);
}
/*
 * FromCharStep - allocate a basic CharStep positioned at p, for
 * stepping over characters in this converter's source character set.
 * The stepper comes from new; the caller must delete it.
 */
CharStep *
CharSetCvt::FromCharStep(char *p)
{
    CharStep *stepper = new CharStep( p );
    return stepper;
}
/*
 * Constructor.
 *
 * i - byte order flag stored in invert and fileinvert.  The Cvt()
 *     routines treat non-zero as "low byte first".  Passing -1 picks
 *     the byte order of the machine running the code.
 * b - stored in bom; when non-zero the UTF-8 to UTF-16 conversion
 *     writes a byte order mark.
 */
CharSetCvtUTF16::CharSetCvtUTF16(int i, int b)
    : bom( b )
{
    if( i == -1 )
    {
        // detect byte ordering: the first byte in memory of the
        // 16-bit value 1 is 1 only when the low byte is stored first
        unsigned short probe = 1;
        i = *(unsigned char *)&probe;
    }

    invert = i;
    fileinvert = invert;
}
/*
 * IgnoreBOM - put the converter back into its initial byte order
 * mark state: the file byte order reverts to the configured one and
 * BOM checking is switched on again via the CharSetCvtFromUTF8
 * implementation.
 */
void
CharSetCvtUTF16::IgnoreBOM()
{
    fileinvert = invert;
    CharSetCvtFromUTF8::IgnoreBOM();
}
/*
 * Clone - allocate a new UTF-8 to UTF-16 converter configured with
 * this object's invert and bom settings.  The caller must delete it.
 */
CharSetCvt *
CharSetCvtUTF816::Clone()
{
    CharSetCvt *duplicate = new CharSetCvtUTF816( invert, bom );
    return duplicate;
}
/*
 * Clone - allocate a new UTF-16 to UTF-8 converter configured with
 * this object's invert and bom settings.  The caller must delete it.
 */
CharSetCvt *
CharSetCvtUTF168::Clone()
{
    CharSetCvt *duplicate = new CharSetCvtUTF168( invert, bom );
    return duplicate;
}
/*
 * ReverseCvt - allocate the opposite-direction (UTF-16 to UTF-8)
 * converter with this object's invert and bom settings.
 * The caller must delete it.
 */
CharSetCvt *
CharSetCvtUTF816::ReverseCvt()
{
    CharSetCvt *opposite = new CharSetCvtUTF168( invert, bom );
    return opposite;
}
/*
 * ReverseCvt - allocate the opposite-direction (UTF-8 to UTF-16)
 * converter with this object's invert and bom settings.
 * The caller must delete it.
 */
CharSetCvt *
CharSetCvtUTF168::ReverseCvt()
{
    CharSetCvt *opposite = new CharSetCvtUTF816( invert, bom );
    return opposite;
}
/*
 * CharSetCvtUTF816::Cvt - convert UTF-8 source bytes to UTF-16.
 *
 * Reads bytes from *sourcestart up to sourceend and writes two-byte
 * units from *targetstart up to targetend, advancing both pointers
 * past what was handled.  Each unit is written low byte first when
 * fileinvert is non-zero, high byte first otherwise.  Code points
 * above 0xffff are written as a surrogate pair.
 *
 * While checkBOM is set: if bom is also set a byte order mark is
 * written at the start of the output, and a U+FEFF decoded from the
 * source is dropped.  checkBOM is cleared once a source character has
 * been processed.
 *
 * charcnt is incremented for every converted character; a newline
 * increments linecnt and resets charcnt to 0.
 *
 * Always returns 0.  Problems are reported through lasterr:
 *   PARTIALCHAR - the source ends inside a multi-byte sequence, or
 *                 the target has no room for the next unit(s)
 *   NOMAPPING   - a lead byte that does not announce 1, 2 or 3
 *                 following bytes; with STRICT_UTF8 also an over-long
 *                 sequence or an invalid code point
 * In every error case *sourcestart is left at the first byte of the
 * character that could not be converted.
 */
int
CharSetCvtUTF816::Cvt(const char **sourcestart, const char *sourceend,
char **targetstart, char *targetend)
{
unsigned int v;
// Emit the byte order mark, in the output byte order, if requested.
if( checkBOM && bom )
{
if( *targetstart >= targetend - 2 )
{
lasterr = PARTIALCHAR;
return 0;
}
if( fileinvert )
{
**targetstart = 0xff;
*++*targetstart = 0xfe;
}
else
{
**targetstart = 0xfe;
*++*targetstart = 0xff;
}
++*targetstart;
}
// Each pass needs at least one source byte and room for one
// two-byte unit in the target.
while( *sourcestart < sourceend && *targetstart < targetend - 1 )
{
v = **sourcestart & 0xff;
// l = number of bytes following the lead byte; only assigned
// (and only used) for bytes with the high bit set
int l;
if ( v & 0x80 )
{
l = bytesFromUTF8[v];
// all l following bytes must lie before sourceend
if ( l + *sourcestart >= sourceend )
{
lasterr = PARTIALCHAR;
return 0;
}
// Accumulate the raw bytes of the sequence; the cases fall
// through so that exactly l following bytes are added.
switch( l )
{
default:
lasterr = NOMAPPING;
return 0;
case 3:
// surrogates are needed - check for more target space
if( *targetstart > targetend - 4 )
{
lasterr = PARTIALCHAR;
return 0;
}
v <<= 6;
v += 0xff & *++*sourcestart;
// fall through...
case 2:
v <<= 6;
v += 0xff & *++*sourcestart;
// fall through...
case 1:
v <<= 6;
v += 0xff & *++*sourcestart;
// strip the lead/continuation marker bits
v -= offsetsFromUTF8[l];
# ifdef STRICT_UTF8
if( v < minimumFromUTF8[l] )
{
// illegal over long UTF8 sequence
lasterr = NOMAPPING;
// back up to the lead byte
*sourcestart -= l;
return 0;
}
#endif
// at this point v is a unicode position
if( checkBOM && v == 0xfeff )
{
// leading BOM in the source: consume it, emit nothing
checkBOM = 0;
++*sourcestart;
continue;
}
// note fall through
}
}
checkBOM = 0;
# ifdef STRICT_UTF8
// check for invalid unicode positions
if( ( v & 0x1ff800 ) == 0xd800 || ( v >= 0xfdd0 && v <= 0xfdef ) )
{
lasterr = NOMAPPING;
*sourcestart -= l;
return 0;
}
# endif
++charcnt;
if( v == '\n' ) {
++linecnt;
charcnt = 0;
}
// handle UTF16 surrogates
if( v > 0xffff )
{
// leading surrogate: 0xd800 + ((v - 0x10000) >> 10)
unsigned int s = ( v >> 10 ) + 0xd7c0;
if( fileinvert )
{
**targetstart = s & 0xff;
*++*targetstart = (s >> 8) & 0xff;
}
else
{
**targetstart = (s >> 8) & 0xff;
*++*targetstart = s & 0xff;
}
++*targetstart;
// trailing surrogate, written by the common code below
v = 0xdc00 | (v & 0x3ff);
}
if( fileinvert )
{
**targetstart = v & 0xff;
*++*targetstart = (v >> 8) & 0xff;
}
else
{
**targetstart = (v >> 8) & 0xff;
*++*targetstart = v & 0xff;
}
++*targetstart;
++*sourcestart;
}
// Source left over and exactly one byte of target left: not enough
// room for another unit.
if( *sourcestart < sourceend && *targetstart < targetend )
lasterr = PARTIALCHAR;
return 0;
}
/*
 * CharSetCvtUTF168::Cvt - convert UTF-16 source bytes to UTF-8.
 *
 * Reads two-byte units from *sourcestart up to sourceend and writes
 * UTF-8 from *targetstart up to targetend, advancing both pointers
 * past what was handled.  Units are read low byte first when
 * fileinvert is non-zero, high byte first otherwise.  Surrogate
 * pairs are combined into a single code point and written as four
 * UTF-8 bytes.
 *
 * While checkBOM is set, the first unit is examined: 0xfeff is
 * dropped, 0xfffe is dropped and flips fileinvert.  checkBOM is
 * cleared after that first unit either way.
 *
 * charcnt is incremented for every character written; a newline
 * increments linecnt and resets charcnt to 0.  This is done per
 * character inside the loop, matching CharSetCvtUTF816::Cvt.
 *
 * Always returns 0.  Problems are reported through lasterr:
 *   PARTIALCHAR - the source ends in half a unit or half a surrogate
 *                 pair, or the target has no room for the character
 *   NOMAPPING   - a leading surrogate without a valid trailing one,
 *                 a stray surrogate, or a value in 0xfdd0..0xfdef
 * When a character cannot be converted, *sourcestart is moved back to
 * the first byte of that character.
 */
int
CharSetCvtUTF168::Cvt(const char **sourcestart, const char *sourceend,
        char **targetstart, char *targetend)
{
    // Each pass needs a whole two-byte unit in the source and at
    // least one free byte in the target.
    while( *sourcestart < sourceend-1 && *targetstart < targetend )
    {
        unsigned int v;

        if( fileinvert )
        {
            v = **sourcestart & 0xff;
            v |= (*++*sourcestart & 0xff) << 8;
        }
        else
        {
            v = (**sourcestart & 0xff) << 8;
            v |= *++*sourcestart & 0xff;
        }
        ++*sourcestart;

        if( checkBOM )
        {
            checkBOM = 0;
            switch( v )
            {
            case 0xfffe:
                // byte-swapped BOM: the data is in the other order
                fileinvert ^= 1;
                // fall through...
            case 0xfeff:
                // suppress BOM
                continue;
            }
        }

        // is this the start of surrogate pair?
        if( ( v & 0xfc00 ) == 0xd800 )
        {
            // it is...
            unsigned int s;

            if( *sourcestart >= sourceend-1 )
            {
                // trailing surrogate not available yet
                lasterr = PARTIALCHAR;
                *sourcestart -= 2;
                return 0;
            }
            if( fileinvert )
            {
                s = **sourcestart & 0xff;
                s |= (*++*sourcestart & 0xff) << 8;
            }
            else
            {
                s = (**sourcestart & 0xff) << 8;
                s |= *++*sourcestart & 0xff;
            }
            ++*sourcestart;

            if( ( s & 0xfc00 ) != 0xdc00 )
            {
                // trailing surrogate not correct
                lasterr = NOMAPPING;
                *sourcestart -= 4;
                return 0;
            }

            // Combine the pair into one code point.  The constant is
            // (0xd800 << 10) + 0xdc00 - 0x10000.
            v = ( v << 10 ) + s - 0x35fdc00;
        }

        // reject positions that must not be emitted
        if( ( v & 0x1ff800 ) == 0xd800 || ( v >= 0xfdd0 && v <= 0xfdef ) )
        {
            lasterr = NOMAPPING;
            *sourcestart -= 2;
            if( v >= 0x10000 )
                *sourcestart -= 2;
            return 0;
        }

        // emit UTF8 of v
        if( v >= 0x10000 )
        {
            // Extended multilingual plane - 4 byte UTF8
            if (3 + *targetstart >= targetend)
            {
                lasterr = PARTIALCHAR;
                *sourcestart -= 4;  // 4 because of UTF 16 surrogates
                return 0;
            }
            **targetstart = 0xf0 | (v >> 18);
            *++*targetstart = 0x80 | ((v >> 12) & 0x3f);
            *++*targetstart = 0x80 | ((v >> 6) & 0x3f);
            *++*targetstart = 0x80 | (v & 0x3f);
        }
        else if( v >= 0x800 )
        {
            if (2 + *targetstart >= targetend)
            {
                lasterr = PARTIALCHAR;
                *sourcestart -= 2;
                return 0;
            }
            **targetstart = 0xe0 | (v >> 12);
            *++*targetstart = 0x80 | ((v >> 6) & 0x3f);
            *++*targetstart = 0x80 | (v & 0x3f);
        }
        else if( v >= 0x80 )
        {
            if (1 + *targetstart >= targetend)
            {
                lasterr = PARTIALCHAR;
                *sourcestart -= 2;
                return 0;
            }
            **targetstart = 0xc0 | (v >> 6);
            *++*targetstart = 0x80 | (v & 0x3f);
        }
        else
            **targetstart = v;
        ++*targetstart;

        // The character has been written: account for it.  Doing this
        // here rather than after the loop counts every character, and
        // v is never read without having been assigned.
        ++charcnt;
        if( v == '\n' )
        {
            ++linecnt;
            charcnt = 0;
        }
    }

    // Source left over although the target still has room: the
    // source ends in an incomplete unit.
    if( *sourcestart < sourceend && *targetstart < targetend )
        lasterr = PARTIALCHAR;

    return 0;
}
/*
 * FromCharStep - allocate a UTF-8 aware stepper positioned at p,
 * since the source character set of this converter is UTF-8.
 * The stepper comes from new; the caller must delete it.
 */
CharStep *
CharSetCvtFromUTF8::FromCharStep(char *p)
{
    CharStep *stepper = new CharStepUTF8( p );
    return stepper;
}
void
CharSetCvtFromUTF8::IgnoreBOM()
{
checkBOM = 1;
}
/*
 * MapThru - look v up in a mapping table by binary search.
 *
 * v - value to look up
 * m - table of n entries; the search assumes it is sorted by
 *     ascending cfrom
 * n - number of entries in m
 * d - value to return when v is not in the table
 *
 * Returns the cto of the entry whose cfrom equals v, otherwise d.
 */
unsigned short
CharSetCvt::MapThru(unsigned short v,
        const CharSetCvt::MapEnt *m,
        int n,
        unsigned short d)
{
    // search the half-open index range [lo, hi)
    int lo = 0;
    int hi = n;

    while (lo < hi)
    {
        const int mid = lo + (hi - lo) / 2;
        const MapEnt &probe = m[mid];

        if (probe.cfrom == v)
            return probe.cto;

        if (probe.cfrom < v)
            lo = mid + 1;
        else
            hi = mid;
    }

    return d;
}