/*
* Copyright 2001 Perforce Software. All rights reserved.
*
*/
/*
* charcvt.cc - Real character set conversion code
*
* This is separate from basecvt.cc so that, unless you are really
* possibly going to do character set conversions, you will not link
* in the large conversion tables.
*/
#include <stdhdrs.h>
#include "i18napi.h"
#include "charcvt.h"
#include "charman.h"
#include "debug.h"
class CharSetCvtCache
{
public:
CharSetCvtCache()
{
fromUtf8To = NULL;
toUtf8From = NULL;
}
~CharSetCvtCache();
CharSetCvt * FindCvt(CharSetCvt::CharSet from, CharSetCvt::CharSet to);
void InsertCvt(CharSetCvt::CharSet from, CharSetCvt::CharSet to, CharSetCvt * cvt);
private:
CharSetCvt ** fromUtf8To;
CharSetCvt ** toUtf8From;
};
// File-local cache instance consulted by CharSetCvt::FindCachedCvt().
static CharSetCvtCache gCharSetCvtCache;
/*
 * Destroy every cached converter and then the tables that held them.
 * Empty (NULL) slots are harmless: deleting a null pointer is a no-op.
 */
CharSetCvtCache::~CharSetCvtCache()
{
    const int nSets = CharSetApi::CharSetCount();

    if (fromUtf8To != NULL)
    {
	int idx = 0;
	while (idx < nSets)
	{
	    delete fromUtf8To[idx];
	    ++idx;
	}
	delete [] fromUtf8To;
	fromUtf8To = NULL;
    }

    if (toUtf8From != NULL)
    {
	int idx = 0;
	while (idx < nSets)
	{
	    delete toUtf8From[idx];
	    ++idx;
	}
	delete [] toUtf8From;
	toUtf8From = NULL;
    }
}
/*
 * Look up a cached converter for the (from, to) pair.
 *
 * Returns NULL when either charset is outside 0..CharSetCount()-1, when
 * neither side is UTF_8, or when nothing has been cached for the pair.
 * As a side effect the table for the requested direction is allocated
 * (all slots NULL) the first time that direction is asked for.
 * A converter that is found has ResetErr() called on it before being
 * returned.
 */
CharSetCvt *
CharSetCvtCache::FindCvt(CharSetCvt::CharSet from, CharSetCvt::CharSet to)
{
    const int nSets = CharSetApi::CharSetCount();

    // Both charsets are used as table indexes, so both must be in range.
    if (from < 0 || from >= nSets || to < 0 || to >= nSets)
	return NULL;

    // UTF_8 -> something: table is indexed by the target charset.
    if (from == CharSetApi::UTF_8)
    {
	if (fromUtf8To == NULL)
	{
	    fromUtf8To = new CharSetCvt*[nSets];
	    for (int slot = 0; slot < nSets; ++slot)
		fromUtf8To[slot] = NULL;
	}

	CharSetCvt * hit = fromUtf8To[to];
	if (hit != NULL)
	{
	    hit->ResetErr();
	    return hit;
	}
    }

    // something -> UTF_8: table is indexed by the source charset.
    if (to == CharSetApi::UTF_8)
    {
	if (toUtf8From == NULL)
	{
	    toUtf8From = new CharSetCvt*[nSets];
	    for (int slot = 0; slot < nSets; ++slot)
		toUtf8From[slot] = NULL;
	}

	CharSetCvt * hit = toUtf8From[from];
	if (hit != NULL)
	{
	    hit->ResetErr();
	    return hit;
	}
    }

    return NULL;
}
/*
 * Store cvt as the cached converter for the (from, to) pair.
 *
 * Only pairs with UTF_8 on one side are cached; a UTF_8 source takes
 * precedence, so UTF_8 -> UTF_8 lands in the "from UTF_8" table.  Any
 * other pair is ignored.
 *
 * The charsets are used as array indexes, so a value outside
 * 0..CharSetCount()-1 is ignored rather than written out of bounds.
 * The destination table is allocated here if it does not exist yet, so
 * this method does not depend on FindCvt() having been called first.
 */
void
CharSetCvtCache::InsertCvt(CharSetCvt::CharSet from, CharSetCvt::CharSet to, CharSetCvt * cvt)
{
    const int charSetCount = CharSetApi::CharSetCount();

    if (from < 0 || from >= charSetCount)
	return;
    if (to < 0 || to >= charSetCount)
	return;

    if (from == CharSetApi::UTF_8)
    {
	if (!fromUtf8To)
	{
	    fromUtf8To = new CharSetCvt*[charSetCount];
	    for (int i=0; i<charSetCount; i++)
		fromUtf8To[i] = NULL;
	}
	fromUtf8To[to] = cvt;
    }
    else if (to == CharSetApi::UTF_8)
    {
	if (!toUtf8From)
	{
	    toUtf8From = new CharSetCvt*[charSetCount];
	    for (int i=0; i<charSetCount; i++)
		toUtf8From[i] = NULL;
	}
	toUtf8From[from] = cvt;
    }
}
/*
 * Allocate a new converter for the (from, to) charset pair.
 *
 * Only pairs with plain UTF_8 on at least one side are supported; for
 * any other pair, or for a charset with no converter, NULL is returned.
 * Each successful call returns a freshly allocated object.
 */
CharSetCvt *
CharSetCvt::FindCvt(CharSetCvt::CharSet from, CharSetCvt::CharSet to)
{
    const bool outOfUtf8 = ( from == UTF_8 );

    // Nothing converts directly between two non-UTF_8 charsets.
    if( !outOfUtf8 && to != UTF_8 )
	return NULL;

    // The charset on the non-UTF_8 side of the pair picks the converter.
    // (For UTF_8 -> UTF_8 this is UTF_8 itself.)
    const CharSetCvt::CharSet other = outOfUtf8 ? to : from;

    // Single-byte charsets share one table-driven converter class per
    // direction; both directions use the same table number.
    int simpleTable = -1;

    switch( other )
    {
    case WIN_US_OEM:	simpleTable = 0;	break;
    case MACOS_ROMAN:	simpleTable = 1;	break;
    case ISO8859_15:	simpleTable = 2;	break;
    case ISO8859_5:	simpleTable = 3;	break;
    case KOI8_R:	simpleTable = 4;	break;
    case WIN_CP_1251:	simpleTable = 5;	break;
    case WIN_US_ANSI:	simpleTable = 6;	break;
    case CP850:		simpleTable = 7;	break;
    case CP858:		simpleTable = 8;	break;
    case CP1253:	simpleTable = 9;	break;
    case ISO8859_7:	simpleTable = 10;	break;
    case CP737:		simpleTable = 11;	break;
    case CP1250:	simpleTable = 12;	break;
    case CP852:		simpleTable = 13;	break;
    case ISO8859_2:	simpleTable = 14;	break;
    default:
	break;
    }

    if( simpleTable >= 0 )
    {
	if( outOfUtf8 )
	    return new CharSetCvtUTF8toSimple( simpleTable );
	return new CharSetCvtSimpletoUTF8( simpleTable );
    }

    if( outOfUtf8 )
    {
	// UTF_8 -> other
	switch( other )
	{
	// UTF-8 variants
	case UTF_8:
	    return new CharSetCvtUTF8UTF8(1, UTF8_VALID_CHECK);
	case UTF_8_BOM:
	    return new CharSetCvtUTF8UTF8(1, UTF8_VALID_CHECK|UTF8_WRITE_BOM);
	case UTF_8_UNCHECKED:
	    return new CharSetCvt;
	case UTF_8_UNCHECKED_BOM:
	    return new CharSetCvtUTF8UTF8(1, UTF8_WRITE_BOM);

	// UTF-16 variants
	case UTF_16:
	    return new CharSetCvtUTF816; // byte order should match machine
	case UTF_16_LE:
	    return new CharSetCvtUTF816(1);
	case UTF_16_BE:
	    return new CharSetCvtUTF816(0);
	case UTF_16_BOM:
	    return new CharSetCvtUTF816(-1, 1);
	case UTF_16_LE_BOM:
	    return new CharSetCvtUTF816(1, 1);
	case UTF_16_BE_BOM:
	    return new CharSetCvtUTF816(0, 1);

	// UTF-32 variants
	case UTF_32:
	    return new CharSetCvtUTF832; // byte order should match machine
	case UTF_32_LE:
	    return new CharSetCvtUTF832(1);
	case UTF_32_BE:
	    return new CharSetCvtUTF832(0);
	case UTF_32_BOM:
	    return new CharSetCvtUTF832(-1, 1);
	case UTF_32_LE_BOM:
	    return new CharSetCvtUTF832(1, 1);
	case UTF_32_BE_BOM:
	    return new CharSetCvtUTF832(0, 1);

	// charsets with a dedicated converter class
	case ISO8859_1:
	    return new CharSetCvtUTF8to8859_1;
	case SHIFTJIS:
	    return new CharSetCvtUTF8toShiftJis;
	case EUCJP:
	    return new CharSetCvtUTF8toEUCJP;
	case CP949:
	    return new CharSetCvtUTF8toCp949;
	case CP936:
	    return new CharSetCvtUTF8toCp936;
	case CP950:
	    return new CharSetCvtUTF8toCp950;

	default:
	    break;
	}

	return NULL;
    }

    // other -> UTF_8
    switch( other )
    {
    // UTF-8 variants
    case UTF_8_BOM:
	return new CharSetCvtUTF8UTF8(-1, UTF8_VALID_CHECK|UTF8_WRITE_BOM);
    case UTF_8_UNCHECKED:
	return new CharSetCvt;
    case UTF_8_UNCHECKED_BOM:
	return new CharSetCvtUTF8UTF8(-1, UTF8_WRITE_BOM);

    // UTF-16 variants
    case UTF_16:
	return new CharSetCvtUTF168;
    case UTF_16_LE:
	return new CharSetCvtUTF168(1);
    case UTF_16_BE:
	return new CharSetCvtUTF168(0);
    case UTF_16_BOM:
	return new CharSetCvtUTF168(-1, 1);
    case UTF_16_LE_BOM:
	return new CharSetCvtUTF168(1, 1);
    case UTF_16_BE_BOM:
	return new CharSetCvtUTF168(0, 1);

    // UTF-32 variants
    case UTF_32:
	return new CharSetCvtUTF328;
    case UTF_32_LE:
	return new CharSetCvtUTF328(1);
    case UTF_32_BE:
	return new CharSetCvtUTF328(0);
    case UTF_32_BOM:
	return new CharSetCvtUTF328(-1, 1);
    case UTF_32_LE_BOM:
	return new CharSetCvtUTF328(1, 1);
    case UTF_32_BE_BOM:
	return new CharSetCvtUTF328(0, 1);

    // charsets with a dedicated converter class
    case ISO8859_1:
	return new CharSetCvt8859_1toUTF8;
    case SHIFTJIS:
	return new CharSetCvtShiftJistoUTF8;
    case EUCJP:
	return new CharSetCvtEUCJPtoUTF8;
    case CP949:
	return new CharSetCvtCp949toUTF8;
    case CP936:
	return new CharSetCvtCp936toUTF8;
    case CP950:
	return new CharSetCvtCp950toUTF8;

    default:
	break;
    }

    return NULL;
}
/*
 * Like FindCvt(), but goes through the file-local cache: a converter
 * already cached for the pair is returned as is, otherwise a new one is
 * created with FindCvt() and, if that succeeded, handed to the cache.
 * Returns NULL when no converter exists for the pair.
 */
CharSetCvt *
CharSetCvt::FindCachedCvt(CharSetCvt::CharSet from, CharSetCvt::CharSet to)
{
    CharSetCvt * result = gCharSetCvtCache.FindCvt(from, to);

    if (!result)
    {
	// Cache miss: build one and remember it for next time.
	result = FindCvt(from, to);
	if (result)
	    gCharSetCvtCache.InsertCvt(from, to, result);
    }

    return result;
}
// Returns a newly allocated converter of this same class.
CharSetCvt *
CharSetCvtUTF8toShiftJis::Clone()
{
    CharSetCvt * const twin = new CharSetCvtUTF8toShiftJis;
    return twin;
}
// Returns a newly allocated converter for the opposite direction
// (Shift-JIS to UTF-8).
CharSetCvt *
CharSetCvtUTF8toShiftJis::ReverseCvt()
{
    CharSetCvt * const opposite = new CharSetCvtShiftJistoUTF8;
    return opposite;
}
// Debug-print one mapping line: f, then t, then b.  A b of 0xfffe is
// reported as "unknown" instead of as a value.
void
CharSetCvtUTF8toShiftJis::printmap(unsigned short f, unsigned short t, unsigned short b)
{
    if( b != 0xfffe )
    {
	p4debug.printf("U+%04x -> %04x -> U+%04x\n", f, t, b);
	return;
    }

    p4debug.printf("U+%04x -> %04x -> unknown\n", f, t);
}
// Debug-print one mapping line: f, then t, then b.  A b of 0xfffe is
// reported as "unknown" instead of as a value.
void
CharSetCvtShiftJistoUTF8::printmap(unsigned short f, unsigned short t, unsigned short b)
{
    if( b != 0xfffe )
    {
	p4debug.printf("%04x -> U+%04x -> %04x\n", f, t, b);
	return;
    }

    p4debug.printf("%04x -> U+%04x -> unknown\n", f, t);
}
// Debug-print one mapping line: f (shown with a U+ prefix) and t.
void
CharSetCvtUTF8toShiftJis::printmap(unsigned short f, unsigned short t)
{
    const unsigned short src = f;
    const unsigned short dst = t;

    p4debug.printf("U+%04x -> %04x\n", src, dst);
}
// Debug-print one mapping line: f and t (t shown with a U+ prefix).
void
CharSetCvtShiftJistoUTF8::printmap(unsigned short f, unsigned short t)
{
    const unsigned short src = f;
    const unsigned short dst = t;

    p4debug.printf("%04x -> U+%04x\n", src, dst);
}
// Returns a newly allocated converter of this same class.
CharSetCvt *
CharSetCvtShiftJistoUTF8::Clone()
{
    CharSetCvt * const twin = new CharSetCvtShiftJistoUTF8;
    return twin;
}
// Returns a newly allocated converter for the opposite direction
// (UTF-8 to Shift-JIS).
CharSetCvt *
CharSetCvtShiftJistoUTF8::ReverseCvt()
{
    CharSetCvt * const opposite = new CharSetCvtUTF8toShiftJis;
    return opposite;
}
/*
 * CharSetCvtUTF8toShiftJis::Cvt - convert UTF-8 input to Shift-JIS.
 *
 * Consumes bytes from *sourcestart (up to sourceend) and writes to
 * *targetstart (up to targetend), advancing both pointers as it goes.
 * The return value is always 0; problems are reported through lasterr:
 *
 *	PARTIALCHAR - the input ends in the middle of a multi-byte
 *		      sequence, or there is not enough output room for
 *		      the next character
 *	NOMAPPING   - the character cannot be converted
 *
 * When an error is reported *sourcestart is left on the first byte of
 * the character that could not be processed.
 *
 * charcnt/linecnt are updated per character converted.  While checkBOM
 * is set, an unmappable U+FEFF is silently dropped; checkBOM is cleared
 * once any character has been converted.
 */
int
CharSetCvtUTF8toShiftJis::Cvt(const char **sourcestart, const char *sourceend,
	char **targetstart, char *targetend)
{
    unsigned int v, newv;
    while (*sourcestart < sourceend && *targetstart < targetend)
    {
	v = **sourcestart & 0xff;
	int l;		// number of extra source bytes consumed so far
	if (v & 0x80)
	{
	    l = bytesFromUTF8[v];
	    if (l + *sourcestart >= sourceend)
	    {
		// the rest of this sequence has not arrived yet
		lasterr = PARTIALCHAR;
		return 0;
	    }
	    switch (l)
	    {
	    case 2:
		v <<= 6;
		v += 0xff & *++*sourcestart;
		// fall through to pick up the last byte
	    case 1:
		v <<= 6;
		v += 0xff & *++*sourcestart;
		v -= offsetsFromUTF8[l];
# ifdef STRICT_UTF8
		if( v < minimumFromUTF8[l] )
		{
		    // illegal over long UTF8 sequence
		    lasterr = NOMAPPING;
		    *sourcestart -= l;
		    return 0;
		}
# endif
		// at this point v is UCS 2
		newv = MapThru(v, UCS2toShiftJis, MapCount(), 0xfffd);
		if (newv != 0xfffd)
		{
		emitit:
		    // newv holds the Shift-JIS code; l holds how far
		    // *sourcestart must be rewound on failure
		    if (newv > 0xff)
		    {
			if (2 + *targetstart >= targetend)
			{
			    lasterr = PARTIALCHAR;
			    *sourcestart -= l;
			    return 0;
			}
			*(*targetstart)++ = newv >> 8;
		    }
		    **targetstart = newv & 0xff;
		    break;
		}
		// Check if this is a 'user-defined character'
		if (v >= 0xE000 && v <= 0xE757)
		{
		    // yup...
		    // 0xBC codes per lead byte; trail bytes start at
		    // 0x40 and skip 0x7F
		    v -= 0xE000;
		    newv = v / 0xBC;
		    v %= 0xBC;
		    v += 0x40 + (v >= 0x3F);
		    newv = 0xF000 + (newv << 8) + v;
		    goto emitit;
		}
		if (checkBOM && v == 0xfeff)
		{
		    checkBOM = 0;
		    ++*sourcestart;
		    continue; // suppress BOM
		}
		*sourcestart -= l;
		// note fall through
	    default:
		lasterr = NOMAPPING;
		return 0;
	    }
	}
	else
	{
	    // almost simple ASCII
#ifdef UNICODEMAPPING
	    if (v == 0x5c)
	    {
		// map to 0x815F
		// emitit emits newv and rewinds the source by l on
		// failure; no extra source byte has been consumed here
		l = 0;
		newv = 0x815f;
		goto emitit;
	    }
	    else if (v == 0x7e /* TILDE */)
	    {
		// we'll keep this anyway?
		lasterr = NOMAPPING;
		return 0;
	    }
	    else
#endif
		**targetstart = v;
	}
	++charcnt;
	if( v == '\n' ) {
	    ++linecnt;
	    charcnt = 0;
	}
	++*sourcestart;
	++*targetstart;
	checkBOM = 0;
    }
    return 0;
}
/*
 * CharSetCvtShiftJistoUTF8::Cvt - convert Shift-JIS input to UTF-8.
 *
 * Consumes bytes from *sourcestart (up to sourceend) and writes to
 * *targetstart (up to targetend), advancing both pointers as it goes.
 * The return value is always 0; problems are reported through lasterr:
 *
 *	PARTIALCHAR - the input ends after a lead byte, or there is not
 *		      enough output room for the next character
 *	NOMAPPING   - the character has no mapping
 *
 * When an error is reported *sourcestart is left on the first byte of
 * the character that could not be processed.  charcnt/linecnt are
 * updated per character converted.
 */
int
CharSetCvtShiftJistoUTF8::Cvt(const char **sourcestart, const char *sourceend,
char **targetstart, char *targetend)
{
unsigned int v, oldv;
while (*sourcestart < sourceend && *targetstart < targetend)
{
v = **sourcestart & 0xff;
// l is 1 once a second source byte has been consumed
int l = 0;
// high-bit bytes outside 0xa1..0xdf start a two-byte character
if ((v & 0x80) && (v < 0xa1 || v >= 0xe0))
{
if (1 + *sourcestart >= sourceend)
{
lasterr = PARTIALCHAR;
return 0;
}
l = 1;
v <<= 8;
v |= *++*sourcestart & 0xff;
}
// keep the source code: the user-defined check below needs it
oldv = v;
// values up to 0x20 are passed through without a table lookup
if (v > 0x20)
v = MapThru(v, ShiftJistoUCS2, MapCount(), 0xfffd);
if (v == 0xfffd)
{
int upper, lower;
// Check if this is a 'user-defined character'
upper = oldv >> 8;
lower = oldv & 0xff;
if (upper >= 0xF0 && upper <= 0xF9
&& lower >= 0x40 && lower <= 0xFC && lower != 0x7F)
{
// Is a "user-defined character"
// compute the code point: 0xBC codes per lead byte starting at
// U+E000, trail bytes start at 0x40 and skip 0x7F
v = (upper - 0xF0) * 0xBC + 0xE000 - 0x40 +
lower - (lower > 0x7F);
}
else
{
lasterr = NOMAPPING;
// step back onto the lead byte
if (l)
--*sourcestart;
return 0;
}
}
// encode v as UTF-8: three bytes, two bytes or one byte
if (v >= 0x800)
{
if (2 + *targetstart >= targetend)
{
lasterr = PARTIALCHAR;
if (l)
--*sourcestart;
return 0;
}
**targetstart = 0xe0 | (v >> 12);
*++*targetstart = 0x80 | ((v >> 6) & 0x3f);
*++*targetstart = 0x80 | (v & 0x3f);
}
else if (v >= 0x80)
{
if (1 + *targetstart >= targetend)
{
lasterr = PARTIALCHAR;
if (l)
--*sourcestart;
return 0;
}
**targetstart = 0xc0 | (v >> 6);
*++*targetstart = 0x80 | (v & 0x3f);
}
else
**targetstart = v;
++charcnt;
if( v == '\n' ) {
++linecnt;
charcnt = 0;
}
// both pointers still sit on the last byte handled; move past it
++*targetstart;
++*sourcestart;
}
return 0;
}
// Returns a newly allocated CharStepShiftJis constructed on p.
CharStep *
CharSetCvtShiftJistoUTF8::FromCharStep( char *p )
{
    CharStep * const stepper = new CharStepShiftJis( p );
    return stepper;
}
// Returns a newly allocated converter of this same class.
CharSetCvt *
CharSetCvtUTF8toEUCJP::Clone()
{
    CharSetCvt * const twin = new CharSetCvtUTF8toEUCJP;
    return twin;
}
// Returns a newly allocated converter for the opposite direction
// (EUC-JP to UTF-8).
CharSetCvt *
CharSetCvtUTF8toEUCJP::ReverseCvt()
{
    CharSetCvt * const opposite = new CharSetCvtEUCJPtoUTF8;
    return opposite;
}
// Returns a newly allocated converter of this same class.
CharSetCvt *
CharSetCvtEUCJPtoUTF8::Clone()
{
    CharSetCvt * const twin = new CharSetCvtEUCJPtoUTF8;
    return twin;
}
// Returns a newly allocated converter for the opposite direction
// (UTF-8 to EUC-JP).
CharSetCvt *
CharSetCvtEUCJPtoUTF8::ReverseCvt()
{
    CharSetCvt * const opposite = new CharSetCvtUTF8toEUCJP;
    return opposite;
}
/*
 * CharSetCvtUTF8toEUCJP::Cvt - convert UTF-8 input to EUC-JP.
 *
 * Consumes bytes from *sourcestart (up to sourceend) and writes to
 * *targetstart (up to targetend), advancing both pointers as it goes.
 * The return value is always 0; problems are reported through lasterr:
 *
 *	PARTIALCHAR - the input ends in the middle of a multi-byte
 *		      sequence, or there is not enough output room for
 *		      the next character
 *	NOMAPPING   - the character cannot be converted
 *
 * When an error is reported *sourcestart is left on the first byte of
 * the character that could not be processed.
 *
 * charcnt/linecnt are updated per character converted.  While checkBOM
 * is set, an unmappable U+FEFF is silently dropped; checkBOM is cleared
 * once any character has been converted.
 */
int
CharSetCvtUTF8toEUCJP::Cvt(const char **sourcestart, const char *sourceend,
char **targetstart, char *targetend)
{
unsigned int v, oldv;
while (*sourcestart < sourceend && *targetstart < targetend)
{
v = **sourcestart & 0xff;
int l = 0; // extra characters expected
int t = 2; // characters output
// everything above 0x20 goes through the mapping table; 0x20 and
// below is copied unchanged
if (v > 0x20)
{
l = bytesFromUTF8[v];
if (l + *sourcestart >= sourceend)
{
lasterr = PARTIALCHAR;
return 0;
}
switch (l)
{
case 2:
v <<= 6;
v += 0xff & *++*sourcestart;
// fall through to pick up the last byte
case 1:
v <<= 6;
v += 0xff & *++*sourcestart;
v -= offsetsFromUTF8[l];
# ifdef STRICT_UTF8
if( v < minimumFromUTF8[l] )
{
// illegal over long UTF8 sequence
lasterr = NOMAPPING;
*sourcestart -= l;
return 0;
}
# endif
// at this point v is UCS 2
// fall through: single-byte input joins here with v unchanged
case 0:
oldv = v;
v = MapThru(v, UCS2toEUCJP, MapCount(), 0xfffd);
if (v == 0xfffd && oldv >= 0xe000 && oldv <= 0xe757)
{
// user defined character
// 94 codes per row; rows 0-9 start at 0xf5a1, later rows start at
// 0x6B21 plus the row number in the high byte
oldv -= 0xe000;
int line = oldv / 94;
v = (line << 8) + (oldv % 94) + (line < 10 ? 0xf5a1 : 0x6B21);
}
if (v != 0xfffd)
{
if (v > 0xa0) // codeset (1,2,3)
{
if ((v > 0xdf) && (v>>8 < 0xa1))
t = 3; // code set 3 (3 chars output)
if ((t + *targetstart) >= targetend)
{
lasterr = PARTIALCHAR;
*sourcestart -= l;
return 0;
}
if (t == 3)
{
// code set 3, prepend 0x8f and offset 0x8080
*(*targetstart)++ = 0x8f;
v += 0x8080;
}
if (v < 0xe0)
{
// code set 2, prepend 0x8e
*(*targetstart)++ = 0x8e;
}
else
{
*(*targetstart)++ = v >> 8;
}
}
// the final (or only) output byte; *targetstart is advanced past it
// at the bottom of the loop
**targetstart = v & 0xff;
break;
}
if (checkBOM && oldv == 0xfeff)
{
// suppress BOM
checkBOM = 0;
++*sourcestart;
continue;
}
*sourcestart -= l;
// note fall through
default:
lasterr = NOMAPPING;
return 0;
}
}
else
{
**targetstart = v;
}
++*sourcestart;
++*targetstart;
checkBOM = 0;
++charcnt;
if( v == '\n' ) {
++linecnt;
charcnt = 0;
}
}
return 0;
}
/*
 * CharSetCvtEUCJPtoUTF8::Cvt - convert EUC-JP input to UTF-8.
 *
 * Consumes bytes from *sourcestart (up to sourceend) and writes to
 * *targetstart (up to targetend), advancing both pointers as it goes.
 * The return value is always 0; problems are reported through lasterr:
 *
 *	PARTIALCHAR - the input ends in the middle of a multi-byte
 *		      character, or there is not enough output room for
 *		      the next character
 *	NOMAPPING   - the character has no mapping
 *
 * When an error is reported *sourcestart is left on the first byte of
 * the character that could not be processed.  charcnt/linecnt are
 * updated per character converted.
 */
int
CharSetCvtEUCJPtoUTF8::Cvt(const char **sourcestart, const char *sourceend,
char **targetstart, char *targetend)
{
unsigned int v, oldv;
while( (*sourcestart < sourceend) && (*targetstart < targetend) )
{
v = **sourcestart & 0xff;
int l = 0; // extra characters expected
int c = 0; // code set detection (0-3)
// NOTE(review): this test also treats 0x7f as the start of a
// two-byte character - confirm that is intended
if ( v > 0x7e ) // another char rqrd
{
// which codeset (1-3)
c = (v == 0x8e) ? 2 : ((v == 0x8f) ? 3 : 1);
l = (c == 3) ? 2 : 1;
if( (l + *sourcestart) >= sourceend )
{
lasterr = PARTIALCHAR;
return 0;
}
if( c > 1 ) // lose the first byte its info only
v = *++*sourcestart & 0xff;
// code sets 1 and 3 carry a two-byte code; code set 2 carries one byte
if( (c == 1) || (c == 3) )
{
v <<= 8;
v |= *++*sourcestart & 0xff;
}
// if codeset is 3, subtract offset
if (c == 3)
v -= 0x8080;
}
// keep the source code: the user-defined check below needs it
oldv = v;
// values up to 0x20 are passed through without a table lookup
if ( v > 0x20 )
v = MapThru(v, EUCJPtoUCS2, MapCount(), 0xfffd);
if (v == 0xfffd)
{
// check for user-defined character
// undo the code set 3 offset so both bytes are in their raw form
if ( c == 3 )
oldv += 0x8080;
int c1 = oldv >> 8;
int c2 = oldv & 0xff;
if ( c1 >= 0xF5 && c1 <= 0xFE && c2 >= 0xA1 && c2 <=0xFE )
{
// is a user-defined character
// 94 codes per row starting at U+E000; code set 3 rows follow the
// 10 rows (940 codes) of code set 1
c1 -= 0xF5;
c2 -= 0xA1;
v = 0xE000 + (c1 * 94) + c2;
if (c == 3)
v += 940;
}
else
{
lasterr = NOMAPPING;
// step back onto the first byte of the character
while(l--)
--*sourcestart;
return 0;
}
}
if (v >= 0x800) // 3 UTF8 bytes required
{
if (2 + *targetstart >= targetend)
{
lasterr = PARTIALCHAR;
while(l--)
--*sourcestart;
return 0;
}
**targetstart = 0xe0 | (v >> 12);
*++*targetstart = 0x80 | ((v >> 6) & 0x3f);
*++*targetstart = 0x80 | (v & 0x3f);
}
else if (v >= 0x80) // 2 UTF8 bytes required
{
if (1 + *targetstart >= targetend)
{
lasterr = PARTIALCHAR;
while(l--)
--*sourcestart;
return 0;
}
**targetstart = 0xc0 | (v >> 6);
*++*targetstart = 0x80 | (v & 0x3f);
}
else
**targetstart = v; // 1 UTF8 byte required
// both pointers still sit on the last byte handled; move past it
++*targetstart;
++*sourcestart;
++charcnt;
if( v == '\n' ) {
++linecnt;
charcnt = 0;
}
}
return 0;
}
// Returns a newly allocated CharStepEUCJP constructed on p.
CharStep *
CharSetCvtEUCJPtoUTF8::FromCharStep( char *p )
{
    CharStep * const stepper = new CharStepEUCJP( p );
    return stepper;
}
/*
 * Format v as a six-column hex string for the EUC-JP debug output.
 *
 *	0x80..0xff	-> "  8e" followed by the value
 *	0x100..0x7fff	-> "8f" followed by the value with 0x8080 toggled
 *	anything else	-> the value right-aligned in six columns
 *
 * The result lives in a static buffer that is overwritten by the next
 * call, so two results cannot be used in the same expression.
 */
static char *
cvteucval(unsigned short v)
{
    static char text[20];

    const bool needsPrefix = v > 0x7f && v < 0x8000;

    if (!needsPrefix)
    {
	sprintf(text, "%6x", v);
	return text;
    }

    if (v > 0xff)
	sprintf(text, "8f%4x", v ^ 0x8080);
    else
	sprintf(text, "  8e%2x", v);

    return text;
}
// Debug-print one mapping line: f, then t (formatted by cvteucval),
// then b.  A b of 0xfffe is reported as "unknown" instead of as a value.
void
CharSetCvtUTF8toEUCJP::printmap(unsigned short f, unsigned short t, unsigned short b)
{
    if( b != 0xfffe )
    {
	p4debug.printf("U+%04x -> %s -> U+%04x\n", f, cvteucval(t), b);
	return;
    }

    p4debug.printf("U+%04x -> %s -> unknown\n", f, cvteucval(t));
}
// Debug-print one mapping line: f, then t, then b, with f and b
// formatted by cvteucval.  A b of 0xfffe is reported as "unknown".
void
CharSetCvtEUCJPtoUTF8::printmap(unsigned short f, unsigned short t, unsigned short b)
{
    if( b == 0xfffe )
    {
	p4debug.printf("%s -> U+%04x -> unknown\n", cvteucval(f), t);
	return;
    }

    // Two separate printf calls: cvteucval() returns a shared static
    // buffer, so its two results cannot appear in one argument list.
    p4debug.printf("%s", cvteucval(f));
    p4debug.printf(" -> U+%04x -> %s\n", t, cvteucval(b));
}
// Debug-print one mapping line: f (with a U+ prefix) and t formatted by
// cvteucval.
void
CharSetCvtUTF8toEUCJP::printmap(unsigned short f, unsigned short t)
{
    const char * const dst = cvteucval(t);

    p4debug.printf("U+%04x -> %s\n", f, dst);
}
// Debug-print one mapping line: f formatted by cvteucval and t (with a
// U+ prefix).
void
CharSetCvtEUCJPtoUTF8::printmap(unsigned short f, unsigned short t)
{
    const char * const src = cvteucval(f);

    p4debug.printf("%s -> U+%04x\n", src, t);
}
// Returns a newly allocated converter of this same class.
CharSetCvt *
CharSetCvtUTF8toCp949::Clone()
{
    CharSetCvt * const twin = new CharSetCvtUTF8toCp949;
    return twin;
}
// Returns a newly allocated converter for the opposite direction
// (CP949 to UTF-8).
CharSetCvt *
CharSetCvtUTF8toCp949::ReverseCvt()
{
    CharSetCvt * const opposite = new CharSetCvtCp949toUTF8;
    return opposite;
}
// Returns a newly allocated converter of this same class.
CharSetCvt *
CharSetCvtUTF8toCp936::Clone()
{
    CharSetCvt * const twin = new CharSetCvtUTF8toCp936;
    return twin;
}
// Returns a newly allocated converter for the opposite direction
// (CP936 to UTF-8).
CharSetCvt *
CharSetCvtUTF8toCp936::ReverseCvt()
{
    CharSetCvt * const opposite = new CharSetCvtCp936toUTF8;
    return opposite;
}
// Returns a newly allocated converter of this same class.
CharSetCvt *
CharSetCvtUTF8toCp950::Clone()
{
    CharSetCvt * const twin = new CharSetCvtUTF8toCp950;
    return twin;
}
// Returns a newly allocated converter for the opposite direction
// (CP950 to UTF-8).
CharSetCvt *
CharSetCvtUTF8toCp950::ReverseCvt()
{
    CharSetCvt * const opposite = new CharSetCvtCp950toUTF8;
    return opposite;
}
// Debug-print one mapping line: f, then t, then b.  A b of 0xfffe is
// reported as "unknown" instead of as a value.
void
CharSetCvtUTF8toCp::printmap(unsigned short f, unsigned short t, unsigned short b)
{
    if( b != 0xfffe )
    {
	p4debug.printf("U+%04x -> %04x -> U+%04x\n", f, t, b);
	return;
    }

    p4debug.printf("U+%04x -> %04x -> unknown\n", f, t);
}
// Debug-print one mapping line: f, then t, then b.  A b of 0xfffe is
// reported as "unknown" instead of as a value.
void
CharSetCvtCptoUTF8::printmap(unsigned short f, unsigned short t, unsigned short b)
{
    if( b != 0xfffe )
    {
	p4debug.printf("%04x -> U+%04x -> %04x\n", f, t, b);
	return;
    }

    p4debug.printf("%04x -> U+%04x -> unknown\n", f, t);
}
// Debug-print one mapping line: f (shown with a U+ prefix) and t.
void
CharSetCvtUTF8toCp::printmap(unsigned short f, unsigned short t)
{
    const unsigned short src = f;
    const unsigned short dst = t;

    p4debug.printf("U+%04x -> %04x\n", src, dst);
}
// Debug-print one mapping line: f and t (t shown with a U+ prefix).
void
CharSetCvtCptoUTF8::printmap(unsigned short f, unsigned short t)
{
    const unsigned short src = f;
    const unsigned short dst = t;

    p4debug.printf("%04x -> U+%04x\n", src, dst);
}
/*
 * CharSetCvtUTF8toCp::Cvt - convert UTF-8 input to a code page using
 * the toMap/toMapSize mapping table.
 *
 * Consumes bytes from *sourcestart (up to sourceend) and writes to
 * *targetstart (up to targetend), advancing both pointers as it goes.
 * The return value is always 0; problems are reported through lasterr:
 *
 *	PARTIALCHAR - the input ends in the middle of a multi-byte
 *		      sequence, or there is not enough output room for
 *		      the next character
 *	NOMAPPING   - the character cannot be converted
 *
 * When an error is reported *sourcestart is left on the first byte of
 * the character that could not be processed.
 *
 * charcnt/linecnt are updated per character converted.  While checkBOM
 * is set, an unmappable U+FEFF is silently dropped; checkBOM is cleared
 * once any character has been converted.
 */
int
CharSetCvtUTF8toCp::Cvt(const char **sourcestart, const char *sourceend,
	char **targetstart, char *targetend)
{
    unsigned int v, newv;
    while (*sourcestart < sourceend && *targetstart < targetend)
    {
	v = **sourcestart & 0xff;
	int l;		// number of extra source bytes consumed so far
	if (v & 0x80)
	{
	    l = bytesFromUTF8[v];
	    if (l + *sourcestart >= sourceend)
	    {
		// the rest of this sequence has not arrived yet
		lasterr = PARTIALCHAR;
		return 0;
	    }
	    switch (l)
	    {
	    case 2:
		v <<= 6;
		v += 0xff & *++*sourcestart;
		// fall through to pick up the last byte
	    case 1:
		v <<= 6;
		v += 0xff & *++*sourcestart;
		v -= offsetsFromUTF8[l];
# ifdef STRICT_UTF8
		if( v < minimumFromUTF8[l] )
		{
		    // illegal over long UTF8 sequence
		    lasterr = NOMAPPING;
		    *sourcestart -= l;
		    return 0;
		}
# endif
		// at this point v is UCS 2
		newv = MapThru(v, toMap, toMapSize, 0xfffd);
		if (newv != 0xfffd)
		{
		    // codes above 0xff are written as two bytes,
		    // high byte first
		    if (newv > 0xff)
		    {
			if (2 + *targetstart >= targetend)
			{
			    lasterr = PARTIALCHAR;
			    *sourcestart -= l;
			    return 0;
			}
			*(*targetstart)++ = newv >> 8;
		    }
		    **targetstart = newv & 0xff;
		    break;
		}
		if (checkBOM && v == 0xfeff)
		{
		    checkBOM = 0;
		    ++*sourcestart;
		    continue; // suppress BOM
		}
		*sourcestart -= l;
		// note fall through
	    default:
		lasterr = NOMAPPING;
		return 0;
	    }
	}
	else
	{
	    // plain 7-bit byte: copied unchanged
	    **targetstart = v;
	}
	++charcnt;
	if( v == '\n' ) {
	    ++linecnt;
	    charcnt = 0;
	}
	++*sourcestart;
	++*targetstart;
	checkBOM = 0;
    }
    return 0;
}
// Returns a newly allocated converter of this same class.
CharSetCvt *
CharSetCvtCp949toUTF8::Clone()
{
    CharSetCvt * const twin = new CharSetCvtCp949toUTF8;
    return twin;
}
// Returns a newly allocated converter for the opposite direction
// (UTF-8 to CP949).
CharSetCvt *
CharSetCvtCp949toUTF8::ReverseCvt()
{
    CharSetCvt * const opposite = new CharSetCvtUTF8toCp949;
    return opposite;
}
// Returns a newly allocated converter of this same class.
CharSetCvt *
CharSetCvtCp936toUTF8::Clone()
{
    CharSetCvt * const twin = new CharSetCvtCp936toUTF8;
    return twin;
}
// Returns a newly allocated converter for the opposite direction
// (UTF-8 to CP936).
CharSetCvt *
CharSetCvtCp936toUTF8::ReverseCvt()
{
    CharSetCvt * const opposite = new CharSetCvtUTF8toCp936;
    return opposite;
}
// Returns a newly allocated converter of this same class.
CharSetCvt *
CharSetCvtCp950toUTF8::Clone()
{
    CharSetCvt * const twin = new CharSetCvtCp950toUTF8;
    return twin;
}
// Returns a newly allocated converter for the opposite direction
// (UTF-8 to CP950).
CharSetCvt *
CharSetCvtCp950toUTF8::ReverseCvt()
{
    CharSetCvt * const opposite = new CharSetCvtUTF8toCp950;
    return opposite;
}
// Non-zero when leadByte lies in 0x81..0xfe, the range this converter
// treats as the first byte of a two-byte character.
int
CharSetCvtCp936toUTF8::isDoubleByte( int leadByte )
{
    if( leadByte < 0x81 )
	return 0;

    return leadByte <= 0xfe;
}
// Non-zero when leadByte lies in 0x81..0xfd, excluding 0xc9; those are
// the values this converter treats as the first byte of a two-byte
// character.
int
CharSetCvtCp949toUTF8::isDoubleByte( int leadByte )
{
    if( leadByte == 0xc9 )
	return 0;
    if( leadByte < 0x81 )
	return 0;

    return leadByte <= 0xfd;
}
// Non-zero when leadByte lies in 0xa1..0xc6 or in 0xc9..0xf9, the two
// ranges this converter treats as the first byte of a two-byte
// character.
int
CharSetCvtCp950toUTF8::isDoubleByte( int leadByte )
{
    const bool lowRange = ( leadByte >= 0xa1 ) && ( leadByte <= 0xc6 );
    const bool highRange = ( leadByte >= 0xc9 ) && ( leadByte <= 0xf9 );

    return lowRange || highRange;
}
/*
 * CharSetCvtCptoUTF8::Cvt - convert code page input to UTF-8 using the
 * toMap/toMapSize mapping table.
 *
 * Consumes bytes from *sourcestart (up to sourceend) and writes to
 * *targetstart (up to targetend), advancing both pointers as it goes.
 * isDoubleByte() decides whether a byte starts a two-byte character.
 * The return value is always 0; problems are reported through lasterr:
 *
 *	PARTIALCHAR - the input ends after a lead byte, or there is not
 *		      enough output room for the next character
 *	NOMAPPING   - the character has no mapping
 *
 * When an error is reported *sourcestart is left on the first byte of
 * the character that could not be processed.  charcnt/linecnt are
 * updated per character converted.
 */
int
CharSetCvtCptoUTF8::Cvt(const char **sourcestart, const char *sourceend,
	char **targetstart, char *targetend)
{
    unsigned int v;
    while (*sourcestart < sourceend && *targetstart < targetend)
    {
	v = **sourcestart & 0xff;
	int l = 0;	// 1 once a second source byte has been consumed
	if ( isDoubleByte( v ) )
	{
	    if (1 + *sourcestart >= sourceend)
	    {
		lasterr = PARTIALCHAR;
		return 0;
	    }
	    l = 1;
	    v <<= 8;
	    v |= *++*sourcestart & 0xff;
	}
	// 7-bit values are passed through without a table lookup
	if (v > 0x7f)
	    v = MapThru(v, toMap, toMapSize, 0xfffd);
	if (v == 0xfffd)
	{
	    lasterr = NOMAPPING;
	    // step back onto the lead byte
	    if (l)
		--*sourcestart;
	    return 0;
	}
	// encode v as UTF-8: three bytes, two bytes or one byte
	if (v >= 0x800)
	{
	    if (2 + *targetstart >= targetend)
	    {
		lasterr = PARTIALCHAR;
		if (l)
		    --*sourcestart;
		return 0;
	    }
	    **targetstart = 0xe0 | (v >> 12);
	    *++*targetstart = 0x80 | ((v >> 6) & 0x3f);
	    *++*targetstart = 0x80 | (v & 0x3f);
	}
	else if (v >= 0x80)
	{
	    if (1 + *targetstart >= targetend)
	    {
		lasterr = PARTIALCHAR;
		if (l)
		    --*sourcestart;
		return 0;
	    }
	    **targetstart = 0xc0 | (v >> 6);
	    *++*targetstart = 0x80 | (v & 0x3f);
	}
	else
	    **targetstart = v;
	++charcnt;
	if( v == '\n' ) {
	    ++linecnt;
	    charcnt = 0;
	}
	// both pointers still sit on the last byte handled; move past it
	++*targetstart;
	++*sourcestart;
    }
    return 0;
}
// Returns a newly allocated CharStepCP949 constructed on p.
CharStep *
CharSetCvtCp949toUTF8::FromCharStep( char *p )
{
    CharStep * const stepper = new CharStepCP949( p );
    return stepper;
}
// Returns a newly allocated CharStepCN constructed on p.
CharStep *
CharSetCvtCp936toUTF8::FromCharStep( char *p )
{
    CharStep * const stepper = new CharStepCN( p );
    return stepper;
}
// Returns a newly allocated CharStepCN constructed on p.
CharStep *
CharSetCvtCp950toUTF8::FromCharStep( char *p )
{
    CharStep * const stepper = new CharStepCN( p );
    return stepper;
}