/*
* Copyright 2001 Perforce Software. All rights reserved.
*
*/
/*
* i18napi.cc - API support for charset conversion identifiers
*
*/
// this NEED_GETPID triggers loading windows.h
#define NEED_GETPID
#include <stdhdrs.h>
#include <strbuf.h>
#include "i18napi.h"
#include "enviro.h"
/*
 * Canonical charset names.
 *
 * Name() indexes this table with the numeric value of a CharSetApi::CharSet
 * and Lookup() returns the index of a matching entry cast to CharSet, so the
 * order of the entries is significant: it has to stay aligned with the
 * CharSet enum (declared outside this file).  The trailing numbers are the
 * table positions.
 */
static const char *charsetname[] = {
    "none",                 //  0
    "utf8",                 //  1
    "iso8859-1",            //  2
    "utf16-nobom",          //  3
    "shiftjis",             //  4
    "eucjp",                //  5
    "winansi",              //  6
    "winoem",               //  7
    "macosroman",           //  8
    "iso8859-15",           //  9
    "iso8859-5",            // 10
    "koi8-r",               // 11
    "cp1251",               // 12
    "utf16le",              // 13
    "utf16be",              // 14
    "utf16le-bom",          // 15
    "utf16be-bom",          // 16
    "utf16",                // 17
    "utf8-bom",             // 18
    "utf32-nobom",          // 19
    "utf32le",              // 20
    "utf32be",              // 21
    "utf32le-bom",          // 22
    "utf32be-bom",          // 23
    "utf32",                // 24
    "utf8unchecked",        // 25
    "utf8unchecked-bom",    // 26
    "cp949",                // 27
    "cp936",                // 28
    "cp950",                // 29
    "cp850",                // 30
    "cp858",                // 31
    "cp1253",               // 32
    "cp737",                // 33
    "iso8859-7"             // 34
};

// Number of entries in charsetname[], derived from the table itself.
static unsigned int charsetcount =
    sizeof( charsetname ) / sizeof( charsetname[0] );
/*
 * CharSetApi::CharSetCount() - number of charset names known to this file.
 *
 * Returns the entry count of the charsetname[] table.
 */
unsigned int
CharSetApi::CharSetCount()
{
    const unsigned int total = charsetcount;
    return total;
}
/*
 * CharSetApi::Name() - map a CharSet value to its textual name.
 *
 * The value is treated as an unsigned index into charsetname[].  Anything
 * outside the table (including negative values, which become large once
 * cast to unsigned) yields NULL.
 */
const char *
CharSetApi::Name(CharSetApi::CharSet c)
{
    const unsigned int index = (unsigned int) c;

    // Out-of-table values have no name.
    if( index >= charsetcount )
        return NULL;

    return charsetname[ index ];
}
/*
 * CharSetApi::Lookup() - map a textual charset name to a CharSet value.
 *
 * s   - the name to look up; the special name "auto" delegates to
 *       Discover().  A null pointer is treated as an unknown name.
 * env - passed through to Discover() when s is "auto".
 *
 * Returns the index of the matching charsetname[] entry cast to CharSet,
 * or (CharSet) -1 when the name is not recognized.  The comparison uses
 * StrRef's operator==, so names must match the table entries as that
 * operator defines equality.
 */
CharSetApi::CharSet
CharSetApi::Lookup(const char *s, Enviro *env)
{
    // A null name cannot match anything; reject it before it is wrapped
    // in a StrRef and compared (null handling inside StrRef is not
    // visible from this file, so do not rely on it).
    if( !s )
        return (CharSet) -1;

    StrRef name( s );

    if( name == "auto" )
        return Discover( env );

    for( unsigned int i = 0; i < charsetcount; ++i )
    {
        if( name == charsetname[ i ] )
            return (CharSet) i;
    }

    // Not a known charset name.
    return (CharSet) -1;
}
/*
 * CharSetApi::Discover() - guess the charset from the runtime environment.
 *
 * OS_NT builds: the console code page is queried; when GetConsoleCP()
 * reports 0 the value from GetACP() is used instead.  The code page is
 * then mapped to a CharSet, and an unmapped code page yields (CharSet) -1.
 * The enviro argument is not used on this path.
 *
 * Other builds: LANG is read, first through enviro (when one is supplied)
 * and then through getenv().  The codeset part of
 *     [language[_territory][.codeset][@modifier]]
 * is extracted and compared with StrPtr::CCompare() against a fixed list
 * of names.  A missing LANG, a LANG of exactly "C", a LANG without a
 * codeset, and an unrecognized codeset all yield UTF_8.
 */
CharSetApi::CharSet
CharSetApi::Discover(Enviro *enviro)
{
#if defined( OS_NT )

    UINT cp = GetConsoleCP();

    // A zero console code page falls back to the ANSI code page.
    if( !cp )
        cp = GetACP();

    switch( cp )
    {
    case 437:   return WIN_US_OEM;
    // NOTE(review): the 737 mapping is disabled in the original; the
    // reason is not visible in this file.
    // case 737: return CP737;
    case 850:   return CP850;
    case 858:   return CP858;
    case 932:   return SHIFTJIS;
    case 936:   return CP936;
    case 949:   return CP949;
    case 950:   return CP950;
    case 1200:  return UTF_16_LE_BOM;
    case 1201:  return UTF_16_BE_BOM;
    case 1251:  return WIN_CP_1251;
    case 1252:  return WIN_US_ANSI;
    case 1253:  return CP1253;
    case 10000: return MACOS_ROMAN;
    case 12000: return UTF_32_LE_BOM;
    case 12001: return UTF_32_BE_BOM;
    case 20866: return KOI8_R;
    case 20932: return EUCJP;
    case 28591: return ISO8859_1;
    case 28595: return ISO8859_5;
    case 28605: return ISO8859_15;
    case 65001: return UTF_8;
    default:    break;
    }

    // Code page with no mapping.
    return (CharSet) -1;

#else

    // Codeset names recognized in LANG, tried in this order.
    static const struct {
        const char *codeset;
        CharSet     charset;
    } known[] = {
        { "ISO8859-1",       ISO8859_1  },
        { "ISO8859-5",       ISO8859_5  },
        { "ISO8859-7",       ISO8859_7  },
        { "ISO8859-15",      ISO8859_15 },
        { "JISX0201.1976-0", SHIFTJIS   },
        { "JISX0208.1983-0", SHIFTJIS   },
        { "EUC-JP",          EUCJP      },
        { "UTF-8",           UTF_8      },
        { "GB2312.1980-0",   CP936      },
        { "GB18030",         CP936      },
        { "KSC5601.1987-0",  CP949      }
    };

    // Prefer the value held by the Enviro object, then the process
    // environment.
    const char *lang = NULL;

    if( enviro )
        lang = enviro->Get( "LANG" );

    if( !lang )
        lang = getenv( "LANG" );

    // Without LANG, or with LANG set to just "C", assume UTF-8.
    if( !lang || ( lang[0] == 'C' && lang[1] == '\0' ) )
        return UTF_8;

    // The codeset starts right after the first '.'; no '.' means no
    // codeset was given.
    // See http://osr507doc.sco.com/en/man/html.M/locale.M.html
    const char *dot = strchr( lang, '.' );

    if( !dot )
        return UTF_8;

    // The codeset runs up to the optional '@modifier', or to the end of
    // the string when there is no modifier.
    const char *codeset = dot + 1;
    const char *modifier = strchr( codeset, '@' );
    int codesetLen;

    if( modifier )
        codesetLen = (int)( modifier - codeset );
    else
        codesetLen = (int) strlen( codeset );

    // Copy the codeset out so it can be compared as a terminated string.
    StrBuf extracted;
    extracted.Set( codeset, codesetLen );
    const char *wanted = extracted.Text();

    for( unsigned int i = 0; i < sizeof( known ) / sizeof( known[0] ); ++i )
    {
        if( !StrPtr::CCompare( wanted, known[ i ].codeset ) )
            return known[ i ].charset;
    }

    // Unrecognized codeset.
    return UTF_8;

#endif
}
/*
 * CharSetApi::isUnicode() - is the charset one of the UTF variants?
 *
 * Returns 1 for every UTF-8, UTF-16 and UTF-32 CharSet value listed
 * below, and 0 for anything else.
 */
int
CharSetApi::isUnicode(CharSetApi::CharSet c)
{
    switch( c )
    {
    // UTF-8 family
    case UTF_8:         case UTF_8_BOM:
    case UTF_8_UNCHECKED:
    case UTF_8_UNCHECKED_BOM:

    // UTF-16 family
    case UTF_16:        case UTF_16_BOM:
    case UTF_16_LE:     case UTF_16_LE_BOM:
    case UTF_16_BE:     case UTF_16_BE_BOM:

    // UTF-32 family
    case UTF_32:        case UTF_32_BOM:
    case UTF_32_LE:     case UTF_32_LE_BOM:
    case UTF_32_BE:     case UTF_32_BE_BOM:
        return 1;

    default:
        break;
    }

    return 0;
}
/*
 * CharSetApi::Granularity() - unit size associated with a charset.
 *
 * Returns 2 for UTF_16 and for values in the range UTF_16_LE..UTF_16_BOM,
 * 4 for values in the range UTF_32..UTF_32_BOM, 1 for any other value that
 * falls inside the charsetname[] table, and 0 for values outside it.
 * (Presumably the result is the code-unit width in bytes -- confirm with
 * callers.)
 *
 * The range tests depend on the ordering of the CharSet enum, which is
 * declared outside this file.
 */
int
CharSetApi::Granularity(CharSetApi::CharSet c)
{
    const bool inUtf16Range = ( c >= UTF_16_LE ) && ( c <= UTF_16_BOM );

    if( ( c == UTF_16 ) || inUtf16Range )
        return 2;

    const bool inUtf32Range = ( c >= UTF_32 ) && ( c <= UTF_32_BOM );

    if( inUtf32Range )
        return 4;

    // Any remaining known charset counts as 1; unknown values get 0.
    const bool known = (unsigned int) c < charsetcount;

    return known ? 1 : 0;
}