TextEncoder.cpp #1

#include "StdAfx.h"
#include "TextEncoder.h"
#include <String.h>

TextEncoder::TextEncoder(void)
{
}

TextEncoder::~TextEncoder(void)
{
}

void TextEncoder::SmokeTest(void)
{
    // test mode fo Unicode transcoding
    //wchar_t * wTestStr = L"The € quick ¢ brown fox jumped the fence <АБВГ>";
    wchar_t * wTestStr = L"\xD852\xDF62 The € quick ¢ brown fox jumped the fence <АБВГ>";
    //wchar_t * wTestStr = L"\xD852\xDF62 T €<АБВГ>";

    unsigned long uniCodeLong = 0x00024B62; // a Unicode point that requires a surrogate pair to encode in UTF-16
        
    SurrogatePair sp =  TextEncoder::UnicodePointToSurrogatePair(uniCodeLong);
    unsigned long uniCodeTest = TextEncoder::SurrogatePairToUnicodePoint(sp);

    wprintf(L"UTF-16 String: %s\r\n", wTestStr);

    char * aTestStr = TextEncoder::Utf16ToUtf8(wTestStr);
    printf("UTF-8 Translation: %s\r\n", aTestStr);

    wchar_t * wResultStr = TextEncoder::Utf8ToUtf16(aTestStr);

    wprintf(L"UTF-16 Retranslation: %s\r\n", wResultStr);
}

char * TextEncoder::CopyStr(const char *s)
{
    int cnt = strlen(s);
    char * newStr = new char[cnt+1];
    for (int idx = 0; idx < cnt; idx++)
        newStr[idx] = s[idx];
    newStr[cnt] = '\0';
    return newStr;
}

wchar_t * TextEncoder::CopyWStr(const wchar_t *s)
{
    int cnt = wcslen(s);
    wchar_t * newStr = new wchar_t[cnt+1];
    for (int idx = 0; idx < cnt; idx++)
        newStr[idx] = s[idx];
    newStr[cnt] = '\0';
    return newStr;
}

SurrogatePair TextEncoder::UnicodePointToSurrogatePair(unsigned long codePoint)
{
    SurrogatePair val;
    val.first = 0;
    val.second = 0;

    // need to encode an up to 21 bit code point into a two wchar surrogate pair.
        
    codePoint -= 0x10000; //this gives us a 20 bit number, yyyyyyyyyyxxxxxxxxxx

    // yyyyyyyyyyxxxxxxxxxx -> the two surrogate pair using this transfermation
    //              110110yyyyyyyyyy (0xD800 + yyyyyyyyyyy) 110111xxxxxxxxxx (0xDC00 +yyyyyyyyyy)

    val.first = 0xD800 | ((codePoint >> 10) & 0x03FF); //yyyyyyyyyy goes in the first of the pair

    val.second = 0xDC00 | (codePoint & 0x03FF); //xxxxxxxxxx goes in the second of the pair

    return val;
}

unsigned long TextEncoder::SurrogatePairToUnicodePoint(wchar_t first, wchar_t second)
{
    unsigned long val = 0;

    // get the 20 bit value from the pair
    //  110110yyyyyyyyyy 110111xxxxxxxxxx --> 000000000000YYYY yyyyyyxxxxxxxxxx (32 bit number, need only 20 bits)
    val = ((unsigned long)(0x03FF & first)) << 10; // get yyyyyyyyyy in the right place
    val |= ((unsigned long)(0x03FF & second)); //get xxxxxxxxxx

    val += 0x10000;

    return val;
}

unsigned long TextEncoder::SurrogatePairToUnicodePoint(SurrogatePair surrogates)
{
    return SurrogatePairToUnicodePoint(surrogates.first, surrogates.second);
}

void TextEncoder::RecodeUtf16CharInUtf8(const wchar_t * pSrc, char * pDest)
{
    // check two see if this is a sing UTF-16 wchar or a surrogate pair
    if (( *pSrc < 0xD800 ) || ( *pSrc > 0xDFFF ))
    {
        //single wchar 
        if (*pSrc < 128)
        {
            // single byte ASCII UTF16 character
            *pDest = (char) *pSrc;
            return;
        }
        if (*pSrc < 0x0800)
        {
            // two byte UTF8 character
            // 00000yyy XXxxxxxx --> 110yyyXX 10xxxxxx
            
            // first char
            char c = 0xC0 | (char)(((wchar_t)(*pSrc >> 6)) & 0x1F); 
            *pDest = c;

            //second char
            c = 0x80 | (char)(*pSrc & 0x3F);
            *(pDest + 1) = c;
            return;
        }
        else
        {
            // three byte UTF8 character
            // YYYYyyyy XXxxxxxx --> 1110YYYY 10yyyyXX 10xxxxxx
            
            // first char
            char c = 0xE0 | (((wchar_t)(*pSrc >> 12)) & 0x0F); 
            *pDest = c;

            //second char
            c = 0x80 | (char)(((wchar_t)(*pSrc >> 6)) & 0x3F);
            *(pDest + 1) = c;

            //third char
            c = 0x80 | (char)(*pSrc & 0x3F);
            *(pDest + 2) = c;
            return;
        }
    }

    // this is a two wchar surrogate pair and it's going to take 4 bytes

    // get the 20 bit value from the pair
    //  110110YYYYyyyyyy 110111xxxxxxxxxx --> 000000000000YYYY yyyyyyxxxxxxxxxx +0x10000 (32 bit number, need only 21 bits)
    unsigned long uCodePoint = SurrogatePairToUnicodePoint(*pSrc, *(pSrc + 1));

    // encode the now bits into 4 UTF-8 bytes
    // 000ZZZzz YYYYyyyy XXxxxxxx --> 11110ZZZ 10zzYYYY 10yyyyXX 10xxxxxx

    // first char
    char c = 0xF0 | (char)((unsigned long)(((uCodePoint >> 18)) & 0x07)); 
    *pDest = c;

    //second char
    c = 0x80 | (char)((unsigned long)(((uCodePoint >> 12)) & 0x3F));
    *(pDest + 1) = c;

    //third char
    c = 0x80 | (char)((unsigned long)(((uCodePoint >> 6)) & 0x3F));
    *(pDest + 2) = c;

    //fourth char
    c = 0x80 | (char)((unsigned long)(uCodePoint & 0x3F));
    *(pDest + 3) = c;

    return;
}

void TextEncoder::RecodeUtf8CharInUtf16(const char * pSrc, wchar_t * pDest)
{

    if ( !(*pSrc & 0x80) ) // ASCII character < 128
    {
        *pDest = (wchar_t) *pSrc;
        return;
    }
    unsigned int w1 = 0;
    unsigned long w2 = 0;

    if ( (*pSrc & 0xE0) == 0xC0) // 110xxxxx 2 bytes in UTF-8
    {
        // 110yyyXX 10xxxxxx UTF-8 -->  00000yyy XXxxxxxx UTF-16

        w1 = ((wchar_t)(*pSrc & 0x1C)) << 6; // puts the yyy in the lower bits of the upper byte where they belong
        w1 &= 0xFF00; // mask off the lower byte

        w1 |= ((*pSrc & 0x03)) << 6; // get the two XX bits from the first utf-8 character
                                   // into the right bits of the lower byte
        w1 |= (*(pSrc+1) & 0x3F);  // copy the remaining xxxxxx bits from the 2nd UTF-8 byte

        *pDest = (wchar_t) w1;

        return;
    }
    if ( (*pSrc & 0xF0) == 0xE0) // 1110xxxx 3 bytes in UTF-8
    {
        // 1110YYYY 10yyyyXX 10xxxxxx UTF-8 -->  YYYYyyyy XXxxxxxx UTF-16

        w1 = ((wchar_t)(*pSrc & 0x0F)) << 12; // puts the YYYY in the upper bits of the upper byte

        w1 |= ((wchar_t)(*(pSrc + 1) & 0x3C)) << 6; //puts yyyy in the lower bits of the upper byte

        w1 |= ((wchar_t)(*(pSrc + 1) & 0x03)) << 6; // get the two XX bits from the second utf-8 character
                                       // into the upper bits of the lower byte

        w1 |= (*(pSrc+2) & 0x2F);  // copy the remaining xxxxxx bits from the 3rd UTF-8 byte

        *pDest = (wchar_t) w1;

        return;
    }
    if ( (*pSrc & 0xF1) == 0xF0) // 11110xxx
    {
        // 11110ZZZ 10zzYYYY 10yyyyXX 10xxxxxx UTF-8 --> 000000 000ZZZzz YYYYyyyy XXxxxxxx UTF-16 (extended code plane)

        w2 = ((unsigned long)(*pSrc & 0x07)) << 2; // puts the ZZZ in the middle bits of the lower byte
        w2 |= ((unsigned long)(*(pSrc + 1) & 0x30)) >> 4; // puts the zz in the lower bits of the lower byte
        w2 <<= 16; //Shift the lower byte to the third byte

        w1 = ((unsigned long)(*(pSrc + 1) & 0x0F)) << 4; // puts the YYYY in the upper bits of the lower byte
        w1 |= ((unsigned long)(*(pSrc + 2) & 0x3C)) >> 2; //puts yyyy in the lower bits of the lower byte
        w1 <<= 8; // shift into the upper byte where they belong

        w1 |= ((unsigned long)(*(pSrc+2) & 0x03)) << 6; // get the two XX bits from the 3rd utf-8 character
                                       // into the right bits of the lower byte

        w1 |= ((unsigned long)(*(pSrc+3)) & 0x2F);  // copy the remaining xxxxxx bits from the 4th UTF-8 byte

        w2 |= w1; // put both bytes in w1 we now have up to a 21 bit number


        // need to encode the 21 bit Unicode point into a two wchar surrogate pair.
        SurrogatePair sp = UnicodePointToSurrogatePair(w2);

        *pDest = sp.first;

        *(pDest + 1) = sp.second;

        return;
    }
    // Not a valid Utf-8 leading byte

    *pDest = (wchar_t) 0xFFFD;

    return;

}
wchar_t * TextEncoder::Utf8ToUtf16(const char * pStr)
{
    int cnt = Utf8StrLen( pStr );

    // initially allow space for each character to take 4 bytes
    wchar_t * pCpy = new wchar_t[(2 * cnt) + 1];

    // copy each 1,2,3, or 4 byte utf-8 encoded character into the proper
    // sized (1 or 2 wchar_t) utf-16 encoded character
    int idx1 = 0;
    int idx2 = 0;
    for (int idx = 0; idx < cnt; idx++)
    {
        RecodeUtf8CharInUtf16(&pStr[idx2], &pCpy[idx1]);

        idx1 += Utf16CharSize(pCpy[idx1]);
        idx2 += Utf8CharSize(pStr[idx2]);
    }
    pCpy[idx1] = '\0';

    // copy into the 'right' sized array
    wchar_t * pRet = CopyWStr(pCpy);
    delete pCpy;

    return pRet;
}

wchar_t * TextEncoder::AsciiToUtf16(const char * pStr)
{
    int cnt = strlen( pStr );

    wchar_t * pCpy = new wchar_t[cnt + 1]; 

    for (int idx = 0; idx < cnt; idx++)
    {
        pCpy[idx] = (wchar_t) pStr[idx];
    }
    pCpy[cnt] = '\0';
    return pCpy;

}

int TextEncoder::Utf16StrLen(const wchar_t * pStr)  // length in characters, not bytes
{
    int cnt = 0;
    int idx = 0;
    while ( pStr[idx] != '\0')
     {
         idx += Utf16CharSize(pStr[idx]);
         cnt++;
    }
    return cnt;
}

int TextEncoder::Utf16StrBytes(const wchar_t * pStr)  // length in bytes, not characters
{
    int cnt = 0;
    while ( pStr[cnt] != '\0')
        cnt += 2;
    return cnt;
}

int TextEncoder::Utf16CharSize(const wchar_t c)  // How many wchar_t's in this character, 1 or 2?
{
    if (( c >= 0xD800 ) && ( c <= 0xDFFF ))
        return 2;
    return 1;
}

char * TextEncoder::Utf16ToUtf8(const wchar_t * pStr)
{
    int cnt = Utf16StrLen(pStr);

    // allow 4 bytes per character (max length possible)
    char * pCpy = new char[(4 * cnt) + 1];

    // copy each 1 or 2 wchar character into a 1,2,3, or 4 byte utf-8 encoded character
    int idx1 = 0;
    int idx2 = 0;
    for (int idx = 0; idx < cnt; idx++)
    {
        RecodeUtf16CharInUtf8(&pStr[idx2], &pCpy[idx1]);

        idx1 += Utf8CharSize(pCpy[idx1]);
        idx2 += Utf16CharSize(pStr[idx2]);
    }
    pCpy[idx1] = '\0';

    // copy into the 'right' sized array
    char * pRet = CopyStr(pCpy);
    delete pCpy;

    return pRet;
}
char * TextEncoder::AsciiToUtf8(const char * pStr)
{
    // An Ascii string is already utf-8, so just copy it
    return CopyStr(pStr);
}

// length in characters, not bytes
int TextEncoder::Utf8StrLen(const char* pStr)
{
    int cnt = 0;
    int idx = 0;
    while ( pStr[idx] != '\0')
     {
         idx += Utf8CharSize(pStr[idx]);
         cnt++;
    }
    return cnt;
}
 
// length in bytes, not characters
int TextEncoder::Utf8StrBytes(const char * pStr)
{
    int cnt = 0;
    while ( pStr[cnt] != '\0')
        cnt++;
    return cnt;
}

// How many bytes in this character, 1, 2, 3, or 4?
int TextEncoder::Utf8CharSize(char c)
{
    if ( !(c & 0x80) ) // ASCII character < 128
        return 1;
    if ( (c & 0xE0) == 0xC0) // 110xxxxx
        return 2;
    if ( (c & 0xF0) == 0xE0) // 1110xxxx
        return 3;
    if ( (c & 0xF1) == 0xF0) // 11110xxx
        return 4;

    // Not a valid Utf-8 leading byte
    return 1;
}

char * TextEncoder::Utf16ToAscii(const wchar_t * pStr)
{
    int cnt = Utf16StrLen(pStr);

    char * pCpy = new char[cnt + 1];

    int idx2 = 0;
    for (int idx = 0; idx < cnt; idx++)
    {
        if (pStr[idx2] < 128)
            pCpy[idx] = (char) pStr[idx2];
        else
            pCpy[idx] = '?'; // not mapable

        idx2 += Utf16CharSize(pStr[idx2]);
    }
    pCpy[cnt] = '\0';
    return pCpy;
}

char * TextEncoder::Utf8ToAscii(const char * pStr)
{
    int cnt = Utf8StrLen(pStr);

    char * pCpy = new char[cnt + 1];

    int idx2 = 0;
    for (int idx = 0; idx < cnt; idx++)
    {
        if (((unsigned char) pStr[idx2]) < 128)
            pCpy[idx] = (char) pStr[idx2];
        else
            pCpy[idx] = '?'; // not mapable

        idx2 += Utf8CharSize(pStr[idx2]);
    }
    pCpy[cnt] = '\0';
    return pCpy;
}
#	Change	User	Description
#1	13946	OmegaNemesis28	Merging //guest/perforce_software/p4api.net/... to //guest/omeganemesis28/p4api.net/...
//guest/perforce_software/p4api.net/p4bridge-unit-test/TextEncoder.cpp
#2	8964	Bill	fix line endings
#1	8873	Matt Attaway	Initial add of the P4API.NET source code