COMMON: Support cp936

This commit is contained in:
Vladimir Serbinenko 2023-02-27 04:16:28 +01:00
parent e7a760e9c5
commit 01a71b4927
4 changed files with 122 additions and 0 deletions

View File

@ -94,6 +94,8 @@ const uint16 invalidCode = 0xFFFD;
static bool cjk_tables_loaded = false;
static const uint16 *windows932ConversionTable = 0;
static const uint16 *windows932ReverseConversionTable = 0;
static const uint16 *windows936ConversionTable = 0;
static const uint16 *windows936ReverseConversionTable = 0;
static const uint16 *windows949ConversionTable = 0;
static const uint16 *windows949ReverseConversionTable = 0;
static const uint16 *windows950ConversionTable = 0;
@ -145,6 +147,7 @@ static void loadCJKTables() {
windows949ConversionTable = loadCJKTable(f, 1, 0x7e * 0xb2);
windows950ConversionTable = loadCJKTable(f, 2, 89 * 157);
johabConversionTable = loadCJKTable(f, 3, 80 * 188);
windows936ConversionTable = loadCJKTable(f, 4, 126 * 190);
}
void releaseCJKTables() {
@ -153,6 +156,10 @@ void releaseCJKTables() {
windows932ConversionTable = 0;
delete[] windows932ReverseConversionTable;
windows932ReverseConversionTable = 0;
delete[] windows936ConversionTable;
windows936ConversionTable = 0;
delete[] windows936ReverseConversionTable;
windows936ReverseConversionTable = 0;
delete[] windows949ConversionTable;
windows949ConversionTable = 0;
delete[] windows949ReverseConversionTable;
@ -224,6 +231,49 @@ void U32String::decodeWindows932(const char *src, uint32 len) {
}
}
void U32String::decodeWindows936(const char *src, uint32 len) {
ensureCapacity(len, false);
if (!cjk_tables_loaded)
loadCJKTables();
for (uint i = 0; i < len;) {
uint8 high = src[i++];
if ((high & 0x80) == 0x00) {
operator+=(high);
continue;
}
// Euro symbol
if (high == 0x80) {
operator+=(0x20ac);
continue;
}
if (high == 0xff) {
operator+=(invalidCode);
continue;
}
uint8 low = src[i++];
if (low < 0x40 || low == 0x7f || low == 0xff) {
operator+=(invalidCode);
continue;
}
uint8 lowidx = low > 0x7f ? low - 0x41 : low - 0x40;
uint8 highidx = high - 0x81;
if (!windows936ConversionTable) {
operator+=(invalidCode);
continue;
}
uint16 val = windows936ConversionTable[highidx * 190 + lowidx];
operator+=(val ? val : invalidCode);
}
}
static uint16 convertUHCToUCSReal(uint8 high, uint8 low) {
uint lowidx = 0;
if (low >= 0x41 && low < 0x5b)
@ -462,6 +512,67 @@ StringEncodingResult String::encodeWindows932(const U32String &src, char errorCh
return encodingResult;
}
StringEncodingResult String::encodeWindows936(const U32String &src, char errorChar) {
StringEncodingResult encodingResult = kStringEncodingResultSucceeded;
ensureCapacity(src.size() * 2, false);
if (!cjk_tables_loaded)
loadCJKTables();
if (!windows936ReverseConversionTable && windows936ConversionTable) {
uint16 *rt = new uint16[0x10000]();
for (uint highidx = 0; highidx < 190; highidx++) {
uint8 high = highidx + 0x81;
for (uint lowidx = 0; lowidx < 192; lowidx++) {
uint8 low = lowidx + 0x40;
if (low >= 0x7f)
low++;
uint16 unicode = windows936ConversionTable[highidx * 190 + lowidx];
rt[unicode] = (high << 8) | low;
}
}
windows936ReverseConversionTable = rt;
}
for (uint i = 0; i < src.size();) {
uint32 point = src[i++];
if (point < 0x80) {
operator+=(point);
continue;
}
if (point > 0x10000) {
operator+=(errorChar);
encodingResult = kStringEncodingResultHasErrors;
continue;
}
if (!windows936ReverseConversionTable) {
operator+=(errorChar);
encodingResult = kStringEncodingResultHasErrors;
continue;
}
uint16 rev = windows936ReverseConversionTable[point];
if (rev != 0) {
operator+=(rev >> 8);
operator+=(rev & 0xff);
continue;
}
// This codepage contains cyrillic, so no need to transliterate
operator+=(errorChar);
encodingResult = kStringEncodingResultHasErrors;
continue;
}
return encodingResult;
}
StringEncodingResult String::encodeWindows949(const U32String &src, char errorChar) {
StringEncodingResult encodingResult = kStringEncodingResultSucceeded;
@ -882,6 +993,7 @@ getConversionTable(CodePage page) {
// Multibyte encodings. Can't be represented in simple table way
case kUtf8:
case kWindows932:
case kWindows936:
case kWindows949:
case kWindows950:
case kJohab:
@ -1010,6 +1122,8 @@ StringEncodingResult String::encodeInternal(const U32String &src, CodePage page,
return encodeUTF8(src, errorChar);
case kWindows932:
return encodeWindows932(src, errorChar);
case kWindows936:
return encodeWindows936(src, errorChar);
case kWindows949:
return encodeWindows949(src, errorChar);
case kWindows950:
@ -1050,6 +1164,9 @@ void U32String::decodeInternal(const char *str, uint32 len, CodePage page) {
case kWindows932:
decodeWindows932(str, len);
break;
case kWindows936:
decodeWindows936(str, len);
break;
case kWindows949:
decodeWindows949(str, len);
break;

View File

@ -39,6 +39,7 @@ enum CodePage {
kWindows1256,
kWindows1257,
kWindows932,
kWindows936,
kWindows949,
kWindows950,
kISO8859_1,
@ -54,6 +55,8 @@ enum CodePage {
kLatin1 = kISO8859_1,
kBig5 = kWindows950,
kGBK = kWindows936,
kEUCCN = kWindows936,
kLastEncoding = kASCII
};

View File

@ -252,6 +252,7 @@ public:
protected:
StringEncodingResult encodeUTF8(const U32String &src, char errorChar);
StringEncodingResult encodeWindows932(const U32String &src, char errorChar);
StringEncodingResult encodeWindows936(const U32String &src, char errorChar);
StringEncodingResult encodeWindows949(const U32String &src, char errorChar);
StringEncodingResult encodeWindows950(const U32String &src, bool translit, char errorChar);
StringEncodingResult encodeJohab(const U32String &src, char errorChar);

View File

@ -231,6 +231,7 @@ private:
void decodeInternal(const char *str, uint32 len, CodePage page);
void decodeOneByte(const char *str, uint32 len, CodePage page);
void decodeWindows932(const char *src, uint32 len);
void decodeWindows936(const char *src, uint32 len);
void decodeWindows949(const char *src, uint32 len);
void decodeWindows950(const char *src, uint32 len);
void decodeJohab(const char *src, uint32 len);