/* ScummVM - Graphic Adventure Engine
 *
 * ScummVM is the legal property of its developers, whose names
 * are too numerous to list here. Please refer to the COPYRIGHT
 * file distributed with this source distribution.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 *
 */

#include "common/str.h"
#include "common/ustr.h"
#include "common/util.h"
#include "common/endian.h"
#include "common/error.h"
#include "common/system.h"
#include "common/enc-internal.h"
#include "common/file.h"

namespace Common {

//
//TODO: This is a quick and dirty converter. Refactoring needed:
// 1. Original version has an option for performing strict / nonstrict
//    conversion for the 0xD800...0xDFFF interval
// 2. Original version returns a result code. This version does NOT
//    insert 'FFFD' on errors & does not inform caller on any errors
//
// More comprehensive one lives in wintermute/utils/convert_utf.cpp
//
// Appends the code points decoded from the len bytes at src to this string.
// Continuation bytes are not validated, and a sequence that is cut off by
// the end of the buffer stops the conversion without appending anything
// for it.
void U32String::decodeUTF8(const char *src, uint32 len) {
	ensureCapacity(len, false);

	// The String class, and therefore the Font class as well, assume one
	// character is one byte, but in this case it's actually an UTF-8
	// string with up to 4 bytes per character. To work around this,
	// convert it to an U32String before drawing it, because our Font class
	// can handle that.
	for (uint i = 0; i < len;) {
		uint32 chr = 0;
		uint num = 1;

		// The lead byte's high bits give the length of the sequence.
		if ((src[i] & 0xF8) == 0xF0) {
			num = 4;
		} else if ((src[i] & 0xF0) == 0xE0) {
			num = 3;
		} else if ((src[i] & 0xE0) == 0xC0) {
			num = 2;
		}

		if (len - i >= num) {
			switch (num) {
			case 4:
				chr |= (src[i++] & 0x07) << 18;
				chr |= (src[i++] & 0x3F) << 12;
				chr |= (src[i++] & 0x3F) << 6;
				chr |= (src[i++] & 0x3F);
				break;

			case 3:
				chr |= (src[i++] & 0x0F) << 12;
				chr |= (src[i++] & 0x3F) << 6;
				chr |= (src[i++] & 0x3F);
				break;

			case 2:
				chr |= (src[i++] & 0x1F) << 6;
				chr |= (src[i++] & 0x3F);
				break;

			default:
				chr = (src[i++] & 0x7F);
				break;
			}
		} else {
			// Truncated sequence at the end of the input: give up.
			break;
		}

		operator+=(chr);
	}
}

// Code point appended whenever a byte sequence cannot be converted.
const uint16 invalidCode = 0xFFFD;

// Set as soon as a load of encoding.dat has been attempted, whether or not
// it succeeded, so that the file is only looked for once.
static bool cjk_tables_loaded = false;
// Double-byte -> Unicode tables. Each stays null when it could not be loaded.
static const uint16 *windows932ConversionTable;
static const uint16 *windows949ConversionTable;
static const uint16 *windows950ConversionTable;

// Reads table number idx from f: the table's file offset is taken from the
// list of 32-bit little-endian offsets that starts at byte 16, then sz
// 16-bit little-endian entries are read from that offset.
// Returns a new[]-allocated array of sz entries in native byte order, or
// nullptr when the file does not contain sz complete entries.
static const uint16 *loadCJKTable(File &f, int idx, size_t sz) {
	f.seek(16 + idx * 4);
	uint32 off = f.readUint32LE();
	f.seek(off);

	const uint32 byteCount = (uint32)(2 * sz);
	uint16 *res = new uint16[sz];
	if (f.read(res, byteCount) != byteCount) {
		// A short read would leave part of the table uninitialized.
		warning("encoding.dat is truncated. CJK table %d is disabled", idx);
		delete[] res;
		return nullptr;
	}

#ifndef SCUMM_LITTLE_ENDIAN
	for (uint i = 0; i < sz; i++)
		res[i] = FROM_LE_16(res[i]);
#endif

	return res;
}

// Loads the three CJK conversion tables from encoding.dat. On any failure a
// warning is printed and the affected table pointers stay null.
static void loadCJKTables() {
	File f;

	// Mark the attempt first so that a failure is not retried on every call.
	cjk_tables_loaded = true;

	if (!f.open("encoding.dat")) {
		warning("encoding.dat is not found. Support for CJK is disabled");
		return;
	}

	// Header (16 bytes) plus one offset per table.
	if (f.size() < 16 + 3 * 4) {
		warning("encoding.dat is invalid. Support for CJK is disabled");
		return;
	}

	if (f.readUint32BE() != MKTAG('S', 'C', 'V', 'M')
	    || f.readUint32BE() != MKTAG('E', 'N', 'C', 'D')) {
		warning("encoding.dat is invalid. Support for CJK is disabled");
		return;
	}

	// Version and number of tables.
	if (f.readUint32LE() != 0 || f.readUint32LE() < 3) {
		warning("encoding.dat is of incompatible version. Support for CJK is disabled");
		return;
	}

	windows932ConversionTable = loadCJKTable(f, 0, 47 * 192);
	windows949ConversionTable = loadCJKTable(f, 1, 0x7e * 0xb2);
	windows950ConversionTable = loadCJKTable(f, 2, 89 * 157);
}

// Appends the Windows-932 text at src (len bytes) to this string. Anything
// that cannot be converted, including every double-byte character when the
// table is unavailable, becomes invalidCode.
void U32String::decodeWindows932(const char *src, uint32 len) {
	ensureCapacity(len, false);

	if (!cjk_tables_loaded)
		loadCJKTables();

	for (uint i = 0; i < len;) {
		uint8 high = src[i++];

		if ((high & 0x80) == 0x00) {
			operator+=(high);
			continue;
		}

		// Katakana
		if (high >= 0xa1 && high <= 0xdf) {
			operator+=(high - 0xa1 + 0xFF61);
			continue;
		}

		// Lead byte without a trail byte.
		if (i >= len) {
			operator+=(invalidCode);
			continue;
		}

		uint8 low = src[i++];
		if (low < 0x40) {
			operator+=(invalidCode);
			continue;
		}
		uint8 lowidx = low - 0x40;
		uint8 highidx;

		// The table packs the four used lead-byte ranges into 47 rows.
		if (high >= 0x81 && high < 0x85)
			highidx = high - 0x81;
		else if (high >= 0x87 && high < 0xa0)
			highidx = high - 0x87 + 4;
		else if (high >= 0xe0 && high < 0xef)
			highidx = high - 0xe0 + 29;
		else if (high >= 0xfa && high < 0xfd)
			highidx = high - 0xfa + 44;
		else {
			operator+=(invalidCode);
			continue;
		}

		if (!windows932ConversionTable) {
			operator+=(invalidCode);
			continue;
		}

		// Main range
		uint16 val = windows932ConversionTable[highidx * 192 + lowidx];
		operator+=(val ? val : invalidCode);
	}
}

// Looks up one Windows-949 (UHC) double-byte character in the table.
// Returns 0 when the byte pair is outside the table or the table is not
// loaded. Does not trigger loading of the tables.
static uint16 convertUHCToUCSReal(uint8 high, uint8 low) {
	// The table has 0x7e rows, for lead bytes 0x81..0xfe. Anything else
	// would index outside of it.
	if (high < 0x81 || high > 0xfe)
		return 0;

	// The three trail-byte ranges are packed into 0xb2 columns.
	uint lowidx = 0;
	if (low >= 0x41 && low < 0x5b)
		lowidx = low - 0x41;
	else if (low >= 0x61 && low < 0x7b)
		lowidx = low - 0x61 + 0x1a;
	else if (low >= 0x81 && low < 0xff)
		lowidx = low - 0x81 + 0x1a * 2;
	else
		return 0;

	if (!windows949ConversionTable)
		return 0;

	uint idx = (high - 0x81) * 0xb2 + lowidx;
	return windows949ConversionTable[idx];
}

// Same as convertUHCToUCSReal(), but loads the tables on first use.
uint16 convertUHCToUCS(uint8 high, uint8 low) {
	if (!cjk_tables_loaded)
		loadCJKTables();

	return convertUHCToUCSReal(high, low);
}

// Appends the Windows-949 text at src (len bytes) to this string, using
// invalidCode for everything that cannot be converted.
void U32String::decodeWindows949(const char *src, uint32 len) {
	ensureCapacity(len, false);

	if (!cjk_tables_loaded)
		loadCJKTables();

	for (uint i = 0; i < len;) {
		uint8 high = src[i++];

		if ((high & 0x80) == 0x00) {
			operator+=(high);
			continue;
		}

		if (high == 0x80 || high == 0xff) {
			operator+=(invalidCode);
			continue;
		}

		// Lead byte without a trail byte.
		if (i >= len) {
			operator+=(invalidCode);
			continue;
		}

		uint8 low = src[i++];
		uint16 val = convertUHCToUCSReal(high, low);

		operator+=(val ? val : invalidCode);
	}
}

// Appends the Windows-950 text at src (len bytes) to this string, using
// invalidCode for everything that cannot be converted.
void U32String::decodeWindows950(const char *src, uint32 len) {
	ensureCapacity(len, false);

	if (!cjk_tables_loaded)
		loadCJKTables();

	for (uint i = 0; i < len;) {
		uint8 high = src[i++];

		if ((high & 0x80) == 0x00) {
			operator+=(high);
			continue;
		}

		// Euro symbol
		if (high == 0x80) {
			operator+=(0x20ac);
			continue;
		}

		if (high == 0xff) {
			operator+=(invalidCode);
			continue;
		}

		// Lead byte without a trail byte.
		if (i >= len) {
			operator+=(invalidCode);
			continue;
		}

		uint8 low = src[i++];

		// Each row has 157 columns: 0x40..0x7e followed by 0xa1..0xfe,
		// which are also the only trail bytes encodeWindows950() emits.
		// Any other trail byte would give a column outside the row (and,
		// for the last row, outside the table), so it is rejected here.
		uint8 lowidx;
		if (low >= 0x40 && low <= 0x7e) {
			lowidx = low - 0x40;
		} else if (low >= 0xa1 && low <= 0xfe) {
			lowidx = low - 0xa1 + 0x3f;
		} else {
			operator+=(invalidCode);
			continue;
		}

		// Main range
		if (high >= 0xa1 && high < 0xfa) {
			uint16 val = windows950ConversionTable ?
				windows950ConversionTable[(high - 0xa1) * 157 + lowidx] : 0;

			operator+=(val ? val : invalidCode);
			continue;
		}

		// PUA range
		if (high <= 0x8d) {
			operator+=(0xeeb8 + 157 * (high-0x81) + lowidx);
			continue;
		}
		if (high <= 0xa0) {
			operator+=(0xe311 + (157 * (high-0x8e)) + lowidx);
			continue;
		}
		if (high >= 0xfa) {
			operator+=(0xe000 + (157 * (high-0xfa)) + lowidx);
			continue;
		}
	}
}

// Appends src to this string encoded as Windows-932. Code points without a
// mapping are written as '?'.
void String::encodeWindows932(const U32String &src) {
	// Unicode -> double-byte lookup, built from the decoding table on first
	// use and kept for the lifetime of the program.
	static uint16 *reverseTable;

	ensureCapacity(src.size() * 2, false);

	if (!cjk_tables_loaded)
		loadCJKTables();

	if (!reverseTable && windows932ConversionTable) {
		uint16 *rt = new uint16[0x10000];
		memset(rt, 0, sizeof(rt[0]) * 0x10000);
		for (uint highidx = 0; highidx < 47; highidx++) {
			// Inverse of the row packing done in decodeWindows932().
			uint8 high = 0;
			if (highidx < 4)
				high = highidx + 0x81;
			else if (highidx < 29)
				high = highidx + 0x87 - 4;
			else if (highidx < 44)
				high = highidx + 0xe0 - 29;
			else
				high = highidx + 0xfa - 44;

			for (uint lowidx = 0; lowidx < 192; lowidx++) {
				uint8 low = lowidx + 0x40;
				uint16 unicode = windows932ConversionTable[highidx * 192 + lowidx];

				rt[unicode] = (high << 8) | low;
			}
		}
		reverseTable = rt;
	}

	for (uint i = 0; i < src.size();) {
		uint32 point = src[i++];

		if (point < 0x80) {
			operator+=(point);
			continue;
		}

		// Katakana
		if (point >= 0xff61 && point <= 0xff9f) {
			operator+=(0xa1 + (point - 0xFF61));
			continue;
		}

		// The reverse table has exactly 0x10000 entries, so 0x10000 itself
		// is already out of range.
		if (point >= 0x10000) {
			operator+=('?');
			continue;
		}

		if (!reverseTable) {
			operator+=('?');
			continue;
		}

		uint16 rev = reverseTable[point];
		if (rev != 0) {
			operator+=(rev >> 8);
			operator+=(rev & 0xff);
			continue;
		}

		// This codepage contains cyrillic, so no need to transliterate

		operator+=('?');
		continue;
	}
}

// Appends src to this string encoded as Windows-949. Code points without a
// mapping are written as '?'.
void String::encodeWindows949(const U32String &src) {
	// Unicode -> double-byte lookup, built from the decoding table on first
	// use and kept for the lifetime of the program.
	static const uint16 *reverseTable;

	ensureCapacity(src.size() * 2, false);

	if (!cjk_tables_loaded)
		loadCJKTables();

	if (!reverseTable && windows949ConversionTable) {
		uint16 *rt = new uint16[0x10000];
		memset(rt, 0, sizeof(rt[0]) * 0x10000);
		for (uint lowidx = 0; lowidx < 0xb2; lowidx++) {
			// Inverse of the column packing done in convertUHCToUCSReal().
			uint8 low = 0;
			if (lowidx < 0x1a)
				low = 0x41 + lowidx;
			else if (lowidx < 0x1a * 2)
				low = 0x61 + lowidx - 0x1a;
			else
				low = 0x81 + lowidx - 0x1a * 2;

			for (uint highidx = 0; highidx < 0x7e; highidx++) {
				uint8 high = highidx + 0x81;
				uint16 unicode = windows949ConversionTable[highidx * 0xb2 + lowidx];

				rt[unicode] = (high << 8) | low;
			}
		}
		reverseTable = rt;
	}

	for (uint i = 0; i < src.size();) {
		uint32 point = src[i++];

		if (point < 0x80) {
			operator+=(point);
			continue;
		}

		// The reverse table has exactly 0x10000 entries, so 0x10000 itself
		// is already out of range.
		if (point >= 0x10000 || !reverseTable) {
			operator+=('?');
			continue;
		}

		uint16 rev = reverseTable[point];
		if (rev == 0) {
			// This codepage contains cyrillic, so no need to transliterate
			operator+=('?');
			continue;
		}

		operator+=(rev >> 8);
		operator+=(rev & 0xff);
	}
}

// ASCII replacements for the code points 0x400..0x45f, indexed by
// (code point - 0x400).
static const char g_cyrillicTransliterationTable[] = {
	' ', 'E', 'D', 'G', 'E', 'Z', 'I', 'I', 'J', 'L', 'N', 'C', 'K', 'I', 'U', 'D',
	'A', 'B', 'V', 'G', 'D', 'E', 'Z', 'Z', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
	'R', 'S', 'T', 'U', 'F', 'H', 'C', 'C', 'S', 'S', '\"', 'Y', '\'', 'E', 'U', 'A',
	'a', 'b', 'v', 'g', 'd', 'e', 'z', 'z', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p',
	'r', 's', 't', 'u', 'f', 'h', 'c', 'c', 's', 's', '\"', 'y', '\'', 'e', 'u', 'a',
	'e', 'e', 'd', 'g', 'e', 'z', 'i', 'i', 'j', 'l', 'n', 'c', 'k', 'i', 'u', 'd',
};

// Appends an ASCII stand-in for point: a space for 0xa0, '-' for 0xad,
// 'N' for 0x2116, a table entry for 0x401..0x45f, and '?' for everything
// else.
void String::translitChar(U32String::value_type point) {
	if (point == 0xa0) {
		operator+=(' ');
		return;
	}

	if (point == 0xad) {
		operator+=('-');
		return;
	}

	if (point == 0x2116) {
		operator+=('N');
		return;
	}

	if (point >= 0x401 && point <= 0x45f) {
		operator+=(g_cyrillicTransliterationTable[point - 0x400]);
		return;
	}

	operator+=('?');
}

// Appends src to this string encoded as Windows-950. Code points without a
// mapping are passed to translitChar() when transliterate is set and are
// written as '?' otherwise.
void String::encodeWindows950(const U32String &src, bool transliterate) {
	// Unicode -> double-byte lookup, built from the decoding table on first
	// use and kept for the lifetime of the program. PUA results computed
	// below are stored in it as well.
	static uint16 *reverseTable;

	ensureCapacity(src.size() * 2, false);

	if (!cjk_tables_loaded)
		loadCJKTables();

	if (!reverseTable && windows950ConversionTable) {
		uint16 *rt = new uint16[0x10000];
		memset(rt, 0, sizeof(rt[0]) * 0x10000);
		for (uint lowidx = 0; lowidx < 157; lowidx++) {
			// Inverse of the column packing done in decodeWindows950().
			uint8 low = 0;
			if (lowidx < 0x3f)
				low = 0x40 + lowidx;
			else
				low = 0xa1 + lowidx - 0x3f;

			for (uint highidx = 0; highidx < 89; highidx++) {
				uint8 high = highidx + 0xa1;
				uint16 unicode = windows950ConversionTable[highidx * 157 + lowidx];

				rt[unicode] = (high << 8) | low;
			}
		}
		reverseTable = rt;
	}

	for (uint i = 0; i < src.size();) {
		uint32 point = src[i++];

		if (point < 0x80) {
			operator+=(point);
			continue;
		}

		// The reverse table has exactly 0x10000 entries, so 0x10000 itself
		// is already out of range.
		if (point >= 0x10000) {
			operator+=('?');
			continue;
		}

		// Euro symbol
		if (point == 0x20ac) {
			operator+=((char) 0x80);
			continue;
		}

		if (!reverseTable) {
			operator+=('?');
			continue;
		}

		uint16 rev = reverseTable[point];
		if (rev != 0) {
			operator+=(rev >> 8);
			operator+=(rev & 0xff);
			continue;
		}

		// PUA range
		if (point >= 0xe000 && point <= 0xf848) {
			byte lowidx = 0, high = 0, low = 0;
			if (point <= 0xe310) {
				high = (point - 0xe000) / 157 + 0xfa;
				lowidx = (point - 0xe000) % 157;
			} else if (point <= 0xeeb7) {
				high = (point - 0xe311) / 157 + 0x8e;
				lowidx = (point - 0xe311) % 157;
			} else if (point <= 0xf6b0) {
				high = (point - 0xeeb8) / 157 + 0x81;
				lowidx = (point - 0xeeb8) % 157;
			} else {
				high = (point - 0xf672) / 157 + 0xc6;
				lowidx = (point - 0xf672) % 157;
			}

			if (lowidx <= 0x3e)
				low = 0x40 + lowidx;
			else
				low = 0x62 + lowidx;

			operator+=(high);
			operator+=(low);
			// Remember the result so the next lookup hits the table.
			reverseTable[point] = (high << 8) | low;
			continue;
		}

		if (transliterate) {
			translitChar(point);
			continue;
		}

		operator+=('?');
		continue;
	}
}

//
//TODO: This is a quick and dirty converter. Refactoring needed:
// 1. Original version has an option for performing strict / nonstrict
//    conversion for the 0xD800...0xDFFF interval
// 2. Original version returns a result code. This version inserts '0xFFFD' if
//    character does not fit in 4 bytes & does not inform caller on any errors
//
// More comprehensive one lives in wintermute/utils/convert_utf.cpp
//
// Appends src to this string encoded as UTF-8.
void String::encodeUTF8(const U32String &src) {
	ensureCapacity(src.size(), false);
	static const uint8 firstByteMark[5] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0 };
	// The sequence is written right-aligned into the first four bytes; the
	// fifth byte is never written and so always terminates the C string
	// that is appended below.
	char writingBytes[5] = {0x00, 0x00, 0x00, 0x00, 0x00};

	uint i = 0;
	while (i < src.size()) {
		unsigned short bytesToWrite = 0;
		const uint32 byteMask = 0xBF;
		const uint32 byteMark = 0x80;

		uint32 ch = src[i++];
		if (ch < (uint32)0x80) {
			bytesToWrite = 1;
		} else if (ch < (uint32)0x800) {
			bytesToWrite = 2;
		} else if (ch < (uint32)0x10000) {
			bytesToWrite = 3;
		} else if (ch <= 0x0010FFFF) {
			bytesToWrite = 4;
		} else {
			bytesToWrite = 3;
			ch = invalidCode;
		}

		char *pBytes = writingBytes + (4 - bytesToWrite);

		// Fill in the sequence from its last byte to its first, taking six
		// bits of the code point per continuation byte.
		switch (bytesToWrite) {
		case 4:
			pBytes[3] = (char)((ch | byteMark) & byteMask);
			ch >>= 6;
			// fallthrough
		case 3:
			pBytes[2] = (char)((ch | byteMark) & byteMask);
			ch >>= 6;
			// fallthrough
		case 2:
			pBytes[1] = (char)((ch | byteMark) & byteMask);
			ch >>= 6;
			// fallthrough
		case 1:
			pBytes[0] = (char)(ch | firstByteMark[bytesToWrite]);
			break;
		default:
			break;
		}

		operator+=(pBytes);
	}
}

// Defines U32String::decodeUTF16<suffix>(), which converts len 16-bit units
// starting at start into a U32String, fetching each unit with read().
// A surrogate that is not part of a valid high/low pair becomes invalidCode.
#define decodeUTF16Template(suffix, read)				\
Common::U32String U32String::decodeUTF16 ## suffix (const uint16 *start, uint len) { \
	const uint16 *ptr = start;					\
	Common::U32String dst;						\
	dst.ensureCapacity(len, false);					\
									\
	while (len > 0) {						\
		uint16 c = read(ptr++);					\
		len--;							\
		if (c >= 0xD800 && c <= 0xDBFF && len > 0) {		\
			uint16 low = read(ptr);				\
			if (low >= 0xDC00 && low <= 0xDFFF) {		\
				/* low is OK, we can advance pointer */	\
				ptr++; len--;				\
				/* A pair holds (code point - 0x10000) */ \
				dst += 0x10000 + (((c & 0x3ff) << 10)	\
					| (low & 0x3ff));		\
			} else {					\
				dst += invalidCode;			\
			}						\
			continue;					\
		}							\
									\
		if (c >= 0xD800 && c <= 0xDFFF) {			\
			dst += invalidCode;				\
			continue;					\
		}							\
		dst += c;						\
	}								\
									\
	return dst;							\
}

decodeUTF16Template(BE, READ_BE_UINT16)
decodeUTF16Template(LE, READ_LE_UINT16)
decodeUTF16Template(Native, READ_UINT16)

// Defines U32String::encodeUTF16<suffix>(), which returns a new[]-allocated,
// zero-terminated UTF-16 copy of this string, storing each unit with
// write(). When len is not null it receives the number of units written,
// not counting the terminator. Code points above 0x10FFFF cannot be
// expressed as a surrogate pair and are written as invalidCode.
#define encodeUTF16Template(suffix, write)				\
uint16 *U32String::encodeUTF16 ## suffix (uint *len) const {		\
	/* Worst case: every code point needs a surrogate pair */	\
	uint16 *out = new uint16[_size * 2 + 1];			\
	uint16 *ptr = out;						\
									\
	for (uint i = 0; i < _size; i++) {				\
		uint32 c = _str[i];					\
		if (c < 0x10000) {					\
			write(ptr++, c);				\
			continue;					\
		}							\
		if (c > 0x10FFFF) {					\
			write(ptr++, invalidCode);			\
			continue;					\
		}							\
		/* A pair holds (code point - 0x10000) */		\
		c -= 0x10000;						\
		write (ptr++, 0xD800 | ((c >> 10) & 0x3ff));		\
		write (ptr++, 0xDC00 | (c & 0x3ff));			\
	}								\
									\
	write(ptr, 0);							\
	if (len)							\
		*len = ptr - out;					\
									\
	return out;							\
}

encodeUTF16Template(BE, WRITE_BE_UINT16)
encodeUTF16Template(LE, WRITE_LE_UINT16)
encodeUTF16Template(Native, WRITE_UINT16)

// Upper bound on unicode codepoint in any single-byte encoding. Must be
// divisible by 0x100 and be strictly above the largest codepoint.
static const int kMaxCharSingleByte = 0x3000;

// Returns the decoding table of a single-byte code page, or nullptr for
// code pages that have none.
static const uint16 *
getConversionTable(CodePage page) {
	switch (page) {
	case kWindows1250:
		return kWindows1250ConversionTable;
	case kWindows1251:
		return kWindows1251ConversionTable;
	case kWindows1252:
		return kWindows1252ConversionTable;
	case kWindows1253:
		return kWindows1253ConversionTable;
	case kWindows1254:
		return kWindows1254ConversionTable;
	case kWindows1255:
		return kWindows1255ConversionTable;
	case kWindows1256:
		return kWindows1256ConversionTable;
	case kWindows1257:
		return kWindows1257ConversionTable;
	case kMacCentralEurope:
		return kMacCentralEuropeConversionTable;
	case kISO8859_1:
		return kLatin1ConversionTable;
	case kISO8859_2:
		return kLatin2ConversionTable;
	case kISO8859_5:
		return kISO5ConversionTable;
	case kDos850:
		return kDos850ConversionTable;
	case kDos866:
		return kDos866ConversionTable;
	case kASCII:
		return kASCIIConversionTable;

	case kCodePageInvalid:
	// Multibyte encodings. Can't be represented in simple table way
	case kUtf8:
	case kWindows932:
	case kWindows949:
	case kWindows950:
		return nullptr;
	}
	return nullptr;
}

// First level of the Unicode -> byte lookup: one slot per value of the
// code point's high byte, left null while no character uses it.
struct ReverseTablePrefixTreeLevel1 {
	struct ReverseTablePrefixTreeLevel2 *next[kMaxCharSingleByte / 0x100];
	// Set once the tree for this code page has been built.
	bool valid;
};

// Second level: the encoded byte for each value of the code point's low
// byte, with 0 meaning that there is no mapping.
struct ReverseTablePrefixTreeLevel2 {
	uint8 end[256];

	ReverseTablePrefixTreeLevel2() {
		memset(end, 0, sizeof(end));
	}
};

// One lazily built reverse lookup tree per code page.
ReverseTablePrefixTreeLevel1 reverseTables[kLastEncoding + 1];

// Returns the reverse lookup tree for page, building it on the first call.
// Returns nullptr when the code page has no single-byte table.
static const ReverseTablePrefixTreeLevel1 *
getReverseConversionTable(CodePage page) {
	if (reverseTables[page].valid)
		return &reverseTables[page];
	const uint16 *conversionTable = getConversionTable(page);
	if (!conversionTable)
		return nullptr;
	reverseTables[page].valid = true;
	// Table entry i describes byte (i | 0x80).
	for (uint i = 0; i < 0x80; i++) {
		uint32 c = conversionTable[i];
		if (c == 0 || c >= kMaxCharSingleByte)
			continue;
		if (!reverseTables[page].next[c >> 8]) {
			reverseTables[page].next[c >> 8] = new ReverseTablePrefixTreeLevel2();
		}

		reverseTables[page].next[c >> 8]->end[c&0xff] = i | 0x80;
	}

	return &reverseTables[page];
}

// Appends the len bytes at src, interpreted in the single-byte code page
// page, to this string. Bytes below 0x80 are copied as they are; other
// bytes without a mapping become invalidCode. Code pages without a table
// are handled with the ASCII table.
void U32String::decodeOneByte(const char *src, uint32 len, CodePage page) {
	const uint16 *conversionTable = getConversionTable(page);

	if (conversionTable == nullptr) {
		conversionTable = kASCIIConversionTable;
	}

	ensureCapacity(len, false);

	for (uint i = 0; i < len; ++i) {
		if ((src[i] & 0x80) == 0) {
			operator+=(src[i]);
			continue;
		}

		uint16 val = conversionTable[src[i] & 0x7f];
		operator+=(val ? val : invalidCode);
	}
}

// Appends src to this string encoded in the single-byte code page page.
// Code points without a mapping are passed to translitChar() when
// transliterate is set and are written as '?' otherwise.
void String::encodeOneByte(const U32String &src, CodePage page, bool transliterate) {
	const ReverseTablePrefixTreeLevel1 *conversionTable = getReverseConversionTable(page);

	ensureCapacity(src.size(), false);

	// No table for this code page: only 7-bit characters can be kept.
	if (conversionTable == nullptr) {
		for (uint i = 0; i < src.size(); ++i) {
			uint32 c = src[i];
			if (c <= 0x7F) {
				operator+=((char)c);
				continue;
			}

			if (transliterate) {
				translitChar(c);
			} else
				operator+=('?');
		}
		return;
	}

	for (uint i = 0; i < src.size(); ++i) {
		uint32 c = src[i];
		if (c <= 0x7F) {
			operator+=((char)c);
			continue;
		}

		// NOTE(review): code points at or above kMaxCharSingleByte are
		// dropped here without writing '?' or transliterating - confirm
		// that this is intended.
		if (c >= kMaxCharSingleByte)
			continue;
		ReverseTablePrefixTreeLevel2 *l2 = conversionTable->next[c>>8];
		unsigned char uc = l2 ? l2->end[c&0xff] : 0;
		if (uc != 0) {
			operator+=((char)uc);
			continue;
		}

		if (transliterate) {
			translitChar(c);
		} else
			operator+=('?');
	}
}

// Appends src to this string in the given code page, choosing the
// multi-byte or single-byte encoder as appropriate.
void String::encodeInternal(const U32String &src, CodePage page) {
	switch(page) {
	case kUtf8:
		encodeUTF8(src);
		break;
	case kWindows932:
		encodeWindows932(src);
		break;
	case kWindows949:
		encodeWindows949(src);
		break;
	case kWindows950:
		encodeWindows950(src);
		break;
	default:
		encodeOneByte(src, page);
		break;
	}
}

// Decodes the zero-terminated string str from the given code page.
U32String convertToU32String(const char *str, CodePage page) {
	return String(str).decode(page);
}

// Decodes str as UTF-8.
U32String convertUtf8ToUtf32(const String &str) {
	return str.decode(kUtf8);
}

// Encodes string in the given code page.
String convertFromU32String(const U32String &string, CodePage page) {
	return string.encode(page);
}

// Encodes u32str as UTF-8.
String convertUtf32ToUtf8(const U32String &u32str) {
	return u32str.encode(kUtf8);
}

// Replaces the contents of this string with the len bytes at str decoded
// from the given code page. str must not be null.
void U32String::decodeInternal(const char *str, uint32 len, CodePage page) {
	assert(str);

	// Start from an empty string; the decoders only append.
	_storage[0] = 0;
	_size = 0;

	switch(page) {
	case kUtf8:
		decodeUTF8(str, len);
		break;
	case kWindows932:
		decodeWindows932(str, len);
		break;
	case kWindows949:
		decodeWindows949(str, len);
		break;
	case kWindows950:
		decodeWindows950(str, len);
		break;
	default:
		decodeOneByte(str, len, page);
		break;
	}
}

// Returns this string decoded from the given code page. An invalid code
// page is reported through error().
U32String String::decode(CodePage page) const {
	if (page == kCodePageInvalid ||
			page > kLastEncoding) {
		error("Invalid codepage");
	}

	U32String unicodeString;
	unicodeString.decodeInternal(_str, _size, page);
	return unicodeString;
}

// Returns this string encoded in the given code page. An invalid code page
// is reported through error().
String U32String::encode(CodePage page) const {
	if (page == kCodePageInvalid ||
			page > kLastEncoding) {
		error("Invalid codepage");
	}

	String string;
	string.encodeInternal(*this, page);
	return string;
}

} // End of namespace Common