mirror of
https://github.com/libretro/scummvm.git
synced 2025-01-21 01:05:59 +00:00
928 lines
21 KiB
C++
928 lines
21 KiB
C++
/* ScummVM - Graphic Adventure Engine
|
|
*
|
|
* ScummVM is the legal property of its developers, whose names
|
|
* are too numerous to list here. Please refer to the COPYRIGHT
|
|
* file distributed with this source distribution.
|
|
*
|
|
* This program is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU General Public License
|
|
* as published by the Free Software Foundation; either version 2
|
|
* of the License, or (at your option) any later version.
|
|
*
|
|
* This program is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with this program; if not, write to the Free Software
|
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
|
*
|
|
*/
|
|
|
|
#include "common/str.h"
|
|
#include "common/ustr.h"
|
|
#include "common/util.h"
|
|
#include "common/endian.h"
|
|
#include "common/error.h"
|
|
#include "common/system.h"
|
|
#include "common/enc-internal.h"
|
|
#include "common/file.h"
|
|
|
|
namespace Common {
|
|
|
|
// //TODO: This is a quick and dirty converter. Refactoring needed:
|
|
// 1. Original version has an option for performing strict / nonstrict
|
|
// conversion for the 0xD800...0xDFFF interval
|
|
// 2. Original version returns a result code. This version does NOT
|
|
// insert 'FFFD' on errors & does not inform caller on any errors
|
|
//
|
|
// More comprehensive one lives in wintermute/utils/convert_utf.cpp
|
|
void U32String::decodeUTF8(const char *src, uint32 len) {
|
|
ensureCapacity(len, false);
|
|
|
|
// The String class, and therefore the Font class as well, assume one
|
|
// character is one byte, but in this case it's actually an UTF-8
|
|
// string with up to 4 bytes per character. To work around this,
|
|
// convert it to an U32String before drawing it, because our Font class
|
|
// can handle that.
|
|
for (uint i = 0; i < len;) {
|
|
uint32 chr = 0;
|
|
uint num = 1;
|
|
|
|
if ((src[i] & 0xF8) == 0xF0) {
|
|
num = 4;
|
|
} else if ((src[i] & 0xF0) == 0xE0) {
|
|
num = 3;
|
|
} else if ((src[i] & 0xE0) == 0xC0) {
|
|
num = 2;
|
|
}
|
|
|
|
if (len - i >= num) {
|
|
switch (num) {
|
|
case 4:
|
|
chr |= (src[i++] & 0x07) << 18;
|
|
chr |= (src[i++] & 0x3F) << 12;
|
|
chr |= (src[i++] & 0x3F) << 6;
|
|
chr |= (src[i++] & 0x3F);
|
|
break;
|
|
|
|
case 3:
|
|
chr |= (src[i++] & 0x0F) << 12;
|
|
chr |= (src[i++] & 0x3F) << 6;
|
|
chr |= (src[i++] & 0x3F);
|
|
break;
|
|
|
|
case 2:
|
|
chr |= (src[i++] & 0x1F) << 6;
|
|
chr |= (src[i++] & 0x3F);
|
|
break;
|
|
|
|
default:
|
|
chr = (src[i++] & 0x7F);
|
|
break;
|
|
}
|
|
} else {
|
|
break;
|
|
}
|
|
|
|
operator+=(chr);
|
|
}
|
|
}
|
|
|
|
// Code point emitted by the decoders in this file for input they cannot map
// (U+FFFD).
const uint16 invalidCode = 0xFFFD;

// Set by loadCJKTables() as soon as a load has been attempted.
static bool cjk_tables_loaded = false;

// Forward conversion tables read from encoding.dat; they stay null when the
// file could not be loaded.
static const uint16 *windows932ConversionTable = nullptr;
static const uint16 *windows949ConversionTable = nullptr;
static const uint16 *windows950ConversionTable = nullptr;
|
|
|
|
/**
 * Read one conversion table from an opened encoding.dat.
 *
 * The 32-bit little-endian offset of the table is stored at file position
 * 16 + idx * 4; the table itself consists of sz little-endian 16-bit entries
 * starting at that offset.
 *
 * @param f    opened encoding.dat
 * @param idx  zero-based index of the table
 * @param sz   number of 16-bit entries in the table
 * @return a newly allocated table in host byte order, or nullptr when the
 *         file does not contain the complete table
 */
static const uint16 *loadCJKTable(File &f, int idx, size_t sz) {
	f.seek(16 + idx * 4);
	const uint32 off = f.readUint32LE();
	f.seek(off);

	uint16 *res = new uint16[sz];
	// A short read would leave part of the table uninitialised; treat the
	// table as unavailable instead of converting through garbage.
	if (f.read(res, 2 * sz) != 2 * sz) {
		warning("encoding.dat is truncated. Support for some CJK encodings is disabled");
		delete[] res;
		return nullptr;
	}

#ifndef SCUMM_LITTLE_ENDIAN
	for (uint i = 0; i < sz; i++)
		res[i] = FROM_LE_16(res[i]);
#endif
	return res;
}
|
|
|
|
static void loadCJKTables() {
|
|
File f;
|
|
|
|
cjk_tables_loaded = true;
|
|
|
|
if (!f.open("encoding.dat")) {
|
|
warning("encoding.dat is not found. Support for CJK is disabled");
|
|
return;
|
|
}
|
|
|
|
if (f.size() < 16 + 3 * 4) {
|
|
warning("encoding.dat is invalid. Support for CJK is disabled");
|
|
return;
|
|
}
|
|
|
|
if (f.readUint32BE() != MKTAG('S', 'C', 'V', 'M')
|
|
|| f.readUint32BE() != MKTAG('E', 'N', 'C', 'D')) {
|
|
warning("encoding.dat is invalid. Support for CJK is disabled");
|
|
return;
|
|
}
|
|
|
|
// Version and number of tables.
|
|
if (f.readUint32LE() != 0 || f.readUint32LE() < 3) {
|
|
warning("encoding.dat is of incompatible version. Support for CJK is disabled");
|
|
return;
|
|
}
|
|
|
|
windows932ConversionTable = loadCJKTable(f, 0, 47 * 192);
|
|
windows949ConversionTable = loadCJKTable(f, 1, 0x7e * 0xb2);
|
|
windows950ConversionTable = loadCJKTable(f, 2, 89 * 157);
|
|
}
|
|
|
|
void U32String::decodeWindows932(const char *src, uint32 len) {
|
|
ensureCapacity(len, false);
|
|
|
|
if (!cjk_tables_loaded)
|
|
loadCJKTables();
|
|
|
|
for (uint i = 0; i < len;) {
|
|
uint8 high = src[i++];
|
|
|
|
if ((high & 0x80) == 0x00) {
|
|
operator+=(high);
|
|
continue;
|
|
}
|
|
|
|
// Katakana
|
|
if (high >= 0xa1 && high <= 0xdf) {
|
|
operator+=(high - 0xa1 + 0xFF61);
|
|
continue;
|
|
}
|
|
|
|
if (i >= len) {
|
|
operator+=(invalidCode);
|
|
continue;
|
|
}
|
|
|
|
uint8 low = src[i++];
|
|
if (low < 0x40) {
|
|
operator+=(invalidCode);
|
|
continue;
|
|
}
|
|
uint8 lowidx = low - 0x40;
|
|
uint8 highidx;
|
|
|
|
if (high >= 0x81 && high < 0x85)
|
|
highidx = high - 0x81;
|
|
else if (high >= 0x87 && high < 0xa0)
|
|
highidx = high - 0x87 + 4;
|
|
else if (high >= 0xe0 && high < 0xef)
|
|
highidx = high - 0xe0 + 29;
|
|
else if (high >= 0xfa && high < 0xfd)
|
|
highidx = high - 0xfa + 44;
|
|
else {
|
|
operator+=(invalidCode);
|
|
continue;
|
|
}
|
|
|
|
if (!windows932ConversionTable) {
|
|
operator+=(invalidCode);
|
|
continue;
|
|
}
|
|
|
|
// Main range
|
|
uint16 val = windows932ConversionTable[highidx * 192 + lowidx];
|
|
operator+=(val ? val : invalidCode);
|
|
}
|
|
}
|
|
|
|
/**
 * Look up a two-byte Windows-949 (UHC) character in the loaded table.
 *
 * The table has 0x7e rows (lead bytes 0x81..0xfe) of 0xb2 columns; the
 * columns cover the trail byte ranges 0x41..0x5a, 0x61..0x7a and 0x81..0xfe.
 *
 * @param high  lead byte
 * @param low   trail byte
 * @return the table entry, or 0 when either byte is out of range or the
 *         table is not available
 */
static uint16 convertUHCToUCSReal(uint8 high, uint8 low) {
	// Lead bytes outside the table rows would index before or past the
	// table, so reject them up front.
	if (high < 0x81 || high > 0xfe)
		return 0;

	uint lowidx = 0;
	if (low >= 0x41 && low < 0x5b)
		lowidx = low - 0x41;
	else if (low >= 0x61 && low < 0x7b)
		lowidx = low - 0x61 + 0x1a;
	else if (low >= 0x81 && low < 0xff)
		lowidx = low - 0x81 + 0x1a * 2;
	else
		return 0;

	if (!windows949ConversionTable)
		return 0;

	const uint idx = (high - 0x81) * 0xb2 + lowidx;
	return windows949ConversionTable[idx];
}
|
|
|
|
/**
 * Convert a two-byte Windows-949 (UHC) character, loading the CJK tables on
 * first use.
 *
 * @param high  lead byte
 * @param low   trail byte
 * @return whatever convertUHCToUCSReal() returns for the pair
 */
uint16 convertUHCToUCS(uint8 high, uint8 low) {
	if (!cjk_tables_loaded) {
		loadCJKTables();
	}

	const uint16 unicode = convertUHCToUCSReal(high, low);
	return unicode;
}
|
|
|
|
|
|
void U32String::decodeWindows949(const char *src, uint32 len) {
|
|
ensureCapacity(len, false);
|
|
|
|
if (!cjk_tables_loaded)
|
|
loadCJKTables();
|
|
|
|
for (uint i = 0; i < len;) {
|
|
uint8 high = src[i++];
|
|
|
|
if ((high & 0x80) == 0x00) {
|
|
operator+=(high);
|
|
continue;
|
|
}
|
|
|
|
if (high == 0x80 || high == 0xff) {
|
|
operator+=(invalidCode);
|
|
continue;
|
|
}
|
|
|
|
if (i >= len) {
|
|
operator+=(invalidCode);
|
|
continue;
|
|
}
|
|
|
|
uint8 low = src[i++];
|
|
uint16 val = convertUHCToUCSReal(high, low);
|
|
|
|
operator+=(val ? val : invalidCode);
|
|
}
|
|
}
|
|
|
|
void U32String::decodeWindows950(const char *src, uint32 len) {
|
|
ensureCapacity(len, false);
|
|
|
|
if (!cjk_tables_loaded)
|
|
loadCJKTables();
|
|
|
|
for (uint i = 0; i < len;) {
|
|
uint8 high = src[i++];
|
|
|
|
if ((high & 0x80) == 0x00) {
|
|
operator+=(high);
|
|
continue;
|
|
}
|
|
|
|
// Euro symbol
|
|
if (high == 0x80) {
|
|
operator+=(0x20ac);
|
|
continue;
|
|
}
|
|
|
|
if (high == 0xff) {
|
|
operator+=(invalidCode);
|
|
continue;
|
|
}
|
|
|
|
if (i >= len) {
|
|
operator+=(invalidCode);
|
|
continue;
|
|
}
|
|
|
|
uint8 low = src[i++];
|
|
uint8 lowidx = low < 0x80 ? low - 0x40 : (low - 0xa1 + 0x3f);
|
|
|
|
// Main range
|
|
if (high >= 0xa1 && high < 0xfa) {
|
|
uint16 val = windows950ConversionTable ?
|
|
windows950ConversionTable[(high - 0xa1) * 157 + lowidx] : 0;
|
|
operator+=(val ? val : invalidCode);
|
|
continue;
|
|
}
|
|
|
|
// PUA range
|
|
if (high <= 0x8d) {
|
|
operator+=(0xeeb8 + 157 * (high-0x81) + lowidx);
|
|
continue;
|
|
}
|
|
if (high <= 0xa0) {
|
|
operator+=(0xe311 + (157 * (high-0x8e)) + lowidx);
|
|
continue;
|
|
}
|
|
if (high >= 0xfa) {
|
|
operator+=(0xe000 + (157 * (high-0xfa)) + lowidx);
|
|
continue;
|
|
}
|
|
}
|
|
}
|
|
|
|
void String::encodeWindows932(const U32String &src) {
|
|
static uint16 *reverseTable;
|
|
|
|
ensureCapacity(src.size() * 2, false);
|
|
|
|
if (!cjk_tables_loaded)
|
|
loadCJKTables();
|
|
|
|
if (!reverseTable && windows932ConversionTable) {
|
|
uint16 *rt = new uint16[0x10000];
|
|
memset(rt, 0, sizeof(rt[0]) * 0x10000);
|
|
for (uint highidx = 0; highidx < 47; highidx++) {
|
|
uint8 high = 0;
|
|
if (highidx < 4)
|
|
high = highidx + 0x81;
|
|
else if (highidx < 29)
|
|
high = highidx + 0x87 - 4;
|
|
else if (highidx < 44)
|
|
high = highidx + 0xe0 - 29;
|
|
else
|
|
high = highidx + 0xfa - 44;
|
|
|
|
for (uint lowidx = 0; lowidx < 192; lowidx++) {
|
|
uint8 low = lowidx + 0x40;
|
|
uint16 unicode = windows932ConversionTable[highidx * 192 + lowidx];
|
|
|
|
rt[unicode] = (high << 8) | low;
|
|
}
|
|
}
|
|
reverseTable = rt;
|
|
}
|
|
|
|
for (uint i = 0; i < src.size();) {
|
|
uint32 point = src[i++];
|
|
|
|
if (point < 0x80) {
|
|
operator+=(point);
|
|
continue;
|
|
}
|
|
|
|
// Katakana
|
|
if (point >= 0xff61 && point <= 0xff9f) {
|
|
operator+=(0xa1 + (point - 0xFF61));
|
|
continue;
|
|
}
|
|
|
|
if (point > 0x10000) {
|
|
operator+=('?');
|
|
continue;
|
|
}
|
|
|
|
if (!reverseTable) {
|
|
operator+=('?');
|
|
continue;
|
|
}
|
|
|
|
uint16 rev = reverseTable[point];
|
|
if (rev != 0) {
|
|
operator+=(rev >> 8);
|
|
operator+=(rev & 0xff);
|
|
continue;
|
|
}
|
|
|
|
// This codepage contains cyrillic, so no need to transliterate
|
|
|
|
operator+=('?');
|
|
continue;
|
|
}
|
|
}
|
|
|
|
void String::encodeWindows949(const U32String &src) {
|
|
static const uint16 *reverseTable;
|
|
|
|
ensureCapacity(src.size() * 2, false);
|
|
|
|
if (!cjk_tables_loaded)
|
|
loadCJKTables();
|
|
|
|
if (!reverseTable && windows949ConversionTable) {
|
|
uint16 *rt = new uint16[0x10000];
|
|
memset(rt, 0, sizeof(rt[0]) * 0x10000);
|
|
|
|
for (uint lowidx = 0; lowidx < 0xb2; lowidx++) {
|
|
uint8 low = 0;
|
|
if (lowidx < 0x1a)
|
|
low = 0x41 + lowidx;
|
|
else if (lowidx < 0x1a * 2)
|
|
low = 0x61 + lowidx - 0x1a;
|
|
else
|
|
low = 0x81 + lowidx - 0x1a * 2;
|
|
|
|
for (uint highidx = 0; highidx < 0x7e; highidx++) {
|
|
uint8 high = highidx + 0x81;
|
|
uint16 unicode = windows949ConversionTable[highidx * 0xb2 + lowidx];
|
|
|
|
rt[unicode] = (high << 8) | low;
|
|
}
|
|
}
|
|
|
|
reverseTable = rt;
|
|
}
|
|
|
|
for (uint i = 0; i < src.size();) {
|
|
uint32 point = src[i++];
|
|
|
|
if (point < 0x80) {
|
|
operator+=(point);
|
|
continue;
|
|
}
|
|
|
|
if (point > 0x10000 || !reverseTable) {
|
|
operator+=('?');
|
|
continue;
|
|
}
|
|
|
|
uint16 rev = reverseTable[point];
|
|
if (rev == 0) {
|
|
// This codepage contains cyrillic, so no need to transliterate
|
|
operator+=('?');
|
|
continue;
|
|
}
|
|
|
|
operator+=(rev >> 8);
|
|
operator+=(rev & 0xff);
|
|
}
|
|
}
|
|
|
|
// ASCII stand-ins for U+0400..U+045F, indexed by (code point - 0x400).
static const char g_cyrillicTransliterationTable[] = {
	// U+0400..U+040F
	' ', 'E', 'D', 'G', 'E', 'Z', 'I', 'I',
	'J', 'L', 'N', 'C', 'K', 'I', 'U', 'D',
	// U+0410..U+041F
	'A', 'B', 'V', 'G', 'D', 'E', 'Z', 'Z',
	'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
	// U+0420..U+042F
	'R', 'S', 'T', 'U', 'F', 'H', 'C', 'C',
	'S', 'S', '"', 'Y', '\'', 'E', 'U', 'A',
	// U+0430..U+043F
	'a', 'b', 'v', 'g', 'd', 'e', 'z', 'z',
	'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p',
	// U+0440..U+044F
	'r', 's', 't', 'u', 'f', 'h', 'c', 'c',
	's', 's', '"', 'y', '\'', 'e', 'u', 'a',
	// U+0450..U+045F
	'e', 'e', 'd', 'g', 'e', 'z', 'i', 'i',
	'j', 'l', 'n', 'c', 'k', 'i', 'u', 'd',
};
|
|
|
|
/**
 * Append a single-character ASCII approximation of a code point.
 *
 * U+00A0 becomes a space, U+00AD a hyphen, U+2116 'N', and U+0401..U+045F
 * are taken from g_cyrillicTransliterationTable. Every other code point
 * becomes '?'.
 *
 * @param point  code point to transliterate
 */
void String::translitChar(U32String::value_type point) {
	char replacement = '?';

	switch (point) {
	case 0xa0:
		replacement = ' ';
		break;
	case 0xad:
		replacement = '-';
		break;
	case 0x2116:
		replacement = 'N';
		break;
	default:
		if (point >= 0x401 && point <= 0x45f)
			replacement = g_cyrillicTransliterationTable[point - 0x400];
		break;
	}

	operator+=(replacement);
}
|
|
|
|
void String::encodeWindows950(const U32String &src, bool transliterate) {
|
|
static uint16 *reverseTable;
|
|
|
|
ensureCapacity(src.size() * 2, false);
|
|
|
|
if (!cjk_tables_loaded)
|
|
loadCJKTables();
|
|
|
|
if (!reverseTable && windows950ConversionTable) {
|
|
uint16 *rt = new uint16[0x10000];
|
|
memset(rt, 0, sizeof(rt[0]) * 0x10000);
|
|
|
|
for (uint lowidx = 0; lowidx < 157; lowidx++) {
|
|
uint8 low = 0;
|
|
if (lowidx < 0x3f)
|
|
low = 0x40 + lowidx;
|
|
else
|
|
low = 0xa1 + lowidx - 0x3f;
|
|
|
|
for (uint highidx = 0; highidx < 89; highidx++) {
|
|
uint8 high = highidx + 0xa1;
|
|
uint16 unicode = windows950ConversionTable[highidx * 157 + lowidx];
|
|
|
|
rt[unicode] = (high << 8) | low;
|
|
}
|
|
}
|
|
|
|
reverseTable = rt;
|
|
}
|
|
|
|
for (uint i = 0; i < src.size();) {
|
|
uint32 point = src[i++];
|
|
|
|
if (point < 0x80) {
|
|
operator+=(point);
|
|
continue;
|
|
}
|
|
|
|
if (point > 0x10000) {
|
|
operator+=('?');
|
|
continue;
|
|
}
|
|
|
|
// Euro symbol
|
|
if (point == 0x20ac) {
|
|
operator+=((char) 0x80);
|
|
continue;
|
|
}
|
|
|
|
if (!reverseTable) {
|
|
operator+=('?');
|
|
continue;
|
|
}
|
|
|
|
uint16 rev = reverseTable[point];
|
|
if (rev != 0) {
|
|
operator+=(rev >> 8);
|
|
operator+=(rev & 0xff);
|
|
continue;
|
|
}
|
|
|
|
// PUA range
|
|
if (point >= 0xe000 && point <= 0xf848) {
|
|
byte lowidx = 0, high = 0, low = 0;
|
|
if (point <= 0xe310) {
|
|
high = (point - 0xe000) / 157 + 0xfa;
|
|
lowidx = (point - 0xe000) % 157;
|
|
} else if (point <= 0xeeb7) {
|
|
high = (point - 0xe311) / 157 + 0x8e;
|
|
lowidx = (point - 0xe311) % 157;
|
|
} else if (point <= 0xf6b0) {
|
|
high = (point - 0xeeb8) / 157 + 0x81;
|
|
lowidx = (point - 0xeeb8) % 157;
|
|
} else {
|
|
high = (point - 0xf672) / 157 + 0xc6;
|
|
lowidx = (point - 0xf672) % 157;
|
|
}
|
|
|
|
if (lowidx <= 0x3e)
|
|
low = 0x40 + lowidx;
|
|
else
|
|
low = 0x62 + lowidx;
|
|
|
|
operator+=(high);
|
|
operator+=(low);
|
|
reverseTable[point] = (high << 8) | low;
|
|
continue;
|
|
}
|
|
|
|
if (transliterate) {
|
|
translitChar(point);
|
|
continue;
|
|
}
|
|
|
|
operator+=('?');
|
|
continue;
|
|
}
|
|
}
|
|
|
|
// //TODO: This is a quick and dirty converter. Refactoring needed:
|
|
// 1. Original version has an option for performing strict / nonstrict
|
|
// conversion for the 0xD800...0xDFFF interval
|
|
// 2. Original version returns a result code. This version inserts '0xFFFD' if
|
|
// character does not fit in 4 bytes & does not inform caller on any errors
|
|
//
|
|
// More comprehensive one lives in wintermute/utils/convert_utf.cpp
|
|
void String::encodeUTF8(const U32String &src) {
|
|
ensureCapacity(src.size(), false);
|
|
static const uint8 firstByteMark[5] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0 };
|
|
char writingBytes[5] = {0x00, 0x00, 0x00, 0x00, 0x00};
|
|
|
|
uint i = 0;
|
|
while (i < src.size()) {
|
|
unsigned short bytesToWrite = 0;
|
|
const uint32 byteMask = 0xBF;
|
|
const uint32 byteMark = 0x80;
|
|
|
|
uint32 ch = src[i++];
|
|
if (ch < (uint32)0x80) {
|
|
bytesToWrite = 1;
|
|
} else if (ch < (uint32)0x800) {
|
|
bytesToWrite = 2;
|
|
} else if (ch < (uint32)0x10000) {
|
|
bytesToWrite = 3;
|
|
} else if (ch <= 0x0010FFFF) {
|
|
bytesToWrite = 4;
|
|
} else {
|
|
bytesToWrite = 3;
|
|
ch = invalidCode;
|
|
}
|
|
|
|
char *pBytes = writingBytes + (4 - bytesToWrite);
|
|
|
|
switch (bytesToWrite) {
|
|
case 4:
|
|
pBytes[3] = (char)((ch | byteMark) & byteMask);
|
|
ch >>= 6;
|
|
// fallthrough
|
|
case 3:
|
|
pBytes[2] = (char)((ch | byteMark) & byteMask);
|
|
ch >>= 6;
|
|
// fallthrough
|
|
case 2:
|
|
pBytes[1] = (char)((ch | byteMark) & byteMask);
|
|
ch >>= 6;
|
|
// fallthrough
|
|
case 1:
|
|
pBytes[0] = (char)(ch | firstByteMark[bytesToWrite]);
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
|
|
operator+=(pBytes);
|
|
}
|
|
}
|
|
|
|
#define decodeUTF16Template(suffix, read) \
|
|
Common::U32String U32String::decodeUTF16 ## suffix (const uint16 *start, uint len) { \
|
|
const uint16 *ptr = start; \
|
|
Common::U32String dst; \
|
|
dst.ensureCapacity(len, false); \
|
|
\
|
|
while (len > 0) { \
|
|
uint16 c = read(ptr++); \
|
|
len--; \
|
|
if (c >= 0xD800 && c <= 0xDBFF && len > 0) { \
|
|
uint16 low = read(ptr); \
|
|
if (low >= 0xDC00 && low <= 0xDFFF) { \
|
|
/* low is OK, we can advance pointer */ \
|
|
ptr++; len--; \
|
|
dst += ((c & 0x3ff) << 10) \
|
|
| (low & 0x3ff); \
|
|
} else { \
|
|
dst += invalidCode; \
|
|
} \
|
|
continue; \
|
|
} \
|
|
\
|
|
if (c >= 0xD800 && c <= 0xDFFF) { \
|
|
dst += invalidCode; \
|
|
continue; \
|
|
} \
|
|
dst += c; \
|
|
} \
|
|
\
|
|
return dst; \
|
|
}
|
|
|
|
decodeUTF16Template(BE, READ_BE_UINT16)
|
|
decodeUTF16Template(LE, READ_LE_UINT16)
|
|
decodeUTF16Template(Native, READ_UINT16)
|
|
|
|
#define encodeUTF16Template(suffix, write) \
|
|
uint16 *U32String::encodeUTF16 ## suffix (uint *len) const { \
|
|
uint16 *out = new uint16[_size * 2 + 1]; \
|
|
uint16 *ptr = out; \
|
|
\
|
|
for (uint i = 0; i < _size; i++) { \
|
|
uint32 c = _str[i]; \
|
|
if (c < 0x10000) { \
|
|
write(ptr++, c); \
|
|
continue; \
|
|
} \
|
|
write (ptr++, 0xD800 | ((c >> 10) & 0x3ff)); \
|
|
write (ptr++, 0xDC00 | (c & 0x3ff)); \
|
|
} \
|
|
\
|
|
write(ptr, 0); \
|
|
if (len) \
|
|
*len = ptr - out; \
|
|
\
|
|
return out; \
|
|
}
|
|
|
|
encodeUTF16Template(BE, WRITE_BE_UINT16)
|
|
encodeUTF16Template(LE, WRITE_LE_UINT16)
|
|
encodeUTF16Template(Native, WRITE_UINT16)
|
|
|
|
// Exclusive upper bound on the Unicode code points handled by the
// single-byte encodings. It must be a multiple of 0x100 and strictly greater
// than the largest code point of any single-byte encoding.
static const int kMaxCharSingleByte = 0x30 * 0x100;
|
|
|
|
|
|
/**
 * Return the forward conversion table of a single-byte code page.
 *
 * @param page  code page to look up
 * @return the table for page, or nullptr for kCodePageInvalid, the multibyte
 *         encodings and any value not listed here
 */
static const uint16 *
getConversionTable(CodePage page) {
	const uint16 *table = nullptr;

	switch (page) {
	case kWindows1250:
		table = kWindows1250ConversionTable;
		break;
	case kWindows1251:
		table = kWindows1251ConversionTable;
		break;
	case kWindows1252:
		table = kWindows1252ConversionTable;
		break;
	case kWindows1253:
		table = kWindows1253ConversionTable;
		break;
	case kWindows1254:
		table = kWindows1254ConversionTable;
		break;
	case kWindows1255:
		table = kWindows1255ConversionTable;
		break;
	case kWindows1256:
		table = kWindows1256ConversionTable;
		break;
	case kWindows1257:
		table = kWindows1257ConversionTable;
		break;
	case kMacCentralEurope:
		table = kMacCentralEuropeConversionTable;
		break;
	case kISO8859_1:
		table = kLatin1ConversionTable;
		break;
	case kISO8859_2:
		table = kLatin2ConversionTable;
		break;
	case kISO8859_5:
		table = kISO5ConversionTable;
		break;
	case kDos850:
		table = kDos850ConversionTable;
		break;
	case kDos866:
		table = kDos866ConversionTable;
		break;
	case kASCII:
		table = kASCIIConversionTable;
		break;

	case kCodePageInvalid:
	// Multibyte encodings. Can't be represented in simple table way
	case kUtf8:
	case kWindows932:
	case kWindows949:
	case kWindows950:
		break;
	}

	return table;
}
|
|
|
|
struct ReverseTablePrefixTreeLevel1 {
|
|
struct ReverseTablePrefixTreeLevel2 *next[kMaxCharSingleByte / 0x100];
|
|
bool valid;
|
|
};
|
|
|
|
struct ReverseTablePrefixTreeLevel2 {
|
|
uint8 end[256];
|
|
|
|
ReverseTablePrefixTreeLevel2() {
|
|
memset(end, 0, sizeof(end));
|
|
}
|
|
};
|
|
|
|
// Reverse lookup trees, one per code page, filled in on demand by
// getReverseConversionTable().
ReverseTablePrefixTreeLevel1 reverseTables[kLastEncoding + 1];

/**
 * Return the reverse lookup tree of a single-byte code page, building it from
 * the forward table the first time it is requested.
 *
 * @param page  code page to look up
 * @return the tree for page, or nullptr when page has no forward table
 */
static const ReverseTablePrefixTreeLevel1 *
getReverseConversionTable(CodePage page) {
	ReverseTablePrefixTreeLevel1 &root = reverseTables[page];
	if (root.valid)
		return &root;

	const uint16 *forward = getConversionTable(page);
	if (!forward)
		return nullptr;

	root.valid = true;
	for (uint byteIdx = 0; byteIdx < 0x80; ++byteIdx) {
		const uint32 cp = forward[byteIdx];

		// Skip unmapped bytes and code points the tree cannot hold.
		const bool storable = cp != 0 && cp < kMaxCharSingleByte;
		if (!storable)
			continue;

		ReverseTablePrefixTreeLevel2 *&leaf = root.next[cp >> 8];
		if (!leaf)
			leaf = new ReverseTablePrefixTreeLevel2();

		// Forward tables cover the upper half of the byte range.
		leaf->end[cp & 0xff] = byteIdx | 0x80;
	}

	return &root;
}
|
|
|
|
void U32String::decodeOneByte(const char *src, uint32 len, CodePage page) {
|
|
const uint16 *conversionTable = getConversionTable(page);
|
|
|
|
if (conversionTable == nullptr) {
|
|
conversionTable = kASCIIConversionTable;
|
|
}
|
|
|
|
ensureCapacity(len, false);
|
|
|
|
for (uint i = 0; i < len; ++i) {
|
|
if ((src[i] & 0x80) == 0) {
|
|
operator+=(src[i]);
|
|
continue;
|
|
}
|
|
|
|
uint16 val = conversionTable[src[i] & 0x7f];
|
|
operator+=(val ? val : invalidCode);
|
|
}
|
|
}
|
|
|
|
void String::encodeOneByte(const U32String &src, CodePage page, bool transliterate) {
|
|
const ReverseTablePrefixTreeLevel1 *conversionTable =
|
|
getReverseConversionTable(page);
|
|
|
|
ensureCapacity(src.size(), false);
|
|
|
|
if (conversionTable == nullptr) {
|
|
for (uint i = 0; i < src.size(); ++i) {
|
|
uint32 c = src[i];
|
|
if (c <= 0x7F) {
|
|
operator+=((char)c);
|
|
continue;
|
|
}
|
|
|
|
if (transliterate) {
|
|
translitChar(c);
|
|
} else
|
|
operator+=('?');
|
|
}
|
|
return;
|
|
}
|
|
|
|
for (uint i = 0; i < src.size(); ++i) {
|
|
uint32 c = src[i];
|
|
if (c <= 0x7F) {
|
|
operator+=((char)c);
|
|
continue;
|
|
}
|
|
|
|
if (c >= kMaxCharSingleByte)
|
|
continue;
|
|
ReverseTablePrefixTreeLevel2 *l2 = conversionTable->next[c>>8];
|
|
unsigned char uc = l2 ? l2->end[c&0xff] : 0;
|
|
if (uc != 0) {
|
|
operator+=((char)uc);
|
|
continue;
|
|
}
|
|
|
|
if (transliterate) {
|
|
translitChar(c);
|
|
} else
|
|
operator+=('?');
|
|
}
|
|
}
|
|
|
|
void String::encodeInternal(const U32String &src, CodePage page) {
|
|
switch(page) {
|
|
case kUtf8:
|
|
encodeUTF8(src);
|
|
break;
|
|
case kWindows932:
|
|
encodeWindows932(src);
|
|
break;
|
|
case kWindows949:
|
|
encodeWindows949(src);
|
|
break;
|
|
case kWindows950:
|
|
encodeWindows950(src);
|
|
break;
|
|
default:
|
|
encodeOneByte(src, page);
|
|
break;
|
|
}
|
|
}
|
|
|
|
/**
 * Decode a C string in the given code page.
 *
 * @param str   encoded C string
 * @param page  code page of str
 * @return the decoded string
 */
U32String convertToU32String(const char *str, CodePage page) {
	const String encoded(str);
	return encoded.decode(page);
}
|
|
|
|
/**
 * Decode a UTF-8 string.
 *
 * @param str  UTF-8 encoded string
 * @return the decoded string
 */
U32String convertUtf8ToUtf32(const String &str) {
	const CodePage sourcePage = kUtf8;
	return str.decode(sourcePage);
}
|
|
|
|
/**
 * Encode a string in the given code page.
 *
 * @param string  string to encode
 * @param page    target code page
 * @return the encoded string
 */
String convertFromU32String(const U32String &string, CodePage page) {
	const U32String &decoded = string;
	return decoded.encode(page);
}
|
|
|
|
/**
 * Encode a string as UTF-8.
 *
 * @param u32str  string to encode
 * @return the UTF-8 encoded string
 */
String convertUtf32ToUtf8(const U32String &u32str) {
	const CodePage targetPage = kUtf8;
	return u32str.encode(targetPage);
}
|
|
|
|
void U32String::decodeInternal(const char *str, uint32 len, CodePage page) {
|
|
assert(str);
|
|
|
|
_storage[0] = 0;
|
|
_size = 0;
|
|
|
|
switch(page) {
|
|
case kUtf8:
|
|
decodeUTF8(str, len);
|
|
break;
|
|
case kWindows932:
|
|
decodeWindows932(str, len);
|
|
break;
|
|
case kWindows949:
|
|
decodeWindows949(str, len);
|
|
break;
|
|
case kWindows950:
|
|
decodeWindows950(str, len);
|
|
break;
|
|
default:
|
|
decodeOneByte(str, len, page);
|
|
break;
|
|
}
|
|
}
|
|
|
|
/**
 * Decode this string from the given code page.
 *
 * Calls error() when page is kCodePageInvalid or beyond kLastEncoding.
 *
 * @param page  code page this string is encoded in
 * @return the decoded string
 */
U32String String::decode(CodePage page) const {
	const bool usablePage = page != kCodePageInvalid && page <= kLastEncoding;
	if (!usablePage) {
		error("Invalid codepage");
	}

	U32String decoded;
	decoded.decodeInternal(_str, _size, page);
	return decoded;
}
|
|
|
|
/**
 * Encode this string in the given code page.
 *
 * Calls error() when page is kCodePageInvalid or beyond kLastEncoding.
 *
 * @param page  target code page
 * @return the encoded string
 */
String U32String::encode(CodePage page) const {
	const bool usablePage = page != kCodePageInvalid && page <= kLastEncoding;
	if (!usablePage) {
		error("Invalid codepage");
	}

	String encoded;
	encoded.encodeInternal(*this, page);
	return encoded;
}
|
|
|
|
} // End of namespace Common
|