mirror of
https://github.com/mozilla/gecko-dev.git
synced 2024-11-24 05:11:16 +00:00
Back out 2 changesets (bug 912470) for static analysis bustage
CLOSED TREE Backed out changeset 8ecf2f65d4f0 (bug 912470) Backed out changeset 19af08a9c288 (bug 912470)
This commit is contained in:
parent
89639e1fda
commit
354efcb7fe
@ -54,8 +54,8 @@ xn--wgbh1c=windows-1256
|
||||
|
||||
gr=ISO-8859-7
|
||||
|
||||
hk=Big5
|
||||
xn--j6w193g=Big5
|
||||
hk=Big5-HKSCS
|
||||
xn--j6w193g=Big5-HKSCS
|
||||
|
||||
hr=windows-1250
|
||||
|
||||
|
@ -5,6 +5,7 @@
|
||||
# x-unicode is assumed for encodings not listed here
|
||||
|
||||
Big5=zh-TW
|
||||
Big5-HKSCS=zh-HK
|
||||
EUC-JP=ja
|
||||
EUC-KR=ko
|
||||
gb18030=zh-CN
|
||||
|
@ -189,7 +189,7 @@ x-gbk=gbk
|
||||
gb18030=gb18030
|
||||
hz-gb-2312=replacement
|
||||
big5=Big5
|
||||
big5-hkscs=Big5
|
||||
big5-hkscs=Big5-HKSCS
|
||||
cn-big5=Big5
|
||||
csbig5=Big5
|
||||
x-x-big5=Big5
|
||||
|
@ -44,7 +44,6 @@ function runTextDecoderOptions()
|
||||
}, "testDecodeABVOption");
|
||||
test(testDecoderForThaiEncoding, "testDecoderForThaiEncoding");
|
||||
test(testInvalid2022JP, "testInvalid2022JP");
|
||||
test(testDecoderForBig5, "testDecoderForBig5");
|
||||
}
|
||||
|
||||
/*
|
||||
@ -356,7 +355,8 @@ function testDecoderGetEncoding()
|
||||
{encoding: "x-mac-cyrillic", labels: ["x-mac-cyrillic", "x-mac-ukrainian"]},
|
||||
{encoding: "gbk", labels: ["chinese", "csgb2312", "csiso58gb231280", "gb2312", "gb_2312", "gb_2312-80", "gbk", "iso-ir-58", "x-gbk"]},
|
||||
{encoding: "gb18030", labels: ["gb18030"]},
|
||||
{encoding: "big5", labels: ["big5", "cn-big5", "csbig5", "x-x-big5", "big5-hkscs"]},
|
||||
{encoding: "big5", labels: ["big5", "cn-big5", "csbig5", "x-x-big5"]},
|
||||
{encoding: "big5-hkscs", labels: ["big5-hkscs"]},
|
||||
{encoding: "euc-jp", labels: ["cseucpkdfmtjapanese", "euc-jp", "x-euc-jp"]},
|
||||
{encoding: "iso-2022-jp", labels: ["csiso2022jp", "iso-2022-jp"]},
|
||||
{encoding: "shift_jis", labels: ["csshiftjis", "ms_kanji", "shift-jis", "shift_jis", "sjis", "windows-31j", "x-sjis"]},
|
||||
@ -463,78 +463,3 @@ function testInvalid2022JP()
|
||||
});
|
||||
assert_equals(failureCount, 0, failureCount + " of " + inputs.length + " tests failed");
|
||||
}
|
||||
|
||||
/**
 * Runs a fixed table of byte sequences through testCharset with the "big5"
 * encoding and checks each one against its expected decoded string.
 *
 * Each case keeps the input bytes next to the expected text so the two can
 * never get out of step. The position of a case in the table is the number
 * reported in its failure message.
 */
function testDecoderForBig5()
{
  const cases = [
    // Plain ASCII.
    { input: [ 0x61, 0x62 ], expected: "\u0061\u0062" },

    // Single two-byte sequences, including ones expected to decode to a
    // surrogate pair or to a base letter followed by a combining mark.
    { input: [ 0x87, 0x40 ], expected: "\u43F0" },
    { input: [ 0xFE, 0xFE ], expected: "\u79D4" },
    { input: [ 0xFE, 0xFD ], expected: "\uD864\uDD0D" },
    { input: [ 0x88, 0x62 ], expected: "\u00CA\u0304" },
    { input: [ 0x88, 0x64 ], expected: "\u00CA\u030C" },
    { input: [ 0x88, 0x66 ], expected: "\u00CA" },
    { input: [ 0x88, 0xA3 ], expected: "\u00EA\u0304" },
    { input: [ 0x88, 0xA5 ], expected: "\u00EA\u030C" },
    { input: [ 0x88, 0xA7 ], expected: "\u00EA" },
    { input: [ 0x99, 0xD4 ], expected: "\u8991" },
    { input: [ 0x99, 0xD5 ], expected: "\uD85E\uDD67" },
    { input: [ 0x99, 0xD6 ], expected: "\u8A29" },

    // The same two-byte sequences wrapped between ASCII "a" and "b".
    { input: [ 0x61, 0x87, 0x40, 0x62 ], expected: "\u0061\u43F0\u0062" },
    { input: [ 0x61, 0xFE, 0xFE, 0x62 ], expected: "\u0061\u79D4\u0062" },
    { input: [ 0x61, 0xFE, 0xFD, 0x62 ], expected: "\u0061\uD864\uDD0D\u0062" },
    { input: [ 0x61, 0x88, 0x62, 0x62 ], expected: "\u0061\u00CA\u0304\u0062" },
    { input: [ 0x61, 0x88, 0x64, 0x62 ], expected: "\u0061\u00CA\u030C\u0062" },
    { input: [ 0x61, 0x88, 0x66, 0x62 ], expected: "\u0061\u00CA\u0062" },
    { input: [ 0x61, 0x88, 0xA3, 0x62 ], expected: "\u0061\u00EA\u0304\u0062" },
    { input: [ 0x61, 0x88, 0xA5, 0x62 ], expected: "\u0061\u00EA\u030C\u0062" },
    { input: [ 0x61, 0x88, 0xA7, 0x62 ], expected: "\u0061\u00EA\u0062" },
    { input: [ 0x61, 0x99, 0xD4, 0x62 ], expected: "\u0061\u8991\u0062" },
    { input: [ 0x61, 0x99, 0xD5, 0x62 ], expected: "\u0061\uD85E\uDD67\u0062" },
    { input: [ 0x61, 0x99, 0xD6, 0x62 ], expected: "\u0061\u8A29\u0062" },

    // Sequences expected to produce U+FFFD next to an ASCII character.
    { input: [ 0x80, 0x61 ], expected: "\uFFFD\u0061" },
    { input: [ 0xFF, 0x61 ], expected: "\uFFFD\u0061" },
    { input: [ 0xFE, 0x39 ], expected: "\uFFFD\u0039" },
    { input: [ 0x87, 0x66 ], expected: "\uFFFD\u0066" },
    { input: [ 0x81, 0x40 ], expected: "\uFFFD\u0040" },
    { input: [ 0x61, 0x81 ], expected: "\u0061\uFFFD" },
  ];

  cases.forEach(function(testCase, i) {
    testCharset({encoding: "big5", input: testCase.input, expected: testCase.expected,
                 msg: "decoder test #" + i + " for big5."});
  });
}
|
||||
|
@ -11,7 +11,7 @@ acp.932=Shift_JIS
|
||||
acp.936=gb18030
|
||||
acp.949=EUC-KR
|
||||
acp.950=Big5
|
||||
acp.951=Big5
|
||||
acp.951=Big5-HKSCS
|
||||
acp.1250=windows-1250
|
||||
acp.1251=windows-1251
|
||||
acp.1252=windows-1252
|
||||
|
@ -27,6 +27,7 @@ EXPORTS += [
|
||||
'ucvja/nsUCVJACID.h',
|
||||
'ucvko/nsUCvKOCID.h',
|
||||
'ucvlatin/nsUCvLatinCID.h',
|
||||
'ucvtw/nsUCvTWCID.h',
|
||||
]
|
||||
|
||||
UNIFIED_SOURCES += [
|
||||
@ -136,9 +137,10 @@ UNIFIED_SOURCES += [
|
||||
]
|
||||
|
||||
UNIFIED_SOURCES += [
|
||||
'ucvtw/nsBIG5Data.cpp',
|
||||
'ucvtw/nsBIG5HKSCSToUnicode.cpp',
|
||||
'ucvtw/nsBIG5ToUnicode.cpp',
|
||||
'ucvtw/nsUnicodeToBIG5.cpp',
|
||||
'ucvtw/nsUnicodeToBIG5HKSCS.cpp',
|
||||
]
|
||||
|
||||
UNIFIED_SOURCES += [
|
||||
|
@ -82,18 +82,14 @@ public:
|
||||
* @param aDestLength [IN/OUT] the length of the destination data buffer;
|
||||
* after conversion will contain the number of Unicode
|
||||
* characters written
|
||||
* @return NS_ERROR_UDEC_ILLEGALINPUT if an illegal input sequence
|
||||
* @return NS_PARTIAL_MORE_INPUT if only a partial conversion was
|
||||
* done; more input is needed to continue
|
||||
* NS_PARTIAL_MORE_OUTPUT if only a partial conversion
|
||||
* was done; more output space is needed to continue
|
||||
* NS_ERROR_ILLEGAL_INPUT if an illegal input sequence
|
||||
* was encountered and the behavior was set to "signal";
|
||||
* the caller must skip over one byte, reset the decoder
|
||||
* and retry.
|
||||
* NS_OK_UDEC_MOREOUTPUT if only a partial conversion
|
||||
* was done; more output space is needed to continue
|
||||
* NS_OK_UDEC_MOREINPUT if the input ended in the middle
|
||||
* of an input code unit sequence. If this is the last
|
||||
* result the caller has at the end of the stream, the
|
||||
* caller must append one U+FFFD to the output.
|
||||
* NS_OK if the input ended after a complete input code
|
||||
* unit sequence.
|
||||
*/
|
||||
NS_IMETHOD Convert(const char * aSrc, int32_t * aSrcLength,
|
||||
char16_t * aDest, int32_t * aDestLength) = 0;
|
||||
|
@ -96,12 +96,6 @@ public:
|
||||
* the first of a surrogate pair.
|
||||
* NS_ERROR_UENC_NOMAPPING if character without mapping
|
||||
* was encountered and the behavior was set to "signal".
|
||||
* In the case of an unmappable BMP character, aDestLength
|
||||
* must indicate that the unmappable character was
|
||||
* consumed by the encoder (unlike in the decode API!).
|
||||
* In the case of an unmappable astral character,
|
||||
* aDestLength must indicate that the high surrogate was
|
||||
* consumed by the encoder but the low surrogate was not.
|
||||
*/
|
||||
NS_IMETHOD Convert(const char16_t * aSrc, int32_t * aSrcLength,
|
||||
char * aDest, int32_t * aDestLength) = 0;
|
||||
|
@ -107,8 +107,12 @@
|
||||
#include "nsUnicodeToISO2022JP.h"
|
||||
|
||||
// ucvtw
|
||||
#include "nsUCvTWCID.h"
|
||||
#include "nsUCvTWDll.h"
|
||||
#include "nsBIG5ToUnicode.h"
|
||||
#include "nsUnicodeToBIG5.h"
|
||||
#include "nsBIG5HKSCSToUnicode.h"
|
||||
#include "nsUnicodeToBIG5HKSCS.h"
|
||||
|
||||
// ucvko
|
||||
#include "nsUCvKOCID.h"
|
||||
@ -180,6 +184,7 @@ NS_UCONV_REG_UNREG("EUC-JP", NS_EUCJPTOUNICODE_CID, NS_UNICODETOEUCJP_CID)
|
||||
|
||||
// ucvtw
|
||||
NS_UCONV_REG_UNREG("Big5", NS_BIG5TOUNICODE_CID, NS_UNICODETOBIG5_CID)
|
||||
NS_UCONV_REG_UNREG("Big5-HKSCS", NS_BIG5HKSCSTOUNICODE_CID, NS_UNICODETOBIG5HKSCS_CID)
|
||||
|
||||
// ucvko
|
||||
NS_UCONV_REG_UNREG("EUC-KR", NS_EUCKRTOUNICODE_CID, NS_UNICODETOEUCKR_CID)
|
||||
@ -209,8 +214,6 @@ NS_GENERIC_FACTORY_CONSTRUCTOR(nsISO2022JPToUnicodeV2)
|
||||
NS_GENERIC_FACTORY_CONSTRUCTOR(nsUnicodeToISO2022JP)
|
||||
|
||||
// ucvtw
|
||||
NS_GENERIC_FACTORY_CONSTRUCTOR(nsBIG5ToUnicode)
|
||||
NS_GENERIC_FACTORY_CONSTRUCTOR(nsUnicodeToBIG5)
|
||||
|
||||
// ucvko
|
||||
|
||||
@ -244,6 +247,23 @@ const uint16_t g_ASCIIMappingTable[] = {
|
||||
0x0001, 0x0004, 0x0005, 0x0008, 0x0000, 0x0000, 0x007F, 0x0000
|
||||
};
|
||||
|
||||
// ucvtw
|
||||
const uint16_t g_ufBig5Mapping[] = {
|
||||
#include "big5.uf"
|
||||
};
|
||||
|
||||
const uint16_t g_utBIG5Mapping[] = {
|
||||
#include "big5.ut"
|
||||
};
|
||||
|
||||
const uint16_t g_ufBig5HKSCSMapping[] = {
|
||||
#include "hkscs.uf"
|
||||
};
|
||||
|
||||
const uint16_t g_utBig5HKSCSMapping[] = {
|
||||
#include "hkscs.ut"
|
||||
};
|
||||
|
||||
// ucvko
|
||||
const uint16_t g_utKSC5601Mapping[] = {
|
||||
#include "u20kscgl.ut"
|
||||
@ -357,6 +377,8 @@ NS_DEFINE_NAMED_CID(NS_UNICODETOEUCJP_CID);
|
||||
NS_DEFINE_NAMED_CID(NS_UNICODETOISO2022JP_CID);
|
||||
NS_DEFINE_NAMED_CID(NS_UNICODETOBIG5_CID);
|
||||
NS_DEFINE_NAMED_CID(NS_BIG5TOUNICODE_CID);
|
||||
NS_DEFINE_NAMED_CID(NS_UNICODETOBIG5HKSCS_CID);
|
||||
NS_DEFINE_NAMED_CID(NS_BIG5HKSCSTOUNICODE_CID);
|
||||
NS_DEFINE_NAMED_CID(NS_EUCKRTOUNICODE_CID);
|
||||
NS_DEFINE_NAMED_CID(NS_UNICODETOEUCKR_CID);
|
||||
NS_DEFINE_NAMED_CID(NS_GBKTOUNICODE_CID);
|
||||
@ -459,6 +481,8 @@ static const mozilla::Module::CIDEntry kUConvCIDs[] = {
|
||||
{ &kNS_UNICODETOISO2022JP_CID, false, nullptr, nsUnicodeToISO2022JPConstructor },
|
||||
{ &kNS_UNICODETOBIG5_CID, false, nullptr, nsUnicodeToBIG5Constructor },
|
||||
{ &kNS_BIG5TOUNICODE_CID, false, nullptr, nsBIG5ToUnicodeConstructor },
|
||||
{ &kNS_UNICODETOBIG5HKSCS_CID, false, nullptr, nsUnicodeToBIG5HKSCSConstructor },
|
||||
{ &kNS_BIG5HKSCSTOUNICODE_CID, false, nullptr, nsBIG5HKSCSToUnicodeConstructor },
|
||||
{ &kNS_EUCKRTOUNICODE_CID, false, nullptr, nsCP949ToUnicodeConstructor },
|
||||
{ &kNS_UNICODETOEUCKR_CID, false, nullptr, nsUnicodeToCP949Constructor },
|
||||
{ &kNS_GBKTOUNICODE_CID, false, nullptr, nsGB18030ToUnicodeConstructor },
|
||||
@ -563,6 +587,8 @@ static const mozilla::Module::ContractIDEntry kUConvContracts[] = {
|
||||
{ NS_UNICODEENCODER_CONTRACTID_BASE "ISO-2022-JP", &kNS_UNICODETOISO2022JP_CID },
|
||||
{ NS_UNICODEENCODER_CONTRACTID_BASE "Big5", &kNS_UNICODETOBIG5_CID },
|
||||
{ NS_UNICODEDECODER_CONTRACTID_BASE "Big5", &kNS_BIG5TOUNICODE_CID },
|
||||
{ NS_UNICODEENCODER_CONTRACTID_BASE "Big5-HKSCS", &kNS_UNICODETOBIG5HKSCS_CID },
|
||||
{ NS_UNICODEDECODER_CONTRACTID_BASE "Big5-HKSCS", &kNS_BIG5HKSCSTOUNICODE_CID },
|
||||
{ NS_UNICODEDECODER_CONTRACTID_BASE "EUC-KR", &kNS_EUCKRTOUNICODE_CID },
|
||||
{ NS_UNICODEENCODER_CONTRACTID_BASE "EUC-KR", &kNS_UNICODETOEUCKR_CID },
|
||||
{ NS_UNICODEDECODER_CONTRACTID_BASE "gbk", &kNS_GBKTOUNICODE_CID },
|
||||
|
@ -12,4 +12,3 @@ skip-if = toolkit == 'android' #bug 775227
|
||||
[test_unicode_noncharacters_gb18030.html]
|
||||
[test_unicode_noncharacters_utf8.html]
|
||||
[test_utf8_overconsumption.html]
|
||||
[test_big5_encoder.html]
|
||||
|
@ -1,43 +0,0 @@
|
||||
<!DOCTYPE HTML>
|
||||
<html>
|
||||
<!--
|
||||
https://bugzilla.mozilla.org/show_bug.cgi?id=912470
|
||||
-->
|
||||
<head>
|
||||
<meta http-equiv="Content-type" content="text/html; charset=UTF-8">
|
||||
<title>Test for Unicode non-characters</title>
|
||||
<script type="text/javascript" src="/tests/SimpleTest/SimpleTest.js"></script>
|
||||
<link rel="stylesheet" type="text/css" href="/tests/SimpleTest/test.css" />
|
||||
</head>
|
||||
<body onload="test()">
|
||||
<pre id="test">
|
||||
<script class="testbody" type="text/javascript">
|
||||
/* NOTE:
|
||||
* When we make our data: URL origin work as in Blink, this test will fail.
|
||||
* Hopefully, by that time our URL parser has become spec-compliant, so that
|
||||
* we'll pass the Web Platform Test for the big5 encoder
|
||||
* (testing/web-platform/tests/encoding/big5-encoder.html) and this test can
|
||||
* simply be removed.
|
||||
*/
|
||||
SimpleTest.waitForExplicitFinish();
|
||||
|
||||
// Submits the form held in the page's first iframe and, once the iframe has
// loaded the resulting URL, compares the encoded value of the "foo" field in
// that URL against the expected percent-encoded string.
function test() {
  // The field value is everything after this marker in the submitted URL.
  var QUERY_MARKER = "?foo=";
  var f = document.getElementsByTagName("iframe")[0];
  f.onload = function() {
    var href = f.contentWindow.location.href;
    var index = href.indexOf(QUERY_MARKER);
    var actual = href.substring(index + QUERY_MARKER.length);
    var expected = "h%26%2340614%3Bi%26%23156267%3Bj%A1%40k%A3%E1l%A4%40m%C8%A4n%C8%CDo%FE%FEp%26%238365%3Bq%FDjr%F9%F9s%26%23128169%3Bt";
    is(actual, expected, "Should have gotten the expected encode.");
    SimpleTest.finish();
  };
  // The onload handler is installed first so the load caused by this
  // submission is the one it observes.
  f.contentDocument.forms[0].submit();
}
|
||||
</script>
|
||||
</pre>
|
||||
<a target="_blank" href="https://bugzilla.mozilla.org/show_bug.cgi?id=912470">Mozilla Bug 912470</a>
|
||||
<p id="display"></p>
|
||||
<div id="content" style="display: none"><iframe src="data:text/html;charset=big5,<form><input name=foo value=h&%23x9EA6;i&%23x2626B;j&%23x3000;k&%23x20AC;l&%23x4E00;m&%23x27607;n&%23xFFE2;o&%23x79D4;p&%23x20AD;q&%23x203B5;r&%23x2550;s&%23x1F4A9;t></form>">
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
@ -11,6 +11,7 @@ function run_test() {
|
||||
// this list excludes codepages that can represent all Unicode
|
||||
var encoders = [
|
||||
"Big5",
|
||||
"Big5-HKSCS",
|
||||
"EUC-JP",
|
||||
"EUC-KR",
|
||||
"gbk",
|
||||
|
@ -1,253 +0,0 @@
|
||||
#!/usr/bin/python
|
||||
|
||||
# This Source Code Form is subject to the terms of the Mozilla Public
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
# Adapted from
|
||||
# https://hg.mozilla.org/projects/htmlparser/file/0d906fb1ab90/generate-encoding-data.py
|
||||
|
||||
# indexes.json comes from
|
||||
# https://encoding.spec.whatwg.org/indexes.json
|
||||
# i.e.
|
||||
# https://github.com/whatwg/encoding/blob/ce4e83d0df5b5efec0697fc76e66699737e033a3/indexes.json
|
||||
|
||||
import json
|
||||
|
||||
indexes = json.load(open("indexes.json", "r"))
|
||||
|
||||
def nullToZero(codePoint):
  """Return codePoint unchanged, or 0 when it is falsy (e.g. None).

  Used so that entries without a code point in the loaded index become the
  integer 0 instead of None.
  """
  if not codePoint:
    codePoint = 0
  return codePoint
|
||||
|
||||
index = []
|
||||
|
||||
for codePoint in indexes["big5"]:
|
||||
index.append(nullToZero(codePoint))
|
||||
|
||||
# There are four major gaps consisting of more than 4 consecutive invalid pointers
|
||||
gaps = []
|
||||
consecutive = 0
|
||||
consecutiveStart = 0
|
||||
offset = 0
|
||||
for codePoint in index:
|
||||
if codePoint == 0:
|
||||
if consecutive == 0:
|
||||
consecutiveStart = offset
|
||||
consecutive +=1
|
||||
else:
|
||||
if consecutive > 4:
|
||||
gaps.append((consecutiveStart, consecutiveStart + consecutive))
|
||||
consecutive = 0
|
||||
offset += 1
|
||||
|
||||
def invertRanges(ranges, cap):
  """Return the complement of `ranges` within [0, cap).

  `ranges` is a list of (start, end) pairs in ascending order; the result is
  the list of (start, end) pairs lying between them, ending with the stretch
  from the last range's end up to `cap`. A range that starts at 0 produces no
  leading entry. The final (lastEnd, cap) pair is always appended, even when
  it is empty.
  """
  inverted = []
  invertStart = 0
  for (start, end) in ranges:
    if start != 0:
      inverted.append((invertStart, start))
    # The next complement stretch begins where this range stops.
    invertStart = end
  inverted.append((invertStart, cap))
  return inverted
|
||||
|
||||
cap = len(index)
|
||||
ranges = invertRanges(gaps, cap)
|
||||
|
||||
# Now compute a compressed lookup table for astralness
|
||||
|
||||
gaps = []
|
||||
consecutive = 0
|
||||
consecutiveStart = 0
|
||||
offset = 0
|
||||
for codePoint in index:
|
||||
if codePoint <= 0xFFFF:
|
||||
if consecutive == 0:
|
||||
consecutiveStart = offset
|
||||
consecutive +=1
|
||||
else:
|
||||
if consecutive > 40:
|
||||
gaps.append((consecutiveStart, consecutiveStart + consecutive))
|
||||
consecutive = 0
|
||||
offset += 1
|
||||
|
||||
astralRanges = invertRanges(gaps, cap)
|
||||
|
||||
|
||||
classFile = open("../ucvtw/nsBIG5Data.cpp", "w")
|
||||
classFile.write('''/* This Source Code Form is subject to the terms of the Mozilla Public
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
|
||||
|
||||
/*
|
||||
* THIS IS A GENERATED FILE. PLEASE DO NOT EDIT.
|
||||
* Instead, please regenerate using intl/uconv/tools/gen-big5-data.py
|
||||
*/
|
||||
|
||||
#include "nsBIG5Data.h"
|
||||
|
||||
static const char16_t kBig5LowBitsTable[] = {
|
||||
''')
|
||||
|
||||
for (low, high) in ranges:
|
||||
for i in xrange(low, high):
|
||||
classFile.write(' 0x%04X,\n' % (index[i] & 0xFFFF))
|
||||
|
||||
classFile.write('''};
|
||||
|
||||
static const uint32_t kBig5AstralnessTable[] = {
|
||||
''')
|
||||
|
||||
# An array of bool is inefficient per
|
||||
# http://stackoverflow.com/questions/4049156/1-bit-per-bool-in-array-c
|
||||
|
||||
bits = []
|
||||
for (low, high) in astralRanges:
|
||||
for i in xrange(low, high):
|
||||
bits.append(1 if index[i] > 0xFFFF else 0)
|
||||
# pad length to multiple of 32
|
||||
for i in xrange(32 - (len(bits) % 32)):
|
||||
bits.append(0)
|
||||
i = 0
|
||||
while i < len(bits):
|
||||
accu = 0
|
||||
for j in xrange(32):
|
||||
accu |= bits[i + j] << j
|
||||
classFile.write(' 0x%08X,\n' % accu)
|
||||
i += 32
|
||||
|
||||
classFile.write('''};
|
||||
|
||||
// static
|
||||
char16_t
|
||||
nsBIG5Data::LowBits(size_t aPointer)
|
||||
{
|
||||
''')
|
||||
|
||||
base = 0
|
||||
for (low, high) in ranges:
|
||||
classFile.write(''' if (aPointer < %d) {
|
||||
return 0;
|
||||
}
|
||||
if (aPointer < %d) {
|
||||
return kBig5LowBitsTable[%d + (aPointer - %d)];
|
||||
}
|
||||
''' % (low, high, base, low))
|
||||
base += (high - low)
|
||||
|
||||
classFile.write(''' return 0;
|
||||
}
|
||||
|
||||
// static
|
||||
bool
|
||||
nsBIG5Data::IsAstral(size_t aPointer)
|
||||
{
|
||||
''')
|
||||
|
||||
base = 0
|
||||
for (low, high) in astralRanges:
|
||||
if high - low == 1:
|
||||
classFile.write(''' if (aPointer < %d) {
|
||||
return false;
|
||||
}
|
||||
if (aPointer == %d) {
|
||||
return true;
|
||||
}
|
||||
''' % (low, low))
|
||||
else:
|
||||
classFile.write(''' if (aPointer < %d) {
|
||||
return false;
|
||||
}
|
||||
if (aPointer < %d) {
|
||||
size_t index = %d + (aPointer - %d);
|
||||
return kBig5AstralnessTable[index >> 5] & (1 << (index & 0x1F));
|
||||
}
|
||||
''' % (low, high, base, low))
|
||||
base += (high - low)
|
||||
|
||||
classFile.write(''' return false;
|
||||
}
|
||||
|
||||
//static
|
||||
size_t
|
||||
nsBIG5Data::FindPointer(char16_t aLowBits, bool aIsAstral)
|
||||
{
|
||||
if (!aIsAstral) {
|
||||
switch (aLowBits) {
|
||||
''')
|
||||
|
||||
hkscsBound = (0xA1 - 0x81) * 157
|
||||
|
||||
preferLast = [
|
||||
0x2550,
|
||||
0x255E,
|
||||
0x2561,
|
||||
0x256A,
|
||||
0x5341,
|
||||
0x5345,
|
||||
]
|
||||
|
||||
for codePoint in preferLast:
|
||||
# Python lists don't have .rindex() :-(
|
||||
for i in xrange(len(index) - 1, -1, -1):
|
||||
candidate = index[i]
|
||||
if candidate == codePoint:
|
||||
classFile.write(''' case 0x%04X:
|
||||
return %d;
|
||||
''' % (codePoint, i))
|
||||
break
|
||||
|
||||
classFile.write(''' default:
|
||||
break;
|
||||
}
|
||||
}''')
|
||||
|
||||
base = 0
|
||||
start = 0
|
||||
for (low, high) in ranges:
|
||||
if low <= hkscsBound and hkscsBound < high:
|
||||
# This is the first range we don't ignore and the
|
||||
# range that contains the first non-HKSCS pointer.
|
||||
# Avoid searching HKSCS.
|
||||
start = base + hkscsBound - low
|
||||
break
|
||||
base += (high - low)
|
||||
|
||||
classFile.write('''
|
||||
for (size_t i = %d; i < MOZ_ARRAY_LENGTH(kBig5LowBitsTable); ++i) {
|
||||
if (kBig5LowBitsTable[i] == aLowBits) {
|
||||
size_t pointer;
|
||||
''' % start)
|
||||
|
||||
base = 0
|
||||
prevLow = 0
|
||||
prevHigh = 0
|
||||
prevBase = 0
|
||||
writing = False
|
||||
for (low, high) in ranges:
|
||||
if writing:
|
||||
classFile.write('''if (i < %d) {
|
||||
pointer = i + %d;
|
||||
} else ''' % ((prevBase + prevHigh - prevLow), (prevLow - prevBase)))
|
||||
prevLow = low
|
||||
prevHigh = high
|
||||
prevBase = base
|
||||
if high > hkscsBound:
|
||||
writing = True
|
||||
base += (high - low)
|
||||
|
||||
classFile.write('''{
|
||||
pointer = i + %d;
|
||||
}''' % (prevLow - prevBase))
|
||||
|
||||
classFile.write('''
|
||||
if (aIsAstral == IsAstral(pointer)) {
|
||||
return pointer;
|
||||
}
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
''')
|
||||
classFile.close()
|
959
intl/uconv/tools/gen-big5hkscs-2001-mozilla.pl
Normal file
959
intl/uconv/tools/gen-big5hkscs-2001-mozilla.pl
Normal file
@ -0,0 +1,959 @@
|
||||
#!/usr/bin/perl -w
|
||||
#
|
||||
# gen-big5hkscs-2001-mozilla.pl
|
||||
# a Perl script that generates Big5-HKSCS <-> Unicode
|
||||
# conversion tables for Mozilla
|
||||
#
|
||||
# Author (of the original Perl script):
|
||||
# Anthony Fok <anthony@thizlinux.com> <foka@debian.org>
|
||||
# Copyright (C) 2001, 2002 ThizLinux Laboratory Ltd.
|
||||
# License: GNU General Public License, v2 or later.
|
||||
#
|
||||
# This version includes original C source code from
|
||||
# glibc-2.2.5/iconvdata/big5hkscs.c by Ulrich Drepper <drepper@redhat.com>
|
||||
# Roger So <roger.so@sw-linux.com>
|
||||
#
|
||||
# First attempt for Qt-2.3.x: 2001-09-21
|
||||
# A working version for Qt-2.3.x: 2001-10-30
|
||||
# Ported to glibc-2.2.5 with HKSCS-2001: 2002-03-21
|
||||
# Adapted to generate conversion tables for Mozilla: 2002-11-26
|
||||
# Adapted to generate conversion tables for Mozilla: 2002-11-30
|
||||
# Cleaned up the script somewhat: 2002-12-04
|
||||
# Minor revisions for submitting to Mozilla Bugzilla: 2002-12-10
|
||||
#
|
||||
# Notes:
|
||||
#
|
||||
# 1. The latest version of this script may be found in:
|
||||
# http://www.thizlinux.com/~anthony/hkscs/gen-glibc-big5hkscs.pl
|
||||
# http://people.debian.org/~foka/hkscs/gen-glibc-big5hkscs.pl
|
||||
# Or, better yet, e-mail me and ask for the latest version.
|
||||
#
|
||||
# 2. This script generates data from 3 tables:
|
||||
# a. http://www.microsoft.com/typography/unicode/950.txt
|
||||
# b. http://www.info.gov.hk/digital21/chi/hkscs/download/big5-iso.txt
|
||||
# c. http://www.info.gov.hk/digital21/chi/hkscs/download/big5cmp.txt
|
||||
#
|
||||
# Make sure your big5-iso.txt is the latest HKSCS-2001 version.
|
||||
#
|
||||
# 3. [glibc]: I have currently split the ucs_to_big5_hkscs_?[] tables into
|
||||
# different areas similar to the way Ulrich and Roger did it,
|
||||
# but extended for HKSCS-2001.
|
||||
#
|
||||
# 4. [Mozilla]: This script is very quick-and-dirty in some places.
|
||||
# Call either gen_mozilla_uf() or gen_mozilla_ut() to generate
|
||||
# the appropriate tables for feeding into "fromu" or "tou".
|
||||
#
|
||||
# 5. [CharMapML]: The comments regarding TW-BIG5 herein need to be organized.
|
||||
# Also, please make sure "$hkscs_mode = 0;" for TW-BIG5 mode.
|
||||
# Otherwise, this script would generate a HKSCS table.
|
||||
# (Yes, I know, I should clean up this script and make it more modular,
|
||||
# and with command-line options or whatnot. I'll do that later. :-)
|
||||
#
|
||||
# If you have any questions or concerns, please feel free to contact me
|
||||
# at Anthony Fok <anthony@thizlinux.com> or <foka@debian.org> :-)
|
||||
#
|
||||
# Last but not least, special thanks to ThizLinux Laboratory Ltd. (HK)
|
||||
# for their generous support in this work.
|
||||
#
|
||||
|
||||
# 1. UDA3, 0x8840 - 0x8dfe
|
||||
# 2. UDA2, 0x8e40 - 0xa0fe
|
||||
# 3. VDA, 0xc6a1 - 0xc8fe
|
||||
|
||||
#use Getopt::Std;
|
||||
|
||||
my ( %b2u, %u2b, $unicode, $big5, $high, $low, $i, $count );
|
||||
|
||||
my $debug = 0;
|
||||
my $hkscs_mode = 1;
|
||||
my $kangxi = 0;
|
||||
my $use_range = 0;
|
||||
my $bmp_only = 1;
|
||||
|
||||
#
|
||||
# Subroutine Declaration
|
||||
#
|
||||
sub read_cp950();
|
||||
sub adjust_radicals();
|
||||
sub read_hkscs_main();
|
||||
sub read_hkscs_cmp();
|
||||
sub post_tuning();
|
||||
sub gen_charmapml();
|
||||
sub gen_check_b2u();
|
||||
sub gen_check_u2b();
|
||||
sub gen_mozilla_uf();
|
||||
sub gen_mozilla_ut();
|
||||
sub gen_glibc();
|
||||
|
||||
###########################################################################
|
||||
#
|
||||
# Main program
|
||||
#
|
||||
|
||||
# First, read Microsoft's CP950 as base Big5.
|
||||
read_cp950 ();
|
||||
|
||||
# Add mappings to Kangxi Radicals.
|
||||
# The b2u direction is added only if $kangxi is not null.
|
||||
adjust_radicals ();
|
||||
|
||||
# Then, read the HKSCS table.
|
||||
# Again, see the $hkscs_mode variable.
|
||||
read_hkscs_main ();
|
||||
read_hkscs_cmp () if $hkscs_mode;
|
||||
|
||||
post_tuning ();
|
||||
|
||||
|
||||
# Then, choose one of the following:
|
||||
#gen_charmapml();
|
||||
gen_mozilla_uf();
|
||||
#gen_mozilla_ut();
|
||||
#gen_check_u2b();
|
||||
#gen_glibc();
|
||||
|
||||
|
||||
# End of program
|
||||
exit 0;
|
||||
|
||||
|
||||
#############################################################################
|
||||
#
|
||||
# Subroutines
|
||||
#
|
||||
|
||||
# Reads "950.txt" and fills the file-level %b2u (Big5 -> Unicode) and %u2b
# (Unicode -> Big5) tables from it.
#
# The file is processed line by line until an ENDCODEPAGE line:
#   - a "DBCSTABLE <n> ;LeadByte = 0x<hh>" header announces <n> rows of
#     "<trail byte>\t<unicode>" for lead byte <hh>; these go into %b2u.
#   - a "WCTABLE <n>" header announces <n> rows of "<unicode>\t<big5>";
#     these go into %u2b, except for the mappings filtered out below.
# Keys and values are stored as upper-case hex strings without "0x".
#
# Uses the file-level scratch variables $count, $high, $low, $unicode, $big5
# and $i, as the rest of the script does.
sub read_cp950() {
    open( CP950, "950.txt" ) or die "Cannot open 950.txt: $!";
    my $mode = 0;    # 0: outside any table, 1: in a DBCSTABLE, 2: in the WCTABLE
    while (<CP950>) {
        s/\r//;
        chomp;
        next if /^$/;
        last if /^ENDCODEPAGE/;

        if (/^DBCSTABLE (\d+)\s+;LeadByte = 0x([0-9a-f]{2})/) {
            $mode = 1;
            ( $count, $high ) = ( $1, $2 );
            $i = 0;
            next;
        }
        if (/^WCTABLE (\d+)/) {
            $mode  = 2;
            $count = $1;
            $i     = 0;
            next;
        }
        next if $mode == 0;

        if ( $mode == 1 ) {
            ( $low, $unicode, $comment ) = split "\t";
            $low     =~ s/^0x//;
            $unicode =~ s/^0x//;
            $big5 = $high . $low;
            $b2u{ uc($big5) } = uc($unicode);

            # Leave table mode once the announced number of rows was read.
            if ( ++$i == $count ) { $mode = 0; $count = 0; next; }
        }
        elsif ( $mode == 2 ) {
            ( $unicode, $big5, $comment ) = split "\t";
            $unicode =~ s/^0x//;
            $big5    =~ s/^0x//;
            my $u = hex($unicode);
            my $b = hex($big5);

            $u2b{ uc($unicode) } = uc($big5) unless

              # Skip Microsoft's over-generous (or over-zealous?) mappings
              # "Faked" accented latin characters
              ( $b <= 0xFF and $b != $u )

              # "Faked" Ideographic Annotation ___ Mark
              or ( $u >= 0x3192 and $u <= 0x319F )

              # "Faked" Parenthesized Ideograph ___
              or ( $u >= 0x3220 and $u <= 0x3243 )

              # "Faked" Circled Ideograph ___ except Circled Ideograph Correct
              or ( $u >= 0x3280 and $u <= 0x32B0 and $u != 0x32A3 )

              # ¢£¥’μ﹐
              or ( $u == 0xA2
                or $u == 0xA3
                or $u == 0xA5
                or $u == 0xB4
                or $u == 0xB5
                or $u == 0xB8 )

              # ¯─∥‧˙〃 ̄﹨°≡︴⊙⊕~﹋
              or ( $u == 0x0305    # ???
                or $u == 0x2015
                or $u == 0x2016
                or $u == 0x2022
                or $u == 0x2024
                or $u == 0x2033
                or $u == 0x203E    # ???
                or $u == 0x2216
                or $u == 0x2218
                or $u == 0x2263
                or $u == 0x2307
                or $u == 0x2609
                or $u == 0x2641
                or $u == 0x301C
                or $u == 0x3030 )

              # ︿‘﹑
              or ( $u == 0xFF3E or $u == 0xFF40 or $u == 0xFF64 );

            if ( ++$i == $count ) { $mode = 0; $count = 0; next; }
        }
    }

    # Release the handle, as the other table readers in this script do.
    close CP950;
}
|
||||
|
||||
# Adds the mappings between B5+C6BF..B5+C6D7 and the Kangxi Radicals block.
#
# The Unicode -> Big5 direction (%u2b) is always added; the Big5 -> Unicode
# direction (%b2u) is added only when $kangxi is true.
sub adjust_radicals() {

    # B5+C6BF - B5+C6D7: Radicals (?)

    # TW-BIG5 drafted by Autrijus uses Kangxi Radicals whenever possible.
    #
    # Big5-HKSCS tends towards using the character in Unicode CJK Ideographs
    # Note that HKSCS does not explicitly define
    # B5+C6CF, B5+C6D3, B5+C6D5, B5+C6D7 (廴、无、癶、隶),
    # but do have these characters at B5+FBFD, B5+FCD3, B5+FEC1, B5+90C4,
    # mapped to U+5EF4, U+65E0, U+7676, U+96B6 respectively.
    #
    # As for B5+C6CD (⼳), HKSCS maps it to U+2F33 just like TW-BIG5.
    # However, it also maps B5+FBF4 (幺) to U+5E7A.

    # [ Big5 code, Kangxi radical code point ]
    my @radicals = (
        [ "C6BF", "2F02" ],    # 丶
        [ "C6C0", "2F03" ],    # 丿
        [ "C6C1", "2F05" ],    # 亅
        [ "C6C2", "2F07" ],    # 亠
        [ "C6C3", "2F0C" ],    # 冂
        [ "C6C4", "2F0D" ],    # 冖
        [ "C6C5", "2F0E" ],    # 冫
        [ "C6C6", "2F13" ],    # 勹
        [ "C6C7", "2F16" ],    # 匸
        [ "C6C8", "2F19" ],    # 卩
        [ "C6C9", "2F1B" ],    # 厶
        [ "C6CA", "2F22" ],    # 夊
        [ "C6CB", "2F27" ],    # 宀
        [ "C6CC", "2F2E" ],    # 巛
        [ "C6CD", "2F33" ],    # ⼳
        [ "C6CE", "2F34" ],    # 广
        [ "C6CF", "2F35" ],    # 廴
        [ "C6D0", "2F39" ],    # 彐
        [ "C6D1", "2F3A" ],    # 彡
        [ "C6D2", "2F41" ],    # 攴
        [ "C6D3", "2F46" ],    # 无
        [ "C6D4", "2F67" ],    # 疒
        [ "C6D5", "2F68" ],    # 癶
        [ "C6D6", "2FA1" ],    # 辵
        [ "C6D7", "2FAA" ],    # 隶
    );

    for my $pair (@radicals) {
        my ( $code, $radical ) = @$pair;
        $b2u{$code}    = $radical if $kangxi;
        $u2b{$radical} = $code;
    }
}
|
||||
|
||||
# Reads "big5-iso.txt" and merges its mappings into %b2u and %u2b.
#
# Each data line carries four hex fields: the Big5 code followed by three
# Unicode columns (labelled 1993, 2000 and 2001 in the debug output below).
# Lines that do not match that shape are ignored.
#
#   - %b2u{big5} is set from the 2000 column when $bmp_only is true, otherwise
#     from the 2001 column (except B5+F9FE outside HKSCS mode).
#   - %u2b is set for all three Unicode columns, in the order 1993, 2000, 2001.
#
# All fields are upper-cased before use so the keys agree with those written
# by read_cp950() and read_hkscs_cmp().
sub read_hkscs_main() {

    open( B2U, "<big5-iso.txt" ) or die "Cannot open big5-iso.txt: $!";
    while (<B2U>) {
        next
          unless
/([[:xdigit:]]{4})\s+([[:xdigit:]]{4})\s+([[:xdigit:]]{4})\s+([[:xdigit:]]{4,5})/;
        ( $big5, $iso1993, $iso2000, $iso2001 ) =
          ( uc($1), uc($2), uc($3), uc($4) );

        my $b = hex($big5);

        # For non-HKSCS mode, only take data in the VDA range (?)
        next unless $hkscs_mode

          # Note that we don't go from B5+C6A1-B5+C6FE, but rather only
          # C6A1-C8D3 excluding C6BF-C6D7 (Kangxi Radicals)
          # because C8D4-C8FE are not assigned in TW-BIG5
          # if we are to follow Arphic PL Big-5 fonts. (To be discussed)
          or
          ( $b >= 0xC6A1 && $b <= 0xC8D3 and !( $b >= 0xC6BF && $b <= 0xC6D7 ) )
          or ( $b >= 0xF9D6 && $b <= 0xF9FE );

        print STDERR
          "B2U, 2000: $big5 redefined from U+$b2u{$big5} to U+$iso2000.\n"
          if $debug
          and defined( $b2u{$big5} )
          and $b2u{$big5} ne $iso2000;

        $b2u{$big5} = $bmp_only ? $iso2000 : $iso2001
          unless !$hkscs_mode
          and $b == 0xF9FE;

        # B5+F9FE is mapped differently in TW-BIG5 and HKSCS, to
        # U+2593 (Dark Shade) and U+FFED (Halfwidth Black Square) respectively.
        # Which is more correct?  I don't know!  (To be discussed)

        # Register the reverse mapping for every Unicode column; a later
        # column overrides an earlier one when they name the same code point.
        for my $column (
            [ "1993", $iso1993 ],
            [ "2000", $iso2000 ],
            [ "2001", $iso2001 ]
          )
        {
            my ( $label, $ucs ) = @$column;

            print STDERR
              "$label: U+$ucs redefined from $u2b{$ucs} to $big5.\n"
              if $debug
              and defined( $u2b{$ucs} )
              and $u2b{$ucs} ne $big5;

            $u2b{$ucs} = $big5;
        }
    }
    close B2U;

}    # read_hkscs_main()
|
||||
|
||||
|
||||
# Reads "big5cmp.txt" and applies the Big5 compatibility coding to %b2u/%u2b.
#
# Data rows start after the first line beginning with "=====" and stop at the
# first following line that begins with whitespace. Each row is
# "<compat big5> <big5>"; for every row
#   - the compatibility code is re-pointed to the Unicode value of <big5>, and
#   - the Unicode value the compatibility code used to have is re-pointed to
#     <big5>.
sub read_hkscs_cmp() {

    ###########################################################################
    # Add Big5 compatibility coding...
    #
    # Stephan, here is the code segment that you may want to implement
    # in your convertbig5hkscs2001.pl
    #
    open( B5CMP, "<big5cmp.txt" ) or die "Cannot open big5cmp.txt: $!";

    # Lexical, so it cannot leak into (or be clobbered by) other subs.
    my $mode = 0;
    while (<B5CMP>) {
        if (/^=====/) { $mode = 1; next; }
        next if $mode == 0;
        last if $mode == 1 and /^\s+/;
        chomp;
        my ( $big5cmp, $big5 ) = split " ";

        $big5cmp = uc($big5cmp);
        $big5    = uc($big5);
        my $uni    = $b2u{$big5};
        my $unicmp = $b2u{$big5cmp};

        print STDERR
          "Was: U+$unicmp -> $u2b{$unicmp}, $big5cmp -> U+$b2u{$big5cmp}\t"
          if $debug;
        $b2u{$big5cmp} = $uni;

        # Without a previous mapping for the compatibility code there is no
        # Unicode value to re-point; skip it rather than create an entry
        # under an undefined key.
        $u2b{$unicmp} = $big5 if defined $unicmp;
        print STDERR
          "Now: U+$unicmp -> $u2b{$unicmp}, $big5cmp -> U+$b2u{$big5cmp}\n"
          if $debug;
    }
    close B5CMP;
}    # read_hkscs_cmp();
|
||||
|
||||
|
||||
sub post_tuning() {

    # Hand-made corrections applied to %b2u / %u2b once the source tables
    # have been loaded.  $hkscs_mode selects Big5-HKSCS (true) or TW-BIG5
    # (false) behaviour.

    # Bytes 0x00 .. 0x80 decode to the code point of the same value.
    foreach my $byte ( 0x00 .. 0x80 ) {
        $unicode    = sprintf( "%04X", $byte );
        $big5       = $unicode;
        $b2u{$big5} = $unicode;
    }

    # Add Euro '€' (I wonder why this 950.txt doesn't have it.)
    ( $b2u{"A3E1"}, $u2b{"20AC"} ) = ( "20AC", "A3E1" );

    if ( !$hkscs_mode ) {

        # Box drawing characters:
        # Align with Big-5E (To be discussed, as it differs from CP950 and
        # HKSCS).  Only the Unicode -> Big5 direction is changed.
        my @box_drawing = (
            [ "2550", "A2A4" ],    # Big5: ═ (also B5-F9F9)
            [ "255E", "A2A5" ],    # Big5: ╞ (also B5-F9E9)
            [ "2561", "A2A7" ],    # Big5: ╡ (also B5-F9EB)
            [ "256A", "A2A6" ],    # Big5: ╪ (also B5-F9EA)
            [ "256D", "A27E" ],    # Big5: ╭ (also B5-F9FA)
            [ "256E", "A2A1" ],    # Big5: ╮ (also B5-F9FB)
            [ "256F", "A2A3" ],    # Big5: ╯ (also B5-F9FD)
            [ "2570", "A2A2" ],    # Big5: ╰ (also B5-F9FC)
        );
        foreach my $pair (@box_drawing) {
            my ( $ucs, $code ) = @$pair;
            $u2b{$ucs} = $code;
        }

        # "Hangzhou" or "Suzhou" Chinese numerals 10, 20, 30 (十卄卅)
        # (To be discussed).  Mapped in both directions.
        my @suzhou_numerals = (
            [ "A2CC", "3038" ],
            [ "A2CD", "3039" ],
            [ "A2CE", "303A" ],
        );
        foreach my $pair (@suzhou_numerals) {
            my ( $code, $ucs ) = @$pair;
            $b2u{$code} = $ucs;
            $u2b{$ucs}  = $code;
        }
    }

    # The character for ethnic group "Yi" (彝):
    # (To be discussed)
    $u2b{"5F5E"} = "C255";    # Always add this.
    if ( !$hkscs_mode ) {
        $b2u{"C255"} = "5F5E";
    }

}    # post_tuning()
|
||||
|
||||
|
||||
# Format a Big5 code for a CharMapML "b" attribute.  Codes are hex strings;
# single-byte codes are stored as "00XX" and are written as one byte ("XX"),
# everything else as two space-separated bytes ("HH LL").
sub _charmapml_bytes {
    my ($code) = @_;
    return sprintf( "%02X", hex($code) ) if hex($code) <= 0xFF;
    return substr( $code, 0, 2 ) . " " . substr( $code, 2, 2 );
}

sub gen_charmapml() {

    ###########################################################################
    #
    # Codes for generating CharMapML XML file
    #
    # Prints a complete CharMapML document to STDOUT from the globals
    # %u2b (Unicode -> Big5) and %b2u (Big5 -> Unicode):
    #   <a>    pairs that agree in both directions,
    #   <fub>  Unicode -> Big5 entries with no matching reverse entry,
    #   <fbu>  Big5 -> Unicode entries with no matching reverse entry,
    #   <range> the three user-defined areas, only when $use_range is set
    #          and $hkscs_mode is not.
    # $hkscs_mode also selects the characterMapping id.

    print <<EOT;
<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE characterMapping SYSTEM "http://www.unicode.org/unicode/reports/tr22/CharacterMapping.dtd">
EOT

    # The two headers differ only in the mapping id.
    my $mapping_id = $hkscs_mode ? "big5-hkscs-2001" : "tw-big5-2002";
    print <<EOT;
<characterMapping id="$mapping_id" version="1">
<history>
<modified version="1" date="2002-11-30">
Trial version generated from 950.txt + part of big5-iso.txt (HKSCS-2001)
with Euro added, with CP950's excessive fub (fallbacks uni->big5) removed,
and with some other manual tweaking.
</modified>
</history>
EOT

    print <<EOT;
<validity>
<state type="FIRST" next="VALID" s="0" e="80" max="FFFF"/>
<state type="FIRST" next="SECOND" s="81" e="FE" max="FFFF"/>
<state type="SECOND" next="VALID" s="40" e="7E" max="FFFF"/>
<state type="SECOND" next="VALID" s="A1" e="FE" max="FFFF"/>
</validity>
<assignments sub="3F">
EOT
    print " <!-- One to one mappings -->\n";
    for $unicode ( sort { hex($a) <=> hex($b) } keys %u2b ) {
        $big5 = $u2b{$unicode};
        $u    = hex($unicode);

        # Skip anything that does not round-trip, and (for TW-BIG5 with
        # $use_range) the user-defined area that is emitted as <range> below.
        next
          unless defined( $b2u{$big5} )
          and $unicode eq $b2u{$big5}
          and
          not( $use_range and !$hkscs_mode and $u >= 0xE000 && $u <= 0xF6B0 );
        printf " <a u=\"%04X\" b=\"%s\"/>\n", $u, _charmapml_bytes($big5);
    }

    print " <!-- Fallback mappings from Unicode to bytes -->\n";
    for $unicode ( sort { hex($a) <=> hex($b) } keys %u2b ) {
        $big5 = $u2b{$unicode};
        next if defined( $b2u{$big5} ) and hex($unicode) == hex( $b2u{$big5} );
        if ( $unicode eq "F900" ) {
            print " <!-- CJK Compatibility Ideographs: U+F900 - U+FA6A.\n";
            print
              " These are included in CP950 (Unicode->Big5 direction only).\n";
            print " Should we include this area in TW-BIG5 or not? -->\n";
        }

        # Same byte formatting as <a>, so a single-byte code is not written
        # as "00 XX".
        printf " <fub u=\"%04X\" b=\"%s\"/>\n", hex($unicode),
          _charmapml_bytes($big5);
    }

    # Collect the byte -> Unicode fallbacks first so they can be printed in
    # Unicode order.  When several codes fall back to the same Unicode value
    # the numerically highest code is the one kept.
    my %fbu;
    print " <!-- Fallback mappings from bytes to Unicode -->\n";
    for $big5 ( sort { hex($a) <=> hex($b) } keys %b2u ) {
        $unicode = $b2u{$big5};
        if ( !defined( $u2b{$unicode} ) or hex($big5) != hex( $u2b{$unicode} ) )
        {
            $fbu{$unicode} = $big5;
        }
    }
    for $unicode ( sort { hex($a) <=> hex($b) } keys %fbu ) {
        $big5 = $fbu{$unicode};
        printf " <fbu u=\"%04X\" b=\"%s\"/>\n", hex($unicode),
          _charmapml_bytes($big5);
    }

    if ( $use_range and !$hkscs_mode ) {
        print <<EOT;
<!-- Roundtrip-mappings that can be enumerated
Note: We can only use the <range> tag for TW-BIG5.
Big-5E and Big5-HKSCS have assigned characters in these areas,
and we will have to use the <a> and <fub> tags instead.
-->
<!-- User-Defined Area 1 (UDA1) -->
<range uFirst="E000" uLast="E310" bFirst="FA 40" bLast="FE FE" bMin="81 40" bMax="FE FE"/>
<!-- User-Defined Area 2 (UDA2) -->
<range uFirst="E311" uLast="EEB7" bFirst="8E 40" bLast="A0 FE" bMin="81 40" bMax="FE FE"/>
<!-- User-Defined Area 3 (UDA3) -->
<range uFirst="EEB8" uLast="F6B0" bFirst="81 40" bLast="8D FE" bMin="81 40" bMax="FE FE"/>
EOT
    }

    print <<EOT;
</assignments>
</characterMapping>
EOT

}    # gen_charmapml()
|
||||
|
||||
sub gen_check_b2u() {

    ###########################################################################
    #
    # Raw Big5 -> Unicode dump for verification and testing.
    #
    # Prints one "U+<unicode>: <big5>" line per %b2u entry, ordered by the
    # numeric value of the Big5 code.  A leading "00" is removed from the
    # printed Big5 code, so single-byte codes show as two hex digits.

    my @codes = sort { hex($a) <=> hex($b) } keys %b2u;
    for my $code (@codes) {
        $unicode = $b2u{$code};
        ( my $shown = $code ) =~ s/^00//;
        print "U+$unicode: $shown\n";
    }
}
|
||||
|
||||
sub gen_check_u2b() {

    # Raw Unicode -> Big5 dump for verification and testing.
    #
    # Prints one "U+<unicode>: <big5>" line per %u2b entry, ordered by the
    # numeric value of the Unicode code point.  A leading "00" is removed
    # from the printed Big5 code.
    for my $ucs ( sort { hex($a) <=> hex($b) } keys %u2b ) {
        ( $big5 = $u2b{$ucs} ) =~ s/^00//;
        print "U+$ucs: $big5\n";
    }

}
|
||||
|
||||
###########################################################################
|
||||
#
|
||||
# Codes for generating hkscs.ut and hkscs.uf files for Mozilla
|
||||
#
|
||||
sub gen_mozilla_uf() {

    # hkscs.uf
    #
    # Prints "0x<BIG5>\t0x<UNICODE>" for each %u2b entry, in string order of
    # the Unicode key, leaving out:
    #   * Big5 codes below 0x8140,
    #   * Big5 codes in 0xA140..0xC6A0 and 0xC940..0xF9D5,
    #   * Unicode values above U+FFFF.
    for my $ucs ( sort keys %u2b ) {
        $big5 = $u2b{$ucs};
        my $code = hex($big5);

        next if $code < 0x8140;
        next if $code >= 0xA140 && $code <= 0xC6A0;
        next if $code >= 0xC940 && $code <= 0xF9D5;
        next if hex($ucs) > 0xFFFF;

        print "0x" . uc($big5) . "\t0x" . uc($ucs) . "\n";
    }
}
|
||||
|
||||
sub gen_mozilla_ut() {

    # hkscs.ut
    #
    # Prints "0x<BIG5>\t0x<UNICODE>" for each %b2u entry, in string order of
    # the Big5 key, leaving out Big5 codes below 0x8140 and those in
    # 0xA140..0xC6A0 or 0xC940..0xF9D5.
    for my $code ( sort keys %b2u ) {
        my $value = hex($code);

        next if $value < 0x8140;
        next if $value >= 0xA140 && $value <= 0xC6A0;
        next if $value >= 0xC940 && $value <= 0xF9D5;

        print "0x" . uc($code) . "\t0x" . uc( $b2u{$code} ) . "\n";
    }
}
|
||||
|
||||
|
||||
###########################################################################
|
||||
|
||||
sub gen_glibc() {

    ##########################################################################
    #
    # Prints the glibc BIG5-HKSCS converter source to STDOUT, built from the
    # globals %b2u (Big5 -> Unicode) and %u2b (Unicode -> Big5):
    #   1. the fixed file header,
    #   2. big5_hkscs_to_ucs[]   (lead bytes 0x88..0xFE, 157 cells each),
    #   3. ucs4_to_big5_hkscs[]  (one cell per code point of every range),
    #   4. from_ucs4_idx[]       (the ranges and their table offsets),
    #   5. the fixed conversion-loop code.
    #
    # Scratch values are kept in globals, as elsewhere in this script.
    #

    ##########################################################################
    #
    # Generate index for UCS4 to Big5-HKSCS conversion table
    #
    # U+0081 .. U+2FFFF is scanned for mapped code points.  Mapped code
    # points closer together than 0x80 share one range; each range is stored
    # as [ first, last, offset of its first cell in ucs4_to_big5_hkscs ].
    #
    @index_array = ();

    $mode  = 0;    # 1 while a range is open
    $count = 0;    # number of table cells allocated so far
    for ( $uni = 0x81 ; $uni <= 0x2FFFF ; $uni++ ) {
        $unicode = sprintf( "%04X", $uni );

        if ( defined( $u2b{$unicode} ) ) {
            if ( $mode == 0 ) {
                $range_start = $range_end = $uni;
                $mode = 1;
            }
            else {
                $range_end = $uni;
            }
        }
        elsif ( $mode == 1 and ( $uni - $range_end ) >= 0x80 ) {

            # Start a new range if the gap is 0x80 or larger
            push @index_array, [ ( $range_start, $range_end, $count ) ];
            $count += $range_end - $range_start + 1;
            $mode = 0;
        }
    }

    # A range that is still open when the scan ends was never closed by a
    # 0x80 gap; record it here so its code points are not left out of the
    # tables.
    if ( $mode == 1 ) {
        push @index_array, [ ( $range_start, $range_end, $count ) ];
        $count += $range_end - $range_start + 1;
        $mode = 0;
    }

    #
    # Note that $count and $range_end are used again as global variables
    # below
    #

    ###########################################################################
    #
    # Start generating real C code...
    #

    print <<'EOT';
/* Mapping tables for Big5-HKSCS handling.
Copyright (C) 1997, 1998, 2000, 2001, 2002 Free Software Foundation, Inc.
This file is part of the GNU C Library.
Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
Modified for Big5-HKSCS by Roger So <roger.so@sw-linux.com>, 2000.
Updated for HKSCS-2001 by James Su <suzhe@turbolinux.com.cn>
and Anthony Fok <anthony@thizlinux.com>, 2002

The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.

The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, write to the Free
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA. */

#include <dlfcn.h>
#include <gconv.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <wchar.h>


/* Table for Big5-HKSCS to UCS conversion.

Original comments by Roger So when he updated the tables for HKSCS-1999:

With HKSCS mappings 0x8140-0xA0FE and 0xFA40-0xFEFE added; more info:
http://www.digital21.gov.hk/eng/hkscs/index.html
- spacehunt 07/01/2000

The BIG5-HKSCS mapping tables are generated from 950.txt, big5-iso.txt
and big5cmp.txt using a Perl script while merging C source code from
other developers. A copy of the source Perl script is available at:

http://www.thizlinux.com/~anthony/hkscs/gen-glibc-big5hkscs.pl
http://people.debian.org/~foka/hkscs/gen-glibc-big5hkscs.pl

Revisions:
2001-10-30 made codec for Qt
2002-03-21 ported to glibc-2.2.5 and added HKSCS-2001

Todo:
Use a hash for characters beyond BMP to save space and make it
more efficient

- Anthony Fok <anthony@thizlinux.com> 21 Mar 2002
On behalf of ThizLinux Laboratory Ltd., Hong Kong SAR, China
*/

EOT

    ##########################################################################
    #
    # Generate Big5-HKSCS to Unicode conversion table
    #
    # One cell per (lead, trail) pair with trail in 0x40..0x7E or 0xA1..0xFE,
    # i.e. 157 cells per lead byte; unmapped cells are written as 0x0000.
    #

    $high_start = 0x88;
    $high_end   = 0xfe;

    print "static const uint16_t big5_hkscs_to_ucs[";
    print( ( $high_end - $high_start + 1 ) * 157 );
    print "] =\n{\n";
    for $high ( $high_start .. $high_end ) {
        for $low ( 0x40 .. 0x7e, 0xa1 .. 0xfe ) {
            if ( $low == 0x40 ) {
                print "\n" unless $high == $high_start;
                printf
                  "\t/* Big5-HKSCS 0x%02X40..0x%02X7E, 0x%02XA1..0x%02XFE */\n",
                  $high, $high, $high, $high;
            }
            elsif ( $low == 0xa1 ) {
                print "\t\t";
            }
            $big5 = sprintf( "%02X%02X", $high, $low );
            print "\t" if $low % 8 == 0;
            if ( defined( $b2u{$big5} ) ) {
                $unicode = $b2u{$big5};
                print "0x", $unicode, ",";
            }
            else {
                print "0x0000,";    # for glibc
            }
            print( ( $low % 8 == 7 or $low == 0x7e or $low == 0xfe )
                ? "\n"
                : "\t" );
        }
    }
    print "};\n\n";

    ##########################################################################
    #
    # Generate Unicode to Big5-HKSCS conversion table
    #
    # Cells are two-byte C string literals.  A code stored as "00XX" is
    # written as the single byte XX followed by \x00; code points inside a
    # range that have no mapping are written as "\x00\x00".
    #
    print "static const unsigned char ucs4_to_big5_hkscs[$count][2] =\n{\n";
    foreach $index (@index_array) {
        ( $start, $end ) = ( @$index[0], @$index[1] );
        printf( " /* U+%04X */\t", $start ) if ( $start % 4 != 0 );
        print "\t" x ( ( $start % 4 ) * 1.5 ) . " " x ( $start % 2 );
        for ( $i = $start ; $i <= $end ; $i++ ) {
            printf( " /* U+%04X */\t", $i ) if ( $i % 4 == 0 );
            $unicode = sprintf( "%04X", $i );
            if ( defined( $big5 = $u2b{$unicode} ) ) {
                if ( $big5 =~ /^00/ ) {
                    print '"\x', substr( $big5, 2, 2 ), '\x00",';
                }
                else {
                    print '"\x', substr( $big5, 0, 2 ), '\x',
                      substr( $big5, 2, 2 ), '",';
                }
            }
            else {
                print '"\x00\x00",';
            }
            print( ( $i % 4 == 3 ) ? "\n" : " " ) unless $i == $end;
        }

        # $range_end still holds the end of the last range found above.
        print $end == $range_end ? "\n" : "\n\n";
    }
    print "};\n\n";

    ###########################################################################

    print <<EOT;
static struct
{
/* Note: We are going to split this table so that we can use
uint16_t for "from" and "to" again. Anthony Fok, 2002-03-21 */
uint32_t from;
uint32_t to;
uint32_t offset;
} from_ucs4_idx[] =
{
EOT
    foreach $index (@index_array) {
        printf " { %7s, %7s, %5d },\n", sprintf( "0x%04X", @$index[0] ),
          sprintf( "0x%04X", @$index[1] ), @$index[2];
    }
    print "};\n\n";

    print <<'EOT';
/* Definitions used in the body of the `gconv' function. */
#define CHARSET_NAME "BIG5HKSCS//"
#define FROM_LOOP from_big5
#define TO_LOOP to_big5
#define DEFINE_INIT 1
#define DEFINE_FINI 1
#define MIN_NEEDED_FROM 1
#define MAX_NEEDED_FROM 2
#define MIN_NEEDED_TO 4


/* First define the conversion function from Big5-HKSCS to UCS4. */
#define MIN_NEEDED_INPUT MIN_NEEDED_FROM
#define MAX_NEEDED_INPUT MAX_NEEDED_FROM
#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
#define LOOPFCT FROM_LOOP
#define BODY \
{ \
uint32_t ch = *inptr; \
\
if (ch >= 0x81 && ch <= 0xfe) \
{ \
/* Two-byte character. First test whether the next character \
is also available. */ \
uint32_t ch2; \
int idx; \
\
if (__builtin_expect (inptr + 1 >= inend, 0)) \
{ \
/* The second character is not available. */ \
result = __GCONV_INCOMPLETE_INPUT; \
break; \
} \
\
ch2 = inptr[1]; \
/* See whether the second byte is in the correct range. */ \
if ((ch2 >= 0x40 && ch2 <= 0x7e) || (ch2 >= 0xa1 && ch2 <= 0xfe)) \
{ \
if (ch >= 0x88) \
{ \
/* Look up the table */ \
idx = (ch - 0x88) * 157 + ch2 - (ch2 <= 0x7e ? 0x40 : 0x62); \
if ((ch = big5_hkscs_to_ucs[idx]) == 0) \
{ \
/* This is illegal. */ \
if (! ignore_errors_p ()) \
{ \
result = __GCONV_ILLEGAL_INPUT; \
break; \
} \
\
++inptr; \
++*irreversible; \
continue; \
} \
} \
else \
{ \
/* 0x81..0x87 in UDA3, currently maps linearly to PUA */ \
ch = (ch - 0x81) * 157 + ch2 - (ch2 <= 0x7e ? 0x40 : 0x62) \
+ 0xeeb8; \
} \
} \
else \
{ \
/* This is illegal. */ \
if (! ignore_errors_p ()) \
{ \
result = __GCONV_ILLEGAL_INPUT; \
break; \
} \
\
++inptr; \
++*irreversible; \
continue; \
} \
\
inptr += 2; \
} \
else if (__builtin_expect (ch, 0) == 0xff) \
{ \
result = __GCONV_ILLEGAL_INPUT; \
break; \
} \
else /* 0x00 to 0x80 */ \
++inptr; \
\
put32 (outptr, ch); \
outptr += 4; \
}
#define LOOP_NEED_FLAGS
#include <iconv/loop.c>


/* Next, define the other direction. */
#define MIN_NEEDED_INPUT MIN_NEEDED_TO
#define MIN_NEEDED_OUTPUT MIN_NEEDED_FROM
#define MAX_NEEDED_OUTPUT MAX_NEEDED_FROM
#define LOOPFCT TO_LOOP
#define BODY \
{ \
uint32_t ch = get32 (inptr); \
const unsigned char *cp = ""; \
unsigned char b5ch[2] = "\0\0"; \
int i; \
\
for (i = 0; \
i < (int) (sizeof (from_ucs4_idx) / sizeof (from_ucs4_idx[0])); \
++i) \
{ \
if (ch < from_ucs4_idx[i].from) \
break; \
if (from_ucs4_idx[i].to >= ch) \
{ \
cp = ucs4_to_big5_hkscs[from_ucs4_idx[i].offset \
+ ch - from_ucs4_idx[i].from]; \
break; \
} \
} \
\
if (ch <= 0x80) \
{ \
b5ch[0] = ch; \
cp = b5ch; \
} \
\
if (cp[0] == '\0' && ch != 0) \
{ \
UNICODE_TAG_HANDLER (ch, 4); \
\
/* Illegal character. */ \
STANDARD_ERR_HANDLER (4); \
} \
else \
{ \
/* See whether there is enough room for the second byte we write. */ \
if (__builtin_expect (cp[1], '\1') != '\0' \
&& __builtin_expect (outptr + 1 >= outend, 0)) \
{ \
/* We have not enough room. */ \
result = __GCONV_FULL_OUTPUT; \
break; \
} \
\
*outptr++ = cp[0]; \
if (cp[1] != '\0') \
*outptr++ = cp[1]; \
} \
\
inptr += 4; \
}
#define LOOP_NEED_FLAGS
#include <iconv/loop.c>


/* Now define the toplevel functions. */
#include <iconv/skeleton.c>
EOT

}
|
File diff suppressed because one or more lines are too long
9282
intl/uconv/tools/mozilla-xscii-hkscs-2001-uf.txt
Normal file
9282
intl/uconv/tools/mozilla-xscii-hkscs-2001-uf.txt
Normal file
File diff suppressed because it is too large
Load Diff
6258
intl/uconv/tools/mozilla-xscii-hkscs-2001-ut.txt
Normal file
6258
intl/uconv/tools/mozilla-xscii-hkscs-2001-ut.txt
Normal file
File diff suppressed because it is too large
Load Diff
6612
intl/uconv/ucvtw/big5.uf
Normal file
6612
intl/uconv/ucvtw/big5.uf
Normal file
File diff suppressed because it is too large
Load Diff
7552
intl/uconv/ucvtw/big5.ut
Normal file
7552
intl/uconv/ucvtw/big5.ut
Normal file
File diff suppressed because it is too large
Load Diff
11142
intl/uconv/ucvtw/hkscs.uf
Normal file
11142
intl/uconv/ucvtw/hkscs.uf
Normal file
File diff suppressed because it is too large
Load Diff
2368
intl/uconv/ucvtw/hkscs.ut
Normal file
2368
intl/uconv/ucvtw/hkscs.ut
Normal file
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -1,18 +0,0 @@
|
||||
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
|
||||
/* This Source Code Form is subject to the terms of the Mozilla Public
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
|
||||
|
||||
#ifndef nsBIG5Data_h_
|
||||
#define nsBIG5Data_h_
|
||||
|
||||
// Static lookup helpers for the Big5 table, indexed by "pointer".
// nsBIG5ToUnicode::Convert computes the pointer as
// (lead - 0x81) * 157 + (trail - offset), offset being 0x40 or 0x62.
// Only declarations are visible here; the table data lives elsewhere.
class nsBIG5Data
|
||||
{
|
||||
public:
|
||||
// Low 16 bits of the code point for aPointer.  Convert treats a zero
// result as "no mapping" (it signals an error or emits U+FFFD).
static char16_t LowBits(size_t aPointer);
|
||||
// True when the code point for aPointer is outside the BMP; Convert then
// forms the code point as LowBits(aPointer) | 0x20000 and emits a
// surrogate pair.
static bool IsAstral(size_t aPointer);
|
||||
// NOTE(review): presumably the reverse lookup used by the encoder (code
// point, given as low bits + astral flag, back to a pointer); no caller is
// visible here, so the not-found return value is unknown - confirm.
static size_t FindPointer(char16_t aLowBits, bool aIsAstral);
|
||||
};
|
||||
|
||||
#endif /* nsBIG5Data_h_ */
|
||||
|
55
intl/uconv/ucvtw/nsBIG5HKSCSToUnicode.cpp
Normal file
55
intl/uconv/ucvtw/nsBIG5HKSCSToUnicode.cpp
Normal file
@ -0,0 +1,55 @@
|
||||
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
|
||||
/* This Source Code Form is subject to the terms of the Mozilla Public
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
|
||||
|
||||
#include "nsBIG5HKSCSToUnicode.h"
|
||||
#include "nsUCvTWDll.h"
|
||||
#include "nsUCConstructors.h"
|
||||
|
||||
//----------------------------------------------------------------------
|
||||
// Global functions and data [declaration]
|
||||
|
||||
// Scan class for each of the six entries of g_BIG5HKSCSRanges, in the same
// order: the first range is single-byte, the other five are two-byte.
static const uScanClassID g_BIG5HKSCSScanClassIDs[] = {
|
||||
u1ByteCharset,
|
||||
u2BytesCharset,
|
||||
u2BytesCharset,
|
||||
u2BytesCharset,
|
||||
u2BytesCharset,
|
||||
u2BytesCharset
|
||||
};
|
||||
|
||||
// Mapping table for each of the six entries of g_BIG5HKSCSRanges, in the
// same order: ASCII for the single-byte range, then the HKSCS and plain
// Big5 tables alternating (HKSCS, Big5, HKSCS, Big5, HKSCS).
static const uint16_t *g_BIG5HKSCSMappingTableSet [] ={
|
||||
g_ASCIIMappingTable,
|
||||
g_utBig5HKSCSMapping,
|
||||
g_utBIG5Mapping,
|
||||
g_utBig5HKSCSMapping,
|
||||
g_utBIG5Mapping,
|
||||
g_utBig5HKSCSMapping,
|
||||
};
|
||||
|
||||
// Six inclusive byte ranges, one per decoder table.
// NOTE(review): presumably matched against the first (lead) byte - confirm
// against CreateMultiTableDecoder.
// Bytes 0xC6 and 0xF9 each appear in two adjacent ranges (a Big5 one and an
// HKSCS one).
// NOTE(review): which table wins for those two bytes depends on how the
// decoder resolves overlapping ranges, which is not visible here - confirm.
static const uRange g_BIG5HKSCSRanges[] = {
|
||||
{ 0x00, 0x7F },
|
||||
{ 0x81, 0xA0 },
|
||||
{ 0xA1, 0xC6 },
|
||||
{ 0xC6, 0xC8 },
|
||||
{ 0xC9, 0xF9 },
|
||||
{ 0xF9, 0xFE }
|
||||
};
|
||||
|
||||
//----------------------------------------------------------------------
|
||||
// Class nsBIG5HKSCSToUnicode [implementation]
|
||||
|
||||
// Factory for the BIG5-HKSCS to Unicode decoder.
// Forwards aOuter, aIID and aResult to CreateMultiTableDecoder together with
// the three six-entry tables defined above and returns its result.
nsresult
|
||||
nsBIG5HKSCSToUnicodeConstructor(nsISupports *aOuter, REFNSIID aIID,
|
||||
void **aResult)
|
||||
{
|
||||
// 6 is the number of entries in each of the three tables passed below.
return CreateMultiTableDecoder(6,
|
||||
(const uRange* ) &g_BIG5HKSCSRanges,
|
||||
(uScanClassID*) &g_BIG5HKSCSScanClassIDs,
|
||||
(uMappingTable**) &g_BIG5HKSCSMappingTableSet,
|
||||
// NOTE(review): meaning of this argument is not visible here; the plain
// BIG5 decoder passes the same value - confirm against the declaration.
1,
|
||||
aOuter, aIID, aResult);
|
||||
}
|
||||
|
||||
|
21
intl/uconv/ucvtw/nsBIG5HKSCSToUnicode.h
Normal file
21
intl/uconv/ucvtw/nsBIG5HKSCSToUnicode.h
Normal file
@ -0,0 +1,21 @@
|
||||
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
|
||||
/* This Source Code Form is subject to the terms of the Mozilla Public
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
|
||||
|
||||
#ifndef nsBIG5HKSCSToUnicode_h___
|
||||
#define nsBIG5HKSCSToUnicode_h___
|
||||
|
||||
#include "nsISupports.h"
|
||||
|
||||
/**
|
||||
* A character set converter from BIG5-HKSCS to Unicode.
|
||||
*
|
||||
* @created 02/Jul/2000
|
||||
* @author Gavin Ho, Hong Kong Professional Services, Compaq Computer (Hong Kong) Ltd.
|
||||
*/
|
||||
nsresult
|
||||
nsBIG5HKSCSToUnicodeConstructor(nsISupports *aOuter, REFNSIID aIID,
|
||||
void **aResult);
|
||||
|
||||
#endif /* nsBIG5HKSCSToUnicode_h___ */
|
@ -4,162 +4,36 @@
|
||||
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
|
||||
|
||||
#include "nsBIG5ToUnicode.h"
|
||||
#include "mozilla/BinarySearch.h"
|
||||
#include "mozilla/ArrayUtils.h"
|
||||
#include "nsBIG5Data.h"
|
||||
#include "nsUCvTWDll.h"
|
||||
#include "nsUCConstructors.h"
|
||||
|
||||
nsBIG5ToUnicode::nsBIG5ToUnicode()
|
||||
: mPendingTrail(0)
|
||||
, mBig5Lead(0)
|
||||
//----------------------------------------------------------------------
|
||||
// Global functions and data [declaration]
|
||||
|
||||
static const uScanClassID g_BIG5ScanClassIDs[] = {
|
||||
u1ByteCharset,
|
||||
u2BytesCharset
|
||||
};
|
||||
|
||||
static const uint16_t *g_BIG5MappingTableSet [] ={
|
||||
g_ASCIIMappingTable,
|
||||
g_utBIG5Mapping
|
||||
};
|
||||
|
||||
static const uRange g_BIG5Ranges[] = {
|
||||
{ 0x00, 0x7F },
|
||||
{ 0x81, 0xFE }
|
||||
};
|
||||
|
||||
nsresult
|
||||
nsBIG5ToUnicodeConstructor(nsISupports *aOuter, REFNSIID aIID,
|
||||
void **aResult)
|
||||
{
|
||||
return CreateMultiTableDecoder(2,
|
||||
(const uRange* ) &g_BIG5Ranges,
|
||||
(uScanClassID*) &g_BIG5ScanClassIDs,
|
||||
(uMappingTable**) &g_BIG5MappingTableSet, 1,
|
||||
aOuter, aIID, aResult);
|
||||
}
|
||||
|
||||
NS_IMETHODIMP
|
||||
nsBIG5ToUnicode::Convert(const char* aSrc,
|
||||
int32_t* aSrcLength,
|
||||
char16_t* aDest,
|
||||
int32_t* aDestLength)
|
||||
{
|
||||
// We'll be doing comparisons as unsigned.
|
||||
const uint8_t* in = reinterpret_cast<const uint8_t*>(aSrc);
|
||||
const uint8_t* inEnd = in + *aSrcLength;
|
||||
char16_t* out = aDest;
|
||||
char16_t* outEnd = out + *aDestLength;
|
||||
|
||||
if (mPendingTrail) {
|
||||
if (out == outEnd) {
|
||||
*aSrcLength = 0;
|
||||
*aDestLength = 0;
|
||||
return NS_OK_UDEC_MOREOUTPUT;
|
||||
}
|
||||
*out++ = mPendingTrail;
|
||||
mPendingTrail = 0;
|
||||
}
|
||||
for (;;) {
|
||||
if (in == inEnd) {
|
||||
*aSrcLength = in - reinterpret_cast<const uint8_t*>(aSrc);
|
||||
*aDestLength = out - aDest;
|
||||
return mBig5Lead ? NS_OK_UDEC_MOREINPUT : NS_OK;
|
||||
}
|
||||
if (out == outEnd) {
|
||||
*aSrcLength = in - reinterpret_cast<const uint8_t*>(aSrc);
|
||||
*aDestLength = out - aDest;
|
||||
return NS_OK_UDEC_MOREOUTPUT;
|
||||
}
|
||||
uint8_t b = *in++;
|
||||
if (!mBig5Lead) {
|
||||
if (b <= 0x7F) {
|
||||
*out++ = (char16_t)b;
|
||||
continue;
|
||||
}
|
||||
if (b >= 0x81 && b <= 0xFE) {
|
||||
mBig5Lead = b;
|
||||
continue;
|
||||
}
|
||||
if (mErrBehavior == kOnError_Signal) {
|
||||
--in;
|
||||
*aSrcLength = in - reinterpret_cast<const uint8_t*>(aSrc);
|
||||
*aDestLength = out - aDest;
|
||||
return NS_ERROR_ILLEGAL_INPUT;
|
||||
}
|
||||
*out++ = 0xFFFD;
|
||||
continue;
|
||||
}
|
||||
size_t lead = mBig5Lead;
|
||||
mBig5Lead = 0;
|
||||
size_t offset = (b < 0x7F) ? 0x40 : 0x62;
|
||||
if ((b >= 0x40 && b <= 0x7E) || (b >= 0xA1 && b <= 0xFE)) {
|
||||
size_t pointer = (lead - 0x81) * 157L + (b - offset);
|
||||
char16_t outTrail;
|
||||
switch (pointer) {
|
||||
case 1133:
|
||||
*out++ = 0x00CA;
|
||||
outTrail = 0x0304;
|
||||
break;
|
||||
case 1135:
|
||||
*out++ = 0x00CA;
|
||||
outTrail = 0x030C;
|
||||
break;
|
||||
case 1164:
|
||||
*out++ = 0x00EA;
|
||||
outTrail = 0x0304;
|
||||
break;
|
||||
case 1166:
|
||||
*out++ = 0x00EA;
|
||||
outTrail = 0x030C;
|
||||
break;
|
||||
default:
|
||||
char16_t lowBits = nsBIG5Data::LowBits(pointer);
|
||||
if (!lowBits) {
|
||||
if (b <= 0x7F) {
|
||||
// prepend byte to stream
|
||||
// Always legal, since we've always just read a byte
|
||||
// if we come here.
|
||||
--in;
|
||||
}
|
||||
if (mErrBehavior == kOnError_Signal) {
|
||||
--in;
|
||||
*aSrcLength = in - reinterpret_cast<const uint8_t*>(aSrc);
|
||||
*aDestLength = out - aDest;
|
||||
return NS_ERROR_ILLEGAL_INPUT;
|
||||
}
|
||||
*out++ = 0xFFFD;
|
||||
continue;
|
||||
}
|
||||
if (nsBIG5Data::IsAstral(pointer)) {
|
||||
uint32_t codePoint = uint32_t(lowBits) | 0x20000;
|
||||
*out++ = char16_t(0xD7C0 + (codePoint >> 10));
|
||||
outTrail = char16_t(0xDC00 + (codePoint & 0x3FF));
|
||||
break;
|
||||
}
|
||||
*out++ = lowBits;
|
||||
continue;
|
||||
}
|
||||
if (out == outEnd) {
|
||||
mPendingTrail = outTrail;
|
||||
*aSrcLength = in - reinterpret_cast<const uint8_t*>(aSrc);
|
||||
*aDestLength = out - aDest;
|
||||
return NS_OK_UDEC_MOREOUTPUT;
|
||||
}
|
||||
*out++ = outTrail;
|
||||
continue;
|
||||
}
|
||||
// pointer is null
|
||||
if (b <= 0x7F) {
|
||||
// prepend byte to stream
|
||||
// Always legal, since we've always just read a byte
|
||||
// if we come here.
|
||||
--in;
|
||||
}
|
||||
if (mErrBehavior == kOnError_Signal) {
|
||||
// Moving in one past the start of aSrc is actually OK per API contract,
|
||||
// since assigning -1 to aSrcLength means that we want the caller to
|
||||
// record one U+FFFD and repush the same input buffer.
|
||||
--in;
|
||||
*aSrcLength = in - reinterpret_cast<const uint8_t*>(aSrc);
|
||||
*aDestLength = out - aDest;
|
||||
return NS_ERROR_ILLEGAL_INPUT;
|
||||
}
|
||||
*out++ = 0xFFFD;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
NS_IMETHODIMP
|
||||
nsBIG5ToUnicode::GetMaxLength(const char* aSrc,
|
||||
int32_t aSrcLength,
|
||||
int32_t* aDestLength)
|
||||
{
|
||||
// The length of the output in UTF-16 code units never exceeds the length
|
||||
// of the input in bytes.
|
||||
*aDestLength = aSrcLength + (mPendingTrail ? 1 : 0) + (mBig5Lead ? 1 : 0);
|
||||
return NS_OK;
|
||||
}
|
||||
|
||||
NS_IMETHODIMP
|
||||
nsBIG5ToUnicode::Reset()
|
||||
{
|
||||
mPendingTrail = 0;
|
||||
mBig5Lead = 0;
|
||||
return NS_OK;
|
||||
}
|
||||
|
@ -6,34 +6,16 @@
|
||||
#ifndef nsBIG5ToUnicode_h___
|
||||
#define nsBIG5ToUnicode_h___
|
||||
|
||||
#include "nsUCSupport.h"
|
||||
#include "nsISupports.h"
|
||||
|
||||
#define NS_BIG5TOUNICODE_CID \
|
||||
{ 0xefc323e1, 0xec62, 0x11d2, \
|
||||
{ 0x8a, 0xac, 0x0, 0x60, 0x8, 0x11, 0xa8, 0x36 } }
|
||||
|
||||
#define NS_BIG5TOUNICODE_CONTRACTID \
|
||||
"@mozilla.org/intl/unicode/decoder;1?charset=big5"
|
||||
|
||||
class nsBIG5ToUnicode : public nsBasicDecoderSupport
|
||||
{
|
||||
public:
|
||||
nsBIG5ToUnicode();
|
||||
|
||||
NS_IMETHOD Convert(const char* aSrc,
|
||||
int32_t* aSrcLength,
|
||||
char16_t* aDest,
|
||||
int32_t* aDestLength);
|
||||
|
||||
NS_IMETHOD GetMaxLength(const char* aSrc,
|
||||
int32_t aSrcLength,
|
||||
int32_t* aDestLength);
|
||||
|
||||
NS_IMETHOD Reset();
|
||||
|
||||
private:
|
||||
char16_t mPendingTrail;
|
||||
uint8_t mBig5Lead;
|
||||
};
|
||||
/**
|
||||
* A character set converter from BIG5 to Unicode.
|
||||
*
|
||||
* @created 06/Apr/1999
|
||||
* @author Catalin Rotaru [CATA]
|
||||
*/
|
||||
nsresult
|
||||
nsBIG5ToUnicodeConstructor(nsISupports *aOuter, REFNSIID aIID,
|
||||
void **aResult);
|
||||
|
||||
#endif /* nsBIG5ToUnicode_h___ */
|
||||
|
31
intl/uconv/ucvtw/nsUCvTWCID.h
Normal file
31
intl/uconv/ucvtw/nsUCvTWCID.h
Normal file
@ -0,0 +1,31 @@
|
||||
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
|
||||
/* This Source Code Form is subject to the terms of the Mozilla Public
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
|
||||
|
||||
#ifndef nsUCvTWCID_h___
|
||||
#define nsUCvTWCID_h___
|
||||
|
||||
#include "nsISupports.h"
|
||||
|
||||
// Class ID for our BIG5ToUnicode charset converter
|
||||
// {EFC323E1-EC62-11d2-8AAC-00600811A836}
|
||||
#define NS_BIG5TOUNICODE_CID \
|
||||
{ 0xefc323e1, 0xec62, 0x11d2, {0x8a, 0xac, 0x0, 0x60, 0x8, 0x11, 0xa8, 0x36}}
|
||||
|
||||
// Class ID for our UnicodeToBIG5 charset converter
|
||||
// {EFC323E2-EC62-11d2-8AAC-00600811A836}
|
||||
#define NS_UNICODETOBIG5_CID \
|
||||
{ 0xefc323e2, 0xec62, 0x11d2, {0x8a, 0xac, 0x0, 0x60, 0x8, 0x11, 0xa8, 0x36}}
|
||||
|
||||
// Class ID for our BIG5HKSCSToUnicode charset converter
|
||||
// {BA6151BB-EC62-11d2-8AAC-00600811A836}
|
||||
#define NS_BIG5HKSCSTOUNICODE_CID \
|
||||
{ 0xba6151bb, 0xec62, 0x11d2, {0x8a, 0xac, 0x0, 0x60, 0x8, 0x11, 0xa8, 0x36}}
|
||||
|
||||
// Class ID for our UnicodeToBIG5HKSCS charset converter
|
||||
// {BA6151BC-EC62-11d2-8AAC-00600811A836}
|
||||
#define NS_UNICODETOBIG5HKSCS_CID \
|
||||
{ 0xba6151bc, 0xec62, 0x11d2, {0x8a, 0xac, 0x0, 0x60, 0x8, 0x11, 0xa8, 0x36}}
|
||||
|
||||
#endif /* nsUCvTWCID_h___ */
|
15
intl/uconv/ucvtw/nsUCvTWDll.h
Normal file
15
intl/uconv/ucvtw/nsUCvTWDll.h
Normal file
@ -0,0 +1,15 @@
|
||||
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
|
||||
/* This Source Code Form is subject to the terms of the Mozilla Public
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
|
||||
|
||||
#ifndef nsUCvTWDll_h_
|
||||
#define nsUCvTWDll_h_
|
||||
|
||||
extern const uint16_t g_ufBig5Mapping[];
|
||||
extern const uint16_t g_utBIG5Mapping[];
|
||||
extern const uint16_t g_ASCIIMappingTable[];
|
||||
extern const uint16_t g_ufBig5HKSCSMapping[];
|
||||
extern const uint16_t g_utBig5HKSCSMapping[];
|
||||
|
||||
#endif /* nsUCvTWDll_h_ */
|
@ -4,248 +4,35 @@
|
||||
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
|
||||
|
||||
#include "nsUnicodeToBIG5.h"
|
||||
#include "nsUCvTWDll.h"
|
||||
#include "nsUCConstructors.h"
|
||||
|
||||
NS_IMPL_ADDREF(nsUnicodeToBIG5)
|
||||
NS_IMPL_RELEASE(nsUnicodeToBIG5)
|
||||
NS_IMPL_QUERY_INTERFACE(nsUnicodeToBIG5,
|
||||
nsIUnicodeEncoder)
|
||||
//----------------------------------------------------------------------
|
||||
// Global functions and data [declaration]
|
||||
|
||||
nsUnicodeToBIG5::nsUnicodeToBIG5()
|
||||
: mUtf16Lead(0)
|
||||
, mPendingTrail(0)
|
||||
, mSignal(true) // as in nsEncoderSupport
|
||||
|
||||
static const uint16_t *g_Big5MappingTable[2] = {
|
||||
g_ASCIIMappingTable,
|
||||
g_ufBig5Mapping
|
||||
};
|
||||
|
||||
static const uScanClassID g_Big5ScanClassIDs[2] = {
|
||||
u1ByteCharset,
|
||||
u2BytesCharset
|
||||
};
|
||||
|
||||
//----------------------------------------------------------------------
|
||||
// Class nsUnicodeToBIG5 [implementation]
|
||||
|
||||
nsresult
|
||||
nsUnicodeToBIG5Constructor(nsISupports *aOuter, REFNSIID aIID,
|
||||
void **aResult)
|
||||
{
|
||||
|
||||
return CreateMultiTableEncoder(2,
|
||||
(uScanClassID*) &g_Big5ScanClassIDs,
|
||||
(uMappingTable**) &g_Big5MappingTable,
|
||||
2 /* max length = src * 2 */,
|
||||
aOuter, aIID, aResult);
|
||||
}
|
||||
|
||||
NS_IMETHODIMP
|
||||
nsUnicodeToBIG5::Convert(const char16_t* aSrc,
|
||||
int32_t* aSrcLength,
|
||||
char* aDest,
|
||||
int32_t * aDestLength)
|
||||
{
|
||||
const char16_t* in = aSrc;
|
||||
const char16_t* inEnd = in + *aSrcLength;
|
||||
uint8_t* out = reinterpret_cast<uint8_t*>(aDest);
|
||||
uint8_t* outEnd = out + *aDestLength;
|
||||
|
||||
MOZ_ASSERT(!(mPendingTrail && mUtf16Lead),
|
||||
"Can't have both pending output and pending input.");
|
||||
|
||||
if (mPendingTrail) {
|
||||
if (out == outEnd) {
|
||||
*aSrcLength = 0;
|
||||
*aDestLength = 0;
|
||||
return NS_OK_UENC_MOREOUTPUT;
|
||||
}
|
||||
*out++ = mPendingTrail;
|
||||
mPendingTrail = 0;
|
||||
}
|
||||
for (;;) {
|
||||
if (in == inEnd) {
|
||||
*aSrcLength = in - aSrc;
|
||||
*aDestLength = out - reinterpret_cast<uint8_t*>(aDest);
|
||||
return NS_OK_UENC_MOREINPUT;
|
||||
}
|
||||
if (out == outEnd) {
|
||||
*aSrcLength = in - aSrc;
|
||||
*aDestLength = out - reinterpret_cast<uint8_t*>(aDest);
|
||||
return NS_OK_UENC_MOREOUTPUT;
|
||||
}
|
||||
bool isAstral; // true means Plane 2, false means BMP
|
||||
char16_t lowBits; // The low 16 bits of the code point
|
||||
char16_t codeUnit = *in++;
|
||||
size_t highBits = (codeUnit & 0xFC00);
|
||||
if (highBits == 0xD800) {
|
||||
// high surrogate
|
||||
if (mUtf16Lead) {
|
||||
// High surrogate follows another high surrogate. The
|
||||
// *previous* code unit is in error.
|
||||
if (mSignal) {
|
||||
mUtf16Lead = 0;
|
||||
// NOTE: Encode API differs from decode API!
|
||||
--in;
|
||||
*aSrcLength = in - aSrc;
|
||||
*aDestLength = out - reinterpret_cast<uint8_t*>(aDest);
|
||||
return NS_ERROR_UENC_NOMAPPING;
|
||||
}
|
||||
*out++ = '?';
|
||||
}
|
||||
mUtf16Lead = codeUnit;
|
||||
continue;
|
||||
}
|
||||
if (highBits == 0xDC00) {
|
||||
// low surrogate
|
||||
if (!mUtf16Lead) {
|
||||
// Got low surrogate without a previous high surrogate
|
||||
if (mSignal) {
|
||||
// NOTE: Encode API differs from decode API!
|
||||
*aSrcLength = in - aSrc;
|
||||
*aDestLength = out - reinterpret_cast<uint8_t*>(aDest);
|
||||
return NS_ERROR_UENC_NOMAPPING;
|
||||
}
|
||||
*out++ = '?';
|
||||
continue;
|
||||
}
|
||||
size_t codePoint = (mUtf16Lead << 10) + codeUnit -
|
||||
(((0xD800 << 10) - 0x10000) + 0xDC00);
|
||||
mUtf16Lead = 0;
|
||||
// Plane 2 is the only astral plane that has potentially
|
||||
// Big5-encodable characters.
|
||||
if ((0xFF0000 & codePoint) != 0x20000) {
|
||||
if (mSignal) {
|
||||
// NOTE: Encode API differs from decode API!
|
||||
// nsSaveAsCharset wants us to back up one step in the case of a
|
||||
// surrogate pair.
|
||||
--in;
|
||||
*aSrcLength = in - aSrc;
|
||||
*aDestLength = out - reinterpret_cast<uint8_t*>(aDest);
|
||||
return NS_ERROR_UENC_NOMAPPING;
|
||||
}
|
||||
*out++ = '?';
|
||||
continue;
|
||||
}
|
||||
isAstral = true;
|
||||
lowBits = (char16_t)(codePoint & 0xFFFF);
|
||||
} else {
|
||||
// not a surrogate
|
||||
if (mUtf16Lead) {
|
||||
// Non-surrogate follows a high surrogate. The *previous*
|
||||
// code unit is in error.
|
||||
mUtf16Lead = 0;
|
||||
if (mSignal) {
|
||||
// NOTE: Encode API differs from decode API!
|
||||
--in;
|
||||
*aSrcLength = in - aSrc;
|
||||
*aDestLength = out - reinterpret_cast<uint8_t*>(aDest);
|
||||
return NS_ERROR_UENC_NOMAPPING;
|
||||
}
|
||||
*out++ = '?';
|
||||
// Let's unconsume this code unit and reloop in order to
|
||||
// re-check if the output buffer still has space.
|
||||
--in;
|
||||
continue;
|
||||
}
|
||||
isAstral = false;
|
||||
lowBits = codeUnit;
|
||||
}
|
||||
// isAstral now tells us if we have a Plane 2 or a BMP character.
|
||||
// lowBits tells us the low 16 bits.
|
||||
// After all the above setup to deal with UTF-16, we are now
|
||||
// finally ready to follow the spec.
|
||||
if (!isAstral && lowBits <= 0x7F) {
|
||||
*out++ = (uint8_t)lowBits;
|
||||
continue;
|
||||
}
|
||||
size_t pointer = nsBIG5Data::FindPointer(lowBits, isAstral);
|
||||
if (!pointer) {
|
||||
if (mSignal) {
|
||||
// NOTE: Encode API differs from decode API!
|
||||
if (isAstral) {
|
||||
// nsSaveAsCharset wants us to back up one step in the case of a
|
||||
// surrogate pair.
|
||||
--in;
|
||||
}
|
||||
*aSrcLength = in - aSrc;
|
||||
*aDestLength = out - reinterpret_cast<uint8_t*>(aDest);
|
||||
return NS_ERROR_UENC_NOMAPPING;
|
||||
}
|
||||
*out++ = '?';
|
||||
continue;
|
||||
}
|
||||
uint8_t lead = (uint8_t)(pointer / 157 + 0x81);
|
||||
uint8_t trail = (uint8_t)(pointer % 157);
|
||||
if (trail < 0x3F) {
|
||||
trail += 0x40;
|
||||
} else {
|
||||
trail += 0x62;
|
||||
}
|
||||
*out++ = lead;
|
||||
if (out == outEnd) {
|
||||
mPendingTrail = trail;
|
||||
*aSrcLength = in - aSrc;
|
||||
*aDestLength = out - reinterpret_cast<uint8_t*>(aDest);
|
||||
return NS_OK_UENC_MOREOUTPUT;
|
||||
}
|
||||
*out++ = trail;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
NS_IMETHODIMP
|
||||
nsUnicodeToBIG5::Finish(char* aDest,
|
||||
int32_t* aDestLength)
|
||||
{
|
||||
MOZ_ASSERT(!(mPendingTrail && mUtf16Lead),
|
||||
"Can't have both pending output and pending input.");
|
||||
uint8_t* out = reinterpret_cast<uint8_t*>(aDest);
|
||||
if (mPendingTrail) {
|
||||
if (*aDestLength < 1) {
|
||||
*aDestLength = 0;
|
||||
return NS_OK_UENC_MOREOUTPUT;
|
||||
}
|
||||
*out = mPendingTrail;
|
||||
mPendingTrail = 0;
|
||||
*aDestLength = 1;
|
||||
return NS_OK;
|
||||
}
|
||||
if (mUtf16Lead) {
|
||||
if (*aDestLength < 1) {
|
||||
*aDestLength = 0;
|
||||
return NS_OK_UENC_MOREOUTPUT;
|
||||
}
|
||||
// The API doesn't support signaling an error. It pretends that malformed
|
||||
// input doesn't exist. The UTF-8 encoder outputs the replacement character
|
||||
// unconditionally.
|
||||
mUtf16Lead = 0;
|
||||
*out = '?';
|
||||
*aDestLength = 1;
|
||||
return NS_OK;
|
||||
}
|
||||
*aDestLength = 0;
|
||||
return NS_OK;
|
||||
}
|
||||
|
||||
NS_IMETHODIMP
|
||||
nsUnicodeToBIG5::GetMaxLength(const char16_t* aSrc,
|
||||
int32_t aSrcLength,
|
||||
int32_t* aDestLength)
|
||||
{
|
||||
*aDestLength = (aSrcLength * 2) +
|
||||
(mPendingTrail ? 1 : 0) +
|
||||
// If the lead ends up being paired, the bytes produced
|
||||
// are already included above.
|
||||
// If not, it produces a single '?'.
|
||||
(mUtf16Lead ? 1 : 0);
|
||||
return NS_OK;
|
||||
}
|
||||
|
||||
NS_IMETHODIMP
|
||||
nsUnicodeToBIG5::Reset()
|
||||
{
|
||||
mUtf16Lead = 0;
|
||||
mPendingTrail = 0;
|
||||
return NS_OK;
|
||||
}
|
||||
|
||||
NS_IMETHODIMP
|
||||
nsUnicodeToBIG5::SetOutputErrorBehavior(int32_t aBehavior,
|
||||
nsIUnicharEncoder* aEncoder,
|
||||
char16_t aChar)
|
||||
{
|
||||
switch (aBehavior) {
|
||||
case kOnError_Signal:
|
||||
mSignal = true;
|
||||
break;
|
||||
case kOnError_Replace:
|
||||
mSignal = false;
|
||||
MOZ_ASSERT(aChar == '?', "Unsupported replacement.");
|
||||
break;
|
||||
case kOnError_CallBack:
|
||||
MOZ_ASSERT_UNREACHABLE("kOnError_CallBack is supposed to be unused.");
|
||||
break;
|
||||
default:
|
||||
MOZ_ASSERT_UNREACHABLE("Non-existent enum item.");
|
||||
break;
|
||||
}
|
||||
return NS_OK;
|
||||
}
|
||||
|
@ -3,48 +3,19 @@
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
|
||||
|
||||
#ifndef nsUnicodeToBIG5_h_
|
||||
#define nsUnicodeToBIG5_h_
|
||||
#ifndef nsUnicodeToBIG5_h___
|
||||
#define nsUnicodeToBIG5_h___
|
||||
|
||||
#include "nsIUnicodeEncoder.h"
|
||||
#include "nsISupports.h"
|
||||
|
||||
#define NS_UNICODETOBIG5_CID \
|
||||
{ 0xefc323e2, 0xec62, 0x11d2, \
|
||||
{ 0x8a, 0xac, 0x0, 0x60, 0x8, 0x11, 0xa8, 0x36 } }
|
||||
/**
|
||||
* A character set converter from Unicode to BIG5.
|
||||
*
|
||||
* @created 06/Apr/1999
|
||||
* @author Catalin Rotaru [CATA]
|
||||
*/
|
||||
nsresult
|
||||
nsUnicodeToBIG5Constructor(nsISupports *aOuter, REFNSIID aIID,
|
||||
void **aResult);
|
||||
|
||||
class nsUnicodeToBIG5 : public nsIUnicodeEncoder
|
||||
{
|
||||
public:
|
||||
// Encoders probably shouldn't use the thread-safe variant, but we should
|
||||
// make a systematic change instead of making this class different.
|
||||
NS_DECL_THREADSAFE_ISUPPORTS
|
||||
|
||||
nsUnicodeToBIG5();
|
||||
|
||||
NS_IMETHOD Convert(const char16_t* aSrc,
|
||||
int32_t* aSrcLength,
|
||||
char* aDest,
|
||||
int32_t * aDestLength);
|
||||
|
||||
NS_IMETHOD Finish(char* aDest,
|
||||
int32_t* aDestLength);
|
||||
|
||||
MOZ_WARN_UNUSED_RESULT NS_IMETHOD GetMaxLength(const char16_t* aSrc,
|
||||
int32_t aSrcLength,
|
||||
int32_t* aDestLength);
|
||||
|
||||
NS_IMETHOD Reset();
|
||||
|
||||
NS_IMETHOD SetOutputErrorBehavior(int32_t aBehavior,
|
||||
nsIUnicharEncoder* aEncoder,
|
||||
char16_t aChar);
|
||||
|
||||
private:
|
||||
virtual ~nsUnicodeToBIG5(){};
|
||||
|
||||
char16_t mUtf16Lead;
|
||||
uint8_t mPendingTrail;
|
||||
bool mSignal;
|
||||
};
|
||||
|
||||
#endif /* nsUnicodeToBIG5_h_ */
|
||||
#endif /* nsUnicodeToBIG5_h___ */
|
||||
|
36
intl/uconv/ucvtw/nsUnicodeToBIG5HKSCS.cpp
Normal file
36
intl/uconv/ucvtw/nsUnicodeToBIG5HKSCS.cpp
Normal file
@ -0,0 +1,36 @@
|
||||
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
|
||||
/* This Source Code Form is subject to the terms of the Mozilla Public
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
|
||||
|
||||
#include "nsUnicodeToBIG5HKSCS.h"
|
||||
#include "nsUCvTWDll.h"
|
||||
#include "nsUCConstructors.h"
|
||||
|
||||
//----------------------------------------------------------------------
|
||||
// Global functions and data [declaration]
|
||||
|
||||
nsresult
|
||||
nsUnicodeToBIG5HKSCSConstructor(nsISupports *aOuter, REFNSIID aIID,
|
||||
void **aResult)
|
||||
{
|
||||
static const uint16_t *g_Big5HKSCSMappingTable[] = {
|
||||
g_ASCIIMappingTable,
|
||||
g_ufBig5Mapping,
|
||||
g_ufBig5HKSCSMapping
|
||||
};
|
||||
|
||||
static const uScanClassID g_Big5HKSCSScanClassIDs[] = {
|
||||
u1ByteCharset,
|
||||
u2BytesCharset,
|
||||
u2BytesCharset
|
||||
};
|
||||
|
||||
return CreateMultiTableEncoder(3,
|
||||
(uScanClassID*) &g_Big5HKSCSScanClassIDs,
|
||||
(uMappingTable**) &g_Big5HKSCSMappingTable,
|
||||
2 /* max length = src * 2 */,
|
||||
aOuter, aIID, aResult);
|
||||
}
|
||||
|
||||
|
21
intl/uconv/ucvtw/nsUnicodeToBIG5HKSCS.h
Normal file
21
intl/uconv/ucvtw/nsUnicodeToBIG5HKSCS.h
Normal file
@ -0,0 +1,21 @@
|
||||
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
|
||||
/* This Source Code Form is subject to the terms of the Mozilla Public
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
|
||||
|
||||
#ifndef nsUnicodeToBIG5HKSCS_h___
|
||||
#define nsUnicodeToBIG5HKSCS_h___
|
||||
|
||||
#include "nsISupports.h"
|
||||
|
||||
/**
|
||||
* A character set converter from Unicode to BIG5-HKSCS.
|
||||
*
|
||||
* @created 02/Jul/2000
|
||||
* @author Gavin Ho, Hong Kong Professional Services, Compaq Computer (Hong Kong) Ltd.
|
||||
*/
|
||||
nsresult
|
||||
nsUnicodeToBIG5HKSCSConstructor(nsISupports *aOuter, REFNSIID aIID,
|
||||
void **aResult);
|
||||
|
||||
#endif /* nsUnicodeToBIG5HKSCS_h___ */
|
1
intl/uconv/ucvtw/readme.txt
Normal file
1
intl/uconv/ucvtw/readme.txt
Normal file
@ -0,0 +1 @@
|
||||
We should put Big5 converter into this directory/dll
|
@ -29051,16 +29051,7 @@
|
||||
},
|
||||
"local_changes": {
|
||||
"deleted": [],
|
||||
"items": {
|
||||
"testharness": {
|
||||
"encoding/big5-encoder.html": [
|
||||
{
|
||||
"path": "encoding/big5-encoder.html",
|
||||
"url": "/encoding/big5-encoder.html"
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"items": {},
|
||||
"reftest_nodes": {}
|
||||
},
|
||||
"reftest_nodes": {
|
||||
|
@ -444,6 +444,12 @@
|
||||
[Name "hz-gb-2312" has label "hz-gb-2312" (inputEncoding)]
|
||||
expected: FAIL
|
||||
|
||||
[Name "big5" has label "big5-hkscs" (characterSet)]
|
||||
expected: FAIL
|
||||
|
||||
[Name "big5" has label "big5-hkscs" (inputEncoding)]
|
||||
expected: FAIL
|
||||
|
||||
[Name "replacement" has label "csiso2022kr" (characterSet)]
|
||||
expected: FAIL
|
||||
|
||||
|
@ -1,14 +0,0 @@
|
||||
[big5-encoder.html]
|
||||
type: testharness
|
||||
[big5 encoder: Highest-pointer BMP character excluded from encoder]
|
||||
expected: FAIL
|
||||
|
||||
[big5 encoder: Highest-pointer character excluded from encoder]
|
||||
expected: FAIL
|
||||
|
||||
[big5 encoder: The canonical BMP test character that is not in the index]
|
||||
expected: FAIL
|
||||
|
||||
[big5 encoder: The canonical astral test character that is not in the index]
|
||||
expected: FAIL
|
||||
|
@ -0,0 +1,53 @@
|
||||
[textdecoder-labels.html]
|
||||
type: testharness
|
||||
[name=big5 label=big5-hkscs]
|
||||
expected: FAIL
|
||||
|
||||
["big5-hkscs" => "big5"]
|
||||
expected: FAIL
|
||||
|
||||
[" big5-hkscs" => "big5"]
|
||||
expected: FAIL
|
||||
|
||||
["big5-hkscs " => "big5"]
|
||||
expected: FAIL
|
||||
|
||||
[" big5-hkscs " => "big5"]
|
||||
expected: FAIL
|
||||
|
||||
["\\tbig5-hkscs" => "big5"]
|
||||
expected: FAIL
|
||||
|
||||
["big5-hkscs\\t" => "big5"]
|
||||
expected: FAIL
|
||||
|
||||
["\\tbig5-hkscs\\t" => "big5"]
|
||||
expected: FAIL
|
||||
|
||||
["\\nbig5-hkscs" => "big5"]
|
||||
expected: FAIL
|
||||
|
||||
["big5-hkscs\\n" => "big5"]
|
||||
expected: FAIL
|
||||
|
||||
["\\nbig5-hkscs\\n" => "big5"]
|
||||
expected: FAIL
|
||||
|
||||
["\\fbig5-hkscs" => "big5"]
|
||||
expected: FAIL
|
||||
|
||||
["big5-hkscs\\f" => "big5"]
|
||||
expected: FAIL
|
||||
|
||||
["\\fbig5-hkscs\\f" => "big5"]
|
||||
expected: FAIL
|
||||
|
||||
["\\rbig5-hkscs" => "big5"]
|
||||
expected: FAIL
|
||||
|
||||
["big5-hkscs\\r" => "big5"]
|
||||
expected: FAIL
|
||||
|
||||
["\\rbig5-hkscs\\r" => "big5"]
|
||||
expected: FAIL
|
||||
|
@ -1,33 +0,0 @@
|
||||
<!doctype html>
|
||||
<meta charset=big5> <!-- test breaks if the server overrides this -->
|
||||
<script src=/resources/testharness.js></script>
|
||||
<script src=/resources/testharnessreport.js></script>
|
||||
<div id=log></div>
|
||||
<script>
|
||||
function encode(input, output, desc) {
|
||||
test(function() {
|
||||
var a = document.createElement("a"); // <a> uses document encoding for URL's query
|
||||
// Append and prepend X to test for off-by-one errors
|
||||
a.href = "https://example.com/?X" + input + "X";
|
||||
assert_equals(a.search.substr(1), "X" + output + "X"); // remove leading "?"
|
||||
}, "big5 encoder: " + desc);
|
||||
}
|
||||
|
||||
encode("ab", "ab", "very basic")
|
||||
// edge cases
|
||||
encode("\u9EA6", "%26%2340614%3B", "Highest-pointer BMP character excluded from encoder");
|
||||
encode("\uD858\uDE6B", "%26%23156267%3B", "Highest-pointer character excluded from encoder");
|
||||
encode("\u3000", "%A1@", "Lowest-pointer character included in encoder");
|
||||
encode("\u20AC", "%A3%E1", "Euro; the highest-pointer character before a range of 30 unmapped pointers");
|
||||
encode("\u4E00", "%A4@", "The lowest-pointer character after the range of 30 unmapped pointers");
|
||||
encode("\uD85D\uDE07", "%C8%A4", "The highest-pointer character before a range of 41 unmapped pointers");
|
||||
encode("\uFFE2", "%C8%CD", "The lowest-pointer character after the range of 41 unmapped pointers");
|
||||
encode("\u79D4", "%FE%FE", "The last character in the index");
|
||||
// not in index
|
||||
encode("\u2603", "%26%239731%3B", "The canonical BMP test character that is not in the index");
|
||||
encode("\uD83D\uDCA9", "%26%23128169%3B", "The canonical astral test character that is not in the index");
|
||||
// duplicate low bits
|
||||
encode("\uD840\uDFB5", "%FDj", "A Plane 2 character whose low 16 bits match a BMP character that has a lower pointer");
|
||||
// prefer last
|
||||
encode("\u2550", "%F9%F9", "A duplicate-mapped code point that prefers the highest pointer in the encoder");
|
||||
</script>
|
Loading…
Reference in New Issue
Block a user