Back out 2 changesets (bug 912470) for static analysis bustage

CLOSED TREE

Backed out changeset 8ecf2f65d4f0 (bug 912470)
Backed out changeset 19af08a9c288 (bug 912470)
This commit is contained in:
Phil Ringnalda 2015-09-10 09:37:51 -07:00
parent 89639e1fda
commit 354efcb7fe
39 changed files with 44535 additions and 19924 deletions

View File

@ -54,8 +54,8 @@ xn--wgbh1c=windows-1256
gr=ISO-8859-7
hk=Big5
xn--j6w193g=Big5
hk=Big5-HKSCS
xn--j6w193g=Big5-HKSCS
hr=windows-1250

View File

@ -5,6 +5,7 @@
# x-unicode is assumed for encodings not listed here
Big5=zh-TW
Big5-HKSCS=zh-HK
EUC-JP=ja
EUC-KR=ko
gb18030=zh-CN

View File

@ -189,7 +189,7 @@ x-gbk=gbk
gb18030=gb18030
hz-gb-2312=replacement
big5=Big5
big5-hkscs=Big5
big5-hkscs=Big5-HKSCS
cn-big5=Big5
csbig5=Big5
x-x-big5=Big5

View File

@ -44,7 +44,6 @@ function runTextDecoderOptions()
}, "testDecodeABVOption");
test(testDecoderForThaiEncoding, "testDecoderForThaiEncoding");
test(testInvalid2022JP, "testInvalid2022JP");
test(testDecoderForBig5, "testDecoderForBig5");
}
/*
@ -356,7 +355,8 @@ function testDecoderGetEncoding()
{encoding: "x-mac-cyrillic", labels: ["x-mac-cyrillic", "x-mac-ukrainian"]},
{encoding: "gbk", labels: ["chinese", "csgb2312", "csiso58gb231280", "gb2312", "gb_2312", "gb_2312-80", "gbk", "iso-ir-58", "x-gbk"]},
{encoding: "gb18030", labels: ["gb18030"]},
{encoding: "big5", labels: ["big5", "cn-big5", "csbig5", "x-x-big5", "big5-hkscs"]},
{encoding: "big5", labels: ["big5", "cn-big5", "csbig5", "x-x-big5"]},
{encoding: "big5-hkscs", labels: ["big5-hkscs"]},
{encoding: "euc-jp", labels: ["cseucpkdfmtjapanese", "euc-jp", "x-euc-jp"]},
{encoding: "iso-2022-jp", labels: ["csiso2022jp", "iso-2022-jp"]},
{encoding: "shift_jis", labels: ["csshiftjis", "ms_kanji", "shift-jis", "shift_jis", "sjis", "windows-31j", "x-sjis"]},
@ -463,78 +463,3 @@ function testInvalid2022JP()
});
assert_equals(failureCount, 0, failureCount + " of " + inputs.length + " tests failed");
}
// Decoder test for the "big5" encoding.
//
// `inputs` and `expectations` are parallel arrays: inputs[i] is a byte
// sequence and expectations[i] is the string it is expected to decode to.
// Each pair is handed to testCharset() (not defined in this function;
// presumably the shared helper of this test file) together with a message
// that identifies the pair by index.
function testDecoderForBig5()
{
// Byte sequences to decode. The first entry is plain ASCII; the entries that
// start and end with 0x61 ... 0x62 repeat the preceding two-byte cases
// wrapped in ASCII "a" and "b".
const inputs = [
[ 0x61, 0x62 ],
[ 0x87, 0x40 ],
[ 0xFE, 0xFE ],
[ 0xFE, 0xFD ],
[ 0x88, 0x62 ],
[ 0x88, 0x64 ],
[ 0x88, 0x66 ],
[ 0x88, 0xA3 ],
[ 0x88, 0xA5 ],
[ 0x88, 0xA7 ],
[ 0x99, 0xD4 ],
[ 0x99, 0xD5 ],
[ 0x99, 0xD6 ],
[ 0x61, 0x87, 0x40, 0x62 ],
[ 0x61, 0xFE, 0xFE, 0x62 ],
[ 0x61, 0xFE, 0xFD, 0x62 ],
[ 0x61, 0x88, 0x62, 0x62 ],
[ 0x61, 0x88, 0x64, 0x62 ],
[ 0x61, 0x88, 0x66, 0x62 ],
[ 0x61, 0x88, 0xA3, 0x62 ],
[ 0x61, 0x88, 0xA5, 0x62 ],
[ 0x61, 0x88, 0xA7, 0x62 ],
[ 0x61, 0x99, 0xD4, 0x62 ],
[ 0x61, 0x99, 0xD5, 0x62 ],
[ 0x61, 0x99, 0xD6, 0x62 ],
[ 0x80, 0x61 ],
[ 0xFF, 0x61 ],
[ 0xFE, 0x39 ],
[ 0x87, 0x66 ],
[ 0x81, 0x40 ],
[ 0x61, 0x81 ],
];
// Expected decodings, in the same order as `inputs`. Some entries are
// surrogate pairs or a base letter followed by a combining mark; the last
// six each contain U+FFFD.
const expectations = [
"\u0061\u0062",
"\u43F0",
"\u79D4",
"\uD864\uDD0D",
"\u00CA\u0304",
"\u00CA\u030C",
"\u00CA",
"\u00EA\u0304",
"\u00EA\u030C",
"\u00EA",
"\u8991",
"\uD85E\uDD67",
"\u8A29",
"\u0061\u43F0\u0062",
"\u0061\u79D4\u0062",
"\u0061\uD864\uDD0D\u0062",
"\u0061\u00CA\u0304\u0062",
"\u0061\u00CA\u030C\u0062",
"\u0061\u00CA\u0062",
"\u0061\u00EA\u0304\u0062",
"\u0061\u00EA\u030C\u0062",
"\u0061\u00EA\u0062",
"\u0061\u8991\u0062",
"\u0061\uD85E\uDD67\u0062",
"\u0061\u8A29\u0062",
"\uFFFD\u0061",
"\uFFFD\u0061",
"\uFFFD\u0039",
"\uFFFD\u0066",
"\uFFFD\u0040",
"\u0061\uFFFD",
];
// Run every (input, expectation) pair; the loop is bounded by inputs.length,
// so the two arrays must be kept the same length.
for (var i = 0; i < inputs.length; i++) {
testCharset({encoding: "big5", input: inputs[i], expected: expectations[i],
msg: "decoder test #" + i + " for big5."});
}
}

View File

@ -11,7 +11,7 @@ acp.932=Shift_JIS
acp.936=gb18030
acp.949=EUC-KR
acp.950=Big5
acp.951=Big5
acp.951=Big5-HKSCS
acp.1250=windows-1250
acp.1251=windows-1251
acp.1252=windows-1252

View File

@ -27,6 +27,7 @@ EXPORTS += [
'ucvja/nsUCVJACID.h',
'ucvko/nsUCvKOCID.h',
'ucvlatin/nsUCvLatinCID.h',
'ucvtw/nsUCvTWCID.h',
]
UNIFIED_SOURCES += [
@ -136,9 +137,10 @@ UNIFIED_SOURCES += [
]
UNIFIED_SOURCES += [
'ucvtw/nsBIG5Data.cpp',
'ucvtw/nsBIG5HKSCSToUnicode.cpp',
'ucvtw/nsBIG5ToUnicode.cpp',
'ucvtw/nsUnicodeToBIG5.cpp',
'ucvtw/nsUnicodeToBIG5HKSCS.cpp',
]
UNIFIED_SOURCES += [

View File

@ -82,18 +82,14 @@ public:
* @param aDestLength [IN/OUT] the length of the destination data buffer;
* after conversion will contain the number of Unicode
* characters written
* @return NS_ERROR_UDEC_ILLEGALINPUT if an illegal input sequence
* @return NS_PARTIAL_MORE_INPUT if only a partial conversion was
* done; more input is needed to continue
* NS_PARTIAL_MORE_OUTPUT if only a partial conversion
* was done; more output space is needed to continue
* NS_ERROR_ILLEGAL_INPUT if an illegal input sequence
* was encountered and the behavior was set to "signal";
* the caller must skip over one byte, reset the decoder
* and retry.
* NS_OK_UDEC_MOREOUTPUT if only a partial conversion
* was done; more output space is needed to continue
* NS_OK_UDEC_MOREINPUT if the input ended in the middle
* of an input code unit sequence. If this is the last
* result the caller has at the end of the stream, the
* caller must append one U+FFFD to the output.
* NS_OK if the input ended after a complete input code
* unit sequence.
*/
NS_IMETHOD Convert(const char * aSrc, int32_t * aSrcLength,
char16_t * aDest, int32_t * aDestLength) = 0;

View File

@ -96,12 +96,6 @@ public:
* the first of a surrogate pair.
* NS_ERROR_UENC_NOMAPPING if character without mapping
* was encountered and the behavior was set to "signal".
* In the case of an unmappable BMP character, aDestLength
* must indicate that the unmappable character was
* consumed by the encoder (unlike in the decode API!).
* In the case of an unmappable astral character,
* aDestLength must indicate that the high surrogate was
* consumed by the encoder but the low surrogate was not.
*/
NS_IMETHOD Convert(const char16_t * aSrc, int32_t * aSrcLength,
char * aDest, int32_t * aDestLength) = 0;

View File

@ -107,8 +107,12 @@
#include "nsUnicodeToISO2022JP.h"
// ucvtw
#include "nsUCvTWCID.h"
#include "nsUCvTWDll.h"
#include "nsBIG5ToUnicode.h"
#include "nsUnicodeToBIG5.h"
#include "nsBIG5HKSCSToUnicode.h"
#include "nsUnicodeToBIG5HKSCS.h"
// ucvko
#include "nsUCvKOCID.h"
@ -180,6 +184,7 @@ NS_UCONV_REG_UNREG("EUC-JP", NS_EUCJPTOUNICODE_CID, NS_UNICODETOEUCJP_CID)
// ucvtw
NS_UCONV_REG_UNREG("Big5", NS_BIG5TOUNICODE_CID, NS_UNICODETOBIG5_CID)
NS_UCONV_REG_UNREG("Big5-HKSCS", NS_BIG5HKSCSTOUNICODE_CID, NS_UNICODETOBIG5HKSCS_CID)
// ucvko
NS_UCONV_REG_UNREG("EUC-KR", NS_EUCKRTOUNICODE_CID, NS_UNICODETOEUCKR_CID)
@ -209,8 +214,6 @@ NS_GENERIC_FACTORY_CONSTRUCTOR(nsISO2022JPToUnicodeV2)
NS_GENERIC_FACTORY_CONSTRUCTOR(nsUnicodeToISO2022JP)
// ucvtw
NS_GENERIC_FACTORY_CONSTRUCTOR(nsBIG5ToUnicode)
NS_GENERIC_FACTORY_CONSTRUCTOR(nsUnicodeToBIG5)
// ucvko
@ -244,6 +247,23 @@ const uint16_t g_ASCIIMappingTable[] = {
0x0001, 0x0004, 0x0005, 0x0008, 0x0000, 0x0000, 0x007F, 0x0000
};
// ucvtw
const uint16_t g_ufBig5Mapping[] = {
#include "big5.uf"
};
const uint16_t g_utBIG5Mapping[] = {
#include "big5.ut"
};
const uint16_t g_ufBig5HKSCSMapping[] = {
#include "hkscs.uf"
};
const uint16_t g_utBig5HKSCSMapping[] = {
#include "hkscs.ut"
};
// ucvko
const uint16_t g_utKSC5601Mapping[] = {
#include "u20kscgl.ut"
@ -357,6 +377,8 @@ NS_DEFINE_NAMED_CID(NS_UNICODETOEUCJP_CID);
NS_DEFINE_NAMED_CID(NS_UNICODETOISO2022JP_CID);
NS_DEFINE_NAMED_CID(NS_UNICODETOBIG5_CID);
NS_DEFINE_NAMED_CID(NS_BIG5TOUNICODE_CID);
NS_DEFINE_NAMED_CID(NS_UNICODETOBIG5HKSCS_CID);
NS_DEFINE_NAMED_CID(NS_BIG5HKSCSTOUNICODE_CID);
NS_DEFINE_NAMED_CID(NS_EUCKRTOUNICODE_CID);
NS_DEFINE_NAMED_CID(NS_UNICODETOEUCKR_CID);
NS_DEFINE_NAMED_CID(NS_GBKTOUNICODE_CID);
@ -459,6 +481,8 @@ static const mozilla::Module::CIDEntry kUConvCIDs[] = {
{ &kNS_UNICODETOISO2022JP_CID, false, nullptr, nsUnicodeToISO2022JPConstructor },
{ &kNS_UNICODETOBIG5_CID, false, nullptr, nsUnicodeToBIG5Constructor },
{ &kNS_BIG5TOUNICODE_CID, false, nullptr, nsBIG5ToUnicodeConstructor },
{ &kNS_UNICODETOBIG5HKSCS_CID, false, nullptr, nsUnicodeToBIG5HKSCSConstructor },
{ &kNS_BIG5HKSCSTOUNICODE_CID, false, nullptr, nsBIG5HKSCSToUnicodeConstructor },
{ &kNS_EUCKRTOUNICODE_CID, false, nullptr, nsCP949ToUnicodeConstructor },
{ &kNS_UNICODETOEUCKR_CID, false, nullptr, nsUnicodeToCP949Constructor },
{ &kNS_GBKTOUNICODE_CID, false, nullptr, nsGB18030ToUnicodeConstructor },
@ -563,6 +587,8 @@ static const mozilla::Module::ContractIDEntry kUConvContracts[] = {
{ NS_UNICODEENCODER_CONTRACTID_BASE "ISO-2022-JP", &kNS_UNICODETOISO2022JP_CID },
{ NS_UNICODEENCODER_CONTRACTID_BASE "Big5", &kNS_UNICODETOBIG5_CID },
{ NS_UNICODEDECODER_CONTRACTID_BASE "Big5", &kNS_BIG5TOUNICODE_CID },
{ NS_UNICODEENCODER_CONTRACTID_BASE "Big5-HKSCS", &kNS_UNICODETOBIG5HKSCS_CID },
{ NS_UNICODEDECODER_CONTRACTID_BASE "Big5-HKSCS", &kNS_BIG5HKSCSTOUNICODE_CID },
{ NS_UNICODEDECODER_CONTRACTID_BASE "EUC-KR", &kNS_EUCKRTOUNICODE_CID },
{ NS_UNICODEENCODER_CONTRACTID_BASE "EUC-KR", &kNS_UNICODETOEUCKR_CID },
{ NS_UNICODEDECODER_CONTRACTID_BASE "gbk", &kNS_GBKTOUNICODE_CID },

View File

@ -12,4 +12,3 @@ skip-if = toolkit == 'android' #bug 775227
[test_unicode_noncharacters_gb18030.html]
[test_unicode_noncharacters_utf8.html]
[test_utf8_overconsumption.html]
[test_big5_encoder.html]

View File

@ -1,43 +0,0 @@
<!DOCTYPE HTML>
<html>
<!--
https://bugzilla.mozilla.org/show_bug.cgi?id=912470
-->
<head>
<meta http-equiv="Content-type" content="text/html; charset=UTF-8">
<title>Test for Unicode non-characters</title>
<script type="text/javascript" src="/tests/SimpleTest/SimpleTest.js"></script>
<link rel="stylesheet" type="text/css" href="/tests/SimpleTest/test.css" />
</head>
<body onload="test()">
<pre id="test">
<script class="testbody" type="text/javascript">
/* NOTE:
* When we make our data: URL origin work as in Blink, this test will fail.
* Hopefully, by that time our URL parser has become spec-compliant, so that
* we'll pass the Web Platform Test for the big5 encoder
* (testing/web-platform/tests/encoding/big5-encoder.html) and this test can
* simply be removed.
*/
SimpleTest.waitForExplicitFinish();
// Submits the form inside the page's first <iframe> and checks how the
// "foo" field was encoded in the resulting URL.
function test() {
var f = document.getElementsByTagName("iframe")[0];
// The handler is installed before submit() below, so it runs for the load
// triggered by the form submission.
f.onload = function() {
var href = f.contentWindow.location.href;
// Everything after "?foo=" is the encoded field value ("?foo=" is 5 chars).
var index = href.indexOf("?foo=");
var actual = href.substring(index + 5);
var expected = "h%26%2340614%3Bi%26%23156267%3Bj%A1%40k%A3%E1l%A4%40m%C8%A4n%C8%CDo%FE%FEp%26%238365%3Bq%FDjr%F9%F9s%26%23128169%3Bt";
is(actual, expected, "Should have gotten the expected encode.");
SimpleTest.finish();
}
f.contentDocument.forms[0].submit();
}
</script>
</pre>
<a target="_blank" href="https://bugzilla.mozilla.org/show_bug.cgi?id=912470">Mozilla Bug 912470</a>
<p id="display"></p>
<div id="content" style="display: none"><iframe src="data:text/html;charset=big5,<form><input name=foo value=h&amp;%23x9EA6;i&amp;%23x2626B;j&amp;%23x3000;k&amp;%23x20AC;l&amp;%23x4E00;m&amp;%23x27607;n&amp;%23xFFE2;o&amp;%23x79D4;p&amp;%23x20AD;q&amp;%23x203B5;r&amp;%23x2550;s&amp;%23x1F4A9;t></form>">
</div>
</body>
</html>

View File

@ -11,6 +11,7 @@ function run_test() {
// this list excludes codepages that can represent all Unicode
var encoders = [
"Big5",
"Big5-HKSCS",
"EUC-JP",
"EUC-KR",
"gbk",

View File

@ -1,253 +0,0 @@
#!/usr/bin/python
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
# Adapted from
# https://hg.mozilla.org/projects/htmlparser/file/0d906fb1ab90/generate-encoding-data.py
# indexes.json comes from
# https://encoding.spec.whatwg.org/indexes.json
# i.e.
# https://github.com/whatwg/encoding/blob/ce4e83d0df5b5efec0697fc76e66699737e033a3/indexes.json
import json
indexes = json.load(open("indexes.json", "r"))
def nullToZero(codePoint):
    """Return codePoint unchanged, or 0 when it is falsy (e.g. None)."""
    return codePoint if codePoint else 0
index = []
for codePoint in indexes["big5"]:
index.append(nullToZero(codePoint))
# There are four major gaps consisting of more than 4 consecutive invalid pointers
gaps = []
consecutive = 0
consecutiveStart = 0
offset = 0
for codePoint in index:
if codePoint == 0:
if consecutive == 0:
consecutiveStart = offset
consecutive +=1
else:
if consecutive > 4:
gaps.append((consecutiveStart, consecutiveStart + consecutive))
consecutive = 0
offset += 1
def invertRanges(ranges, cap):
    """Return the half-open ranges of [0, cap) that lie between `ranges`.

    `ranges` is a list of (start, end) pairs in ascending order. The result
    pairs each previous end (0 for the first) with the next start, and ends
    with a final range that runs up to `cap`. A pair whose start is 0 has
    nothing in front of it, so no range is emitted for it.
    """
    lows = [0] + [end for (_, end) in ranges]
    highs = [start for (start, _) in ranges] + [cap]
    pairs = list(zip(lows, highs))
    # Every pair except the last one precedes an input range; keep it only
    # when that input range does not start at offset 0.
    leading = [pair for pair in pairs[:-1] if pair[1] != 0]
    return leading + pairs[-1:]
cap = len(index)
ranges = invertRanges(gaps, cap)
# Now compute a compressed lookup table for astralness
gaps = []
consecutive = 0
consecutiveStart = 0
offset = 0
for codePoint in index:
if codePoint <= 0xFFFF:
if consecutive == 0:
consecutiveStart = offset
consecutive +=1
else:
if consecutive > 40:
gaps.append((consecutiveStart, consecutiveStart + consecutive))
consecutive = 0
offset += 1
astralRanges = invertRanges(gaps, cap)
classFile = open("../ucvtw/nsBIG5Data.cpp", "w")
classFile.write('''/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
/*
* THIS IS A GENERATED FILE. PLEASE DO NOT EDIT.
* Instead, please regenerate using intl/uconv/tools/gen-big5-data.py
*/
#include "nsBIG5Data.h"
static const char16_t kBig5LowBitsTable[] = {
''')
for (low, high) in ranges:
for i in xrange(low, high):
classFile.write(' 0x%04X,\n' % (index[i] & 0xFFFF))
classFile.write('''};
static const uint32_t kBig5AstralnessTable[] = {
''')
# An array of bool is inefficient per
# http://stackoverflow.com/questions/4049156/1-bit-per-bool-in-array-c
bits = []
for (low, high) in astralRanges:
for i in xrange(low, high):
bits.append(1 if index[i] > 0xFFFF else 0)
# Pad the bit list with zeros up to the next multiple of 32 so that it can be
# packed into whole 32-bit words below. (-len) % 32 is 0 when the length is
# already aligned, so no superfluous all-zero word is appended in that case.
for i in xrange((-len(bits)) % 32):
    bits.append(0)
i = 0
while i < len(bits):
accu = 0
for j in xrange(32):
accu |= bits[i + j] << j
classFile.write(' 0x%08X,\n' % accu)
i += 32
classFile.write('''};
// static
char16_t
nsBIG5Data::LowBits(size_t aPointer)
{
''')
base = 0
for (low, high) in ranges:
classFile.write(''' if (aPointer < %d) {
return 0;
}
if (aPointer < %d) {
return kBig5LowBitsTable[%d + (aPointer - %d)];
}
''' % (low, high, base, low))
base += (high - low)
classFile.write(''' return 0;
}
// static
bool
nsBIG5Data::IsAstral(size_t aPointer)
{
''')
base = 0
for (low, high) in astralRanges:
if high - low == 1:
classFile.write(''' if (aPointer < %d) {
return false;
}
if (aPointer == %d) {
return true;
}
''' % (low, low))
else:
classFile.write(''' if (aPointer < %d) {
return false;
}
if (aPointer < %d) {
size_t index = %d + (aPointer - %d);
return kBig5AstralnessTable[index >> 5] & (1 << (index & 0x1F));
}
''' % (low, high, base, low))
base += (high - low)
classFile.write(''' return false;
}
//static
size_t
nsBIG5Data::FindPointer(char16_t aLowBits, bool aIsAstral)
{
if (!aIsAstral) {
switch (aLowBits) {
''')
hkscsBound = (0xA1 - 0x81) * 157
preferLast = [
0x2550,
0x255E,
0x2561,
0x256A,
0x5341,
0x5345,
]
for codePoint in preferLast:
# Python lists don't have .rindex() :-(
for i in xrange(len(index) - 1, -1, -1):
candidate = index[i]
if candidate == codePoint:
classFile.write(''' case 0x%04X:
return %d;
''' % (codePoint, i))
break
classFile.write(''' default:
break;
}
}''')
base = 0
start = 0
for (low, high) in ranges:
if low <= hkscsBound and hkscsBound < high:
# This is the first range we don't ignore and the
# range that contains the first non-HKSCS pointer.
# Avoid searching HKSCS.
start = base + hkscsBound - low
break
base += (high - low)
classFile.write('''
for (size_t i = %d; i < MOZ_ARRAY_LENGTH(kBig5LowBitsTable); ++i) {
if (kBig5LowBitsTable[i] == aLowBits) {
size_t pointer;
''' % start)
base = 0
prevLow = 0
prevHigh = 0
prevBase = 0
writing = False
for (low, high) in ranges:
if writing:
classFile.write('''if (i < %d) {
pointer = i + %d;
} else ''' % ((prevBase + prevHigh - prevLow), (prevLow - prevBase)))
prevLow = low
prevHigh = high
prevBase = base
if high > hkscsBound:
writing = True
base += (high - low)
classFile.write('''{
pointer = i + %d;
}''' % (prevLow - prevBase))
classFile.write('''
if (aIsAstral == IsAstral(pointer)) {
return pointer;
}
}
}
return 0;
}
''')
classFile.close()

View File

@ -0,0 +1,959 @@
#!/usr/bin/perl -w
#
# gen-big5hkscs-2001-mozilla.pl
# a Perl script that generates Big5-HKSCS <-> Unicode
# conversion tables for Mozilla
#
# Author (of the original Perl script):
# Anthony Fok <anthony@thizlinux.com> <foka@debian.org>
# Copyright (C) 2001, 2002 ThizLinux Laboratory Ltd.
# License: GNU General Public License, v2 or later.
#
# This version includes original C source code from
# glibc-2.2.5/iconvdata/big5hkscs.c by Ulrich Drepper <drepper@redhat.com>
# Roger So <roger.so@sw-linux.com>
#
# First attempt for Qt-2.3.x: 2001-09-21
# A working version for Qt-2.3.x: 2001-10-30
# Ported to glibc-2.2.5 with HKSCS-2001: 2002-03-21
# Adapted to generate conversion tables for Mozilla: 2002-11-26
# Adapted to generate conversion tables for Mozilla: 2002-11-30
# Cleaned up the script somewhat: 2002-12-04
# Minor revisions for submitting to Mozilla Bugzilla: 2002-12-10
#
# Notes:
#
# 1. The latest version of this script may be found in:
# http://www.thizlinux.com/~anthony/hkscs/gen-glibc-big5hkscs.pl
# http://people.debian.org/~foka/hkscs/gen-glibc-big5hkscs.pl
# Or, better yet, e-mail me and ask for the latest version.
#
# 2. This script generates data from 3 tables:
# a. http://www.microsoft.com/typography/unicode/950.txt
# b. http://www.info.gov.hk/digital21/chi/hkscs/download/big5-iso.txt
# c. http://www.info.gov.hk/digital21/chi/hkscs/download/big5cmp.txt
#
# Make sure your big5-iso.txt is the latest HKSCS-2001 version.
#
# 3. [glibc]: I have currently split the ucs_to_big5_hkscs_?[] tables into
# different areas similar to the way Ulrich and Roger did it,
# but extended for HKSCS-2001.
#
# 4. [Mozilla]: This script is very quick-and-dirty in some places.
# Call either gen_mozilla_uf() or gen_mozilla_ut() to generate
# the appropriate tables for feeding into "fromu" or "tou".
#
# 5. [CharMapML]: The comments regarding TW-BIG5 herein need to be organized.
# Also, please make sure "$hkscs_mode = 0;" for TW-BIG5 mode.
# Otherwise, this script would generate a HKSCS table.
# (Yes, I know, I should clean up this script and make it more modular,
# and with command-line options or whatnot. I'll do that later. :-)
#
# If you have any questions or concerns, please feel free to contact me
# at Anthony Fok <anthony@thizlinux.com> or <foka@debian.org> :-)
#
# Last but not least, special thanks to ThizLinux Laboratory Ltd. (HK)
# for their generous support in this work.
#
# 1. UDA3, 0x8840 - 0x8dfe
# 2. UDA2, 0x8e40 - 0xa0fe
# 3. VDA, 0xc6a1 - 0xc8fe
#use Getopt::Std;
my ( %b2u, %u2b, $unicode, $big5, $high, $low, $i, $count );
my $debug = 0;
my $hkscs_mode = 1;
my $kangxi = 0;
my $use_range = 0;
my $bmp_only = 1;
#
# Subroutine Declaration
#
sub read_cp950();
sub adjust_radicals();
sub read_hkscs_main();
sub read_hkscs_cmp();
sub post_tuning();
sub gen_charmapml();
sub gen_check_b2u();
sub gen_check_u2b();
sub gen_mozilla_uf();
sub gen_mozilla_ut();
sub gen_glibc();
###########################################################################
#
# Main program
#
# First, read Microsoft's CP950 as base Big5.
read_cp950 ();
# Add mappings to Kangxi Radicals.
# The b2u direction is added only if $kangxi is not null.
adjust_radicals ();
# Then, read the HKSCS table.
# Again, see the $hkscs_mode variable.
read_hkscs_main ();
read_hkscs_cmp () if $hkscs_mode;
post_tuning ();
# Then, choose one of the following:
#gen_charmapml();
gen_mozilla_uf();
#gen_mozilla_ut();
#gen_check_u2b();
#gen_glibc();
# End of program
exit 0;
#############################################################################
#
# Subroutines
#
# read_cp950 -- seed %b2u and %u2b from Microsoft's code page 950 table.
#
# Reads "950.txt" from the current directory and dies if it cannot be opened.
# Rows that follow a "DBCSTABLE <count> ;LeadByte = 0x<hh>" header fill %b2u
# (Big5 -> Unicode); rows that follow a "WCTABLE <count>" header fill %u2b
# (Unicode -> Big5), except for the mappings filtered out below.  Parsing
# stops at a line starting with "ENDCODEPAGE".  Keys and values are stored as
# upper-case hex strings without the "0x" prefix.
#
# Uses the file-level variables $count, $high, $low, $i, $unicode and $big5
# as scratch space.
sub read_cp950() {
    open( CP950, "950.txt" ) or die;
    my $mode = 0;    # 0 = outside a table, 1 = in a DBCSTABLE, 2 = in WCTABLE
    while (<CP950>) {
        s/\r//;
        chomp;
        next if /^$/;
        last if /^ENDCODEPAGE/;
        if (/^DBCSTABLE (\d+)\s+;LeadByte = 0x([0-9a-f]{2})/) {
            $mode = 1;
            ( $count, $high ) = ( $1, $2 );
            $i = 0;
            next;
        }
        if (/^WCTABLE (\d+)/) {
            $mode = 2;
            $count = $1;
            $i = 0;
            next;
        }
        next if $mode == 0;
        if ( $mode == 1 ) {
            # Row format: trail byte <TAB> Unicode <TAB> comment.
            ( $low, $unicode, $comment ) = split "\t";
            $low =~ s/^0x//;
            $unicode =~ s/^0x//;
            $big5 = $high . $low;
            $b2u{ uc($big5) } = uc($unicode);
            # Leave the table once the announced number of rows was read.
            if ( ++$i == $count ) { $mode = 0; $count = 0; next; }
        }
        if ( $mode == 2 ) {
            # Row format: Unicode <TAB> Big5 <TAB> comment.
            ( $unicode, $big5, $comment ) = split "\t";
            $unicode =~ s/^0x//;
            $big5 =~ s/^0x//;
            my $u = hex($unicode);
            my $b = hex($big5);
            $u2b{ uc($unicode) } = uc($big5) unless
            # Skip Microsoft's over-generous (or over-zealous?) mappings
            # "Faked" accented latin characters
            ( $b <= 0xFF and $b != $u )
            # "Faked" Ideographic Annotation ___ Mark
            or ( $u >= 0x3192 and $u <= 0x319F )
            # "Faked" Parenthesized Ideograph ___
            or ( $u >= 0x3220 and $u <= 0x3243 )
            # "Faked" Circled Ideograph ___ except Circled Ideograph Correct
            or ( $u >= 0x3280 and $u <= 0x32B0 and $u != 0x32A3 )
            # ¢£¥’μ﹐
            or ( $u == 0xA2
            or $u == 0xA3
            or $u == 0xA5
            or $u == 0xB4
            or $u == 0xB5
            or $u == 0xB8 )
            # ¯─∥‧˙〃 ̄﹨°≡︴⊙⊕~﹋
            or ( $u == 0x0305 # ???
            or $u == 0x2015
            or $u == 0x2016
            or $u == 0x2022
            or $u == 0x2024
            or $u == 0x2033
            or $u == 0x203E # ???
            or $u == 0x2216
            or $u == 0x2218
            or $u == 0x2263
            or $u == 0x2307
            or $u == 0x2609
            or $u == 0x2641
            or $u == 0x301C
            or $u == 0x3030 )
            # ︿‘﹑
            or ( $u == 0xFF3E or $u == 0xFF40 or $u == 0xFF64 );
            if ( ++$i == $count ) { $mode = 0; $count = 0; next; }
        }
    }
    # Release the handle, as the other table readers in this script do.
    close CP950;
}
# adjust_radicals -- map B5+C6BF .. B5+C6D7 to the Kangxi Radicals block.
#
# The Unicode -> Big5 entry (%u2b) for each radical is always added.  The
# Big5 -> Unicode entry (%b2u) is only overridden when $kangxi is true.
sub adjust_radicals() {
# B5+C6BF - B5+C6D7: Radicals (?)
# TW-BIG5 drafted by Autrijus uses Kangxi Radicals whenever possible.
#
# Big5-HKSCS tends towards using the character in Unicode CJK Ideographs
# Note that HKSCS does not explicitly define
# B5+C6CF, B5+C6D3, B5+C6D5, B5+C6D7 (廴、无、癶、隶),
# but does have these characters at B5+FBFD, B5+FCD3, B5+FEC1, B5+90C4,
# mapped to U+5EF4, U+65E0, U+7676, U+96B6 respectively.
#
# As for B5+C6CD (⼳), HKSCS maps it to U+2F33 just like TW-BIG5.
# However, it also maps B5+FBF4 (幺) to U+5E7A.
$b2u{"C6BF"} = "2F02" if $kangxi;
$u2b{"2F02"} = "C6BF"; #
$b2u{"C6C0"} = "2F03" if $kangxi;
$u2b{"2F03"} = "C6C0"; # 丿
$b2u{"C6C1"} = "2F05" if $kangxi;
$u2b{"2F05"} = "C6C1"; # 亅
$b2u{"C6C2"} = "2F07" if $kangxi;
$u2b{"2F07"} = "C6C2"; # 亠
$b2u{"C6C3"} = "2F0C" if $kangxi;
$u2b{"2F0C"} = "C6C3"; # 冂
$b2u{"C6C4"} = "2F0D" if $kangxi;
$u2b{"2F0D"} = "C6C4"; # 冖
$b2u{"C6C5"} = "2F0E" if $kangxi;
$u2b{"2F0E"} = "C6C5"; # 冫
$b2u{"C6C6"} = "2F13" if $kangxi;
$u2b{"2F13"} = "C6C6"; # 勹
$b2u{"C6C7"} = "2F16" if $kangxi;
$u2b{"2F16"} = "C6C7"; # 匸
$b2u{"C6C8"} = "2F19" if $kangxi;
$u2b{"2F19"} = "C6C8"; # 卩
$b2u{"C6C9"} = "2F1B" if $kangxi;
$u2b{"2F1B"} = "C6C9"; # 厶
$b2u{"C6CA"} = "2F22" if $kangxi;
$u2b{"2F22"} = "C6CA"; # 夊
$b2u{"C6CB"} = "2F27" if $kangxi;
$u2b{"2F27"} = "C6CB"; # 宀
$b2u{"C6CC"} = "2F2E" if $kangxi;
$u2b{"2F2E"} = "C6CC"; # 巛
$b2u{"C6CD"} = "2F33" if $kangxi;
$u2b{"2F33"} = "C6CD"; # ⼳
$b2u{"C6CE"} = "2F34" if $kangxi;
$u2b{"2F34"} = "C6CE"; # 广
$b2u{"C6CF"} = "2F35" if $kangxi;
$u2b{"2F35"} = "C6CF"; # 廴
$b2u{"C6D0"} = "2F39" if $kangxi;
$u2b{"2F39"} = "C6D0"; # 彐
$b2u{"C6D1"} = "2F3A" if $kangxi;
$u2b{"2F3A"} = "C6D1"; # 彡
$b2u{"C6D2"} = "2F41" if $kangxi;
$u2b{"2F41"} = "C6D2"; # 攴
$b2u{"C6D3"} = "2F46" if $kangxi;
$u2b{"2F46"} = "C6D3"; # 无
$b2u{"C6D4"} = "2F67" if $kangxi;
$u2b{"2F67"} = "C6D4"; # 疒
$b2u{"C6D5"} = "2F68" if $kangxi;
$u2b{"2F68"} = "C6D5"; # 癶
$b2u{"C6D6"} = "2FA1" if $kangxi;
$u2b{"2FA1"} = "C6D6"; # 辵
$b2u{"C6D7"} = "2FAA" if $kangxi;
$u2b{"2FAA"} = "C6D7"; # 隶
}
# read_hkscs_main -- merge the HKSCS table "big5-iso.txt" into %b2u / %u2b.
#
# Only lines containing four whitespace-separated hex fields are used:
# the Big5 code followed by three Unicode columns, kept in $iso1993,
# $iso2000 and $iso2001 (the last may have five digits).
# When $hkscs_mode is false, only rows whose Big5 code lies in C6A1-C8D3
# (excluding C6BF-C6D7) or F9D6-F9FE are taken.
# %b2u receives the $iso2000 column when $bmp_only is set, otherwise the
# $iso2001 column; %u2b receives an entry for each of the three columns.
# With $debug set, redefinitions are reported on STDERR.
sub read_hkscs_main() {
open( B2U, "<big5-iso.txt" ) or die;
while (<B2U>) {
next
unless
/([[:xdigit:]]{4})\s+([[:xdigit:]]{4})\s+([[:xdigit:]]{4})\s+([[:xdigit:]]{4,5})/;
( $big5, $iso1993, $iso2000, $iso2001 ) = ( $1, $2, $3, $4 );
my $b = hex($big5);
# For non-HKSCS mode, only take data in the VDA range (?)
next unless $hkscs_mode
# Note that we don't go from B5+C6A1-B5+C6FE, but rather only
# C6A1-C8D3 excluding C6BF-C6D7 (Kangxi Radicals)
# because C8D4-C8FE are not assigned in TW-BIG5
# if we are to follow Arphic PL Big-5 fonts. (To be discussed)
or
( $b >= 0xC6A1 && $b <= 0xC8D3 and !( $b >= 0xC6BF && $b <= 0xC6D7 ) )
or ( $b >= 0xF9D6 && $b <= 0xF9FE );
print STDERR
"B2U, 2000: $big5 redefined from U+$b2u{$big5} to U+$iso2000.\n"
if $debug
and defined( $b2u{$big5} )
and $b2u{$big5} ne $iso2000;
# The Big5 -> Unicode entry is left untouched only for B5+F9FE in
# non-HKSCS mode (see the note below).
$b2u{$big5} = $bmp_only ? $iso2000 : $iso2001
unless !$hkscs_mode
and $b == 0xF9FE;
# B5+F9FE is mapped differently in TW-BIG5 and HKSCS, to
# U+2593 (Dark Shade) and U+FFED (Halfwidth Black Square) respectively.
# Which is more correct? I don't know! (To be discussed)
print STDERR
"1993: U+$iso1993 redefined from $u2b{$iso1993} to $big5.\n"
if $debug
and defined( $u2b{$iso1993} )
and $u2b{$iso1993} ne $big5;
$u2b{$iso1993} = $big5;
print STDERR
"2000: U+$iso2000 redefined from $u2b{$iso2000} to $big5.\n"
if $debug
and defined( $u2b{$iso2000} )
and $u2b{$iso2000} ne $big5;
$u2b{$iso2000} = $big5;
print STDERR
"2001: U+$iso2001 redefined from $u2b{$iso2001} to $big5.\n"
if $debug
and defined( $u2b{$iso2001} )
and $u2b{$iso2001} ne $big5;
$u2b{$iso2001} = $big5;
}
close B2U;
} # read_hkscs_main()
# read_hkscs_cmp -- apply the Big5 compatibility pairs from "big5cmp.txt".
#
# Lines before the first one starting with "=====" are skipped; reading stops
# at the first following line that begins with whitespace.  Each data line is
# split into two fields, a compatibility code ($big5cmp) and a second Big5
# code ($big5).  The compatibility code is remapped to the second code's
# Unicode value, and the Unicode value the compatibility code had before is
# pointed at the second code.
sub read_hkscs_cmp() {
###########################################################################
# Add Big5 compatibility coding...
#
# Stephan, here is the code segment that you may want to implement
# in your convertbig5hkscs2001.pl
#
open( B5CMP, "<big5cmp.txt" ) or die;
# NOTE(review): $mode is not declared with "my" here, so this assigns the
# package-level $mode.
$mode = 0;
while (<B5CMP>) {
if (/^=====/) { $mode = 1; next; }
next if $mode == 0;
last if $mode == 1 and /^\s+/;
chomp;
my ( $big5cmp, $big5 ) = split " ";
$big5cmp = uc($big5cmp);
$big5 = uc($big5);
# Capture both current Unicode values before either hash is modified.
my $uni = $b2u{$big5};
my $unicmp = $b2u{$big5cmp};
print STDERR
"Was: U+$unicmp -> $u2b{$unicmp}, $big5cmp -> U+$b2u{$big5cmp}\t"
if $debug;
$b2u{$big5cmp} = $uni;
$u2b{$unicmp} = $big5;
print STDERR
"Now: U+$unicmp -> $u2b{$unicmp}, $big5cmp -> U+$b2u{$big5cmp}\n"
if $debug;
}
close B5CMP;
} # read_hkscs_cmp();
# post_tuning -- final manual adjustments to %b2u and %u2b.
#
# Adds the identity mappings for 0x00 .. 0x80 and the Euro sign, always maps
# U+5F5E to C255, and, outside HKSCS mode only, applies the TW-BIG5
# preferences for box drawing characters, Suzhou numerals and C255.
sub post_tuning() {

    # Single-byte codes 0x00 .. 0x80 decode to the same code point.
    foreach $i ( 0x00 .. 0x80 ) {
        $unicode = sprintf( "%04X", $i );
        $big5    = $unicode;
        $b2u{$big5} = $unicode;
    }

    # Euro sign (absent from 950.txt).
    ( $b2u{"A3E1"}, $u2b{"20AC"} ) = ( "20AC", "A3E1" );

    # The character for ethnic group "Yi": the encode direction is
    # always added.
    $u2b{"5F5E"} = "C255";

    unless ($hkscs_mode) {

        # Box drawing characters: align with Big-5E (differs from CP950 and
        # HKSCS).  Each also exists at the B5-F9xx code noted alongside.
        my %box_drawing = (
            "2550" => "A2A4",    # also B5-F9F9
            "255E" => "A2A5",    # also B5-F9E9
            "2561" => "A2A7",    # also B5-F9EB
            "256A" => "A2A6",    # also B5-F9EA
            "256D" => "A27E",    # also B5-F9FA
            "256E" => "A2A1",    # also B5-F9FB
            "256F" => "A2A3",    # also B5-F9FD
            "2570" => "A2A2",    # also B5-F9FC
        );
        @u2b{ keys %box_drawing } = values %box_drawing;

        # "Hangzhou" or "Suzhou" Chinese numerals 10, 20, 30, mapped in
        # both directions.
        my %suzhou = ( "A2CC" => "3038", "A2CD" => "3039", "A2CE" => "303A" );
        while ( my ( $code, $ucs ) = each %suzhou ) {
            $b2u{$code} = $ucs;
            $u2b{$ucs}  = $code;
        }

        # "Yi" again: the decode direction only applies to TW-BIG5.
        $b2u{"C255"} = "5F5E";
    }
} # post_tuning()
# gen_charmapml -- print a CharMapML (UTR #22) XML document to STDOUT.
#
# The document id and history text depend on $hkscs_mode.  The assignments
# section is emitted in this order:
#   1. <a>   entries: pairs that agree in both %u2b and %b2u,
#   2. <fub> entries: %u2b entries with no matching %b2u entry,
#   3. <fbu> entries: %b2u entries with no matching %u2b entry,
#   4. <range> entries for the user-defined areas, only when $use_range is
#      set and $hkscs_mode is not.
sub gen_charmapml() {
###########################################################################
#
# Codes for generating CharMapML XML file
print <<EOT;
<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE characterMapping SYSTEM "http://www.unicode.org/unicode/reports/tr22/CharacterMapping.dtd">
EOT
if ($hkscs_mode) {
print <<EOT;
<characterMapping id="big5-hkscs-2001" version="1">
<history>
<modified version="1" date="2002-11-30">
Trial version generated from 950.txt + part of big5-iso.txt (HKSCS-2001)
with Euro added, with CP950's excessive fub (fallbacks uni->big5) removed,
and with some other manual tweaking.
</modified>
</history>
EOT
}
else {
print <<EOT;
<characterMapping id="tw-big5-2002" version="1">
<history>
<modified version="1" date="2002-11-30">
Trial version generated from 950.txt + part of big5-iso.txt (HKSCS-2001)
with Euro added, with CP950's excessive fub (fallbacks uni->big5) removed,
and with some other manual tweaking.
</modified>
</history>
EOT
}
print <<EOT;
<validity>
<state type="FIRST" next="VALID" s="0" e="80" max="FFFF"/>
<state type="FIRST" next="SECOND" s="81" e="FE" max="FFFF"/>
<state type="SECOND" next="VALID" s="40" e="7E" max="FFFF"/>
<state type="SECOND" next="VALID" s="A1" e="FE" max="FFFF"/>
</validity>
<assignments sub="3F">
EOT
print " <!-- One to one mappings -->\n";
for $unicode ( sort { hex($a) <=> hex($b) } keys %u2b ) {
$big5 = $u2b{$unicode};
$u = hex($unicode);
# Keep only pairs that round-trip; when ranges are in use (TW-BIG5 only),
# U+E000 .. U+F6B0 is left to the <range> elements printed further down.
next
unless defined( $b2u{$big5} )
and $unicode eq $b2u{$big5}
and
not( $use_range and !$hkscs_mode and $u >= 0xE000 && $u <= 0xF6B0 );
printf " <a u=\"%04X\" ", $u;
# Single-byte codes are printed as one byte, all others as two.
if ( hex($big5) <= 0xFF ) {
printf "b=\"%02X\"/>\n", hex($big5);
}
else {
printf "b=\"%s %s\"/>\n", substr( $big5, 0, 2 ),
substr( $big5, 2, 2 );
}
}
print " <!-- Fallback mappings from Unicode to bytes -->\n";
for $unicode ( sort { hex($a) <=> hex($b) } keys %u2b ) {
$big5 = $u2b{$unicode};
# Round-trip pairs were already printed as <a> entries above.
next if defined( $b2u{$big5} ) and hex($unicode) == hex( $b2u{$big5} );
if ( $unicode eq "F900" ) {
print " <!-- CJK Compatibility Ideographs: U+F900 - U+FA6A.\n";
print
" These are included in CP950 (Unicode->Big5 direction only).\n";
print " Should we include this area in TW-BIG5 or not? -->\n";
}
printf " <fub u=\"%04X\" b=\"%s %s\"/>\n", hex($unicode),
substr( $big5, 0, 2 ), substr( $big5, 2, 2 );
}
# Collect decode-only mappings keyed by Unicode value so that they can be
# printed in Unicode order; Big5 codes are visited in ascending order, so a
# later code overwrites an earlier one that shares the same Unicode value.
my %fbu;
print " <!-- Fallback mappings from bytes to Unicode -->\n";
for $big5 ( sort { hex($a) <=> hex($b) } keys %b2u ) {
$unicode = $b2u{$big5};
if ( !defined( $u2b{$unicode} ) or hex($big5) != hex( $u2b{$unicode} ) )
{
$fbu{$unicode} = $big5;
}
}
for $unicode ( sort { hex($a) <=> hex($b) } keys %fbu ) {
$big5 = $fbu{$unicode};
printf " <fbu u=\"%04X\" b=\"%s %s\"/>\n", hex($unicode),
substr( $big5, 0, 2 ), substr( $big5, 2, 2 );
}
if ( $use_range and !$hkscs_mode ) {
print <<EOT;
<!-- Roundtrip-mappings that can be enumerated
Note: We can only use the <range> tag for TW-BIG5.
Big-5E and Big5-HKSCS have assigned characters in these areas,
and we will have to use the <a> and <fub> tags instead.
-->
<!-- User-Defined Area 1 (UDA1) -->
<range uFirst="E000" uLast="E310" bFirst="FA 40" bLast="FE FE" bMin="81 40" bMax="FE FE"/>
<!-- User-Defined Area 2 (UDA2) -->
<range uFirst="E311" uLast="EEB7" bFirst="8E 40" bLast="A0 FE" bMin="81 40" bMax="FE FE"/>
<!-- User-Defined Area 3 (UDA3) -->
<range uFirst="EEB8" uLast="F6B0" bFirst="81 40" bLast="8D FE" bMin="81 40" bMax="FE FE"/>
EOT
}
print <<EOT;
</assignments>
</characterMapping>
EOT
} # gen_charmapml()
# gen_check_b2u -- dump %b2u as "U+<unicode>: <big5>" lines for verification
# and testing, ordered by the numeric value of the Big5 code.  A leading "00"
# is stripped from the Big5 code before printing.
sub gen_check_b2u() {
    my @codes = sort { hex($a) <=> hex($b) } keys %b2u;
    for $big5 (@codes) {
        $unicode = $b2u{$big5};
        $big5 =~ s/^00//;
        print "U+$unicode: $big5\n";
    }
}
# gen_check_u2b -- dump %u2b as "U+<unicode>: <big5>" lines, ordered by the
# numeric value of the Unicode key.  A leading "00" is stripped from the Big5
# code before printing.
sub gen_check_u2b() {
    for $unicode ( sort { hex($a) <=> hex($b) } keys %u2b ) {
        ( $big5 = $u2b{$unicode} ) =~ s/^00//;
        print "U+$unicode: $big5\n";
    }
}
###########################################################################
#
# Codes for generating hkscs.ut and hkscs.uf files for Mozilla
#
# gen_mozilla_uf -- print the Unicode -> Big5 table (hkscs.uf) to STDOUT as
# "0x<big5><TAB>0x<unicode>" lines, in string-sorted order of the Unicode key.
# Entries are skipped when the Big5 code is below 0x8140 or inside
# 0xA140-0xC6A0 or 0xC940-0xF9D5, or when the code point is above U+FFFF.
sub gen_mozilla_uf() {
    for $unicode ( sort keys %u2b ) {
        $big5 = $u2b{$unicode};
        my $code = hex($big5);
        next if $code < 0x8140;
        next if $code >= 0xA140 && $code <= 0xC6A0;
        next if $code >= 0xC940 && $code <= 0xF9D5;
        next if hex($unicode) > 0xFFFF;
        print "0x" . uc($big5) . "\t0x" . uc($unicode) . "\n";
    }
}
# gen_mozilla_ut -- print the Big5 -> Unicode table (hkscs.ut) to STDOUT as
# "0x<big5><TAB>0x<unicode>" lines, in string-sorted order of the Big5 key.
# Entries are skipped when the Big5 code is below 0x8140 or inside
# 0xA140-0xC6A0 or 0xC940-0xF9D5.
sub gen_mozilla_ut() {
    for $big5 ( sort keys %b2u ) {
        my $code = hex($big5);
        next if $code < 0x8140;
        next if $code >= 0xA140 && $code <= 0xC6A0;
        next if $code >= 0xC940 && $code <= 0xF9D5;
        print "0x" . uc($big5) . "\t0x" . uc( $b2u{$big5} ) . "\n";
    }
}
###########################################################################
sub gen_glibc() {
    ##########################################################################
    #
    # Print a complete C source for a glibc gconv Big5-HKSCS module on
    # STDOUT.  Reads the global mapping hashes %b2u (Big5 -> Unicode) and
    # %u2b (Unicode -> Big5), both keyed by upper-case hex strings.
    #
    # Output, in order:
    #   1. licence header and includes,
    #   2. big5_hkscs_to_ucs[]    (lead bytes 0x88..0xFE, 157 trails each),
    #   3. ucs4_to_big5_hkscs[][2] (one entry per code point in each range),
    #   4. from_ucs4_idx[]         (range start, range end, table offset),
    #   5. the conversion loop bodies for both directions.
    #
    ##########################################################################
    #
    # Generate index for UCS4 to Big5-HKSCS conversion table
    #
    @index_array = ();
    $mode        = 0;    # 1 while a range of mapped code points is open
    $count       = 0;    # running offset into ucs4_to_big5_hkscs
    for ( $uni = 0x81 ; $uni <= 0x2FFFF ; $uni++ ) {
        $unicode = sprintf( "%04X", $uni );

        # print " /* U+$unicode */\t" if $low % 4 == 0;
        if ( defined( $u2b{$unicode} ) ) {
            if ( $mode == 0 ) {
                $range_start = $range_end = $uni;

                # printf " { %7s, ", sprintf("0x%04X", $range_start);
                $mode = 1;
            }
            else {
                $range_end = $uni;
            }
        }
        elsif ( $mode == 1 and ( $uni - $range_end ) >= 0x80 ) {

            # Start a new range if the gap is 0x80 or larger
            # printf "%7s, %5d },\n", sprintf("0x%04X", $range_end), $count;
            push @index_array, [ ( $range_start, $range_end, $count ) ];
            $count += $range_end - $range_start + 1;
            $mode = 0;
        }
    }

    # A range is closed above only once a gap of 0x80 unmapped code points
    # has been seen.  If the last mapped code point lies within 0x80 of the
    # scan limit the range is still open here, so flush it; otherwise its
    # index entry and its share of $count would be lost.
    if ( $mode == 1 ) {
        push @index_array, [ ( $range_start, $range_end, $count ) ];
        $count += $range_end - $range_start + 1;
        $mode = 0;
    }

    #
    # Note that $count and $range_end are used again as global variables
    # below
    #
    ###########################################################################
    #
    # Start generating real C code...
    #
    print <<'EOT';
/* Mapping tables for Big5-HKSCS handling.
Copyright (C) 1997, 1998, 2000, 2001, 2002 Free Software Foundation, Inc.
This file is part of the GNU C Library.
Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
Modified for Big5-HKSCS by Roger So <roger.so@sw-linux.com>, 2000.
Updated for HKSCS-2001 by James Su <suzhe@turbolinux.com.cn>
and Anthony Fok <anthony@thizlinux.com>, 2002
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, write to the Free
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA. */
#include <dlfcn.h>
#include <gconv.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <wchar.h>
/* Table for Big5-HKSCS to UCS conversion.
Original comments by Roger So when he updated the tables for HKSCS-1999:
With HKSCS mappings 0x8140-0xA0FE and 0xFA40-0xFEFE added; more info:
http://www.digital21.gov.hk/eng/hkscs/index.html
- spacehunt 07/01/2000
The BIG5-HKSCS mapping tables are generated from 950.txt, big5-iso.txt
and big5cmp.txt using a Perl script while merging C source code from
other developers. A copy of the source Perl script is available at:
http://www.thizlinux.com/~anthony/hkscs/gen-glibc-big5hkscs.pl
http://people.debian.org/~foka/hkscs/gen-glibc-big5hkscs.pl
Revisions:
2001-10-30 made codec for Qt
2002-03-21 ported to glibc-2.2.5 and added HKSCS-2001
Todo:
Use a hash for characters beyond BMP to save space and make it
more efficient
- Anthony Fok <anthony@thizlinux.com> 21 Mar 2002
On behalf of ThizLinux Laboratory Ltd., Hong Kong SAR, China
*/
EOT
    ##########################################################################
    #
    # Generate Big5-HKSCS to Unicode conversion table
    #
    ## print "Big5HKSCS to Unicode\n";
    # for $high (0x81..0x8d, 0x8e..0xa0, 0xc6..0xc8, 0xf9, 0xfa..0xfe) {
    $high_start = 0x88;
    $high_end   = 0xfe;
    print "static const uint16_t big5_hkscs_to_ucs[";

    # 157 valid trail bytes (0x40..0x7E and 0xA1..0xFE) per lead byte
    print( ( $high_end - $high_start + 1 ) * 157 );
    print "] =\n{\n";
    for $high ( 0x88 .. 0xfe ) {
        for $low ( 0x40 .. 0x7e, 0xa1 .. 0xfe ) {
            if ( $low == 0x40 ) {
                print "\n" unless $high == $high_start;
                printf
                  "\t/* Big5-HKSCS 0x%02X40..0x%02X7E, 0x%02XA1..0x%02XFE */\n",
                  $high, $high, $high, $high;
            }
            elsif ( $low == 0xa1 ) {
                print "\t\t";
            }
            $big5 = sprintf( "%02X%02X", $high, $low );
            print "\t" if $low % 8 == 0;
            if ( defined( $b2u{$big5} ) ) {
                $unicode = $b2u{$big5};
                print "0x", $unicode, ",";
            }
            else {
                print "0x0000,";    # for glibc
            }
            print( ( $low % 8 == 7 or $low == 0x7e or $low == 0xfe )
                ? "\n"
                : "\t" );
        }
    }
    print "};\n\n";
    ##########################################################################
    #
    # Generate Unicode to Big5-HKSCS conversion table
    #
    print "static const unsigned char ucs4_to_big5_hkscs[$count][2] =\n{\n";
    foreach $index (@index_array) {
        ( $start, $end ) = ( @$index[0], @$index[1] );
        printf( " /* U+%04X */\t", $start ) if ( $start % 4 != 0 );
        print "\t" x ( ( $start % 4 ) * 1.5 ) . " " x ( $start % 2 );
        for ( $i = $start ; $i <= $end ; $i++ ) {
            printf( " /* U+%04X */\t", $i ) if ( $i % 4 == 0 );
            $unicode = sprintf( "%04X", $i );
            if ( defined( $big5 = $u2b{$unicode} ) ) {
                if ( $big5 =~ /^00/ ) {
                    print '"\x', substr( $big5, 2, 2 ), '\x00",';
                }
                else {
                    print '"\x', substr( $big5, 0, 2 ), '\x',
                      substr( $big5, 2, 2 ), '",';
                }
            }
            else {
                print '"\x00\x00",';
            }
            print( ( $i % 4 == 3 ) ? "\n" : " " ) unless $i == $end;
        }

        # $range_end still holds the end of the last range found above
        print $end == $range_end ? "\n" : "\n\n";
    }
    print "};\n\n";
    ###########################################################################
    print <<EOT;
static struct
{
/* Note: We are going to split this table so that we can use
uint16_t for "from" and "to" again. Anthony Fok, 2002-03-21 */
uint32_t from;
uint32_t to;
uint32_t offset;
} from_ucs4_idx[] =
{
EOT
    foreach $index (@index_array) {
        printf " { %7s, %7s, %5d },\n", sprintf( "0x%04X", @$index[0] ),
          sprintf( "0x%04X", @$index[1] ), @$index[2];
    }
    print "};\n\n";

    #foreach $i (sort keys %b2u) {
    # print $b2u{$i} . ' ';
    #}
    print <<'EOT';
/* Definitions used in the body of the `gconv' function. */
#define CHARSET_NAME "BIG5HKSCS//"
#define FROM_LOOP from_big5
#define TO_LOOP to_big5
#define DEFINE_INIT 1
#define DEFINE_FINI 1
#define MIN_NEEDED_FROM 1
#define MAX_NEEDED_FROM 2
#define MIN_NEEDED_TO 4
/* First define the conversion function from Big5-HKSCS to UCS4. */
#define MIN_NEEDED_INPUT MIN_NEEDED_FROM
#define MAX_NEEDED_INPUT MAX_NEEDED_FROM
#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
#define LOOPFCT FROM_LOOP
#define BODY \
{ \
uint32_t ch = *inptr; \
\
if (ch >= 0x81 && ch <= 0xfe) \
{ \
/* Two-byte character. First test whether the next character \
is also available. */ \
uint32_t ch2; \
int idx; \
\
if (__builtin_expect (inptr + 1 >= inend, 0)) \
{ \
/* The second character is not available. */ \
result = __GCONV_INCOMPLETE_INPUT; \
break; \
} \
\
ch2 = inptr[1]; \
/* See whether the second byte is in the correct range. */ \
if ((ch2 >= 0x40 && ch2 <= 0x7e) || (ch2 >= 0xa1 && ch2 <= 0xfe)) \
{ \
if (ch >= 0x88) \
{ \
/* Look up the table */ \
idx = (ch - 0x88) * 157 + ch2 - (ch2 <= 0x7e ? 0x40 : 0x62); \
if ((ch = big5_hkscs_to_ucs[idx]) == 0) \
{ \
/* This is illegal. */ \
if (! ignore_errors_p ()) \
{ \
result = __GCONV_ILLEGAL_INPUT; \
break; \
} \
\
++inptr; \
++*irreversible; \
continue; \
} \
} \
else \
{ \
/* 0x81..0x87 in UDA3, currently maps linearly to PUA */ \
ch = (ch - 0x81) * 157 + ch2 - (ch2 <= 0x7e ? 0x40 : 0x62) \
+ 0xeeb8; \
} \
} \
else \
{ \
/* This is illegal. */ \
if (! ignore_errors_p ()) \
{ \
result = __GCONV_ILLEGAL_INPUT; \
break; \
} \
\
++inptr; \
++*irreversible; \
continue; \
} \
\
inptr += 2; \
} \
else if (__builtin_expect (ch, 0) == 0xff) \
{ \
result = __GCONV_ILLEGAL_INPUT; \
break; \
} \
else /* 0x00 to 0x80 */ \
++inptr; \
\
put32 (outptr, ch); \
outptr += 4; \
}
#define LOOP_NEED_FLAGS
#include <iconv/loop.c>
/* Next, define the other direction. */
#define MIN_NEEDED_INPUT MIN_NEEDED_TO
#define MIN_NEEDED_OUTPUT MIN_NEEDED_FROM
#define MAX_NEEDED_OUTPUT MAX_NEEDED_FROM
#define LOOPFCT TO_LOOP
#define BODY \
{ \
uint32_t ch = get32 (inptr); \
const unsigned char *cp = ""; \
unsigned char b5ch[2] = "\0\0"; \
int i; \
\
for (i = 0; \
i < (int) (sizeof (from_ucs4_idx) / sizeof (from_ucs4_idx[0])); \
++i) \
{ \
if (ch < from_ucs4_idx[i].from) \
break; \
if (from_ucs4_idx[i].to >= ch) \
{ \
cp = ucs4_to_big5_hkscs[from_ucs4_idx[i].offset \
+ ch - from_ucs4_idx[i].from]; \
break; \
} \
} \
\
if (ch <= 0x80) \
{ \
b5ch[0] = ch; \
cp = b5ch; \
} \
\
if (cp[0] == '\0' && ch != 0) \
{ \
UNICODE_TAG_HANDLER (ch, 4); \
\
/* Illegal character. */ \
STANDARD_ERR_HANDLER (4); \
} \
else \
{ \
/* See whether there is enough room for the second byte we write. */ \
if (__builtin_expect (cp[1], '\1') != '\0' \
&& __builtin_expect (outptr + 1 >= outend, 0)) \
{ \
/* We have not enough room. */ \
result = __GCONV_FULL_OUTPUT; \
break; \
} \
\
*outptr++ = cp[0]; \
if (cp[1] != '\0') \
*outptr++ = cp[1]; \
} \
\
inptr += 4; \
}
#define LOOP_NEED_FLAGS
#include <iconv/loop.c>
/* Now define the toplevel functions. */
#include <iconv/skeleton.c>
EOT
}

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

6612
intl/uconv/ucvtw/big5.uf Normal file

File diff suppressed because it is too large Load Diff

7552
intl/uconv/ucvtw/big5.ut Normal file

File diff suppressed because it is too large Load Diff

11142
intl/uconv/ucvtw/hkscs.uf Normal file

File diff suppressed because it is too large Load Diff

2368
intl/uconv/ucvtw/hkscs.ut Normal file

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,18 +0,0 @@
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#ifndef nsBIG5Data_h_
#define nsBIG5Data_h_
// Static lookup helpers for the Big5 index, addressed by "pointer".
// In nsBIG5ToUnicode::Convert the pointer is computed as
// (lead - 0x81) * 157 + (trail - offset).
class nsBIG5Data
{
public:
// Low 16 bits of the code point for aPointer. The decoder treats a
// return value of 0 as "no mapping".
static char16_t LowBits(size_t aPointer);
// Whether aPointer maps outside the BMP; the decoder then forms the code
// point as LowBits(aPointer) | 0x20000.
static bool IsAstral(size_t aPointer);
// Reverse lookup used by the encoder, which treats a return value of 0
// as "no mapping".
static size_t FindPointer(char16_t aLowBits, bool aIsAstral);
};
#endif /* nsBIG5Data_h_ */

View File

@ -0,0 +1,55 @@
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#include "nsBIG5HKSCSToUnicode.h"
#include "nsUCvTWDll.h"
#include "nsUCConstructors.h"
//----------------------------------------------------------------------
// Global functions and data [declaration]
// The three arrays below are parallel: entry i of each describes one of the
// six tables handed to CreateMultiTableDecoder(6, ...) further down.

// Scan class per table: one single-byte entry followed by five two-byte ones.
static const uScanClassID g_BIG5HKSCSScanClassIDs[] = {
u1ByteCharset,
u2BytesCharset,
u2BytesCharset,
u2BytesCharset,
u2BytesCharset,
u2BytesCharset
};
// Mapping table per entry; the HKSCS and plain Big5 tables alternate.
static const uint16_t *g_BIG5HKSCSMappingTableSet [] ={
g_ASCIIMappingTable,
g_utBig5HKSCSMapping,
g_utBIG5Mapping,
g_utBig5HKSCSMapping,
g_utBIG5Mapping,
g_utBig5HKSCSMapping,
};
// Byte range per entry. NOTE(review): presumably these are first-byte
// ranges; 0xC6 and 0xF9 each appear in two adjacent ranges, so both a Big5
// and an HKSCS table cover those bytes -- confirm how the multi-table
// decoder resolves overlapping ranges.
static const uRange g_BIG5HKSCSRanges[] = {
{ 0x00, 0x7F },
{ 0x81, 0xA0 },
{ 0xA1, 0xC6 },
{ 0xC6, 0xC8 },
{ 0xC9, 0xF9 },
{ 0xF9, 0xFE }
};
//----------------------------------------------------------------------
// Class nsBIG5HKSCSToUnicode [implementation]
nsresult
nsBIG5HKSCSToUnicodeConstructor(nsISupports *aOuter, REFNSIID aIID,
                                void **aResult)
{
  // Build a multi-table decoder from the six parallel BIG5-HKSCS table
  // descriptions defined above. The casts match the parameter types that
  // CreateMultiTableDecoder is called with.
  const uRange* ranges = (const uRange* ) &g_BIG5HKSCSRanges;
  uScanClassID* scanClasses = (uScanClassID*) &g_BIG5HKSCSScanClassIDs;
  uMappingTable** mappingTables = (uMappingTable**) &g_BIG5HKSCSMappingTableSet;

  return CreateMultiTableDecoder(6, ranges, scanClasses, mappingTables, 1,
                                 aOuter, aIID, aResult);
}

View File

@ -0,0 +1,21 @@
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#ifndef nsBIG5HKSCSToUnicode_h___
#define nsBIG5HKSCSToUnicode_h___
#include "nsISupports.h"
/**
* A character set converter from BIG5-HKSCS to Unicode.
*
* @created 02/Jul/2000
* @author Gavin Ho, Hong Kong Professional Services, Compaq Computer (Hong Kong) Ltd.
*/
nsresult
nsBIG5HKSCSToUnicodeConstructor(nsISupports *aOuter, REFNSIID aIID,
void **aResult);
#endif /* nsBIG5HKSCSToUnicode_h___ */

View File

@ -4,162 +4,36 @@
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#include "nsBIG5ToUnicode.h"
#include "mozilla/BinarySearch.h"
#include "mozilla/ArrayUtils.h"
#include "nsBIG5Data.h"
#include "nsUCvTWDll.h"
#include "nsUCConstructors.h"
nsBIG5ToUnicode::nsBIG5ToUnicode()
: mPendingTrail(0)
, mBig5Lead(0)
//----------------------------------------------------------------------
// Global functions and data [declaration]
static const uScanClassID g_BIG5ScanClassIDs[] = {
u1ByteCharset,
u2BytesCharset
};
static const uint16_t *g_BIG5MappingTableSet [] ={
g_ASCIIMappingTable,
g_utBIG5Mapping
};
static const uRange g_BIG5Ranges[] = {
{ 0x00, 0x7F },
{ 0x81, 0xFE }
};
nsresult
nsBIG5ToUnicodeConstructor(nsISupports *aOuter, REFNSIID aIID,
void **aResult)
{
return CreateMultiTableDecoder(2,
(const uRange* ) &g_BIG5Ranges,
(uScanClassID*) &g_BIG5ScanClassIDs,
(uMappingTable**) &g_BIG5MappingTableSet, 1,
aOuter, aIID, aResult);
}
// Decode Big5 bytes from aSrc into UTF-16 code units in aDest.
//
// On entry *aSrcLength and *aDestLength hold the sizes of the two buffers;
// on return they hold the number of bytes consumed and code units written.
// State that spans calls is kept in mBig5Lead (a lead byte awaiting its
// trail byte) and mPendingTrail (the second code unit of a two-unit output
// that did not fit).
//
// Returns:
//   NS_OK                   all input consumed, no lead byte pending
//   NS_OK_UDEC_MOREINPUT    all input consumed, a lead byte is pending
//   NS_OK_UDEC_MOREOUTPUT   the output buffer is full
//   NS_ERROR_ILLEGAL_INPUT  invalid input and mErrBehavior is
//                           kOnError_Signal; otherwise U+FFFD is written
NS_IMETHODIMP
nsBIG5ToUnicode::Convert(const char* aSrc,
int32_t* aSrcLength,
char16_t* aDest,
int32_t* aDestLength)
{
// We'll be doing comparisons as unsigned.
const uint8_t* in = reinterpret_cast<const uint8_t*>(aSrc);
const uint8_t* inEnd = in + *aSrcLength;
char16_t* out = aDest;
char16_t* outEnd = out + *aDestLength;
// Flush the code unit left over from a previous call before reading input.
if (mPendingTrail) {
if (out == outEnd) {
*aSrcLength = 0;
*aDestLength = 0;
return NS_OK_UDEC_MOREOUTPUT;
}
*out++ = mPendingTrail;
mPendingTrail = 0;
}
for (;;) {
if (in == inEnd) {
*aSrcLength = in - reinterpret_cast<const uint8_t*>(aSrc);
*aDestLength = out - aDest;
return mBig5Lead ? NS_OK_UDEC_MOREINPUT : NS_OK;
}
if (out == outEnd) {
*aSrcLength = in - reinterpret_cast<const uint8_t*>(aSrc);
*aDestLength = out - aDest;
return NS_OK_UDEC_MOREOUTPUT;
}
uint8_t b = *in++;
if (!mBig5Lead) {
// No lead pending: b is ASCII, a lead byte, or invalid.
if (b <= 0x7F) {
*out++ = (char16_t)b;
continue;
}
if (b >= 0x81 && b <= 0xFE) {
mBig5Lead = b;
continue;
}
if (mErrBehavior == kOnError_Signal) {
--in;
*aSrcLength = in - reinterpret_cast<const uint8_t*>(aSrc);
*aDestLength = out - aDest;
return NS_ERROR_ILLEGAL_INPUT;
}
*out++ = 0xFFFD;
continue;
}
// A lead byte is pending, so b is the candidate trail byte.
size_t lead = mBig5Lead;
mBig5Lead = 0;
// Trail bytes 0x40..0x7E map to 0..62 and 0xA1..0xFE to 63..156, giving
// 157 trail values per lead byte.
size_t offset = (b < 0x7F) ? 0x40 : 0x62;
if ((b >= 0x40 && b <= 0x7E) || (b >= 0xA1 && b <= 0xFE)) {
size_t pointer = (lead - 0x81) * 157L + (b - offset);
char16_t outTrail;
switch (pointer) {
// These four pointers decode to two code units: a base letter followed
// by a combining mark.
case 1133:
*out++ = 0x00CA;
outTrail = 0x0304;
break;
case 1135:
*out++ = 0x00CA;
outTrail = 0x030C;
break;
case 1164:
*out++ = 0x00EA;
outTrail = 0x0304;
break;
case 1166:
*out++ = 0x00EA;
outTrail = 0x030C;
break;
default:
char16_t lowBits = nsBIG5Data::LowBits(pointer);
if (!lowBits) {
if (b <= 0x7F) {
// prepend byte to stream
// Always legal, since we've always just read a byte
// if we come here.
--in;
}
if (mErrBehavior == kOnError_Signal) {
--in;
*aSrcLength = in - reinterpret_cast<const uint8_t*>(aSrc);
*aDestLength = out - aDest;
return NS_ERROR_ILLEGAL_INPUT;
}
*out++ = 0xFFFD;
continue;
}
if (nsBIG5Data::IsAstral(pointer)) {
// Rebuild the code point by OR-ing in 0x20000, then split it into a
// surrogate pair (0xD7C0 is 0xD800 - (0x10000 >> 10)).
uint32_t codePoint = uint32_t(lowBits) | 0x20000;
*out++ = char16_t(0xD7C0 + (codePoint >> 10));
outTrail = char16_t(0xDC00 + (codePoint & 0x3FF));
break;
}
*out++ = lowBits;
continue;
}
// Only two-unit outputs reach this point; the first unit is already
// written. If the second does not fit, keep it for the next call.
if (out == outEnd) {
mPendingTrail = outTrail;
*aSrcLength = in - reinterpret_cast<const uint8_t*>(aSrc);
*aDestLength = out - aDest;
return NS_OK_UDEC_MOREOUTPUT;
}
*out++ = outTrail;
continue;
}
// pointer is null
if (b <= 0x7F) {
// prepend byte to stream
// Always legal, since we've always just read a byte
// if we come here.
--in;
}
if (mErrBehavior == kOnError_Signal) {
// Moving in one past the start of aSrc is actually OK per API contract,
// since assigning -1 to aSrcLength means that we want the caller to
// record one U+FFFD and repush the same input buffer.
--in;
*aSrcLength = in - reinterpret_cast<const uint8_t*>(aSrc);
*aDestLength = out - aDest;
return NS_ERROR_ILLEGAL_INPUT;
}
*out++ = 0xFFFD;
continue;
}
}
NS_IMETHODIMP
nsBIG5ToUnicode::GetMaxLength(const char* aSrc,
                              int32_t aSrcLength,
                              int32_t* aDestLength)
{
  // Upper bound on the UTF-16 code units produced for aSrcLength input
  // bytes: one per input byte, plus one for a trail unit held over from the
  // previous call and one for a lead byte held over from the previous call.
  int32_t maxUnits = aSrcLength;
  if (mPendingTrail) {
    maxUnits += 1;
  }
  if (mBig5Lead) {
    maxUnits += 1;
  }
  *aDestLength = maxUnits;
  return NS_OK;
}
NS_IMETHODIMP
nsBIG5ToUnicode::Reset()
{
  // Drop all state carried between Convert() calls: the pending lead byte
  // and the pending output code unit.
  mBig5Lead = 0;
  mPendingTrail = 0;
  return NS_OK;
}

View File

@ -6,34 +6,16 @@
#ifndef nsBIG5ToUnicode_h___
#define nsBIG5ToUnicode_h___
#include "nsUCSupport.h"
#include "nsISupports.h"
#define NS_BIG5TOUNICODE_CID \
{ 0xefc323e1, 0xec62, 0x11d2, \
{ 0x8a, 0xac, 0x0, 0x60, 0x8, 0x11, 0xa8, 0x36 } }
#define NS_BIG5TOUNICODE_CONTRACTID \
"@mozilla.org/intl/unicode/decoder;1?charset=big5"
// Stateful Big5 to UTF-16 decoder built on nsBasicDecoderSupport.
class nsBIG5ToUnicode : public nsBasicDecoderSupport
{
public:
nsBIG5ToUnicode();
// Decodes aSrc into aDest; both length arguments are in/out.
NS_IMETHOD Convert(const char* aSrc,
int32_t* aSrcLength,
char16_t* aDest,
int32_t* aDestLength);
// Reports an upper bound on the code units Convert() may write.
NS_IMETHOD GetMaxLength(const char* aSrc,
int32_t aSrcLength,
int32_t* aDestLength);
// Clears the state carried between Convert() calls.
NS_IMETHOD Reset();
private:
// Second code unit of a two-unit output that did not fit in the previous
// output buffer; 0 when nothing is pending.
char16_t mPendingTrail;
// Lead byte read but not yet paired with a trail byte; 0 when none.
uint8_t mBig5Lead;
};
/**
* A character set converter from BIG5 to Unicode.
*
* @created 06/Apr/1999
* @author Catalin Rotaru [CATA]
*/
nsresult
nsBIG5ToUnicodeConstructor(nsISupports *aOuter, REFNSIID aIID,
void **aResult);
#endif /* nsBIG5ToUnicode_h___ */

View File

@ -0,0 +1,31 @@
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#ifndef nsUCvTWCID_h___
#define nsUCvTWCID_h___
#include "nsISupports.h"
// Class ID for our BIG5ToUnicode charset converter
// {EFC323E1-EC62-11d2-8AAC-00600811A836}
#define NS_BIG5TOUNICODE_CID \
{ 0xefc323e1, 0xec62, 0x11d2, {0x8a, 0xac, 0x0, 0x60, 0x8, 0x11, 0xa8, 0x36}}
// Class ID for our UnicodeToBIG5 charset converter
// {EFC323E2-EC62-11d2-8AAC-00600811A836}
#define NS_UNICODETOBIG5_CID \
{ 0xefc323e2, 0xec62, 0x11d2, {0x8a, 0xac, 0x0, 0x60, 0x8, 0x11, 0xa8, 0x36}}
// Class ID for our BIG5HKSCSToUnicode charset converter
// {BA6151BB-EC62-11d2-8AAC-00600811A836}
#define NS_BIG5HKSCSTOUNICODE_CID \
{ 0xba6151bb, 0xec62, 0x11d2, {0x8a, 0xac, 0x0, 0x60, 0x8, 0x11, 0xa8, 0x36}}
// Class ID for our UnicodeToBIG5HKSCS charset converter
// {BA6151BC-EC62-11d2-8AAC-00600811A836}
#define NS_UNICODETOBIG5HKSCS_CID \
{ 0xba6151bc, 0xec62, 0x11d2, {0x8a, 0xac, 0x0, 0x60, 0x8, 0x11, 0xa8, 0x36}}
#endif /* nsUCvTWCID_h___ */

View File

@ -0,0 +1,15 @@
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#ifndef nsUCvTWDll_h_
#define nsUCvTWDll_h_
// Mapping tables shared by the Big5 / Big5-HKSCS converters in this
// directory. In the converters here the g_uf* tables are passed to
// CreateMultiTableEncoder (from Unicode) and the g_ut* tables to
// CreateMultiTableDecoder (to Unicode).
extern const uint16_t g_ufBig5Mapping[];
extern const uint16_t g_utBIG5Mapping[];
extern const uint16_t g_ASCIIMappingTable[];
extern const uint16_t g_ufBig5HKSCSMapping[];
extern const uint16_t g_utBig5HKSCSMapping[];
#endif /* nsUCvTWDll_h_ */

View File

@ -4,248 +4,35 @@
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#include "nsUnicodeToBIG5.h"
#include "nsUCvTWDll.h"
#include "nsUCConstructors.h"
NS_IMPL_ADDREF(nsUnicodeToBIG5)
NS_IMPL_RELEASE(nsUnicodeToBIG5)
NS_IMPL_QUERY_INTERFACE(nsUnicodeToBIG5,
nsIUnicodeEncoder)
//----------------------------------------------------------------------
// Global functions and data [declaration]
nsUnicodeToBIG5::nsUnicodeToBIG5()
: mUtf16Lead(0)
, mPendingTrail(0)
, mSignal(true) // as in nsEncoderSupport
static const uint16_t *g_Big5MappingTable[2] = {
g_ASCIIMappingTable,
g_ufBig5Mapping
};
static const uScanClassID g_Big5ScanClassIDs[2] = {
u1ByteCharset,
u2BytesCharset
};
//----------------------------------------------------------------------
// Class nsUnicodeToBIG5 [implementation]
nsresult
nsUnicodeToBIG5Constructor(nsISupports *aOuter, REFNSIID aIID,
void **aResult)
{
return CreateMultiTableEncoder(2,
(uScanClassID*) &g_Big5ScanClassIDs,
(uMappingTable**) &g_Big5MappingTable,
2 /* max length = src * 2 */,
aOuter, aIID, aResult);
}
// Encode UTF-16 code units from aSrc into Big5 bytes in aDest.
//
// On entry *aSrcLength and *aDestLength hold the sizes of the two buffers;
// on return they hold the number of code units consumed and bytes written.
// State that spans calls is kept in mUtf16Lead (a high surrogate awaiting
// its low surrogate) and mPendingTrail (a trail byte that did not fit).
//
// Returns:
//   NS_OK                    all input consumed, no surrogate pending
//   NS_OK_UENC_MOREINPUT     all input consumed, a high surrogate is pending
//   NS_OK_UENC_MOREOUTPUT    the output buffer is full
//   NS_ERROR_UENC_NOMAPPING  unencodable input and mSignal is set;
//                            otherwise '?' is written
NS_IMETHODIMP
nsUnicodeToBIG5::Convert(const char16_t* aSrc,
                         int32_t* aSrcLength,
                         char* aDest,
                         int32_t * aDestLength)
{
  const char16_t* in = aSrc;
  const char16_t* inEnd = in + *aSrcLength;
  uint8_t* out = reinterpret_cast<uint8_t*>(aDest);
  uint8_t* outEnd = out + *aDestLength;

  MOZ_ASSERT(!(mPendingTrail && mUtf16Lead),
             "Can't have both pending output and pending input.");

  // Flush the trail byte left over from a previous call first.
  if (mPendingTrail) {
    if (out == outEnd) {
      *aSrcLength = 0;
      *aDestLength = 0;
      return NS_OK_UENC_MOREOUTPUT;
    }
    *out++ = mPendingTrail;
    mPendingTrail = 0;
  }
  for (;;) {
    if (in == inEnd) {
      *aSrcLength = in - aSrc;
      *aDestLength = out - reinterpret_cast<uint8_t*>(aDest);
      // More input is only needed when a high surrogate is waiting for its
      // low surrogate; this mirrors the check in nsBIG5ToUnicode::Convert.
      return mUtf16Lead ? NS_OK_UENC_MOREINPUT : NS_OK;
    }
    if (out == outEnd) {
      *aSrcLength = in - aSrc;
      *aDestLength = out - reinterpret_cast<uint8_t*>(aDest);
      return NS_OK_UENC_MOREOUTPUT;
    }
    bool isAstral; // true means Plane 2, false means BMP
    char16_t lowBits; // The low 16 bits of the code point
    char16_t codeUnit = *in++;
    size_t highBits = (codeUnit & 0xFC00);
    if (highBits == 0xD800) {
      // high surrogate
      if (mUtf16Lead) {
        // High surrogate follows another high surrogate. The
        // *previous* code unit is in error.
        if (mSignal) {
          mUtf16Lead = 0;
          // NOTE: Encode API differs from decode API!
          --in;
          *aSrcLength = in - aSrc;
          *aDestLength = out - reinterpret_cast<uint8_t*>(aDest);
          return NS_ERROR_UENC_NOMAPPING;
        }
        *out++ = '?';
      }
      mUtf16Lead = codeUnit;
      continue;
    }
    if (highBits == 0xDC00) {
      // low surrogate
      if (!mUtf16Lead) {
        // Got low surrogate without a previous high surrogate
        if (mSignal) {
          // NOTE: Encode API differs from decode API!
          *aSrcLength = in - aSrc;
          *aDestLength = out - reinterpret_cast<uint8_t*>(aDest);
          return NS_ERROR_UENC_NOMAPPING;
        }
        *out++ = '?';
        continue;
      }
      // Combine the surrogate pair into a code point.
      size_t codePoint = (mUtf16Lead << 10) + codeUnit -
                         (((0xD800 << 10) - 0x10000) + 0xDC00);
      mUtf16Lead = 0;
      // Plane 2 is the only astral plane that has potentially
      // Big5-encodable characters.
      if ((0xFF0000 & codePoint) != 0x20000) {
        if (mSignal) {
          // NOTE: Encode API differs from decode API!
          // nsSaveAsCharset wants us to back up on step in the case of a
          // surrogate pair.
          --in;
          *aSrcLength = in - aSrc;
          *aDestLength = out - reinterpret_cast<uint8_t*>(aDest);
          return NS_ERROR_UENC_NOMAPPING;
        }
        *out++ = '?';
        continue;
      }
      isAstral = true;
      lowBits = (char16_t)(codePoint & 0xFFFF);
    } else {
      // not a surrogate
      if (mUtf16Lead) {
        // Non-surrogate follows a high surrogate. The *previous*
        // code unit is in error.
        mUtf16Lead = 0;
        if (mSignal) {
          // NOTE: Encode API differs from decode API!
          --in;
          *aSrcLength = in - aSrc;
          *aDestLength = out - reinterpret_cast<uint8_t*>(aDest);
          return NS_ERROR_UENC_NOMAPPING;
        }
        *out++ = '?';
        // Let's unconsume this code unit and reloop in order to
        // re-check if the output buffer still has space.
        --in;
        continue;
      }
      isAstral = false;
      lowBits = codeUnit;
    }
    // isAstral now tells us if we have a Plane 2 or a BMP character.
    // lowBits tells us the low 16 bits.
    // After all the above setup to deal with UTF-16, we are now
    // finally ready to follow the spec.
    if (!isAstral && lowBits <= 0x7F) {
      *out++ = (uint8_t)lowBits;
      continue;
    }
    size_t pointer = nsBIG5Data::FindPointer(lowBits, isAstral);
    if (!pointer) {
      if (mSignal) {
        // NOTE: Encode API differs from decode API!
        if (isAstral) {
          // nsSaveAsCharset wants us to back up on step in the case of a
          // surrogate pair.
          --in;
        }
        *aSrcLength = in - aSrc;
        *aDestLength = out - reinterpret_cast<uint8_t*>(aDest);
        return NS_ERROR_UENC_NOMAPPING;
      }
      *out++ = '?';
      continue;
    }
    // Split the pointer into lead and trail bytes: 157 trail values per
    // lead, the first 63 at 0x40..0x7E and the rest at 0xA1..0xFE.
    uint8_t lead = (uint8_t)(pointer / 157 + 0x81);
    uint8_t trail = (uint8_t)(pointer % 157);
    if (trail < 0x3F) {
      trail += 0x40;
    } else {
      trail += 0x62;
    }
    *out++ = lead;
    if (out == outEnd) {
      // No room for the trail byte; keep it for the next call or Finish().
      mPendingTrail = trail;
      *aSrcLength = in - aSrc;
      *aDestLength = out - reinterpret_cast<uint8_t*>(aDest);
      return NS_OK_UENC_MOREOUTPUT;
    }
    *out++ = trail;
    continue;
  }
}
NS_IMETHODIMP
nsUnicodeToBIG5::Finish(char* aDest,
                        int32_t* aDestLength)
{
  // Flush whatever Convert() left pending: either a trail byte that did not
  // fit, or an unpaired high surrogate, which is written as '?'. At most
  // one byte is produced; *aDestLength is set to the number written.
  MOZ_ASSERT(!(mPendingTrail && mUtf16Lead),
             "Can't have both pending output and pending input.");

  if (!mPendingTrail && !mUtf16Lead) {
    // Nothing pending.
    *aDestLength = 0;
    return NS_OK;
  }

  if (*aDestLength < 1) {
    // No room; the pending state is kept for a retry.
    *aDestLength = 0;
    return NS_OK_UENC_MOREOUTPUT;
  }

  uint8_t* out = reinterpret_cast<uint8_t*>(aDest);
  if (mPendingTrail) {
    *out = mPendingTrail;
    mPendingTrail = 0;
  } else {
    // The API doesn't support signaling an error. It pretends that malformed
    // input doesn't exist. The UTF-8 encoder outputs the replacement character
    // unconditionally.
    mUtf16Lead = 0;
    *out = '?';
  }
  *aDestLength = 1;
  return NS_OK;
}
NS_IMETHODIMP
nsUnicodeToBIG5::GetMaxLength(const char16_t* aSrc,
                              int32_t aSrcLength,
                              int32_t* aDestLength)
{
  // Upper bound on the bytes produced for aSrcLength code units: two bytes
  // per code unit, plus one for a trail byte held over from the previous
  // call.
  int32_t maxBytes = aSrcLength * 2;
  if (mPendingTrail) {
    maxBytes += 1;
  }
  // If the lead ends up being paired, the bytes produced
  // are already included above.
  // If not, it produces a single '?'.
  if (mUtf16Lead) {
    maxBytes += 1;
  }
  *aDestLength = maxBytes;
  return NS_OK;
}
NS_IMETHODIMP
nsUnicodeToBIG5::Reset()
{
  // Drop all state carried between Convert() calls: the pending trail byte
  // and the pending high surrogate.
  mPendingTrail = 0;
  mUtf16Lead = 0;
  return NS_OK;
}
NS_IMETHODIMP
nsUnicodeToBIG5::SetOutputErrorBehavior(int32_t aBehavior,
                                        nsIUnicharEncoder* aEncoder,
                                        char16_t aChar)
{
  // Choose between signaling unencodable input (mSignal = true) and
  // replacing it with '?' (mSignal = false). aEncoder is not used, and any
  // other behavior only trips a debug assertion. Always returns NS_OK.
  if (aBehavior == kOnError_Signal) {
    mSignal = true;
  } else if (aBehavior == kOnError_Replace) {
    mSignal = false;
    MOZ_ASSERT(aChar == '?', "Unsupported replacement.");
  } else if (aBehavior == kOnError_CallBack) {
    MOZ_ASSERT_UNREACHABLE("kOnError_CallBack is supposed to be unused.");
  } else {
    MOZ_ASSERT_UNREACHABLE("Non-existent enum item.");
  }
  return NS_OK;
}

View File

@ -3,48 +3,19 @@
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#ifndef nsUnicodeToBIG5_h_
#define nsUnicodeToBIG5_h_
#ifndef nsUnicodeToBIG5_h___
#define nsUnicodeToBIG5_h___
#include "nsIUnicodeEncoder.h"
#include "nsISupports.h"
#define NS_UNICODETOBIG5_CID \
{ 0xefc323e2, 0xec62, 0x11d2, \
{ 0x8a, 0xac, 0x0, 0x60, 0x8, 0x11, 0xa8, 0x36 } }
/**
* A character set converter from Unicode to BIG5.
*
* @created 06/Apr/1999
* @author Catalin Rotaru [CATA]
*/
nsresult
nsUnicodeToBIG5Constructor(nsISupports *aOuter, REFNSIID aIID,
void **aResult);
// Stateful UTF-16 to Big5 encoder implementing nsIUnicodeEncoder directly.
class nsUnicodeToBIG5 : public nsIUnicodeEncoder
{
public:
// Encoders probably shouldn't use the thread-safe variant, but we should
// make a systematic change instead of making this class different.
NS_DECL_THREADSAFE_ISUPPORTS
nsUnicodeToBIG5();
// Encodes aSrc into aDest; both length arguments are in/out.
NS_IMETHOD Convert(const char16_t* aSrc,
int32_t* aSrcLength,
char* aDest,
int32_t * aDestLength);
// Writes at most one byte for whatever Convert() left pending.
NS_IMETHOD Finish(char* aDest,
int32_t* aDestLength);
// Reports an upper bound on the bytes Convert() may write.
MOZ_WARN_UNUSED_RESULT NS_IMETHOD GetMaxLength(const char16_t* aSrc,
int32_t aSrcLength,
int32_t* aDestLength);
// Clears the state carried between Convert() calls.
NS_IMETHOD Reset();
// Selects between signaling unencodable input and replacing it with '?'.
NS_IMETHOD SetOutputErrorBehavior(int32_t aBehavior,
nsIUnicharEncoder* aEncoder,
char16_t aChar);
private:
virtual ~nsUnicodeToBIG5(){};
// High surrogate read but not yet paired; 0 when none.
char16_t mUtf16Lead;
// Trail byte that did not fit in the previous output buffer; 0 when none.
uint8_t mPendingTrail;
// true: report unencodable input as an error; false: write '?'.
bool mSignal;
};
#endif /* nsUnicodeToBIG5_h_ */
#endif /* nsUnicodeToBIG5_h___ */

View File

@ -0,0 +1,36 @@
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#include "nsUnicodeToBIG5HKSCS.h"
#include "nsUCvTWDll.h"
#include "nsUCConstructors.h"
//----------------------------------------------------------------------
// Global functions and data [declaration]
nsresult
nsUnicodeToBIG5HKSCSConstructor(nsISupports *aOuter, REFNSIID aIID,
                                void **aResult)
{
  // Parallel descriptions of the three tables handed to the multi-table
  // encoder: ASCII (single byte), then the Big5 and Big5-HKSCS
  // from-Unicode tables (two bytes each).
  static const uScanClassID sScanClasses[] = {
    u1ByteCharset,
    u2BytesCharset,
    u2BytesCharset
  };
  static const uint16_t *sMappingTables[] = {
    g_ASCIIMappingTable,
    g_ufBig5Mapping,
    g_ufBig5HKSCSMapping
  };

  uScanClassID* scanClasses = (uScanClassID*) &sScanClasses;
  uMappingTable** mappingTables = (uMappingTable**) &sMappingTables;

  return CreateMultiTableEncoder(3, scanClasses, mappingTables,
                                 2 /* max length = src * 2 */,
                                 aOuter, aIID, aResult);
}

View File

@ -0,0 +1,21 @@
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#ifndef nsUnicodeToBIG5HKSCS_h___
#define nsUnicodeToBIG5HKSCS_h___
#include "nsISupports.h"
/**
* A character set converter from Unicode to BIG5-HKSCS.
*
* @created 02/Jul/2000
* @author Gavin Ho, Hong Kong Professional Services, Compaq Computer (Hong Kong) Ltd.
*/
nsresult
nsUnicodeToBIG5HKSCSConstructor(nsISupports *aOuter, REFNSIID aIID,
void **aResult);
#endif /* nsUnicodeToBIG5HKSCS_h___ */

View File

@ -0,0 +1 @@
We should put the Big5 converters into this directory/DLL.

View File

@ -29051,16 +29051,7 @@
},
"local_changes": {
"deleted": [],
"items": {
"testharness": {
"encoding/big5-encoder.html": [
{
"path": "encoding/big5-encoder.html",
"url": "/encoding/big5-encoder.html"
}
]
}
},
"items": {},
"reftest_nodes": {}
},
"reftest_nodes": {

View File

@ -444,6 +444,12 @@
[Name "hz-gb-2312" has label "hz-gb-2312" (inputEncoding)]
expected: FAIL
[Name "big5" has label "big5-hkscs" (characterSet)]
expected: FAIL
[Name "big5" has label "big5-hkscs" (inputEncoding)]
expected: FAIL
[Name "replacement" has label "csiso2022kr" (characterSet)]
expected: FAIL

View File

@ -1,14 +0,0 @@
[big5-encoder.html]
type: testharness
[big5 encoder: Highest-pointer BMP character excluded from encoder]
expected: FAIL
[big5 encoder: Highest-pointer character excluded from encoder]
expected: FAIL
[big5 encoder: The canonical BMP test character that is not in the index]
expected: FAIL
[big5 encoder: The canonical astral test character that is not in the index]
expected: FAIL

View File

@ -0,0 +1,53 @@
[textdecoder-labels.html]
type: testharness
[name=big5 label=big5-hkscs]
expected: FAIL
["big5-hkscs" => "big5"]
expected: FAIL
[" big5-hkscs" => "big5"]
expected: FAIL
["big5-hkscs " => "big5"]
expected: FAIL
[" big5-hkscs " => "big5"]
expected: FAIL
["\\tbig5-hkscs" => "big5"]
expected: FAIL
["big5-hkscs\\t" => "big5"]
expected: FAIL
["\\tbig5-hkscs\\t" => "big5"]
expected: FAIL
["\\nbig5-hkscs" => "big5"]
expected: FAIL
["big5-hkscs\\n" => "big5"]
expected: FAIL
["\\nbig5-hkscs\\n" => "big5"]
expected: FAIL
["\\fbig5-hkscs" => "big5"]
expected: FAIL
["big5-hkscs\\f" => "big5"]
expected: FAIL
["\\fbig5-hkscs\\f" => "big5"]
expected: FAIL
["\\rbig5-hkscs" => "big5"]
expected: FAIL
["big5-hkscs\\r" => "big5"]
expected: FAIL
["\\rbig5-hkscs\\r" => "big5"]
expected: FAIL

View File

@ -1,33 +0,0 @@
<!doctype html>
<meta charset=big5> <!-- test breaks if the server overrides this -->
<script src=/resources/testharness.js></script>
<script src=/resources/testharnessreport.js></script>
<div id=log></div>
<script>
// Registers one testharness test: `input`, placed in the query of an <a>
// element's URL, must come back from `search` as `output`. `desc` is
// appended to the test name.
function encode(input, output, desc) {
  // Append and prepend X to test for off-by-one errors
  var expected = "X" + output + "X";
  test(function() {
    // <a> uses document encoding for URL's query
    var link = document.createElement("a");
    link.href = "https://example.com/?X" + input + "X";
    var query = link.search.substr(1); // remove leading "?"
    assert_equals(query, expected);
  }, "big5 encoder: " + desc);
}
encode("ab", "ab", "very basic")
// edge cases
encode("\u9EA6", "%26%2340614%3B", "Highest-pointer BMP character excluded from encoder");
encode("\uD858\uDE6B", "%26%23156267%3B", "Highest-pointer character excluded from encoder");
encode("\u3000", "%A1@", "Lowest-pointer character included in encoder");
encode("\u20AC", "%A3%E1", "Euro; the highest-pointer character before a range of 30 unmapped pointers");
encode("\u4E00", "%A4@", "The lowest-pointer character after the range of 30 unmapped pointers");
encode("\uD85D\uDE07", "%C8%A4", "The highest-pointer character before a range of 41 unmapped pointers");
encode("\uFFE2", "%C8%CD", "The lowest-pointer character after the range of 41 unmapped pointers");
encode("\u79D4", "%FE%FE", "The last character in the index");
// not in index
encode("\u2603", "%26%239731%3B", "The canonical BMP test character that is not in the index");
encode("\uD83D\uDCA9", "%26%23128169%3B", "The canonical astral test character that is not in the index");
// duplicate low bits
encode("\uD840\uDFB5", "%FDj", "A Plane 2 character whose low 16 bits match a BMP character that has a lower pointer");
// prefer last
encode("\u2550", "%F9%F9", "A duplicate-mapped code point that prefers the highest pointer in the encoder");
</script>