Bug 912470 part 1 - Implement Encoding Standard-compliant big5 decoder. r=emk.

This commit is contained in:
Henri Sivonen 2015-06-16 15:26:10 +03:00
parent b212375b7e
commit 93e9ac505d
24 changed files with 19393 additions and 37783 deletions

View File

@ -54,8 +54,8 @@ xn--wgbh1c=windows-1256
gr=ISO-8859-7
hk=Big5-HKSCS
xn--j6w193g=Big5-HKSCS
hk=Big5
xn--j6w193g=Big5
hr=windows-1250

View File

@ -189,7 +189,7 @@ x-gbk=gbk
gb18030=gb18030
hz-gb-2312=replacement
big5=Big5
big5-hkscs=Big5-HKSCS
big5-hkscs=Big5
cn-big5=Big5
csbig5=Big5
x-x-big5=Big5

View File

@ -44,6 +44,7 @@ function runTextDecoderOptions()
}, "testDecodeABVOption");
test(testDecoderForThaiEncoding, "testDecoderForThaiEncoding");
test(testInvalid2022JP, "testInvalid2022JP");
test(testDecoderForBig5, "testDecoderForBig5");
}
/*
@ -355,8 +356,7 @@ function testDecoderGetEncoding()
{encoding: "x-mac-cyrillic", labels: ["x-mac-cyrillic", "x-mac-ukrainian"]},
{encoding: "gbk", labels: ["chinese", "csgb2312", "csiso58gb231280", "gb2312", "gb_2312", "gb_2312-80", "gbk", "iso-ir-58", "x-gbk"]},
{encoding: "gb18030", labels: ["gb18030"]},
{encoding: "big5", labels: ["big5", "cn-big5", "csbig5", "x-x-big5"]},
{encoding: "big5-hkscs", labels: ["big5-hkscs"]},
{encoding: "big5", labels: ["big5", "cn-big5", "csbig5", "x-x-big5", "big5-hkscs"]},
{encoding: "euc-jp", labels: ["cseucpkdfmtjapanese", "euc-jp", "x-euc-jp"]},
{encoding: "iso-2022-jp", labels: ["csiso2022jp", "iso-2022-jp"]},
{encoding: "shift_jis", labels: ["csshiftjis", "ms_kanji", "shift-jis", "shift_jis", "sjis", "windows-31j", "x-sjis"]},
@ -463,3 +463,78 @@ function testInvalid2022JP()
});
assert_equals(failureCount, 0, failureCount + " of " + inputs.length + " tests failed");
}
/**
 * Runs the "big5" decoder over a table of byte sequences and checks each
 * decoded string via testCharset().
 *
 * The table covers, in order: plain ASCII, two-byte sequences on their own,
 * the same two-byte sequences wrapped between 0x61 and 0x62, and finally
 * sequences whose expected output contains U+FFFD.
 */
function testDecoderForBig5()
{
  // Each entry is [input bytes, expected decoded string].
  const cases = [
    [[ 0x61, 0x62 ], "\u0061\u0062"],
    [[ 0x87, 0x40 ], "\u43F0"],
    [[ 0xFE, 0xFE ], "\u79D4"],
    [[ 0xFE, 0xFD ], "\uD864\uDD0D"],
    [[ 0x88, 0x62 ], "\u00CA\u0304"],
    [[ 0x88, 0x64 ], "\u00CA\u030C"],
    [[ 0x88, 0x66 ], "\u00CA"],
    [[ 0x88, 0xA3 ], "\u00EA\u0304"],
    [[ 0x88, 0xA5 ], "\u00EA\u030C"],
    [[ 0x88, 0xA7 ], "\u00EA"],
    [[ 0x99, 0xD4 ], "\u8991"],
    [[ 0x99, 0xD5 ], "\uD85E\uDD67"],
    [[ 0x99, 0xD6 ], "\u8A29"],
    [[ 0x61, 0x87, 0x40, 0x62 ], "\u0061\u43F0\u0062"],
    [[ 0x61, 0xFE, 0xFE, 0x62 ], "\u0061\u79D4\u0062"],
    [[ 0x61, 0xFE, 0xFD, 0x62 ], "\u0061\uD864\uDD0D\u0062"],
    [[ 0x61, 0x88, 0x62, 0x62 ], "\u0061\u00CA\u0304\u0062"],
    [[ 0x61, 0x88, 0x64, 0x62 ], "\u0061\u00CA\u030C\u0062"],
    [[ 0x61, 0x88, 0x66, 0x62 ], "\u0061\u00CA\u0062"],
    [[ 0x61, 0x88, 0xA3, 0x62 ], "\u0061\u00EA\u0304\u0062"],
    [[ 0x61, 0x88, 0xA5, 0x62 ], "\u0061\u00EA\u030C\u0062"],
    [[ 0x61, 0x88, 0xA7, 0x62 ], "\u0061\u00EA\u0062"],
    [[ 0x61, 0x99, 0xD4, 0x62 ], "\u0061\u8991\u0062"],
    [[ 0x61, 0x99, 0xD5, 0x62 ], "\u0061\uD85E\uDD67\u0062"],
    [[ 0x61, 0x99, 0xD6, 0x62 ], "\u0061\u8A29\u0062"],
    [[ 0x80, 0x61 ], "\uFFFD\u0061"],
    [[ 0xFF, 0x61 ], "\uFFFD\u0061"],
    [[ 0xFE, 0x39 ], "\uFFFD\u0039"],
    [[ 0x87, 0x66 ], "\uFFFD\u0066"],
    [[ 0x81, 0x40 ], "\uFFFD\u0040"],
    [[ 0x61, 0x81 ], "\u0061\uFFFD"],
  ];

  cases.forEach(function (testCase, caseIndex) {
    testCharset({encoding: "big5", input: testCase[0], expected: testCase[1],
                 msg: "decoder test #" + caseIndex + " for big5."});
  });
}

View File

@ -11,7 +11,7 @@ acp.932=Shift_JIS
acp.936=gb18030
acp.949=EUC-KR
acp.950=Big5
acp.951=Big5-HKSCS
acp.951=Big5
acp.1250=windows-1250
acp.1251=windows-1251
acp.1252=windows-1252

View File

@ -137,10 +137,8 @@ UNIFIED_SOURCES += [
]
UNIFIED_SOURCES += [
'ucvtw/nsBIG5HKSCSToUnicode.cpp',
'ucvtw/nsBIG5ToUnicode.cpp',
'ucvtw/nsUnicodeToBIG5.cpp',
'ucvtw/nsUnicodeToBIG5HKSCS.cpp',
]
UNIFIED_SOURCES += [

View File

@ -82,14 +82,18 @@ public:
* @param aDestLength [IN/OUT] the length of the destination data buffer;
* after conversion will contain the number of Unicode
* characters written
* @return NS_PARTIAL_MORE_INPUT if only a partial conversion was
* done; more input is needed to continue
* NS_PARTIAL_MORE_OUTPUT if only a partial conversion
* was done; more output space is needed to continue
* NS_ERROR_ILLEGAL_INPUT if an illegal input sequence
* @return NS_ERROR_UDEC_ILLEGALINPUT if an illegal input sequence
* was encountered and the behavior was set to "signal";
* the caller must skip over one byte, reset the decoder
* and retry.
* NS_OK_UDEC_MOREOUTPUT if only a partial conversion
* was done; more output space is needed to continue
* NS_OK_UDEC_MOREINPUT if the input ended in the middle
* of an input code unit sequence. If this is the last
* result the caller has at the end of the stream, the
* caller must append one U+FFFD to the output.
* NS_OK if the input ended after a complete input code
* unit sequence.
*/
NS_IMETHOD Convert(const char * aSrc, int32_t * aSrcLength,
char16_t * aDest, int32_t * aDestLength) = 0;

View File

@ -111,8 +111,6 @@
#include "nsUCvTWDll.h"
#include "nsBIG5ToUnicode.h"
#include "nsUnicodeToBIG5.h"
#include "nsBIG5HKSCSToUnicode.h"
#include "nsUnicodeToBIG5HKSCS.h"
// ucvko
#include "nsUCvKOCID.h"
@ -184,7 +182,6 @@ NS_UCONV_REG_UNREG("EUC-JP", NS_EUCJPTOUNICODE_CID, NS_UNICODETOEUCJP_CID)
// ucvtw
NS_UCONV_REG_UNREG("Big5", NS_BIG5TOUNICODE_CID, NS_UNICODETOBIG5_CID)
NS_UCONV_REG_UNREG("Big5-HKSCS", NS_BIG5HKSCSTOUNICODE_CID, NS_UNICODETOBIG5HKSCS_CID)
// ucvko
NS_UCONV_REG_UNREG("EUC-KR", NS_EUCKRTOUNICODE_CID, NS_UNICODETOEUCKR_CID)
@ -214,6 +211,7 @@ NS_GENERIC_FACTORY_CONSTRUCTOR(nsISO2022JPToUnicodeV2)
NS_GENERIC_FACTORY_CONSTRUCTOR(nsUnicodeToISO2022JP)
// ucvtw
NS_GENERIC_FACTORY_CONSTRUCTOR(nsBIG5ToUnicode)
// ucvko
@ -252,18 +250,6 @@ const uint16_t g_ufBig5Mapping[] = {
#include "big5.uf"
};
const uint16_t g_utBIG5Mapping[] = {
#include "big5.ut"
};
const uint16_t g_ufBig5HKSCSMapping[] = {
#include "hkscs.uf"
};
const uint16_t g_utBig5HKSCSMapping[] = {
#include "hkscs.ut"
};
// ucvko
const uint16_t g_utKSC5601Mapping[] = {
#include "u20kscgl.ut"
@ -377,8 +363,6 @@ NS_DEFINE_NAMED_CID(NS_UNICODETOEUCJP_CID);
NS_DEFINE_NAMED_CID(NS_UNICODETOISO2022JP_CID);
NS_DEFINE_NAMED_CID(NS_UNICODETOBIG5_CID);
NS_DEFINE_NAMED_CID(NS_BIG5TOUNICODE_CID);
NS_DEFINE_NAMED_CID(NS_UNICODETOBIG5HKSCS_CID);
NS_DEFINE_NAMED_CID(NS_BIG5HKSCSTOUNICODE_CID);
NS_DEFINE_NAMED_CID(NS_EUCKRTOUNICODE_CID);
NS_DEFINE_NAMED_CID(NS_UNICODETOEUCKR_CID);
NS_DEFINE_NAMED_CID(NS_GBKTOUNICODE_CID);
@ -481,8 +465,6 @@ static const mozilla::Module::CIDEntry kUConvCIDs[] = {
{ &kNS_UNICODETOISO2022JP_CID, false, nullptr, nsUnicodeToISO2022JPConstructor },
{ &kNS_UNICODETOBIG5_CID, false, nullptr, nsUnicodeToBIG5Constructor },
{ &kNS_BIG5TOUNICODE_CID, false, nullptr, nsBIG5ToUnicodeConstructor },
{ &kNS_UNICODETOBIG5HKSCS_CID, false, nullptr, nsUnicodeToBIG5HKSCSConstructor },
{ &kNS_BIG5HKSCSTOUNICODE_CID, false, nullptr, nsBIG5HKSCSToUnicodeConstructor },
{ &kNS_EUCKRTOUNICODE_CID, false, nullptr, nsCP949ToUnicodeConstructor },
{ &kNS_UNICODETOEUCKR_CID, false, nullptr, nsUnicodeToCP949Constructor },
{ &kNS_GBKTOUNICODE_CID, false, nullptr, nsGB18030ToUnicodeConstructor },
@ -587,8 +569,6 @@ static const mozilla::Module::ContractIDEntry kUConvContracts[] = {
{ NS_UNICODEENCODER_CONTRACTID_BASE "ISO-2022-JP", &kNS_UNICODETOISO2022JP_CID },
{ NS_UNICODEENCODER_CONTRACTID_BASE "Big5", &kNS_UNICODETOBIG5_CID },
{ NS_UNICODEDECODER_CONTRACTID_BASE "Big5", &kNS_BIG5TOUNICODE_CID },
{ NS_UNICODEENCODER_CONTRACTID_BASE "Big5-HKSCS", &kNS_UNICODETOBIG5HKSCS_CID },
{ NS_UNICODEDECODER_CONTRACTID_BASE "Big5-HKSCS", &kNS_BIG5HKSCSTOUNICODE_CID },
{ NS_UNICODEDECODER_CONTRACTID_BASE "EUC-KR", &kNS_EUCKRTOUNICODE_CID },
{ NS_UNICODEENCODER_CONTRACTID_BASE "EUC-KR", &kNS_UNICODETOEUCKR_CID },
{ NS_UNICODEDECODER_CONTRACTID_BASE "gbk", &kNS_GBKTOUNICODE_CID },

View File

@ -0,0 +1,170 @@
#!/usr/bin/python
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
# Adapted from
# https://hg.mozilla.org/projects/htmlparser/file/3ac10f9e8612/generate-encoding-data.py
# indexes.json comes from
# https://encoding.spec.whatwg.org/indexes.json
# i.e.
# https://github.com/whatwg/encoding/blob/a5215d07106e250dfef34908b99b3e4a576be2f6/indexes.json
import json
indexes = json.load(open("indexes.json", "r"))
def nullToZero(codePoint):
  """Return codePoint unchanged, or 0 when it is falsy (e.g. a JSON null)."""
  return codePoint if codePoint else 0
# Flatten the "big5" index loaded from indexes.json into a plain list,
# storing 0 wherever nullToZero() maps a missing entry to zero.
index = []
for codePoint in indexes["big5"]:
index.append(nullToZero(codePoint))
# There are four major gaps consisting of more than 4 consecutive invalid pointers
# Scan the index and record each run of more than 4 consecutive zero entries
# in `gaps` as a half-open (start, end) pair of pointers.
# NOTE(review): as far as the flattened indentation here shows, a run is only
# recorded when a non-zero entry follows it, so a run that reaches the very
# end of the index would not be recorded -- confirm this is intended.
gaps = []
# Length of the zero run currently being scanned (0 when not inside a run).
consecutive = 0
# Pointer at which the current zero run started.
consecutiveStart = 0
# Pointer of the entry being examined.
offset = 0
for codePoint in index:
if codePoint == 0:
if consecutive == 0:
consecutiveStart = offset
consecutive +=1
else:
if consecutive > 4:
gaps.append((consecutiveStart, consecutiveStart + consecutive))
consecutive = 0
offset += 1
def invertRanges(ranges, cap):
  """Return the half-open ranges of [0, cap) that are NOT covered by `ranges`.

  `ranges` is a list of half-open (start, end) pairs in ascending order.
  The result is a list of half-open (start, end) pairs, also ascending.

  Empty ranges are never emitted: a gap that begins where the previous one
  ended, or a final gap that reaches `cap`, contributes nothing. Previously
  these cases produced degenerate (x, x) pairs, which turned into dead
  branches in the generated code.
  """
  inverted = []
  invertStart = 0
  for (start, end) in ranges:
    # Only emit the stretch before this gap when it is non-empty.
    if start > invertStart:
      inverted.append((invertStart, start))
    invertStart = end
  # Tail after the last gap, if any pointers remain.
  if invertStart < cap:
    inverted.append((invertStart, cap))
  return inverted
# `ranges` holds the stretches of the index that remain once the long runs
# of zero entries recorded in `gaps` are removed.
cap = len(index)
ranges = invertRanges(gaps, cap)
# Now compute a compressed lookup table for astralness
# Same scan as before, but here a "gap" is a run of more than 40 consecutive
# entries whose value fits in 16 bits (<= 0xFFFF), i.e. entries that are not
# astral. `gaps` is reused and now describes those runs.
# NOTE(review): as with the first scan, a run that extends to the end of the
# index does not appear to be recorded -- confirm against the real data.
gaps = []
consecutive = 0
consecutiveStart = 0
offset = 0
for codePoint in index:
if codePoint <= 0xFFFF:
if consecutive == 0:
consecutiveStart = offset
consecutive +=1
else:
if consecutive > 40:
gaps.append((consecutiveStart, consecutiveStart + consecutive))
consecutive = 0
offset += 1
# Stretches of the index that may contain astral entries.
astralRanges = invertRanges(gaps, cap)
# Emit the generated header. Everything below writes C++ source text into
# nsBIG5DecoderData.h; the triple-quoted strings are output, not comments.
includeFile = open("../ucvtw/nsBIG5DecoderData.h", "w")
includeFile.write('''/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
/*
* THIS IS A GENERATED FILE. PLEASE DO NOT EDIT.
* Instead, please regenerate using intl/uconv/tools/gen-big5-data.py
*/
static const char16_t kBig5LowBitsTable[] = {
''')
# One table entry per pointer inside `ranges`, holding only the low 16 bits
# of the code point; pointers inside the recorded gaps get no entry.
for (low, high) in ranges:
for i in xrange(low, high):
includeFile.write(' 0x%04X,\n' % (index[i] & 0xFFFF))
includeFile.write('''};
static const uint32_t kBig5AstralnessTable[] = {
''')
# An array of bool is inefficient per
# http://stackoverflow.com/questions/4049156/1-bit-per-bool-in-array-c
# Collect one bit per pointer inside `astralRanges`: 1 when the code point is
# above 0xFFFF, else 0.
bits = []
for (low, high) in astralRanges:
for i in xrange(low, high):
bits.append(1 if index[i] > 0xFFFF else 0)
# pad length to multiple of 32
# NOTE(review): when len(bits) is already a multiple of 32 this appends a
# full 32 zero bits, i.e. one extra all-zero table word. Harmless, but
# probably not intended.
for i in xrange(32 - (len(bits) % 32)):
bits.append(0)
# Pack the bits into 32-bit words, bit j of a word coming from bits[i + j].
i = 0
while i < len(bits):
accu = 0
for j in xrange(32):
accu |= bits[i + j] << j
includeFile.write(' 0x%08X,\n' % accu)
i += 32
includeFile.write('''};
// static
char16_t
nsBIG5ToUnicode::LowBits(size_t aPointer)
{
''')
# For each range emit a pair of tests: pointers below the range return 0,
# pointers inside it index the low-bits table. `base` is the table offset of
# the range's first entry.
base = 0
for (low, high) in ranges:
includeFile.write(''' if (aPointer < %d) {
return 0;
}
if (aPointer < %d) {
return kBig5LowBitsTable[%d + (aPointer - %d)];
}
''' % (low, high, base, low))
base += (high - low)
includeFile.write(''' return 0;
}
// static
bool
nsBIG5ToUnicode::IsAstral(size_t aPointer)
{
''')
# Same scheme for the astralness bitmap. A range holding a single pointer is
# emitted as a direct equality test instead of a bitmap lookup.
# NOTE(review): the generated lookup shifts the plain int literal 1 by up to
# 31 bits before masking a uint32_t word; an unsigned literal would avoid
# shifting into the sign bit -- confirm whether this matters for the
# compilers in use.
base = 0
for (low, high) in astralRanges:
if high - low == 1:
includeFile.write(''' if (aPointer < %d) {
return false;
}
if (aPointer == %d) {
return true;
}
''' % (low, low))
else:
includeFile.write(''' if (aPointer < %d) {
return false;
}
if (aPointer < %d) {
size_t index = %d + (aPointer - %d);
return kBig5AstralnessTable[index >> 5] & (1 << (index & 0x1F));
}
''' % (low, high, base, low))
base += (high - low)
includeFile.write(''' return false;
}
''')
includeFile.close()

View File

@ -1,959 +0,0 @@
#!/usr/bin/perl -w
#
# gen-big5hkscs-2001-mozilla.pl
# a Perl script that generates Big5-HKSCS <-> Unicode
# conversion tables for Mozilla
#
# Author (of the original Perl script):
# Anthony Fok <anthony@thizlinux.com> <foka@debian.org>
# Copyright (C) 2001, 2002 ThizLinux Laboratory Ltd.
# License: GNU General Public License, v2 or later.
#
# This version includes original C source code from
# glibc-2.2.5/iconvdata/big5hkscs.c by Ulrich Drepper <drepper@redhat.com>
# Roger So <roger.so@sw-linux.com>
#
# First attempt for Qt-2.3.x: 2001-09-21
# A working version for Qt-2.3.x: 2001-10-30
# Ported to glibc-2.2.5 with HKSCS-2001: 2002-03-21
# Adapted to generate conversion tables for Mozilla: 2002-11-26
# Adapted to generate conversion tables for Mozilla: 2002-11-30
# Cleaned up the script somewhat: 2002-12-04
# Minor revisions for submitting to Mozilla Bugzilla: 2002-12-10
#
# Notes:
#
# 1. The latest version of this script may be found in:
# http://www.thizlinux.com/~anthony/hkscs/gen-glibc-big5hkscs.pl
# http://people.debian.org/~foka/hkscs/gen-glibc-big5hkscs.pl
# Or, better yet, e-mail me and ask for the latest version.
#
# 2. This script generates data from 3 tables:
# a. http://www.microsoft.com/typography/unicode/950.txt
# b. http://www.info.gov.hk/digital21/chi/hkscs/download/big5-iso.txt
# c. http://www.info.gov.hk/digital21/chi/hkscs/download/big5cmp.txt
#
# Make sure your big5-iso.txt is the latest HKSCS-2001 version.
#
# 3. [glibc]: I have currently split the ucs_to_big5_hkscs_?[] tables into
# different areas similar to the way Ulrich and Roger did it,
# but extended for HKSCS-2001.
#
# 4. [Mozilla]: This script is very quick-and-dirty in some places.
# Call either gen_mozilla_uf() or gen_mozilla_ut() to generate
# the appropriate tables for feeding into "fromu" or "tou".
#
# 5. [CharMapML]: The comments regarding TW-BIG5 herein need to be organized.
# Also, please make sure "$hkscs_mode = 0;" for TW-BIG5 mode.
# Otherwise, this script would generate a HKSCS table.
# (Yes, I know, I should clean up this script and make it more modular,
# and with command-line options or whatnot. I'll do that later. :-)
#
# If you have any questions or concerns, please feel free to contact me
# at Anthony Fok <anthony@thizlinux.com> or <foka@debian.org> :-)
#
# Last but not least, special thanks to ThizLinux Laboratory Ltd. (HK)
# for their generous support in this work.
#
# 1. UDA3, 0x8840 - 0x8dfe
# 2. UDA2, 0x8e40 - 0xa0fe
# 3. VDA, 0xc6a1 - 0xc8fe
#use Getopt::Std;
# File-level state shared by every subroutine below:
#   %b2u  maps a Big5 code (hex string) to a Unicode value (hex string)
#   %u2b  maps a Unicode value (hex string) to a Big5 code (hex string)
# The scalars are scratch variables reused across the subroutines.
my ( %b2u, %u2b, $unicode, $big5, $high, $low, $i, $count );
# When true, the readers print redefinition diagnostics to STDERR.
my $debug = 0;
# When true, generate the HKSCS variant; when false, TW-BIG5.
my $hkscs_mode = 1;
# When true, adjust_radicals() also maps Big5 -> Kangxi Radicals.
my $kangxi = 0;
# When true (and not in HKSCS mode), gen_charmapml() emits <range> elements.
my $use_range = 0;
# When true, read_hkscs_main() takes the third column of big5-iso.txt for the
# Big5 -> Unicode direction instead of the fourth.
my $bmp_only = 1;
#
# Subroutine Declaration
#
sub read_cp950();
sub adjust_radicals();
sub read_hkscs_main();
sub read_hkscs_cmp();
sub post_tuning();
sub gen_charmapml();
sub gen_check_b2u();
sub gen_check_u2b();
sub gen_mozilla_uf();
sub gen_mozilla_ut();
sub gen_glibc();
###########################################################################
#
# Main program
#
# First, read Microsoft's CP950 as base Big5.
read_cp950 ();
# Add mappings to Kangxi Radicals.
# The b2u direction is added only if $kangxi is not null.
adjust_radicals ();
# Then, read the HKSCS table.
# Again, see the $hkscs_mode variable.
read_hkscs_main ();
read_hkscs_cmp () if $hkscs_mode;
post_tuning ();
# Then, choose one of the following:
# (exactly one generator is left uncommented; its output goes to print's
# default filehandle)
#gen_charmapml();
gen_mozilla_uf();
#gen_mozilla_ut();
#gen_check_u2b();
#gen_glibc();
# End of program
exit 0;
#############################################################################
#
# Subroutines
#
# Read Microsoft's 950.txt code page description and seed %b2u / %u2b.
#
# Two kinds of sections are consumed:
#   DBCSTABLE <count> ;LeadByte = 0x<hh>  -- followed by <count> lines of
#       "<low byte>\t<unicode>\t<comment>", loaded into %b2u.
#   WCTABLE <count>                       -- followed by <count> lines of
#       "<unicode>\t<big5>\t<comment>", loaded into %u2b unless the entry is
#       one of the best-fit mappings filtered out below.
# Reading stops at the ENDCODEPAGE line. Dies if 950.txt cannot be opened.
# The file handle is closed before returning, matching the other readers.
sub read_cp950() {
    open( CP950, "950.txt" ) or die;
    # 0 = outside any table, 1 = inside a DBCSTABLE, 2 = inside the WCTABLE
    my $mode = 0;
    while (<CP950>) {
        s/\r//;
        chomp;
        next if /^$/;
        last if /^ENDCODEPAGE/;
        if (/^DBCSTABLE (\d+)\s+;LeadByte = 0x([0-9a-f]{2})/) {
            $mode = 1;
            ( $count, $high ) = ( $1, $2 );
            $i = 0;
            next;
        }
        if (/^WCTABLE (\d+)/) {
            $mode = 2;
            $count = $1;
            $i = 0;
            next;
        }
        next if $mode == 0;
        if ( $mode == 1 ) {
            ( $low, $unicode, $comment ) = split "\t";
            $low =~ s/^0x//;
            $unicode =~ s/^0x//;
            # Full Big5 code = lead byte from the section header + this low byte.
            $big5 = $high . $low;
            $b2u{ uc($big5) } = uc($unicode);
            # Leave the table once the announced number of rows was read.
            if ( ++$i == $count ) { $mode = 0; $count = 0; next; }
        }
        if ( $mode == 2 ) {
            ( $unicode, $big5, $comment ) = split "\t";
            $unicode =~ s/^0x//;
            $big5 =~ s/^0x//;
            my $u = hex($unicode);
            my $b = hex($big5);
            $u2b{ uc($unicode) } = uc($big5) unless
            # Skip Microsoft's over-generous (or over-zealous?) mappings
            # "Faked" accented latin characters
            ( $b <= 0xFF and $b != $u )
            # "Faked" Ideographic Annotation ___ Mark
            or ( $u >= 0x3192 and $u <= 0x319F )
            # "Faked" Parenthesized Ideograph ___
            or ( $u >= 0x3220 and $u <= 0x3243 )
            # "Faked" Circled Ideograph ___ except Circled Ideograph Correct
            or ( $u >= 0x3280 and $u <= 0x32B0 and $u != 0x32A3 )
            # ¢£¥’μ﹐
            or ( $u == 0xA2
            or $u == 0xA3
            or $u == 0xA5
            or $u == 0xB4
            or $u == 0xB5
            or $u == 0xB8 )
            # ¯─∥‧˙〃 ̄﹨°≡︴⊙⊕~﹋
            or ( $u == 0x0305 # ???
            or $u == 0x2015
            or $u == 0x2016
            or $u == 0x2022
            or $u == 0x2024
            or $u == 0x2033
            or $u == 0x203E # ???
            or $u == 0x2216
            or $u == 0x2218
            or $u == 0x2263
            or $u == 0x2307
            or $u == 0x2609
            or $u == 0x2641
            or $u == 0x301C
            or $u == 0x3030 )
            # ︿‘﹑
            or ( $u == 0xFF3E or $u == 0xFF40 or $u == 0xFF64 );
            if ( ++$i == $count ) { $mode = 0; $count = 0; next; }
        }
    }
    close CP950;
}
# Map B5+C6BF .. B5+C6D7 to the Kangxi Radicals block.
#
# The Unicode -> Big5 direction is always installed; the Big5 -> Unicode
# direction only when $kangxi is set.
#
# Background kept from the original notes:
# TW-BIG5 drafted by Autrijus uses Kangxi Radicals whenever possible.
# Big5-HKSCS tends towards using the character in Unicode CJK Ideographs.
# HKSCS does not explicitly define B5+C6CF, B5+C6D3, B5+C6D5, B5+C6D7
# (廴、无、癶、隶), but does have these characters at B5+FBFD, B5+FCD3,
# B5+FEC1, B5+90C4, mapped to U+5EF4, U+65E0, U+7676, U+96B6 respectively.
# As for B5+C6CD (⼳), HKSCS maps it to U+2F33 just like TW-BIG5; it also
# maps B5+FBF4 (幺) to U+5E7A.
sub adjust_radicals() {
    # [ Big5 code, Kangxi Radical code point ]
    my @radical_pairs = (
        [ "C6BF", "2F02" ],
        [ "C6C0", "2F03" ],    # 丿
        [ "C6C1", "2F05" ],    # 亅
        [ "C6C2", "2F07" ],    # 亠
        [ "C6C3", "2F0C" ],    # 冂
        [ "C6C4", "2F0D" ],    # 冖
        [ "C6C5", "2F0E" ],    # 冫
        [ "C6C6", "2F13" ],    # 勹
        [ "C6C7", "2F16" ],    # 匸
        [ "C6C8", "2F19" ],    # 卩
        [ "C6C9", "2F1B" ],    # 厶
        [ "C6CA", "2F22" ],    # 夊
        [ "C6CB", "2F27" ],    # 宀
        [ "C6CC", "2F2E" ],    # 巛
        [ "C6CD", "2F33" ],    # ⼳
        [ "C6CE", "2F34" ],    # 广
        [ "C6CF", "2F35" ],    # 廴
        [ "C6D0", "2F39" ],    # 彐
        [ "C6D1", "2F3A" ],    # 彡
        [ "C6D2", "2F41" ],    # 攴
        [ "C6D3", "2F46" ],    # 无
        [ "C6D4", "2F67" ],    # 疒
        [ "C6D5", "2F68" ],    # 癶
        [ "C6D6", "2FA1" ],    # 辵
        [ "C6D7", "2FAA" ],    # 隶
    );
    for my $pair (@radical_pairs) {
        my ( $b5_code, $radical ) = @$pair;
        $b2u{$b5_code} = $radical if $kangxi;
        $u2b{$radical} = $b5_code;
    }
}
# Read big5-iso.txt and merge its mappings into %b2u and %u2b.
#
# Only lines containing four whitespace-separated hex fields are used: a
# 4-digit Big5 code followed by three Unicode values, which the code names
# $iso1993, $iso2000 and $iso2001 (the last may have 5 digits).
#   - Big5 -> Unicode takes $iso2000 when $bmp_only is set, else $iso2001.
#   - Unicode -> Big5 is recorded for all three Unicode values.
# When $hkscs_mode is off, only the Big5 ranges tested below are taken.
# With $debug set, every redefinition of an existing entry is reported on
# STDERR. Dies if the file cannot be opened.
sub read_hkscs_main() {
open( B2U, "<big5-iso.txt" ) or die;
while (<B2U>) {
# Skip anything that is not a four-column data line.
next
unless
/([[:xdigit:]]{4})\s+([[:xdigit:]]{4})\s+([[:xdigit:]]{4})\s+([[:xdigit:]]{4,5})/;
( $big5, $iso1993, $iso2000, $iso2001 ) = ( $1, $2, $3, $4 );
my $b = hex($big5);
# For non-HKSCS mode, only take data in the VDA range (?)
next unless $hkscs_mode
# Note that we don't go from B5+C6A1-B5+C6FE, but rather only
# C6A1-C8D3 excluding C6BF-C6D7 (Kangxi Radicals)
# because C8D4-C8FE are not assigned in TW-BIG5
# if we are to follow Arphic PL Big-5 fonts. (To be discussed)
or
( $b >= 0xC6A1 && $b <= 0xC8D3 and !( $b >= 0xC6BF && $b <= 0xC6D7 ) )
or ( $b >= 0xF9D6 && $b <= 0xF9FE );
print STDERR
"B2U, 2000: $big5 redefined from U+$b2u{$big5} to U+$iso2000.\n"
if $debug
and defined( $b2u{$big5} )
and $b2u{$big5} ne $iso2000;
# The assignment is skipped only for B5+F9FE in non-HKSCS mode.
$b2u{$big5} = $bmp_only ? $iso2000 : $iso2001
unless !$hkscs_mode
and $b == 0xF9FE;
# B5+F9FE is mapped differently in TW-BIG5 and HKSCS, to
# U+2593 (Dark Shade) and U+FFED (Halfwidth Black Square) respectively.
# Which is more correct? I don't know! (To be discussed)
print STDERR
"1993: U+$iso1993 redefined from $u2b{$iso1993} to $big5.\n"
if $debug
and defined( $u2b{$iso1993} )
and $u2b{$iso1993} ne $big5;
$u2b{$iso1993} = $big5;
print STDERR
"2000: U+$iso2000 redefined from $u2b{$iso2000} to $big5.\n"
if $debug
and defined( $u2b{$iso2000} )
and $u2b{$iso2000} ne $big5;
$u2b{$iso2000} = $big5;
print STDERR
"2001: U+$iso2001 redefined from $u2b{$iso2001} to $big5.\n"
if $debug
and defined( $u2b{$iso2001} )
and $u2b{$iso2001} ne $big5;
$u2b{$iso2001} = $big5;
}
close B2U;
} # read_hkscs_main()
# Apply the Big5 compatibility codes listed in big5cmp.txt.
#
# Lines are ignored until one starting with "=====" is seen. After that,
# each line is split on whitespace into a compatibility code and a target
# Big5 code (both upper-cased), and:
#   - the compatibility code is re-pointed at the target code's Unicode value
#   - the Unicode value the compatibility code used to have is re-pointed at
#     the target Big5 code
# Processing stops at the first line that begins with whitespace; because the
# test runs before chomp, an empty line also ends it.
# With $debug set, the before/after state is printed to STDERR.
# Dies if the file cannot be opened.
sub read_hkscs_cmp() {
###########################################################################
# Add Big5 compatibility coding...
#
# Stephan, here is the code segment that you may want to implement
# in your convertbig5hkscs2001.pl
#
open( B5CMP, "<big5cmp.txt" ) or die;
# NOTE(review): unlike read_cp950(), $mode is not declared with `my` in this
# subroutine -- confirm whether sharing it at file scope is intended.
$mode = 0;
while (<B5CMP>) {
# The "=====" separator line marks the start of the data.
if (/^=====/) { $mode = 1; next; }
next if $mode == 0;
last if $mode == 1 and /^\s+/;
chomp;
my ( $big5cmp, $big5 ) = split " ";
$big5cmp = uc($big5cmp);
$big5 = uc($big5);
# Unicode value of the target code, and the value the compatibility code
# currently maps to (read before it is overwritten below).
my $uni = $b2u{$big5};
my $unicmp = $b2u{$big5cmp};
print STDERR
"Was: U+$unicmp -> $u2b{$unicmp}, $big5cmp -> U+$b2u{$big5cmp}\t"
if $debug;
$b2u{$big5cmp} = $uni;
$u2b{$unicmp} = $big5;
print STDERR
"Now: U+$unicmp -> $u2b{$unicmp}, $big5cmp -> U+$b2u{$big5cmp}\n"
if $debug;
}
close B5CMP;
} # read_hkscs_cmp();
# Final manual adjustments applied after all source tables were read.
sub post_tuning() {
    # Map codes 0x00..0x80 to the identical Unicode value (Big5 -> Unicode).
    for $i ( 0x00 .. 0x80 ) {
        $unicode = sprintf( "%04X", $i );
        $big5 = $unicode;
        $b2u{$big5} = $unicode;
    }

    # Add Euro '€' (I wonder why this 950.txt doesn't have it.)
    ( $b2u{"A3E1"}, $u2b{"20AC"} ) = ( "20AC", "A3E1" );

    # The character for ethnic group "Yi" (彝): the Unicode -> Big5 direction
    # is always added. (To be discussed)
    $u2b{"5F5E"} = "C255";

    # Everything below applies to TW-BIG5 only.
    unless ($hkscs_mode) {

        # Box drawing characters:
        # Align with Big-5E (To be discussed, as it differs from CP950 and
        # HKSCS). Unicode => Big5; the alternative code is noted per entry.
        my %box_drawing = (
            "2550" => "A2A4",    # Big5: ═ (also B5-F9F9)
            "255E" => "A2A5",    # Big5: ╞ (also B5-F9E9)
            "2561" => "A2A7",    # Big5: ╡ (also B5-F9EB)
            "256A" => "A2A6",    # Big5: ╪ (also B5-F9EA)
            "256D" => "A27E",    # Big5: ╭ (also B5-F9FA)
            "256E" => "A2A1",    # Big5: ╮ (also B5-F9FB)
            "256F" => "A2A3",    # Big5: ╯ (also B5-F9FD)
            "2570" => "A2A2",    # Big5: ╰ (also B5-F9FC)
        );
        while ( my ( $ucs, $b5_code ) = each %box_drawing ) {
            $u2b{$ucs} = $b5_code;
        }

        # "Hangzhou" or "Suzhou" Chinese numerals 10, 20, 30 (十卄卅),
        # mapped in both directions. (To be discussed)
        for my $numeral ( [ "A2CC", "3038" ], [ "A2CD", "3039" ],
            [ "A2CE", "303A" ] )
        {
            my ( $b5_code, $ucs ) = @$numeral;
            $b2u{$b5_code} = $ucs;
            $u2b{$ucs} = $b5_code;
        }

        # "Yi", Big5 -> Unicode direction.
        $b2u{"C255"} = "5F5E";
    }
} # post_tuning()
# Print the mapping as a CharMapML XML document (the DOCTYPE below points at
# the tr22 CharacterMapping DTD).
#
# Output sections, in order:
#   1. XML prolog and a <characterMapping> header whose id depends on
#      $hkscs_mode.
#   2. A fixed <validity> state table.
#   3. <a> elements: Unicode values whose %u2b entry maps back to the same
#      Unicode value through %b2u (round trips).
#   4. <fub> elements: %u2b entries that do not round-trip.
#   5. <fbu> elements: %b2u entries that do not round-trip.
#   6. <range> elements, only when $use_range is set and $hkscs_mode is off.
sub gen_charmapml() {
###########################################################################
#
# Codes for generating CharMapML XML file
print <<EOT;
<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE characterMapping SYSTEM "http://www.unicode.org/unicode/reports/tr22/CharacterMapping.dtd">
EOT
if ($hkscs_mode) {
print <<EOT;
<characterMapping id="big5-hkscs-2001" version="1">
<history>
<modified version="1" date="2002-11-30">
Trial version generated from 950.txt + part of big5-iso.txt (HKSCS-2001)
with Euro added, with CP950's excessive fub (fallbacks uni->big5) removed,
and with some other manual tweaking.
</modified>
</history>
EOT
}
else {
print <<EOT;
<characterMapping id="tw-big5-2002" version="1">
<history>
<modified version="1" date="2002-11-30">
Trial version generated from 950.txt + part of big5-iso.txt (HKSCS-2001)
with Euro added, with CP950's excessive fub (fallbacks uni->big5) removed,
and with some other manual tweaking.
</modified>
</history>
EOT
}
print <<EOT;
<validity>
<state type="FIRST" next="VALID" s="0" e="80" max="FFFF"/>
<state type="FIRST" next="SECOND" s="81" e="FE" max="FFFF"/>
<state type="SECOND" next="VALID" s="40" e="7E" max="FFFF"/>
<state type="SECOND" next="VALID" s="A1" e="FE" max="FFFF"/>
</validity>
<assignments sub="3F">
EOT
print " <!-- One to one mappings -->\n";
# Walk Unicode values in numeric order and emit only true round trips.
for $unicode ( sort { hex($a) <=> hex($b) } keys %u2b ) {
$big5 = $u2b{$unicode};
$u = hex($unicode);
# In TW-BIG5 range mode, U+E000..U+F6B0 is left to the <range> elements
# printed at the end.
next
unless defined( $b2u{$big5} )
and $unicode eq $b2u{$big5}
and
not( $use_range and !$hkscs_mode and $u >= 0xE000 && $u <= 0xF6B0 );
printf " <a u=\"%04X\" ", $u;
# Codes up to 0xFF are written as one byte, everything else as two.
if ( hex($big5) <= 0xFF ) {
printf "b=\"%02X\"/>\n", hex($big5);
}
else {
printf "b=\"%s %s\"/>\n", substr( $big5, 0, 2 ),
substr( $big5, 2, 2 );
}
}
print " <!-- Fallback mappings from Unicode to bytes -->\n";
for $unicode ( sort { hex($a) <=> hex($b) } keys %u2b ) {
$big5 = $u2b{$unicode};
# Round trips were already written as <a> elements above.
next if defined( $b2u{$big5} ) and hex($unicode) == hex( $b2u{$big5} );
if ( $unicode eq "F900" ) {
print " <!-- CJK Compatibility Ideographs: U+F900 - U+FA6A.\n";
print
" These are included in CP950 (Unicode->Big5 direction only).\n";
print " Should we include this area in TW-BIG5 or not? -->\n";
}
printf " <fub u=\"%04X\" b=\"%s %s\"/>\n", hex($unicode),
substr( $big5, 0, 2 ), substr( $big5, 2, 2 );
}
# Collect Big5 codes whose Unicode value does not map back to them.
# NOTE(review): %fbu is keyed by Unicode value, so when several Big5 codes
# share one Unicode value only the last one visited (highest Big5 code) is
# kept -- confirm that dropping the others is intended.
my %fbu;
print " <!-- Fallback mappings from bytes to Unicode -->\n";
for $big5 ( sort { hex($a) <=> hex($b) } keys %b2u ) {
$unicode = $b2u{$big5};
if ( !defined( $u2b{$unicode} ) or hex($big5) != hex( $u2b{$unicode} ) )
{
$fbu{$unicode} = $big5;
}
}
for $unicode ( sort { hex($a) <=> hex($b) } keys %fbu ) {
$big5 = $fbu{$unicode};
printf " <fbu u=\"%04X\" b=\"%s %s\"/>\n", hex($unicode),
substr( $big5, 0, 2 ), substr( $big5, 2, 2 );
}
if ( $use_range and !$hkscs_mode ) {
print <<EOT;
<!-- Roundtrip-mappings that can be enumerated
Note: We can only use the <range> tag for TW-BIG5.
Big-5E and Big5-HKSCS have assigned characters in these areas,
and we will have to use the <a> and <fub> tags instead.
-->
<!-- User-Defined Area 1 (UDA1) -->
<range uFirst="E000" uLast="E310" bFirst="FA 40" bLast="FE FE" bMin="81 40" bMax="FE FE"/>
<!-- User-Defined Area 2 (UDA2) -->
<range uFirst="E311" uLast="EEB7" bFirst="8E 40" bLast="A0 FE" bMin="81 40" bMax="FE FE"/>
<!-- User-Defined Area 3 (UDA3) -->
<range uFirst="EEB8" uLast="F6B0" bFirst="81 40" bLast="8D FE" bMin="81 40" bMax="FE FE"/>
EOT
}
print <<EOT;
</assignments>
</characterMapping>
EOT
} # gen_charmapml()
# Dump the Big5 -> Unicode table as "U+<unicode>: <big5>" lines, ordered by
# the numeric value of the Big5 code, for verification and testing.
# A leading "00" is stripped from the Big5 code before printing.
sub gen_check_b2u() {
    my @codes_in_order = sort { hex($a) <=> hex($b) } keys %b2u;
    for $big5 (@codes_in_order) {
        $unicode = $b2u{$big5};
        $big5 =~ s/^00//;
        print "U+${unicode}: ${big5}\n";
    }
}
# Dump the Unicode -> Big5 table as "U+<unicode>: <big5>" lines, ordered by
# the numeric value of the Unicode code point.
# A leading "00" is stripped from the Big5 code before printing.
sub gen_check_u2b() {
    my @code_points_in_order = sort { hex($a) <=> hex($b) } keys %u2b;
    for $unicode (@code_points_in_order) {
        $big5 = $u2b{$unicode};
        $big5 =~ s/^00//;
        print "U+${unicode}: ${big5}\n";
    }
}
###########################################################################
#
# Codes for generating hkscs.ut and hkscs.uf files for Mozilla
#
# Print the hkscs.uf table (Unicode -> Big5): one
# "0x<BIG5>\t0x<UNICODE>" line per %u2b entry, in string-sorted key order.
# Entries are left out when the Big5 code is below 0x8140, lies in
# 0xA140..0xC6A0 or 0xC940..0xF9D5, or the Unicode value is above 0xFFFF.
sub gen_mozilla_uf() {
    foreach $unicode ( sort keys %u2b ) {
        $big5 = $u2b{$unicode};
        my $b = hex($big5);
        next if $b >= 0xA140 and $b <= 0xC6A0;
        next if $b >= 0xC940 and $b <= 0xF9D5;
        next if $b < 0x8140;
        next if hex($unicode) > 0xFFFF;
        print "0x", uc($big5), "\t0x", uc($unicode), "\n";
    }
}
# Print the hkscs.ut table (Big5 -> Unicode): one
# "0x<BIG5>\t0x<UNICODE>" line per %b2u entry, in string-sorted key order.
# Entries are left out when the Big5 code is below 0x8140 or lies in
# 0xA140..0xC6A0 or 0xC940..0xF9D5.
sub gen_mozilla_ut() {
    foreach $big5 ( sort keys %b2u ) {
        my $b = hex($big5);
        next if $b >= 0xA140 and $b <= 0xC6A0;
        next if $b < 0x8140;
        next if $b >= 0xC940 and $b <= 0xF9D5;
        print "0x", uc($big5), "\t0x", uc( $b2u{$big5} ), "\n";
    }
}
###########################################################################
sub gen_glibc() {
##########################################################################
#
# Generate index for UCS4 to Big5-HKSCS conversion table
#
@index_array = ();
$mode = 0;
$count = 0;
for ( $uni = 0x81 ; $uni <= 0x2FFFF ; $uni++ ) {
$unicode = sprintf( "%04X", $uni );
# print " /* U+$unicode */\t" if $low % 4 == 0;
if ( defined( $u2b{$unicode} ) ) {
if ( $mode == 0 ) {
$range_start = $range_end = $uni;
# printf " { %7s, ", sprintf("0x%04X", $range_start);
$mode = 1;
}
else {
$range_end = $uni;
}
}
elsif ( $mode == 1 and ( $uni - $range_end ) >= 0x80 ) {
# Start a new range if the gap is 0x80 or larger
# printf "%7s, %5d },\n", sprintf("0x%04X", $range_end), $count;
push @index_array, [ ( $range_start, $range_end, $count ) ];
$count += $range_end - $range_start + 1;
$mode = 0;
}
}
#
# Note that $count and $range_end are used again as global variables
# below
#
###########################################################################
#
# Start generating real C code...
#
print <<'EOT';
/* Mapping tables for Big5-HKSCS handling.
Copyright (C) 1997, 1998, 2000, 2001, 2002 Free Software Foundation, Inc.
This file is part of the GNU C Library.
Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
Modified for Big5-HKSCS by Roger So <roger.so@sw-linux.com>, 2000.
Updated for HKSCS-2001 by James Su <suzhe@turbolinux.com.cn>
and Anthony Fok <anthony@thizlinux.com>, 2002
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, write to the Free
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA. */
#include <dlfcn.h>
#include <gconv.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <wchar.h>
/* Table for Big5-HKSCS to UCS conversion.
Original comments by Roger So when he updated the tables for HKSCS-1999:
With HKSCS mappings 0x8140-0xA0FE and 0xFA40-0xFEFE added; more info:
http://www.digital21.gov.hk/eng/hkscs/index.html
- spacehunt 07/01/2000
The BIG5-HKSCS mapping tables are generated from 950.txt, big5-iso.txt
and big5cmp.txt using a Perl script while merging C source code from
other developers. A copy of the source Perl script is available at:
http://www.thizlinux.com/~anthony/hkscs/gen-glibc-big5hkscs.pl
http://people.debian.org/~foka/hkscs/gen-glibc-big5hkscs.pl
Revisions:
2001-10-30 made codec for Qt
2002-03-21 ported to glibc-2.2.5 and added HKSCS-2001
Todo:
Use a hash for characters beyond BMP to save space and make it
more efficient
- Anthony Fok <anthony@thizlinux.com> 21 Mar 2002
On behalf of ThizLinux Laboratory Ltd., Hong Kong SAR, China
*/
EOT
##########################################################################
#
# Generate Big5-HKSCS to Unicode conversion table
#
## print "Big5HKSCS to Unicode\n";
# for $high (0x81..0x8d, 0x8e..0xa0, 0xc6..0xc8, 0xf9, 0xfa..0xfe) {
$high_start = 0x88;
$high_end = 0xfe;
print "static const uint16_t big5_hkscs_to_ucs[";
print( ( $high_end - $high_start + 1 ) * 157 );
print "] =\n{\n";
for $high ( 0x88 .. 0xfe ) {
for $low ( 0x40 .. 0x7e, 0xa1 .. 0xfe ) {
if ( $low == 0x40 ) {
print "\n" unless $high == $high_start;
printf
"\t/* Big5-HKSCS 0x%02X40..0x%02X7E, 0x%02XA1..0x%02XFE */\n",
$high, $high, $high, $high;
}
elsif ( $low == 0xa1 ) {
print "\t\t";
}
$big5 = sprintf( "%02X%02X", $high, $low );
print "\t" if $low % 8 == 0;
if ( defined( $b2u{$big5} ) ) {
$unicode = $b2u{$big5};
print "0x", $unicode, ",";
}
else {
print "0x0000,"; # for glibc
}
print( ( $low % 8 == 7 or $low == 0x7e or $low == 0xfe )
? "\n"
: "\t" );
}
}
print "};\n\n";
##########################################################################
#
# Generate Unicode to Big5-HKSCS conversion table
#
print "static const unsigned char ucs4_to_big5_hkscs[$count][2] =\n{\n";
foreach $index (@index_array) {
( $start, $end ) = ( @$index[0], @$index[1] );
printf( " /* U+%04X */\t", $start ) if ( $start % 4 != 0 );
print "\t" x ( ( $start % 4 ) * 1.5 ) . " " x ( $start % 2 );
for ( $i = $start ; $i <= $end ; $i++ ) {
printf( " /* U+%04X */\t", $i ) if ( $i % 4 == 0 );
$unicode = sprintf( "%04X", $i );
if ( defined( $big5 = $u2b{$unicode} ) ) {
if ( $big5 =~ /^00/ ) {
print '"\x', substr( $big5, 2, 2 ), '\x00",';
}
else {
print '"\x', substr( $big5, 0, 2 ), '\x',
substr( $big5, 2, 2 ), '",';
}
}
else {
print '"\x00\x00",';
}
print( ( $i % 4 == 3 ) ? "\n" : " " ) unless $i == $end;
}
print $end == $range_end ? "\n" : "\n\n";
}
print "};\n\n";
###########################################################################
print <<EOT;
static struct
{
/* Note: We are going to split this table so that we can use
uint16_t for "from" and "to" again. Anthony Fok, 2002-03-21 */
uint32_t from;
uint32_t to;
uint32_t offset;
} from_ucs4_idx[] =
{
EOT
foreach $index (@index_array) {
printf " { %7s, %7s, %5d },\n", sprintf( "0x%04X", @$index[0] ),
sprintf( "0x%04X", @$index[1] ), @$index[2];
}
print "};\n\n";
#foreach $i (sort keys %b2u) {
# print $b2u{$i} . ' ';
#}
print <<'EOT';
/* Definitions used in the body of the `gconv' function. */
#define CHARSET_NAME "BIG5HKSCS//"
#define FROM_LOOP from_big5
#define TO_LOOP to_big5
#define DEFINE_INIT 1
#define DEFINE_FINI 1
#define MIN_NEEDED_FROM 1
#define MAX_NEEDED_FROM 2
#define MIN_NEEDED_TO 4
/* First define the conversion function from Big5-HKSCS to UCS4. */
#define MIN_NEEDED_INPUT MIN_NEEDED_FROM
#define MAX_NEEDED_INPUT MAX_NEEDED_FROM
#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
#define LOOPFCT FROM_LOOP
#define BODY \
{ \
uint32_t ch = *inptr; \
\
if (ch >= 0x81 && ch <= 0xfe) \
{ \
/* Two-byte character. First test whether the next character \
is also available. */ \
uint32_t ch2; \
int idx; \
\
if (__builtin_expect (inptr + 1 >= inend, 0)) \
{ \
/* The second character is not available. */ \
result = __GCONV_INCOMPLETE_INPUT; \
break; \
} \
\
ch2 = inptr[1]; \
/* See whether the second byte is in the correct range. */ \
if ((ch2 >= 0x40 && ch2 <= 0x7e) || (ch2 >= 0xa1 && ch2 <= 0xfe)) \
{ \
if (ch >= 0x88) \
{ \
/* Look up the table */ \
idx = (ch - 0x88) * 157 + ch2 - (ch2 <= 0x7e ? 0x40 : 0x62); \
if ((ch = big5_hkscs_to_ucs[idx]) == 0) \
{ \
/* This is illegal. */ \
if (! ignore_errors_p ()) \
{ \
result = __GCONV_ILLEGAL_INPUT; \
break; \
} \
\
++inptr; \
++*irreversible; \
continue; \
} \
} \
else \
{ \
/* 0x81..0x87 in UDA3, currently maps linearly to PUA */ \
ch = (ch - 0x81) * 157 + ch2 - (ch2 <= 0x7e ? 0x40 : 0x62) \
+ 0xeeb8; \
} \
} \
else \
{ \
/* This is illegal. */ \
if (! ignore_errors_p ()) \
{ \
result = __GCONV_ILLEGAL_INPUT; \
break; \
} \
\
++inptr; \
++*irreversible; \
continue; \
} \
\
inptr += 2; \
} \
else if (__builtin_expect (ch, 0) == 0xff) \
{ \
result = __GCONV_ILLEGAL_INPUT; \
break; \
} \
else /* 0x00 to 0x80 */ \
++inptr; \
\
put32 (outptr, ch); \
outptr += 4; \
}
#define LOOP_NEED_FLAGS
#include <iconv/loop.c>
/* Next, define the other direction. */
#define MIN_NEEDED_INPUT MIN_NEEDED_TO
#define MIN_NEEDED_OUTPUT MIN_NEEDED_FROM
#define MAX_NEEDED_OUTPUT MAX_NEEDED_FROM
#define LOOPFCT TO_LOOP
#define BODY \
{ \
uint32_t ch = get32 (inptr); \
const unsigned char *cp = ""; \
unsigned char b5ch[2] = "\0\0"; \
int i; \
\
for (i = 0; \
i < (int) (sizeof (from_ucs4_idx) / sizeof (from_ucs4_idx[0])); \
++i) \
{ \
if (ch < from_ucs4_idx[i].from) \
break; \
if (from_ucs4_idx[i].to >= ch) \
{ \
cp = ucs4_to_big5_hkscs[from_ucs4_idx[i].offset \
+ ch - from_ucs4_idx[i].from]; \
break; \
} \
} \
\
if (ch <= 0x80) \
{ \
b5ch[0] = ch; \
cp = b5ch; \
} \
\
if (cp[0] == '\0' && ch != 0) \
{ \
UNICODE_TAG_HANDLER (ch, 4); \
\
/* Illegal character. */ \
STANDARD_ERR_HANDLER (4); \
} \
else \
{ \
/* See whether there is enough room for the second byte we write. */ \
if (__builtin_expect (cp[1], '\1') != '\0' \
&& __builtin_expect (outptr + 1 >= outend, 0)) \
{ \
/* We have not enough room. */ \
result = __GCONV_FULL_OUTPUT; \
break; \
} \
\
*outptr++ = cp[0]; \
if (cp[1] != '\0') \
*outptr++ = cp[1]; \
} \
\
inptr += 4; \
}
#define LOOP_NEED_FLAGS
#include <iconv/loop.c>
/* Now define the toplevel functions. */
#include <iconv/skeleton.c>
EOT
}

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,55 +0,0 @@
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#include "nsBIG5HKSCSToUnicode.h"
#include "nsUCvTWDll.h"
#include "nsUCConstructors.h"
//----------------------------------------------------------------------
// Global functions and data [declaration]
// Scan classes handed to CreateMultiTableDecoder by
// nsBIG5HKSCSToUnicodeConstructor: one single-byte class followed by five
// two-byte classes. The six entries match the table count of 6 passed there.
// NOTE(review): presumably entry i pairs with entry i of
// g_BIG5HKSCSMappingTableSet and g_BIG5HKSCSRanges -- confirm against
// CreateMultiTableDecoder.
static const uScanClassID g_BIG5HKSCSScanClassIDs[] = {
u1ByteCharset,
u2BytesCharset,
u2BytesCharset,
u2BytesCharset,
u2BytesCharset,
u2BytesCharset
};
// Mapping tables handed to CreateMultiTableDecoder by
// nsBIG5HKSCSToUnicodeConstructor: the ASCII table first, then the HKSCS and
// plain BIG5 tables alternating (HKSCS, BIG5, HKSCS, BIG5, HKSCS). Six
// entries, matching the table count of 6 passed there.
// NOTE(review): presumably entry i is the table consulted for the byte range
// g_BIG5HKSCSRanges[i] -- confirm against CreateMultiTableDecoder.
static const uint16_t *g_BIG5HKSCSMappingTableSet [] ={
g_ASCIIMappingTable,
g_utBig5HKSCSMapping,
g_utBIG5Mapping,
g_utBig5HKSCSMapping,
g_utBIG5Mapping,
g_utBig5HKSCSMapping,
};
// Byte ranges handed to CreateMultiTableDecoder by
// nsBIG5HKSCSToUnicodeConstructor; six entries, matching the table count of 6
// passed there. 0x80 and 0xFF are covered by no range.
// Note that 0xC6 and 0xF9 each appear as the end of one range and the start
// of the next, so those two values fall into two consecutive entries.
// NOTE(review): presumably these are first-byte ranges and the overlap lets
// both the BIG5 and the HKSCS table be tried for those lead bytes -- confirm
// against CreateMultiTableDecoder before changing any boundary.
static const uRange g_BIG5HKSCSRanges[] = {
{ 0x00, 0x7F },
{ 0x81, 0xA0 },
{ 0xA1, 0xC6 },
{ 0xC6, 0xC8 },
{ 0xC9, 0xF9 },
{ 0xF9, 0xFE }
};
//----------------------------------------------------------------------
// Class nsBIG5HKSCSToUnicode [implementation]
nsresult
nsBIG5HKSCSToUnicodeConstructor(nsISupports *aOuter, REFNSIID aIID,
                                void **aResult)
{
  // The three file-level arrays each hold six entries; name the casted
  // views once so the factory call below reads as a plain argument list.
  const uRange* byteRanges = (const uRange* ) &g_BIG5HKSCSRanges;
  uScanClassID* scanClasses = (uScanClassID*) &g_BIG5HKSCSScanClassIDs;
  uMappingTable** mappingTables =
    (uMappingTable**) &g_BIG5HKSCSMappingTableSet;

  // 6 = number of entries in each array; 1 is forwarded unchanged as the
  // fifth argument of CreateMultiTableDecoder.
  return CreateMultiTableDecoder(6,
                                 byteRanges,
                                 scanClasses,
                                 mappingTables,
                                 1,
                                 aOuter, aIID, aResult);
}

View File

@ -1,21 +0,0 @@
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#ifndef nsBIG5HKSCSToUnicode_h___
#define nsBIG5HKSCSToUnicode_h___
#include "nsISupports.h"
/**
* A character set converter from BIG5-HKSCS to Unicode.
*
* The constructor function below builds the converter by calling
* CreateMultiTableDecoder (see nsBIG5HKSCSToUnicode.cpp) and returns that
* call's nsresult.
*
* @param aOuter  forwarded unchanged to CreateMultiTableDecoder
* @param aIID    forwarded unchanged to CreateMultiTableDecoder
* @param aResult forwarded unchanged to CreateMultiTableDecoder
*                (NOTE(review): presumably receives the created object --
*                confirm against CreateMultiTableDecoder)
*
* @created 02/Jul/2000
* @author Gavin Ho, Hong Kong Professional Services, Compaq Computer (Hong Kong) Ltd.
*/
nsresult
nsBIG5HKSCSToUnicodeConstructor(nsISupports *aOuter, REFNSIID aIID,
void **aResult);
#endif /* nsBIG5HKSCSToUnicode_h___ */

View File

@ -4,36 +4,163 @@
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#include "nsBIG5ToUnicode.h"
#include "nsUCvTWDll.h"
#include "nsUCConstructors.h"
#include "mozilla/BinarySearch.h"
#include "mozilla/ArrayUtils.h"
//----------------------------------------------------------------------
// Global functions and data [declaration]
#include "nsBIG5DecoderData.h"
// Scan classes for the table-driven BIG5 decoder: one single-byte class and
// one two-byte class. Two entries, matching the table count of 2 passed to
// CreateMultiTableDecoder.
// NOTE(review): presumably entry i pairs with entry i of
// g_BIG5MappingTableSet and g_BIG5Ranges -- confirm.
static const uScanClassID g_BIG5ScanClassIDs[] = {
u1ByteCharset,
u2BytesCharset
};
// Mapping tables for the table-driven BIG5 decoder: the ASCII table followed
// by the BIG5 table. Two entries, matching the table count of 2 passed to
// CreateMultiTableDecoder.
static const uint16_t *g_BIG5MappingTableSet [] ={
g_ASCIIMappingTable,
g_utBIG5Mapping
};
// Byte ranges for the table-driven BIG5 decoder: 0x00..0x7F and 0x81..0xFE.
// 0x80 and 0xFF are covered by neither range.
// NOTE(review): presumably first-byte ranges, one per mapping table --
// confirm against CreateMultiTableDecoder.
static const uRange g_BIG5Ranges[] = {
{ 0x00, 0x7F },
{ 0x81, 0xFE }
};
nsresult
nsBIG5ToUnicodeConstructor(nsISupports *aOuter, REFNSIID aIID,
void **aResult)
nsBIG5ToUnicode::nsBIG5ToUnicode()
: mPendingTrail(0)
, mBig5Lead(0)
{
return CreateMultiTableDecoder(2,
(const uRange* ) &g_BIG5Ranges,
(uScanClassID*) &g_BIG5ScanClassIDs,
(uMappingTable**) &g_BIG5MappingTableSet, 1,
aOuter, aIID, aResult);
}
// Streaming Big5 -> UTF-16 conversion.
//
// On entry *aSrcLength is the number of bytes available at aSrc and
// *aDestLength the number of UTF-16 code units available at aDest. On every
// return path except the very first (pending trail with a full output
// buffer, which reports 0/0) they are overwritten with the distance `in` and
// `out` have advanced from aSrc and aDest.
//
// State carried between calls:
//   mBig5Lead     - a lead byte (0x81..0xFE) whose trail byte has not been
//                   seen yet; 0 when none.
//   mPendingTrail - second UTF-16 code unit of a two-unit output whose first
//                   unit was written but whose second did not fit; 0 when
//                   none.
//
// Returns:
//   NS_OK                  - input exhausted, no lead byte held.
//   NS_OK_UDEC_MOREINPUT   - input exhausted while a lead byte is held.
//   NS_OK_UDEC_MOREOUTPUT  - output buffer full.
//   NS_ERROR_ILLEGAL_INPUT - only when mErrBehavior == kOnError_Signal; `in`
//                            is rewound before *aSrcLength is computed, so
//                            the reported length can be -1 (see the comment
//                            on the last error path). In any other mode
//                            U+FFFD is written instead and decoding goes on.
NS_IMETHODIMP
nsBIG5ToUnicode::Convert(const char* aSrc,
int32_t* aSrcLength,
char16_t* aDest,
int32_t* aDestLength)
{
// We'll be doing comparisons as unsigned.
const uint8_t* in = reinterpret_cast<const uint8_t*>(aSrc);
const uint8_t* inEnd = in + *aSrcLength;
char16_t* out = aDest;
char16_t* outEnd = out + *aDestLength;
// Flush the code unit left over from the previous call before touching any
// new input.
if (mPendingTrail) {
if (out == outEnd) {
*aSrcLength = 0;
*aDestLength = 0;
return NS_OK_UDEC_MOREOUTPUT;
}
*out++ = mPendingTrail;
mPendingTrail = 0;
}
for (;;) {
// Input exhaustion is tested before output exhaustion, so when both
// buffers run out together the call reports NS_OK / MOREINPUT.
if (in == inEnd) {
*aSrcLength = in - reinterpret_cast<const uint8_t*>(aSrc);
*aDestLength = out - aDest;
return mBig5Lead ? NS_OK_UDEC_MOREINPUT : NS_OK;
}
// From here on at least one output slot is free, which is what makes the
// unconditional first `*out++` writes below safe.
if (out == outEnd) {
*aSrcLength = in - reinterpret_cast<const uint8_t*>(aSrc);
*aDestLength = out - aDest;
return NS_OK_UDEC_MOREOUTPUT;
}
uint8_t b = *in++;
if (!mBig5Lead) {
// No lead byte held: b is ASCII, a new lead byte, or invalid.
if (b <= 0x7F) {
*out++ = (char16_t)b;
continue;
}
if (b >= 0x81 && b <= 0xFE) {
mBig5Lead = b;
continue;
}
// Only 0x80 and 0xFF reach this point.
if (mErrBehavior == kOnError_Signal) {
// Un-read the offending byte so the reported length stops before it.
--in;
*aSrcLength = in - reinterpret_cast<const uint8_t*>(aSrc);
*aDestLength = out - aDest;
return NS_ERROR_ILLEGAL_INPUT;
}
*out++ = 0xFFFD;
continue;
}
// A lead byte is held and b is the candidate trail byte.
size_t lead = mBig5Lead;
mBig5Lead = 0;
// Trail bytes 0x40..0x7E map to 0..62 and 0xA1..0xFE to 63..156, giving
// 157 trail positions per lead byte. The offset is only used when b passes
// the range check below.
size_t offset = (b < 0x7F) ? 0x40 : 0x62;
if ((b >= 0x40 && b <= 0x7E) || (b >= 0xA1 && b <= 0xFE)) {
size_t pointer = (lead - 0x81) * 157L + (b - offset);
char16_t outTrail;
switch (pointer) {
// Four pointers (byte pairs 0x88 0x62, 0x88 0x64, 0x88 0xA3, 0x88 0xA5)
// decode to two code units each: U+00CA or U+00EA followed by U+0304 or
// U+030C.
case 1133:
*out++ = 0x00CA;
outTrail = 0x0304;
break;
case 1135:
*out++ = 0x00CA;
outTrail = 0x030C;
break;
case 1164:
*out++ = 0x00EA;
outTrail = 0x0304;
break;
case 1166:
*out++ = 0x00EA;
outTrail = 0x030C;
break;
default:
// A zero result from LowBits is treated as "no mapping for this pointer".
char16_t lowBits = LowBits(pointer);
if (!lowBits) {
if (b <= 0x7F) {
// prepend byte to stream
// Always legal, since we've always just read a byte
// if we come here.
--in;
}
if (mErrBehavior == kOnError_Signal) {
--in;
*aSrcLength = in - reinterpret_cast<const uint8_t*>(aSrc);
*aDestLength = out - aDest;
return NS_ERROR_ILLEGAL_INPUT;
}
*out++ = 0xFFFD;
continue;
}
if (IsAstral(pointer)) {
// The code point is lowBits with 0x20000 added. 0xD7C0 is
// 0xD800 - (0x10000 >> 10), so adding (codePoint >> 10) yields the high
// surrogate directly.
uint32_t codePoint = uint32_t(lowBits) | 0x20000;
*out++ = char16_t(0xD7C0 + (codePoint >> 10));
outTrail = char16_t(0xDC00 + (codePoint & 0x3FF));
break;
}
// Single code unit output: nothing more to write for this pair.
*out++ = lowBits;
continue;
}
// Reached only via `break` above, i.e. for two-code-unit outputs whose
// first unit has already been written. If the second does not fit, park it
// in mPendingTrail for the next call.
if (out == outEnd) {
mPendingTrail = outTrail;
*aSrcLength = in - reinterpret_cast<const uint8_t*>(aSrc);
*aDestLength = out - aDest;
return NS_OK_UDEC_MOREOUTPUT;
}
*out++ = outTrail;
continue;
}
// pointer is null
if (b <= 0x7F) {
// prepend byte to stream
// Always legal, since we've always just read a byte
// if we come here.
--in;
}
if (mErrBehavior == kOnError_Signal) {
// Moving in one past the start of aSrc is actually OK per API contract,
// since assigning -1 to aSrcLength means that we want the caller to
// record one U+FFFD and repush the same input buffer.
--in;
*aSrcLength = in - reinterpret_cast<const uint8_t*>(aSrc);
*aDestLength = out - aDest;
return NS_ERROR_ILLEGAL_INPUT;
}
*out++ = 0xFFFD;
continue;
}
}
NS_IMETHODIMP
nsBIG5ToUnicode::GetMaxLength(const char* aSrc,
                              int32_t aSrcLength,
                              int32_t* aDestLength)
{
  // Start from one UTF-16 code unit per input byte, then add one unit for
  // each piece of state carried over from an earlier Convert() call.
  int32_t maxUnits = aSrcLength;
  if (mPendingTrail) {
    // A code unit is still waiting to be written out.
    maxUnits = maxUnits + 1;
  }
  if (mBig5Lead) {
    // A lead byte is held and is not counted in aSrcLength.
    maxUnits = maxUnits + 1;
  }
  *aDestLength = maxUnits;
  return NS_OK;
}
NS_IMETHODIMP
nsBIG5ToUnicode::Reset()
{
  // Drop both pieces of cross-call state: the held lead byte and the
  // not-yet-written second code unit.
  mBig5Lead = 0;
  mPendingTrail = 0;
  return NS_OK;
}

View File

@ -6,16 +6,37 @@
#ifndef nsBIG5ToUnicode_h___
#define nsBIG5ToUnicode_h___
#include "nsISupports.h"
#include "nsUCSupport.h"
/**
* A character set converter from BIG5 to Unicode.
*
* @created 06/Apr/1999
* @author Catalin Rotaru [CATA]
*/
nsresult
nsBIG5ToUnicodeConstructor(nsISupports *aOuter, REFNSIID aIID,
void **aResult);
#define NS_BIG5TOUNICODE_CID \
{ 0xefc323e1, 0xec62, 0x11d2, \
{ 0x8a, 0xac, 0x0, 0x60, 0x8, 0x11, 0xa8, 0x36 } }
#define NS_BIG5TOUNICODE_CONTRACTID \
"@mozilla.org/intl/unicode/decoder;1?charset=big5"
// Streaming decoder from Big5 bytes to UTF-16 code units. It keeps two
// pieces of state between Convert() calls (see the private members) so that
// input and output may be split at arbitrary buffer boundaries.
class nsBIG5ToUnicode : public nsBasicDecoderSupport
{
public:
// Starts with no held lead byte and no pending code unit.
nsBIG5ToUnicode();
// Decodes up to *aSrcLength bytes into at most *aDestLength code units and
// writes the consumed/produced counts back through the same pointers.
NS_IMETHOD Convert(const char* aSrc,
int32_t* aSrcLength,
char16_t* aDest,
int32_t* aDestLength);
// Sets *aDestLength to aSrcLength plus one for each piece of carried state
// (pending code unit, held lead byte).
NS_IMETHOD GetMaxLength(const char* aSrc,
int32_t aSrcLength,
int32_t* aDestLength);
// Clears both pieces of carried state.
NS_IMETHOD Reset();
private:
// Table lookups keyed by the Big5 pointer computed in Convert(), which
// treats a zero LowBits result as "no mapping" and, when IsAstral is true,
// forms the code point as LowBits | 0x20000. Defined outside this header.
static char16_t LowBits(size_t aPointer);
static bool IsAstral(size_t aPointer);
// Second UTF-16 code unit of a two-unit output that did not fit in the
// previous output buffer; 0 when nothing is pending.
char16_t mPendingTrail;
// Lead byte (0x81..0xFE) still waiting for its trail byte; 0 when none.
uint8_t mBig5Lead;
};
#endif /* nsBIG5ToUnicode_h___ */

View File

@ -8,24 +8,9 @@
#include "nsISupports.h"
// Class ID for our BIG5ToUnicode charset converter
// {EFC323E1-EC62-11d2-8AAC-00600811A836}
#define NS_BIG5TOUNICODE_CID \
{ 0xefc323e1, 0xec62, 0x11d2, {0x8a, 0xac, 0x0, 0x60, 0x8, 0x11, 0xa8, 0x36}}
// Class ID for our UnicodeToBIG5 charset converter
// {EFC323E2-EC62-11d2-8AAC-00600811A836}
#define NS_UNICODETOBIG5_CID \
{ 0xefc323e2, 0xec62, 0x11d2, {0x8a, 0xac, 0x0, 0x60, 0x8, 0x11, 0xa8, 0x36}}
// Class ID for our BIG5HKSCSToUnicode charset converter
// {BA6151BB-EC62-11d2-8AAC-00600811A836}
#define NS_BIG5HKSCSTOUNICODE_CID \
{ 0xba6151bb, 0xec62, 0x11d2, {0x8a, 0xac, 0x0, 0x60, 0x8, 0x11, 0xa8, 0x36}}
// Class ID for our UnicodeToBIG5HKSCS charset converter
// {BA6151BC-EC62-11d2-8AAC-00600811A836}
#define NS_UNICODETOBIG5HKSCS_CID \
{ 0xba6151bc, 0xec62, 0x11d2, {0x8a, 0xac, 0x0, 0x60, 0x8, 0x11, 0xa8, 0x36}}
#endif /* nsUCvTWCID_h___ */

View File

@ -7,9 +7,6 @@
#define nsUCvTWDll_h_
extern const uint16_t g_ufBig5Mapping[];
extern const uint16_t g_utBIG5Mapping[];
extern const uint16_t g_ASCIIMappingTable[];
extern const uint16_t g_ufBig5HKSCSMapping[];
extern const uint16_t g_utBig5HKSCSMapping[];
#endif /* nsUCvTWDll_h_ */

View File

@ -1,36 +0,0 @@
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#include "nsUnicodeToBIG5HKSCS.h"
#include "nsUCvTWDll.h"
#include "nsUCConstructors.h"
//----------------------------------------------------------------------
// Global functions and data [declaration]
nsresult
nsUnicodeToBIG5HKSCSConstructor(nsISupports *aOuter, REFNSIID aIID,
                                void **aResult)
{
  // Three tables: ASCII, then the BIG5 table, then the HKSCS table.
  static const uint16_t *kMappingTables[] = {
    g_ASCIIMappingTable,
    g_ufBig5Mapping,
    g_ufBig5HKSCSMapping
  };

  // One scan class per table above: single-byte for ASCII, two-byte for
  // the other two.
  static const uScanClassID kScanClasses[] = {
    u1ByteCharset,
    u2BytesCharset,
    u2BytesCharset
  };

  uScanClassID* scanClasses = (uScanClassID*) &kScanClasses;
  uMappingTable** mappingTables = (uMappingTable**) &kMappingTables;

  // 3 = number of tables; 2 = max length factor (output is at most
  // src * 2).
  return CreateMultiTableEncoder(3,
                                 scanClasses,
                                 mappingTables,
                                 2,
                                 aOuter, aIID, aResult);
}

View File

@ -1,21 +0,0 @@
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#ifndef nsUnicodeToBIG5HKSCS_h___
#define nsUnicodeToBIG5HKSCS_h___
#include "nsISupports.h"
/**
* A character set converter from Unicode to BIG5-HKSCS.
*
* The constructor function below builds the converter by calling
* CreateMultiTableEncoder with three mapping tables (ASCII, BIG5, HKSCS) and
* a maximum output length of twice the source length (see
* nsUnicodeToBIG5HKSCS.cpp), and returns that call's nsresult.
*
* @param aOuter  forwarded unchanged to CreateMultiTableEncoder
* @param aIID    forwarded unchanged to CreateMultiTableEncoder
* @param aResult forwarded unchanged to CreateMultiTableEncoder
*                (NOTE(review): presumably receives the created object --
*                confirm against CreateMultiTableEncoder)
*
* @created 02/Jul/2000
* @author Gavin Ho, Hong Kong Professional Services, Compaq Computer (Hong Kong) Ltd.
*/
nsresult
nsUnicodeToBIG5HKSCSConstructor(nsISupports *aOuter, REFNSIID aIID,
void **aResult);
#endif /* nsUnicodeToBIG5HKSCS_h___ */