gecko-dev/intl/uconv/tools/gen-big5hkscs-2001-mozilla.pl
shanjian%netscape.com fd06eca99b #182089 Extended hkscs.uf and hkscs.ut for HKSCS-2001 support
Tools written by Anthony (anthony@thizlinux.com) to generate the new conversion table for
hkscs. Not part of the build, just put here for future reference.
2002-12-13 00:47:38 +00:00

960 lines
30 KiB
Perl
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/perl -w
#
# gen-big5hkscs-2001-mozilla.pl
# a Perl script that generates Big5-HKSCS <-> Unicode
# conversion tables for Mozilla
#
# Author (of the original Perl script):
# Anthony Fok <anthony@thizlinux.com> <foka@debian.org>
# Copyright (C) 2001, 2002 ThizLinux Laboratory Ltd.
# License: GNU General Public License, v2 or later.
#
# This version includes original C source code from
# glibc-2.2.5/iconvdata/big5hkscs.c by Ulrich Drepper <drepper@redhat.com>
# Roger So <roger.so@sw-linux.com>
#
# First attempt for Qt-2.3.x: 2001-09-21
# A working version for Qt-2.3.x: 2001-10-30
# Ported to glibc-2.2.5 with HKSCS-2001: 2002-03-21
# Adapted to generate conversion tables for Mozilla: 2002-11-26
# Adapted to generate conversion tables for Mozilla: 2002-11-30
# Cleaned up the script somewhat: 2002-12-04
# Minor revisions for submitting to Mozilla Bugzilla: 2002-12-10
#
# Notes:
#
# 1. The latest version of this script may be found in:
# http://www.thizlinux.com/~anthony/hkscs/gen-glibc-big5hkscs.pl
# http://people.debian.org/~foka/hkscs/gen-glibc-big5hkscs.pl
# Or, better yet, e-mail me and ask for the latest version.
#
# 2. This script generates data from 3 tables:
# a. http://www.microsoft.com/typography/unicode/950.txt
# b. http://www.info.gov.hk/digital21/chi/hkscs/download/big5-iso.txt
# c. http://www.info.gov.hk/digital21/chi/hkscs/download/big5cmp.txt
#
# Make sure your big5-iso.txt is the latest HKSCS-2001 version.
#
# 3. [glibc]: I have currently split the ucs_to_big5_hkscs_?[] tables into
# different areas similar to the way Ulrich and Roger did it,
# but extended for HKSCS-2001.
#
# 4. [Mozilla]: This script is very quick-and-dirty in some places.
# Call either gen_mozilla_uf() or gen_mozilla_ut() to generate
# the appropriate tables for feeding into "fromu" or "tou".
#
# 5. [CharMapML]: The comments regarding TW-BIG5 herein need to be organized.
# Also, please make sure "$hkscs_mode = 0;" for TW-BIG5 mode.
# Otherwise, this script would generate a HKSCS table.
# (Yes, I know, I should clean up this script and make it more modular,
# and with command-line options or whatnot. I'll do that later. :-)
#
# If you have any questions or concerns, please feel free to contact me
# at Anthony Fok <anthony@thizlinux.com> or <foka@debian.org> :-)
#
# Last but not least, special thanks to ThizLinux Laboratory Ltd. (HK)
# for their generous support in this work.
#
# 1. UDA3, 0x8840 - 0x8dfe
# 2. UDA2, 0x8e40 - 0xa0fe
# 3. VDA, 0xc6a1 - 0xc8fe
#use Getopt::Std;
my ( %b2u, %u2b, $unicode, $big5, $high, $low, $i, $count );
my $debug = 0;
my $hkscs_mode = 1;
my $kangxi = 0;
my $use_range = 0;
my $bmp_only = 1;
#
# Subroutine Declaration
#
sub read_cp950();
sub adjust_radicals();
sub read_hkscs_main();
sub read_hkscs_cmp();
sub post_tuning();
sub gen_charmapml();
sub gen_check_b2u();
sub gen_check_u2b();
sub gen_mozilla_uf();
sub gen_mozilla_ut();
sub gen_glibc();
###########################################################################
#
# Main program
#
# First, read Microsoft's CP950 as base Big5.
read_cp950 ();
# Add mappings to Kangxi Radicals.
# The b2u direction is added only if $kangxi is not null.
adjust_radicals ();
# Then, read the HKSCS table.
# Again, see the $hkscs_mode variable.
read_hkscs_main ();
read_hkscs_cmp () if $hkscs_mode;
post_tuning ();
# Then, choose one of the following:
#gen_charmapml();
gen_mozilla_uf();
#gen_mozilla_ut();
#gen_check_u2b();
#gen_glibc();
# End of program
exit 0;
#############################################################################
#
# Subroutines
#
sub read_cp950() {
open( CP950, "950.txt" ) or die;
my $mode = 0;
while (<CP950>) {
s/\r//;
chomp;
next if /^$/;
last if /^ENDCODEPAGE/;
if (/^DBCSTABLE (\d+)\s+;LeadByte = 0x([0-9a-f]{2})/) {
$mode = 1;
( $count, $high ) = ( $1, $2 );
$i = 0;
next;
}
if (/^WCTABLE (\d+)/) {
$mode = 2;
$count = $1;
$i = 0;
next;
}
next if $mode == 0;
if ( $mode == 1 ) {
( $low, $unicode, $comment ) = split "\t";
$low =~ s/^0x//;
$unicode =~ s/^0x//;
$big5 = $high . $low;
$b2u{ uc($big5) } = uc($unicode);
if ( ++$i == $count ) { $mode = 0; $count = 0; next; }
}
if ( $mode == 2 ) {
( $unicode, $big5, $comment ) = split "\t";
$unicode =~ s/^0x//;
$big5 =~ s/^0x//;
my $u = hex($unicode);
my $b = hex($big5);
$u2b{ uc($unicode) } = uc($big5) unless
# Skip Microsoft's over-generous (or over-zealous?) mappings
# "Faked" accented latin characters
( $b <= 0xFF and $b != $u )
# "Faked" Ideographic Annotation ___ Mark
or ( $u >= 0x3192 and $u <= 0x319F )
# "Faked" Parenthesized Ideograph ___
or ( $u >= 0x3220 and $u <= 0x3243 )
# "Faked" Circled Ideograph ___ except Circled Ideograph Correct
or ( $u >= 0x3280 and $u <= 0x32B0 and $u != 0x32A3 )
# <20>F<EFBFBD>G<EFBFBD>D<EFBFBD><44><EFBFBD>g<EFBFBD>M
or ( $u == 0xA2
or $u == 0xA3
or $u == 0xA5
or $u == 0xB4
or $u == 0xB5
or $u == 0xB8 )
# <20>¢w<C2A2><77><EFBFBD>E<EFBFBD><45><EFBFBD><EFBFBD><EFBFBD>âB<C3A2>X<EFBFBD>ݡ[<5B><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
or ( $u == 0x0305 # ???
or $u == 0x2015
or $u == 0x2016
or $u == 0x2022
or $u == 0x2024
or $u == 0x2033
or $u == 0x203E # ???
or $u == 0x2216
or $u == 0x2218
or $u == 0x2263
or $u == 0x2307
or $u == 0x2609
or $u == 0x2641
or $u == 0x301C
or $u == 0x3030 )
# <20>s<EFBFBD><73><EFBFBD>N
or ( $u == 0xFF3E or $u == 0xFF40 or $u == 0xFF64 );
if ( ++$i == $count ) { $mode = 0; $count = 0; next; }
}
}
}
sub adjust_radicals() {
# B5+C6BF - B5+C6D7: Radicals (?)
# TW-BIG5 drafted by Autrijus uses Kangxi Radicals whenever possible.
#
# Big5-HKSCS tends towards using the character in Unicode CJK Ideographs
# Note that HKSCS does not explicitly define
# B5+C6CF, B5+C6D3, B5+C6D5, B5+C6D7 (<28>ϡB<CFA1>ӡB<D3A1>աB<D5A1><42>),
# but do have these characters at B5+FBFD, B5+FCD3, B5+FEC1, B5+90C4,
# mapped to U+5EF4, U+65E0, U+7676, U+96B6 respectively.
#
# As for B5+C6CD (<28><>), HKSCS maps it to U+2F33 just like TW-BIG5.
# However, it also maps B5+FBF4 (<28><>) to U+5E7A.
$b2u{"C6BF"} = "2F02" if $kangxi;
$u2b{"2F02"} = "C6BF"; # ƿ
$b2u{"C6C0"} = "2F03" if $kangxi;
$u2b{"2F03"} = "C6C0"; # <20><>
$b2u{"C6C1"} = "2F05" if $kangxi;
$u2b{"2F05"} = "C6C1"; # <20><>
$b2u{"C6C2"} = "2F07" if $kangxi;
$u2b{"2F07"} = "C6C2"; # <20><>
$b2u{"C6C3"} = "2F0C" if $kangxi;
$u2b{"2F0C"} = "C6C3"; # <20><>
$b2u{"C6C4"} = "2F0D" if $kangxi;
$u2b{"2F0D"} = "C6C4"; # <20><>
$b2u{"C6C5"} = "2F0E" if $kangxi;
$u2b{"2F0E"} = "C6C5"; # <20><>
$b2u{"C6C6"} = "2F13" if $kangxi;
$u2b{"2F13"} = "C6C6"; # <20><>
$b2u{"C6C7"} = "2F16" if $kangxi;
$u2b{"2F16"} = "C6C7"; # <20><>
$b2u{"C6C8"} = "2F19" if $kangxi;
$u2b{"2F19"} = "C6C8"; # <20><>
$b2u{"C6C9"} = "2F1B" if $kangxi;
$u2b{"2F1B"} = "C6C9"; # <20><>
$b2u{"C6CA"} = "2F22" if $kangxi;
$u2b{"2F22"} = "C6CA"; # <20><>
$b2u{"C6CB"} = "2F27" if $kangxi;
$u2b{"2F27"} = "C6CB"; # <20><>
$b2u{"C6CC"} = "2F2E" if $kangxi;
$u2b{"2F2E"} = "C6CC"; # <20><>
$b2u{"C6CD"} = "2F33" if $kangxi;
$u2b{"2F33"} = "C6CD"; # <20><>
$b2u{"C6CE"} = "2F34" if $kangxi;
$u2b{"2F34"} = "C6CE"; # <20><>
$b2u{"C6CF"} = "2F35" if $kangxi;
$u2b{"2F35"} = "C6CF"; # <20><>
$b2u{"C6D0"} = "2F39" if $kangxi;
$u2b{"2F39"} = "C6D0"; # <20><>
$b2u{"C6D1"} = "2F3A" if $kangxi;
$u2b{"2F3A"} = "C6D1"; # <20><>
$b2u{"C6D2"} = "2F41" if $kangxi;
$u2b{"2F41"} = "C6D2"; # <20><>
$b2u{"C6D3"} = "2F46" if $kangxi;
$u2b{"2F46"} = "C6D3"; # <20><>
$b2u{"C6D4"} = "2F67" if $kangxi;
$u2b{"2F67"} = "C6D4"; # <20><>
$b2u{"C6D5"} = "2F68" if $kangxi;
$u2b{"2F68"} = "C6D5"; # <20><>
$b2u{"C6D6"} = "2FA1" if $kangxi;
$u2b{"2FA1"} = "C6D6"; # <20><>
$b2u{"C6D7"} = "2FAA" if $kangxi;
$u2b{"2FAA"} = "C6D7"; # <20><>
}
sub read_hkscs_main() {
open( B2U, "<big5-iso.txt" ) or die;
while (<B2U>) {
next
unless
/([[:xdigit:]]{4})\s+([[:xdigit:]]{4})\s+([[:xdigit:]]{4})\s+([[:xdigit:]]{4,5})/;
( $big5, $iso1993, $iso2000, $iso2001 ) = ( $1, $2, $3, $4 );
my $b = hex($big5);
# For non-HKSCS mode, only take data in the VDA range (?)
next unless $hkscs_mode
# Note that we don't go from B5+C6A1-B5+C6FE, but rather only
# C6A1-C8D3 excluding C6BF-C6D7 (Kangxi Radicals)
# because C8D4-C8FE are not assigned in TW-BIG5
# if we are to follow Arphic PL Big-5 fonts. (To be discussed)
or
( $b >= 0xC6A1 && $b <= 0xC8D3 and !( $b >= 0xC6BF && $b <= 0xC6D7 ) )
or ( $b >= 0xF9D6 && $b <= 0xF9FE );
print STDERR
"B2U, 2000: $big5 redefined from U+$b2u{$big5} to U+$iso2000.\n"
if $debug
and defined( $b2u{$big5} )
and $b2u{$big5} ne $iso2000;
$b2u{$big5} = $bmp_only ? $iso2000 : $iso2001
unless !$hkscs_mode
and $b == 0xF9FE;
# B5+F9FE is mapped differently in TW-BIG5 and HKSCS, to
# U+2593 (Dark Shade) and U+FFED (Halfwidth Black Square) respectively.
# Which is more correct? I don't know! (To be discussed)
print STDERR
"1993: U+$iso1993 redefined from $u2b{$iso1993} to $big5.\n"
if $debug
and defined( $u2b{$iso1993} )
and $u2b{$iso1993} ne $big5;
$u2b{$iso1993} = $big5;
print STDERR
"2000: U+$iso2000 redefined from $u2b{$iso2000} to $big5.\n"
if $debug
and defined( $u2b{$iso2000} )
and $u2b{$iso2000} ne $big5;
$u2b{$iso2000} = $big5;
print STDERR
"2001: U+$iso2001 redefined from $u2b{$iso2001} to $big5.\n"
if $debug
and defined( $u2b{$iso2001} )
and $u2b{$iso2001} ne $big5;
$u2b{$iso2001} = $big5;
}
close B2U;
} # read_hkscs_main()
sub read_hkscs_cmp() {
###########################################################################
# Add Big5 compatibility coding...
#
# Stephan, here is the code segment that you may want to implement
# in your convertbig5hkscs2001.pl
#
open( B5CMP, "<big5cmp.txt" ) or die;
$mode = 0;
while (<B5CMP>) {
if (/^=====/) { $mode = 1; next; }
next if $mode == 0;
last if $mode == 1 and /^\s+/;
chomp;
my ( $big5cmp, $big5 ) = split " ";
$big5cmp = uc($big5cmp);
$big5 = uc($big5);
my $uni = $b2u{$big5};
my $unicmp = $b2u{$big5cmp};
print STDERR
"Was: U+$unicmp -> $u2b{$unicmp}, $big5cmp -> U+$b2u{$big5cmp}\t"
if $debug;
$b2u{$big5cmp} = $uni;
$u2b{$unicmp} = $big5;
print STDERR
"Now: U+$unicmp -> $u2b{$unicmp}, $big5cmp -> U+$b2u{$big5cmp}\n"
if $debug;
}
close B5CMP;
} # read_hkscs_cmp();
sub post_tuning() {
# And finally, fine-tuning...
for $i ( 0x00 .. 0x80 ) {
$big5 = $unicode = sprintf( "%04X", $i );
$b2u{$big5} = $unicode;
}
# Add Euro '<27><>' (I wonder why this 950.txt doesn't have it.)
$b2u{"A3E1"} = "20AC";
$u2b{"20AC"} = "A3E1";
# Box drawing characters:
# Align with Big-5E (To be discussed, as it differs from CP950 and HKSCS)
# (To be discussed)
if ( !$hkscs_mode ) {
$u2b{"2550"} = "A2A4"; # Big5: <20><> (also B5-F9F9)
$u2b{"255E"} = "A2A5"; # Big5: <20><> (also B5-F9E9)
$u2b{"2561"} = "A2A7"; # Big5: <20><> (also B5-F9EB)
$u2b{"256A"} = "A2A6"; # Big5: <20><> (also B5-F9EA)
$u2b{"256D"} = "A27E"; # Big5: <20>~ (also B5-F9FA)
$u2b{"256E"} = "A2A1"; # Big5: <20><> (also B5-F9FB)
$u2b{"256F"} = "A2A3"; # Big5: <20><> (also B5-F9FD)
$u2b{"2570"} = "A2A2"; # Big5: <20><> (also B5-F9FC)
}
# "Hangzhou" or "Suzhou" Chinese numerals 10, 20, 30 (<28>̢͢<CCA2>)
# (To be discussed)
if ( !$hkscs_mode ) {
$b2u{"A2CC"} = "3038";
$u2b{"3038"} = "A2CC";
$b2u{"A2CD"} = "3039";
$u2b{"3039"} = "A2CD";
$b2u{"A2CE"} = "303A";
$u2b{"303A"} = "A2CE";
}
# The character for ethnic group "Yi" (<28>U):
# (To be discussed)
$u2b{"5F5E"} = "C255"; # Always add this.
if ( !$hkscs_mode ) {
$b2u{"C255"} = "5F5E";
}
} # post_tuning()
sub gen_charmapml() {
###########################################################################
#
# Codes for generating CharMapML XML file
print <<EOT;
<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE characterMapping SYSTEM "http://www.unicode.org/unicode/reports/tr22/CharacterMapping.dtd">
EOT
if ($hkscs_mode) {
print <<EOT;
<characterMapping id="big5-hkscs-2001" version="1">
<history>
<modified version="1" date="2002-11-30">
Trial version generated from 950.txt + part of big5-iso.txt (HKSCS-2001)
with Euro added, with CP950's excessive fub (fallbacks uni->big5) removed,
and with some other manual tweaking.
</modified>
</history>
EOT
}
else {
print <<EOT;
<characterMapping id="tw-big5-2002" version="1">
<history>
<modified version="1" date="2002-11-30">
Trial version generated from 950.txt + part of big5-iso.txt (HKSCS-2001)
with Euro added, with CP950's excessive fub (fallbacks uni->big5) removed,
and with some other manual tweaking.
</modified>
</history>
EOT
}
print <<EOT;
<validity>
<state type="FIRST" next="VALID" s="0" e="80" max="FFFF"/>
<state type="FIRST" next="SECOND" s="81" e="FE" max="FFFF"/>
<state type="SECOND" next="VALID" s="40" e="7E" max="FFFF"/>
<state type="SECOND" next="VALID" s="A1" e="FE" max="FFFF"/>
</validity>
<assignments sub="3F">
EOT
print " <!-- One to one mappings -->\n";
for $unicode ( sort { hex($a) <=> hex($b) } keys %u2b ) {
$big5 = $u2b{$unicode};
$u = hex($unicode);
next
unless defined( $b2u{$big5} )
and $unicode eq $b2u{$big5}
and
not( $use_range and !$hkscs_mode and $u >= 0xE000 && $u <= 0xF6B0 );
printf " <a u=\"%04X\" ", $u;
if ( hex($big5) <= 0xFF ) {
printf "b=\"%02X\"/>\n", hex($big5);
}
else {
printf "b=\"%s %s\"/>\n", substr( $big5, 0, 2 ),
substr( $big5, 2, 2 );
}
}
print " <!-- Fallback mappings from Unicode to bytes -->\n";
for $unicode ( sort { hex($a) <=> hex($b) } keys %u2b ) {
$big5 = $u2b{$unicode};
next if defined( $b2u{$big5} ) and hex($unicode) == hex( $b2u{$big5} );
if ( $unicode eq "F900" ) {
print " <!-- CJK Compatibility Ideographs: U+F900 - U+FA6A.\n";
print
" These are included in CP950 (Unicode->Big5 direction only).\n";
print " Should we include this area in TW-BIG5 or not? -->\n";
}
printf " <fub u=\"%04X\" b=\"%s %s\"/>\n", hex($unicode),
substr( $big5, 0, 2 ), substr( $big5, 2, 2 );
}
my %fbu;
print " <!-- Fallback mappings from bytes to Unicode -->\n";
for $big5 ( sort { hex($a) <=> hex($b) } keys %b2u ) {
$unicode = $b2u{$big5};
if ( !defined( $u2b{$unicode} ) or hex($big5) != hex( $u2b{$unicode} ) )
{
$fbu{$unicode} = $big5;
}
}
for $unicode ( sort { hex($a) <=> hex($b) } keys %fbu ) {
$big5 = $fbu{$unicode};
printf " <fbu u=\"%04X\" b=\"%s %s\"/>\n", hex($unicode),
substr( $big5, 0, 2 ), substr( $big5, 2, 2 );
}
if ( $use_range and !$hkscs_mode ) {
print <<EOT;
<!-- Roundtrip-mappings that can be enumerated
Note: We can only use the <range> tag for TW-BIG5.
Big-5E and Big5-HKSCS have assigned characters in these areas,
and we will have to use the <a> and <fub> tags instead.
-->
<!-- User-Defined Area 1 (UDA1) -->
<range uFirst="E000" uLast="E310" bFirst="FA 40" bLast="FE FE" bMin="81 40" bMax="FE FE"/>
<!-- User-Defined Area 2 (UDA2) -->
<range uFirst="E311" uLast="EEB7" bFirst="8E 40" bLast="A0 FE" bMin="81 40" bMax="FE FE"/>
<!-- User-Defined Area 3 (UDA3) -->
<range uFirst="EEB8" uLast="F6B0" bFirst="81 40" bLast="8D FE" bMin="81 40" bMax="FE FE"/>
EOT
}
print <<EOT;
</assignments>
</characterMapping>
EOT
} # gen_charmapml()
sub gen_check_b2u() {
###########################################################################
#
# Codes for generating a raw table for verification and testing
#
# #print $u2b{"F7D1"}, "\n";
# print $b2u{$u2b{"F7D1"}}, "\n";
# print "FA59 -> U+", $b2u{"FA59"}, "\n";
foreach $big5 ( sort { hex($a) <=> hex($b) } keys %b2u ) {
$unicode = $b2u{$big5};
$big5 =~ s/^00//;
print "U+", $unicode, ": ", $big5, "\n";
}
}
sub gen_check_u2b() {
foreach $unicode ( sort { hex($a) <=> hex($b) } keys %u2b ) {
$big5 = $u2b{$unicode};
$big5 =~ s/^00//;
print "U+", $unicode, ": ", $big5, "\n";
}
}
###########################################################################
#
# Codes for generating hkscs.ut and hkscs.uf files for Mozilla
#
sub gen_mozilla_uf() {
# hkscs.uf
foreach $unicode ( sort keys %u2b ) {
$big5 = $u2b{$unicode};
my $b = hex($big5);
print "0x", uc($big5), "\t0x", uc($unicode), "\n"
unless ( $b >= 0xA140 and $b <= 0xC6A0 )
or ( $b >= 0xC940 and $b <= 0xF9D5 )
or ( $b < 0x8140 )
or ( hex($unicode) > 0xFFFF );
}
}
sub gen_mozilla_ut() {
# hkscs.ut
foreach $big5 ( sort keys %b2u ) {
my $b = hex($big5);
print "0x", uc($big5), "\t0x", uc( $b2u{$big5} ), "\n"
unless ( $b >= 0xA140 and $b <= 0xC6A0 )
or ( $b < 0x8140 )
or ( $b >= 0xC940 and $b <= 0xF9D5 );
}
}
###########################################################################
sub gen_glibc() {
##########################################################################
#
# Generate index for UCS4 to Big5-HKSCS conversion table
#
@index_array = ();
$mode = 0;
$count = 0;
for ( $uni = 0x81 ; $uni <= 0x2FFFF ; $uni++ ) {
$unicode = sprintf( "%04X", $uni );
# print " /* U+$unicode */\t" if $low % 4 == 0;
if ( defined( $u2b{$unicode} ) ) {
if ( $mode == 0 ) {
$range_start = $range_end = $uni;
# printf " { %7s, ", sprintf("0x%04X", $range_start);
$mode = 1;
}
else {
$range_end = $uni;
}
}
elsif ( $mode == 1 and ( $uni - $range_end ) >= 0x80 ) {
# Start a new range if the gap is 0x80 or larger
# printf "%7s, %5d },\n", sprintf("0x%04X", $range_end), $count;
push @index_array, [ ( $range_start, $range_end, $count ) ];
$count += $range_end - $range_start + 1;
$mode = 0;
}
}
#
# Note that $count and $range_end are used again as global variables
# below
#
###########################################################################
#
# Start generating real C code...
#
print <<'EOT';
/* Mapping tables for Big5-HKSCS handling.
Copyright (C) 1997, 1998, 2000, 2001, 2002 Free Software Foundation, Inc.
This file is part of the GNU C Library.
Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
Modified for Big5-HKSCS by Roger So <roger.so@sw-linux.com>, 2000.
Updated for HKSCS-2001 by James Su <suzhe@turbolinux.com.cn>
and Anthony Fok <anthony@thizlinux.com>, 2002
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, write to the Free
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA. */
#include <dlfcn.h>
#include <gconv.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <wchar.h>
/* Table for Big5-HKSCS to UCS conversion.
Original comments by Roger So when he updated the tables for HKSCS-1999:
With HKSCS mappings 0x8140-0xA0FE and 0xFA40-0xFEFE added; more info:
http://www.digital21.gov.hk/eng/hkscs/index.html
- spacehunt 07/01/2000
The BIG5-HKSCS mapping tables are generated from 950.txt, big5-iso.txt
and big5cmp.txt using a Perl script while merging C source code from
other developers. A copy of the source Perl script is available at:
http://www.thizlinux.com/~anthony/hkscs/gen-glibc-big5hkscs.pl
http://people.debian.org/~foka/hkscs/gen-glibc-big5hkscs.pl
Revisions:
2001-10-30 made codec for Qt
2002-03-21 ported to glibc-2.2.5 and added HKSCS-2001
Todo:
Use a hash for characters beyond BMP to save space and make it
more efficient
- Anthony Fok <anthony@thizlinux.com> 21 Mar 2002
On behalf of ThizLinux Laboratory Ltd., Hong Kong SAR, China
*/
EOT
##########################################################################
#
# Generate Big5-HKSCS to Unicode conversion table
#
## print "Big5HKSCS to Unicode\n";
# for $high (0x81..0x8d, 0x8e..0xa0, 0xc6..0xc8, 0xf9, 0xfa..0xfe) {
$high_start = 0x88;
$high_end = 0xfe;
print "static const uint16_t big5_hkscs_to_ucs[";
print( ( $high_end - $high_start + 1 ) * 157 );
print "] =\n{\n";
for $high ( 0x88 .. 0xfe ) {
for $low ( 0x40 .. 0x7e, 0xa1 .. 0xfe ) {
if ( $low == 0x40 ) {
print "\n" unless $high == $high_start;
printf
"\t/* Big5-HKSCS 0x%02X40..0x%02X7E, 0x%02XA1..0x%02XFE */\n",
$high, $high, $high, $high;
}
elsif ( $low == 0xa1 ) {
print "\t\t";
}
$big5 = sprintf( "%02X%02X", $high, $low );
print "\t" if $low % 8 == 0;
if ( defined( $b2u{$big5} ) ) {
$unicode = $b2u{$big5};
print "0x", $unicode, ",";
}
else {
print "0x0000,"; # for glibc
}
print( ( $low % 8 == 7 or $low == 0x7e or $low == 0xfe )
? "\n"
: "\t" );
}
}
print "};\n\n";
##########################################################################
#
# Generate Unicode to Big5-HKSCS conversion table
#
print "static const unsigned char ucs4_to_big5_hkscs[$count][2] =\n{\n";
foreach $index (@index_array) {
( $start, $end ) = ( @$index[0], @$index[1] );
printf( " /* U+%04X */\t", $start ) if ( $start % 4 != 0 );
print "\t" x ( ( $start % 4 ) * 1.5 ) . " " x ( $start % 2 );
for ( $i = $start ; $i <= $end ; $i++ ) {
printf( " /* U+%04X */\t", $i ) if ( $i % 4 == 0 );
$unicode = sprintf( "%04X", $i );
if ( defined( $big5 = $u2b{$unicode} ) ) {
if ( $big5 =~ /^00/ ) {
print '"\x', substr( $big5, 2, 2 ), '\x00",';
}
else {
print '"\x', substr( $big5, 0, 2 ), '\x',
substr( $big5, 2, 2 ), '",';
}
}
else {
print '"\x00\x00",';
}
print( ( $i % 4 == 3 ) ? "\n" : " " ) unless $i == $end;
}
print $end == $range_end ? "\n" : "\n\n";
}
print "};\n\n";
###########################################################################
print <<EOT;
static struct
{
/* Note: We are going to split this table so that we can use
uint16_t for "from" and "to" again. Anthony Fok, 2002-03-21 */
uint32_t from;
uint32_t to;
uint32_t offset;
} from_ucs4_idx[] =
{
EOT
foreach $index (@index_array) {
printf " { %7s, %7s, %5d },\n", sprintf( "0x%04X", @$index[0] ),
sprintf( "0x%04X", @$index[1] ), @$index[2];
}
print "};\n\n";
#foreach $i (sort keys %b2u) {
# print $b2u{$i} . ' ';
#}
print <<'EOT';
/* Definitions used in the body of the `gconv' function. */
#define CHARSET_NAME "BIG5HKSCS//"
#define FROM_LOOP from_big5
#define TO_LOOP to_big5
#define DEFINE_INIT 1
#define DEFINE_FINI 1
#define MIN_NEEDED_FROM 1
#define MAX_NEEDED_FROM 2
#define MIN_NEEDED_TO 4
/* First define the conversion function from Big5-HKSCS to UCS4. */
#define MIN_NEEDED_INPUT MIN_NEEDED_FROM
#define MAX_NEEDED_INPUT MAX_NEEDED_FROM
#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
#define LOOPFCT FROM_LOOP
#define BODY \
{ \
uint32_t ch = *inptr; \
\
if (ch >= 0x81 && ch <= 0xfe) \
{ \
/* Two-byte character. First test whether the next character \
is also available. */ \
uint32_t ch2; \
int idx; \
\
if (__builtin_expect (inptr + 1 >= inend, 0)) \
{ \
/* The second character is not available. */ \
result = __GCONV_INCOMPLETE_INPUT; \
break; \
} \
\
ch2 = inptr[1]; \
/* See whether the second byte is in the correct range. */ \
if ((ch2 >= 0x40 && ch2 <= 0x7e) || (ch2 >= 0xa1 && ch2 <= 0xfe)) \
{ \
if (ch >= 0x88) \
{ \
/* Look up the table */ \
idx = (ch - 0x88) * 157 + ch2 - (ch2 <= 0x7e ? 0x40 : 0x62); \
if ((ch = big5_hkscs_to_ucs[idx]) == 0) \
{ \
/* This is illegal. */ \
if (! ignore_errors_p ()) \
{ \
result = __GCONV_ILLEGAL_INPUT; \
break; \
} \
\
++inptr; \
++*irreversible; \
continue; \
} \
} \
else \
{ \
/* 0x81..0x87 in UDA3, currently maps linearly to PUA */ \
ch = (ch - 0x81) * 157 + ch2 - (ch2 <= 0x7e ? 0x40 : 0x62) \
+ 0xeeb8; \
} \
} \
else \
{ \
/* This is illegal. */ \
if (! ignore_errors_p ()) \
{ \
result = __GCONV_ILLEGAL_INPUT; \
break; \
} \
\
++inptr; \
++*irreversible; \
continue; \
} \
\
inptr += 2; \
} \
else if (__builtin_expect (ch, 0) == 0xff) \
{ \
result = __GCONV_ILLEGAL_INPUT; \
break; \
} \
else /* 0x00 to 0x80 */ \
++inptr; \
\
put32 (outptr, ch); \
outptr += 4; \
}
#define LOOP_NEED_FLAGS
#include <iconv/loop.c>
/* Next, define the other direction. */
#define MIN_NEEDED_INPUT MIN_NEEDED_TO
#define MIN_NEEDED_OUTPUT MIN_NEEDED_FROM
#define MAX_NEEDED_OUTPUT MAX_NEEDED_FROM
#define LOOPFCT TO_LOOP
#define BODY \
{ \
uint32_t ch = get32 (inptr); \
const unsigned char *cp = ""; \
unsigned char b5ch[2] = "\0\0"; \
int i; \
\
for (i = 0; \
i < (int) (sizeof (from_ucs4_idx) / sizeof (from_ucs4_idx[0])); \
++i) \
{ \
if (ch < from_ucs4_idx[i].from) \
break; \
if (from_ucs4_idx[i].to >= ch) \
{ \
cp = ucs4_to_big5_hkscs[from_ucs4_idx[i].offset \
+ ch - from_ucs4_idx[i].from]; \
break; \
} \
} \
\
if (ch <= 0x80) \
{ \
b5ch[0] = ch; \
cp = b5ch; \
} \
\
if (cp[0] == '\0' && ch != 0) \
{ \
UNICODE_TAG_HANDLER (ch, 4); \
\
/* Illegal character. */ \
STANDARD_ERR_HANDLER (4); \
} \
else \
{ \
/* See whether there is enough room for the second byte we write. */ \
if (__builtin_expect (cp[1], '\1') != '\0' \
&& __builtin_expect (outptr + 1 >= outend, 0)) \
{ \
/* We have not enough room. */ \
result = __GCONV_FULL_OUTPUT; \
break; \
} \
\
*outptr++ = cp[0]; \
if (cp[1] != '\0') \
*outptr++ = cp[1]; \
} \
\
inptr += 4; \
}
#define LOOP_NEED_FLAGS
#include <iconv/loop.c>
/* Now define the toplevel functions. */
#include <iconv/skeleton.c>
EOT
}