mirror of
https://github.com/mozilla/gecko-dev.git
synced 2025-02-16 05:45:33 +00:00
move to intl/unicharutil
This commit is contained in:
parent
2e166878ba
commit
b6f97e8826
@ -1,28 +0,0 @@
|
|||||||
#!gmake
|
|
||||||
# The contents of this file are subject to the Netscape Public License
|
|
||||||
# Version 1.0 (the "NPL"); you may not use this file except in
|
|
||||||
# compliance with the NPL. You may obtain a copy of the NPL at
|
|
||||||
# http://www.mozilla.org/NPL/
|
|
||||||
#
|
|
||||||
# Software distributed under the NPL is distributed on an "AS IS" basis,
|
|
||||||
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the NPL
|
|
||||||
# for the specific language governing rights and limitations under the
|
|
||||||
# NPL.
|
|
||||||
#
|
|
||||||
# The Initial Developer of this code under the NPL is Netscape
|
|
||||||
# Communications Corporation. Portions created by Netscape are
|
|
||||||
# Copyright (C) 1998 Netscape Communications Corporation. All Rights
|
|
||||||
# Reserved.
|
|
||||||
|
|
||||||
# Build setup for this directory. DEPTH is the relative path used below to
# locate config/autoconf.mk.
DEPTH = ../..
|
|
||||||
# NOTE(review): @top_srcdir@ and @srcdir@ look like configure-time
# substitution placeholders -- confirm how this template is instantiated.
topsrcdir = @top_srcdir@
|
|
||||||
VPATH = @srcdir@
|
|
||||||
srcdir = @srcdir@
|
|
||||||
|
|
||||||
include $(DEPTH)/config/autoconf.mk
|
|
||||||
|
|
||||||
# Subdirectories of this module: public and src are set first, then tests and
# tools are appended. How DIRS is consumed is defined by the included
# rules.mk, which is not visible here.
DIRS = public src
|
|
||||||
|
|
||||||
DIRS += tests tools
|
|
||||||
|
|
||||||
# Shared rules come from the source tree (topsrcdir), not from DEPTH.
include $(topsrcdir)/config/rules.mk
|
|
@ -1,21 +0,0 @@
|
|||||||
#!nmake
|
|
||||||
#
|
|
||||||
# The contents of this file are subject to the Netscape Public License
|
|
||||||
# Version 1.0 (the "NPL"); you may not use this file except in
|
|
||||||
# compliance with the NPL. You may obtain a copy of the NPL at
|
|
||||||
# http://www.mozilla.org/NPL/
|
|
||||||
#
|
|
||||||
# Software distributed under the NPL is distributed on an "AS IS" basis,
|
|
||||||
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the NPL
|
|
||||||
# for the specific language governing rights and limitations under the
|
|
||||||
# NPL.
|
|
||||||
#
|
|
||||||
# The Initial Developer of this code under the NPL is Netscape
|
|
||||||
# Communications Corporation. Portions created by Netscape are
|
|
||||||
# Copyright (C) 1998 Netscape Communications Corporation. All Rights
|
|
||||||
# Reserved.
|
|
||||||
# Relative path used below to locate config\rules.mak.
DEPTH=..\..
|
|
||||||
# NOTE(review): IGNORE_MANIFEST is not used in this file; presumably it is
# read by the included rules.mak -- confirm.
IGNORE_MANIFEST=1
|
|
||||||
|
|
||||||
# Subdirectories of this module. How DIRS is consumed is defined by
# rules.mak, which is not visible here.
DIRS= public src tools tests
|
|
||||||
include <$(DEPTH)\config\rules.mak>
|
|
@ -1,25 +0,0 @@
|
|||||||
|
|
||||||
|
|
||||||
1. This directory has not been turned on in the build system yet.
|
|
||||||
2. Currently, the implementation of the nsICaseConversion calls the
|
|
||||||
UCData package developed by Mark Leisher <mleisher@crl.nmsu.edu>.
|
|
||||||
I have not checked the package into the mozilla tree yet. You can get
|
|
||||||
the package from
|
|
||||||
ftp://crl.nmsu.edu/CLR/multiling/unicode/ucdata.tar.gz
|
|
||||||
or
|
|
||||||
ftp://crl.nmsu.edu/CLR/multiling/unicode/ucdata.zip
|
|
||||||
|
|
||||||
After you get the package, you need to put those files into different
|
|
||||||
places under the Mozilla tree until we check them in.
|
|
||||||
|
|
||||||
a. put ucdata.c and ucdata.h into mozilla/modules/unicharutil/src
|
|
||||||
b. put MUTTUCData.txt and ucgendat.c into mozilla/modules/unicharutil/tools
|
|
||||||
|
|
||||||
You also need the Unicode database from the Unicode FTP site
|
|
||||||
ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData-Latest.txt
|
|
||||||
|
|
||||||
Put it under mozilla/modules/unicharutil/tools
|
|
||||||
|
|
||||||
3. The Windows makefile works. I am still working on the Mac and UNIX builds.
|
|
||||||
|
|
||||||
Frank Tang <ftang@netscape.com>
|
|
@ -1,208 +0,0 @@
|
|||||||
#
|
|
||||||
# $Id: MUTTUCData.txt,v 1.1 1999/01/06 01:46:03 ftang%netscape.com Exp $
|
|
||||||
#
|
|
||||||
# Copyright 1996, 1997, 1998 Computing Research Labs,
|
|
||||||
# New Mexico State University
|
|
||||||
#
|
|
||||||
# Permission is hereby granted, free of charge, to any person obtaining a
|
|
||||||
# copy of this software and associated documentation files (the "Software"),
|
|
||||||
# to deal in the Software without restriction, including without limitation
|
|
||||||
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
||||||
# and/or sell copies of the Software, and to permit persons to whom the
|
|
||||||
# Software is furnished to do so, subject to the following conditions:
|
|
||||||
#
|
|
||||||
# The above copyright notice and this permission notice shall be included in
|
|
||||||
# all copies or substantial portions of the Software.
|
|
||||||
#
|
|
||||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
||||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
||||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
|
||||||
# THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
|
|
||||||
# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
|
|
||||||
# OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
|
||||||
# THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
||||||
#
|
|
||||||
#
|
|
||||||
# Implementation specific character properties.
|
|
||||||
#
|
|
||||||
#
|
|
||||||
# Space, other.
|
|
||||||
#
|
|
||||||
0009;;Ss;;;;;;;;;;;;
|
|
||||||
000A;;Ss;;;;;;;;;;;;
|
|
||||||
000B;;Ss;;;;;;;;;;;;
|
|
||||||
000C;;Ss;;;;;;;;;;;;
|
|
||||||
000D;;Ss;;;;;;;;;;;;
|
|
||||||
#
|
|
||||||
# Non-breaking.
|
|
||||||
#
|
|
||||||
00A0;;Nb;;;;;;;;;;;;
|
|
||||||
2007;;Nb;;;;;;;;;;;;
|
|
||||||
2011;;Nb;;;;;;;;;;;;
|
|
||||||
FEFF;;Nb;;;;;;;;;;;;
|
|
||||||
#
|
|
||||||
# Symmetric.
|
|
||||||
#
|
|
||||||
0028;;Sy;;;;;;;;;;;;
|
|
||||||
0029;;Sy;;;;;;;;;;;;
|
|
||||||
005B;;Sy;;;;;;;;;;;;
|
|
||||||
005D;;Sy;;;;;;;;;;;;
|
|
||||||
007B;;Sy;;;;;;;;;;;;
|
|
||||||
007D;;Sy;;;;;;;;;;;;
|
|
||||||
00AB;;Sy;;;;;;;;;;;;
|
|
||||||
00BB;;Sy;;;;;;;;;;;;
|
|
||||||
0F3A;;Sy;;;;;;;;;;;;
|
|
||||||
0F3B;;Sy;;;;;;;;;;;;
|
|
||||||
0F3C;;Sy;;;;;;;;;;;;
|
|
||||||
0F3D;;Sy;;;;;;;;;;;;
|
|
||||||
0F3E;;Sy;;;;;;;;;;;;
|
|
||||||
0F3F;;Sy;;;;;;;;;;;;
|
|
||||||
2018;;Sy;;;;;;;;;;;;
|
|
||||||
2019;;Sy;;;;;;;;;;;;
|
|
||||||
201A;;Sy;;;;;;;;;;;;
|
|
||||||
201B;;Sy;;;;;;;;;;;;
|
|
||||||
201C;;Sy;;;;;;;;;;;;
|
|
||||||
201D;;Sy;;;;;;;;;;;;
|
|
||||||
201E;;Sy;;;;;;;;;;;;
|
|
||||||
201F;;Sy;;;;;;;;;;;;
|
|
||||||
2039;;Sy;;;;;;;;;;;;
|
|
||||||
203A;;Sy;;;;;;;;;;;;
|
|
||||||
2045;;Sy;;;;;;;;;;;;
|
|
||||||
2046;;Sy;;;;;;;;;;;;
|
|
||||||
207D;;Sy;;;;;;;;;;;;
|
|
||||||
207E;;Sy;;;;;;;;;;;;
|
|
||||||
208D;;Sy;;;;;;;;;;;;
|
|
||||||
208E;;Sy;;;;;;;;;;;;
|
|
||||||
2329;;Sy;;;;;;;;;;;;
|
|
||||||
232A;;Sy;;;;;;;;;;;;
|
|
||||||
3008;;Sy;;;;;;;;;;;;
|
|
||||||
3009;;Sy;;;;;;;;;;;;
|
|
||||||
300A;;Sy;;;;;;;;;;;;
|
|
||||||
300B;;Sy;;;;;;;;;;;;
|
|
||||||
300C;;Sy;;;;;;;;;;;;
|
|
||||||
300D;;Sy;;;;;;;;;;;;
|
|
||||||
300E;;Sy;;;;;;;;;;;;
|
|
||||||
300F;;Sy;;;;;;;;;;;;
|
|
||||||
3010;;Sy;;;;;;;;;;;;
|
|
||||||
3011;;Sy;;;;;;;;;;;;
|
|
||||||
3014;;Sy;;;;;;;;;;;;
|
|
||||||
3015;;Sy;;;;;;;;;;;;
|
|
||||||
3016;;Sy;;;;;;;;;;;;
|
|
||||||
3017;;Sy;;;;;;;;;;;;
|
|
||||||
3018;;Sy;;;;;;;;;;;;
|
|
||||||
3019;;Sy;;;;;;;;;;;;
|
|
||||||
301A;;Sy;;;;;;;;;;;;
|
|
||||||
301B;;Sy;;;;;;;;;;;;
|
|
||||||
301D;;Sy;;;;;;;;;;;;
|
|
||||||
301E;;Sy;;;;;;;;;;;;
|
|
||||||
FD3E;;Sy;;;;;;;;;;;;
|
|
||||||
FD3F;;Sy;;;;;;;;;;;;
|
|
||||||
FE35;;Sy;;;;;;;;;;;;
|
|
||||||
FE36;;Sy;;;;;;;;;;;;
|
|
||||||
FE37;;Sy;;;;;;;;;;;;
|
|
||||||
FE38;;Sy;;;;;;;;;;;;
|
|
||||||
FE39;;Sy;;;;;;;;;;;;
|
|
||||||
FE3A;;Sy;;;;;;;;;;;;
|
|
||||||
FE3B;;Sy;;;;;;;;;;;;
|
|
||||||
FE3C;;Sy;;;;;;;;;;;;
|
|
||||||
FE3D;;Sy;;;;;;;;;;;;
|
|
||||||
FE3E;;Sy;;;;;;;;;;;;
|
|
||||||
FE3F;;Sy;;;;;;;;;;;;
|
|
||||||
FE40;;Sy;;;;;;;;;;;;
|
|
||||||
FE41;;Sy;;;;;;;;;;;;
|
|
||||||
FE42;;Sy;;;;;;;;;;;;
|
|
||||||
FE43;;Sy;;;;;;;;;;;;
|
|
||||||
FE44;;Sy;;;;;;;;;;;;
|
|
||||||
FE59;;Sy;;;;;;;;;;;;
|
|
||||||
FE5A;;Sy;;;;;;;;;;;;
|
|
||||||
FE5B;;Sy;;;;;;;;;;;;
|
|
||||||
FE5C;;Sy;;;;;;;;;;;;
|
|
||||||
FE5D;;Sy;;;;;;;;;;;;
|
|
||||||
FE5E;;Sy;;;;;;;;;;;;
|
|
||||||
FF08;;Sy;;;;;;;;;;;;
|
|
||||||
FF09;;Sy;;;;;;;;;;;;
|
|
||||||
FF3B;;Sy;;;;;;;;;;;;
|
|
||||||
FF3D;;Sy;;;;;;;;;;;;
|
|
||||||
FF5B;;Sy;;;;;;;;;;;;
|
|
||||||
FF5D;;Sy;;;;;;;;;;;;
|
|
||||||
FF62;;Sy;;;;;;;;;;;;
|
|
||||||
FF63;;Sy;;;;;;;;;;;;
|
|
||||||
#
|
|
||||||
# Hex digit.
|
|
||||||
#
|
|
||||||
0030;;Hd;;;;;;;;;;;;
|
|
||||||
0031;;Hd;;;;;;;;;;;;
|
|
||||||
0032;;Hd;;;;;;;;;;;;
|
|
||||||
0033;;Hd;;;;;;;;;;;;
|
|
||||||
0034;;Hd;;;;;;;;;;;;
|
|
||||||
0035;;Hd;;;;;;;;;;;;
|
|
||||||
0036;;Hd;;;;;;;;;;;;
|
|
||||||
0037;;Hd;;;;;;;;;;;;
|
|
||||||
0038;;Hd;;;;;;;;;;;;
|
|
||||||
0039;;Hd;;;;;;;;;;;;
|
|
||||||
0041;;Hd;;;;;;;;;;;;
|
|
||||||
0042;;Hd;;;;;;;;;;;;
|
|
||||||
0043;;Hd;;;;;;;;;;;;
|
|
||||||
0044;;Hd;;;;;;;;;;;;
|
|
||||||
0045;;Hd;;;;;;;;;;;;
|
|
||||||
0046;;Hd;;;;;;;;;;;;
|
|
||||||
0061;;Hd;;;;;;;;;;;;
|
|
||||||
0062;;Hd;;;;;;;;;;;;
|
|
||||||
0063;;Hd;;;;;;;;;;;;
|
|
||||||
0064;;Hd;;;;;;;;;;;;
|
|
||||||
0065;;Hd;;;;;;;;;;;;
|
|
||||||
0066;;Hd;;;;;;;;;;;;
|
|
||||||
FF10;;Hd;;;;;;;;;;;;
|
|
||||||
FF11;;Hd;;;;;;;;;;;;
|
|
||||||
FF12;;Hd;;;;;;;;;;;;
|
|
||||||
FF13;;Hd;;;;;;;;;;;;
|
|
||||||
FF14;;Hd;;;;;;;;;;;;
|
|
||||||
FF15;;Hd;;;;;;;;;;;;
|
|
||||||
FF16;;Hd;;;;;;;;;;;;
|
|
||||||
FF17;;Hd;;;;;;;;;;;;
|
|
||||||
FF18;;Hd;;;;;;;;;;;;
|
|
||||||
FF19;;Hd;;;;;;;;;;;;
|
|
||||||
FF21;;Hd;;;;;;;;;;;;
|
|
||||||
FF22;;Hd;;;;;;;;;;;;
|
|
||||||
FF23;;Hd;;;;;;;;;;;;
|
|
||||||
FF24;;Hd;;;;;;;;;;;;
|
|
||||||
FF25;;Hd;;;;;;;;;;;;
|
|
||||||
FF26;;Hd;;;;;;;;;;;;
|
|
||||||
FF41;;Hd;;;;;;;;;;;;
|
|
||||||
FF42;;Hd;;;;;;;;;;;;
|
|
||||||
FF43;;Hd;;;;;;;;;;;;
|
|
||||||
FF44;;Hd;;;;;;;;;;;;
|
|
||||||
FF45;;Hd;;;;;;;;;;;;
|
|
||||||
FF46;;Hd;;;;;;;;;;;;
|
|
||||||
#
|
|
||||||
# Quote marks.
|
|
||||||
#
|
|
||||||
0022;;Qm;;;;;;;;;;;;
|
|
||||||
0027;;Qm;;;;;;;;;;;;
|
|
||||||
00AB;;Qm;;;;;;;;;;;;
|
|
||||||
00BB;;Qm;;;;;;;;;;;;
|
|
||||||
2018;;Qm;;;;;;;;;;;;
|
|
||||||
2019;;Qm;;;;;;;;;;;;
|
|
||||||
201A;;Qm;;;;;;;;;;;;
|
|
||||||
201B;;Qm;;;;;;;;;;;;
|
|
||||||
201C;;Qm;;;;;;;;;;;;
|
|
||||||
201D;;Qm;;;;;;;;;;;;
|
|
||||||
201E;;Qm;;;;;;;;;;;;
|
|
||||||
201F;;Qm;;;;;;;;;;;;
|
|
||||||
2039;;Qm;;;;;;;;;;;;
|
|
||||||
203A;;Qm;;;;;;;;;;;;
|
|
||||||
300C;;Qm;;;;;;;;;;;;
|
|
||||||
300D;;Qm;;;;;;;;;;;;
|
|
||||||
300E;;Qm;;;;;;;;;;;;
|
|
||||||
300F;;Qm;;;;;;;;;;;;
|
|
||||||
301D;;Qm;;;;;;;;;;;;
|
|
||||||
301E;;Qm;;;;;;;;;;;;
|
|
||||||
301F;;Qm;;;;;;;;;;;;
|
|
||||||
FE41;;Qm;;;;;;;;;;;;
|
|
||||||
FE42;;Qm;;;;;;;;;;;;
|
|
||||||
FE43;;Qm;;;;;;;;;;;;
|
|
||||||
FE44;;Qm;;;;;;;;;;;;
|
|
||||||
FF02;;Qm;;;;;;;;;;;;
|
|
||||||
FF07;;Qm;;;;;;;;;;;;
|
|
||||||
FF62;;Qm;;;;;;;;;;;;
|
|
||||||
FF63;;Qm;;;;;;;;;;;;
|
|
@ -1,58 +0,0 @@
|
|||||||
#!gmake
|
|
||||||
#
|
|
||||||
# The contents of this file are subject to the Netscape Public License
|
|
||||||
# Version 1.0 (the "NPL"); you may not use this file except in
|
|
||||||
# compliance with the NPL. You may obtain a copy of the NPL at
|
|
||||||
# http://www.mozilla.org/NPL/
|
|
||||||
#
|
|
||||||
# Software distributed under the NPL is distributed on an "AS IS" basis,
|
|
||||||
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the NPL
|
|
||||||
# for the specific language governing rights and limitations under the
|
|
||||||
# NPL.
|
|
||||||
#
|
|
||||||
# The Initial Developer of this code under the NPL is Netscape
|
|
||||||
# Communications Corporation. Portions created by Netscape are
|
|
||||||
# Copyright (C) 1998 Netscape Communications Corporation. All Rights
|
|
||||||
# Reserved.
|
|
||||||
|
|
||||||
# Build setup for the ucgendat tool. DEPTH is the relative path used below to
# locate config/autoconf.mk.
DEPTH = ../../..
|
|
||||||
# NOTE(review): @top_srcdir@ and @srcdir@ look like configure-time
# substitution placeholders -- confirm how this template is instantiated.
topsrcdir = @top_srcdir@
|
|
||||||
VPATH = @srcdir@
|
|
||||||
srcdir = @srcdir@
|
|
||||||
|
|
||||||
include $(DEPTH)/config/autoconf.mk
|
|
||||||
|
|
||||||
MODULE = ucgendat
|
|
||||||
|
|
||||||
# C sources for this directory: a single file, ucgendat.c. The list is closed
# with $(NULL) so that every real entry can end in a continuation backslash.
CSRCS = \
|
|
||||||
ucgendat.c \
|
|
||||||
$(NULL)
|
|
||||||
|
|
||||||
include $(topsrcdir)/config/config.mk
|
|
||||||
|
|
||||||
# XXX replace with proper configure test for glib
|
|
||||||
# Appends the compiler flags printed by `glib-config --cflags`.
CFLAGS += $(shell glib-config --cflags)
|
|
||||||
|
|
||||||
# XXX need configure test
|
|
||||||
# Extra link libraries printed by `glib-config --libs`. This is a recursive
# (=) assignment, so the shell command is re-run each time EX_LIBS is expanded.
EX_LIBS = $(shell glib-config --libs)
|
|
||||||
|
|
||||||
# The program built here, placed under $(OBJDIR) (defined outside this file).
PROGS = $(OBJDIR)/ucgendat
|
|
||||||
|
|
||||||
TARGETS= $(PROGS)
|
|
||||||
|
|
||||||
include $(topsrcdir)/config/rules.mk
|
|
||||||
|
|
||||||
# Link rule: builds $(PROGS) from $(OBJS). The first recipe line expands
# $(MAKE_OBJDIR), which is defined outside this file (presumably it creates
# the object directory -- confirm). The second links the objects with $(CC),
# adding $(LD_FLAGS), the glib libraries in $(EX_LIBS), and $(OS_LIBS).
$(PROGS): $(OBJS)
|
|
||||||
@$(MAKE_OBJDIR)
|
|
||||||
$(CC) -o $@ $(OBJS) $(LD_FLAGS) $(EX_LIBS) $(OS_LIBS)
|
|
||||||
|
|
||||||
# export:: is declared with no prerequisites and no recipe in this file.
# NOTE(review): presumably a placeholder so the shared double-colon export
# target exists for this directory -- confirm against rules.mk.
export::
|
|
||||||
|
|
||||||
# install:: first requires $(TARGETS), then uses $(INSTALL) to place the
# program in $(DIST)/bin and the ctype.dat and case.dat files from
# $(srcdir)/data in $(DIST)/bin/res/unicharutil.
install:: $(TARGETS)
|
|
||||||
$(INSTALL) $(PROGS) $(DIST)/bin
|
|
||||||
$(INSTALL) $(srcdir)/data/ctype.dat $(DIST)/bin/res/unicharutil
|
|
||||||
$(INSTALL) $(srcdir)/data/case.dat $(DIST)/bin/res/unicharutil
|
|
||||||
|
|
||||||
# clobber:: deletes the ucgendat copy under $(DIST)/bin, then the locally
# built program and its object files.
# NOTE(review): the .dat files placed in $(DIST)/bin/res/unicharutil by
# install:: are not removed here -- confirm whether that is intentional.
clobber::
|
|
||||||
rm -f $(DIST)/bin/ucgendat
|
|
||||||
rm -f $(PROGS) $(OBJS)
|
|
@ -1,207 +0,0 @@
|
|||||||
#
|
|
||||||
# $Id: UCDATAREADME.txt,v 1.1 1999/01/06 01:46:03 ftang%netscape.com Exp $
|
|
||||||
#
|
|
||||||
|
|
||||||
MUTT UCData Package 1.9
|
|
||||||
-----------------------
|
|
||||||
|
|
||||||
This is a package that supports ctype-like operations for Unicode UCS-2 text
|
|
||||||
(and surrogates), case mapping, and decomposition lookup. To use it, you will
|
|
||||||
need to get the "UnicodeData-2.0.14.txt" (or later) file from the Unicode Web
|
|
||||||
or FTP site.
|
|
||||||
|
|
||||||
This package consists of two parts:
|
|
||||||
|
|
||||||
1. A program called "ucgendat" which generates five data files from the
|
|
||||||
UnicodeData-2.*.txt file. The files are:
|
|
||||||
|
|
||||||
A. case.dat - the case mappings.
|
|
||||||
B. ctype.dat - the character property tables.
|
|
||||||
C. decomp.dat - the character decompositions.
|
|
||||||
D. cmbcl.dat - the non-zero combining classes.
|
|
||||||
E. num.dat - the codes representing numbers.
|
|
||||||
|
|
||||||
2. The "ucdata.[ch]" files which implement the functions needed to
|
|
||||||
check to see if a character matches groups of properties, to map between
|
|
||||||
upper, lower, and title case, to look up the decomposition of a
|
|
||||||
character, look up the combining class of a character, and get the number
|
|
||||||
value of a character.
|
|
||||||
|
|
||||||
A short reference to the functions available is in the "api.txt" file.
|
|
||||||
|
|
||||||
Techie Details
|
|
||||||
==============
|
|
||||||
|
|
||||||
The "ucgendat" program parses files from the command line which are all in the
|
|
||||||
Unicode Character Database (UCDB) format. An additional properties file,
|
|
||||||
"MUTTUCData.txt", provides some extra properties for some characters.
|
|
||||||
|
|
||||||
The program looks for the two character properties fields (2 and 4), the
|
|
||||||
combining class field (3), the decomposition field (5), the numeric value
|
|
||||||
field (8), and the case mapping fields (12, 13, and 14). The decompositions
|
|
||||||
are recursively expanded before being written out.
|
|
||||||
|
|
||||||
The decomposition table contains all the canonical decompositions. This means
|
|
||||||
all decompositions that do not have tags such as "<compat>" or "<font>".
|
|
||||||
|
|
||||||
The data is almost all stored as unsigned longs (32-bits assumed) and the
|
|
||||||
routines that load the data take care of endian swaps when necessary. This
|
|
||||||
also means that surrogates (>= 0x10000) can be placed in the data files the
|
|
||||||
"ucgendat" program parses.
|
|
||||||
|
|
||||||
The data is written as external files and broken into five parts so it can be
|
|
||||||
selectively updated at runtime if necessary.
|
|
||||||
|
|
||||||
The data files currently generated from the "ucgendat" program total about 56K
|
|
||||||
in size all together.
|
|
||||||
|
|
||||||
The format of the binary data files is documented in the "format.txt" file.
|
|
||||||
|
|
||||||
Mark Leisher <mleisher@crl.nmsu.edu>
|
|
||||||
13 December 1998
|
|
||||||
|
|
||||||
CHANGES
|
|
||||||
=======
|
|
||||||
|
|
||||||
Version 1.9
|
|
||||||
-----------
|
|
||||||
1. Fixed a problem with an incorrect amount of storage being allocated for the
|
|
||||||
combining class nodes.
|
|
||||||
|
|
||||||
2. Fixed an invalid initialization in the number code.
|
|
||||||
|
|
||||||
3. Changed the Java template file formatting a bit.
|
|
||||||
|
|
||||||
4. Added tables and function for getting decompositions in the Java class.
|
|
||||||
|
|
||||||
Version 1.8
|
|
||||||
-----------
|
|
||||||
1. Fixed a problem with adding certain ranges.
|
|
||||||
|
|
||||||
2. Added two more macros for testing for identifiers.
|
|
||||||
|
|
||||||
3. Tested with the UnicodeData-2.1.5.txt file.
|
|
||||||
|
|
||||||
Version 1.7
|
|
||||||
-----------
|
|
||||||
1. Fixed a problem with looking up decompositions in "ucgendat."
|
|
||||||
|
|
||||||
Version 1.6
|
|
||||||
-----------
|
|
||||||
1. Added two new properties introduced with UnicodeData-2.1.4.txt.
|
|
||||||
|
|
||||||
2. Changed the "ucgendat.c" program a little to automatically align the
|
|
||||||
property data on a 4-byte boundary when new properties are added.
|
|
||||||
|
|
||||||
3. Changed the "ucgendat.c" programs to only generate canonical
|
|
||||||
decompositions.
|
|
||||||
|
|
||||||
4. Added two new macros ucisinitialpunct() and ucisfinalpunct() to check for
|
|
||||||
initial and final punctuation characters.
|
|
||||||
|
|
||||||
5. Minor additions and changes to the documentation.
|
|
||||||
|
|
||||||
Version 1.5
|
|
||||||
-----------
|
|
||||||
1. Changed all file open calls to include binary mode with "b" for DOS/WIN
|
|
||||||
platforms.
|
|
||||||
|
|
||||||
2. Wrapped the unistd.h include so it won't be included when compiled under
|
|
||||||
Win32.
|
|
||||||
|
|
||||||
3. Fixed a bad range check for hex digits in ucgendat.c.
|
|
||||||
|
|
||||||
4. Fixed a bad endian swap for combining classes.
|
|
||||||
|
|
||||||
5. Added code to make a number table and associated lookup functions.
|
|
||||||
Functions added are ucnumber(), ucdigit(), and ucgetnumber(). The last
|
|
||||||
function is to maintain compatibility with John Cowan's "uctype" package.
|
|
||||||
|
|
||||||
Version 1.4
|
|
||||||
-----------
|
|
||||||
1. Fixed a bug with adding a range.
|
|
||||||
|
|
||||||
2. Fixed a bug with inserting a range in order.
|
|
||||||
|
|
||||||
3. Fixed incorrectly specified ucisdefined() and ucisundefined() macros.
|
|
||||||
|
|
||||||
4. Added the missing unload for the combining class data.
|
|
||||||
|
|
||||||
5. Fixed a bad macro placement in ucisweak().
|
|
||||||
|
|
||||||
Version 1.3
|
|
||||||
-----------
|
|
||||||
1. Bug with case mapping calculations fixed.
|
|
||||||
|
|
||||||
2. Bug with empty character property entries fixed.
|
|
||||||
|
|
||||||
3. Bug with incorrect type in the combining class lookup fixed.
|
|
||||||
|
|
||||||
4. Some corrections done to api.txt.
|
|
||||||
|
|
||||||
5. Bug in certain character property lookups fixed.
|
|
||||||
|
|
||||||
6. Added a character property table that records the defined characters.
|
|
||||||
|
|
||||||
7. Replaced ucisunknown() with ucisdefined() and ucisundefined().
|
|
||||||
|
|
||||||
Version 1.2
|
|
||||||
-----------
|
|
||||||
1. Added code to ucgendat to generate a combining class table.
|
|
||||||
|
|
||||||
2. Fixed an endian problem with the byte count of decompositions.
|
|
||||||
|
|
||||||
3. Fixed some minor problems in the "format.txt" file.
|
|
||||||
|
|
||||||
4. Removed some bogus "Ss" values from MUTTUCData.txt file.
|
|
||||||
|
|
||||||
5. Added API function to get combining class.
|
|
||||||
|
|
||||||
6. Changed the open mode to "rb" so binary data files will be opened correctly
|
|
||||||
on DOS/WIN as well as other platforms.
|
|
||||||
|
|
||||||
7. Added the "api.txt" file.
|
|
||||||
|
|
||||||
Version 1.1
|
|
||||||
-----------
|
|
||||||
1. Added ucisxdigit() which I overlooked.
|
|
||||||
|
|
||||||
2. Added UC_LT to the ucisalpha() macro which I overlooked.
|
|
||||||
|
|
||||||
3. Change uciscntrl() to include UC_CF.
|
|
||||||
|
|
||||||
4. Added ucisocntrl() and ucfntcntrl() macros.
|
|
||||||
|
|
||||||
5. Added a ucisblank() which I overlooked.
|
|
||||||
|
|
||||||
6. Added missing properties to ucissymbol() and ucisnumber().
|
|
||||||
|
|
||||||
7. Added ucisgraph() and ucisprint().
|
|
||||||
|
|
||||||
8. Changed the "Mr" property to "Sy" to mark this subset of mirroring
|
|
||||||
characters as symmetric to avoid trampling the Unicode/ISO10646 sense of
|
|
||||||
mirroring.
|
|
||||||
|
|
||||||
9. Added another property called "Ss" which includes control characters
|
|
||||||
traditionally seen as spaces in the isspace() macro.
|
|
||||||
|
|
||||||
10. Added a bunch of macros to be API compatible with John Cowan's package.
|
|
||||||
|
|
||||||
ACKNOWLEDGEMENTS
|
|
||||||
================
|
|
||||||
|
|
||||||
Thanks go to John Cowan <cowan@locke.ccil.org> for pointing out lots of
|
|
||||||
missing things and giving me stuff, particularly a bunch of new macros.
|
|
||||||
|
|
||||||
Thanks go to Bob Verbrugge <bob_verbrugge@nl.compuware.com> for pointing out
|
|
||||||
various bugs.
|
|
||||||
|
|
||||||
Thanks go to Christophe Pierret <cpierret@businessobjects.com> for pointing
|
|
||||||
out that file modes need to have "b" for DOS/WIN machines, pointing out
|
|
||||||
unistd.h is not a Win 32 header, and pointing out a problem with ucisalnum().
|
|
||||||
|
|
||||||
Thanks go to Kent Johnson <kent@pondview.mv.com> for finding a bug that caused
|
|
||||||
incomplete decompositions to be generated by the "ucgendat" program.
|
|
||||||
|
|
||||||
Thanks go to Valeriy E. Ushakov <uwe@ptc.spbu.ru> for spotting an allocation
|
|
||||||
error and an initialization error.
|
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -1,243 +0,0 @@
|
|||||||
#
|
|
||||||
# $Id: format.txt,v 1.1 1999/01/06 01:46:03 ftang%netscape.com Exp $
|
|
||||||
#
|
|
||||||
|
|
||||||
CHARACTER DATA
|
|
||||||
==============
|
|
||||||
|
|
||||||
This package generates some data files that contain character properties useful
|
|
||||||
for text processing.
|
|
||||||
|
|
||||||
CHARACTER PROPERTIES
|
|
||||||
====================
|
|
||||||
|
|
||||||
The first data file is called "ctype.dat" and contains a compressed form of
|
|
||||||
the character properties found in the Unicode Character Database (UCDB).
|
|
||||||
Additional properties can be specified in limited UCDB format in another file
|
|
||||||
to avoid modifying the original UCDB.
|
|
||||||
|
|
||||||
The following is a property name and code table to be used with the character
|
|
||||||
data:
|
|
||||||
|
|
||||||
NAME CODE DESCRIPTION
|
|
||||||
---------------------
|
|
||||||
Mn 0 Mark, Non-Spacing
|
|
||||||
Mc 1 Mark, Spacing Combining
|
|
||||||
Me 2 Mark, Enclosing
|
|
||||||
Nd 3 Number, Decimal Digit
|
|
||||||
Nl 4 Number, Letter
|
|
||||||
No 5 Number, Other
|
|
||||||
Zs 6 Separator, Space
|
|
||||||
Zl 7 Separator, Line
|
|
||||||
Zp 8 Separator, Paragraph
|
|
||||||
Cc 9 Other, Control
|
|
||||||
Cf 10 Other, Format
|
|
||||||
Cs 11 Other, Surrogate
|
|
||||||
Co 12 Other, Private Use
|
|
||||||
Cn 13 Other, Not Assigned
|
|
||||||
Lu 14 Letter, Uppercase
|
|
||||||
Ll 15 Letter, Lowercase
|
|
||||||
Lt 16 Letter, Titlecase
|
|
||||||
Lm 17 Letter, Modifier
|
|
||||||
Lo 18 Letter, Other
|
|
||||||
Pc 19 Punctuation, Connector
|
|
||||||
Pd 20 Punctuation, Dash
|
|
||||||
Ps 21 Punctuation, Open
|
|
||||||
Pe 22 Punctuation, Close
|
|
||||||
Po 23 Punctuation, Other
|
|
||||||
Sm 24 Symbol, Math
|
|
||||||
Sc 25 Symbol, Currency
|
|
||||||
Sk 26 Symbol, Modifier
|
|
||||||
So 27 Symbol, Other
|
|
||||||
L 28 Left-To-Right
|
|
||||||
R 29 Right-To-Left
|
|
||||||
EN 30 European Number
|
|
||||||
ES 31 European Number Separator
|
|
||||||
ET 32 European Number Terminator
|
|
||||||
AN 33 Arabic Number
|
|
||||||
CS 34 Common Number Separator
|
|
||||||
B 35 Block Separator
|
|
||||||
S 36 Segment Separator
|
|
||||||
WS 37 Whitespace
|
|
||||||
ON 38 Other Neutrals
|
|
||||||
Pi 47 Punctuation, Initial
|
|
||||||
Pf 48 Punctuation, Final
|
|
||||||
#
|
|
||||||
# Implementation specific properties.
|
|
||||||
#
|
|
||||||
Cm 39 Composite
|
|
||||||
Nb 40 Non-Breaking
|
|
||||||
Sy 41 Symmetric (characters which are part of open/close pairs)
|
|
||||||
Hd 42 Hex Digit
|
|
||||||
Qm 43 Quote Mark
|
|
||||||
Mr 44 Mirroring
|
|
||||||
Ss 45 Space, Other (controls viewed as spaces in ctype isspace())
|
|
||||||
Cp 46 Defined character
|
|
||||||
|
|
||||||
The actual binary data is formatted as follows:
|
|
||||||
|
|
||||||
Assumptions: unsigned short is at least 16-bits in size and unsigned long
|
|
||||||
is at least 32-bits in size.
|
|
||||||
|
|
||||||
unsigned short ByteOrderMark
|
|
||||||
unsigned short OffsetArraySize
|
|
||||||
unsigned long Bytes
|
|
||||||
unsigned short Offsets[OffsetArraySize + 1]
|
|
||||||
unsigned long Ranges[N], N = value of Offsets[OffsetArraySize]
|
|
||||||
|
|
||||||
The Bytes field provides the total byte count used for the Offsets[] and
|
|
||||||
Ranges[] arrays. The Offsets[] array is aligned on a 4-byte boundary and
|
|
||||||
there is always one extra node on the end to hold the final index of the
|
|
||||||
Ranges[] array. The Ranges[] array contains pairs of 4-byte values
|
|
||||||
representing a range of Unicode characters. The pairs are arranged in
|
|
||||||
increasing order by the first character code in the range.
|
|
||||||
|
|
||||||
Determining if a particular character is in the property list requires a
|
|
||||||
simple binary search to determine if a character is in any of the ranges
|
|
||||||
for the property.
|
|
||||||
|
|
||||||
If the ByteOrderMark is equal to 0xFFFE, then the data was generated on a
|
|
||||||
machine with a different endian order and the values must be byte-swapped.
|
|
||||||
|
|
||||||
To swap a 16-bit value:
|
|
||||||
c = (c >> 8) | ((c & 0xff) << 8)
|
|
||||||
|
|
||||||
To swap a 32-bit value:
|
|
||||||
c = ((c & 0xff) << 24) | (((c >> 8) & 0xff) << 16) |
|
|
||||||
(((c >> 16) & 0xff) << 8) | (c >> 24)
|
|
||||||
|
|
||||||
CASE MAPPINGS
|
|
||||||
=============
|
|
||||||
|
|
||||||
The next data file is called "case.dat" and contains three case mapping tables
|
|
||||||
in the following order: upper, lower, and title case. Each table is in
|
|
||||||
increasing order by character code and each mapping contains 3 unsigned longs
|
|
||||||
which represent the possible mappings.
|
|
||||||
|
|
||||||
The format for the binary form of these tables is:
|
|
||||||
|
|
||||||
unsigned short ByteOrderMark
|
|
||||||
unsigned short NumMappingNodes, count of all mapping nodes
|
|
||||||
unsigned short CaseTableSizes[2], upper and lower mapping node counts
|
|
||||||
unsigned long CaseTables[NumMappingNodes]
|
|
||||||
|
|
||||||
The starting indexes of the case tables are calculated as follows:
|
|
||||||
|
|
||||||
UpperIndex = 0;
|
|
||||||
LowerIndex = CaseTableSizes[0] * 3;
|
|
||||||
TitleIndex = LowerIndex + CaseTableSizes[1] * 3;
|
|
||||||
|
|
||||||
The order of the fields for the three tables is:
|
|
||||||
|
|
||||||
Upper case
|
|
||||||
----------
|
|
||||||
unsigned long upper;
|
|
||||||
unsigned long lower;
|
|
||||||
unsigned long title;
|
|
||||||
|
|
||||||
Lower case
|
|
||||||
----------
|
|
||||||
unsigned long lower;
|
|
||||||
unsigned long upper;
|
|
||||||
unsigned long title;
|
|
||||||
|
|
||||||
Title case
|
|
||||||
----------
|
|
||||||
unsigned long title;
|
|
||||||
unsigned long upper;
|
|
||||||
unsigned long lower;
|
|
||||||
|
|
||||||
If the ByteOrderMark is equal to 0xFFFE, endian swapping is required in the
|
|
||||||
same way as described in the CHARACTER PROPERTIES section.
|
|
||||||
|
|
||||||
Because the tables are in increasing order by character code, locating a
|
|
||||||
mapping requires a simple binary search on one of the 3 codes that make up
|
|
||||||
each node.
|
|
||||||
|
|
||||||
It is important to note that there can only be 65536 mapping nodes which
|
|
||||||
divided into 3 portions allows 21845 nodes for each case mapping table. The
|
|
||||||
distribution of mappings may be more or less than 21845 per table, but only
|
|
||||||
65536 are allowed.
|
|
||||||
|
|
||||||
DECOMPOSITIONS
|
|
||||||
==============
|
|
||||||
|
|
||||||
The next data file is called "decomp.dat" and contains the decomposition data
|
|
||||||
for all characters with decompositions containing more than one character and
|
|
||||||
that are *not* compatibility decompositions. Compatibility decompositions are
|
|
||||||
signaled in the UCDB format by the use of the <compat> tag in the
|
|
||||||
decomposition field. Each list of character codes represents a full
|
|
||||||
decomposition of a composite character. The nodes are arranged in increasing
|
|
||||||
order by character code.
|
|
||||||
|
|
||||||
The format for the binary form of this table is:
|
|
||||||
|
|
||||||
unsigned short ByteOrderMark
|
|
||||||
unsigned short NumDecompNodes, count of all decomposition nodes
|
|
||||||
unsigned long Bytes
|
|
||||||
unsigned long DecompNodes[(NumDecompNodes * 2) + 1]
|
|
||||||
unsigned long Decomp[N], N = sum of all counts in DecompNodes[]
|
|
||||||
|
|
||||||
If the ByteOrderMark is equal to 0xFFFE, endian swapping is required in the
|
|
||||||
same way as described in the CHARACTER PROPERTIES section.
|
|
||||||
|
|
||||||
The DecompNodes[] array consists of pairs of unsigned longs, the first of
|
|
||||||
which is the character code and the second is the initial index of the list
|
|
||||||
of character codes representing the decomposition.
|
|
||||||
|
|
||||||
Locating the decomposition of a composite character requires a binary search
|
|
||||||
for a character code in the DecompNodes[] array and using its index to
|
|
||||||
locate the start of the decomposition. The length of the decomposition list
|
|
||||||
is the index in the following element in DecompNodes[] minus the current
|
|
||||||
index.
|
|
||||||
|
|
||||||
COMBINING CLASSES
|
|
||||||
=================
|
|
||||||
|
|
||||||
The fourth data file is called "cmbcl.dat" and contains the characters with
|
|
||||||
non-zero combining classes.
|
|
||||||
|
|
||||||
The format for the binary form of this table is:
|
|
||||||
|
|
||||||
unsigned short ByteOrderMark
|
|
||||||
unsigned short NumCCLNodes
|
|
||||||
unsigned long Bytes
|
|
||||||
unsigned long CCLNodes[NumCCLNodes * 3]
|
|
||||||
|
|
||||||
If the ByteOrderMark is equal to 0xFFFE, endian swapping is required in the
|
|
||||||
same way as described in the CHARACTER PROPERTIES section.
|
|
||||||
|
|
||||||
The CCLNodes[] array consists of groups of three unsigned longs. The first
|
|
||||||
and second are the beginning and ending of a range and the third is the
|
|
||||||
combining class of that range.
|
|
||||||
|
|
||||||
If a character is not found in this table, then the combining class is
|
|
||||||
assumed to be 0.
|
|
||||||
|
|
||||||
It is important to note that only 65536 distinct ranges plus combining class
|
|
||||||
can be specified because the NumCCLNodes is usually a 16-bit number.
|
|
||||||
|
|
||||||
NUMBER TABLE
|
|
||||||
============
|
|
||||||
|
|
||||||
The final data file is called "num.dat" and contains the characters that have
|
|
||||||
a numeric value associated with them.
|
|
||||||
|
|
||||||
The format for the binary form of the table is:
|
|
||||||
|
|
||||||
unsigned short ByteOrderMark
|
|
||||||
unsigned short NumNumberNodes
|
|
||||||
unsigned long Bytes
|
|
||||||
unsigned long NumberNodes[NumNumberNodes]
|
|
||||||
unsigned short ValueNodes[(Bytes - (NumNumberNodes * sizeof(unsigned long)))
|
|
||||||
/ sizeof(short)]
|
|
||||||
|
|
||||||
If the ByteOrderMark is equal to 0xFFFE, endian swapping is required in the
|
|
||||||
same way as described in the CHARACTER PROPERTIES section.
|
|
||||||
|
|
||||||
The NumberNodes array contains pairs of values, the first of which is the
|
|
||||||
character code and the second an index into the ValueNodes array. The
|
|
||||||
ValueNodes array contains pairs of integers which represent the numerator
|
|
||||||
and denominator of the numeric value of the character. If the character
|
|
||||||
happens to map to an integer, both the values in ValueNodes will be the
|
|
||||||
same.
|
|
@ -1,60 +0,0 @@
|
|||||||
#!nmake
|
|
||||||
#
|
|
||||||
# The contents of this file are subject to the Netscape Public License
|
|
||||||
# Version 1.0 (the "NPL"); you may not use this file except in
|
|
||||||
# compliance with the NPL. You may obtain a copy of the NPL at
|
|
||||||
# http://www.mozilla.org/NPL/
|
|
||||||
#
|
|
||||||
# Software distributed under the NPL is distributed on an "AS IS" basis,
|
|
||||||
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the NPL
|
|
||||||
# for the specific language governing rights and limitations under the
|
|
||||||
# NPL.
|
|
||||||
#
|
|
||||||
# The Initial Developer of this code under the NPL is Netscape
|
|
||||||
# Communications Corporation. Portions created by Netscape are
|
|
||||||
# Copyright (C) 1998 Netscape Communications Corporation. All Rights
|
|
||||||
# Reserved.
|
|
||||||
# Relative path used below to locate config\rules.mak.
DEPTH = ..\..\..
|
|
||||||
|
|
||||||
# NOTE(review): IGNORE_MANIFEST and MAKE_OBJ_TYPE are not used in this file;
# presumably they are read by the included rules.mak -- confirm.
IGNORE_MANIFEST=1
|
|
||||||
|
|
||||||
|
|
||||||
MAKE_OBJ_TYPE = EXE
|
|
||||||
# The generator executable, built under .\$(OBJDIR).
PROG1 = .\$(OBJDIR)\ucgendat.exe
|
|
||||||
PROGRAMS = $(PROG1)
|
|
||||||
# Data files under .\data that the install:: target copies.
CTYPE_DATA = .\data\ctype.dat
|
|
||||||
CASE_DATA = .\data\case.dat
|
|
||||||
DATA = $(CTYPE_DATA) $(CASE_DATA)
|
|
||||||
|
|
||||||
# Arguments passed to ucgendat by the table:: target.
# NOTE(review): "-o data" presumably selects the output directory -- confirm
# against ucgendat.c.
UCGENFLAGS= -o data
|
|
||||||
# Input tables that the table:: target concatenates before running ucgendat.
TABLESRC = .\UnicodeData-Latest.txt .\MUTTUCData.txt
|
|
||||||
include <$(DEPTH)\config\rules.mak>
|
|
||||||
|
|
||||||
# install:: requires the program and the data files, then runs
# $(MAKE_INSTALL) once per entry in $(DATA), targeting
# $(DIST)\bin\res\unicharutil. Only the data files are copied here; the
# program itself is not. The leading '-' makes the build continue if the
# command fails.
install:: $(PROGRAMS) $(DATA)
|
|
||||||
-for %d in ($(DATA)) do $(MAKE_INSTALL) %d $(DIST)\bin\res\unicharutil
|
|
||||||
|
|
||||||
# clobber:: runs $(RM) on each entry in $(PROGRAMS); failures are ignored
# because of the leading '-'.
clobber::
|
|
||||||
-for %p in ($(PROGRAMS)) do $(RM) %p
|
|
||||||
|
|
||||||
# Move this into config/obj.inc when it's allowed
|
|
||||||
# Inference rule: builds an .exe in .\$(OBJDIR)\ from a .c file. The compiler
# options are passed to $(CC) through an inline response file ("@<<" up to
# "<<KEEP") named $(CFGFILE); KEEP retains that file after the command runs.
# The source file compiled is $(CURDIR)$(*B).c.
# NOTE(review): $(PBDFILE) on the -Fd line looks like a transposition of
# PDBFILE -- verify against the variable name defined in the config files.
.c{.\$(OBJDIR)\}.exe:
|
|
||||||
$(CC) @<<$(CFGFILE)
|
|
||||||
$(CFLAGS)
|
|
||||||
$(LCFLAGS)
|
|
||||||
$(LINCS)
|
|
||||||
$(LINCS_1)
|
|
||||||
$(INCS)
|
|
||||||
$(LLIBS)
|
|
||||||
$(OS_LIBS)
|
|
||||||
-Fd$(PBDFILE)
|
|
||||||
-Fe.\$(OBJDIR)\
|
|
||||||
-Fo.\$(OBJDIR)\
|
|
||||||
$(CURDIR)$(*B).c
|
|
||||||
<<KEEP
|
|
||||||
|
|
||||||
# Dependency line only (no commands): the program depends on the object
# directory and on ucgendat.c.
# NOTE(review): the build commands presumably come from the .c -> .exe
# inference rule in this file -- confirm.
$(PROG1): $(OBJDIR) ucgendat.c
|
|
||||||
|
|
||||||
# table:: concatenates the files in $(TABLESRC) into mergedtable.txt using
# "type", then runs the generator on the merged file with $(UCGENFLAGS).
# $(PROG1) is not listed as a prerequisite, so it must already be built.
table:: $(TABLESRC)
|
|
||||||
type $(TABLESRC) > mergedtable.txt
|
|
||||||
$(PROG1) $(UCGENFLAGS) mergedtable.txt
|
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
Loading…
x
Reference in New Issue
Block a user