mirror of
https://github.com/mozilla/gecko-dev.git
synced 2024-11-24 21:31:04 +00:00
790694c449
--HG-- rename : intl/lwbrk/src/Makefile.in => intl/lwbrk/Makefile.in rename : intl/lwbrk/src/crashtests/416721.html => intl/lwbrk/crashtests/416721.html rename : intl/lwbrk/src/crashtests/crashtests.list => intl/lwbrk/crashtests/crashtests.list rename : intl/lwbrk/src/jisx4051class.h => intl/lwbrk/jisx4051class.h rename : intl/lwbrk/src/jisx4051pairtable.txt => intl/lwbrk/jisx4051pairtable.txt rename : intl/lwbrk/src/nsCarbonBreaker.cpp => intl/lwbrk/nsCarbonBreaker.cpp rename : intl/lwbrk/src/nsComplexBreaker.h => intl/lwbrk/nsComplexBreaker.h rename : intl/lwbrk/public/nsILineBreaker.h => intl/lwbrk/nsILineBreaker.h rename : intl/lwbrk/idl/nsISemanticUnitScanner.idl => intl/lwbrk/nsISemanticUnitScanner.idl rename : intl/lwbrk/public/nsIWordBreaker.h => intl/lwbrk/nsIWordBreaker.h rename : intl/lwbrk/src/nsJISx4051LineBreaker.cpp => intl/lwbrk/nsJISx4051LineBreaker.cpp rename : intl/lwbrk/src/nsJISx4051LineBreaker.h => intl/lwbrk/nsJISx4051LineBreaker.h rename : intl/lwbrk/public/nsLWBrkCIID.h => intl/lwbrk/nsLWBrkCIID.h rename : intl/lwbrk/src/nsPangoBreaker.cpp => intl/lwbrk/nsPangoBreaker.cpp rename : intl/lwbrk/src/nsRuleBreaker.cpp => intl/lwbrk/nsRuleBreaker.cpp rename : intl/lwbrk/src/nsSampleWordBreaker.cpp => intl/lwbrk/nsSampleWordBreaker.cpp rename : intl/lwbrk/src/nsSampleWordBreaker.h => intl/lwbrk/nsSampleWordBreaker.h rename : intl/lwbrk/src/nsSemanticUnitScanner.cpp => intl/lwbrk/nsSemanticUnitScanner.cpp rename : intl/lwbrk/src/nsSemanticUnitScanner.h => intl/lwbrk/nsSemanticUnitScanner.h rename : intl/lwbrk/src/nsUniscribeBreaker.cpp => intl/lwbrk/nsUniscribeBreaker.cpp rename : intl/lwbrk/src/rulebrk.c => intl/lwbrk/rulebrk.c rename : intl/lwbrk/src/rulebrk.h => intl/lwbrk/rulebrk.h rename : intl/lwbrk/src/th_char.h => intl/lwbrk/th_char.h
287 lines
9.0 KiB
Plaintext
287 lines
9.0 KiB
Plaintext
|
|
|
|
|
|
/*
|
|
|
|
Simplification of Pair Table in JIS X 4051
|
|
|
|
1. The Original Table - in 4.1.3
|
|
|
|
In JIS X 4051, the pair table is defined as below
|
|
|
|
Class of
|
|
Leading Class of Trailing Char Class
|
|
Char
|
|
|
|
1 2 3 4 5 6 7 8 9 10 11 12 13 13 14 14 15 16 17 18 19 20
|
|
* # * #
|
|
1 X X X X X X X X X X X X X X X X X X X X X E
|
|
2 X X X X X X
|
|
3 X X X X X X
|
|
4 X X X X X X
|
|
5 X X X X X X
|
|
6 X X X X X X
|
|
7 X X X X X X X
|
|
8 X X X X X X E
|
|
9 X X X X X X
|
|
10 X X X X X X
|
|
11 X X X X X X
|
|
12 X X X X X X
|
|
13 X X X X X X X
|
|
14 X X X X X X X
|
|
15 X X X X X X X X X
|
|
16 X X X X X X X X
|
|
17 X X X X X E
|
|
18 X X X X X X X X X
|
|
19 X E E E E E X X X X X X X X X X X X E X E E
|
|
20 X X X X X E
|
|
|
|
* Same Char
|
|
# Other Char
|
|
|
|
2. Simplified by removing the classes which we do not care about
|
|
|
|
However, since we do not care about class 13(Subscript), 14(Ruby),
|
|
19(split line note begin quote), and 20(split line note end quote)
|
|
we can simplify this pair table into the following
|
|
|
|
Class of
|
|
Leading Class of Trailing Char Class
|
|
Char
|
|
|
|
1 2 3 4 5 6 7 8 9 10 11 12 15 16 17 18
|
|
|
|
1 X X X X X X X X X X X X X X X X
|
|
2 X X X X X
|
|
3 X X X X X
|
|
4 X X X X X
|
|
5 X X X X X
|
|
6 X X X X X
|
|
7 X X X X X X
|
|
8 X X X X X X
|
|
9 X X X X X
|
|
10 X X X X X
|
|
11 X X X X X
|
|
12 X X X X X
|
|
15 X X X X X X X X
|
|
16 X X X X X X X
|
|
17 X X X X X
|
|
18 X X X X X X X X
|
|
|
|
3. Simplified by merging classes
|
|
|
|
After the second simplification, the pair table has some duplication
|
|
a. classes 2, 3, 4, 5, 6 are the same - we can merge them
|
|
b. classes 10, 11, 12, 17 are the same - we can merge them
|
|
|
|
|
|
Class of
|
|
Leading Class of Trailing Char Class
|
|
Char
|
|
|
|
      1  [a]  7   8   9  [b] 15  16  18

 1    X   X   X   X   X   X   X   X   X
[a]       X
 7        X   X
 8        X                   X
 9        X
[b]       X
15        X           X       X       X
16        X                       X   X
18        X                   X   X   X
|
|
|
|
|
|
4. Now we use one bit to encode whether it is breakable, and use 2 bytes
|
|
for one row, then the bit table will look like:
|
|
|
|
18 <- 1
|
|
|
|
1 0000 0001 1111 1111 = 0x01FF
|
|
[a] 0000 0000 0000 0010 = 0x0002
|
|
7 0000 0000 0000 0110 = 0x0006
|
|
8 0000 0000 0100 0010 = 0x0042
|
|
9 0000 0000 0000 0010 = 0x0002
|
|
[b] 0000 0000 0000 0010 = 0x0042
|
|
15 0000 0001 0101 0010 = 0x0152
|
|
16 0000 0001 1000 0010 = 0x0182
|
|
18 0000 0001 1100 0010 = 0x01C2
|
|
|
|
*/
|
|
|
|
/*
 * Simplified pair table (section 4 of the design note above).
 *
 * One 16-bit row per merged leading-character class; the row is selected by
 * the leading class index and bit N of the row corresponds to trailing class
 * index N.  Row order: 1, [a], 7, 8, 9, [b], 15, 16, 18.
 *
 * The table is read-only lookup data, hence const.
 *
 * NOTE(review): the design note writes the [b] row as binary
 * "0000 0000 0000 0010" (which is 0x0002) but as hex 0x0042.  The hex value
 * is kept here unchanged -- confirm which of the two is intended.
 */
static const uint16_t gJISx4051SimplifiedPair[9] = {
  0x01FF,  /* class 1            */
  0x0002,  /* [a] classes 2-6    */
  0x0006,  /* class 7            */
  0x0042,  /* class 8            */
  0x0002,  /* class 9            */
  0x0042,  /* [b] 10, 11, 12, 17 */
  0x0152,  /* class 15           */
  0x0182,  /* class 16           */
  0x01C2   /* class 18           */
};
|
|
|
|
// Looks up the simplified pair table for a leading character of class aCls1
// followed by a trailing character of class aCls2.
//
// aCls1 selects the row of gJISx4051SimplifiedPair and aCls2 selects the bit
// within that row.  Returns non-zero when that bit is set (an "X" entry in
// the pair table of the design note), zero otherwise.
//
// Both arguments must be valid table indices (0..8); this is only checked by
// assertion.
PRBool XXXX::ClassesToPair(nsJISx4051Cls aCls1, nsJISx4051Cls aCls2)
{
  NS_ASSERTION((aCls1 < 9), "invalid class");
  NS_ASSERTION((aCls2 < 9), "invalid class");

  return (0 != (gJISx4051SimplifiedPair[aCls1] & (1L << aCls2)));
}
|
|
|
|
|
|
/*
 * Non-zero when u is an ASCII digit, i.e. in the inclusive range
 * U+0030 ('0') .. U+0039 ('9').  Note that u is evaluated twice.
 */
#define X4051_IS_DIGIT(u) ((0x0030 <= (u)) && ((u) <= 0x0039))
|
|
|
|
// Returns the (merged) JIS X 4051 class of aChar.
//
// aBefore / aAfter are the neighbouring characters (0 when not supplied);
// they are only consulted for the ',' / '.' special case below.
//
// Lookup order:
//   1. ',' (0x2C) or '.' (0x2E) with X4051_IS_DIGIT true for both
//      neighbours -> class 15.
//   2. gSingle->Lookup(), then gRange->Lookup(); the first one that returns
//      non-zero supplies the class (presumably through the out parameter --
//      confirm against the Lookup implementation).
//   3. Anything else falls back to class 15.
nsJISx4051Cls XXXX::GetClass(
  PRUnichar aChar, PRUnichar aBefore = 0, PRUnichar aAfter = 0)
{
  // take care of the special case in class 15
  if (((0x2C == aChar) || (0x2E == aChar)) &&
      X4051_IS_DIGIT(aBefore) && X4051_IS_DIGIT(aAfter))
  {
    return kJISx4051Cls_15;
  }

  nsJISx4051Cls cls;
  if (gSingle->Lookup(aChar, &cls)) {
    return cls;
  }

  if (gRange->Lookup(aChar, &cls)) {
    return cls;
  }

  return kJISx4051Cls_15;
}
|
|
|
|
|
|
/*
 * Index of each JIS X 4051 character class in the simplified pair table.
 *
 * Classes that have identical rows/columns after simplification share one
 * index:
 *   [a] = classes 2, 3, 4, 5 and 6      -> 1
 *   [b] = classes 10, 11, 12 and 17     -> 5
 *
 * Classes 13, 14, 19 and 20 are deliberately left out (see section 2 of the
 * design note) and therefore have no enumerator.
 */
typedef enum {
  kJISx4051Cls_1  = 0,

  /* merged group [a] */
  kJISx4051Cls_2  = 1,
  kJISx4051Cls_3  = kJISx4051Cls_2,
  kJISx4051Cls_4  = kJISx4051Cls_2,
  kJISx4051Cls_5  = kJISx4051Cls_2,
  kJISx4051Cls_6  = kJISx4051Cls_2,

  kJISx4051Cls_7  = 2,
  kJISx4051Cls_8  = 3,
  kJISx4051Cls_9  = 4,

  /* merged group [b] */
  kJISx4051Cls_10 = 5,
  kJISx4051Cls_11 = kJISx4051Cls_10,
  kJISx4051Cls_12 = kJISx4051Cls_10,
  kJISx4051Cls_17 = kJISx4051Cls_10,

  kJISx4051Cls_15 = 6,
  kJISx4051Cls_16 = 7,
  kJISx4051Cls_18 = 8
} nsJISx4051Cls;
|
|
|
|
|
|
// Table 2
|
|
YYYY(kJISx4051Cls_1 , 0x0028),
|
|
YYYY(kJISx4051Cls_1 , 0x005B),
|
|
YYYY(kJISx4051Cls_1 , 0x007B),
|
|
YYYY(kJISx4051Cls_1 , 0x2018),
|
|
YYYY(kJISx4051Cls_1 , 0x201B),
|
|
YYYY(kJISx4051Cls_1 , 0x201C),
|
|
YYYY(kJISx4051Cls_1 , 0x201F),
|
|
YYYY(kJISx4051Cls_1 , 0x3008),
|
|
YYYY(kJISx4051Cls_1 , 0x300A),
|
|
YYYY(kJISx4051Cls_1 , 0x300C),
|
|
YYYY(kJISx4051Cls_1 , 0x300E),
|
|
YYYY(kJISx4051Cls_1 , 0x3010),
|
|
YYYY(kJISx4051Cls_1 , 0x3014),
|
|
YYYY(kJISx4051Cls_1 , 0x3016),
|
|
YYYY(kJISx4051Cls_1 , 0x3018),
|
|
YYYY(kJISx4051Cls_1 , 0x301A),
|
|
YYYY(kJISx4051Cls_1 , 0x301D),
|
|
|
|
// Table 3
|
|
YYYY(kJISx4051Cls_2 , 0x0029),
|
|
YYYY(kJISx4051Cls_2 , 0x002C),
|
|
YYYY(kJISx4051Cls_2 , 0x005D),
|
|
YYYY(kJISx4051Cls_2 , 0x007D),
|
|
YYYY(kJISx4051Cls_2 , 0x2019),
|
|
YYYY(kJISx4051Cls_2 , 0x201A),
|
|
YYYY(kJISx4051Cls_2 , 0x201D),
|
|
YYYY(kJISx4051Cls_2 , 0x201E),
|
|
YYYY(kJISx4051Cls_2 , 0x3001),
|
|
YYYY(kJISx4051Cls_2 , 0x3009),
|
|
YYYY(kJISx4051Cls_2 , 0x300B),
|
|
YYYY(kJISx4051Cls_2 , 0x300D),
|
|
YYYY(kJISx4051Cls_2 , 0x300F),
|
|
YYYY(kJISx4051Cls_2 , 0x3011),
|
|
YYYY(kJISx4051Cls_2 , 0x3015),
|
|
YYYY(kJISx4051Cls_2 , 0x3017),
|
|
YYYY(kJISx4051Cls_2 , 0x3019),
|
|
YYYY(kJISx4051Cls_2 , 0x301B),
|
|
YYYY(kJISx4051Cls_2 , 0x301E),
|
|
YYYY(kJISx4051Cls_2 , 0x301F),
|
|
|
|
// Table 4
|
|
YYYY(kJISx4051Cls_3 , 0x203C),
|
|
YYYY(kJISx4051Cls_3 , 0x2044),
|
|
YYYY(kJISx4051Cls_3 , 0x301C),
|
|
YYYY(kJISx4051Cls_3 , 0x3041),
|
|
YYYY(kJISx4051Cls_3 , 0x3043),
|
|
YYYY(kJISx4051Cls_3 , 0x3045),
|
|
YYYY(kJISx4051Cls_3 , 0x3047),
|
|
YYYY(kJISx4051Cls_3 , 0x3049),
|
|
YYYY(kJISx4051Cls_3 , 0x3063),
|
|
YYYY(kJISx4051Cls_3 , 0x3083),
|
|
YYYY(kJISx4051Cls_3 , 0x3085),
|
|
YYYY(kJISx4051Cls_3 , 0x3087),
|
|
YYYY(kJISx4051Cls_3 , 0x308E),
|
|
YYYY(kJISx4051Cls_3 , 0x309D),
|
|
YYYY(kJISx4051Cls_3 , 0x309E),
|
|
YYYY(kJISx4051Cls_3 , 0x30A1),
|
|
YYYY(kJISx4051Cls_3 , 0x30A3),
|
|
YYYY(kJISx4051Cls_3 , 0x30A5),
|
|
YYYY(kJISx4051Cls_3 , 0x30A7),
|
|
YYYY(kJISx4051Cls_3 , 0x30A9),
|
|
YYYY(kJISx4051Cls_3 , 0x30C3),
|
|
YYYY(kJISx4051Cls_3 , 0x30E3),
|
|
YYYY(kJISx4051Cls_3 , 0x30E5),
|
|
YYYY(kJISx4051Cls_3 , 0x30E7),
|
|
YYYY(kJISx4051Cls_3 , 0x30EE),
|
|
YYYY(kJISx4051Cls_3 , 0x30F5),
|
|
YYYY(kJISx4051Cls_3 , 0x30F6),
|
|
YYYY(kJISx4051Cls_3 , 0x30FC),
|
|
YYYY(kJISx4051Cls_3 , 0x30FD),
|
|
YYYY(kJISx4051Cls_3 , 0x30FE),
|
|
|
|
// Table 5
|
|
YYYY(kJISx4051Cls_4 , 0x0021),
|
|
YYYY(kJISx4051Cls_4 , 0x003F),
|
|
|
|
// Table 6
|
|
YYYY(kJISx4051Cls_5 , 0x003A),
|
|
YYYY(kJISx4051Cls_5 , 0x003B),
|
|
YYYY(kJISx4051Cls_5 , 0x30FB),
|
|
|
|
// Table 7
|
|
YYYY(kJISx4051Cls_6 , 0x002E),
|
|
YYYY(kJISx4051Cls_6 , 0x3002),
|
|
|
|
// Table 8
|
|
YYYY(kJISx4051Cls_7 , 0x2014),
|
|
YYYY(kJISx4051Cls_7 , 0x2024),
|
|
YYYY(kJISx4051Cls_7 , 0x2025),
|
|
YYYY(kJISx4051Cls_7 , 0x2026),
|
|
|
|
// Table 9
|
|
YYYY(kJISx4051Cls_8 , 0x0024),
|
|
YYYY(kJISx4051Cls_8 , 0x00A3),
|
|
YYYY(kJISx4051Cls_8 , 0x00A5),
|
|
YYYY(kJISx4051Cls_8 , 0x2116),
|
|
|
|
// Table 10
|
|
YYYY(kJISx4051Cls_9 , 0x0025),
|
|
YYYY(kJISx4051Cls_9 , 0x00A2),
|
|
YYYY(kJISx4051Cls_9 , 0x00B0),
|
|
YYYY(kJISx4051Cls_9 , 0x2030),
|
|
YYYY(kJISx4051Cls_9 , 0x2031),
|
|
YYYY(kJISx4051Cls_9 , 0x2032),
|
|
YYYY(kJISx4051Cls_9 , 0x2033),
|
|
|
|
// Table 11
|
|
YYYY(kJISx4051Cls_10, 0x3000),
|
|
|
|
// Table 12
|
|
ZZZZ(kJISx4051Cls_11, 0x3000),
|
|
|
|
|
|
|
|
|