mirror of
https://github.com/openharmony/third_party_rust_unicode-normalization.git
synced 2026-07-01 21:33:59 -04:00
Fix is_public_assigned to include Hangul Syllable and other ranges.
Hangul Syllables and several other ranges are defined in UnicodeData.txt as just their first and last values:

```
AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
```

Teach the unicode.py script how to recognize these, so that it correctly classifies them as assigned ranges, for the `is_public_assigned` predicate.
This commit is contained in:
+13
-2
@@ -102,12 +102,13 @@ class UnicodeData(object):
|
||||
|
||||
assigned_start = 0;
|
||||
prev_char_int = -1;
|
||||
prev_name = "";
|
||||
|
||||
for line in self._fetch("UnicodeData.txt").splitlines():
|
||||
# See ftp://ftp.unicode.org/Public/3.0-Update/UnicodeData-3.0.0.html
|
||||
pieces = line.split(';')
|
||||
assert len(pieces) == 15
|
||||
char, category, cc, decomp = pieces[0], pieces[2], pieces[3], pieces[5]
|
||||
char, name, category, cc, decomp = pieces[0], pieces[1], pieces[2], pieces[3], pieces[5]
|
||||
char_int = int(char, 16)
|
||||
|
||||
name = pieces[1].strip()
|
||||
@@ -126,10 +127,11 @@ class UnicodeData(object):
|
||||
|
||||
assert category != 'Cn', "Unexpected: Unassigned codepoint in UnicodeData.txt"
|
||||
if category not in ['Co', 'Cs']:
|
||||
if char_int != prev_char_int + 1:
|
||||
if char_int != prev_char_int + 1 and not is_first_and_last(prev_name, name):
|
||||
self.general_category_public_assigned.append((assigned_start, prev_char_int))
|
||||
assigned_start = char_int
|
||||
prev_char_int = char_int
|
||||
prev_name = name;
|
||||
|
||||
self.general_category_public_assigned.append((assigned_start, prev_char_int))
|
||||
|
||||
@@ -343,6 +345,15 @@ class UnicodeData(object):
|
||||
|
||||
hexify = lambda c: '{:04X}'.format(c)
|
||||
|
||||
# Decide whether two name fields form a matching range-marker pair, i.e.
# `first` looks like "<LABEL, First>" and `last` looks like "<LABEL, Last>"
# with the same LABEL in both.
def is_first_and_last(first, last):
    first_tail, last_tail = ', First>', ', Last>'

    # Both names must be angle-bracketed markers carrying the right suffix.
    # The chain short-circuits, so `last` is only inspected once `first`
    # has qualified as a "First" marker.
    looks_like_pair = (
        first.startswith('<') and first.endswith(first_tail)
        and last.startswith('<') and last.endswith(last_tail)
    )

    # Drop the leading '<' and the suffix from each name, then compare the
    # remaining labels.
    return looks_like_pair and first[1:-len(first_tail)] == last[1:-len(last_tail)]
|
||||
|
||||
def gen_mph_data(name, d, kv_type, kv_callback):
|
||||
(salt, keys) = minimal_perfect_hash(d)
|
||||
out.write("pub(crate) const %s_SALT: &[u16] = &[\n" % name.upper())
|
||||
|
||||
+10
-21
@@ -22051,9 +22051,7 @@ pub fn is_public_assigned(c: char) -> bool {
|
||||
| '\u{3131}'..='\u{318E}'
|
||||
| '\u{3190}'..='\u{31E3}'
|
||||
| '\u{31F0}'..='\u{321E}'
|
||||
| '\u{3220}'..='\u{3400}'
|
||||
| '\u{4DBF}'..='\u{4E00}'
|
||||
| '\u{9FFC}'
|
||||
| '\u{3220}'..='\u{9FFC}'
|
||||
| '\u{A000}'..='\u{A48C}'
|
||||
| '\u{A490}'..='\u{A4C6}'
|
||||
| '\u{A4D0}'..='\u{A62B}'
|
||||
@@ -22083,8 +22081,7 @@ pub fn is_public_assigned(c: char) -> bool {
|
||||
| '\u{AB30}'..='\u{AB6B}'
|
||||
| '\u{AB70}'..='\u{ABED}'
|
||||
| '\u{ABF0}'..='\u{ABF9}'
|
||||
| '\u{AC00}'
|
||||
| '\u{D7A3}'
|
||||
| '\u{AC00}'..='\u{D7A3}'
|
||||
| '\u{D7B0}'..='\u{D7C6}'
|
||||
| '\u{D7CB}'..='\u{D7FB}'
|
||||
| '\u{F900}'..='\u{FA6D}'
|
||||
@@ -22305,11 +22302,9 @@ pub fn is_public_assigned(c: char) -> bool {
|
||||
| '\u{16F8F}'..='\u{16F9F}'
|
||||
| '\u{16FE0}'..='\u{16FE4}'
|
||||
| '\u{16FF0}'..='\u{16FF1}'
|
||||
| '\u{17000}'
|
||||
| '\u{187F7}'
|
||||
| '\u{17000}'..='\u{187F7}'
|
||||
| '\u{18800}'..='\u{18CD5}'
|
||||
| '\u{18D00}'
|
||||
| '\u{18D08}'
|
||||
| '\u{18D00}'..='\u{18D08}'
|
||||
| '\u{1B000}'..='\u{1B11E}'
|
||||
| '\u{1B150}'..='\u{1B152}'
|
||||
| '\u{1B164}'..='\u{1B167}'
|
||||
@@ -22439,19 +22434,13 @@ pub fn is_public_assigned(c: char) -> bool {
|
||||
| '\u{1FB00}'..='\u{1FB92}'
|
||||
| '\u{1FB94}'..='\u{1FBCA}'
|
||||
| '\u{1FBF0}'..='\u{1FBF9}'
|
||||
| '\u{20000}'
|
||||
| '\u{2A6DD}'
|
||||
| '\u{2A700}'
|
||||
| '\u{2B734}'
|
||||
| '\u{2B740}'
|
||||
| '\u{2B81D}'
|
||||
| '\u{2B820}'
|
||||
| '\u{2CEA1}'
|
||||
| '\u{2CEB0}'
|
||||
| '\u{2EBE0}'
|
||||
| '\u{20000}'..='\u{2A6DD}'
|
||||
| '\u{2A700}'..='\u{2B734}'
|
||||
| '\u{2B740}'..='\u{2B81D}'
|
||||
| '\u{2B820}'..='\u{2CEA1}'
|
||||
| '\u{2CEB0}'..='\u{2EBE0}'
|
||||
| '\u{2F800}'..='\u{2FA1D}'
|
||||
| '\u{30000}'
|
||||
| '\u{3134A}'
|
||||
| '\u{30000}'..='\u{3134A}'
|
||||
| '\u{E0001}'
|
||||
| '\u{E0020}'..='\u{E007F}'
|
||||
| '\u{E0100}'..='\u{E01EF}'
|
||||
|
||||
@@ -71,4 +71,51 @@ fn test_public_assigned() {
|
||||
assert!(!is_public_assigned('\u{fffff}'));
|
||||
assert!(!is_public_assigned('\u{10fffe}'));
|
||||
assert!(!is_public_assigned('\u{10ffff}'));
|
||||
|
||||
// Several ranges are defined by "<..., First>" and "<..., Last>" pairs in
|
||||
// UnicodeData.txt:
|
||||
|
||||
// CJK Ideograph Extension A
|
||||
assert!(is_public_assigned('\u{3400}'));
|
||||
assert!(is_public_assigned('\u{4dbf}'));
|
||||
|
||||
// CJK Ideograph
|
||||
assert!(is_public_assigned('\u{4e00}'));
|
||||
assert!(is_public_assigned('\u{9ffc}'));
|
||||
|
||||
// Hangul Syllable
|
||||
assert!(is_public_assigned('\u{ac00}'));
|
||||
assert!(is_public_assigned('\u{d7a3}'));
|
||||
|
||||
// Tangut Ideograph
|
||||
assert!(is_public_assigned('\u{17000}'));
|
||||
assert!(is_public_assigned('\u{187f7}'));
|
||||
|
||||
// Tangut Ideograph Supplement
|
||||
assert!(is_public_assigned('\u{18d00}'));
|
||||
assert!(is_public_assigned('\u{18d08}'));
|
||||
|
||||
// CJK Ideograph Extension B
|
||||
assert!(is_public_assigned('\u{20000}'));
|
||||
assert!(is_public_assigned('\u{2a6dd}'));
|
||||
|
||||
// CJK Ideograph Extension C
|
||||
assert!(is_public_assigned('\u{2a700}'));
|
||||
assert!(is_public_assigned('\u{2b734}'));
|
||||
|
||||
// CJK Ideograph Extension D
|
||||
assert!(is_public_assigned('\u{2b740}'));
|
||||
assert!(is_public_assigned('\u{2b81d}'));
|
||||
|
||||
// CJK Ideograph Extension E
|
||||
assert!(is_public_assigned('\u{2b820}'));
|
||||
assert!(is_public_assigned('\u{2cea1}'));
|
||||
|
||||
// CJK Ideograph Extension F
|
||||
assert!(is_public_assigned('\u{2ceb0}'));
|
||||
assert!(is_public_assigned('\u{2ebe0}'));
|
||||
|
||||
// CJK Ideograph Extension G
|
||||
assert!(is_public_assigned('\u{30000}'));
|
||||
assert!(is_public_assigned('\u{3134a}'));
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user