Fix is_public_assigned to include Hangul Syllable and other ranges.

Hangul Syllables and several other ranges are defined in UnicodeData.txt
as just their first and last values:

```
AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
```

Teach the unicode.py script how to recognize these, so that it correctly
classifies them as assigned ranges, for the `is_public_assigned`
predicate.
This commit is contained in:
Dan Gohman
2021-05-28 14:15:26 -07:00
parent 74f416f8ea
commit 33e73008da
3 changed files with 70 additions and 23 deletions
+13 -2
View File
@@ -102,12 +102,13 @@ class UnicodeData(object):
assigned_start = 0;
prev_char_int = -1;
prev_name = "";
for line in self._fetch("UnicodeData.txt").splitlines():
# See ftp://ftp.unicode.org/Public/3.0-Update/UnicodeData-3.0.0.html
pieces = line.split(';')
assert len(pieces) == 15
char, category, cc, decomp = pieces[0], pieces[2], pieces[3], pieces[5]
char, name, category, cc, decomp = pieces[0], pieces[1], pieces[2], pieces[3], pieces[5]
char_int = int(char, 16)
name = pieces[1].strip()
@@ -126,10 +127,11 @@ class UnicodeData(object):
assert category != 'Cn', "Unexpected: Unassigned codepoint in UnicodeData.txt"
if category not in ['Co', 'Cs']:
if char_int != prev_char_int + 1:
if char_int != prev_char_int + 1 and not is_first_and_last(prev_name, name):
self.general_category_public_assigned.append((assigned_start, prev_char_int))
assigned_start = char_int
prev_char_int = char_int
prev_name = name;
self.general_category_public_assigned.append((assigned_start, prev_char_int))
@@ -343,6 +345,15 @@ class UnicodeData(object):
hexify = lambda c: '{:04X}'.format(c)
# Decide whether two UnicodeData.txt name fields form a matching
# "<Label, First>" / "<Label, Last>" pair, i.e. the two endpoints of a
# range that the data file lists only by its first and last code points.
def is_first_and_last(first, last):
    first_suffix = ', First>'
    last_suffix = ', Last>'

    # Both names must be wrapped as "<..., First>" and "<..., Last>"
    # respectively; anything else is an ordinary character name.
    well_formed = (
        first.startswith('<')
        and first.endswith(first_suffix)
        and last.startswith('<')
        and last.endswith(last_suffix)
    )
    if not well_formed:
        return False

    # Strip the leading '<' and the trailing marker, then compare the
    # remaining range labels (e.g. "Hangul Syllable").
    first_label = first[1:-len(first_suffix)]
    last_label = last[1:-len(last_suffix)]
    return first_label == last_label
def gen_mph_data(name, d, kv_type, kv_callback):
(salt, keys) = minimal_perfect_hash(d)
out.write("pub(crate) const %s_SALT: &[u16] = &[\n" % name.upper())
+10 -21
View File
@@ -22051,9 +22051,7 @@ pub fn is_public_assigned(c: char) -> bool {
| '\u{3131}'..='\u{318E}'
| '\u{3190}'..='\u{31E3}'
| '\u{31F0}'..='\u{321E}'
| '\u{3220}'..='\u{3400}'
| '\u{4DBF}'..='\u{4E00}'
| '\u{9FFC}'
| '\u{3220}'..='\u{9FFC}'
| '\u{A000}'..='\u{A48C}'
| '\u{A490}'..='\u{A4C6}'
| '\u{A4D0}'..='\u{A62B}'
@@ -22083,8 +22081,7 @@ pub fn is_public_assigned(c: char) -> bool {
| '\u{AB30}'..='\u{AB6B}'
| '\u{AB70}'..='\u{ABED}'
| '\u{ABF0}'..='\u{ABF9}'
| '\u{AC00}'
| '\u{D7A3}'
| '\u{AC00}'..='\u{D7A3}'
| '\u{D7B0}'..='\u{D7C6}'
| '\u{D7CB}'..='\u{D7FB}'
| '\u{F900}'..='\u{FA6D}'
@@ -22305,11 +22302,9 @@ pub fn is_public_assigned(c: char) -> bool {
| '\u{16F8F}'..='\u{16F9F}'
| '\u{16FE0}'..='\u{16FE4}'
| '\u{16FF0}'..='\u{16FF1}'
| '\u{17000}'
| '\u{187F7}'
| '\u{17000}'..='\u{187F7}'
| '\u{18800}'..='\u{18CD5}'
| '\u{18D00}'
| '\u{18D08}'
| '\u{18D00}'..='\u{18D08}'
| '\u{1B000}'..='\u{1B11E}'
| '\u{1B150}'..='\u{1B152}'
| '\u{1B164}'..='\u{1B167}'
@@ -22439,19 +22434,13 @@ pub fn is_public_assigned(c: char) -> bool {
| '\u{1FB00}'..='\u{1FB92}'
| '\u{1FB94}'..='\u{1FBCA}'
| '\u{1FBF0}'..='\u{1FBF9}'
| '\u{20000}'
| '\u{2A6DD}'
| '\u{2A700}'
| '\u{2B734}'
| '\u{2B740}'
| '\u{2B81D}'
| '\u{2B820}'
| '\u{2CEA1}'
| '\u{2CEB0}'
| '\u{2EBE0}'
| '\u{20000}'..='\u{2A6DD}'
| '\u{2A700}'..='\u{2B734}'
| '\u{2B740}'..='\u{2B81D}'
| '\u{2B820}'..='\u{2CEA1}'
| '\u{2CEB0}'..='\u{2EBE0}'
| '\u{2F800}'..='\u{2FA1D}'
| '\u{30000}'
| '\u{3134A}'
| '\u{30000}'..='\u{3134A}'
| '\u{E0001}'
| '\u{E0020}'..='\u{E007F}'
| '\u{E0100}'..='\u{E01EF}'
+47
View File
@@ -71,4 +71,51 @@ fn test_public_assigned() {
assert!(!is_public_assigned('\u{fffff}'));
assert!(!is_public_assigned('\u{10fffe}'));
assert!(!is_public_assigned('\u{10ffff}'));
// Several ranges are defined by "<..., First>" and "<..., Last>" pairs in
// UnicodeData.txt:
// CJK Ideograph Extension A
assert!(is_public_assigned('\u{3400}'));
assert!(is_public_assigned('\u{4dbf}'));
// CJK Ideograph
assert!(is_public_assigned('\u{4e00}'));
assert!(is_public_assigned('\u{9ffc}'));
// Hangul Syllable
assert!(is_public_assigned('\u{ac00}'));
assert!(is_public_assigned('\u{d7a3}'));
// Tangut Ideograph
assert!(is_public_assigned('\u{17000}'));
assert!(is_public_assigned('\u{187f7}'));
// Tangut Ideograph Supplement
assert!(is_public_assigned('\u{18d00}'));
assert!(is_public_assigned('\u{18d08}'));
// CJK Ideograph Extension B
assert!(is_public_assigned('\u{20000}'));
assert!(is_public_assigned('\u{2a6dd}'));
// CJK Ideograph Extension C
assert!(is_public_assigned('\u{2a700}'));
assert!(is_public_assigned('\u{2b734}'));
// CJK Ideograph Extension D
assert!(is_public_assigned('\u{2b740}'));
assert!(is_public_assigned('\u{2b81d}'));
// CJK Ideograph Extension E
assert!(is_public_assigned('\u{2b820}'));
assert!(is_public_assigned('\u{2cea1}'));
// CJK Ideograph Extension F
assert!(is_public_assigned('\u{2ceb0}'));
assert!(is_public_assigned('\u{2ebe0}'));
// CJK Ideograph Extension G
assert!(is_public_assigned('\u{30000}'));
assert!(is_public_assigned('\u{3134a}'));
}