diff --git a/src/char_data/mod.rs b/src/char_data/mod.rs index 9a455f5..4f03aff 100644 --- a/src/char_data/mod.rs +++ b/src/char_data/mod.rs @@ -42,14 +42,14 @@ pub fn bidi_class(c: char) -> BidiClass { bsearch_range_value_table(c, bidi_class_table) } -/// If this character is an opening bracket according to BidiBrackets.txt, -/// return its corresponding closing bracket. -pub(crate) fn bidi_matched_bracket(c: char) -> Option<(char, bool)> { +/// If this character is a bracket according to BidiBrackets.txt, +/// return the corresponding *normalized* *opening bracket* of the pair, +/// and whether or not it itself is an opening bracket. +pub(crate) fn bidi_matched_opening_bracket(c: char) -> Option<(char, bool)> { for pair in self::tables::bidi_pairs_table { - if pair.0 == c { - return Some((pair.1, true)); - } else if pair.1 == c { - return Some((pair.0, false)); + if pair.0 == c || pair.1 == c { + let skeleton = pair.2.unwrap_or(pair.0); + return Some((skeleton, pair.0 == c)); } } None diff --git a/src/char_data/tables.rs b/src/char_data/tables.rs index 6862d7b..34f49a4 100644 --- a/src/char_data/tables.rs +++ b/src/char_data/tables.rs @@ -508,25 +508,28 @@ pub const bidi_class_table: &'static [(char, char, BidiClass)] = &[ ('\u{f0000}', '\u{ffffd}', L), ('\u{100000}', '\u{10fffd}', L) ]; -pub const bidi_pairs_table: &'static [(char, char)] = &[ - ('\u{28}', '\u{29}'), ('\u{5b}', '\u{5d}'), ('\u{7b}', '\u{7d}'), ('\u{f3a}', '\u{f3b}'), - ('\u{f3c}', '\u{f3d}'), ('\u{169b}', '\u{169c}'), ('\u{2045}', '\u{2046}'), ('\u{207d}', - '\u{207e}'), ('\u{208d}', '\u{208e}'), ('\u{2308}', '\u{2309}'), ('\u{230a}', '\u{230b}'), - ('\u{2329}', '\u{232a}'), ('\u{2768}', '\u{2769}'), ('\u{276a}', '\u{276b}'), ('\u{276c}', - '\u{276d}'), ('\u{276e}', '\u{276f}'), ('\u{2770}', '\u{2771}'), ('\u{2772}', '\u{2773}'), - ('\u{2774}', '\u{2775}'), ('\u{27c5}', '\u{27c6}'), ('\u{27e6}', '\u{27e7}'), ('\u{27e8}', - '\u{27e9}'), ('\u{27ea}', '\u{27eb}'), ('\u{27ec}', '\u{27ed}'), 
('\u{27ee}', '\u{27ef}'), - ('\u{2983}', '\u{2984}'), ('\u{2985}', '\u{2986}'), ('\u{2987}', '\u{2988}'), ('\u{2989}', - '\u{298a}'), ('\u{298b}', '\u{298c}'), ('\u{298d}', '\u{2990}'), ('\u{298f}', '\u{298e}'), - ('\u{2991}', '\u{2992}'), ('\u{2993}', '\u{2994}'), ('\u{2995}', '\u{2996}'), ('\u{2997}', - '\u{2998}'), ('\u{29d8}', '\u{29d9}'), ('\u{29da}', '\u{29db}'), ('\u{29fc}', '\u{29fd}'), - ('\u{2e22}', '\u{2e23}'), ('\u{2e24}', '\u{2e25}'), ('\u{2e26}', '\u{2e27}'), ('\u{2e28}', - '\u{2e29}'), ('\u{2e55}', '\u{2e56}'), ('\u{2e57}', '\u{2e58}'), ('\u{2e59}', '\u{2e5a}'), - ('\u{2e5b}', '\u{2e5c}'), ('\u{3008}', '\u{3009}'), ('\u{300a}', '\u{300b}'), ('\u{300c}', - '\u{300d}'), ('\u{300e}', '\u{300f}'), ('\u{3010}', '\u{3011}'), ('\u{3014}', '\u{3015}'), - ('\u{3016}', '\u{3017}'), ('\u{3018}', '\u{3019}'), ('\u{301a}', '\u{301b}'), ('\u{fe59}', - '\u{fe5a}'), ('\u{fe5b}', '\u{fe5c}'), ('\u{fe5d}', '\u{fe5e}'), ('\u{ff08}', '\u{ff09}'), - ('\u{ff3b}', '\u{ff3d}'), ('\u{ff5b}', '\u{ff5d}'), ('\u{ff5f}', '\u{ff60}'), ('\u{ff62}', - '\u{ff63}') +pub const bidi_pairs_table: &'static [(char, char, Option<char>)] = &[ + ('\u{28}', '\u{29}', None), ('\u{5b}', '\u{5d}', None), ('\u{7b}', '\u{7d}', None), ('\u{f3a}', + '\u{f3b}', None), ('\u{f3c}', '\u{f3d}', None), ('\u{169b}', '\u{169c}', None), ('\u{2045}', + '\u{2046}', None), ('\u{207d}', '\u{207e}', None), ('\u{208d}', '\u{208e}', None), ('\u{2308}', + '\u{2309}', None), ('\u{230a}', '\u{230b}', None), ('\u{2329}', '\u{232a}', Some('\u{3008}')), + ('\u{2768}', '\u{2769}', None), ('\u{276a}', '\u{276b}', None), ('\u{276c}', '\u{276d}', None), + ('\u{276e}', '\u{276f}', None), ('\u{2770}', '\u{2771}', None), ('\u{2772}', '\u{2773}', None), + ('\u{2774}', '\u{2775}', None), ('\u{27c5}', '\u{27c6}', None), ('\u{27e6}', '\u{27e7}', None), + ('\u{27e8}', '\u{27e9}', None), ('\u{27ea}', '\u{27eb}', None), ('\u{27ec}', '\u{27ed}', None), + ('\u{27ee}', '\u{27ef}', None), ('\u{2983}', '\u{2984}', None), ('\u{2985}', '\u{2986}', 
None), + ('\u{2987}', '\u{2988}', None), ('\u{2989}', '\u{298a}', None), ('\u{298b}', '\u{298c}', None), + ('\u{298d}', '\u{2990}', None), ('\u{298f}', '\u{298e}', None), ('\u{2991}', '\u{2992}', None), + ('\u{2993}', '\u{2994}', None), ('\u{2995}', '\u{2996}', None), ('\u{2997}', '\u{2998}', None), + ('\u{29d8}', '\u{29d9}', None), ('\u{29da}', '\u{29db}', None), ('\u{29fc}', '\u{29fd}', None), + ('\u{2e22}', '\u{2e23}', None), ('\u{2e24}', '\u{2e25}', None), ('\u{2e26}', '\u{2e27}', None), + ('\u{2e28}', '\u{2e29}', None), ('\u{2e55}', '\u{2e56}', None), ('\u{2e57}', '\u{2e58}', None), + ('\u{2e59}', '\u{2e5a}', None), ('\u{2e5b}', '\u{2e5c}', None), ('\u{3008}', '\u{3009}', None), + ('\u{300a}', '\u{300b}', None), ('\u{300c}', '\u{300d}', None), ('\u{300e}', '\u{300f}', None), + ('\u{3010}', '\u{3011}', None), ('\u{3014}', '\u{3015}', None), ('\u{3016}', '\u{3017}', None), + ('\u{3018}', '\u{3019}', None), ('\u{301a}', '\u{301b}', None), ('\u{fe59}', '\u{fe5a}', None), + ('\u{fe5b}', '\u{fe5c}', None), ('\u{fe5d}', '\u{fe5e}', None), ('\u{ff08}', '\u{ff09}', None), + ('\u{ff3b}', '\u{ff3d}', None), ('\u{ff5b}', '\u{ff5d}', None), ('\u{ff5f}', '\u{ff60}', None), + ('\u{ff62}', '\u{ff63}', None) ]; diff --git a/src/data_source.rs b/src/data_source.rs index c06bbbf..391edf6 100644 --- a/src/data_source.rs +++ b/src/data_source.rs @@ -14,15 +14,18 @@ use crate::BidiClass; pub trait BidiDataSource { fn bidi_class(&self, c: char) -> BidiClass; /// If this character is a bracket according to BidiBrackets.txt, - /// return its corresponding matched bracket, and whether or not it is an - /// opening bracket + /// return the corresponding *normalized* *opening bracket* of the pair, + /// and whether or not it itself is an opening bracket. + /// + /// This effectively buckets brackets into equivalence classes keyed on the + /// normalized opening bracket. 
/// /// The default implementation will pull in a small amount of hardcoded data, /// regardless of the `hardcoded-data` feature. This is in part for convenience /// (since this data is small and changes less often), and in part so that this method can be /// added without needing a breaking version bump. /// Override this method in your custom data source to prevent the use of hardcoded data. - fn bidi_matched_bracket(&self, c: char) -> Option<(char, bool)> { - crate::char_data::bidi_matched_bracket(c) + fn bidi_matched_opening_bracket(&self, c: char) -> Option<(char, bool)> { + crate::char_data::bidi_matched_opening_bracket(c) } } diff --git a/src/implicit.rs b/src/implicit.rs index 3d6a027..67e00e2 100644 --- a/src/implicit.rs +++ b/src/implicit.rs @@ -371,7 +371,7 @@ fn identify_bracket_pairs( continue; } - if let Some((matched, is_open)) = data_source.bidi_matched_bracket(ch) { + if let Some((opening, is_open)) = data_source.bidi_matched_opening_bracket(ch) { if is_open { // If an opening paired bracket is found ... @@ -381,7 +381,7 @@ fn identify_bracket_pairs( break; } // ... push its Bidi_Paired_Bracket property value and its text position onto the stack - stack.push((matched, i)) + stack.push((opening, i)) } else { // If a closing paired bracket is found, do the following @@ -392,7 +392,7 @@ fn identify_bracket_pairs( for (stack_index, element) in stack.iter().enumerate().rev() { // Compare the closing paired bracket being inspected or its canonical // equivalent to the bracket in the current stack element. 
- if element.0 == ch { + if element.0 == opening { // If the values match, meaning the two characters form a bracket pair, then // Append the text position in the current stack element together with the diff --git a/tests/conformance_tests.rs b/tests/conformance_tests.rs index 32988d5..24b23d5 100644 --- a/tests/conformance_tests.rs +++ b/tests/conformance_tests.rs @@ -138,7 +138,7 @@ fn gen_base_levels_for_base_tests(bitset: u8) -> Vec> { } #[test] -#[should_panic(expected = "69 test cases failed! (91638 passed)")] +#[should_panic(expected = "65 test cases failed! (91642 passed)")] fn test_character_conformance() { let test_data = include_str!("data/BidiCharacterTest.txt"); diff --git a/tools/generate.py b/tools/generate.py index 25ad642..69dc6f0 100755 --- a/tools/generate.py +++ b/tools/generate.py @@ -56,7 +56,7 @@ def open_data(name): def is_surrogate(n): return surrogate_codepoints[0] <= n <= surrogate_codepoints[1] -def load_bidi_pairs(): +def load_bidi_pairs(on_decomps): fetch_data(BIDI_BRACKETS_NAME) arr = [] for line in fileinput.input(os.path.join(DATA_DIR, BIDI_BRACKETS_NAME)): @@ -69,12 +69,21 @@ def load_bidi_pairs(): continue cp1 = int(data[0], 16); cp2 = int(data[1], 16); - arr += [(cp1, cp2)] + decomp = None + if cp1 in on_decomps: + decomp = int(on_decomps[cp1], 16) + arr += [(cp1, cp2, decomp)] return arr +# Returns (group_categories, on_decomps), +# where on_decomps is a map containing canonical equivalents for +# ON characters only, and group_categories is the result of group_categories() +# on bidi properties def load_unicode_data(): fetch_data(UNICODE_DATA_NAME) udict = {}; + # Decompositions of all ON characters that have them + on_decomps = {} range_start = -1; for line in fileinput.input(os.path.join(DATA_DIR, UNICODE_DATA_NAME)): @@ -103,6 +112,8 @@ def load_unicode_data(): if bidi not in bidi_class: bidi_class[bidi] = [] + if len(decomp) != 0 and " " not in decomp: + on_decomps[code] = decomp bidi_class[bidi].append(code) # Default 
Bidi_Class for unassigned codepoints. @@ -124,7 +135,7 @@ def load_unicode_data(): if not code in udict: bidi_class[default].append(code) - return group_categories(bidi_class) + return (group_categories(bidi_class), on_decomps) def group_categories(cats): cats_out = [] @@ -223,8 +234,8 @@ use self::BidiClass::*; file_, "bidi_pairs_table", bidi_pairs_table, - "&'static [(char, char)]", - pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1])), + "&'static [(char, char, Option<char>)]", + pfun=lambda x: "(%s,%s,%s)" % (escape_char(x[0]), escape_char(x[1]), "Some(%s)" % escape_char(x[2]) if x[2] else "None"), ) def get_unicode_version(): @@ -249,8 +260,8 @@ if __name__ == "__main__": pub const UNICODE_VERSION: (u64, u64, u64) = (%s, %s, %s); """ % unicode_version) - (bidi_categories, bidi_class_table) = load_unicode_data() - bidi_pairs_table = load_bidi_pairs() + ((bidi_categories, bidi_class_table), on_decomps) = load_unicode_data() + bidi_pairs_table = load_bidi_pairs(on_decomps) emit_bidi_module(file_, bidi_class_table, bidi_categories, bidi_pairs_table) # Fetch test data files