mirror of
https://github.com/openharmony/third_party_rust_unicode-normalization.git
synced 2026-07-01 21:33:59 -04:00
Apply rustfmt
This commit is contained in:
+1
-1
@@ -1,7 +1,7 @@
|
||||
#![feature(test)]
|
||||
#![feature(iterator_step_by)]
|
||||
extern crate unicode_normalization;
|
||||
extern crate test;
|
||||
extern crate unicode_normalization;
|
||||
|
||||
use std::fs;
|
||||
use test::Bencher;
|
||||
|
||||
+1
-1
@@ -6,7 +6,7 @@
|
||||
|
||||
use crate::stream_safe::StreamSafe;
|
||||
pub fn stream_safe(s: &str) -> String {
|
||||
StreamSafe::new(s.chars()).collect()
|
||||
StreamSafe::new(s.chars()).collect()
|
||||
}
|
||||
pub mod quick_check {
|
||||
pub use crate::quick_check::*;
|
||||
|
||||
+5
-5
@@ -7,10 +7,10 @@
|
||||
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
use tinyvec::TinyVec;
|
||||
use std::fmt::{self, Write};
|
||||
use std::iter::Fuse;
|
||||
use std::ops::Range;
|
||||
use tinyvec::TinyVec;
|
||||
|
||||
#[derive(Clone)]
|
||||
enum DecompositionType {
|
||||
@@ -37,7 +37,7 @@ pub struct Decompositions<I> {
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn new_canonical<I: Iterator<Item=char>>(iter: I) -> Decompositions<I> {
|
||||
pub fn new_canonical<I: Iterator<Item = char>>(iter: I) -> Decompositions<I> {
|
||||
Decompositions {
|
||||
kind: self::DecompositionType::Canonical,
|
||||
iter: iter.fuse(),
|
||||
@@ -47,7 +47,7 @@ pub fn new_canonical<I: Iterator<Item=char>>(iter: I) -> Decompositions<I> {
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn new_compatible<I: Iterator<Item=char>>(iter: I) -> Decompositions<I> {
|
||||
pub fn new_compatible<I: Iterator<Item = char>>(iter: I) -> Decompositions<I> {
|
||||
Decompositions {
|
||||
kind: self::DecompositionType::Compatible,
|
||||
iter: iter.fuse(),
|
||||
@@ -99,7 +99,7 @@ impl<I> Decompositions<I> {
|
||||
}
|
||||
}
|
||||
|
||||
impl<I: Iterator<Item=char>> Iterator for Decompositions<I> {
|
||||
impl<I: Iterator<Item = char>> Iterator for Decompositions<I> {
|
||||
type Item = char;
|
||||
|
||||
#[inline]
|
||||
@@ -149,7 +149,7 @@ impl<I: Iterator<Item=char>> Iterator for Decompositions<I> {
|
||||
}
|
||||
}
|
||||
|
||||
impl<I: Iterator<Item=char> + Clone> fmt::Display for Decompositions<I> {
|
||||
impl<I: Iterator<Item = char> + Clone> fmt::Display for Decompositions<I> {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
for c in self.clone() {
|
||||
f.write_char(c)?;
|
||||
|
||||
+13
-22
@@ -38,60 +38,51 @@
|
||||
//! ```
|
||||
|
||||
#![deny(missing_docs, unsafe_code)]
|
||||
#![doc(html_logo_url = "https://unicode-rs.github.io/unicode-rs_sm.png",
|
||||
html_favicon_url = "https://unicode-rs.github.io/unicode-rs_sm.png")]
|
||||
#![doc(
|
||||
html_logo_url = "https://unicode-rs.github.io/unicode-rs_sm.png",
|
||||
html_favicon_url = "https://unicode-rs.github.io/unicode-rs_sm.png"
|
||||
)]
|
||||
|
||||
extern crate tinyvec;
|
||||
|
||||
pub use crate::tables::UNICODE_VERSION;
|
||||
pub use crate::decompose::Decompositions;
|
||||
pub use crate::quick_check::{
|
||||
is_nfc, is_nfc_quick, is_nfc_stream_safe, is_nfc_stream_safe_quick, is_nfd, is_nfd_quick,
|
||||
is_nfd_stream_safe, is_nfd_stream_safe_quick, is_nfkc, is_nfkc_quick, is_nfkd, is_nfkd_quick,
|
||||
IsNormalized,
|
||||
is_nfc,
|
||||
is_nfc_quick,
|
||||
is_nfkc,
|
||||
is_nfkc_quick,
|
||||
is_nfc_stream_safe,
|
||||
is_nfc_stream_safe_quick,
|
||||
is_nfd,
|
||||
is_nfd_quick,
|
||||
is_nfkd,
|
||||
is_nfkd_quick,
|
||||
is_nfd_stream_safe,
|
||||
is_nfd_stream_safe_quick,
|
||||
};
|
||||
pub use crate::recompose::Recompositions;
|
||||
pub use crate::stream_safe::StreamSafe;
|
||||
pub use crate::tables::UNICODE_VERSION;
|
||||
use std::str::Chars;
|
||||
|
||||
mod decompose;
|
||||
mod lookups;
|
||||
mod normalize;
|
||||
mod perfect_hash;
|
||||
mod recompose;
|
||||
mod quick_check;
|
||||
mod recompose;
|
||||
mod stream_safe;
|
||||
|
||||
#[rustfmt::skip]
|
||||
mod tables;
|
||||
|
||||
#[cfg(test)]
|
||||
mod test;
|
||||
#[doc(hidden)]
|
||||
pub mod __test_api;
|
||||
#[cfg(test)]
|
||||
mod test;
|
||||
|
||||
/// Methods for composing and decomposing characters.
|
||||
pub mod char {
|
||||
pub use crate::normalize::{decompose_canonical, decompose_compatible, compose};
|
||||
pub use crate::normalize::{compose, decompose_canonical, decompose_compatible};
|
||||
|
||||
pub use crate::lookups::{canonical_combining_class, is_combining_mark};
|
||||
}
|
||||
|
||||
|
||||
/// Methods for iterating over strings while applying Unicode normalizations
|
||||
/// as described in
|
||||
/// [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/).
|
||||
pub trait UnicodeNormalization<I: Iterator<Item=char>> {
|
||||
pub trait UnicodeNormalization<I: Iterator<Item = char>> {
|
||||
/// Returns an iterator over the string in Unicode Normalization Form D
|
||||
/// (canonical decomposition).
|
||||
fn nfd(self) -> Decompositions<I>;
|
||||
@@ -140,7 +131,7 @@ impl<'a> UnicodeNormalization<Chars<'a>> for &'a str {
|
||||
}
|
||||
}
|
||||
|
||||
impl<I: Iterator<Item=char>> UnicodeNormalization<I> for I {
|
||||
impl<I: Iterator<Item = char>> UnicodeNormalization<I> for I {
|
||||
#[inline]
|
||||
fn nfd(self) -> Decompositions<I> {
|
||||
decompose::new_canonical(self)
|
||||
|
||||
+49
-14
@@ -14,42 +14,77 @@ use crate::perfect_hash::mph_lookup;
|
||||
use crate::tables::*;
|
||||
|
||||
/// Look up the canonical combining class for a codepoint.
|
||||
///
|
||||
///
|
||||
/// The value returned is as defined in the Unicode Character Database.
|
||||
pub fn canonical_combining_class(c: char) -> u8 {
|
||||
mph_lookup(c.into(), CANONICAL_COMBINING_CLASS_SALT, CANONICAL_COMBINING_CLASS_KV,
|
||||
u8_lookup_fk, u8_lookup_fv, 0)
|
||||
mph_lookup(
|
||||
c.into(),
|
||||
CANONICAL_COMBINING_CLASS_SALT,
|
||||
CANONICAL_COMBINING_CLASS_KV,
|
||||
u8_lookup_fk,
|
||||
u8_lookup_fv,
|
||||
0,
|
||||
)
|
||||
}
|
||||
|
||||
pub(crate) fn composition_table(c1: char, c2: char) -> Option<char> {
|
||||
if c1 < '\u{10000}' && c2 < '\u{10000}' {
|
||||
mph_lookup((c1 as u32) << 16 | (c2 as u32),
|
||||
COMPOSITION_TABLE_SALT, COMPOSITION_TABLE_KV,
|
||||
pair_lookup_fk, pair_lookup_fv_opt, None)
|
||||
mph_lookup(
|
||||
(c1 as u32) << 16 | (c2 as u32),
|
||||
COMPOSITION_TABLE_SALT,
|
||||
COMPOSITION_TABLE_KV,
|
||||
pair_lookup_fk,
|
||||
pair_lookup_fv_opt,
|
||||
None,
|
||||
)
|
||||
} else {
|
||||
composition_table_astral(c1, c2)
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn canonical_fully_decomposed(c: char) -> Option<&'static [char]> {
|
||||
mph_lookup(c.into(), CANONICAL_DECOMPOSED_SALT, CANONICAL_DECOMPOSED_KV,
|
||||
pair_lookup_fk, pair_lookup_fv_opt, None)
|
||||
mph_lookup(
|
||||
c.into(),
|
||||
CANONICAL_DECOMPOSED_SALT,
|
||||
CANONICAL_DECOMPOSED_KV,
|
||||
pair_lookup_fk,
|
||||
pair_lookup_fv_opt,
|
||||
None,
|
||||
)
|
||||
}
|
||||
|
||||
pub(crate) fn compatibility_fully_decomposed(c: char) -> Option<&'static [char]> {
|
||||
mph_lookup(c.into(), COMPATIBILITY_DECOMPOSED_SALT, COMPATIBILITY_DECOMPOSED_KV,
|
||||
pair_lookup_fk, pair_lookup_fv_opt, None)
|
||||
mph_lookup(
|
||||
c.into(),
|
||||
COMPATIBILITY_DECOMPOSED_SALT,
|
||||
COMPATIBILITY_DECOMPOSED_KV,
|
||||
pair_lookup_fk,
|
||||
pair_lookup_fv_opt,
|
||||
None,
|
||||
)
|
||||
}
|
||||
|
||||
/// Return whether the given character is a combining mark (`General_Category=Mark`)
|
||||
pub fn is_combining_mark(c: char) -> bool {
|
||||
mph_lookup(c.into(), COMBINING_MARK_SALT, COMBINING_MARK_KV,
|
||||
bool_lookup_fk, bool_lookup_fv, false)
|
||||
mph_lookup(
|
||||
c.into(),
|
||||
COMBINING_MARK_SALT,
|
||||
COMBINING_MARK_KV,
|
||||
bool_lookup_fk,
|
||||
bool_lookup_fv,
|
||||
false,
|
||||
)
|
||||
}
|
||||
|
||||
pub fn stream_safe_trailing_nonstarters(c: char) -> usize {
|
||||
mph_lookup(c.into(), TRAILING_NONSTARTERS_SALT, TRAILING_NONSTARTERS_KV,
|
||||
u8_lookup_fk, u8_lookup_fv, 0) as usize
|
||||
mph_lookup(
|
||||
c.into(),
|
||||
TRAILING_NONSTARTERS_SALT,
|
||||
TRAILING_NONSTARTERS_KV,
|
||||
u8_lookup_fk,
|
||||
u8_lookup_fv,
|
||||
0,
|
||||
) as usize
|
||||
}
|
||||
|
||||
/// Extract the key in a 24 bit key and 8 bit value packed in a u32.
|
||||
|
||||
+27
-13
@@ -9,15 +9,20 @@
|
||||
// except according to those terms.
|
||||
|
||||
//! Functions for computing canonical and compatible decompositions for Unicode characters.
|
||||
use crate::lookups::{
|
||||
canonical_fully_decomposed, compatibility_fully_decomposed, composition_table,
|
||||
};
|
||||
use std::char;
|
||||
use std::ops::FnMut;
|
||||
use crate::lookups::{canonical_fully_decomposed, composition_table, compatibility_fully_decomposed};
|
||||
|
||||
/// Compute canonical Unicode decomposition for character.
|
||||
/// See [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/)
|
||||
/// for more information.
|
||||
#[inline]
|
||||
pub fn decompose_canonical<F>(c: char, emit_char: F) where F: FnMut(char) {
|
||||
pub fn decompose_canonical<F>(c: char, emit_char: F)
|
||||
where
|
||||
F: FnMut(char),
|
||||
{
|
||||
decompose(c, canonical_fully_decomposed, emit_char)
|
||||
}
|
||||
|
||||
@@ -26,14 +31,16 @@ pub fn decompose_canonical<F>(c: char, emit_char: F) where F: FnMut(char) {
|
||||
/// for more information.
|
||||
#[inline]
|
||||
pub fn decompose_compatible<F: FnMut(char)>(c: char, emit_char: F) {
|
||||
let decompose_char = |c| compatibility_fully_decomposed(c)
|
||||
.or_else(|| canonical_fully_decomposed(c));
|
||||
let decompose_char =
|
||||
|c| compatibility_fully_decomposed(c).or_else(|| canonical_fully_decomposed(c));
|
||||
decompose(c, decompose_char, emit_char)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn decompose<D, F>(c: char, decompose_char: D, mut emit_char: F)
|
||||
where D: Fn(char) -> Option<&'static [char]>, F: FnMut(char)
|
||||
where
|
||||
D: Fn(char) -> Option<&'static [char]>,
|
||||
F: FnMut(char),
|
||||
{
|
||||
// 7-bit ASCII never decomposes
|
||||
if c <= '\x7f' {
|
||||
@@ -93,7 +100,10 @@ pub(crate) fn is_hangul_syllable(c: char) -> bool {
|
||||
// Decompose a precomposed Hangul syllable
|
||||
#[allow(unsafe_code)]
|
||||
#[inline(always)]
|
||||
fn decompose_hangul<F>(s: char, mut emit_char: F) where F: FnMut(char) {
|
||||
fn decompose_hangul<F>(s: char, mut emit_char: F)
|
||||
where
|
||||
F: FnMut(char),
|
||||
{
|
||||
let s_index = s as u32 - S_BASE;
|
||||
let l_index = s_index / N_COUNT;
|
||||
unsafe {
|
||||
@@ -113,7 +123,11 @@ fn decompose_hangul<F>(s: char, mut emit_char: F) where F: FnMut(char) {
|
||||
pub(crate) fn hangul_decomposition_length(s: char) -> usize {
|
||||
let si = s as u32 - S_BASE;
|
||||
let ti = si % T_COUNT;
|
||||
if ti > 0 { 3 } else { 2 }
|
||||
if ti > 0 {
|
||||
3
|
||||
} else {
|
||||
2
|
||||
}
|
||||
}
|
||||
|
||||
// Compose a pair of Hangul Jamo
|
||||
@@ -124,17 +138,17 @@ fn compose_hangul(a: char, b: char) -> Option<char> {
|
||||
let (a, b) = (a as u32, b as u32);
|
||||
match (a, b) {
|
||||
// Compose a leading consonant and a vowel together into an LV_Syllable
|
||||
(L_BASE ... L_LAST, V_BASE ... V_LAST) => {
|
||||
(L_BASE...L_LAST, V_BASE...V_LAST) => {
|
||||
let l_index = a - L_BASE;
|
||||
let v_index = b - V_BASE;
|
||||
let lv_index = l_index * N_COUNT + v_index * T_COUNT;
|
||||
let s = S_BASE + lv_index;
|
||||
Some(unsafe {char::from_u32_unchecked(s)})
|
||||
},
|
||||
Some(unsafe { char::from_u32_unchecked(s) })
|
||||
}
|
||||
// Compose an LV_Syllable and a trailing consonant into an LVT_Syllable
|
||||
(S_BASE ... S_LAST, T_FIRST ... T_LAST) if (a - S_BASE) % T_COUNT == 0 => {
|
||||
Some(unsafe {char::from_u32_unchecked(a + (b - T_BASE))})
|
||||
},
|
||||
(S_BASE...S_LAST, T_FIRST...T_LAST) if (a - S_BASE) % T_COUNT == 0 => {
|
||||
Some(unsafe { char::from_u32_unchecked(a + (b - T_BASE)) })
|
||||
}
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
+13
-4
@@ -20,16 +20,25 @@ fn my_hash(key: u32, salt: u32, n: usize) -> usize {
|
||||
}
|
||||
|
||||
/// Do a lookup using minimal perfect hashing.
|
||||
///
|
||||
///
|
||||
/// The table is stored as a sequence of "salt" values, then a sequence of
|
||||
/// values that contain packed key/value pairs. The strategy is to hash twice.
|
||||
/// The first hash retrieves a salt value that makes the second hash unique.
|
||||
/// The hash function doesn't have to be very good, just good enough that the
|
||||
/// resulting map is unique.
|
||||
#[inline]
|
||||
pub(crate) fn mph_lookup<KV, V, FK, FV>(x: u32, salt: &[u16], kv: &[KV], fk: FK, fv: FV,
|
||||
default: V) -> V
|
||||
where KV: Copy, FK: Fn(KV) -> u32, FV: Fn(KV) -> V
|
||||
pub(crate) fn mph_lookup<KV, V, FK, FV>(
|
||||
x: u32,
|
||||
salt: &[u16],
|
||||
kv: &[KV],
|
||||
fk: FK,
|
||||
fv: FV,
|
||||
default: V,
|
||||
) -> V
|
||||
where
|
||||
KV: Copy,
|
||||
FK: Fn(KV) -> u32,
|
||||
FV: Fn(KV) -> V,
|
||||
{
|
||||
let s = salt[my_hash(x, 0, salt.len())] as u32;
|
||||
let key_val = kv[my_hash(x, s, salt.len())];
|
||||
|
||||
+12
-15
@@ -1,7 +1,7 @@
|
||||
use crate::UnicodeNormalization;
|
||||
use crate::lookups::canonical_combining_class;
|
||||
use crate::stream_safe;
|
||||
use crate::tables;
|
||||
use crate::UnicodeNormalization;
|
||||
|
||||
/// The QuickCheck algorithm can quickly determine if a text is or isn't
|
||||
/// normalized without any allocations in many cases, but it has to be able to
|
||||
@@ -19,7 +19,9 @@ pub enum IsNormalized {
|
||||
// https://unicode.org/reports/tr15/#Detecting_Normalization_Forms
|
||||
#[inline]
|
||||
fn quick_check<F, I>(s: I, is_allowed: F, stream_safe: bool) -> IsNormalized
|
||||
where I: Iterator<Item=char>, F: Fn(char) -> IsNormalized
|
||||
where
|
||||
I: Iterator<Item = char>,
|
||||
F: Fn(char) -> IsNormalized,
|
||||
{
|
||||
let mut last_cc = 0u8;
|
||||
let mut nonstarter_count = 0;
|
||||
@@ -42,7 +44,7 @@ fn quick_check<F, I>(s: I, is_allowed: F, stream_safe: bool) -> IsNormalized
|
||||
IsNormalized::No => return IsNormalized::No,
|
||||
IsNormalized::Maybe => {
|
||||
result = IsNormalized::Maybe;
|
||||
},
|
||||
}
|
||||
}
|
||||
if stream_safe {
|
||||
let decomp = stream_safe::classify_nonstarters(ch);
|
||||
@@ -67,38 +69,37 @@ fn quick_check<F, I>(s: I, is_allowed: F, stream_safe: bool) -> IsNormalized
|
||||
/// `IsNormalized::Maybe` if further checks are necessary. In this case a check
|
||||
/// like `s.chars().nfc().eq(s.chars())` should suffice.
|
||||
#[inline]
|
||||
pub fn is_nfc_quick<I: Iterator<Item=char>>(s: I) -> IsNormalized {
|
||||
pub fn is_nfc_quick<I: Iterator<Item = char>>(s: I) -> IsNormalized {
|
||||
quick_check(s, tables::qc_nfc, false)
|
||||
}
|
||||
|
||||
|
||||
/// Quickly check if a string is in NFKC.
|
||||
#[inline]
|
||||
pub fn is_nfkc_quick<I: Iterator<Item=char>>(s: I) -> IsNormalized {
|
||||
pub fn is_nfkc_quick<I: Iterator<Item = char>>(s: I) -> IsNormalized {
|
||||
quick_check(s, tables::qc_nfkc, false)
|
||||
}
|
||||
|
||||
/// Quickly check if a string is in NFD.
|
||||
#[inline]
|
||||
pub fn is_nfd_quick<I: Iterator<Item=char>>(s: I) -> IsNormalized {
|
||||
pub fn is_nfd_quick<I: Iterator<Item = char>>(s: I) -> IsNormalized {
|
||||
quick_check(s, tables::qc_nfd, false)
|
||||
}
|
||||
|
||||
/// Quickly check if a string is in NFKD.
|
||||
#[inline]
|
||||
pub fn is_nfkd_quick<I: Iterator<Item=char>>(s: I) -> IsNormalized {
|
||||
pub fn is_nfkd_quick<I: Iterator<Item = char>>(s: I) -> IsNormalized {
|
||||
quick_check(s, tables::qc_nfkd, false)
|
||||
}
|
||||
|
||||
/// Quickly check if a string is Stream-Safe NFC.
|
||||
#[inline]
|
||||
pub fn is_nfc_stream_safe_quick<I: Iterator<Item=char>>(s: I) -> IsNormalized {
|
||||
pub fn is_nfc_stream_safe_quick<I: Iterator<Item = char>>(s: I) -> IsNormalized {
|
||||
quick_check(s, tables::qc_nfc, true)
|
||||
}
|
||||
|
||||
/// Quickly check if a string is Stream-Safe NFD.
|
||||
#[inline]
|
||||
pub fn is_nfd_stream_safe_quick<I: Iterator<Item=char>>(s: I) -> IsNormalized {
|
||||
pub fn is_nfd_stream_safe_quick<I: Iterator<Item = char>>(s: I) -> IsNormalized {
|
||||
quick_check(s, tables::qc_nfd, true)
|
||||
}
|
||||
|
||||
@@ -164,11 +165,7 @@ pub fn is_nfd_stream_safe(s: &str) -> bool {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::{
|
||||
IsNormalized,
|
||||
is_nfc_stream_safe_quick,
|
||||
is_nfd_stream_safe_quick,
|
||||
};
|
||||
use super::{is_nfc_stream_safe_quick, is_nfd_stream_safe_quick, IsNormalized};
|
||||
|
||||
#[test]
|
||||
fn test_stream_safe_nfd() {
|
||||
|
||||
+37
-43
@@ -9,8 +9,8 @@
|
||||
// except according to those terms.
|
||||
|
||||
use crate::decompose::Decompositions;
|
||||
use tinyvec::TinyVec;
|
||||
use std::fmt::{self, Write};
|
||||
use tinyvec::TinyVec;
|
||||
|
||||
#[derive(Clone)]
|
||||
enum RecompositionState {
|
||||
@@ -30,7 +30,7 @@ pub struct Recompositions<I> {
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn new_canonical<I: Iterator<Item=char>>(iter: I) -> Recompositions<I> {
|
||||
pub fn new_canonical<I: Iterator<Item = char>>(iter: I) -> Recompositions<I> {
|
||||
Recompositions {
|
||||
iter: super::decompose::new_canonical(iter),
|
||||
state: self::RecompositionState::Composing,
|
||||
@@ -41,7 +41,7 @@ pub fn new_canonical<I: Iterator<Item=char>>(iter: I) -> Recompositions<I> {
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn new_compatible<I: Iterator<Item=char>>(iter: I) -> Recompositions<I> {
|
||||
pub fn new_compatible<I: Iterator<Item = char>>(iter: I) -> Recompositions<I> {
|
||||
Recompositions {
|
||||
iter: super::decompose::new_compatible(iter),
|
||||
state: self::RecompositionState::Composing,
|
||||
@@ -51,7 +51,7 @@ pub fn new_compatible<I: Iterator<Item=char>>(iter: I) -> Recompositions<I> {
|
||||
}
|
||||
}
|
||||
|
||||
impl<I: Iterator<Item=char>> Iterator for Recompositions<I> {
|
||||
impl<I: Iterator<Item = char>> Iterator for Recompositions<I> {
|
||||
type Item = char;
|
||||
|
||||
#[inline]
|
||||
@@ -70,26 +70,24 @@ impl<I: Iterator<Item=char>> Iterator for Recompositions<I> {
|
||||
}
|
||||
self.composee = Some(ch);
|
||||
continue;
|
||||
},
|
||||
}
|
||||
Some(k) => k,
|
||||
};
|
||||
match self.last_ccc {
|
||||
None => {
|
||||
match super::char::compose(k, ch) {
|
||||
Some(r) => {
|
||||
self.composee = Some(r);
|
||||
continue;
|
||||
}
|
||||
None => {
|
||||
if ch_class == 0 {
|
||||
self.composee = Some(ch);
|
||||
return Some(k);
|
||||
}
|
||||
self.buffer.push(ch);
|
||||
self.last_ccc = Some(ch_class);
|
||||
}
|
||||
None => match super::char::compose(k, ch) {
|
||||
Some(r) => {
|
||||
self.composee = Some(r);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
None => {
|
||||
if ch_class == 0 {
|
||||
self.composee = Some(ch);
|
||||
return Some(k);
|
||||
}
|
||||
self.buffer.push(ch);
|
||||
self.last_ccc = Some(ch_class);
|
||||
}
|
||||
},
|
||||
Some(l_class) => {
|
||||
if l_class >= ch_class {
|
||||
// `ch` is blocked from `composee`
|
||||
@@ -121,36 +119,32 @@ impl<I: Iterator<Item=char>> Iterator for Recompositions<I> {
|
||||
return self.composee.take();
|
||||
}
|
||||
}
|
||||
Purging(next) => {
|
||||
match self.buffer.get(next).cloned() {
|
||||
None => {
|
||||
self.buffer.clear();
|
||||
self.state = Composing;
|
||||
}
|
||||
s => {
|
||||
self.state = Purging(next + 1);
|
||||
return s
|
||||
}
|
||||
Purging(next) => match self.buffer.get(next).cloned() {
|
||||
None => {
|
||||
self.buffer.clear();
|
||||
self.state = Composing;
|
||||
}
|
||||
}
|
||||
Finished(next) => {
|
||||
match self.buffer.get(next).cloned() {
|
||||
None => {
|
||||
self.buffer.clear();
|
||||
return self.composee.take()
|
||||
}
|
||||
s => {
|
||||
self.state = Finished(next + 1);
|
||||
return s
|
||||
}
|
||||
s => {
|
||||
self.state = Purging(next + 1);
|
||||
return s;
|
||||
}
|
||||
}
|
||||
},
|
||||
Finished(next) => match self.buffer.get(next).cloned() {
|
||||
None => {
|
||||
self.buffer.clear();
|
||||
return self.composee.take();
|
||||
}
|
||||
s => {
|
||||
self.state = Finished(next + 1);
|
||||
return s;
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<I: Iterator<Item=char> + Clone> fmt::Display for Recompositions<I> {
|
||||
impl<I: Iterator<Item = char> + Clone> fmt::Display for Recompositions<I> {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
for c in self.clone() {
|
||||
f.write_char(c)?;
|
||||
|
||||
+16
-21
@@ -1,11 +1,8 @@
|
||||
use crate::normalize::{
|
||||
hangul_decomposition_length,
|
||||
is_hangul_syllable,
|
||||
};
|
||||
use crate::lookups::{
|
||||
canonical_combining_class, canonical_fully_decomposed, compatibility_fully_decomposed,
|
||||
stream_safe_trailing_nonstarters,
|
||||
};
|
||||
use crate::normalize::{hangul_decomposition_length, is_hangul_syllable};
|
||||
use crate::tables::stream_safe_leading_nonstarters;
|
||||
|
||||
pub(crate) const MAX_NONSTARTERS: usize = 30;
|
||||
@@ -22,11 +19,15 @@ pub struct StreamSafe<I> {
|
||||
|
||||
impl<I> StreamSafe<I> {
|
||||
pub(crate) fn new(iter: I) -> Self {
|
||||
Self { iter, nonstarter_count: 0, buffer: None }
|
||||
Self {
|
||||
iter,
|
||||
nonstarter_count: 0,
|
||||
buffer: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<I: Iterator<Item=char>> Iterator for StreamSafe<I> {
|
||||
impl<I: Iterator<Item = char>> Iterator for StreamSafe<I> {
|
||||
type Item = char;
|
||||
|
||||
#[inline]
|
||||
@@ -72,7 +73,7 @@ pub(crate) fn classify_nonstarters(c: char) -> Decomposition {
|
||||
leading_nonstarters: 0,
|
||||
trailing_nonstarters: 0,
|
||||
decomposition_len: 1,
|
||||
}
|
||||
};
|
||||
}
|
||||
// Next, special case Hangul, since it's not handled by our tables.
|
||||
if is_hangul_syllable(c) {
|
||||
@@ -82,15 +83,12 @@ pub(crate) fn classify_nonstarters(c: char) -> Decomposition {
|
||||
decomposition_len: hangul_decomposition_length(c),
|
||||
};
|
||||
}
|
||||
let decomp = compatibility_fully_decomposed(c)
|
||||
.or_else(|| canonical_fully_decomposed(c));
|
||||
let decomp = compatibility_fully_decomposed(c).or_else(|| canonical_fully_decomposed(c));
|
||||
match decomp {
|
||||
Some(decomp) => {
|
||||
Decomposition {
|
||||
leading_nonstarters: stream_safe_leading_nonstarters(c),
|
||||
trailing_nonstarters: stream_safe_trailing_nonstarters(c),
|
||||
decomposition_len: decomp.len(),
|
||||
}
|
||||
Some(decomp) => Decomposition {
|
||||
leading_nonstarters: stream_safe_leading_nonstarters(c),
|
||||
trailing_nonstarters: stream_safe_trailing_nonstarters(c),
|
||||
decomposition_len: decomp.len(),
|
||||
},
|
||||
None => {
|
||||
let is_nonstarter = canonical_combining_class(c) != 0;
|
||||
@@ -106,13 +104,10 @@ pub(crate) fn classify_nonstarters(c: char) -> Decomposition {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::{
|
||||
StreamSafe,
|
||||
classify_nonstarters,
|
||||
};
|
||||
use std::char;
|
||||
use crate::normalize::decompose_compatible;
|
||||
use super::{classify_nonstarters, StreamSafe};
|
||||
use crate::lookups::canonical_combining_class;
|
||||
use crate::normalize::decompose_compatible;
|
||||
use std::char;
|
||||
|
||||
fn stream_safe(s: &str) -> String {
|
||||
StreamSafe::new(s.chars()).collect()
|
||||
|
||||
+18
-11
@@ -8,11 +8,9 @@
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
|
||||
use std::char;
|
||||
use super::UnicodeNormalization;
|
||||
use super::char::is_combining_mark;
|
||||
|
||||
use super::UnicodeNormalization;
|
||||
use std::char;
|
||||
|
||||
#[test]
|
||||
fn test_nfd() {
|
||||
@@ -21,8 +19,11 @@ fn test_nfd() {
|
||||
assert_eq!($input.nfd().to_string(), $expected);
|
||||
// A dummy iterator that is not std::str::Chars directly;
|
||||
// note that `id_func` is used to ensure `Clone` implementation
|
||||
assert_eq!($input.chars().map(|c| c).nfd().collect::<String>(), $expected);
|
||||
}
|
||||
assert_eq!(
|
||||
$input.chars().map(|c| c).nfd().collect::<String>(),
|
||||
$expected
|
||||
);
|
||||
};
|
||||
}
|
||||
t!("abc", "abc");
|
||||
t!("\u{1e0b}\u{1c4}", "d\u{307}\u{1c4}");
|
||||
@@ -41,7 +42,7 @@ fn test_nfkd() {
|
||||
macro_rules! t {
|
||||
($input: expr, $expected: expr) => {
|
||||
assert_eq!($input.nfkd().to_string(), $expected);
|
||||
}
|
||||
};
|
||||
}
|
||||
t!("abc", "abc");
|
||||
t!("\u{1e0b}\u{1c4}", "d\u{307}DZ\u{30c}");
|
||||
@@ -60,7 +61,7 @@ fn test_nfc() {
|
||||
macro_rules! t {
|
||||
($input: expr, $expected: expr) => {
|
||||
assert_eq!($input.nfc().to_string(), $expected);
|
||||
}
|
||||
};
|
||||
}
|
||||
t!("abc", "abc");
|
||||
t!("\u{1e0b}\u{1c4}", "\u{1e0b}\u{1c4}");
|
||||
@@ -72,7 +73,10 @@ fn test_nfc() {
|
||||
t!("\u{301}a", "\u{301}a");
|
||||
t!("\u{d4db}", "\u{d4db}");
|
||||
t!("\u{ac1c}", "\u{ac1c}");
|
||||
t!("a\u{300}\u{305}\u{315}\u{5ae}b", "\u{e0}\u{5ae}\u{305}\u{315}b");
|
||||
t!(
|
||||
"a\u{300}\u{305}\u{315}\u{5ae}b",
|
||||
"\u{e0}\u{5ae}\u{305}\u{315}b"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -80,7 +84,7 @@ fn test_nfkc() {
|
||||
macro_rules! t {
|
||||
($input: expr, $expected: expr) => {
|
||||
assert_eq!($input.nfkc().to_string(), $expected);
|
||||
}
|
||||
};
|
||||
}
|
||||
t!("abc", "abc");
|
||||
t!("\u{1e0b}\u{1c4}", "\u{1e0b}D\u{17d}");
|
||||
@@ -92,7 +96,10 @@ fn test_nfkc() {
|
||||
t!("\u{301}a", "\u{301}a");
|
||||
t!("\u{d4db}", "\u{d4db}");
|
||||
t!("\u{ac1c}", "\u{ac1c}");
|
||||
t!("a\u{300}\u{305}\u{315}\u{5ae}b", "\u{e0}\u{5ae}\u{305}\u{315}b");
|
||||
t!(
|
||||
"a\u{300}\u{305}\u{315}\u{5ae}b",
|
||||
"\u{e0}\u{5ae}\u{305}\u{315}b"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
+4
-4
@@ -1,8 +1,6 @@
|
||||
extern crate unicode_normalization;
|
||||
use unicode_normalization::UnicodeNormalization;
|
||||
use unicode_normalization::__test_api::{
|
||||
stream_safe,
|
||||
};
|
||||
use unicode_normalization::__test_api::stream_safe;
|
||||
|
||||
mod data {
|
||||
pub mod normalization_tests;
|
||||
@@ -21,7 +19,9 @@ fn test_normalization_tests_unaffected() {
|
||||
#[test]
|
||||
fn test_official() {
|
||||
macro_rules! normString {
|
||||
($method: ident, $input: expr) => { $input.$method().collect::<String>() }
|
||||
($method: ident, $input: expr) => {
|
||||
$input.$method().collect::<String>()
|
||||
};
|
||||
}
|
||||
|
||||
for test in NORMALIZATION_TESTS {
|
||||
|
||||
Reference in New Issue
Block a user