mirror of
https://github.com/openharmony/third_party_rust_regex.git
synced 2026-06-30 21:37:57 -04:00
style: start using rustfmt
This commit is contained in:
+18
-14
@@ -62,16 +62,18 @@ fn main() {
|
||||
|
||||
if env::var("CARGO_FEATURE_RE_DPHOBOS_DMD").is_ok() {
|
||||
process::Command::new("dmd")
|
||||
.arg("--version")
|
||||
.stdout(process::Stdio::null())
|
||||
.stderr(process::Stdio::null())
|
||||
.spawn()
|
||||
.unwrap();
|
||||
.arg("--version")
|
||||
.stdout(process::Stdio::null())
|
||||
.stderr(process::Stdio::null())
|
||||
.spawn()
|
||||
.unwrap();
|
||||
|
||||
let out_dir = env::var("OUT_DIR").unwrap();
|
||||
let out_file = &format!("-of={}/libdphobos-dmd.a", out_dir);
|
||||
let is_compile_time = env::var("CARGO_FEATURE_RE_DPHOBOS_DMD_CT").is_ok();
|
||||
let extra_args = if is_compile_time { vec!["-version=CtRegex"] } else { vec![] };
|
||||
let is_compile_time =
|
||||
env::var("CARGO_FEATURE_RE_DPHOBOS_DMD_CT").is_ok();
|
||||
let extra_args =
|
||||
if is_compile_time { vec!["-version=CtRegex"] } else { vec![] };
|
||||
|
||||
let res = process::Command::new("dmd")
|
||||
.arg("-w")
|
||||
@@ -99,17 +101,19 @@ fn main() {
|
||||
|
||||
if env::var("CARGO_FEATURE_RE_DPHOBOS_LDC").is_ok() {
|
||||
process::Command::new("ldc")
|
||||
.arg("--version")
|
||||
.stdout(process::Stdio::null())
|
||||
.stderr(process::Stdio::null())
|
||||
.spawn()
|
||||
.unwrap();
|
||||
.arg("--version")
|
||||
.stdout(process::Stdio::null())
|
||||
.stderr(process::Stdio::null())
|
||||
.spawn()
|
||||
.unwrap();
|
||||
|
||||
let out_dir = env::var("OUT_DIR").unwrap();
|
||||
let out_file = &format!("-of={}/libdphobos-ldc.a", out_dir);
|
||||
|
||||
let is_compile_time = env::var("CARGO_FEATURE_RE_DPHOBOS_LDC_CT").is_ok();
|
||||
let extra_args = if is_compile_time { vec!["-d-version=CtRegex"] } else { vec![] };
|
||||
let is_compile_time =
|
||||
env::var("CARGO_FEATURE_RE_DPHOBOS_LDC_CT").is_ok();
|
||||
let extra_args =
|
||||
if is_compile_time { vec!["-d-version=CtRegex"] } else { vec![] };
|
||||
|
||||
let res = process::Command::new("ldc")
|
||||
.arg("-w")
|
||||
|
||||
+14
-15
@@ -60,7 +60,9 @@ cfg_if! {
|
||||
// defined below. Effectively, it allows us to use the same tests for both
|
||||
// native and dynamic regexes.
|
||||
macro_rules! regex {
|
||||
($re:expr) => { ::Regex::new(&$re.to_owned()).unwrap() }
|
||||
($re:expr) => {
|
||||
::Regex::new(&$re.to_owned()).unwrap()
|
||||
};
|
||||
}
|
||||
|
||||
cfg_if! {
|
||||
@@ -119,7 +121,7 @@ cfg_if! {
|
||||
macro_rules! bench_match {
|
||||
($name:ident, $pattern:expr, $haystack:expr) => {
|
||||
bench_is_match!($name, true, regex!($pattern), $haystack);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
// USAGE: bench_not_match!(name, pattern, haystack)
|
||||
@@ -136,7 +138,7 @@ macro_rules! bench_match {
|
||||
macro_rules! bench_not_match {
|
||||
($name:ident, $pattern:expr, $haystack:expr) => {
|
||||
bench_is_match!($name, false, regex!($pattern), $haystack);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
// USAGE: bench_is_match!(name, is_match, regex, haystack)
|
||||
@@ -182,7 +184,7 @@ macro_rules! bench_is_match {
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
// USAGE: bench_find!(name, pattern, count, haystack)
|
||||
@@ -214,7 +216,7 @@ macro_rules! bench_find {
|
||||
assert_eq!($count, count)
|
||||
});
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
// USAGE: bench_captures!(name, pattern, groups, haystack);
|
||||
@@ -229,7 +231,6 @@ macro_rules! bench_find {
|
||||
// the capture groups in question.
|
||||
macro_rules! bench_captures {
|
||||
($name:ident, $pattern:expr, $count:expr, $haystack:expr) => {
|
||||
|
||||
#[cfg(feature = "re-rust")]
|
||||
#[bench]
|
||||
fn $name(b: &mut Bencher) {
|
||||
@@ -242,14 +243,12 @@ macro_rules! bench_captures {
|
||||
let re = RE.lock().unwrap();
|
||||
let text = TEXT.lock().unwrap();
|
||||
b.bytes = text.len() as u64;
|
||||
b.iter(|| {
|
||||
match re.captures(&text) {
|
||||
None => assert!(false, "no captures"),
|
||||
Some(caps) => assert_eq!($count + 1, caps.len()),
|
||||
}
|
||||
b.iter(|| match re.captures(&text) {
|
||||
None => assert!(false, "no captures"),
|
||||
Some(caps) => assert_eq!($count + 1, caps.len()),
|
||||
});
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
// USAGE: bench_is_match_set!(name, is_match, regex, haystack)
|
||||
@@ -275,9 +274,9 @@ macro_rules! bench_is_match_set {
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
// USAGE: bench_matches_set!(name, is_match, regex, haystack)
|
||||
macro_rules! bench_matches_set {
|
||||
($name:ident, $is_match:expr, $re:expr, $haystack:expr) => {
|
||||
@@ -301,7 +300,7 @@ macro_rules! bench_matches_set {
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
cfg_if! {
|
||||
|
||||
+11
-24
@@ -31,30 +31,17 @@ impl Regex {
|
||||
}
|
||||
|
||||
pub fn is_match(&self, text: &str) -> bool {
|
||||
unsafe {
|
||||
d_phobos_regex_is_match(self.re, text.into())
|
||||
}
|
||||
unsafe { d_phobos_regex_is_match(self.re, text.into()) }
|
||||
}
|
||||
|
||||
pub fn find_iter<'r, 't>(&'r self, text: &'t str) -> FindMatches<'r, 't> {
|
||||
FindMatches {
|
||||
re: self,
|
||||
text: text,
|
||||
last_end: 0,
|
||||
last_match: None,
|
||||
}
|
||||
FindMatches { re: self, text: text, last_end: 0, last_match: None }
|
||||
}
|
||||
|
||||
fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> {
|
||||
let (mut s, mut e): (usize, usize) = (0, 0);
|
||||
let matched = unsafe {
|
||||
d_phobos_regex_find_at(
|
||||
self.re,
|
||||
text.into(),
|
||||
start,
|
||||
&mut s,
|
||||
&mut e,
|
||||
)
|
||||
d_phobos_regex_find_at(self.re, text.into(), start, &mut s, &mut e)
|
||||
};
|
||||
if matched {
|
||||
Some((s, e))
|
||||
@@ -99,17 +86,17 @@ impl<'a> From<&'a str> for d_string {
|
||||
}
|
||||
}
|
||||
|
||||
extern {
|
||||
extern "C" {
|
||||
fn rt_init() -> i32;
|
||||
fn rt_term() -> i32;
|
||||
fn d_phobos_regex_new(s: d_string) -> *mut d_regex;
|
||||
fn d_phobos_regex_free(r: *mut d_regex);
|
||||
fn d_phobos_regex_is_match(r: *mut d_regex, s: d_string) -> bool;
|
||||
fn d_phobos_regex_find_at(r: *mut d_regex,
|
||||
s: d_string,
|
||||
start: usize,
|
||||
match_start: *mut usize,
|
||||
match_end: *mut usize)
|
||||
-> bool;
|
||||
fn d_phobos_regex_find_at(
|
||||
r: *mut d_regex,
|
||||
s: d_string,
|
||||
start: usize,
|
||||
match_start: *mut usize,
|
||||
match_end: *mut usize,
|
||||
) -> bool;
|
||||
}
|
||||
|
||||
|
||||
@@ -20,12 +20,9 @@ pub mod onig;
|
||||
pub mod pcre1;
|
||||
#[cfg(feature = "re-pcre2")]
|
||||
pub mod pcre2;
|
||||
#[cfg(any(
|
||||
feature = "re-stdcpp",
|
||||
feature = "re-boost",
|
||||
))]
|
||||
pub mod stdcpp;
|
||||
#[cfg(feature = "re-re2")]
|
||||
pub mod re2;
|
||||
#[cfg(any(feature = "re-stdcpp", feature = "re-boost",))]
|
||||
pub mod stdcpp;
|
||||
#[cfg(feature = "re-tcl")]
|
||||
pub mod tcl;
|
||||
|
||||
@@ -21,13 +21,15 @@ impl Regex {
|
||||
|
||||
pub fn is_match(&self, text: &str) -> bool {
|
||||
// Gah. onig's is_match function is anchored, but find is not.
|
||||
self.0.search_with_options(
|
||||
text,
|
||||
0,
|
||||
text.len(),
|
||||
onig::SearchOptions::SEARCH_OPTION_NONE,
|
||||
None,
|
||||
).is_some()
|
||||
self.0
|
||||
.search_with_options(
|
||||
text,
|
||||
0,
|
||||
text.len(),
|
||||
onig::SearchOptions::SEARCH_OPTION_NONE,
|
||||
None,
|
||||
)
|
||||
.is_some()
|
||||
}
|
||||
|
||||
pub fn find_iter<'r, 't>(
|
||||
|
||||
+31
-37
@@ -10,15 +10,14 @@
|
||||
|
||||
#![allow(non_snake_case)]
|
||||
|
||||
use std::ffi::{CString, CStr};
|
||||
use std::ffi::{CStr, CString};
|
||||
use std::fmt;
|
||||
use std::ptr;
|
||||
|
||||
use libc::{c_char, c_int, c_void};
|
||||
use libpcre_sys::{
|
||||
PCRE_UTF8, PCRE_NO_UTF8_CHECK, PCRE_ERROR_NOMATCH,
|
||||
pcre, pcre_extra,
|
||||
pcre_compile, pcre_free, pcre_study, pcre_free_study, pcre_exec,
|
||||
pcre, pcre_compile, pcre_exec, pcre_extra, pcre_free, pcre_free_study,
|
||||
pcre_study, PCRE_ERROR_NOMATCH, PCRE_NO_UTF8_CHECK, PCRE_UTF8,
|
||||
};
|
||||
|
||||
const PCRE_UCP: c_int = 0x20000000;
|
||||
@@ -53,32 +52,29 @@ impl Regex {
|
||||
let pattern = CString::new(pattern.to_owned()).unwrap();
|
||||
let mut errptr: *const c_char = ptr::null();
|
||||
let mut erroffset: c_int = 0;
|
||||
let code = unsafe { pcre_compile(
|
||||
pattern.as_ptr(),
|
||||
PCRE_UCP | PCRE_UTF8,
|
||||
&mut errptr,
|
||||
&mut erroffset,
|
||||
ptr::null(),
|
||||
) };
|
||||
let code = unsafe {
|
||||
pcre_compile(
|
||||
pattern.as_ptr(),
|
||||
PCRE_UCP | PCRE_UTF8,
|
||||
&mut errptr,
|
||||
&mut erroffset,
|
||||
ptr::null(),
|
||||
)
|
||||
};
|
||||
if code.is_null() {
|
||||
let msg = unsafe {
|
||||
CStr::from_ptr(errptr).to_str().unwrap().to_owned()
|
||||
};
|
||||
let msg =
|
||||
unsafe { CStr::from_ptr(errptr).to_str().unwrap().to_owned() };
|
||||
return Err(Error { msg: msg, offset: erroffset });
|
||||
}
|
||||
|
||||
let extra = unsafe { pcre_study(
|
||||
code,
|
||||
PCRE_STUDY_JIT_COMPLETE,
|
||||
&mut errptr,
|
||||
) };
|
||||
let extra =
|
||||
unsafe { pcre_study(code, PCRE_STUDY_JIT_COMPLETE, &mut errptr) };
|
||||
if extra.is_null() {
|
||||
if errptr.is_null() {
|
||||
panic!("unexpected error. Maybe JIT support isn't enabled?");
|
||||
}
|
||||
let msg = unsafe {
|
||||
CStr::from_ptr(errptr).to_str().unwrap().to_owned()
|
||||
};
|
||||
let msg =
|
||||
unsafe { CStr::from_ptr(errptr).to_str().unwrap().to_owned() };
|
||||
return Err(Error { msg: msg, offset: 0 });
|
||||
}
|
||||
Ok(Regex { code: code, extra: extra })
|
||||
@@ -89,26 +85,24 @@ impl Regex {
|
||||
}
|
||||
|
||||
pub fn find_iter<'r, 't>(&'r self, text: &'t str) -> FindMatches<'r, 't> {
|
||||
FindMatches {
|
||||
re: self,
|
||||
text: text,
|
||||
last_match_end: 0,
|
||||
}
|
||||
FindMatches { re: self, text: text, last_match_end: 0 }
|
||||
}
|
||||
|
||||
fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> {
|
||||
const OVEC_SIZE: usize = 15 * 3; // hopefully enough for benchmarks?
|
||||
let mut ovec: [c_int; OVEC_SIZE] = [0; OVEC_SIZE];
|
||||
let err = unsafe { pcre_exec(
|
||||
self.code,
|
||||
self.extra,
|
||||
text.as_ptr() as *const i8,
|
||||
text.len() as c_int,
|
||||
start as c_int,
|
||||
PCRE_NO_UTF8_CHECK,
|
||||
ovec.as_mut_ptr(),
|
||||
OVEC_SIZE as c_int,
|
||||
) };
|
||||
let err = unsafe {
|
||||
pcre_exec(
|
||||
self.code,
|
||||
self.extra,
|
||||
text.as_ptr() as *const i8,
|
||||
text.len() as c_int,
|
||||
start as c_int,
|
||||
PCRE_NO_UTF8_CHECK,
|
||||
ovec.as_mut_ptr(),
|
||||
OVEC_SIZE as c_int,
|
||||
)
|
||||
};
|
||||
if err == PCRE_ERROR_NOMATCH {
|
||||
None
|
||||
} else if err < 0 {
|
||||
|
||||
+45
-56
@@ -14,7 +14,7 @@ use std::fmt;
|
||||
use std::ptr;
|
||||
use std::str;
|
||||
|
||||
use libc::{c_int, c_void, size_t, uint8_t, uint32_t};
|
||||
use libc::{c_int, c_void, size_t, uint32_t, uint8_t};
|
||||
|
||||
pub struct Regex {
|
||||
code: *mut code,
|
||||
@@ -42,32 +42,30 @@ impl Regex {
|
||||
pub fn new(pattern: &str) -> Result<Regex, Error> {
|
||||
let mut error_code: c_int = 0;
|
||||
let mut error_offset: size_t = 0;
|
||||
let code = unsafe { pcre2_compile_8(
|
||||
pattern.as_ptr(),
|
||||
pattern.len(),
|
||||
// PCRE2 can get significantly faster in some cases depending
|
||||
// on the permutation of these options (in particular, dropping
|
||||
// UCP). We should endeavor to have a separate "ASCII compatible"
|
||||
// benchmark.
|
||||
PCRE2_UCP | PCRE2_UTF,
|
||||
&mut error_code,
|
||||
&mut error_offset,
|
||||
ptr::null_mut(),
|
||||
) };
|
||||
let code = unsafe {
|
||||
pcre2_compile_8(
|
||||
pattern.as_ptr(),
|
||||
pattern.len(),
|
||||
// PCRE2 can get significantly faster in some cases depending
|
||||
// on the permutation of these options (in particular, dropping
|
||||
// UCP). We should endeavor to have a separate "ASCII compatible"
|
||||
// benchmark.
|
||||
PCRE2_UCP | PCRE2_UTF,
|
||||
&mut error_code,
|
||||
&mut error_offset,
|
||||
ptr::null_mut(),
|
||||
)
|
||||
};
|
||||
if code.is_null() {
|
||||
return Err(Error {
|
||||
code: error_code,
|
||||
offset: error_offset,
|
||||
});
|
||||
return Err(Error { code: error_code, offset: error_offset });
|
||||
}
|
||||
let err = unsafe { pcre2_jit_compile_8(code, PCRE2_JIT_COMPLETE) };
|
||||
if err < 0 {
|
||||
panic!("pcre2_jit_compile_8 failed with error: {:?}", err);
|
||||
}
|
||||
let match_data = unsafe { pcre2_match_data_create_from_pattern_8(
|
||||
code,
|
||||
ptr::null_mut(),
|
||||
) };
|
||||
let match_data = unsafe {
|
||||
pcre2_match_data_create_from_pattern_8(code, ptr::null_mut())
|
||||
};
|
||||
if match_data.is_null() {
|
||||
panic!("could not allocate match_data");
|
||||
}
|
||||
@@ -83,11 +81,7 @@ impl Regex {
|
||||
}
|
||||
|
||||
pub fn find_iter<'r, 't>(&'r self, text: &'t str) -> FindMatches<'r, 't> {
|
||||
FindMatches {
|
||||
re: self,
|
||||
text: text,
|
||||
last_match_end: 0,
|
||||
}
|
||||
FindMatches { re: self, text: text, last_match_end: 0 }
|
||||
}
|
||||
|
||||
fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> {
|
||||
@@ -95,15 +89,17 @@ impl Regex {
|
||||
// way to execute a JIT match because it skips sanity checks. We also
|
||||
// explicitly disable the UTF-8 validity check, but it's probably not
|
||||
// necessary.
|
||||
let err = unsafe { pcre2_jit_match_8(
|
||||
self.code,
|
||||
text.as_ptr(),
|
||||
text.len(),
|
||||
start,
|
||||
PCRE2_NO_UTF_CHECK,
|
||||
self.match_data,
|
||||
ptr::null_mut(),
|
||||
) };
|
||||
let err = unsafe {
|
||||
pcre2_jit_match_8(
|
||||
self.code,
|
||||
text.as_ptr(),
|
||||
text.len(),
|
||||
start,
|
||||
PCRE2_NO_UTF_CHECK,
|
||||
self.match_data,
|
||||
ptr::null_mut(),
|
||||
)
|
||||
};
|
||||
if err == PCRE2_ERROR_NOMATCH {
|
||||
None
|
||||
} else if err < 0 {
|
||||
@@ -138,14 +134,15 @@ impl fmt::Debug for Error {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
const BUF_LEN: size_t = 256;
|
||||
let mut buf = [0; BUF_LEN];
|
||||
let len = unsafe { pcre2_get_error_message_8(
|
||||
self.code,
|
||||
buf.as_mut_ptr(),
|
||||
BUF_LEN,
|
||||
) };
|
||||
let len = unsafe {
|
||||
pcre2_get_error_message_8(self.code, buf.as_mut_ptr(), BUF_LEN)
|
||||
};
|
||||
if len < 0 {
|
||||
write!(f, "Unknown PCRE error. (code: {:?}, offset: {:?})",
|
||||
self.code, self.offset)
|
||||
write!(
|
||||
f,
|
||||
"Unknown PCRE error. (code: {:?}, offset: {:?})",
|
||||
self.code, self.offset
|
||||
)
|
||||
} else {
|
||||
let msg = str::from_utf8(&buf[..len as usize]).unwrap();
|
||||
write!(f, "error at {:?}: {}", self.offset, msg)
|
||||
@@ -171,7 +168,7 @@ type general_context = c_void; // unused
|
||||
|
||||
type match_context = c_void; // unused
|
||||
|
||||
extern {
|
||||
extern "C" {
|
||||
fn pcre2_compile_8(
|
||||
pattern: *const uint8_t,
|
||||
len: size_t,
|
||||
@@ -181,27 +178,19 @@ extern {
|
||||
context: *mut compile_context,
|
||||
) -> *mut code;
|
||||
|
||||
fn pcre2_code_free_8(
|
||||
code: *mut code,
|
||||
);
|
||||
fn pcre2_code_free_8(code: *mut code);
|
||||
|
||||
fn pcre2_match_data_create_from_pattern_8(
|
||||
code: *const code,
|
||||
context: *mut general_context,
|
||||
) -> *mut match_data;
|
||||
|
||||
fn pcre2_match_data_free_8(
|
||||
match_data: *mut match_data,
|
||||
);
|
||||
fn pcre2_match_data_free_8(match_data: *mut match_data);
|
||||
|
||||
fn pcre2_get_ovector_pointer_8(
|
||||
match_data: *mut match_data,
|
||||
) -> *mut size_t;
|
||||
fn pcre2_get_ovector_pointer_8(match_data: *mut match_data)
|
||||
-> *mut size_t;
|
||||
|
||||
fn pcre2_jit_compile_8(
|
||||
code: *const code,
|
||||
options: uint32_t,
|
||||
) -> c_int;
|
||||
fn pcre2_jit_compile_8(code: *const code, options: uint32_t) -> c_int;
|
||||
|
||||
fn pcre2_jit_match_8(
|
||||
code: *const code,
|
||||
|
||||
@@ -10,7 +10,7 @@
|
||||
|
||||
#![allow(non_camel_case_types)]
|
||||
|
||||
use libc::{c_uchar, c_int, c_void};
|
||||
use libc::{c_int, c_uchar, c_void};
|
||||
|
||||
/// Regex wraps an RE2 regular expression.
|
||||
///
|
||||
@@ -23,7 +23,9 @@ unsafe impl Send for Regex {}
|
||||
|
||||
impl Drop for Regex {
|
||||
fn drop(&mut self) {
|
||||
unsafe { re2_regexp_free(self.re); }
|
||||
unsafe {
|
||||
re2_regexp_free(self.re);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -42,12 +44,7 @@ impl Regex {
|
||||
}
|
||||
|
||||
pub fn find_iter<'r, 't>(&'r self, text: &'t str) -> FindMatches<'r, 't> {
|
||||
FindMatches {
|
||||
re: self,
|
||||
text: text,
|
||||
last_end: 0,
|
||||
last_match: None,
|
||||
}
|
||||
FindMatches { re: self, text: text, last_end: 0, last_match: None }
|
||||
}
|
||||
|
||||
fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> {
|
||||
@@ -143,7 +140,7 @@ impl<'a> From<&'a str> for re2_string {
|
||||
}
|
||||
}
|
||||
|
||||
extern {
|
||||
extern "C" {
|
||||
fn re2_regexp_new(pat: re2_string) -> *mut re2_regexp;
|
||||
fn re2_regexp_free(re: *mut re2_regexp);
|
||||
fn re2_regexp_match(
|
||||
|
||||
@@ -10,7 +10,7 @@
|
||||
|
||||
#![allow(non_camel_case_types)]
|
||||
|
||||
use libc::{c_uchar, c_int, c_void};
|
||||
use libc::{c_int, c_uchar, c_void};
|
||||
|
||||
/// Regex wraps a std::regex regular expression.
|
||||
///
|
||||
@@ -23,7 +23,9 @@ unsafe impl Send for Regex {}
|
||||
|
||||
impl Drop for Regex {
|
||||
fn drop(&mut self) {
|
||||
unsafe { stdcpp_regexp_free(self.re); }
|
||||
unsafe {
|
||||
stdcpp_regexp_free(self.re);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -42,12 +44,7 @@ impl Regex {
|
||||
}
|
||||
|
||||
pub fn find_iter<'r, 't>(&'r self, text: &'t str) -> FindMatches<'r, 't> {
|
||||
FindMatches {
|
||||
re: self,
|
||||
text: text,
|
||||
last_end: 0,
|
||||
last_match: None,
|
||||
}
|
||||
FindMatches { re: self, text: text, last_end: 0, last_match: None }
|
||||
}
|
||||
|
||||
fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> {
|
||||
@@ -143,7 +140,7 @@ impl<'a> From<&'a str> for stdcpp_string {
|
||||
}
|
||||
}
|
||||
|
||||
extern {
|
||||
extern "C" {
|
||||
fn stdcpp_regexp_new(pat: stdcpp_string) -> *mut stdcpp_regexp;
|
||||
fn stdcpp_regexp_free(re: *mut stdcpp_regexp);
|
||||
fn stdcpp_regexp_match(
|
||||
|
||||
+21
-39
@@ -80,8 +80,8 @@ pub struct Error(());
|
||||
|
||||
impl Regex {
|
||||
pub fn new(pattern: &str) -> Result<Regex, Error> {
|
||||
ONCE.call_once(|| {
|
||||
unsafe { Tcl_CreateInterp(); }
|
||||
ONCE.call_once(|| unsafe {
|
||||
Tcl_CreateInterp();
|
||||
});
|
||||
|
||||
let pat = Text::new(pattern.to_owned());
|
||||
@@ -91,21 +91,13 @@ impl Regex {
|
||||
if re.is_null() {
|
||||
return Err(Error(()));
|
||||
}
|
||||
Ok(Regex {
|
||||
pat: pat,
|
||||
re: re,
|
||||
})
|
||||
Ok(Regex { pat: pat, re: re })
|
||||
}
|
||||
|
||||
pub fn is_match(&self, text: &Text) -> bool {
|
||||
let result = unsafe { Tcl_RegExpExecObj(
|
||||
ptr::null_mut(),
|
||||
self.re,
|
||||
text.obj,
|
||||
0,
|
||||
1,
|
||||
0,
|
||||
) };
|
||||
let result = unsafe {
|
||||
Tcl_RegExpExecObj(ptr::null_mut(), self.re, text.obj, 0, 1, 0)
|
||||
};
|
||||
if result == -1 {
|
||||
panic!("Tcl_RegExpExecObj failed");
|
||||
}
|
||||
@@ -113,22 +105,20 @@ impl Regex {
|
||||
}
|
||||
|
||||
pub fn find_iter<'r, 't>(&'r self, text: &'t Text) -> FindMatches<'r, 't> {
|
||||
FindMatches {
|
||||
re: self,
|
||||
text: text,
|
||||
last_match: 0,
|
||||
}
|
||||
FindMatches { re: self, text: text, last_match: 0 }
|
||||
}
|
||||
|
||||
fn find_at(&self, text: &Text, start: usize) -> Option<(usize, usize)> {
|
||||
let result = unsafe { Tcl_RegExpExecObj(
|
||||
ptr::null_mut(),
|
||||
self.re,
|
||||
text.obj,
|
||||
start as c_int,
|
||||
1,
|
||||
0,
|
||||
) };
|
||||
let result = unsafe {
|
||||
Tcl_RegExpExecObj(
|
||||
ptr::null_mut(),
|
||||
self.re,
|
||||
text.obj,
|
||||
start as c_int,
|
||||
1,
|
||||
0,
|
||||
)
|
||||
};
|
||||
if result == -1 {
|
||||
panic!("Tcl_RegExpExecObj failed");
|
||||
} else if result == 0 {
|
||||
@@ -207,17 +197,12 @@ struct tcl_regexp_indices {
|
||||
end: c_long,
|
||||
}
|
||||
|
||||
extern {
|
||||
extern "C" {
|
||||
fn Tcl_CreateInterp() -> *mut tcl_interp;
|
||||
|
||||
fn Tcl_NewStringObj(
|
||||
pat: *const c_char,
|
||||
len: c_int,
|
||||
) -> *mut tcl_obj;
|
||||
fn Tcl_NewStringObj(pat: *const c_char, len: c_int) -> *mut tcl_obj;
|
||||
|
||||
fn TclFreeObj(
|
||||
obj: *mut tcl_obj,
|
||||
);
|
||||
fn TclFreeObj(obj: *mut tcl_obj);
|
||||
|
||||
fn Tcl_GetRegExpFromObj(
|
||||
int: *mut tcl_interp,
|
||||
@@ -234,8 +219,5 @@ extern {
|
||||
flags: c_int,
|
||||
) -> c_int;
|
||||
|
||||
fn Tcl_RegExpGetInfo(
|
||||
re: *mut tcl_regexp,
|
||||
info: *mut tcl_regexp_info,
|
||||
);
|
||||
fn Tcl_RegExpGetInfo(re: *mut tcl_regexp, info: *mut tcl_regexp_info);
|
||||
}
|
||||
|
||||
+11
-18
@@ -15,10 +15,7 @@ extern crate libpcre_sys;
|
||||
extern crate memmap;
|
||||
#[cfg(feature = "re-onig")]
|
||||
extern crate onig;
|
||||
#[cfg(any(
|
||||
feature = "re-rust",
|
||||
feature = "re-rust-bytes",
|
||||
))]
|
||||
#[cfg(any(feature = "re-rust", feature = "re-rust-bytes",))]
|
||||
extern crate regex;
|
||||
#[cfg(feature = "re-rust")]
|
||||
extern crate regex_syntax;
|
||||
@@ -71,9 +68,8 @@ fn main() {
|
||||
.and_then(|d| d.deserialize())
|
||||
.unwrap_or_else(|e| e.exit());
|
||||
|
||||
let mmap = unsafe {
|
||||
Mmap::map(&File::open(&args.arg_file).unwrap()).unwrap()
|
||||
};
|
||||
let mmap =
|
||||
unsafe { Mmap::map(&File::open(&args.arg_file).unwrap()).unwrap() };
|
||||
let haystack = unsafe { str::from_utf8_unchecked(&mmap) };
|
||||
|
||||
println!("{}", args.count(&haystack));
|
||||
@@ -108,10 +104,13 @@ macro_rules! nada {
|
||||
($feature:expr, $name:ident) => {
|
||||
#[cfg(not(feature = $feature))]
|
||||
fn $name(_pat: &str, _haystack: &str) -> usize {
|
||||
panic!("Support not enabled. Re-compile with '--features {}' \
|
||||
to enable.", $feature)
|
||||
panic!(
|
||||
"Support not enabled. Re-compile with '--features {}' \
|
||||
to enable.",
|
||||
$feature
|
||||
)
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
nada!("re-onig", count_onig);
|
||||
@@ -135,15 +134,9 @@ fn count_pcre2(pat: &str, haystack: &str) -> usize {
|
||||
Regex::new(pat).unwrap().find_iter(haystack).count()
|
||||
}
|
||||
|
||||
#[cfg(not(any(
|
||||
feature = "re-stdcpp",
|
||||
feature = "re-boost",
|
||||
)))]
|
||||
#[cfg(not(any(feature = "re-stdcpp", feature = "re-boost",)))]
|
||||
nada!("re-stdcpp", count_stdcpp);
|
||||
#[cfg(any(
|
||||
feature = "re-stdcpp",
|
||||
feature = "re-boost",
|
||||
))]
|
||||
#[cfg(any(feature = "re-stdcpp", feature = "re-boost",))]
|
||||
fn count_stdcpp(pat: &str, haystack: &str) -> usize {
|
||||
use ffi::stdcpp::Regex;
|
||||
Regex::new(pat).unwrap().find_iter(haystack).count()
|
||||
|
||||
@@ -11,7 +11,11 @@ use std::io::{self, Read};
|
||||
use std::sync::Arc;
|
||||
use std::thread;
|
||||
|
||||
macro_rules! regex { ($re:expr) => { ::regex::bytes::Regex::new($re).unwrap() } }
|
||||
macro_rules! regex {
|
||||
($re:expr) => {
|
||||
::regex::bytes::Regex::new($re).unwrap()
|
||||
};
|
||||
}
|
||||
|
||||
fn main() {
|
||||
let mut seq = Vec::with_capacity(51 * (1 << 20));
|
||||
|
||||
@@ -16,7 +16,11 @@ use std::io::{self, Read};
|
||||
use std::sync::Arc;
|
||||
use std::thread;
|
||||
|
||||
macro_rules! regex { ($re:expr) => { ::regex::Regex::new($re).unwrap() } }
|
||||
macro_rules! regex {
|
||||
($re:expr) => {
|
||||
::regex::Regex::new($re).unwrap()
|
||||
};
|
||||
}
|
||||
|
||||
fn main() {
|
||||
let mut seq = String::with_capacity(50 * (1 << 20));
|
||||
|
||||
@@ -6,7 +6,7 @@ macro_rules! regex {
|
||||
($re:expr) => {{
|
||||
use regex::internal::ExecBuilder;
|
||||
ExecBuilder::new($re).build().unwrap().into_regex()
|
||||
}}
|
||||
}};
|
||||
}
|
||||
|
||||
fn main() {
|
||||
|
||||
@@ -9,7 +9,11 @@ extern crate regex;
|
||||
|
||||
use std::io::{self, Read};
|
||||
|
||||
macro_rules! regex { ($re:expr) => { ::regex::Regex::new($re).unwrap() } }
|
||||
macro_rules! regex {
|
||||
($re:expr) => {
|
||||
::regex::Regex::new($re).unwrap()
|
||||
};
|
||||
}
|
||||
|
||||
fn main() {
|
||||
let mut seq = String::with_capacity(50 * (1 << 20));
|
||||
|
||||
@@ -9,7 +9,11 @@ extern crate regex;
|
||||
|
||||
use std::io::{self, Read};
|
||||
|
||||
macro_rules! regex { ($re:expr) => { ::regex::Regex::new($re).unwrap() } }
|
||||
macro_rules! regex {
|
||||
($re:expr) => {
|
||||
::regex::Regex::new($re).unwrap()
|
||||
};
|
||||
}
|
||||
|
||||
fn main() {
|
||||
let mut seq = String::with_capacity(50 * (1 << 20));
|
||||
|
||||
@@ -11,7 +11,11 @@ use std::io::{self, Read};
|
||||
use std::sync::Arc;
|
||||
use std::thread;
|
||||
|
||||
macro_rules! regex { ($re:expr) => { ::regex::Regex::new($re).unwrap() } }
|
||||
macro_rules! regex {
|
||||
($re:expr) => {
|
||||
::regex::Regex::new($re).unwrap()
|
||||
};
|
||||
}
|
||||
|
||||
fn main() {
|
||||
let mut seq = String::with_capacity(51 * (1 << 20));
|
||||
|
||||
+10
-11
@@ -1,10 +1,10 @@
|
||||
use ::std::ffi;
|
||||
use ::std::ffi::CString;
|
||||
use ::std::fmt;
|
||||
use ::std::str;
|
||||
use std::ffi;
|
||||
use std::ffi::CString;
|
||||
use std::fmt;
|
||||
use std::str;
|
||||
|
||||
use ::libc::c_char;
|
||||
use ::regex;
|
||||
use libc::c_char;
|
||||
use regex;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct Error {
|
||||
@@ -22,16 +22,15 @@ pub enum ErrorKind {
|
||||
|
||||
impl Error {
|
||||
pub fn new(kind: ErrorKind) -> Error {
|
||||
Error {
|
||||
message: None,
|
||||
kind: kind,
|
||||
}
|
||||
Error { message: None, kind: kind }
|
||||
}
|
||||
|
||||
pub fn is_err(&self) -> bool {
|
||||
match self.kind {
|
||||
ErrorKind::None => false,
|
||||
ErrorKind::Str(_) | ErrorKind::Regex(_) | ErrorKind::Nul(_) => true,
|
||||
ErrorKind::Str(_) | ErrorKind::Regex(_) | ErrorKind::Nul(_) => {
|
||||
true
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,11 +1,10 @@
|
||||
extern crate libc;
|
||||
extern crate regex;
|
||||
|
||||
|
||||
#[macro_use]
|
||||
mod macros;
|
||||
mod rure;
|
||||
mod error;
|
||||
mod rure;
|
||||
|
||||
pub use rure::*;
|
||||
pub use error::*;
|
||||
pub use rure::*;
|
||||
|
||||
+20
-23
@@ -1,9 +1,9 @@
|
||||
use std::collections::HashMap;
|
||||
use std::ops::Deref;
|
||||
use std::ffi::{CStr, CString};
|
||||
use std::ops::Deref;
|
||||
use std::ptr;
|
||||
use std::str;
|
||||
use std::slice;
|
||||
use std::str;
|
||||
|
||||
use libc::{c_char, size_t};
|
||||
use regex::bytes;
|
||||
@@ -56,20 +56,21 @@ pub struct IterCaptureNames {
|
||||
|
||||
impl Deref for Regex {
|
||||
type Target = bytes::Regex;
|
||||
fn deref(&self) -> &bytes::Regex { &self.re }
|
||||
fn deref(&self) -> &bytes::Regex {
|
||||
&self.re
|
||||
}
|
||||
}
|
||||
|
||||
impl Deref for RegexSet {
|
||||
type Target = bytes::RegexSet;
|
||||
fn deref(&self) -> &bytes::RegexSet { &self.re }
|
||||
fn deref(&self) -> &bytes::RegexSet {
|
||||
&self.re
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for Options {
|
||||
fn default() -> Options {
|
||||
Options {
|
||||
size_limit: 10 * (1<<20),
|
||||
dfa_size_limit: 2 * (1<<20),
|
||||
}
|
||||
Options { size_limit: 10 * (1 << 20), dfa_size_limit: 2 * (1 << 20) }
|
||||
}
|
||||
}
|
||||
|
||||
@@ -596,31 +597,27 @@ ffi_fn! {
|
||||
fn rure_escape(
|
||||
pattern: *const u8,
|
||||
length: size_t,
|
||||
error: *mut Error
|
||||
error: *mut Error,
|
||||
) -> *const c_char {
|
||||
let pat: &[u8] = unsafe { slice::from_raw_parts(pattern, length) };
|
||||
let str_pat = match str::from_utf8(pat) {
|
||||
Ok(val) => val,
|
||||
Err(err) => {
|
||||
unsafe {
|
||||
if !error.is_null() {
|
||||
*error = Error::new(ErrorKind::Str(err));
|
||||
}
|
||||
return ptr::null();
|
||||
Err(err) => unsafe {
|
||||
if !error.is_null() {
|
||||
*error = Error::new(ErrorKind::Str(err));
|
||||
}
|
||||
}
|
||||
return ptr::null();
|
||||
},
|
||||
};
|
||||
let esc_pat = regex::escape(str_pat);
|
||||
let c_esc_pat = match CString::new(esc_pat) {
|
||||
Ok(val) => val,
|
||||
Err(err) => {
|
||||
unsafe {
|
||||
if !error.is_null() {
|
||||
*error = Error::new(ErrorKind::Nul(err));
|
||||
}
|
||||
return ptr::null();
|
||||
Err(err) => unsafe {
|
||||
if !error.is_null() {
|
||||
*error = Error::new(ErrorKind::Nul(err));
|
||||
}
|
||||
}
|
||||
return ptr::null();
|
||||
},
|
||||
};
|
||||
c_esc_pat.into_raw() as *const c_char
|
||||
}
|
||||
|
||||
+22
-23
@@ -12,9 +12,9 @@ use std::process;
|
||||
use std::result;
|
||||
|
||||
use docopt::Docopt;
|
||||
use syntax::hir::Hir;
|
||||
use syntax::hir::literal::Literals;
|
||||
use regex::internal::{Compiler, LiteralSearcher};
|
||||
use syntax::hir::literal::Literals;
|
||||
use syntax::hir::Hir;
|
||||
|
||||
const USAGE: &'static str = "
|
||||
Usage:
|
||||
@@ -83,8 +83,8 @@ type Result<T> = result::Result<T, Box<error::Error + Send + Sync>>;
|
||||
|
||||
fn main() {
|
||||
let mut args: Args = Docopt::new(USAGE)
|
||||
.and_then(|d| d.deserialize())
|
||||
.unwrap_or_else(|e| e.exit());
|
||||
.and_then(|d| d.deserialize())
|
||||
.unwrap_or_else(|e| e.exit());
|
||||
if args.flag_dfa_reverse {
|
||||
args.flag_dfa = true;
|
||||
}
|
||||
@@ -131,9 +131,7 @@ fn cmd_ast(args: &Args) -> Result<()> {
|
||||
fn cmd_hir(args: &Args) -> Result<()> {
|
||||
use syntax::ParserBuilder;
|
||||
|
||||
let mut parser = ParserBuilder::new()
|
||||
.allow_invalid_utf8(false)
|
||||
.build();
|
||||
let mut parser = ParserBuilder::new().allow_invalid_utf8(false).build();
|
||||
let hir = parser.parse(&args.arg_pattern)?;
|
||||
println!("{:#?}", hir);
|
||||
Ok(())
|
||||
@@ -141,12 +139,11 @@ fn cmd_hir(args: &Args) -> Result<()> {
|
||||
|
||||
fn cmd_literals(args: &Args) -> Result<()> {
|
||||
let exprs = args.parse_many()?;
|
||||
let mut lits =
|
||||
if args.cmd_prefixes {
|
||||
args.literals(&exprs, |lits, e| lits.union_prefixes(e))
|
||||
} else {
|
||||
args.literals(&exprs, |lits, e| lits.union_suffixes(e))
|
||||
};
|
||||
let mut lits = if args.cmd_prefixes {
|
||||
args.literals(&exprs, |lits, e| lits.union_prefixes(e))
|
||||
} else {
|
||||
args.literals(&exprs, |lits, e| lits.union_suffixes(e))
|
||||
};
|
||||
if !args.flag_all_literals {
|
||||
if args.cmd_prefixes {
|
||||
lits = lits.unambiguous_prefixes();
|
||||
@@ -197,20 +194,20 @@ fn cmd_captures(args: &Args) -> Result<()> {
|
||||
|
||||
fn cmd_compile(args: &Args) -> Result<()> {
|
||||
let exprs = args.parse_many()?;
|
||||
let compiler =
|
||||
args.compiler()
|
||||
.bytes(args.flag_bytes)
|
||||
.only_utf8(!args.flag_bytes)
|
||||
.dfa(args.flag_dfa)
|
||||
.reverse(args.flag_dfa_reverse);
|
||||
let compiler = args
|
||||
.compiler()
|
||||
.bytes(args.flag_bytes)
|
||||
.only_utf8(!args.flag_bytes)
|
||||
.dfa(args.flag_dfa)
|
||||
.reverse(args.flag_dfa_reverse);
|
||||
let prog = compiler.compile(&exprs)?;
|
||||
print!("{:?}", prog);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn cmd_utf8_ranges(args: &Args) -> Result<()> {
|
||||
use syntax::ParserBuilder;
|
||||
use syntax::hir::{self, HirKind};
|
||||
use syntax::ParserBuilder;
|
||||
use utf8_ranges::Utf8Sequences;
|
||||
|
||||
let hir = ParserBuilder::new()
|
||||
@@ -218,9 +215,11 @@ fn cmd_utf8_ranges(args: &Args) -> Result<()> {
|
||||
.parse(&format!("[{}]", args.arg_class))?;
|
||||
let cls = match hir.into_kind() {
|
||||
HirKind::Class(hir::Class::Unicode(cls)) => cls,
|
||||
_ => return Err(
|
||||
format!("unexpected HIR, expected Unicode class").into(),
|
||||
),
|
||||
_ => {
|
||||
return Err(
|
||||
format!("unexpected HIR, expected Unicode class").into()
|
||||
)
|
||||
}
|
||||
};
|
||||
let mut char_count = 0;
|
||||
for (i, range) in cls.iter().enumerate() {
|
||||
|
||||
@@ -59,7 +59,8 @@ fn parse_medium2(b: &mut Bencher) {
|
||||
#[bench]
|
||||
fn parse_medium3(b: &mut Bencher) {
|
||||
b.iter(|| {
|
||||
let re = r"\p{age:3.2}\p{hira}\p{scx:hira}\p{alphabetic}\p{sc:Greek}\pL";
|
||||
let re =
|
||||
r"\p{age:3.2}\p{hira}\p{scx:hira}\p{alphabetic}\p{sc:Greek}\pL";
|
||||
Parser::new().parse(re).unwrap()
|
||||
});
|
||||
}
|
||||
|
||||
+86
-107
@@ -16,7 +16,7 @@ use std::cmp::Ordering;
|
||||
use std::error;
|
||||
use std::fmt;
|
||||
|
||||
pub use ast::visitor::{Visitor, visit};
|
||||
pub use ast::visitor::{visit, Visitor};
|
||||
|
||||
pub mod parse;
|
||||
pub mod print;
|
||||
@@ -202,11 +202,11 @@ impl error::Error for Error {
|
||||
EscapeUnexpectedEof => "unexpected eof (escape sequence)",
|
||||
EscapeUnrecognized => "unrecognized escape sequence",
|
||||
FlagDanglingNegation => "dangling flag negation operator",
|
||||
FlagDuplicate{..} => "duplicate flag",
|
||||
FlagRepeatedNegation{..} => "repeated negation",
|
||||
FlagDuplicate { .. } => "duplicate flag",
|
||||
FlagRepeatedNegation { .. } => "repeated negation",
|
||||
FlagUnexpectedEof => "unexpected eof (flag)",
|
||||
FlagUnrecognized => "unrecognized flag",
|
||||
GroupNameDuplicate{..} => "duplicate capture group name",
|
||||
GroupNameDuplicate { .. } => "duplicate capture group name",
|
||||
GroupNameEmpty => "empty capture group name",
|
||||
GroupNameInvalid => "invalid capture group name",
|
||||
GroupNameUnexpectedEof => "unclosed capture group name",
|
||||
@@ -233,86 +233,67 @@ impl fmt::Display for ErrorKind {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
use self::ErrorKind::*;
|
||||
match *self {
|
||||
CaptureLimitExceeded => {
|
||||
write!(f, "exceeded the maximum number of \
|
||||
capturing groups ({})", ::std::u32::MAX)
|
||||
}
|
||||
CaptureLimitExceeded => write!(
|
||||
f,
|
||||
"exceeded the maximum number of \
|
||||
capturing groups ({})",
|
||||
::std::u32::MAX
|
||||
),
|
||||
ClassEscapeInvalid => {
|
||||
write!(f, "invalid escape sequence found in character class")
|
||||
}
|
||||
ClassRangeInvalid => {
|
||||
write!(f, "invalid character class range, \
|
||||
the start must be <= the end")
|
||||
}
|
||||
ClassRangeInvalid => write!(
|
||||
f,
|
||||
"invalid character class range, \
|
||||
the start must be <= the end"
|
||||
),
|
||||
ClassRangeLiteral => {
|
||||
write!(f, "invalid range boundary, must be a literal")
|
||||
}
|
||||
ClassUnclosed => {
|
||||
write!(f, "unclosed character class")
|
||||
}
|
||||
DecimalEmpty => {
|
||||
write!(f, "decimal literal empty")
|
||||
}
|
||||
DecimalInvalid => {
|
||||
write!(f, "decimal literal invalid")
|
||||
}
|
||||
EscapeHexEmpty => {
|
||||
write!(f, "hexadecimal literal empty")
|
||||
}
|
||||
ClassUnclosed => write!(f, "unclosed character class"),
|
||||
DecimalEmpty => write!(f, "decimal literal empty"),
|
||||
DecimalInvalid => write!(f, "decimal literal invalid"),
|
||||
EscapeHexEmpty => write!(f, "hexadecimal literal empty"),
|
||||
EscapeHexInvalid => {
|
||||
write!(f, "hexadecimal literal is not a Unicode scalar value")
|
||||
}
|
||||
EscapeHexInvalidDigit => {
|
||||
write!(f, "invalid hexadecimal digit")
|
||||
}
|
||||
EscapeUnexpectedEof => {
|
||||
write!(f, "incomplete escape sequence, \
|
||||
reached end of pattern prematurely")
|
||||
}
|
||||
EscapeUnrecognized => {
|
||||
write!(f, "unrecognized escape sequence")
|
||||
}
|
||||
EscapeHexInvalidDigit => write!(f, "invalid hexadecimal digit"),
|
||||
EscapeUnexpectedEof => write!(
|
||||
f,
|
||||
"incomplete escape sequence, \
|
||||
reached end of pattern prematurely"
|
||||
),
|
||||
EscapeUnrecognized => write!(f, "unrecognized escape sequence"),
|
||||
FlagDanglingNegation => {
|
||||
write!(f, "dangling flag negation operator")
|
||||
}
|
||||
FlagDuplicate{..} => {
|
||||
write!(f, "duplicate flag")
|
||||
}
|
||||
FlagRepeatedNegation{..} => {
|
||||
FlagDuplicate { .. } => write!(f, "duplicate flag"),
|
||||
FlagRepeatedNegation { .. } => {
|
||||
write!(f, "flag negation operator repeated")
|
||||
}
|
||||
FlagUnexpectedEof => {
|
||||
write!(f, "expected flag but got end of regex")
|
||||
}
|
||||
FlagUnrecognized => {
|
||||
write!(f, "unrecognized flag")
|
||||
}
|
||||
GroupNameDuplicate{..} => {
|
||||
FlagUnrecognized => write!(f, "unrecognized flag"),
|
||||
GroupNameDuplicate { .. } => {
|
||||
write!(f, "duplicate capture group name")
|
||||
}
|
||||
GroupNameEmpty => {
|
||||
write!(f, "empty capture group name")
|
||||
}
|
||||
GroupNameInvalid => {
|
||||
write!(f, "invalid capture group character")
|
||||
}
|
||||
GroupNameUnexpectedEof => {
|
||||
write!(f, "unclosed capture group name")
|
||||
}
|
||||
GroupUnclosed => {
|
||||
write!(f, "unclosed group")
|
||||
}
|
||||
GroupUnopened => {
|
||||
write!(f, "unopened group")
|
||||
}
|
||||
NestLimitExceeded(limit) => {
|
||||
write!(f, "exceed the maximum number of \
|
||||
nested parentheses/brackets ({})", limit)
|
||||
}
|
||||
RepetitionCountInvalid => {
|
||||
write!(f, "invalid repetition count range, \
|
||||
the start must be <= the end")
|
||||
}
|
||||
GroupNameEmpty => write!(f, "empty capture group name"),
|
||||
GroupNameInvalid => write!(f, "invalid capture group character"),
|
||||
GroupNameUnexpectedEof => write!(f, "unclosed capture group name"),
|
||||
GroupUnclosed => write!(f, "unclosed group"),
|
||||
GroupUnopened => write!(f, "unopened group"),
|
||||
NestLimitExceeded(limit) => write!(
|
||||
f,
|
||||
"exceed the maximum number of \
|
||||
nested parentheses/brackets ({})",
|
||||
limit
|
||||
),
|
||||
RepetitionCountInvalid => write!(
|
||||
f,
|
||||
"invalid repetition count range, \
|
||||
the start must be <= the end"
|
||||
),
|
||||
RepetitionCountDecimalEmpty => {
|
||||
write!(f, "repetition quantifier expects a valid decimal")
|
||||
}
|
||||
@@ -325,10 +306,11 @@ impl fmt::Display for ErrorKind {
|
||||
UnsupportedBackreference => {
|
||||
write!(f, "backreferences are not supported")
|
||||
}
|
||||
UnsupportedLookAround => {
|
||||
write!(f, "look-around, including look-ahead and look-behind, \
|
||||
is not supported")
|
||||
}
|
||||
UnsupportedLookAround => write!(
|
||||
f,
|
||||
"look-around, including look-ahead and look-behind, \
|
||||
is not supported"
|
||||
),
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
@@ -384,7 +366,8 @@ impl fmt::Debug for Position {
|
||||
write!(
|
||||
f,
|
||||
"Position(o: {:?}, l: {:?}, c: {:?})",
|
||||
self.offset, self.line, self.column)
|
||||
self.offset, self.line, self.column
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -868,7 +851,8 @@ impl ClassUnicode {
|
||||
pub fn is_negated(&self) -> bool {
|
||||
match self.kind {
|
||||
ClassUnicodeKind::NamedValue {
|
||||
op: ClassUnicodeOpKind::NotEqual, ..
|
||||
op: ClassUnicodeOpKind::NotEqual,
|
||||
..
|
||||
} => !self.negated,
|
||||
_ => self.negated,
|
||||
}
|
||||
@@ -910,7 +894,7 @@ impl ClassUnicodeOpKind {
|
||||
/// Whether the op is an equality op or not.
|
||||
pub fn is_equal(&self) -> bool {
|
||||
match *self {
|
||||
ClassUnicodeOpKind::Equal|ClassUnicodeOpKind::Colon => true,
|
||||
ClassUnicodeOpKind::Equal | ClassUnicodeOpKind::Colon => true,
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
@@ -1428,26 +1412,24 @@ impl Drop for ClassSet {
|
||||
use std::mem;
|
||||
|
||||
match *self {
|
||||
ClassSet::Item(ref item) => {
|
||||
match *item {
|
||||
ClassSetItem::Empty(_)
|
||||
| ClassSetItem::Literal(_)
|
||||
| ClassSetItem::Range(_)
|
||||
| ClassSetItem::Ascii(_)
|
||||
| ClassSetItem::Unicode(_)
|
||||
| ClassSetItem::Perl(_) => return,
|
||||
ClassSetItem::Bracketed(ref x) => {
|
||||
if x.kind.is_empty() {
|
||||
return;
|
||||
}
|
||||
}
|
||||
ClassSetItem::Union(ref x) => {
|
||||
if x.items.is_empty() {
|
||||
return;
|
||||
}
|
||||
ClassSet::Item(ref item) => match *item {
|
||||
ClassSetItem::Empty(_)
|
||||
| ClassSetItem::Literal(_)
|
||||
| ClassSetItem::Range(_)
|
||||
| ClassSetItem::Ascii(_)
|
||||
| ClassSetItem::Unicode(_)
|
||||
| ClassSetItem::Perl(_) => return,
|
||||
ClassSetItem::Bracketed(ref x) => {
|
||||
if x.kind.is_empty() {
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
ClassSetItem::Union(ref x) => {
|
||||
if x.items.is_empty() {
|
||||
return;
|
||||
}
|
||||
}
|
||||
},
|
||||
ClassSet::BinaryOp(ref op) => {
|
||||
if op.lhs.is_empty() && op.rhs.is_empty() {
|
||||
return;
|
||||
@@ -1460,23 +1442,20 @@ impl Drop for ClassSet {
|
||||
let mut stack = vec![mem::replace(self, empty_set())];
|
||||
while let Some(mut set) = stack.pop() {
|
||||
match set {
|
||||
ClassSet::Item(ref mut item) => {
|
||||
match *item {
|
||||
ClassSetItem::Empty(_)
|
||||
| ClassSetItem::Literal(_)
|
||||
| ClassSetItem::Range(_)
|
||||
| ClassSetItem::Ascii(_)
|
||||
| ClassSetItem::Unicode(_)
|
||||
| ClassSetItem::Perl(_) => {}
|
||||
ClassSetItem::Bracketed(ref mut x) => {
|
||||
stack.push(mem::replace(&mut x.kind, empty_set()));
|
||||
}
|
||||
ClassSetItem::Union(ref mut x) => {
|
||||
stack.extend(
|
||||
x.items.drain(..).map(ClassSet::Item));
|
||||
}
|
||||
ClassSet::Item(ref mut item) => match *item {
|
||||
ClassSetItem::Empty(_)
|
||||
| ClassSetItem::Literal(_)
|
||||
| ClassSetItem::Range(_)
|
||||
| ClassSetItem::Ascii(_)
|
||||
| ClassSetItem::Unicode(_)
|
||||
| ClassSetItem::Perl(_) => {}
|
||||
ClassSetItem::Bracketed(ref mut x) => {
|
||||
stack.push(mem::replace(&mut x.kind, empty_set()));
|
||||
}
|
||||
}
|
||||
ClassSetItem::Union(ref mut x) => {
|
||||
stack.extend(x.items.drain(..).map(ClassSet::Item));
|
||||
}
|
||||
},
|
||||
ClassSet::BinaryOp(ref mut op) => {
|
||||
stack.push(mem::replace(&mut op.lhs, empty_set()));
|
||||
stack.push(mem::replace(&mut op.rhs, empty_set()));
|
||||
@@ -1515,7 +1494,7 @@ mod tests {
|
||||
// We run our test on a thread with a small stack size so we can
|
||||
// force the issue more easily.
|
||||
thread::Builder::new()
|
||||
.stack_size(1<<10)
|
||||
.stack_size(1 << 10)
|
||||
.spawn(run)
|
||||
.unwrap()
|
||||
.join()
|
||||
|
||||
+1665
-1229
File diff suppressed because it is too large
Load Diff
@@ -14,8 +14,8 @@ This module provides a regular expression printer for `Ast`.
|
||||
|
||||
use std::fmt;
|
||||
|
||||
use ast::{self, Ast};
|
||||
use ast::visitor::{self, Visitor};
|
||||
use ast::{self, Ast};
|
||||
|
||||
/// A builder for constructing a printer.
|
||||
///
|
||||
@@ -34,15 +34,11 @@ impl Default for PrinterBuilder {
|
||||
|
||||
impl PrinterBuilder {
|
||||
fn new() -> PrinterBuilder {
|
||||
PrinterBuilder {
|
||||
_priv: (),
|
||||
}
|
||||
PrinterBuilder { _priv: () }
|
||||
}
|
||||
|
||||
fn build(&self) -> Printer {
|
||||
Printer {
|
||||
_priv: (),
|
||||
}
|
||||
Printer { _priv: () }
|
||||
}
|
||||
}
|
||||
|
||||
@@ -95,7 +91,7 @@ impl<'p, W: fmt::Write> Visitor for Writer<'p, W> {
|
||||
Ast::Class(ast::Class::Bracketed(ref x)) => {
|
||||
self.fmt_class_bracketed_pre(x)
|
||||
}
|
||||
_ => Ok(())
|
||||
_ => Ok(()),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -253,9 +249,7 @@ impl<'p, W: fmt::Write> Writer<'p, W> {
|
||||
Special(ast::SpecialLiteralKind::FormFeed) => {
|
||||
self.wtr.write_str(r"\f")
|
||||
}
|
||||
Special(ast::SpecialLiteralKind::Tab) => {
|
||||
self.wtr.write_str(r"\t")
|
||||
}
|
||||
Special(ast::SpecialLiteralKind::Tab) => self.wtr.write_str(r"\t"),
|
||||
Special(ast::SpecialLiteralKind::LineFeed) => {
|
||||
self.wtr.write_str(r"\n")
|
||||
}
|
||||
@@ -296,16 +290,14 @@ impl<'p, W: fmt::Write> Writer<'p, W> {
|
||||
for item in &ast.items {
|
||||
match item.kind {
|
||||
FlagsItemKind::Negation => self.wtr.write_str("-"),
|
||||
FlagsItemKind::Flag(ref flag) => {
|
||||
match *flag {
|
||||
Flag::CaseInsensitive => self.wtr.write_str("i"),
|
||||
Flag::MultiLine => self.wtr.write_str("m"),
|
||||
Flag::DotMatchesNewLine => self.wtr.write_str("s"),
|
||||
Flag::SwapGreed => self.wtr.write_str("U"),
|
||||
Flag::Unicode => self.wtr.write_str("u"),
|
||||
Flag::IgnoreWhitespace => self.wtr.write_str("x"),
|
||||
}
|
||||
}
|
||||
FlagsItemKind::Flag(ref flag) => match *flag {
|
||||
Flag::CaseInsensitive => self.wtr.write_str("i"),
|
||||
Flag::MultiLine => self.wtr.write_str("m"),
|
||||
Flag::DotMatchesNewLine => self.wtr.write_str("s"),
|
||||
Flag::SwapGreed => self.wtr.write_str("U"),
|
||||
Flag::Unicode => self.wtr.write_str("u"),
|
||||
Flag::IgnoreWhitespace => self.wtr.write_str("x"),
|
||||
},
|
||||
}?;
|
||||
}
|
||||
Ok(())
|
||||
@@ -414,15 +406,16 @@ impl<'p, W: fmt::Write> Writer<'p, W> {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use ast::parse::ParserBuilder;
|
||||
use super::Printer;
|
||||
use ast::parse::ParserBuilder;
|
||||
|
||||
fn roundtrip(given: &str) {
|
||||
roundtrip_with(|b| b, given);
|
||||
}
|
||||
|
||||
fn roundtrip_with<F>(mut f: F, given: &str)
|
||||
where F: FnMut(&mut ParserBuilder) -> &mut ParserBuilder
|
||||
where
|
||||
F: FnMut(&mut ParserBuilder) -> &mut ParserBuilder,
|
||||
{
|
||||
let mut builder = ParserBuilder::new();
|
||||
f(&mut builder);
|
||||
|
||||
@@ -181,9 +181,7 @@ enum ClassFrame<'a> {
|
||||
tail: &'a [ast::ClassSetItem],
|
||||
},
|
||||
/// The stack frame used while a binary class operation.
|
||||
Binary {
|
||||
op: &'a ast::ClassSetBinaryOp,
|
||||
},
|
||||
Binary { op: &'a ast::ClassSetBinaryOp },
|
||||
/// A stack frame allocated just before descending into a binary operator's
|
||||
/// left hand child node.
|
||||
BinaryLHS {
|
||||
@@ -193,10 +191,7 @@ enum ClassFrame<'a> {
|
||||
},
|
||||
/// A stack frame allocated just before descending into a binary operator's
|
||||
/// right hand child node.
|
||||
BinaryRHS {
|
||||
op: &'a ast::ClassSetBinaryOp,
|
||||
rhs: &'a ast::ClassSet,
|
||||
},
|
||||
BinaryRHS { op: &'a ast::ClassSetBinaryOp, rhs: &'a ast::ClassSet },
|
||||
}
|
||||
|
||||
/// A representation of the inductive step when performing structural induction
|
||||
@@ -249,7 +244,7 @@ impl<'a> HeapVisitor<'a> {
|
||||
// If this is a concat/alternate, then we might have additional
|
||||
// inductive steps to process.
|
||||
if let Some(x) = self.pop(frame) {
|
||||
if let Frame::Alternation {..} = x {
|
||||
if let Frame::Alternation { .. } = x {
|
||||
visitor.visit_alternation_in()?;
|
||||
}
|
||||
ast = x.child();
|
||||
@@ -282,18 +277,13 @@ impl<'a> HeapVisitor<'a> {
|
||||
Ast::Group(ref x) => Some(Frame::Group(x)),
|
||||
Ast::Concat(ref x) if x.asts.is_empty() => None,
|
||||
Ast::Concat(ref x) => {
|
||||
Some(Frame::Concat {
|
||||
head: &x.asts[0],
|
||||
tail: &x.asts[1..],
|
||||
})
|
||||
Some(Frame::Concat { head: &x.asts[0], tail: &x.asts[1..] })
|
||||
}
|
||||
Ast::Alternation(ref x) if x.asts.is_empty() => None,
|
||||
Ast::Alternation(ref x) => {
|
||||
Some(Frame::Alternation {
|
||||
head: &x.asts[0],
|
||||
tail: &x.asts[1..],
|
||||
})
|
||||
}
|
||||
Ast::Alternation(ref x) => Some(Frame::Alternation {
|
||||
head: &x.asts[0],
|
||||
tail: &x.asts[1..],
|
||||
}),
|
||||
_ => None,
|
||||
})
|
||||
}
|
||||
@@ -308,10 +298,7 @@ impl<'a> HeapVisitor<'a> {
|
||||
if tail.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(Frame::Concat {
|
||||
head: &tail[0],
|
||||
tail: &tail[1..],
|
||||
})
|
||||
Some(Frame::Concat { head: &tail[0], tail: &tail[1..] })
|
||||
}
|
||||
}
|
||||
Frame::Alternation { tail, .. } => {
|
||||
@@ -403,18 +390,12 @@ impl<'a> HeapVisitor<'a> {
|
||||
|
||||
/// Build a stack frame for the given class node if one is needed (which
|
||||
/// occurs if and only if there are child nodes). Otherwise, return None.
|
||||
fn induct_class(
|
||||
&self,
|
||||
ast: &ClassInduct<'a>,
|
||||
) -> Option<ClassFrame<'a>> {
|
||||
fn induct_class(&self, ast: &ClassInduct<'a>) -> Option<ClassFrame<'a>> {
|
||||
match *ast {
|
||||
ClassInduct::Item(&ast::ClassSetItem::Bracketed(ref x)) => {
|
||||
match x.kind {
|
||||
ast::ClassSet::Item(ref item) => {
|
||||
Some(ClassFrame::Union {
|
||||
head: item,
|
||||
tail: &[],
|
||||
})
|
||||
Some(ClassFrame::Union { head: item, tail: &[] })
|
||||
}
|
||||
ast::ClassSet::BinaryOp(ref op) => {
|
||||
Some(ClassFrame::Binary { op: op })
|
||||
@@ -431,13 +412,11 @@ impl<'a> HeapVisitor<'a> {
|
||||
})
|
||||
}
|
||||
}
|
||||
ClassInduct::BinaryOp(op) => {
|
||||
Some(ClassFrame::BinaryLHS {
|
||||
op: op,
|
||||
lhs: &op.lhs,
|
||||
rhs: &op.rhs,
|
||||
})
|
||||
}
|
||||
ClassInduct::BinaryOp(op) => Some(ClassFrame::BinaryLHS {
|
||||
op: op,
|
||||
lhs: &op.lhs,
|
||||
rhs: &op.rhs,
|
||||
}),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
@@ -456,14 +435,11 @@ impl<'a> HeapVisitor<'a> {
|
||||
})
|
||||
}
|
||||
}
|
||||
ClassFrame::Binary {..} => None,
|
||||
ClassFrame::Binary { .. } => None,
|
||||
ClassFrame::BinaryLHS { op, rhs, .. } => {
|
||||
Some(ClassFrame::BinaryRHS {
|
||||
op: op,
|
||||
rhs: rhs,
|
||||
})
|
||||
Some(ClassFrame::BinaryRHS { op: op, rhs: rhs })
|
||||
}
|
||||
ClassFrame::BinaryRHS {..} => None,
|
||||
ClassFrame::BinaryRHS { .. } => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -514,10 +490,10 @@ impl<'a> ClassInduct<'a> {
|
||||
impl<'a> fmt::Debug for ClassFrame<'a> {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
let x = match *self {
|
||||
ClassFrame::Union{..} => "Union",
|
||||
ClassFrame::Binary{..} => "Binary",
|
||||
ClassFrame::BinaryLHS{..} => "BinaryLHS",
|
||||
ClassFrame::BinaryRHS{..} => "BinaryRHS",
|
||||
ClassFrame::Union { .. } => "Union",
|
||||
ClassFrame::Binary { .. } => "Binary",
|
||||
ClassFrame::BinaryLHS { .. } => "BinaryLHS",
|
||||
ClassFrame::BinaryRHS { .. } => "BinaryRHS",
|
||||
};
|
||||
write!(f, "{}", x)
|
||||
}
|
||||
@@ -526,31 +502,27 @@ impl<'a> fmt::Debug for ClassFrame<'a> {
|
||||
impl<'a> fmt::Debug for ClassInduct<'a> {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
let x = match *self {
|
||||
ClassInduct::Item(it) => {
|
||||
match *it {
|
||||
ast::ClassSetItem::Empty(_) => "Item(Empty)",
|
||||
ast::ClassSetItem::Literal(_) => "Item(Literal)",
|
||||
ast::ClassSetItem::Range(_) => "Item(Range)",
|
||||
ast::ClassSetItem::Ascii(_) => "Item(Ascii)",
|
||||
ast::ClassSetItem::Perl(_) => "Item(Perl)",
|
||||
ast::ClassSetItem::Unicode(_) => "Item(Unicode)",
|
||||
ast::ClassSetItem::Bracketed(_) => "Item(Bracketed)",
|
||||
ast::ClassSetItem::Union(_) => "Item(Union)",
|
||||
ClassInduct::Item(it) => match *it {
|
||||
ast::ClassSetItem::Empty(_) => "Item(Empty)",
|
||||
ast::ClassSetItem::Literal(_) => "Item(Literal)",
|
||||
ast::ClassSetItem::Range(_) => "Item(Range)",
|
||||
ast::ClassSetItem::Ascii(_) => "Item(Ascii)",
|
||||
ast::ClassSetItem::Perl(_) => "Item(Perl)",
|
||||
ast::ClassSetItem::Unicode(_) => "Item(Unicode)",
|
||||
ast::ClassSetItem::Bracketed(_) => "Item(Bracketed)",
|
||||
ast::ClassSetItem::Union(_) => "Item(Union)",
|
||||
},
|
||||
ClassInduct::BinaryOp(it) => match it.kind {
|
||||
ast::ClassSetBinaryOpKind::Intersection => {
|
||||
"BinaryOp(Intersection)"
|
||||
}
|
||||
}
|
||||
ClassInduct::BinaryOp(it) => {
|
||||
match it.kind {
|
||||
ast::ClassSetBinaryOpKind::Intersection => {
|
||||
"BinaryOp(Intersection)"
|
||||
}
|
||||
ast::ClassSetBinaryOpKind::Difference => {
|
||||
"BinaryOp(Difference)"
|
||||
}
|
||||
ast::ClassSetBinaryOpKind::SymmetricDifference => {
|
||||
"BinaryOp(SymmetricDifference)"
|
||||
}
|
||||
ast::ClassSetBinaryOpKind::Difference => {
|
||||
"BinaryOp(Difference)"
|
||||
}
|
||||
}
|
||||
ast::ClassSetBinaryOpKind::SymmetricDifference => {
|
||||
"BinaryOp(SymmetricDifference)"
|
||||
}
|
||||
},
|
||||
};
|
||||
write!(f, "{}", x)
|
||||
}
|
||||
|
||||
@@ -117,8 +117,11 @@ impl<'e, E: fmt::Display> fmt::Display for Formatter<'e, E> {
|
||||
for span in &spans.multi_line {
|
||||
notes.push(format!(
|
||||
"on line {} (column {}) through line {} (column {})",
|
||||
span.start.line, span.start.column,
|
||||
span.end.line, span.end.column - 1));
|
||||
span.start.line,
|
||||
span.start.column,
|
||||
span.end.line,
|
||||
span.end.column - 1
|
||||
));
|
||||
}
|
||||
writeln!(f, "{}", notes.join("\n"))?;
|
||||
}
|
||||
@@ -174,11 +177,7 @@ impl<'p> Spans<'p> {
|
||||
line_count += 1;
|
||||
}
|
||||
let line_number_width =
|
||||
if line_count <= 1 {
|
||||
0
|
||||
} else {
|
||||
line_count.to_string().len()
|
||||
};
|
||||
if line_count <= 1 { 0 } else { line_count.to_string().len() };
|
||||
let mut spans = Spans {
|
||||
pattern: &fmter.pattern,
|
||||
line_number_width: line_number_width,
|
||||
@@ -310,11 +309,14 @@ mod tests {
|
||||
// See: https://github.com/rust-lang/regex/issues/545
|
||||
#[test]
|
||||
fn repetition_quantifier_expects_a_valid_decimal() {
|
||||
assert_panic_message(r"\\u{[^}]*}", r#"
|
||||
assert_panic_message(
|
||||
r"\\u{[^}]*}",
|
||||
r#"
|
||||
regex parse error:
|
||||
\\u{[^}]*}
|
||||
^
|
||||
error: repetition quantifier expects a valid decimal
|
||||
"#);
|
||||
"#,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -40,7 +40,7 @@ impl<I: Interval> IntervalSet<I> {
|
||||
///
|
||||
/// The given ranges do not need to be in any specific order, and ranges
|
||||
/// may overlap.
|
||||
pub fn new<T: IntoIterator<Item=I>>(intervals: T) -> IntervalSet<I> {
|
||||
pub fn new<T: IntoIterator<Item = I>>(intervals: T) -> IntervalSet<I> {
|
||||
let mut set = IntervalSet { ranges: intervals.into_iter().collect() };
|
||||
set.canonicalize();
|
||||
set
|
||||
@@ -146,8 +146,7 @@ impl<I: Interval> IntervalSet<I> {
|
||||
// each class.
|
||||
let drain_end = self.ranges.len();
|
||||
let (mut a, mut b) = (0, 0);
|
||||
'LOOP:
|
||||
while a < drain_end && b < other.ranges.len() {
|
||||
'LOOP: while a < drain_end && b < other.ranges.len() {
|
||||
// Basically, the easy cases are when neither range overlaps with
|
||||
// each other. If the `b` range is less than our current `a`
|
||||
// range, then we can skip it and move on.
|
||||
@@ -447,11 +446,13 @@ pub trait Interval:
|
||||
let (lower1, upper1) = (self.lower(), self.upper());
|
||||
let (lower2, upper2) = (other.lower(), other.upper());
|
||||
(lower2 <= lower1 && lower1 <= upper2)
|
||||
&& (lower2 <= upper1 && upper1 <= upper2)
|
||||
&& (lower2 <= upper1 && upper1 <= upper2)
|
||||
}
|
||||
}
|
||||
|
||||
pub trait Bound: Copy + Clone + Debug + Eq + PartialEq + PartialOrd + Ord {
|
||||
pub trait Bound:
|
||||
Copy + Clone + Debug + Eq + PartialEq + PartialOrd + Ord
|
||||
{
|
||||
fn min_value() -> Self;
|
||||
fn max_value() -> Self;
|
||||
fn as_u32(self) -> u32;
|
||||
@@ -460,17 +461,33 @@ pub trait Bound: Copy + Clone + Debug + Eq + PartialEq + PartialOrd + Ord {
|
||||
}
|
||||
|
||||
impl Bound for u8 {
|
||||
fn min_value() -> Self { u8::MIN }
|
||||
fn max_value() -> Self { u8::MAX }
|
||||
fn as_u32(self) -> u32 { self as u32 }
|
||||
fn increment(self) -> Self { self.checked_add(1).unwrap() }
|
||||
fn decrement(self) -> Self { self.checked_sub(1).unwrap() }
|
||||
fn min_value() -> Self {
|
||||
u8::MIN
|
||||
}
|
||||
fn max_value() -> Self {
|
||||
u8::MAX
|
||||
}
|
||||
fn as_u32(self) -> u32 {
|
||||
self as u32
|
||||
}
|
||||
fn increment(self) -> Self {
|
||||
self.checked_add(1).unwrap()
|
||||
}
|
||||
fn decrement(self) -> Self {
|
||||
self.checked_sub(1).unwrap()
|
||||
}
|
||||
}
|
||||
|
||||
impl Bound for char {
|
||||
fn min_value() -> Self { '\x00' }
|
||||
fn max_value() -> Self { '\u{10FFFF}' }
|
||||
fn as_u32(self) -> u32 { self as u32 }
|
||||
fn min_value() -> Self {
|
||||
'\x00'
|
||||
}
|
||||
fn max_value() -> Self {
|
||||
'\u{10FFFF}'
|
||||
}
|
||||
fn as_u32(self) -> u32 {
|
||||
self as u32
|
||||
}
|
||||
|
||||
fn increment(self) -> Self {
|
||||
match self {
|
||||
|
||||
+345
-205
@@ -61,11 +61,7 @@ pub struct Literal {
|
||||
impl Literals {
|
||||
/// Returns a new empty set of literals using default limits.
|
||||
pub fn empty() -> Literals {
|
||||
Literals {
|
||||
lits: vec![],
|
||||
limit_size: 250,
|
||||
limit_class: 10,
|
||||
}
|
||||
Literals { lits: vec![], limit_size: 250, limit_class: 10 }
|
||||
}
|
||||
|
||||
/// Returns a set of literal prefixes extracted from the given `Hir`.
|
||||
@@ -162,8 +158,7 @@ impl Literals {
|
||||
/// Returns a new empty set of literals using this set's limits.
|
||||
pub fn to_empty(&self) -> Literals {
|
||||
let mut lits = Literals::empty();
|
||||
lits.set_limit_size(self.limit_size)
|
||||
.set_limit_class(self.limit_class);
|
||||
lits.set_limit_size(self.limit_size).set_limit_class(self.limit_class);
|
||||
lits
|
||||
}
|
||||
|
||||
@@ -177,10 +172,8 @@ impl Literals {
|
||||
for lit in &self.lits[1..] {
|
||||
len = cmp::min(
|
||||
len,
|
||||
lit.iter()
|
||||
.zip(lit0)
|
||||
.take_while(|&(a, b)| a == b)
|
||||
.count());
|
||||
lit.iter().zip(lit0).take_while(|&(a, b)| a == b).count(),
|
||||
);
|
||||
}
|
||||
&self.lits[0][..len]
|
||||
}
|
||||
@@ -196,10 +189,11 @@ impl Literals {
|
||||
len = cmp::min(
|
||||
len,
|
||||
lit.iter()
|
||||
.rev()
|
||||
.zip(lit0.iter().rev())
|
||||
.take_while(|&(a, b)| a == b)
|
||||
.count());
|
||||
.rev()
|
||||
.zip(lit0.iter().rev())
|
||||
.take_while(|&(a, b)| a == b)
|
||||
.count(),
|
||||
);
|
||||
}
|
||||
&self.lits[0][self.lits[0].len() - len..]
|
||||
}
|
||||
@@ -243,8 +237,7 @@ impl Literals {
|
||||
}
|
||||
let mut old: Vec<Literal> = self.lits.iter().cloned().collect();
|
||||
let mut new = self.to_empty();
|
||||
'OUTER:
|
||||
while let Some(mut candidate) = old.pop() {
|
||||
'OUTER: while let Some(mut candidate) = old.pop() {
|
||||
if candidate.is_empty() {
|
||||
continue;
|
||||
}
|
||||
@@ -437,7 +430,8 @@ impl Literals {
|
||||
}
|
||||
let mut i = 1;
|
||||
while size + (i * self.lits.len()) <= self.limit_size
|
||||
&& i < bytes.len() {
|
||||
&& i < bytes.len()
|
||||
{
|
||||
i += 1;
|
||||
}
|
||||
for lit in &mut self.lits {
|
||||
@@ -579,22 +573,20 @@ impl Literals {
|
||||
}
|
||||
// This is an approximation since codepoints in a char class can encode
|
||||
// to 1-4 bytes.
|
||||
let new_byte_count =
|
||||
if self.lits.is_empty() {
|
||||
size
|
||||
} else {
|
||||
self.lits
|
||||
.iter()
|
||||
.fold(0, |accum, lit| {
|
||||
accum + if lit.is_cut() {
|
||||
// If the literal is cut, then we'll never add
|
||||
// anything to it, so don't count it.
|
||||
0
|
||||
} else {
|
||||
(lit.len() + 1) * size
|
||||
}
|
||||
})
|
||||
};
|
||||
let new_byte_count = if self.lits.is_empty() {
|
||||
size
|
||||
} else {
|
||||
self.lits.iter().fold(0, |accum, lit| {
|
||||
accum
|
||||
+ if lit.is_cut() {
|
||||
// If the literal is cut, then we'll never add
|
||||
// anything to it, so don't count it.
|
||||
0
|
||||
} else {
|
||||
(lit.len() + 1) * size
|
||||
}
|
||||
})
|
||||
};
|
||||
new_byte_count > self.limit_size
|
||||
}
|
||||
}
|
||||
@@ -621,34 +613,27 @@ fn prefixes(expr: &Hir, lits: &mut Literals) {
|
||||
HirKind::Group(hir::Group { ref hir, .. }) => {
|
||||
prefixes(&**hir, lits);
|
||||
}
|
||||
HirKind::Repetition(ref x) => {
|
||||
match x.kind {
|
||||
hir::RepetitionKind::ZeroOrOne => {
|
||||
repeat_zero_or_one_literals(&x.hir, lits, prefixes);
|
||||
}
|
||||
hir::RepetitionKind::ZeroOrMore => {
|
||||
repeat_zero_or_more_literals(&x.hir, lits, prefixes);
|
||||
}
|
||||
hir::RepetitionKind::OneOrMore => {
|
||||
repeat_one_or_more_literals(&x.hir, lits, prefixes);
|
||||
}
|
||||
hir::RepetitionKind::Range(ref rng) => {
|
||||
let (min, max) = match *rng {
|
||||
hir::RepetitionRange::Exactly(m) => {
|
||||
(m, Some(m))
|
||||
}
|
||||
hir::RepetitionRange::AtLeast(m) => {
|
||||
(m, None)
|
||||
}
|
||||
hir::RepetitionRange::Bounded(m, n) => {
|
||||
(m, Some(n))
|
||||
}
|
||||
};
|
||||
repeat_range_literals(
|
||||
&x.hir, min, max, x.greedy, lits, prefixes)
|
||||
}
|
||||
HirKind::Repetition(ref x) => match x.kind {
|
||||
hir::RepetitionKind::ZeroOrOne => {
|
||||
repeat_zero_or_one_literals(&x.hir, lits, prefixes);
|
||||
}
|
||||
}
|
||||
hir::RepetitionKind::ZeroOrMore => {
|
||||
repeat_zero_or_more_literals(&x.hir, lits, prefixes);
|
||||
}
|
||||
hir::RepetitionKind::OneOrMore => {
|
||||
repeat_one_or_more_literals(&x.hir, lits, prefixes);
|
||||
}
|
||||
hir::RepetitionKind::Range(ref rng) => {
|
||||
let (min, max) = match *rng {
|
||||
hir::RepetitionRange::Exactly(m) => (m, Some(m)),
|
||||
hir::RepetitionRange::AtLeast(m) => (m, None),
|
||||
hir::RepetitionRange::Bounded(m, n) => (m, Some(n)),
|
||||
};
|
||||
repeat_range_literals(
|
||||
&x.hir, min, max, x.greedy, lits, prefixes,
|
||||
)
|
||||
}
|
||||
},
|
||||
HirKind::Concat(ref es) if es.is_empty() => {}
|
||||
HirKind::Concat(ref es) if es.len() == 1 => prefixes(&es[0], lits),
|
||||
HirKind::Concat(ref es) => {
|
||||
@@ -704,34 +689,27 @@ fn suffixes(expr: &Hir, lits: &mut Literals) {
|
||||
HirKind::Group(hir::Group { ref hir, .. }) => {
|
||||
suffixes(&**hir, lits);
|
||||
}
|
||||
HirKind::Repetition(ref x) => {
|
||||
match x.kind {
|
||||
hir::RepetitionKind::ZeroOrOne => {
|
||||
repeat_zero_or_one_literals(&x.hir, lits, suffixes);
|
||||
}
|
||||
hir::RepetitionKind::ZeroOrMore => {
|
||||
repeat_zero_or_more_literals(&x.hir, lits, suffixes);
|
||||
}
|
||||
hir::RepetitionKind::OneOrMore => {
|
||||
repeat_one_or_more_literals(&x.hir, lits, suffixes);
|
||||
}
|
||||
hir::RepetitionKind::Range(ref rng) => {
|
||||
let (min, max) = match *rng {
|
||||
hir::RepetitionRange::Exactly(m) => {
|
||||
(m, Some(m))
|
||||
}
|
||||
hir::RepetitionRange::AtLeast(m) => {
|
||||
(m, None)
|
||||
}
|
||||
hir::RepetitionRange::Bounded(m, n) => {
|
||||
(m, Some(n))
|
||||
}
|
||||
};
|
||||
repeat_range_literals(
|
||||
&x.hir, min, max, x.greedy, lits, suffixes)
|
||||
}
|
||||
HirKind::Repetition(ref x) => match x.kind {
|
||||
hir::RepetitionKind::ZeroOrOne => {
|
||||
repeat_zero_or_one_literals(&x.hir, lits, suffixes);
|
||||
}
|
||||
}
|
||||
hir::RepetitionKind::ZeroOrMore => {
|
||||
repeat_zero_or_more_literals(&x.hir, lits, suffixes);
|
||||
}
|
||||
hir::RepetitionKind::OneOrMore => {
|
||||
repeat_one_or_more_literals(&x.hir, lits, suffixes);
|
||||
}
|
||||
hir::RepetitionKind::Range(ref rng) => {
|
||||
let (min, max) = match *rng {
|
||||
hir::RepetitionRange::Exactly(m) => (m, Some(m)),
|
||||
hir::RepetitionRange::AtLeast(m) => (m, None),
|
||||
hir::RepetitionRange::Bounded(m, n) => (m, Some(n)),
|
||||
};
|
||||
repeat_range_literals(
|
||||
&x.hir, min, max, x.greedy, lits, suffixes,
|
||||
)
|
||||
}
|
||||
},
|
||||
HirKind::Concat(ref es) if es.is_empty() => {}
|
||||
HirKind::Concat(ref es) if es.len() == 1 => suffixes(&es[0], lits),
|
||||
HirKind::Concat(ref es) => {
|
||||
@@ -822,11 +800,14 @@ fn repeat_range_literals<F: FnMut(&Hir, &mut Literals)>(
|
||||
// This is a bit conservative. If `max` is set, then we could
|
||||
// treat this as a finite set of alternations. For now, we
|
||||
// just treat it as `e*`.
|
||||
f(&Hir::repetition(hir::Repetition {
|
||||
kind: hir::RepetitionKind::ZeroOrMore,
|
||||
greedy: greedy,
|
||||
hir: Box::new(e.clone()),
|
||||
}), lits);
|
||||
f(
|
||||
&Hir::repetition(hir::Repetition {
|
||||
kind: hir::RepetitionKind::ZeroOrMore,
|
||||
greedy: greedy,
|
||||
hir: Box::new(e.clone()),
|
||||
}),
|
||||
lits,
|
||||
);
|
||||
} else {
|
||||
if min > 0 {
|
||||
let n = cmp::min(lits.limit_size, min as usize);
|
||||
@@ -869,10 +850,10 @@ fn alternate_literals<F: FnMut(&Hir, &mut Literals)>(
|
||||
impl fmt::Debug for Literals {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
f.debug_struct("Literals")
|
||||
.field("lits", &self.lits)
|
||||
.field("limit_size", &self.limit_size)
|
||||
.field("limit_class", &self.limit_class)
|
||||
.finish()
|
||||
.field("lits", &self.lits)
|
||||
.field("limit_size", &self.limit_size)
|
||||
.field("limit_class", &self.limit_class)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -921,16 +902,22 @@ impl fmt::Debug for Literal {
|
||||
}
|
||||
|
||||
impl AsRef<[u8]> for Literal {
|
||||
fn as_ref(&self) -> &[u8] { &self.v }
|
||||
fn as_ref(&self) -> &[u8] {
|
||||
&self.v
|
||||
}
|
||||
}
|
||||
|
||||
impl ops::Deref for Literal {
|
||||
type Target = Vec<u8>;
|
||||
fn deref(&self) -> &Vec<u8> { &self.v }
|
||||
fn deref(&self) -> &Vec<u8> {
|
||||
&self.v
|
||||
}
|
||||
}
|
||||
|
||||
impl ops::DerefMut for Literal {
|
||||
fn deref_mut(&mut self) -> &mut Vec<u8> { &mut self.v }
|
||||
fn deref_mut(&mut self) -> &mut Vec<u8> {
|
||||
&mut self.v
|
||||
}
|
||||
}
|
||||
|
||||
fn position(needle: &[u8], mut haystack: &[u8]) -> Option<usize> {
|
||||
@@ -986,24 +973,22 @@ fn escape_byte(byte: u8) -> String {
|
||||
}
|
||||
|
||||
fn cls_char_count(cls: &hir::ClassUnicode) -> usize {
|
||||
cls.iter()
|
||||
.map(|&r| 1 + (r.end as u32) - (r.start as u32))
|
||||
.sum::<u32>() as usize
|
||||
cls.iter().map(|&r| 1 + (r.end as u32) - (r.start as u32)).sum::<u32>()
|
||||
as usize
|
||||
}
|
||||
|
||||
fn cls_byte_count(cls: &hir::ClassBytes) -> usize {
|
||||
cls.iter()
|
||||
.map(|&r| 1 + (r.end as u32) - (r.start as u32))
|
||||
.sum::<u32>() as usize
|
||||
cls.iter().map(|&r| 1 + (r.end as u32) - (r.start as u32)).sum::<u32>()
|
||||
as usize
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::fmt;
|
||||
|
||||
use ParserBuilder;
|
||||
use super::{escape_bytes, Literal, Literals};
|
||||
use hir::Hir;
|
||||
use super::{Literals, Literal, escape_bytes};
|
||||
use ParserBuilder;
|
||||
|
||||
// To make test failures easier to read.
|
||||
#[derive(Debug, Eq, PartialEq)]
|
||||
@@ -1014,15 +999,13 @@ mod tests {
|
||||
fn escape_lits(blits: &[Literal]) -> Vec<ULiteral> {
|
||||
let mut ulits = vec![];
|
||||
for blit in blits {
|
||||
ulits.push(ULiteral {
|
||||
v: escape_bytes(&blit),
|
||||
cut: blit.is_cut(),
|
||||
});
|
||||
ulits
|
||||
.push(ULiteral { v: escape_bytes(&blit), cut: blit.is_cut() });
|
||||
}
|
||||
ulits
|
||||
}
|
||||
|
||||
fn create_lits<I: IntoIterator<Item=Literal>>(it: I) -> Literals {
|
||||
fn create_lits<I: IntoIterator<Item = Literal>>(it: I) -> Literals {
|
||||
Literals {
|
||||
lits: it.into_iter().collect(),
|
||||
limit_size: 0,
|
||||
@@ -1038,7 +1021,9 @@ mod tests {
|
||||
}
|
||||
|
||||
impl ULiteral {
|
||||
fn is_cut(&self) -> bool { self.cut }
|
||||
fn is_cut(&self) -> bool {
|
||||
self.cut
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for ULiteral {
|
||||
@@ -1131,18 +1116,36 @@ mod tests {
|
||||
test_lit!(pfx_one_lit2, prefixes, "abc", M("abc"));
|
||||
test_lit!(pfx_one_lit3, prefixes, "(?u)☃", M("\\xe2\\x98\\x83"));
|
||||
test_lit!(pfx_one_lit4, prefixes, "(?ui)☃", M("\\xe2\\x98\\x83"));
|
||||
test_lit!(pfx_class1, prefixes, "[1-4]",
|
||||
M("1"), M("2"), M("3"), M("4"));
|
||||
test_lit!(pfx_class2, prefixes, "(?u)[☃Ⅰ]",
|
||||
M("\\xe2\\x85\\xa0"), M("\\xe2\\x98\\x83"));
|
||||
test_lit!(pfx_class3, prefixes, "(?ui)[☃Ⅰ]",
|
||||
M("\\xe2\\x85\\xa0"), M("\\xe2\\x85\\xb0"),
|
||||
M("\\xe2\\x98\\x83"));
|
||||
test_lit!(pfx_one_lit_casei1, prefixes, "(?i)a",
|
||||
M("A"), M("a"));
|
||||
test_lit!(pfx_one_lit_casei2, prefixes, "(?i)abc",
|
||||
M("ABC"), M("aBC"), M("AbC"), M("abC"),
|
||||
M("ABc"), M("aBc"), M("Abc"), M("abc"));
|
||||
test_lit!(pfx_class1, prefixes, "[1-4]", M("1"), M("2"), M("3"), M("4"));
|
||||
test_lit!(
|
||||
pfx_class2,
|
||||
prefixes,
|
||||
"(?u)[☃Ⅰ]",
|
||||
M("\\xe2\\x85\\xa0"),
|
||||
M("\\xe2\\x98\\x83")
|
||||
);
|
||||
test_lit!(
|
||||
pfx_class3,
|
||||
prefixes,
|
||||
"(?ui)[☃Ⅰ]",
|
||||
M("\\xe2\\x85\\xa0"),
|
||||
M("\\xe2\\x85\\xb0"),
|
||||
M("\\xe2\\x98\\x83")
|
||||
);
|
||||
test_lit!(pfx_one_lit_casei1, prefixes, "(?i)a", M("A"), M("a"));
|
||||
test_lit!(
|
||||
pfx_one_lit_casei2,
|
||||
prefixes,
|
||||
"(?i)abc",
|
||||
M("ABC"),
|
||||
M("aBC"),
|
||||
M("AbC"),
|
||||
M("abC"),
|
||||
M("ABc"),
|
||||
M("aBc"),
|
||||
M("Abc"),
|
||||
M("abc")
|
||||
);
|
||||
test_lit!(pfx_group1, prefixes, "(a)", M("a"));
|
||||
test_lit!(pfx_rep_zero_or_one1, prefixes, "a?");
|
||||
test_lit!(pfx_rep_zero_or_one2, prefixes, "(?:abc)?");
|
||||
@@ -1162,11 +1165,28 @@ mod tests {
|
||||
// Test regexes with concatenations.
|
||||
test_lit!(pfx_cat1, prefixes, "(?:a)(?:b)", M("ab"));
|
||||
test_lit!(pfx_cat2, prefixes, "[ab]z", M("az"), M("bz"));
|
||||
test_lit!(pfx_cat3, prefixes, "(?i)[ab]z",
|
||||
M("AZ"), M("BZ"), M("aZ"), M("bZ"),
|
||||
M("Az"), M("Bz"), M("az"), M("bz"));
|
||||
test_lit!(pfx_cat4, prefixes, "[ab][yz]",
|
||||
M("ay"), M("by"), M("az"), M("bz"));
|
||||
test_lit!(
|
||||
pfx_cat3,
|
||||
prefixes,
|
||||
"(?i)[ab]z",
|
||||
M("AZ"),
|
||||
M("BZ"),
|
||||
M("aZ"),
|
||||
M("bZ"),
|
||||
M("Az"),
|
||||
M("Bz"),
|
||||
M("az"),
|
||||
M("bz")
|
||||
);
|
||||
test_lit!(
|
||||
pfx_cat4,
|
||||
prefixes,
|
||||
"[ab][yz]",
|
||||
M("ay"),
|
||||
M("by"),
|
||||
M("az"),
|
||||
M("bz")
|
||||
);
|
||||
test_lit!(pfx_cat5, prefixes, "a*b", C("a"), M("b"));
|
||||
test_lit!(pfx_cat6, prefixes, "a*b*c", C("a"), C("b"), M("c"));
|
||||
test_lit!(pfx_cat7, prefixes, "a*b*c+", C("a"), C("b"), C("c"));
|
||||
@@ -1190,8 +1210,17 @@ mod tests {
|
||||
test_lit!(pfx_alt4, prefixes, "a|b*");
|
||||
test_lit!(pfx_alt5, prefixes, "a|b+", M("a"), C("b"));
|
||||
test_lit!(pfx_alt6, prefixes, "a|(?:b|c*)");
|
||||
test_lit!(pfx_alt7, prefixes, "(a|b)*c|(a|ab)*c",
|
||||
C("a"), C("b"), M("c"), C("a"), C("ab"), M("c"));
|
||||
test_lit!(
|
||||
pfx_alt7,
|
||||
prefixes,
|
||||
"(a|b)*c|(a|ab)*c",
|
||||
C("a"),
|
||||
C("b"),
|
||||
M("c"),
|
||||
C("a"),
|
||||
C("ab"),
|
||||
M("c")
|
||||
);
|
||||
test_lit!(pfx_alt8, prefixes, "a*b|c", C("a"), M("b"), M("c"));
|
||||
|
||||
// Test regexes with empty assertions.
|
||||
@@ -1228,7 +1257,11 @@ mod tests {
|
||||
pfx_crazy1,
|
||||
prefixes,
|
||||
r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]",
|
||||
C("Mo\\'am"), C("Mu\\'am"), C("Moam"), C("Muam"));
|
||||
C("Mo\\'am"),
|
||||
C("Mu\\'am"),
|
||||
C("Moam"),
|
||||
C("Muam")
|
||||
);
|
||||
|
||||
// ************************************************************************
|
||||
// Tests for quiting prefix literal search.
|
||||
@@ -1269,16 +1302,41 @@ mod tests {
|
||||
test_exhausted!(pfx_exhausted1, prefixes, "[a-z]");
|
||||
test_exhausted!(pfx_exhausted2, prefixes, "[a-z]*A");
|
||||
test_exhausted!(pfx_exhausted3, prefixes, "A[a-z]Z", C("A"));
|
||||
test_exhausted!(pfx_exhausted4, prefixes, "(?i)foobar",
|
||||
C("FO"), C("fO"), C("Fo"), C("fo"));
|
||||
test_exhausted!(pfx_exhausted5, prefixes, "(?:ab){100}",
|
||||
C("abababababababababab"));
|
||||
test_exhausted!(pfx_exhausted6, prefixes, "(?:(?:ab){100})*cd",
|
||||
C("ababababab"), M("cd"));
|
||||
test_exhausted!(pfx_exhausted7, prefixes, "z(?:(?:ab){100})*cd",
|
||||
C("zababababab"), M("zcd"));
|
||||
test_exhausted!(pfx_exhausted8, prefixes, "aaaaaaaaaaaaaaaaaaaaz",
|
||||
C("aaaaaaaaaaaaaaaaaaaa"));
|
||||
test_exhausted!(
|
||||
pfx_exhausted4,
|
||||
prefixes,
|
||||
"(?i)foobar",
|
||||
C("FO"),
|
||||
C("fO"),
|
||||
C("Fo"),
|
||||
C("fo")
|
||||
);
|
||||
test_exhausted!(
|
||||
pfx_exhausted5,
|
||||
prefixes,
|
||||
"(?:ab){100}",
|
||||
C("abababababababababab")
|
||||
);
|
||||
test_exhausted!(
|
||||
pfx_exhausted6,
|
||||
prefixes,
|
||||
"(?:(?:ab){100})*cd",
|
||||
C("ababababab"),
|
||||
M("cd")
|
||||
);
|
||||
test_exhausted!(
|
||||
pfx_exhausted7,
|
||||
prefixes,
|
||||
"z(?:(?:ab){100})*cd",
|
||||
C("zababababab"),
|
||||
M("zcd")
|
||||
);
|
||||
test_exhausted!(
|
||||
pfx_exhausted8,
|
||||
prefixes,
|
||||
"aaaaaaaaaaaaaaaaaaaaz",
|
||||
C("aaaaaaaaaaaaaaaaaaaa")
|
||||
);
|
||||
|
||||
// ************************************************************************
|
||||
// Tests for suffix literal extraction.
|
||||
@@ -1289,18 +1347,36 @@ mod tests {
|
||||
test_lit!(sfx_one_lit2, suffixes, "abc", M("abc"));
|
||||
test_lit!(sfx_one_lit3, suffixes, "(?u)☃", M("\\xe2\\x98\\x83"));
|
||||
test_lit!(sfx_one_lit4, suffixes, "(?ui)☃", M("\\xe2\\x98\\x83"));
|
||||
test_lit!(sfx_class1, suffixes, "[1-4]",
|
||||
M("1"), M("2"), M("3"), M("4"));
|
||||
test_lit!(sfx_class2, suffixes, "(?u)[☃Ⅰ]",
|
||||
M("\\xe2\\x85\\xa0"), M("\\xe2\\x98\\x83"));
|
||||
test_lit!(sfx_class3, suffixes, "(?ui)[☃Ⅰ]",
|
||||
M("\\xe2\\x85\\xa0"), M("\\xe2\\x85\\xb0"),
|
||||
M("\\xe2\\x98\\x83"));
|
||||
test_lit!(sfx_one_lit_casei1, suffixes, "(?i)a",
|
||||
M("A"), M("a"));
|
||||
test_lit!(sfx_one_lit_casei2, suffixes, "(?i)abc",
|
||||
M("ABC"), M("ABc"), M("AbC"), M("Abc"),
|
||||
M("aBC"), M("aBc"), M("abC"), M("abc"));
|
||||
test_lit!(sfx_class1, suffixes, "[1-4]", M("1"), M("2"), M("3"), M("4"));
|
||||
test_lit!(
|
||||
sfx_class2,
|
||||
suffixes,
|
||||
"(?u)[☃Ⅰ]",
|
||||
M("\\xe2\\x85\\xa0"),
|
||||
M("\\xe2\\x98\\x83")
|
||||
);
|
||||
test_lit!(
|
||||
sfx_class3,
|
||||
suffixes,
|
||||
"(?ui)[☃Ⅰ]",
|
||||
M("\\xe2\\x85\\xa0"),
|
||||
M("\\xe2\\x85\\xb0"),
|
||||
M("\\xe2\\x98\\x83")
|
||||
);
|
||||
test_lit!(sfx_one_lit_casei1, suffixes, "(?i)a", M("A"), M("a"));
|
||||
test_lit!(
|
||||
sfx_one_lit_casei2,
|
||||
suffixes,
|
||||
"(?i)abc",
|
||||
M("ABC"),
|
||||
M("ABc"),
|
||||
M("AbC"),
|
||||
M("Abc"),
|
||||
M("aBC"),
|
||||
M("aBc"),
|
||||
M("abC"),
|
||||
M("abc")
|
||||
);
|
||||
test_lit!(sfx_group1, suffixes, "(a)", M("a"));
|
||||
test_lit!(sfx_rep_zero_or_one1, suffixes, "a?");
|
||||
test_lit!(sfx_rep_zero_or_one2, suffixes, "(?:abc)?");
|
||||
@@ -1320,11 +1396,28 @@ mod tests {
|
||||
// Test regexes with concatenations.
|
||||
test_lit!(sfx_cat1, suffixes, "(?:a)(?:b)", M("ab"));
|
||||
test_lit!(sfx_cat2, suffixes, "[ab]z", M("az"), M("bz"));
|
||||
test_lit!(sfx_cat3, suffixes, "(?i)[ab]z",
|
||||
M("AZ"), M("Az"), M("BZ"), M("Bz"),
|
||||
M("aZ"), M("az"), M("bZ"), M("bz"));
|
||||
test_lit!(sfx_cat4, suffixes, "[ab][yz]",
|
||||
M("ay"), M("az"), M("by"), M("bz"));
|
||||
test_lit!(
|
||||
sfx_cat3,
|
||||
suffixes,
|
||||
"(?i)[ab]z",
|
||||
M("AZ"),
|
||||
M("Az"),
|
||||
M("BZ"),
|
||||
M("Bz"),
|
||||
M("aZ"),
|
||||
M("az"),
|
||||
M("bZ"),
|
||||
M("bz")
|
||||
);
|
||||
test_lit!(
|
||||
sfx_cat4,
|
||||
suffixes,
|
||||
"[ab][yz]",
|
||||
M("ay"),
|
||||
M("az"),
|
||||
M("by"),
|
||||
M("bz")
|
||||
);
|
||||
test_lit!(sfx_cat5, suffixes, "a*b", C("ab"), M("b"));
|
||||
test_lit!(sfx_cat6, suffixes, "a*b*c", C("bc"), C("ac"), M("c"));
|
||||
test_lit!(sfx_cat7, suffixes, "a*b*c+", C("c"));
|
||||
@@ -1348,8 +1441,17 @@ mod tests {
|
||||
test_lit!(sfx_alt4, suffixes, "a|b*");
|
||||
test_lit!(sfx_alt5, suffixes, "a|b+", M("a"), C("b"));
|
||||
test_lit!(sfx_alt6, suffixes, "a|(?:b|c*)");
|
||||
test_lit!(sfx_alt7, suffixes, "(a|b)*c|(a|ab)*c",
|
||||
C("ac"), C("bc"), M("c"), C("ac"), C("abc"), M("c"));
|
||||
test_lit!(
|
||||
sfx_alt7,
|
||||
suffixes,
|
||||
"(a|b)*c|(a|ab)*c",
|
||||
C("ac"),
|
||||
C("bc"),
|
||||
M("c"),
|
||||
C("ac"),
|
||||
C("abc"),
|
||||
M("c")
|
||||
);
|
||||
test_lit!(sfx_alt8, suffixes, "a*b|c", C("ab"), M("b"), M("c"));
|
||||
|
||||
// Test regexes with empty assertions.
|
||||
@@ -1385,16 +1487,41 @@ mod tests {
|
||||
test_exhausted!(sfx_exhausted1, suffixes, "[a-z]");
|
||||
test_exhausted!(sfx_exhausted2, suffixes, "A[a-z]*");
|
||||
test_exhausted!(sfx_exhausted3, suffixes, "A[a-z]Z", C("Z"));
|
||||
test_exhausted!(sfx_exhausted4, suffixes, "(?i)foobar",
|
||||
C("AR"), C("Ar"), C("aR"), C("ar"));
|
||||
test_exhausted!(sfx_exhausted5, suffixes, "(?:ab){100}",
|
||||
C("abababababababababab"));
|
||||
test_exhausted!(sfx_exhausted6, suffixes, "cd(?:(?:ab){100})*",
|
||||
C("ababababab"), M("cd"));
|
||||
test_exhausted!(sfx_exhausted7, suffixes, "cd(?:(?:ab){100})*z",
|
||||
C("abababababz"), M("cdz"));
|
||||
test_exhausted!(sfx_exhausted8, suffixes, "zaaaaaaaaaaaaaaaaaaaa",
|
||||
C("aaaaaaaaaaaaaaaaaaaa"));
|
||||
test_exhausted!(
|
||||
sfx_exhausted4,
|
||||
suffixes,
|
||||
"(?i)foobar",
|
||||
C("AR"),
|
||||
C("Ar"),
|
||||
C("aR"),
|
||||
C("ar")
|
||||
);
|
||||
test_exhausted!(
|
||||
sfx_exhausted5,
|
||||
suffixes,
|
||||
"(?:ab){100}",
|
||||
C("abababababababababab")
|
||||
);
|
||||
test_exhausted!(
|
||||
sfx_exhausted6,
|
||||
suffixes,
|
||||
"cd(?:(?:ab){100})*",
|
||||
C("ababababab"),
|
||||
M("cd")
|
||||
);
|
||||
test_exhausted!(
|
||||
sfx_exhausted7,
|
||||
suffixes,
|
||||
"cd(?:(?:ab){100})*z",
|
||||
C("abababababz"),
|
||||
M("cdz")
|
||||
);
|
||||
test_exhausted!(
|
||||
sfx_exhausted8,
|
||||
suffixes,
|
||||
"zaaaaaaaaaaaaaaaaaaaa",
|
||||
C("aaaaaaaaaaaaaaaaaaaa")
|
||||
);
|
||||
|
||||
// ************************************************************************
|
||||
// Tests for generating unambiguous literal sets.
|
||||
@@ -1404,8 +1531,7 @@ mod tests {
|
||||
($name:ident, $given:expr, $expected:expr) => {
|
||||
#[test]
|
||||
fn $name() {
|
||||
let given: Vec<Literal> =
|
||||
$given
|
||||
let given: Vec<Literal> = $given
|
||||
.into_iter()
|
||||
.map(|ul| {
|
||||
let cut = ul.is_cut();
|
||||
@@ -1420,35 +1546,52 @@ mod tests {
|
||||
}
|
||||
|
||||
test_unamb!(unambiguous1, vec![M("z"), M("azb")], vec![C("a"), C("z")]);
|
||||
test_unamb!(unambiguous2,
|
||||
vec![M("zaaaaaa"), M("aa")], vec![C("aa"), C("z")]);
|
||||
test_unamb!(unambiguous3,
|
||||
vec![M("Sherlock"), M("Watson")],
|
||||
vec![M("Sherlock"), M("Watson")]);
|
||||
test_unamb!(
|
||||
unambiguous2,
|
||||
vec![M("zaaaaaa"), M("aa")],
|
||||
vec![C("aa"), C("z")]
|
||||
);
|
||||
test_unamb!(
|
||||
unambiguous3,
|
||||
vec![M("Sherlock"), M("Watson")],
|
||||
vec![M("Sherlock"), M("Watson")]
|
||||
);
|
||||
test_unamb!(unambiguous4, vec![M("abc"), M("bc")], vec![C("a"), C("bc")]);
|
||||
test_unamb!(unambiguous5, vec![M("bc"), M("abc")], vec![C("a"), C("bc")]);
|
||||
test_unamb!(unambiguous6, vec![M("a"), M("aa")], vec![C("a")]);
|
||||
test_unamb!(unambiguous7, vec![M("aa"), M("a")], vec![C("a")]);
|
||||
test_unamb!(unambiguous8, vec![M("ab"), M("a")], vec![C("a")]);
|
||||
test_unamb!(unambiguous9,
|
||||
vec![M("ac"), M("bc"), M("c"), M("ac"), M("abc"), M("c")],
|
||||
vec![C("a"), C("b"), C("c")]);
|
||||
test_unamb!(unambiguous10,
|
||||
vec![M("Mo'"), M("Mu'"), M("Mo"), M("Mu")],
|
||||
vec![C("Mo"), C("Mu")]);
|
||||
test_unamb!(unambiguous11,
|
||||
vec![M("zazb"), M("azb")], vec![C("a"), C("z")]);
|
||||
test_unamb!(
|
||||
unambiguous9,
|
||||
vec![M("ac"), M("bc"), M("c"), M("ac"), M("abc"), M("c")],
|
||||
vec![C("a"), C("b"), C("c")]
|
||||
);
|
||||
test_unamb!(
|
||||
unambiguous10,
|
||||
vec![M("Mo'"), M("Mu'"), M("Mo"), M("Mu")],
|
||||
vec![C("Mo"), C("Mu")]
|
||||
);
|
||||
test_unamb!(
|
||||
unambiguous11,
|
||||
vec![M("zazb"), M("azb")],
|
||||
vec![C("a"), C("z")]
|
||||
);
|
||||
test_unamb!(unambiguous12, vec![M("foo"), C("foo")], vec![C("foo")]);
|
||||
test_unamb!(unambiguous13,
|
||||
vec![M("ABCX"), M("CDAX"), M("BCX")],
|
||||
vec![C("A"), C("BCX"), C("CD")]);
|
||||
test_unamb!(unambiguous14,
|
||||
vec![M("IMGX"), M("MVIX"), M("MGX"), M("DSX")],
|
||||
vec![M("DSX"), C("I"), C("MGX"), C("MV")]);
|
||||
test_unamb!(unambiguous15,
|
||||
vec![M("IMG_"), M("MG_"), M("CIMG")],
|
||||
vec![C("C"), C("I"), C("MG_")]);
|
||||
|
||||
test_unamb!(
|
||||
unambiguous13,
|
||||
vec![M("ABCX"), M("CDAX"), M("BCX")],
|
||||
vec![C("A"), C("BCX"), C("CD")]
|
||||
);
|
||||
test_unamb!(
|
||||
unambiguous14,
|
||||
vec![M("IMGX"), M("MVIX"), M("MGX"), M("DSX")],
|
||||
vec![M("DSX"), C("I"), C("MGX"), C("MV")]
|
||||
);
|
||||
test_unamb!(
|
||||
unambiguous15,
|
||||
vec![M("IMG_"), M("MG_"), M("CIMG")],
|
||||
vec![C("C"), C("I"), C("MG_")]
|
||||
);
|
||||
|
||||
// ************************************************************************
|
||||
// Tests for suffix trimming.
|
||||
@@ -1457,8 +1600,7 @@ mod tests {
|
||||
($name:ident, $trim:expr, $given:expr, $expected:expr) => {
|
||||
#[test]
|
||||
fn $name() {
|
||||
let given: Vec<Literal> =
|
||||
$given
|
||||
let given: Vec<Literal> = $given
|
||||
.into_iter()
|
||||
.map(|ul| {
|
||||
let cut = ul.is_cut();
|
||||
@@ -1469,7 +1611,7 @@ mod tests {
|
||||
let got = lits.trim_suffix($trim).unwrap();
|
||||
assert_eq!($expected, escape_lits(got.literals()));
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
test_trim!(trim1, 1, vec![M("ab"), M("yz")], vec![C("a"), C("y")]);
|
||||
@@ -1485,8 +1627,7 @@ mod tests {
|
||||
($name:ident, $given:expr, $expected:expr) => {
|
||||
#[test]
|
||||
fn $name() {
|
||||
let given: Vec<Literal> =
|
||||
$given
|
||||
let given: Vec<Literal> = $given
|
||||
.into_iter()
|
||||
.map(|s: &str| Literal {
|
||||
v: s.to_owned().into_bytes(),
|
||||
@@ -1521,8 +1662,7 @@ mod tests {
|
||||
($name:ident, $given:expr, $expected:expr) => {
|
||||
#[test]
|
||||
fn $name() {
|
||||
let given: Vec<Literal> =
|
||||
$given
|
||||
let given: Vec<Literal> = $given
|
||||
.into_iter()
|
||||
.map(|s: &str| Literal {
|
||||
v: s.to_owned().into_bytes(),
|
||||
|
||||
+143
-132
@@ -21,7 +21,7 @@ use ast::Span;
|
||||
use hir::interval::{Interval, IntervalSet, IntervalSetIter};
|
||||
use unicode;
|
||||
|
||||
pub use hir::visitor::{Visitor, visit};
|
||||
pub use hir::visitor::{visit, Visitor};
|
||||
|
||||
mod interval;
|
||||
pub mod literal;
|
||||
@@ -229,10 +229,7 @@ impl Hir {
|
||||
info.set_match_empty(true);
|
||||
info.set_literal(true);
|
||||
info.set_alternation_literal(true);
|
||||
Hir {
|
||||
kind: HirKind::Empty,
|
||||
info: info,
|
||||
}
|
||||
Hir { kind: HirKind::Empty, info: info }
|
||||
}
|
||||
|
||||
/// Creates a literal HIR expression.
|
||||
@@ -257,10 +254,7 @@ impl Hir {
|
||||
info.set_match_empty(false);
|
||||
info.set_literal(true);
|
||||
info.set_alternation_literal(true);
|
||||
Hir {
|
||||
kind: HirKind::Literal(lit),
|
||||
info: info,
|
||||
}
|
||||
Hir { kind: HirKind::Literal(lit), info: info }
|
||||
}
|
||||
|
||||
/// Creates a class HIR expression.
|
||||
@@ -277,10 +271,7 @@ impl Hir {
|
||||
info.set_match_empty(false);
|
||||
info.set_literal(false);
|
||||
info.set_alternation_literal(false);
|
||||
Hir {
|
||||
kind: HirKind::Class(class),
|
||||
info: info,
|
||||
}
|
||||
Hir { kind: HirKind::Class(class), info: info }
|
||||
}
|
||||
|
||||
/// Creates an anchor assertion HIR expression.
|
||||
@@ -313,10 +304,7 @@ impl Hir {
|
||||
if let Anchor::EndLine = anchor {
|
||||
info.set_line_anchored_end(true);
|
||||
}
|
||||
Hir {
|
||||
kind: HirKind::Anchor(anchor),
|
||||
info: info,
|
||||
}
|
||||
Hir { kind: HirKind::Anchor(anchor), info: info }
|
||||
}
|
||||
|
||||
/// Creates a word boundary assertion HIR expression.
|
||||
@@ -339,10 +327,7 @@ impl Hir {
|
||||
if let WordBoundary::AsciiNegate = word_boundary {
|
||||
info.set_always_utf8(false);
|
||||
}
|
||||
Hir {
|
||||
kind: HirKind::WordBoundary(word_boundary),
|
||||
info: info,
|
||||
}
|
||||
Hir { kind: HirKind::WordBoundary(word_boundary), info: info }
|
||||
}
|
||||
|
||||
/// Creates a repetition HIR expression.
|
||||
@@ -353,26 +338,23 @@ impl Hir {
|
||||
// If this operator can match the empty string, then it can never
|
||||
// be anchored.
|
||||
info.set_anchored_start(
|
||||
!rep.is_match_empty() && rep.hir.is_anchored_start()
|
||||
!rep.is_match_empty() && rep.hir.is_anchored_start(),
|
||||
);
|
||||
info.set_anchored_end(
|
||||
!rep.is_match_empty() && rep.hir.is_anchored_end()
|
||||
!rep.is_match_empty() && rep.hir.is_anchored_end(),
|
||||
);
|
||||
info.set_line_anchored_start(
|
||||
!rep.is_match_empty() && rep.hir.is_anchored_start()
|
||||
!rep.is_match_empty() && rep.hir.is_anchored_start(),
|
||||
);
|
||||
info.set_line_anchored_end(
|
||||
!rep.is_match_empty() && rep.hir.is_anchored_end()
|
||||
!rep.is_match_empty() && rep.hir.is_anchored_end(),
|
||||
);
|
||||
info.set_any_anchored_start(rep.hir.is_any_anchored_start());
|
||||
info.set_any_anchored_end(rep.hir.is_any_anchored_end());
|
||||
info.set_match_empty(rep.is_match_empty() || rep.hir.is_match_empty());
|
||||
info.set_literal(false);
|
||||
info.set_alternation_literal(false);
|
||||
Hir {
|
||||
kind: HirKind::Repetition(rep),
|
||||
info: info,
|
||||
}
|
||||
Hir { kind: HirKind::Repetition(rep), info: info }
|
||||
}
|
||||
|
||||
/// Creates a group HIR expression.
|
||||
@@ -389,10 +371,7 @@ impl Hir {
|
||||
info.set_match_empty(group.hir.is_match_empty());
|
||||
info.set_literal(false);
|
||||
info.set_alternation_literal(false);
|
||||
Hir {
|
||||
kind: HirKind::Group(group),
|
||||
info: info,
|
||||
}
|
||||
Hir { kind: HirKind::Group(group), info: info }
|
||||
}
|
||||
|
||||
/// Returns the concatenation of the given expressions.
|
||||
@@ -401,7 +380,7 @@ impl Hir {
|
||||
pub fn concat(mut exprs: Vec<Hir>) -> Hir {
|
||||
match exprs.len() {
|
||||
0 => Hir::empty(),
|
||||
1 => { exprs.pop().unwrap() }
|
||||
1 => exprs.pop().unwrap(),
|
||||
_ => {
|
||||
let mut info = HirInfo::new();
|
||||
info.set_always_utf8(true);
|
||||
@@ -420,14 +399,12 @@ impl Hir {
|
||||
let x = info.is_all_assertions() && e.is_all_assertions();
|
||||
info.set_all_assertions(x);
|
||||
|
||||
let x =
|
||||
info.is_any_anchored_start()
|
||||
let x = info.is_any_anchored_start()
|
||||
|| e.is_any_anchored_start();
|
||||
info.set_any_anchored_start(x);
|
||||
|
||||
let x =
|
||||
info.is_any_anchored_end()
|
||||
|| e.is_any_anchored_end();
|
||||
info.is_any_anchored_end() || e.is_any_anchored_end();
|
||||
info.set_any_anchored_end(x);
|
||||
|
||||
let x = info.is_match_empty() && e.is_match_empty();
|
||||
@@ -436,8 +413,7 @@ impl Hir {
|
||||
let x = info.is_literal() && e.is_literal();
|
||||
info.set_literal(x);
|
||||
|
||||
let x =
|
||||
info.is_alternation_literal()
|
||||
let x = info.is_alternation_literal()
|
||||
&& e.is_alternation_literal();
|
||||
info.set_alternation_literal(x);
|
||||
}
|
||||
@@ -451,45 +427,42 @@ impl Hir {
|
||||
// is actually one that is either not an assertion or is
|
||||
// specifically the StartText assertion.
|
||||
info.set_anchored_start(
|
||||
exprs.iter()
|
||||
exprs
|
||||
.iter()
|
||||
.take_while(|e| {
|
||||
e.is_anchored_start() || e.is_all_assertions()
|
||||
})
|
||||
.any(|e| {
|
||||
e.is_anchored_start()
|
||||
}));
|
||||
.any(|e| e.is_anchored_start()),
|
||||
);
|
||||
// Similarly for the end anchor, but in reverse.
|
||||
info.set_anchored_end(
|
||||
exprs.iter()
|
||||
exprs
|
||||
.iter()
|
||||
.rev()
|
||||
.take_while(|e| {
|
||||
e.is_anchored_end() || e.is_all_assertions()
|
||||
})
|
||||
.any(|e| {
|
||||
e.is_anchored_end()
|
||||
}));
|
||||
.any(|e| e.is_anchored_end()),
|
||||
);
|
||||
// Repeat the process for line anchors.
|
||||
info.set_line_anchored_start(
|
||||
exprs.iter()
|
||||
exprs
|
||||
.iter()
|
||||
.take_while(|e| {
|
||||
e.is_line_anchored_start() || e.is_all_assertions()
|
||||
})
|
||||
.any(|e| {
|
||||
e.is_line_anchored_start()
|
||||
}));
|
||||
.any(|e| e.is_line_anchored_start()),
|
||||
);
|
||||
info.set_line_anchored_end(
|
||||
exprs.iter()
|
||||
exprs
|
||||
.iter()
|
||||
.rev()
|
||||
.take_while(|e| {
|
||||
e.is_line_anchored_end() || e.is_all_assertions()
|
||||
})
|
||||
.any(|e| {
|
||||
e.is_line_anchored_end()
|
||||
}));
|
||||
Hir {
|
||||
kind: HirKind::Concat(exprs),
|
||||
info: info,
|
||||
}
|
||||
.any(|e| e.is_line_anchored_end()),
|
||||
);
|
||||
Hir { kind: HirKind::Concat(exprs), info: info }
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -537,28 +510,21 @@ impl Hir {
|
||||
&& e.is_line_anchored_end();
|
||||
info.set_line_anchored_end(x);
|
||||
|
||||
let x =
|
||||
info.is_any_anchored_start()
|
||||
let x = info.is_any_anchored_start()
|
||||
|| e.is_any_anchored_start();
|
||||
info.set_any_anchored_start(x);
|
||||
|
||||
let x =
|
||||
info.is_any_anchored_end()
|
||||
|| e.is_any_anchored_end();
|
||||
info.is_any_anchored_end() || e.is_any_anchored_end();
|
||||
info.set_any_anchored_end(x);
|
||||
|
||||
let x = info.is_match_empty() || e.is_match_empty();
|
||||
info.set_match_empty(x);
|
||||
|
||||
let x =
|
||||
info.is_alternation_literal()
|
||||
&& e.is_literal();
|
||||
let x = info.is_alternation_literal() && e.is_literal();
|
||||
info.set_alternation_literal(x);
|
||||
}
|
||||
Hir {
|
||||
kind: HirKind::Alternation(exprs),
|
||||
info: info,
|
||||
}
|
||||
Hir { kind: HirKind::Alternation(exprs), info: info }
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -860,7 +826,8 @@ impl ClassUnicode {
|
||||
/// The given ranges do not need to be in any specific order, and ranges
|
||||
/// may overlap.
|
||||
pub fn new<I>(ranges: I) -> ClassUnicode
|
||||
where I: IntoIterator<Item=ClassUnicodeRange>
|
||||
where
|
||||
I: IntoIterator<Item = ClassUnicodeRange>,
|
||||
{
|
||||
ClassUnicode { set: IntervalSet::new(ranges) }
|
||||
}
|
||||
@@ -958,32 +925,43 @@ pub struct ClassUnicodeRange {
|
||||
|
||||
impl fmt::Debug for ClassUnicodeRange {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
let start =
|
||||
if !self.start.is_whitespace() && !self.start.is_control() {
|
||||
self.start.to_string()
|
||||
} else {
|
||||
format!("0x{:X}", self.start as u32)
|
||||
};
|
||||
let end =
|
||||
if !self.end.is_whitespace() && !self.end.is_control() {
|
||||
self.end.to_string()
|
||||
} else {
|
||||
format!("0x{:X}", self.end as u32)
|
||||
};
|
||||
let start = if !self.start.is_whitespace() && !self.start.is_control()
|
||||
{
|
||||
self.start.to_string()
|
||||
} else {
|
||||
format!("0x{:X}", self.start as u32)
|
||||
};
|
||||
let end = if !self.end.is_whitespace() && !self.end.is_control() {
|
||||
self.end.to_string()
|
||||
} else {
|
||||
format!("0x{:X}", self.end as u32)
|
||||
};
|
||||
f.debug_struct("ClassUnicodeRange")
|
||||
.field("start", &start)
|
||||
.field("end", &end)
|
||||
.finish()
|
||||
.field("start", &start)
|
||||
.field("end", &end)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
impl Interval for ClassUnicodeRange {
|
||||
type Bound = char;
|
||||
|
||||
#[inline] fn lower(&self) -> char { self.start }
|
||||
#[inline] fn upper(&self) -> char { self.end }
|
||||
#[inline] fn set_lower(&mut self, bound: char) { self.start = bound; }
|
||||
#[inline] fn set_upper(&mut self, bound: char) { self.end = bound; }
|
||||
#[inline]
|
||||
fn lower(&self) -> char {
|
||||
self.start
|
||||
}
|
||||
#[inline]
|
||||
fn upper(&self) -> char {
|
||||
self.end
|
||||
}
|
||||
#[inline]
|
||||
fn set_lower(&mut self, bound: char) {
|
||||
self.start = bound;
|
||||
}
|
||||
#[inline]
|
||||
fn set_upper(&mut self, bound: char) {
|
||||
self.end = bound;
|
||||
}
|
||||
|
||||
/// Apply simple case folding to this Unicode scalar value range.
|
||||
///
|
||||
@@ -1053,7 +1031,8 @@ impl ClassBytes {
|
||||
/// The given ranges do not need to be in any specific order, and ranges
|
||||
/// may overlap.
|
||||
pub fn new<I>(ranges: I) -> ClassBytes
|
||||
where I: IntoIterator<Item=ClassBytesRange>
|
||||
where
|
||||
I: IntoIterator<Item = ClassBytesRange>,
|
||||
{
|
||||
ClassBytes { set: IntervalSet::new(ranges) }
|
||||
}
|
||||
@@ -1160,10 +1139,22 @@ pub struct ClassBytesRange {
|
||||
impl Interval for ClassBytesRange {
|
||||
type Bound = u8;
|
||||
|
||||
#[inline] fn lower(&self) -> u8 { self.start }
|
||||
#[inline] fn upper(&self) -> u8 { self.end }
|
||||
#[inline] fn set_lower(&mut self, bound: u8) { self.start = bound; }
|
||||
#[inline] fn set_upper(&mut self, bound: u8) { self.end = bound; }
|
||||
#[inline]
|
||||
fn lower(&self) -> u8 {
|
||||
self.start
|
||||
}
|
||||
#[inline]
|
||||
fn upper(&self) -> u8 {
|
||||
self.end
|
||||
}
|
||||
#[inline]
|
||||
fn set_lower(&mut self, bound: u8) {
|
||||
self.start = bound;
|
||||
}
|
||||
#[inline]
|
||||
fn set_upper(&mut self, bound: u8) {
|
||||
self.end = bound;
|
||||
}
|
||||
|
||||
/// Apply simple case folding to this byte range. Only ASCII case mappings
|
||||
/// (for a-z) are applied.
|
||||
@@ -1271,8 +1262,8 @@ impl WordBoundary {
|
||||
/// Returns true if and only if this word boundary assertion is negated.
|
||||
pub fn is_negated(&self) -> bool {
|
||||
match *self {
|
||||
WordBoundary::Unicode | WordBoundary::Ascii => false,
|
||||
WordBoundary::UnicodeNegate | WordBoundary::AsciiNegate => true,
|
||||
WordBoundary::Unicode | WordBoundary::Ascii => false,
|
||||
WordBoundary::UnicodeNegate | WordBoundary::AsciiNegate => true,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1454,9 +1445,7 @@ macro_rules! define_bool {
|
||||
|
||||
impl HirInfo {
|
||||
fn new() -> HirInfo {
|
||||
HirInfo {
|
||||
bools: 0,
|
||||
}
|
||||
HirInfo { bools: 0 }
|
||||
}
|
||||
|
||||
define_bool!(0, is_always_utf8, set_always_utf8);
|
||||
@@ -1485,10 +1474,8 @@ mod tests {
|
||||
}
|
||||
|
||||
fn bclass(ranges: &[(u8, u8)]) -> ClassBytes {
|
||||
let ranges: Vec<ClassBytesRange> = ranges
|
||||
.iter()
|
||||
.map(|&(s, e)| ClassBytesRange::new(s, e))
|
||||
.collect();
|
||||
let ranges: Vec<ClassBytesRange> =
|
||||
ranges.iter().map(|&(s, e)| ClassBytesRange::new(s, e)).collect();
|
||||
ClassBytes::new(ranges)
|
||||
}
|
||||
|
||||
@@ -1520,7 +1507,10 @@ mod tests {
|
||||
cls_
|
||||
}
|
||||
|
||||
fn usymdifference(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
|
||||
fn usymdifference(
|
||||
cls1: &ClassUnicode,
|
||||
cls2: &ClassUnicode,
|
||||
) -> ClassUnicode {
|
||||
let mut cls_ = cls1.clone();
|
||||
cls_.symmetric_difference(cls2);
|
||||
cls_
|
||||
@@ -1601,8 +1591,12 @@ mod tests {
|
||||
assert_eq!(expected, uranges(&cls));
|
||||
|
||||
let cls = uclass(&[
|
||||
('c', 'f'), ('a', 'g'), ('d', 'j'), ('a', 'c'),
|
||||
('m', 'p'), ('l', 's'),
|
||||
('c', 'f'),
|
||||
('a', 'g'),
|
||||
('d', 'j'),
|
||||
('a', 'c'),
|
||||
('m', 'p'),
|
||||
('l', 's'),
|
||||
]);
|
||||
let expected = vec![('a', 'j'), ('l', 's')];
|
||||
assert_eq!(expected, uranges(&cls));
|
||||
@@ -1615,7 +1609,6 @@ mod tests {
|
||||
let expected = vec![('\x00', '\u{10FFFF}')];
|
||||
assert_eq!(expected, uranges(&cls));
|
||||
|
||||
|
||||
let cls = uclass(&[('a', 'a'), ('b', 'b')]);
|
||||
let expected = vec![('a', 'b')];
|
||||
assert_eq!(expected, uranges(&cls));
|
||||
@@ -1636,8 +1629,12 @@ mod tests {
|
||||
assert_eq!(expected, branges(&cls));
|
||||
|
||||
let cls = bclass(&[
|
||||
(b'c', b'f'), (b'a', b'g'), (b'd', b'j'), (b'a', b'c'),
|
||||
(b'm', b'p'), (b'l', b's'),
|
||||
(b'c', b'f'),
|
||||
(b'a', b'g'),
|
||||
(b'd', b'j'),
|
||||
(b'a', b'c'),
|
||||
(b'm', b'p'),
|
||||
(b'l', b's'),
|
||||
]);
|
||||
let expected = vec![(b'a', b'j'), (b'l', b's')];
|
||||
assert_eq!(expected, branges(&cls));
|
||||
@@ -1658,19 +1655,27 @@ mod tests {
|
||||
#[test]
|
||||
fn class_case_fold_unicode() {
|
||||
let cls = uclass(&[
|
||||
('C', 'F'), ('A', 'G'), ('D', 'J'), ('A', 'C'),
|
||||
('M', 'P'), ('L', 'S'), ('c', 'f'),
|
||||
('C', 'F'),
|
||||
('A', 'G'),
|
||||
('D', 'J'),
|
||||
('A', 'C'),
|
||||
('M', 'P'),
|
||||
('L', 'S'),
|
||||
('c', 'f'),
|
||||
]);
|
||||
let expected = uclass(&[
|
||||
('A', 'J'), ('L', 'S'),
|
||||
('a', 'j'), ('l', 's'),
|
||||
('A', 'J'),
|
||||
('L', 'S'),
|
||||
('a', 'j'),
|
||||
('l', 's'),
|
||||
('\u{17F}', '\u{17F}'),
|
||||
]);
|
||||
assert_eq!(expected, ucasefold(&cls));
|
||||
|
||||
let cls = uclass(&[('A', 'Z')]);
|
||||
let expected = uclass(&[
|
||||
('A', 'Z'), ('a', 'z'),
|
||||
('A', 'Z'),
|
||||
('a', 'z'),
|
||||
('\u{17F}', '\u{17F}'),
|
||||
('\u{212A}', '\u{212A}'),
|
||||
]);
|
||||
@@ -1678,7 +1683,8 @@ mod tests {
|
||||
|
||||
let cls = uclass(&[('a', 'z')]);
|
||||
let expected = uclass(&[
|
||||
('A', 'Z'), ('a', 'z'),
|
||||
('A', 'Z'),
|
||||
('a', 'z'),
|
||||
('\u{17F}', '\u{17F}'),
|
||||
('\u{212A}', '\u{212A}'),
|
||||
]);
|
||||
@@ -1696,9 +1702,8 @@ mod tests {
|
||||
assert_eq!(cls, ucasefold(&cls));
|
||||
|
||||
let cls = uclass(&[('k', 'k')]);
|
||||
let expected = uclass(&[
|
||||
('K', 'K'), ('k', 'k'), ('\u{212A}', '\u{212A}'),
|
||||
]);
|
||||
let expected =
|
||||
uclass(&[('K', 'K'), ('k', 'k'), ('\u{212A}', '\u{212A}')]);
|
||||
assert_eq!(expected, ucasefold(&cls));
|
||||
|
||||
let cls = uclass(&[('@', '@')]);
|
||||
@@ -1708,13 +1713,16 @@ mod tests {
|
||||
#[test]
|
||||
fn class_case_fold_bytes() {
|
||||
let cls = bclass(&[
|
||||
(b'C', b'F'), (b'A', b'G'), (b'D', b'J'), (b'A', b'C'),
|
||||
(b'M', b'P'), (b'L', b'S'), (b'c', b'f'),
|
||||
]);
|
||||
let expected = bclass(&[
|
||||
(b'A', b'J'), (b'L', b'S'),
|
||||
(b'a', b'j'), (b'l', b's'),
|
||||
(b'C', b'F'),
|
||||
(b'A', b'G'),
|
||||
(b'D', b'J'),
|
||||
(b'A', b'C'),
|
||||
(b'M', b'P'),
|
||||
(b'L', b'S'),
|
||||
(b'c', b'f'),
|
||||
]);
|
||||
let expected =
|
||||
bclass(&[(b'A', b'J'), (b'L', b'S'), (b'a', b'j'), (b'l', b's')]);
|
||||
assert_eq!(expected, bcasefold(&cls));
|
||||
|
||||
let cls = bclass(&[(b'A', b'Z')]);
|
||||
@@ -1756,7 +1764,9 @@ mod tests {
|
||||
|
||||
let cls = uclass(&[('a', 'c'), ('x', 'z')]);
|
||||
let expected = uclass(&[
|
||||
('\x00', '\x60'), ('\x64', '\x77'), ('\x7B', '\u{10FFFF}'),
|
||||
('\x00', '\x60'),
|
||||
('\x64', '\x77'),
|
||||
('\x7B', '\u{10FFFF}'),
|
||||
]);
|
||||
assert_eq!(expected, unegate(&cls));
|
||||
|
||||
@@ -1776,9 +1786,8 @@ mod tests {
|
||||
let expected = uclass(&[('\x00', '\u{10FFFF}')]);
|
||||
assert_eq!(expected, unegate(&cls));
|
||||
|
||||
let cls = uclass(&[
|
||||
('\x00', '\u{10FFFD}'), ('\u{10FFFF}', '\u{10FFFF}'),
|
||||
]);
|
||||
let cls =
|
||||
uclass(&[('\x00', '\u{10FFFD}'), ('\u{10FFFF}', '\u{10FFFF}')]);
|
||||
let expected = uclass(&[('\u{10FFFE}', '\u{10FFFE}')]);
|
||||
assert_eq!(expected, unegate(&cls));
|
||||
|
||||
@@ -1811,7 +1820,9 @@ mod tests {
|
||||
|
||||
let cls = bclass(&[(b'a', b'c'), (b'x', b'z')]);
|
||||
let expected = bclass(&[
|
||||
(b'\x00', b'\x60'), (b'\x64', b'\x77'), (b'\x7B', b'\xFF'),
|
||||
(b'\x00', b'\x60'),
|
||||
(b'\x64', b'\x77'),
|
||||
(b'\x7B', b'\xFF'),
|
||||
]);
|
||||
assert_eq!(expected, bnegate(&cls));
|
||||
|
||||
@@ -2183,7 +2194,7 @@ mod tests {
|
||||
// We run our test on a thread with a small stack size so we can
|
||||
// force the issue more easily.
|
||||
thread::Builder::new()
|
||||
.stack_size(1<<10)
|
||||
.stack_size(1 << 10)
|
||||
.spawn(run)
|
||||
.unwrap()
|
||||
.join()
|
||||
|
||||
@@ -4,8 +4,8 @@ This module provides a regular expression printer for `Hir`.
|
||||
|
||||
use std::fmt;
|
||||
|
||||
use hir::{self, Hir, HirKind};
|
||||
use hir::visitor::{self, Visitor};
|
||||
use hir::{self, Hir, HirKind};
|
||||
use is_meta_character;
|
||||
|
||||
/// A builder for constructing a printer.
|
||||
@@ -25,15 +25,11 @@ impl Default for PrinterBuilder {
|
||||
|
||||
impl PrinterBuilder {
|
||||
fn new() -> PrinterBuilder {
|
||||
PrinterBuilder {
|
||||
_priv: (),
|
||||
}
|
||||
PrinterBuilder { _priv: () }
|
||||
}
|
||||
|
||||
fn build(&self) -> Printer {
|
||||
Printer {
|
||||
_priv: (),
|
||||
}
|
||||
Printer { _priv: () }
|
||||
}
|
||||
}
|
||||
|
||||
@@ -149,19 +145,17 @@ impl<'p, W: fmt::Write> Visitor for Writer<'p, W> {
|
||||
HirKind::WordBoundary(hir::WordBoundary::AsciiNegate) => {
|
||||
self.wtr.write_str(r"(?-u:\B)")?;
|
||||
}
|
||||
HirKind::Group(ref x) => {
|
||||
match x.kind {
|
||||
hir::GroupKind::CaptureIndex(_) => {
|
||||
self.wtr.write_str("(")?;
|
||||
}
|
||||
hir::GroupKind::CaptureName { ref name, .. } => {
|
||||
write!(self.wtr, "(?P<{}>", name)?;
|
||||
}
|
||||
hir::GroupKind::NonCapturing => {
|
||||
self.wtr.write_str("(?:")?;
|
||||
}
|
||||
HirKind::Group(ref x) => match x.kind {
|
||||
hir::GroupKind::CaptureIndex(_) => {
|
||||
self.wtr.write_str("(")?;
|
||||
}
|
||||
}
|
||||
hir::GroupKind::CaptureName { ref name, .. } => {
|
||||
write!(self.wtr, "(?P<{}>", name)?;
|
||||
}
|
||||
hir::GroupKind::NonCapturing => {
|
||||
self.wtr.write_str("(?:")?;
|
||||
}
|
||||
},
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -187,19 +181,17 @@ impl<'p, W: fmt::Write> Visitor for Writer<'p, W> {
|
||||
hir::RepetitionKind::OneOrMore => {
|
||||
self.wtr.write_str("+")?;
|
||||
}
|
||||
hir::RepetitionKind::Range(ref x) => {
|
||||
match *x {
|
||||
hir::RepetitionRange::Exactly(m) => {
|
||||
write!(self.wtr, "{{{}}}", m)?;
|
||||
}
|
||||
hir::RepetitionRange::AtLeast(m) => {
|
||||
write!(self.wtr, "{{{},}}", m)?;
|
||||
}
|
||||
hir::RepetitionRange::Bounded(m, n) => {
|
||||
write!(self.wtr, "{{{},{}}}", m, n)?;
|
||||
}
|
||||
hir::RepetitionKind::Range(ref x) => match *x {
|
||||
hir::RepetitionRange::Exactly(m) => {
|
||||
write!(self.wtr, "{{{}}}", m)?;
|
||||
}
|
||||
}
|
||||
hir::RepetitionRange::AtLeast(m) => {
|
||||
write!(self.wtr, "{{{},}}", m)?;
|
||||
}
|
||||
hir::RepetitionRange::Bounded(m, n) => {
|
||||
write!(self.wtr, "{{{},{}}}", m, n)?;
|
||||
}
|
||||
},
|
||||
}
|
||||
if !x.greedy {
|
||||
self.wtr.write_str("?")?;
|
||||
@@ -246,8 +238,8 @@ impl<'p, W: fmt::Write> Writer<'p, W> {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use ParserBuilder;
|
||||
use super::Printer;
|
||||
use ParserBuilder;
|
||||
|
||||
fn roundtrip(given: &str, expected: &str) {
|
||||
roundtrip_with(|b| b, given, expected);
|
||||
@@ -258,7 +250,8 @@ mod tests {
|
||||
}
|
||||
|
||||
fn roundtrip_with<F>(mut f: F, given: &str, expected: &str)
|
||||
where F: FnMut(&mut ParserBuilder) -> &mut ParserBuilder
|
||||
where
|
||||
F: FnMut(&mut ParserBuilder) -> &mut ParserBuilder,
|
||||
{
|
||||
let mut builder = ParserBuilder::new();
|
||||
f(&mut builder);
|
||||
|
||||
+864
-596
File diff suppressed because it is too large
Load Diff
@@ -140,7 +140,7 @@ impl<'a> HeapVisitor<'a> {
|
||||
// If this is a concat/alternate, then we might have additional
|
||||
// inductive steps to process.
|
||||
if let Some(x) = self.pop(frame) {
|
||||
if let Frame::Alternation {..} = x {
|
||||
if let Frame::Alternation { .. } = x {
|
||||
visitor.visit_alternation_in()?;
|
||||
}
|
||||
hir = x.child();
|
||||
@@ -162,17 +162,11 @@ impl<'a> HeapVisitor<'a> {
|
||||
HirKind::Group(ref x) => Some(Frame::Group(x)),
|
||||
HirKind::Concat(ref x) if x.is_empty() => None,
|
||||
HirKind::Concat(ref x) => {
|
||||
Some(Frame::Concat {
|
||||
head: &x[0],
|
||||
tail: &x[1..],
|
||||
})
|
||||
Some(Frame::Concat { head: &x[0], tail: &x[1..] })
|
||||
}
|
||||
HirKind::Alternation(ref x) if x.is_empty() => None,
|
||||
HirKind::Alternation(ref x) => {
|
||||
Some(Frame::Alternation {
|
||||
head: &x[0],
|
||||
tail: &x[1..],
|
||||
})
|
||||
Some(Frame::Alternation { head: &x[0], tail: &x[1..] })
|
||||
}
|
||||
_ => None,
|
||||
}
|
||||
@@ -188,10 +182,7 @@ impl<'a> HeapVisitor<'a> {
|
||||
if tail.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(Frame::Concat {
|
||||
head: &tail[0],
|
||||
tail: &tail[1..],
|
||||
})
|
||||
Some(Frame::Concat { head: &tail[0], tail: &tail[1..] })
|
||||
}
|
||||
}
|
||||
Frame::Alternation { tail, .. } => {
|
||||
|
||||
@@ -152,8 +152,8 @@ pub fn escape_into(text: &str, buf: &mut String) {
|
||||
/// `false` is fixed and won't change in a semver compatible release.
|
||||
pub fn is_meta_character(c: char) -> bool {
|
||||
match c {
|
||||
'\\' | '.' | '+' | '*' | '?' | '(' | ')' | '|' |
|
||||
'[' | ']' | '{' | '}' | '^' | '$' | '#' | '&' | '-' | '~' => true,
|
||||
'\\' | '.' | '+' | '*' | '?' | '(' | ')' | '|' | '[' | ']' | '{'
|
||||
| '}' | '^' | '$' | '#' | '&' | '-' | '~' => true,
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
@@ -183,7 +183,8 @@ pub fn is_word_character(c: char) -> bool {
|
||||
} else {
|
||||
Ordering::Less
|
||||
}
|
||||
}).is_ok()
|
||||
})
|
||||
.is_ok()
|
||||
}
|
||||
|
||||
/// Returns true if and only if the given character is an ASCII word character.
|
||||
@@ -192,7 +193,7 @@ pub fn is_word_character(c: char) -> bool {
|
||||
/// `[_0-9a-zA-Z]'.
|
||||
pub fn is_word_byte(c: u8) -> bool {
|
||||
match c {
|
||||
b'_' | b'0' ..= b'9' | b'a' ..= b'z' | b'A' ..= b'Z' => true,
|
||||
b'_' | b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' => true,
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
@@ -205,7 +206,8 @@ mod tests {
|
||||
fn escape_meta() {
|
||||
assert_eq!(
|
||||
escape(r"\.+*?()|[]{}^$#&-~"),
|
||||
r"\\\.\+\*\?\(\)\|\[\]\{\}\^\$\#\&\-\~".to_string());
|
||||
r"\\\.\+\*\?\(\)\|\[\]\{\}\^\$\#\&\-\~".to_string()
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -25,10 +25,7 @@ impl ParserBuilder {
|
||||
|
||||
/// Build a parser from this configuration with the given pattern.
|
||||
pub fn build(&self) -> Parser {
|
||||
Parser {
|
||||
ast: self.ast.build(),
|
||||
hir: self.hir.build(),
|
||||
}
|
||||
Parser { ast: self.ast.build(), hir: self.hir.build() }
|
||||
}
|
||||
|
||||
/// Set the nesting limit for this parser.
|
||||
@@ -132,10 +129,7 @@ impl ParserBuilder {
|
||||
///
|
||||
/// By default this is disabled. It may alternatively be selectively
|
||||
/// enabled in the regular expression itself via the `s` flag.
|
||||
pub fn dot_matches_new_line(
|
||||
&mut self,
|
||||
yes: bool,
|
||||
) -> &mut ParserBuilder {
|
||||
pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut ParserBuilder {
|
||||
self.hir.dot_matches_new_line(yes);
|
||||
self
|
||||
}
|
||||
|
||||
+30
-38
@@ -78,7 +78,8 @@ pub fn contains_simple_case_mapping(start: char, end: char) -> bool {
|
||||
} else {
|
||||
Ordering::Less
|
||||
}
|
||||
}).is_ok()
|
||||
})
|
||||
.is_ok()
|
||||
}
|
||||
|
||||
/// A query for finding a character class defined by Unicode. This supports
|
||||
@@ -148,13 +149,13 @@ impl<'a> ClassQuery<'a> {
|
||||
None => return Err(Error::PropertyValueNotFound),
|
||||
Some(vals) => vals,
|
||||
};
|
||||
let canon_val = match canonical_value(
|
||||
vals,
|
||||
&property_value,
|
||||
) {
|
||||
None => return Err(Error::PropertyValueNotFound),
|
||||
Some(canon_val) => canon_val,
|
||||
};
|
||||
let canon_val =
|
||||
match canonical_value(vals, &property_value) {
|
||||
None => {
|
||||
return Err(Error::PropertyValueNotFound)
|
||||
}
|
||||
Some(canon_val) => canon_val,
|
||||
};
|
||||
CanonicalClassQuery::ByValue {
|
||||
property_name: canon_name,
|
||||
property_value: canon_val,
|
||||
@@ -212,14 +213,10 @@ pub fn class<'a>(query: ClassQuery<'a>) -> Result<hir::ClassUnicode> {
|
||||
use self::CanonicalClassQuery::*;
|
||||
|
||||
match query.canonicalize()? {
|
||||
Binary(name) => {
|
||||
property_set(property_bool::BY_NAME, name)
|
||||
.map(hir_class)
|
||||
.ok_or(Error::PropertyNotFound)
|
||||
}
|
||||
GeneralCategory("Any") => {
|
||||
Ok(hir_class(&[('\0', '\u{10FFFF}')]))
|
||||
}
|
||||
Binary(name) => property_set(property_bool::BY_NAME, name)
|
||||
.map(hir_class)
|
||||
.ok_or(Error::PropertyNotFound),
|
||||
GeneralCategory("Any") => Ok(hir_class(&[('\0', '\u{10FFFF}')])),
|
||||
GeneralCategory("Assigned") => {
|
||||
let mut cls =
|
||||
property_set(general_category::BY_NAME, "Unassigned")
|
||||
@@ -228,19 +225,13 @@ pub fn class<'a>(query: ClassQuery<'a>) -> Result<hir::ClassUnicode> {
|
||||
cls.negate();
|
||||
Ok(cls)
|
||||
}
|
||||
GeneralCategory("ASCII") => {
|
||||
Ok(hir_class(&[('\0', '\x7F')]))
|
||||
}
|
||||
GeneralCategory(name) => {
|
||||
property_set(general_category::BY_NAME, name)
|
||||
.map(hir_class)
|
||||
.ok_or(Error::PropertyValueNotFound)
|
||||
}
|
||||
Script(name) => {
|
||||
property_set(script::BY_NAME, name)
|
||||
.map(hir_class)
|
||||
.ok_or(Error::PropertyValueNotFound)
|
||||
}
|
||||
GeneralCategory("ASCII") => Ok(hir_class(&[('\0', '\x7F')])),
|
||||
GeneralCategory(name) => property_set(general_category::BY_NAME, name)
|
||||
.map(hir_class)
|
||||
.ok_or(Error::PropertyValueNotFound),
|
||||
Script(name) => property_set(script::BY_NAME, name)
|
||||
.map(hir_class)
|
||||
.ok_or(Error::PropertyValueNotFound),
|
||||
ByValue { property_name: "Age", property_value } => {
|
||||
let mut class = hir::ClassUnicode::empty();
|
||||
for set in ages(property_value)? {
|
||||
@@ -253,11 +244,12 @@ pub fn class<'a>(query: ClassQuery<'a>) -> Result<hir::ClassUnicode> {
|
||||
.map(hir_class)
|
||||
.ok_or(Error::PropertyValueNotFound)
|
||||
}
|
||||
ByValue { property_name: "Grapheme_Cluster_Break", property_value } => {
|
||||
property_set(grapheme_cluster_break::BY_NAME, property_value)
|
||||
.map(hir_class)
|
||||
.ok_or(Error::PropertyValueNotFound)
|
||||
}
|
||||
ByValue {
|
||||
property_name: "Grapheme_Cluster_Break",
|
||||
property_value,
|
||||
} => property_set(grapheme_cluster_break::BY_NAME, property_value)
|
||||
.map(hir_class)
|
||||
.ok_or(Error::PropertyValueNotFound),
|
||||
ByValue { property_name: "Sentence_Break", property_value } => {
|
||||
property_set(sentence_break::BY_NAME, property_value)
|
||||
.map(hir_class)
|
||||
@@ -320,8 +312,7 @@ fn normalize(x: &str) -> String {
|
||||
|
||||
fn property_values(
|
||||
canonical_property_name: &'static str,
|
||||
) -> Option<PropertyValues>
|
||||
{
|
||||
) -> Option<PropertyValues> {
|
||||
ucd_util::property_values(PROPERTY_VALUES, canonical_property_name)
|
||||
}
|
||||
|
||||
@@ -373,7 +364,7 @@ fn ages(canonical_age: &str) -> Result<AgeIter> {
|
||||
let pos = AGES.iter().position(|&(age, _)| canonical_age == age);
|
||||
match pos {
|
||||
None => Err(Error::PropertyValueNotFound),
|
||||
Some(i) => Ok(AgeIter { ages: &AGES[..i+1] }),
|
||||
Some(i) => Ok(AgeIter { ages: &AGES[..i + 1] }),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -452,6 +443,7 @@ mod tests {
|
||||
let q = ClassQuery::OneLetter('C');
|
||||
assert_eq!(
|
||||
q.canonicalize().unwrap(),
|
||||
CanonicalClassQuery::GeneralCategory("Other"));
|
||||
CanonicalClassQuery::GeneralCategory("Other")
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
+1542
-433
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -6,7 +6,7 @@ pub mod perl_word;
|
||||
pub mod property_bool;
|
||||
pub mod property_names;
|
||||
pub mod property_values;
|
||||
pub mod script_extension;
|
||||
pub mod script;
|
||||
pub mod script_extension;
|
||||
pub mod sentence_break;
|
||||
pub mod word_break;
|
||||
|
||||
@@ -5,202 +5,721 @@
|
||||
// ucd-generate is available on crates.io.
|
||||
|
||||
pub const PERL_WORD: &'static [(char, char)] = &[
|
||||
('0', '9'), ('A', 'Z'), ('_', '_'), ('a', 'z'), ('ª', 'ª'), ('µ', 'µ'),
|
||||
('º', 'º'), ('À', 'Ö'), ('Ø', 'ö'), ('ø', 'ˁ'), ('ˆ', 'ˑ'),
|
||||
('ˠ', 'ˤ'), ('ˬ', 'ˬ'), ('ˮ', 'ˮ'), ('\u{300}', 'ʹ'), ('Ͷ', 'ͷ'),
|
||||
('ͺ', 'ͽ'), ('Ϳ', 'Ϳ'), ('Ά', 'Ά'), ('Έ', 'Ί'), ('Ό', 'Ό'),
|
||||
('Ύ', 'Ρ'), ('Σ', 'ϵ'), ('Ϸ', 'ҁ'), ('\u{483}', 'ԯ'), ('Ա', 'Ֆ'),
|
||||
('ՙ', 'ՙ'), ('ՠ', 'ֈ'), ('\u{591}', '\u{5bd}'), ('\u{5bf}', '\u{5bf}'),
|
||||
('\u{5c1}', '\u{5c2}'), ('\u{5c4}', '\u{5c5}'), ('\u{5c7}', '\u{5c7}'),
|
||||
('א', 'ת'), ('ׯ', 'ײ'), ('\u{610}', '\u{61a}'), ('ؠ', '٩'),
|
||||
('ٮ', 'ۓ'), ('ە', '\u{6dc}'), ('\u{6df}', '\u{6e8}'), ('\u{6ea}', 'ۼ'),
|
||||
('ۿ', 'ۿ'), ('ܐ', '\u{74a}'), ('ݍ', 'ޱ'), ('߀', 'ߵ'), ('ߺ', 'ߺ'),
|
||||
('\u{7fd}', '\u{7fd}'), ('ࠀ', '\u{82d}'), ('ࡀ', '\u{85b}'),
|
||||
('ࡠ', 'ࡪ'), ('ࢠ', 'ࢴ'), ('ࢶ', 'ࢽ'), ('\u{8d3}', '\u{8e1}'),
|
||||
('\u{8e3}', '\u{963}'), ('०', '९'), ('ॱ', 'ঃ'), ('অ', 'ঌ'),
|
||||
('এ', 'ঐ'), ('ও', 'ন'), ('প', 'র'), ('ল', 'ল'),
|
||||
('শ', 'হ'), ('\u{9bc}', '\u{9c4}'), ('ে', 'ৈ'), ('ো', 'ৎ'),
|
||||
('\u{9d7}', '\u{9d7}'), ('ড়', 'ঢ়'), ('য়', '\u{9e3}'), ('০', 'ৱ'),
|
||||
('ৼ', 'ৼ'), ('\u{9fe}', '\u{9fe}'), ('\u{a01}', 'ਃ'), ('ਅ', 'ਊ'),
|
||||
('ਏ', 'ਐ'), ('ਓ', 'ਨ'), ('ਪ', 'ਰ'), ('ਲ', 'ਲ਼'),
|
||||
('ਵ', 'ਸ਼'), ('ਸ', 'ਹ'), ('\u{a3c}', '\u{a3c}'), ('ਾ', '\u{a42}'),
|
||||
('\u{a47}', '\u{a48}'), ('\u{a4b}', '\u{a4d}'), ('\u{a51}', '\u{a51}'),
|
||||
('ਖ਼', 'ੜ'), ('ਫ਼', 'ਫ਼'), ('੦', '\u{a75}'), ('\u{a81}', 'ઃ'),
|
||||
('અ', 'ઍ'), ('એ', 'ઑ'), ('ઓ', 'ન'), ('પ', 'ર'),
|
||||
('લ', 'ળ'), ('વ', 'હ'), ('\u{abc}', '\u{ac5}'), ('\u{ac7}', 'ૉ'),
|
||||
('ો', '\u{acd}'), ('ૐ', 'ૐ'), ('ૠ', '\u{ae3}'), ('૦', '૯'),
|
||||
('ૹ', '\u{aff}'), ('\u{b01}', 'ଃ'), ('ଅ', 'ଌ'), ('ଏ', 'ଐ'),
|
||||
('ଓ', 'ନ'), ('ପ', 'ର'), ('ଲ', 'ଳ'), ('ଵ', 'ହ'),
|
||||
('\u{b3c}', '\u{b44}'), ('େ', 'ୈ'), ('ୋ', '\u{b4d}'),
|
||||
('\u{b56}', '\u{b57}'), ('ଡ଼', 'ଢ଼'), ('ୟ', '\u{b63}'), ('୦', '୯'),
|
||||
('ୱ', 'ୱ'), ('\u{b82}', 'ஃ'), ('அ', 'ஊ'), ('எ', 'ஐ'),
|
||||
('ஒ', 'க'), ('ங', 'ச'), ('ஜ', 'ஜ'), ('ஞ', 'ட'),
|
||||
('ண', 'த'), ('ந', 'ப'), ('ம', 'ஹ'), ('\u{bbe}', 'ூ'),
|
||||
('ெ', 'ை'), ('ொ', '\u{bcd}'), ('ௐ', 'ௐ'), ('\u{bd7}', '\u{bd7}'),
|
||||
('௦', '௯'), ('\u{c00}', 'ఌ'), ('ఎ', 'ఐ'), ('ఒ', 'న'),
|
||||
('ప', 'హ'), ('ఽ', 'ౄ'), ('\u{c46}', '\u{c48}'),
|
||||
('\u{c4a}', '\u{c4d}'), ('\u{c55}', '\u{c56}'), ('ౘ', 'ౚ'),
|
||||
('ౠ', '\u{c63}'), ('౦', '౯'), ('ಀ', 'ಃ'), ('ಅ', 'ಌ'),
|
||||
('ಎ', 'ಐ'), ('ಒ', 'ನ'), ('ಪ', 'ಳ'), ('ವ', 'ಹ'),
|
||||
('\u{cbc}', 'ೄ'), ('\u{cc6}', 'ೈ'), ('ೊ', '\u{ccd}'),
|
||||
('\u{cd5}', '\u{cd6}'), ('ೞ', 'ೞ'), ('ೠ', '\u{ce3}'), ('೦', '೯'),
|
||||
('ೱ', 'ೲ'), ('\u{d00}', 'ഃ'), ('അ', 'ഌ'), ('എ', 'ഐ'),
|
||||
('ഒ', '\u{d44}'), ('െ', 'ൈ'), ('ൊ', 'ൎ'), ('ൔ', '\u{d57}'),
|
||||
('ൟ', '\u{d63}'), ('൦', '൯'), ('ൺ', 'ൿ'), ('ං', 'ඃ'),
|
||||
('අ', 'ඖ'), ('ක', 'න'), ('ඳ', 'ර'), ('ල', 'ල'),
|
||||
('ව', 'ෆ'), ('\u{dca}', '\u{dca}'), ('\u{dcf}', '\u{dd4}'),
|
||||
('\u{dd6}', '\u{dd6}'), ('ෘ', '\u{ddf}'), ('෦', '෯'), ('ෲ', 'ෳ'),
|
||||
('ก', '\u{e3a}'), ('เ', '\u{e4e}'), ('๐', '๙'), ('ກ', 'ຂ'),
|
||||
('ຄ', 'ຄ'), ('\u{e86}', 'ຊ'), ('\u{e8c}', 'ຣ'), ('ລ', 'ລ'),
|
||||
('ວ', 'ຽ'), ('ເ', 'ໄ'), ('ໆ', 'ໆ'), ('\u{ec8}', '\u{ecd}'),
|
||||
('໐', '໙'), ('ໜ', 'ໟ'), ('ༀ', 'ༀ'), ('\u{f18}', '\u{f19}'),
|
||||
('༠', '༩'), ('\u{f35}', '\u{f35}'), ('\u{f37}', '\u{f37}'),
|
||||
('\u{f39}', '\u{f39}'), ('༾', 'ཇ'), ('ཉ', 'ཬ'),
|
||||
('\u{f71}', '\u{f84}'), ('\u{f86}', '\u{f97}'), ('\u{f99}', '\u{fbc}'),
|
||||
('\u{fc6}', '\u{fc6}'), ('က', '၉'), ('ၐ', '\u{109d}'), ('Ⴀ', 'Ⴥ'),
|
||||
('Ⴧ', 'Ⴧ'), ('Ⴭ', 'Ⴭ'), ('ა', 'ჺ'), ('ჼ', 'ቈ'),
|
||||
('ቊ', 'ቍ'), ('ቐ', 'ቖ'), ('ቘ', 'ቘ'), ('ቚ', 'ቝ'),
|
||||
('በ', 'ኈ'), ('ኊ', 'ኍ'), ('ነ', 'ኰ'), ('ኲ', 'ኵ'),
|
||||
('ኸ', 'ኾ'), ('ዀ', 'ዀ'), ('ዂ', 'ዅ'), ('ወ', 'ዖ'),
|
||||
('ዘ', 'ጐ'), ('ጒ', 'ጕ'), ('ጘ', 'ፚ'), ('\u{135d}', '\u{135f}'),
|
||||
('ᎀ', 'ᎏ'), ('Ꭰ', 'Ᏽ'), ('ᏸ', 'ᏽ'), ('ᐁ', 'ᙬ'),
|
||||
('ᙯ', 'ᙿ'), ('ᚁ', 'ᚚ'), ('ᚠ', 'ᛪ'), ('ᛮ', 'ᛸ'),
|
||||
('ᜀ', 'ᜌ'), ('ᜎ', '\u{1714}'), ('ᜠ', '\u{1734}'),
|
||||
('ᝀ', '\u{1753}'), ('ᝠ', 'ᝬ'), ('ᝮ', 'ᝰ'),
|
||||
('\u{1772}', '\u{1773}'), ('ក', '\u{17d3}'), ('ៗ', 'ៗ'),
|
||||
('ៜ', '\u{17dd}'), ('០', '៩'), ('\u{180b}', '\u{180d}'),
|
||||
('᠐', '᠙'), ('ᠠ', 'ᡸ'), ('ᢀ', 'ᢪ'), ('ᢰ', 'ᣵ'),
|
||||
('ᤀ', 'ᤞ'), ('\u{1920}', 'ᤫ'), ('ᤰ', '\u{193b}'), ('᥆', 'ᥭ'),
|
||||
('ᥰ', 'ᥴ'), ('ᦀ', 'ᦫ'), ('ᦰ', 'ᧉ'), ('᧐', '᧙'),
|
||||
('ᨀ', '\u{1a1b}'), ('ᨠ', '\u{1a5e}'), ('\u{1a60}', '\u{1a7c}'),
|
||||
('\u{1a7f}', '᪉'), ('᪐', '᪙'), ('ᪧ', 'ᪧ'),
|
||||
('\u{1ab0}', '\u{1abe}'), ('\u{1b00}', 'ᭋ'), ('᭐', '᭙'),
|
||||
('\u{1b6b}', '\u{1b73}'), ('\u{1b80}', '᯳'), ('ᰀ', '\u{1c37}'),
|
||||
('᱀', '᱉'), ('ᱍ', 'ᱽ'), ('ᲀ', 'ᲈ'), ('Ა', 'Ჺ'),
|
||||
('Ჽ', 'Ჿ'), ('\u{1cd0}', '\u{1cd2}'), ('\u{1cd4}', '\u{1cfa}'),
|
||||
('ᴀ', '\u{1df9}'), ('\u{1dfb}', 'ἕ'), ('Ἐ', 'Ἕ'), ('ἠ', 'ὅ'),
|
||||
('Ὀ', 'Ὅ'), ('ὐ', 'ὗ'), ('Ὑ', 'Ὑ'), ('Ὓ', 'Ὓ'),
|
||||
('Ὕ', 'Ὕ'), ('Ὗ', 'ώ'), ('ᾀ', 'ᾴ'), ('ᾶ', 'ᾼ'),
|
||||
('ι', 'ι'), ('ῂ', 'ῄ'), ('ῆ', 'ῌ'), ('ῐ', 'ΐ'),
|
||||
('ῖ', 'Ί'), ('ῠ', 'Ῥ'), ('ῲ', 'ῴ'), ('ῶ', 'ῼ'),
|
||||
('\u{200c}', '\u{200d}'), ('‿', '⁀'), ('⁔', '⁔'), ('ⁱ', 'ⁱ'),
|
||||
('ⁿ', 'ⁿ'), ('ₐ', 'ₜ'), ('\u{20d0}', '\u{20f0}'), ('ℂ', 'ℂ'),
|
||||
('ℇ', 'ℇ'), ('ℊ', 'ℓ'), ('ℕ', 'ℕ'), ('ℙ', 'ℝ'),
|
||||
('ℤ', 'ℤ'), ('Ω', 'Ω'), ('ℨ', 'ℨ'), ('K', 'ℭ'),
|
||||
('ℯ', 'ℹ'), ('ℼ', 'ℿ'), ('ⅅ', 'ⅉ'), ('ⅎ', 'ⅎ'),
|
||||
('Ⅰ', 'ↈ'), ('Ⓐ', 'ⓩ'), ('Ⰰ', 'Ⱞ'), ('ⰰ', 'ⱞ'),
|
||||
('Ⱡ', 'ⳤ'), ('Ⳬ', 'ⳳ'), ('ⴀ', 'ⴥ'), ('ⴧ', 'ⴧ'),
|
||||
('ⴭ', 'ⴭ'), ('ⴰ', 'ⵧ'), ('ⵯ', 'ⵯ'), ('\u{2d7f}', 'ⶖ'),
|
||||
('ⶠ', 'ⶦ'), ('ⶨ', 'ⶮ'), ('ⶰ', 'ⶶ'), ('ⶸ', 'ⶾ'),
|
||||
('ⷀ', 'ⷆ'), ('ⷈ', 'ⷎ'), ('ⷐ', 'ⷖ'), ('ⷘ', 'ⷞ'),
|
||||
('\u{2de0}', '\u{2dff}'), ('ⸯ', 'ⸯ'), ('々', '〇'),
|
||||
('〡', '\u{302f}'), ('〱', '〵'), ('〸', '〼'), ('ぁ', 'ゖ'),
|
||||
('\u{3099}', '\u{309a}'), ('ゝ', 'ゟ'), ('ァ', 'ヺ'), ('ー', 'ヿ'),
|
||||
('ㄅ', 'ㄯ'), ('ㄱ', 'ㆎ'), ('ㆠ', 'ㆺ'), ('ㇰ', 'ㇿ'),
|
||||
('㐀', '䶵'), ('一', '鿯'), ('ꀀ', 'ꒌ'), ('ꓐ', 'ꓽ'),
|
||||
('ꔀ', 'ꘌ'), ('ꘐ', 'ꘫ'), ('Ꙁ', '\u{a672}'),
|
||||
('\u{a674}', '\u{a67d}'), ('ꙿ', '\u{a6f1}'), ('ꜗ', 'ꜟ'),
|
||||
('Ꜣ', 'ꞈ'), ('Ꞌ', '\u{a7bf}'), ('\u{a7c2}', '\u{a7c6}'),
|
||||
('ꟷ', 'ꠧ'), ('ꡀ', 'ꡳ'), ('ꢀ', '\u{a8c5}'), ('꣐', '꣙'),
|
||||
('\u{a8e0}', 'ꣷ'), ('ꣻ', 'ꣻ'), ('ꣽ', '\u{a92d}'), ('ꤰ', '꥓'),
|
||||
('ꥠ', 'ꥼ'), ('\u{a980}', '꧀'), ('ꧏ', '꧙'), ('ꧠ', 'ꧾ'),
|
||||
('ꨀ', '\u{aa36}'), ('ꩀ', 'ꩍ'), ('꩐', '꩙'), ('ꩠ', 'ꩶ'),
|
||||
('ꩺ', 'ꫂ'), ('ꫛ', 'ꫝ'), ('ꫠ', 'ꫯ'), ('ꫲ', '\u{aaf6}'),
|
||||
('ꬁ', 'ꬆ'), ('ꬉ', 'ꬎ'), ('ꬑ', 'ꬖ'), ('ꬠ', 'ꬦ'),
|
||||
('ꬨ', 'ꬮ'), ('ꬰ', 'ꭚ'), ('ꭜ', '\u{ab67}'), ('ꭰ', 'ꯪ'),
|
||||
('꯬', '\u{abed}'), ('꯰', '꯹'), ('가', '힣'), ('ힰ', 'ퟆ'),
|
||||
('ퟋ', 'ퟻ'), ('豈', '舘'), ('並', '龎'), ('ff', 'st'),
|
||||
('ﬓ', 'ﬗ'), ('יִ', 'ﬨ'), ('שׁ', 'זּ'), ('טּ', 'לּ'),
|
||||
('מּ', 'מּ'), ('נּ', 'סּ'), ('ףּ', 'פּ'), ('צּ', 'ﮱ'),
|
||||
('ﯓ', 'ﴽ'), ('ﵐ', 'ﶏ'), ('ﶒ', 'ﷇ'), ('ﷰ', 'ﷻ'),
|
||||
('\u{fe00}', '\u{fe0f}'), ('\u{fe20}', '\u{fe2f}'), ('︳', '︴'),
|
||||
('﹍', '﹏'), ('ﹰ', 'ﹴ'), ('ﹶ', 'ﻼ'), ('0', '9'),
|
||||
('A', 'Z'), ('_', '_'), ('a', 'z'), ('ヲ', 'ᄒ'),
|
||||
('ᅡ', 'ᅦ'), ('ᅧ', 'ᅬ'), ('ᅭ', 'ᅲ'), ('ᅳ', 'ᅵ'),
|
||||
('𐀀', '𐀋'), ('𐀍', '𐀦'), ('𐀨', '𐀺'), ('𐀼', '𐀽'),
|
||||
('𐀿', '𐁍'), ('𐁐', '𐁝'), ('𐂀', '𐃺'), ('𐅀', '𐅴'),
|
||||
('\u{101fd}', '\u{101fd}'), ('𐊀', '𐊜'), ('𐊠', '𐋐'),
|
||||
('\u{102e0}', '\u{102e0}'), ('𐌀', '𐌟'), ('𐌭', '𐍊'),
|
||||
('𐍐', '\u{1037a}'), ('𐎀', '𐎝'), ('𐎠', '𐏃'), ('𐏈', '𐏏'),
|
||||
('𐏑', '𐏕'), ('𐐀', '𐒝'), ('𐒠', '𐒩'), ('𐒰', '𐓓'),
|
||||
('𐓘', '𐓻'), ('𐔀', '𐔧'), ('𐔰', '𐕣'), ('𐘀', '𐜶'),
|
||||
('𐝀', '𐝕'), ('𐝠', '𐝧'), ('𐠀', '𐠅'), ('𐠈', '𐠈'),
|
||||
('𐠊', '𐠵'), ('𐠷', '𐠸'), ('𐠼', '𐠼'), ('𐠿', '𐡕'),
|
||||
('𐡠', '𐡶'), ('𐢀', '𐢞'), ('𐣠', '𐣲'), ('𐣴', '𐣵'),
|
||||
('𐤀', '𐤕'), ('𐤠', '𐤹'), ('𐦀', '𐦷'), ('𐦾', '𐦿'),
|
||||
('𐨀', '\u{10a03}'), ('\u{10a05}', '\u{10a06}'), ('\u{10a0c}', '𐨓'),
|
||||
('𐨕', '𐨗'), ('𐨙', '𐨵'), ('\u{10a38}', '\u{10a3a}'),
|
||||
('\u{10a3f}', '\u{10a3f}'), ('𐩠', '𐩼'), ('𐪀', '𐪜'),
|
||||
('𐫀', '𐫇'), ('𐫉', '\u{10ae6}'), ('𐬀', '𐬵'), ('𐭀', '𐭕'),
|
||||
('𐭠', '𐭲'), ('𐮀', '𐮑'), ('𐰀', '𐱈'), ('𐲀', '𐲲'),
|
||||
('𐳀', '𐳲'), ('𐴀', '\u{10d27}'), ('𐴰', '𐴹'), ('𐼀', '𐼜'),
|
||||
('𐼧', '𐼧'), ('𐼰', '\u{10f50}'), ('\u{10fe0}', '\u{10ff6}'),
|
||||
('𑀀', '\u{11046}'), ('𑁦', '𑁯'), ('\u{1107f}', '\u{110ba}'),
|
||||
('𑃐', '𑃨'), ('𑃰', '𑃹'), ('\u{11100}', '\u{11134}'),
|
||||
('𑄶', '𑄿'), ('𑅄', '𑅆'), ('𑅐', '\u{11173}'), ('𑅶', '𑅶'),
|
||||
('\u{11180}', '𑇄'), ('\u{111c9}', '\u{111cc}'), ('𑇐', '𑇚'),
|
||||
('𑇜', '𑇜'), ('𑈀', '𑈑'), ('𑈓', '\u{11237}'),
|
||||
('\u{1123e}', '\u{1123e}'), ('𑊀', '𑊆'), ('𑊈', '𑊈'),
|
||||
('𑊊', '𑊍'), ('𑊏', '𑊝'), ('𑊟', '𑊨'), ('𑊰', '\u{112ea}'),
|
||||
('𑋰', '𑋹'), ('\u{11300}', '𑌃'), ('𑌅', '𑌌'), ('𑌏', '𑌐'),
|
||||
('𑌓', '𑌨'), ('𑌪', '𑌰'), ('𑌲', '𑌳'), ('𑌵', '𑌹'),
|
||||
('\u{1133b}', '𑍄'), ('𑍇', '𑍈'), ('𑍋', '𑍍'), ('𑍐', '𑍐'),
|
||||
('\u{11357}', '\u{11357}'), ('𑍝', '𑍣'), ('\u{11366}', '\u{1136c}'),
|
||||
('\u{11370}', '\u{11374}'), ('𑐀', '𑑊'), ('𑑐', '𑑙'),
|
||||
('\u{1145e}', '\u{1145f}'), ('𑒀', '𑓅'), ('𑓇', '𑓇'),
|
||||
('𑓐', '𑓙'), ('𑖀', '\u{115b5}'), ('𑖸', '\u{115c0}'),
|
||||
('𑗘', '\u{115dd}'), ('𑘀', '\u{11640}'), ('𑙄', '𑙄'),
|
||||
('𑙐', '𑙙'), ('𑚀', '\u{116b8}'), ('𑛀', '𑛉'), ('𑜀', '𑜚'),
|
||||
('\u{1171d}', '\u{1172b}'), ('𑜰', '𑜹'), ('𑠀', '\u{1183a}'),
|
||||
('𑢠', '𑣩'), ('𑣿', '𑣿'), ('\u{119a0}', '\u{119a7}'),
|
||||
('\u{119aa}', '\u{119d7}'), ('\u{119da}', '\u{119e1}'),
|
||||
('\u{119e3}', '\u{119e4}'), ('𑨀', '\u{11a3e}'),
|
||||
('\u{11a47}', '\u{11a47}'), ('𑩐', '\u{11a99}'), ('𑪝', '𑪝'),
|
||||
('𑫀', '𑫸'), ('𑰀', '𑰈'), ('𑰊', '\u{11c36}'),
|
||||
('\u{11c38}', '𑱀'), ('𑱐', '𑱙'), ('𑱲', '𑲏'),
|
||||
('\u{11c92}', '\u{11ca7}'), ('𑲩', '\u{11cb6}'), ('𑴀', '𑴆'),
|
||||
('𑴈', '𑴉'), ('𑴋', '\u{11d36}'), ('\u{11d3a}', '\u{11d3a}'),
|
||||
('\u{11d3c}', '\u{11d3d}'), ('\u{11d3f}', '\u{11d47}'), ('𑵐', '𑵙'),
|
||||
('𑵠', '𑵥'), ('𑵧', '𑵨'), ('𑵪', '𑶎'),
|
||||
('\u{11d90}', '\u{11d91}'), ('𑶓', '𑶘'), ('𑶠', '𑶩'),
|
||||
('𑻠', '𑻶'), ('𒀀', '𒎙'), ('𒐀', '𒑮'), ('𒒀', '𒕃'),
|
||||
('𓀀', '𓐮'), ('𔐀', '𔙆'), ('𖠀', '𖨸'), ('𖩀', '𖩞'),
|
||||
('𖩠', '𖩩'), ('𖫐', '𖫭'), ('\u{16af0}', '\u{16af4}'),
|
||||
('𖬀', '\u{16b36}'), ('𖭀', '𖭃'), ('𖭐', '𖭙'), ('𖭣', '𖭷'),
|
||||
('𖭽', '𖮏'), ('𖹀', '𖹿'), ('𖼀', '\u{16f4a}'),
|
||||
('\u{16f4f}', '\u{16f87}'), ('\u{16f8f}', '𖾟'), ('𖿠', '𖿡'),
|
||||
('\u{16fe3}', '\u{16fe3}'), ('𗀀', '\u{187f7}'), ('𘠀', '𘫲'),
|
||||
('𛀀', '𛄞'), ('\u{1b150}', '\u{1b152}'), ('\u{1b164}', '\u{1b167}'),
|
||||
('𛅰', '𛋻'), ('𛰀', '𛱪'), ('𛱰', '𛱼'), ('𛲀', '𛲈'),
|
||||
('𛲐', '𛲙'), ('\u{1bc9d}', '\u{1bc9e}'), ('\u{1d165}', '\u{1d169}'),
|
||||
('𝅭', '\u{1d172}'), ('\u{1d17b}', '\u{1d182}'),
|
||||
('\u{1d185}', '\u{1d18b}'), ('\u{1d1aa}', '\u{1d1ad}'),
|
||||
('\u{1d242}', '\u{1d244}'), ('𝐀', '𝑔'), ('𝑖', '𝒜'),
|
||||
('𝒞', '𝒟'), ('𝒢', '𝒢'), ('𝒥', '𝒦'), ('𝒩', '𝒬'),
|
||||
('𝒮', '𝒹'), ('𝒻', '𝒻'), ('𝒽', '𝓃'), ('𝓅', '𝔅'),
|
||||
('𝔇', '𝔊'), ('𝔍', '𝔔'), ('𝔖', '𝔜'), ('𝔞', '𝔹'),
|
||||
('𝔻', '𝔾'), ('𝕀', '𝕄'), ('𝕆', '𝕆'), ('𝕊', '𝕐'),
|
||||
('𝕒', '𝚥'), ('𝚨', '𝛀'), ('𝛂', '𝛚'), ('𝛜', '𝛺'),
|
||||
('𝛼', '𝜔'), ('𝜖', '𝜴'), ('𝜶', '𝝎'), ('𝝐', '𝝮'),
|
||||
('𝝰', '𝞈'), ('𝞊', '𝞨'), ('𝞪', '𝟂'), ('𝟄', '𝟋'),
|
||||
('𝟎', '𝟿'), ('\u{1da00}', '\u{1da36}'), ('\u{1da3b}', '\u{1da6c}'),
|
||||
('\u{1da75}', '\u{1da75}'), ('\u{1da84}', '\u{1da84}'),
|
||||
('\u{1da9b}', '\u{1da9f}'), ('\u{1daa1}', '\u{1daaf}'),
|
||||
('\u{1e000}', '\u{1e006}'), ('\u{1e008}', '\u{1e018}'),
|
||||
('\u{1e01b}', '\u{1e021}'), ('\u{1e023}', '\u{1e024}'),
|
||||
('\u{1e026}', '\u{1e02a}'), ('\u{1e100}', '\u{1e12c}'),
|
||||
('\u{1e130}', '\u{1e13d}'), ('\u{1e140}', '\u{1e149}'),
|
||||
('\u{1e14e}', '\u{1e14e}'), ('\u{1e2c0}', '\u{1e2f9}'), ('𞠀', '𞣄'),
|
||||
('\u{1e8d0}', '\u{1e8d6}'), ('𞤀', '\u{1e94b}'), ('𞥐', '𞥙'),
|
||||
('𞸀', '𞸃'), ('𞸅', '𞸟'), ('𞸡', '𞸢'), ('𞸤', '𞸤'),
|
||||
('𞸧', '𞸧'), ('𞸩', '𞸲'), ('𞸴', '𞸷'), ('𞸹', '𞸹'),
|
||||
('𞸻', '𞸻'), ('𞹂', '𞹂'), ('𞹇', '𞹇'), ('𞹉', '𞹉'),
|
||||
('𞹋', '𞹋'), ('𞹍', '𞹏'), ('𞹑', '𞹒'), ('𞹔', '𞹔'),
|
||||
('𞹗', '𞹗'), ('𞹙', '𞹙'), ('𞹛', '𞹛'), ('𞹝', '𞹝'),
|
||||
('𞹟', '𞹟'), ('𞹡', '𞹢'), ('𞹤', '𞹤'), ('𞹧', '𞹪'),
|
||||
('𞹬', '𞹲'), ('𞹴', '𞹷'), ('𞹹', '𞹼'), ('𞹾', '𞹾'),
|
||||
('𞺀', '𞺉'), ('𞺋', '𞺛'), ('𞺡', '𞺣'), ('𞺥', '𞺩'),
|
||||
('𞺫', '𞺻'), ('🄰', '🅉'), ('🅐', '🅩'), ('🅰', '🆉'),
|
||||
('𠀀', '𪛖'), ('𪜀', '𫜴'), ('𫝀', '𫠝'), ('𫠠', '𬺡'),
|
||||
('𬺰', '𮯠'), ('丽', '𪘀'), ('\u{e0100}', '\u{e01ef}'),
|
||||
('0', '9'),
|
||||
('A', 'Z'),
|
||||
('_', '_'),
|
||||
('a', 'z'),
|
||||
('ª', 'ª'),
|
||||
('µ', 'µ'),
|
||||
('º', 'º'),
|
||||
('À', 'Ö'),
|
||||
('Ø', 'ö'),
|
||||
('ø', 'ˁ'),
|
||||
('ˆ', 'ˑ'),
|
||||
('ˠ', 'ˤ'),
|
||||
('ˬ', 'ˬ'),
|
||||
('ˮ', 'ˮ'),
|
||||
('\u{300}', 'ʹ'),
|
||||
('Ͷ', 'ͷ'),
|
||||
('ͺ', 'ͽ'),
|
||||
('Ϳ', 'Ϳ'),
|
||||
('Ά', 'Ά'),
|
||||
('Έ', 'Ί'),
|
||||
('Ό', 'Ό'),
|
||||
('Ύ', 'Ρ'),
|
||||
('Σ', 'ϵ'),
|
||||
('Ϸ', 'ҁ'),
|
||||
('\u{483}', 'ԯ'),
|
||||
('Ա', 'Ֆ'),
|
||||
('ՙ', 'ՙ'),
|
||||
('ՠ', 'ֈ'),
|
||||
('\u{591}', '\u{5bd}'),
|
||||
('\u{5bf}', '\u{5bf}'),
|
||||
('\u{5c1}', '\u{5c2}'),
|
||||
('\u{5c4}', '\u{5c5}'),
|
||||
('\u{5c7}', '\u{5c7}'),
|
||||
('א', 'ת'),
|
||||
('ׯ', 'ײ'),
|
||||
('\u{610}', '\u{61a}'),
|
||||
('ؠ', '٩'),
|
||||
('ٮ', 'ۓ'),
|
||||
('ە', '\u{6dc}'),
|
||||
('\u{6df}', '\u{6e8}'),
|
||||
('\u{6ea}', 'ۼ'),
|
||||
('ۿ', 'ۿ'),
|
||||
('ܐ', '\u{74a}'),
|
||||
('ݍ', 'ޱ'),
|
||||
('߀', 'ߵ'),
|
||||
('ߺ', 'ߺ'),
|
||||
('\u{7fd}', '\u{7fd}'),
|
||||
('ࠀ', '\u{82d}'),
|
||||
('ࡀ', '\u{85b}'),
|
||||
('ࡠ', 'ࡪ'),
|
||||
('ࢠ', 'ࢴ'),
|
||||
('ࢶ', 'ࢽ'),
|
||||
('\u{8d3}', '\u{8e1}'),
|
||||
('\u{8e3}', '\u{963}'),
|
||||
('०', '९'),
|
||||
('ॱ', 'ঃ'),
|
||||
('অ', 'ঌ'),
|
||||
('এ', 'ঐ'),
|
||||
('ও', 'ন'),
|
||||
('প', 'র'),
|
||||
('ল', 'ল'),
|
||||
('শ', 'হ'),
|
||||
('\u{9bc}', '\u{9c4}'),
|
||||
('ে', 'ৈ'),
|
||||
('ো', 'ৎ'),
|
||||
('\u{9d7}', '\u{9d7}'),
|
||||
('ড়', 'ঢ়'),
|
||||
('য়', '\u{9e3}'),
|
||||
('০', 'ৱ'),
|
||||
('ৼ', 'ৼ'),
|
||||
('\u{9fe}', '\u{9fe}'),
|
||||
('\u{a01}', 'ਃ'),
|
||||
('ਅ', 'ਊ'),
|
||||
('ਏ', 'ਐ'),
|
||||
('ਓ', 'ਨ'),
|
||||
('ਪ', 'ਰ'),
|
||||
('ਲ', 'ਲ਼'),
|
||||
('ਵ', 'ਸ਼'),
|
||||
('ਸ', 'ਹ'),
|
||||
('\u{a3c}', '\u{a3c}'),
|
||||
('ਾ', '\u{a42}'),
|
||||
('\u{a47}', '\u{a48}'),
|
||||
('\u{a4b}', '\u{a4d}'),
|
||||
('\u{a51}', '\u{a51}'),
|
||||
('ਖ਼', 'ੜ'),
|
||||
('ਫ਼', 'ਫ਼'),
|
||||
('੦', '\u{a75}'),
|
||||
('\u{a81}', 'ઃ'),
|
||||
('અ', 'ઍ'),
|
||||
('એ', 'ઑ'),
|
||||
('ઓ', 'ન'),
|
||||
('પ', 'ર'),
|
||||
('લ', 'ળ'),
|
||||
('વ', 'હ'),
|
||||
('\u{abc}', '\u{ac5}'),
|
||||
('\u{ac7}', 'ૉ'),
|
||||
('ો', '\u{acd}'),
|
||||
('ૐ', 'ૐ'),
|
||||
('ૠ', '\u{ae3}'),
|
||||
('૦', '૯'),
|
||||
('ૹ', '\u{aff}'),
|
||||
('\u{b01}', 'ଃ'),
|
||||
('ଅ', 'ଌ'),
|
||||
('ଏ', 'ଐ'),
|
||||
('ଓ', 'ନ'),
|
||||
('ପ', 'ର'),
|
||||
('ଲ', 'ଳ'),
|
||||
('ଵ', 'ହ'),
|
||||
('\u{b3c}', '\u{b44}'),
|
||||
('େ', 'ୈ'),
|
||||
('ୋ', '\u{b4d}'),
|
||||
('\u{b56}', '\u{b57}'),
|
||||
('ଡ଼', 'ଢ଼'),
|
||||
('ୟ', '\u{b63}'),
|
||||
('୦', '୯'),
|
||||
('ୱ', 'ୱ'),
|
||||
('\u{b82}', 'ஃ'),
|
||||
('அ', 'ஊ'),
|
||||
('எ', 'ஐ'),
|
||||
('ஒ', 'க'),
|
||||
('ங', 'ச'),
|
||||
('ஜ', 'ஜ'),
|
||||
('ஞ', 'ட'),
|
||||
('ண', 'த'),
|
||||
('ந', 'ப'),
|
||||
('ம', 'ஹ'),
|
||||
('\u{bbe}', 'ூ'),
|
||||
('ெ', 'ை'),
|
||||
('ொ', '\u{bcd}'),
|
||||
('ௐ', 'ௐ'),
|
||||
('\u{bd7}', '\u{bd7}'),
|
||||
('௦', '௯'),
|
||||
('\u{c00}', 'ఌ'),
|
||||
('ఎ', 'ఐ'),
|
||||
('ఒ', 'న'),
|
||||
('ప', 'హ'),
|
||||
('ఽ', 'ౄ'),
|
||||
('\u{c46}', '\u{c48}'),
|
||||
('\u{c4a}', '\u{c4d}'),
|
||||
('\u{c55}', '\u{c56}'),
|
||||
('ౘ', 'ౚ'),
|
||||
('ౠ', '\u{c63}'),
|
||||
('౦', '౯'),
|
||||
('ಀ', 'ಃ'),
|
||||
('ಅ', 'ಌ'),
|
||||
('ಎ', 'ಐ'),
|
||||
('ಒ', 'ನ'),
|
||||
('ಪ', 'ಳ'),
|
||||
('ವ', 'ಹ'),
|
||||
('\u{cbc}', 'ೄ'),
|
||||
('\u{cc6}', 'ೈ'),
|
||||
('ೊ', '\u{ccd}'),
|
||||
('\u{cd5}', '\u{cd6}'),
|
||||
('ೞ', 'ೞ'),
|
||||
('ೠ', '\u{ce3}'),
|
||||
('೦', '೯'),
|
||||
('ೱ', 'ೲ'),
|
||||
('\u{d00}', 'ഃ'),
|
||||
('അ', 'ഌ'),
|
||||
('എ', 'ഐ'),
|
||||
('ഒ', '\u{d44}'),
|
||||
('െ', 'ൈ'),
|
||||
('ൊ', 'ൎ'),
|
||||
('ൔ', '\u{d57}'),
|
||||
('ൟ', '\u{d63}'),
|
||||
('൦', '൯'),
|
||||
('ൺ', 'ൿ'),
|
||||
('ං', 'ඃ'),
|
||||
('අ', 'ඖ'),
|
||||
('ක', 'න'),
|
||||
('ඳ', 'ර'),
|
||||
('ල', 'ල'),
|
||||
('ව', 'ෆ'),
|
||||
('\u{dca}', '\u{dca}'),
|
||||
('\u{dcf}', '\u{dd4}'),
|
||||
('\u{dd6}', '\u{dd6}'),
|
||||
('ෘ', '\u{ddf}'),
|
||||
('෦', '෯'),
|
||||
('ෲ', 'ෳ'),
|
||||
('ก', '\u{e3a}'),
|
||||
('เ', '\u{e4e}'),
|
||||
('๐', '๙'),
|
||||
('ກ', 'ຂ'),
|
||||
('ຄ', 'ຄ'),
|
||||
('\u{e86}', 'ຊ'),
|
||||
('\u{e8c}', 'ຣ'),
|
||||
('ລ', 'ລ'),
|
||||
('ວ', 'ຽ'),
|
||||
('ເ', 'ໄ'),
|
||||
('ໆ', 'ໆ'),
|
||||
('\u{ec8}', '\u{ecd}'),
|
||||
('໐', '໙'),
|
||||
('ໜ', 'ໟ'),
|
||||
('ༀ', 'ༀ'),
|
||||
('\u{f18}', '\u{f19}'),
|
||||
('༠', '༩'),
|
||||
('\u{f35}', '\u{f35}'),
|
||||
('\u{f37}', '\u{f37}'),
|
||||
('\u{f39}', '\u{f39}'),
|
||||
('༾', 'ཇ'),
|
||||
('ཉ', 'ཬ'),
|
||||
('\u{f71}', '\u{f84}'),
|
||||
('\u{f86}', '\u{f97}'),
|
||||
('\u{f99}', '\u{fbc}'),
|
||||
('\u{fc6}', '\u{fc6}'),
|
||||
('က', '၉'),
|
||||
('ၐ', '\u{109d}'),
|
||||
('Ⴀ', 'Ⴥ'),
|
||||
('Ⴧ', 'Ⴧ'),
|
||||
('Ⴭ', 'Ⴭ'),
|
||||
('ა', 'ჺ'),
|
||||
('ჼ', 'ቈ'),
|
||||
('ቊ', 'ቍ'),
|
||||
('ቐ', 'ቖ'),
|
||||
('ቘ', 'ቘ'),
|
||||
('ቚ', 'ቝ'),
|
||||
('በ', 'ኈ'),
|
||||
('ኊ', 'ኍ'),
|
||||
('ነ', 'ኰ'),
|
||||
('ኲ', 'ኵ'),
|
||||
('ኸ', 'ኾ'),
|
||||
('ዀ', 'ዀ'),
|
||||
('ዂ', 'ዅ'),
|
||||
('ወ', 'ዖ'),
|
||||
('ዘ', 'ጐ'),
|
||||
('ጒ', 'ጕ'),
|
||||
('ጘ', 'ፚ'),
|
||||
('\u{135d}', '\u{135f}'),
|
||||
('ᎀ', 'ᎏ'),
|
||||
('Ꭰ', 'Ᏽ'),
|
||||
('ᏸ', 'ᏽ'),
|
||||
('ᐁ', 'ᙬ'),
|
||||
('ᙯ', 'ᙿ'),
|
||||
('ᚁ', 'ᚚ'),
|
||||
('ᚠ', 'ᛪ'),
|
||||
('ᛮ', 'ᛸ'),
|
||||
('ᜀ', 'ᜌ'),
|
||||
('ᜎ', '\u{1714}'),
|
||||
('ᜠ', '\u{1734}'),
|
||||
('ᝀ', '\u{1753}'),
|
||||
('ᝠ', 'ᝬ'),
|
||||
('ᝮ', 'ᝰ'),
|
||||
('\u{1772}', '\u{1773}'),
|
||||
('ក', '\u{17d3}'),
|
||||
('ៗ', 'ៗ'),
|
||||
('ៜ', '\u{17dd}'),
|
||||
('០', '៩'),
|
||||
('\u{180b}', '\u{180d}'),
|
||||
('᠐', '᠙'),
|
||||
('ᠠ', 'ᡸ'),
|
||||
('ᢀ', 'ᢪ'),
|
||||
('ᢰ', 'ᣵ'),
|
||||
('ᤀ', 'ᤞ'),
|
||||
('\u{1920}', 'ᤫ'),
|
||||
('ᤰ', '\u{193b}'),
|
||||
('᥆', 'ᥭ'),
|
||||
('ᥰ', 'ᥴ'),
|
||||
('ᦀ', 'ᦫ'),
|
||||
('ᦰ', 'ᧉ'),
|
||||
('᧐', '᧙'),
|
||||
('ᨀ', '\u{1a1b}'),
|
||||
('ᨠ', '\u{1a5e}'),
|
||||
('\u{1a60}', '\u{1a7c}'),
|
||||
('\u{1a7f}', '᪉'),
|
||||
('᪐', '᪙'),
|
||||
('ᪧ', 'ᪧ'),
|
||||
('\u{1ab0}', '\u{1abe}'),
|
||||
('\u{1b00}', 'ᭋ'),
|
||||
('᭐', '᭙'),
|
||||
('\u{1b6b}', '\u{1b73}'),
|
||||
('\u{1b80}', '᯳'),
|
||||
('ᰀ', '\u{1c37}'),
|
||||
('᱀', '᱉'),
|
||||
('ᱍ', 'ᱽ'),
|
||||
('ᲀ', 'ᲈ'),
|
||||
('Ა', 'Ჺ'),
|
||||
('Ჽ', 'Ჿ'),
|
||||
('\u{1cd0}', '\u{1cd2}'),
|
||||
('\u{1cd4}', '\u{1cfa}'),
|
||||
('ᴀ', '\u{1df9}'),
|
||||
('\u{1dfb}', 'ἕ'),
|
||||
('Ἐ', 'Ἕ'),
|
||||
('ἠ', 'ὅ'),
|
||||
('Ὀ', 'Ὅ'),
|
||||
('ὐ', 'ὗ'),
|
||||
('Ὑ', 'Ὑ'),
|
||||
('Ὓ', 'Ὓ'),
|
||||
('Ὕ', 'Ὕ'),
|
||||
('Ὗ', 'ώ'),
|
||||
('ᾀ', 'ᾴ'),
|
||||
('ᾶ', 'ᾼ'),
|
||||
('ι', 'ι'),
|
||||
('ῂ', 'ῄ'),
|
||||
('ῆ', 'ῌ'),
|
||||
('ῐ', 'ΐ'),
|
||||
('ῖ', 'Ί'),
|
||||
('ῠ', 'Ῥ'),
|
||||
('ῲ', 'ῴ'),
|
||||
('ῶ', 'ῼ'),
|
||||
('\u{200c}', '\u{200d}'),
|
||||
('‿', '⁀'),
|
||||
('⁔', '⁔'),
|
||||
('ⁱ', 'ⁱ'),
|
||||
('ⁿ', 'ⁿ'),
|
||||
('ₐ', 'ₜ'),
|
||||
('\u{20d0}', '\u{20f0}'),
|
||||
('ℂ', 'ℂ'),
|
||||
('ℇ', 'ℇ'),
|
||||
('ℊ', 'ℓ'),
|
||||
('ℕ', 'ℕ'),
|
||||
('ℙ', 'ℝ'),
|
||||
('ℤ', 'ℤ'),
|
||||
('Ω', 'Ω'),
|
||||
('ℨ', 'ℨ'),
|
||||
('K', 'ℭ'),
|
||||
('ℯ', 'ℹ'),
|
||||
('ℼ', 'ℿ'),
|
||||
('ⅅ', 'ⅉ'),
|
||||
('ⅎ', 'ⅎ'),
|
||||
('Ⅰ', 'ↈ'),
|
||||
('Ⓐ', 'ⓩ'),
|
||||
('Ⰰ', 'Ⱞ'),
|
||||
('ⰰ', 'ⱞ'),
|
||||
('Ⱡ', 'ⳤ'),
|
||||
('Ⳬ', 'ⳳ'),
|
||||
('ⴀ', 'ⴥ'),
|
||||
('ⴧ', 'ⴧ'),
|
||||
('ⴭ', 'ⴭ'),
|
||||
('ⴰ', 'ⵧ'),
|
||||
('ⵯ', 'ⵯ'),
|
||||
('\u{2d7f}', 'ⶖ'),
|
||||
('ⶠ', 'ⶦ'),
|
||||
('ⶨ', 'ⶮ'),
|
||||
('ⶰ', 'ⶶ'),
|
||||
('ⶸ', 'ⶾ'),
|
||||
('ⷀ', 'ⷆ'),
|
||||
('ⷈ', 'ⷎ'),
|
||||
('ⷐ', 'ⷖ'),
|
||||
('ⷘ', 'ⷞ'),
|
||||
('\u{2de0}', '\u{2dff}'),
|
||||
('ⸯ', 'ⸯ'),
|
||||
('々', '〇'),
|
||||
('〡', '\u{302f}'),
|
||||
('〱', '〵'),
|
||||
('〸', '〼'),
|
||||
('ぁ', 'ゖ'),
|
||||
('\u{3099}', '\u{309a}'),
|
||||
('ゝ', 'ゟ'),
|
||||
('ァ', 'ヺ'),
|
||||
('ー', 'ヿ'),
|
||||
('ㄅ', 'ㄯ'),
|
||||
('ㄱ', 'ㆎ'),
|
||||
('ㆠ', 'ㆺ'),
|
||||
('ㇰ', 'ㇿ'),
|
||||
('㐀', '䶵'),
|
||||
('一', '鿯'),
|
||||
('ꀀ', 'ꒌ'),
|
||||
('ꓐ', 'ꓽ'),
|
||||
('ꔀ', 'ꘌ'),
|
||||
('ꘐ', 'ꘫ'),
|
||||
('Ꙁ', '\u{a672}'),
|
||||
('\u{a674}', '\u{a67d}'),
|
||||
('ꙿ', '\u{a6f1}'),
|
||||
('ꜗ', 'ꜟ'),
|
||||
('Ꜣ', 'ꞈ'),
|
||||
('Ꞌ', '\u{a7bf}'),
|
||||
('\u{a7c2}', '\u{a7c6}'),
|
||||
('ꟷ', 'ꠧ'),
|
||||
('ꡀ', 'ꡳ'),
|
||||
('ꢀ', '\u{a8c5}'),
|
||||
('꣐', '꣙'),
|
||||
('\u{a8e0}', 'ꣷ'),
|
||||
('ꣻ', 'ꣻ'),
|
||||
('ꣽ', '\u{a92d}'),
|
||||
('ꤰ', '꥓'),
|
||||
('ꥠ', 'ꥼ'),
|
||||
('\u{a980}', '꧀'),
|
||||
('ꧏ', '꧙'),
|
||||
('ꧠ', 'ꧾ'),
|
||||
('ꨀ', '\u{aa36}'),
|
||||
('ꩀ', 'ꩍ'),
|
||||
('꩐', '꩙'),
|
||||
('ꩠ', 'ꩶ'),
|
||||
('ꩺ', 'ꫂ'),
|
||||
('ꫛ', 'ꫝ'),
|
||||
('ꫠ', 'ꫯ'),
|
||||
('ꫲ', '\u{aaf6}'),
|
||||
('ꬁ', 'ꬆ'),
|
||||
('ꬉ', 'ꬎ'),
|
||||
('ꬑ', 'ꬖ'),
|
||||
('ꬠ', 'ꬦ'),
|
||||
('ꬨ', 'ꬮ'),
|
||||
('ꬰ', 'ꭚ'),
|
||||
('ꭜ', '\u{ab67}'),
|
||||
('ꭰ', 'ꯪ'),
|
||||
('꯬', '\u{abed}'),
|
||||
('꯰', '꯹'),
|
||||
('가', '힣'),
|
||||
('ힰ', 'ퟆ'),
|
||||
('ퟋ', 'ퟻ'),
|
||||
('豈', '舘'),
|
||||
('並', '龎'),
|
||||
('ff', 'st'),
|
||||
('ﬓ', 'ﬗ'),
|
||||
('יִ', 'ﬨ'),
|
||||
('שׁ', 'זּ'),
|
||||
('טּ', 'לּ'),
|
||||
('מּ', 'מּ'),
|
||||
('נּ', 'סּ'),
|
||||
('ףּ', 'פּ'),
|
||||
('צּ', 'ﮱ'),
|
||||
('ﯓ', 'ﴽ'),
|
||||
('ﵐ', 'ﶏ'),
|
||||
('ﶒ', 'ﷇ'),
|
||||
('ﷰ', 'ﷻ'),
|
||||
('\u{fe00}', '\u{fe0f}'),
|
||||
('\u{fe20}', '\u{fe2f}'),
|
||||
('︳', '︴'),
|
||||
('﹍', '﹏'),
|
||||
('ﹰ', 'ﹴ'),
|
||||
('ﹶ', 'ﻼ'),
|
||||
('0', '9'),
|
||||
('A', 'Z'),
|
||||
('_', '_'),
|
||||
('a', 'z'),
|
||||
('ヲ', 'ᄒ'),
|
||||
('ᅡ', 'ᅦ'),
|
||||
('ᅧ', 'ᅬ'),
|
||||
('ᅭ', 'ᅲ'),
|
||||
('ᅳ', 'ᅵ'),
|
||||
('𐀀', '𐀋'),
|
||||
('𐀍', '𐀦'),
|
||||
('𐀨', '𐀺'),
|
||||
('𐀼', '𐀽'),
|
||||
('𐀿', '𐁍'),
|
||||
('𐁐', '𐁝'),
|
||||
('𐂀', '𐃺'),
|
||||
('𐅀', '𐅴'),
|
||||
('\u{101fd}', '\u{101fd}'),
|
||||
('𐊀', '𐊜'),
|
||||
('𐊠', '𐋐'),
|
||||
('\u{102e0}', '\u{102e0}'),
|
||||
('𐌀', '𐌟'),
|
||||
('𐌭', '𐍊'),
|
||||
('𐍐', '\u{1037a}'),
|
||||
('𐎀', '𐎝'),
|
||||
('𐎠', '𐏃'),
|
||||
('𐏈', '𐏏'),
|
||||
('𐏑', '𐏕'),
|
||||
('𐐀', '𐒝'),
|
||||
('𐒠', '𐒩'),
|
||||
('𐒰', '𐓓'),
|
||||
('𐓘', '𐓻'),
|
||||
('𐔀', '𐔧'),
|
||||
('𐔰', '𐕣'),
|
||||
('𐘀', '𐜶'),
|
||||
('𐝀', '𐝕'),
|
||||
('𐝠', '𐝧'),
|
||||
('𐠀', '𐠅'),
|
||||
('𐠈', '𐠈'),
|
||||
('𐠊', '𐠵'),
|
||||
('𐠷', '𐠸'),
|
||||
('𐠼', '𐠼'),
|
||||
('𐠿', '𐡕'),
|
||||
('𐡠', '𐡶'),
|
||||
('𐢀', '𐢞'),
|
||||
('𐣠', '𐣲'),
|
||||
('𐣴', '𐣵'),
|
||||
('𐤀', '𐤕'),
|
||||
('𐤠', '𐤹'),
|
||||
('𐦀', '𐦷'),
|
||||
('𐦾', '𐦿'),
|
||||
('𐨀', '\u{10a03}'),
|
||||
('\u{10a05}', '\u{10a06}'),
|
||||
('\u{10a0c}', '𐨓'),
|
||||
('𐨕', '𐨗'),
|
||||
('𐨙', '𐨵'),
|
||||
('\u{10a38}', '\u{10a3a}'),
|
||||
('\u{10a3f}', '\u{10a3f}'),
|
||||
('𐩠', '𐩼'),
|
||||
('𐪀', '𐪜'),
|
||||
('𐫀', '𐫇'),
|
||||
('𐫉', '\u{10ae6}'),
|
||||
('𐬀', '𐬵'),
|
||||
('𐭀', '𐭕'),
|
||||
('𐭠', '𐭲'),
|
||||
('𐮀', '𐮑'),
|
||||
('𐰀', '𐱈'),
|
||||
('𐲀', '𐲲'),
|
||||
('𐳀', '𐳲'),
|
||||
('𐴀', '\u{10d27}'),
|
||||
('𐴰', '𐴹'),
|
||||
('𐼀', '𐼜'),
|
||||
('𐼧', '𐼧'),
|
||||
('𐼰', '\u{10f50}'),
|
||||
('\u{10fe0}', '\u{10ff6}'),
|
||||
('𑀀', '\u{11046}'),
|
||||
('𑁦', '𑁯'),
|
||||
('\u{1107f}', '\u{110ba}'),
|
||||
('𑃐', '𑃨'),
|
||||
('𑃰', '𑃹'),
|
||||
('\u{11100}', '\u{11134}'),
|
||||
('𑄶', '𑄿'),
|
||||
('𑅄', '𑅆'),
|
||||
('𑅐', '\u{11173}'),
|
||||
('𑅶', '𑅶'),
|
||||
('\u{11180}', '𑇄'),
|
||||
('\u{111c9}', '\u{111cc}'),
|
||||
('𑇐', '𑇚'),
|
||||
('𑇜', '𑇜'),
|
||||
('𑈀', '𑈑'),
|
||||
('𑈓', '\u{11237}'),
|
||||
('\u{1123e}', '\u{1123e}'),
|
||||
('𑊀', '𑊆'),
|
||||
('𑊈', '𑊈'),
|
||||
('𑊊', '𑊍'),
|
||||
('𑊏', '𑊝'),
|
||||
('𑊟', '𑊨'),
|
||||
('𑊰', '\u{112ea}'),
|
||||
('𑋰', '𑋹'),
|
||||
('\u{11300}', '𑌃'),
|
||||
('𑌅', '𑌌'),
|
||||
('𑌏', '𑌐'),
|
||||
('𑌓', '𑌨'),
|
||||
('𑌪', '𑌰'),
|
||||
('𑌲', '𑌳'),
|
||||
('𑌵', '𑌹'),
|
||||
('\u{1133b}', '𑍄'),
|
||||
('𑍇', '𑍈'),
|
||||
('𑍋', '𑍍'),
|
||||
('𑍐', '𑍐'),
|
||||
('\u{11357}', '\u{11357}'),
|
||||
('𑍝', '𑍣'),
|
||||
('\u{11366}', '\u{1136c}'),
|
||||
('\u{11370}', '\u{11374}'),
|
||||
('𑐀', '𑑊'),
|
||||
('𑑐', '𑑙'),
|
||||
('\u{1145e}', '\u{1145f}'),
|
||||
('𑒀', '𑓅'),
|
||||
('𑓇', '𑓇'),
|
||||
('𑓐', '𑓙'),
|
||||
('𑖀', '\u{115b5}'),
|
||||
('𑖸', '\u{115c0}'),
|
||||
('𑗘', '\u{115dd}'),
|
||||
('𑘀', '\u{11640}'),
|
||||
('𑙄', '𑙄'),
|
||||
('𑙐', '𑙙'),
|
||||
('𑚀', '\u{116b8}'),
|
||||
('𑛀', '𑛉'),
|
||||
('𑜀', '𑜚'),
|
||||
('\u{1171d}', '\u{1172b}'),
|
||||
('𑜰', '𑜹'),
|
||||
('𑠀', '\u{1183a}'),
|
||||
('𑢠', '𑣩'),
|
||||
('𑣿', '𑣿'),
|
||||
('\u{119a0}', '\u{119a7}'),
|
||||
('\u{119aa}', '\u{119d7}'),
|
||||
('\u{119da}', '\u{119e1}'),
|
||||
('\u{119e3}', '\u{119e4}'),
|
||||
('𑨀', '\u{11a3e}'),
|
||||
('\u{11a47}', '\u{11a47}'),
|
||||
('𑩐', '\u{11a99}'),
|
||||
('𑪝', '𑪝'),
|
||||
('𑫀', '𑫸'),
|
||||
('𑰀', '𑰈'),
|
||||
('𑰊', '\u{11c36}'),
|
||||
('\u{11c38}', '𑱀'),
|
||||
('𑱐', '𑱙'),
|
||||
('𑱲', '𑲏'),
|
||||
('\u{11c92}', '\u{11ca7}'),
|
||||
('𑲩', '\u{11cb6}'),
|
||||
('𑴀', '𑴆'),
|
||||
('𑴈', '𑴉'),
|
||||
('𑴋', '\u{11d36}'),
|
||||
('\u{11d3a}', '\u{11d3a}'),
|
||||
('\u{11d3c}', '\u{11d3d}'),
|
||||
('\u{11d3f}', '\u{11d47}'),
|
||||
('𑵐', '𑵙'),
|
||||
('𑵠', '𑵥'),
|
||||
('𑵧', '𑵨'),
|
||||
('𑵪', '𑶎'),
|
||||
('\u{11d90}', '\u{11d91}'),
|
||||
('𑶓', '𑶘'),
|
||||
('𑶠', '𑶩'),
|
||||
('𑻠', '𑻶'),
|
||||
('𒀀', '𒎙'),
|
||||
('𒐀', '𒑮'),
|
||||
('𒒀', '𒕃'),
|
||||
('𓀀', '𓐮'),
|
||||
('𔐀', '𔙆'),
|
||||
('𖠀', '𖨸'),
|
||||
('𖩀', '𖩞'),
|
||||
('𖩠', '𖩩'),
|
||||
('𖫐', '𖫭'),
|
||||
('\u{16af0}', '\u{16af4}'),
|
||||
('𖬀', '\u{16b36}'),
|
||||
('𖭀', '𖭃'),
|
||||
('𖭐', '𖭙'),
|
||||
('𖭣', '𖭷'),
|
||||
('𖭽', '𖮏'),
|
||||
('𖹀', '𖹿'),
|
||||
('𖼀', '\u{16f4a}'),
|
||||
('\u{16f4f}', '\u{16f87}'),
|
||||
('\u{16f8f}', '𖾟'),
|
||||
('𖿠', '𖿡'),
|
||||
('\u{16fe3}', '\u{16fe3}'),
|
||||
('𗀀', '\u{187f7}'),
|
||||
('𘠀', '𘫲'),
|
||||
('𛀀', '𛄞'),
|
||||
('\u{1b150}', '\u{1b152}'),
|
||||
('\u{1b164}', '\u{1b167}'),
|
||||
('𛅰', '𛋻'),
|
||||
('𛰀', '𛱪'),
|
||||
('𛱰', '𛱼'),
|
||||
('𛲀', '𛲈'),
|
||||
('𛲐', '𛲙'),
|
||||
('\u{1bc9d}', '\u{1bc9e}'),
|
||||
('\u{1d165}', '\u{1d169}'),
|
||||
('𝅭', '\u{1d172}'),
|
||||
('\u{1d17b}', '\u{1d182}'),
|
||||
('\u{1d185}', '\u{1d18b}'),
|
||||
('\u{1d1aa}', '\u{1d1ad}'),
|
||||
('\u{1d242}', '\u{1d244}'),
|
||||
('𝐀', '𝑔'),
|
||||
('𝑖', '𝒜'),
|
||||
('𝒞', '𝒟'),
|
||||
('𝒢', '𝒢'),
|
||||
('𝒥', '𝒦'),
|
||||
('𝒩', '𝒬'),
|
||||
('𝒮', '𝒹'),
|
||||
('𝒻', '𝒻'),
|
||||
('𝒽', '𝓃'),
|
||||
('𝓅', '𝔅'),
|
||||
('𝔇', '𝔊'),
|
||||
('𝔍', '𝔔'),
|
||||
('𝔖', '𝔜'),
|
||||
('𝔞', '𝔹'),
|
||||
('𝔻', '𝔾'),
|
||||
('𝕀', '𝕄'),
|
||||
('𝕆', '𝕆'),
|
||||
('𝕊', '𝕐'),
|
||||
('𝕒', '𝚥'),
|
||||
('𝚨', '𝛀'),
|
||||
('𝛂', '𝛚'),
|
||||
('𝛜', '𝛺'),
|
||||
('𝛼', '𝜔'),
|
||||
('𝜖', '𝜴'),
|
||||
('𝜶', '𝝎'),
|
||||
('𝝐', '𝝮'),
|
||||
('𝝰', '𝞈'),
|
||||
('𝞊', '𝞨'),
|
||||
('𝞪', '𝟂'),
|
||||
('𝟄', '𝟋'),
|
||||
('𝟎', '𝟿'),
|
||||
('\u{1da00}', '\u{1da36}'),
|
||||
('\u{1da3b}', '\u{1da6c}'),
|
||||
('\u{1da75}', '\u{1da75}'),
|
||||
('\u{1da84}', '\u{1da84}'),
|
||||
('\u{1da9b}', '\u{1da9f}'),
|
||||
('\u{1daa1}', '\u{1daaf}'),
|
||||
('\u{1e000}', '\u{1e006}'),
|
||||
('\u{1e008}', '\u{1e018}'),
|
||||
('\u{1e01b}', '\u{1e021}'),
|
||||
('\u{1e023}', '\u{1e024}'),
|
||||
('\u{1e026}', '\u{1e02a}'),
|
||||
('\u{1e100}', '\u{1e12c}'),
|
||||
('\u{1e130}', '\u{1e13d}'),
|
||||
('\u{1e140}', '\u{1e149}'),
|
||||
('\u{1e14e}', '\u{1e14e}'),
|
||||
('\u{1e2c0}', '\u{1e2f9}'),
|
||||
('𞠀', '𞣄'),
|
||||
('\u{1e8d0}', '\u{1e8d6}'),
|
||||
('𞤀', '\u{1e94b}'),
|
||||
('𞥐', '𞥙'),
|
||||
('𞸀', '𞸃'),
|
||||
('𞸅', '𞸟'),
|
||||
('𞸡', '𞸢'),
|
||||
('𞸤', '𞸤'),
|
||||
('𞸧', '𞸧'),
|
||||
('𞸩', '𞸲'),
|
||||
('𞸴', '𞸷'),
|
||||
('𞸹', '𞸹'),
|
||||
('𞸻', '𞸻'),
|
||||
('𞹂', '𞹂'),
|
||||
('𞹇', '𞹇'),
|
||||
('𞹉', '𞹉'),
|
||||
('𞹋', '𞹋'),
|
||||
('𞹍', '𞹏'),
|
||||
('𞹑', '𞹒'),
|
||||
('𞹔', '𞹔'),
|
||||
('𞹗', '𞹗'),
|
||||
('𞹙', '𞹙'),
|
||||
('𞹛', '𞹛'),
|
||||
('𞹝', '𞹝'),
|
||||
('𞹟', '𞹟'),
|
||||
('𞹡', '𞹢'),
|
||||
('𞹤', '𞹤'),
|
||||
('𞹧', '𞹪'),
|
||||
('𞹬', '𞹲'),
|
||||
('𞹴', '𞹷'),
|
||||
('𞹹', '𞹼'),
|
||||
('𞹾', '𞹾'),
|
||||
('𞺀', '𞺉'),
|
||||
('𞺋', '𞺛'),
|
||||
('𞺡', '𞺣'),
|
||||
('𞺥', '𞺩'),
|
||||
('𞺫', '𞺻'),
|
||||
('🄰', '🅉'),
|
||||
('🅐', '🅩'),
|
||||
('🅰', '🆉'),
|
||||
('𠀀', '𪛖'),
|
||||
('𪜀', '𫜴'),
|
||||
('𫝀', '𫠝'),
|
||||
('𫠠', '𬺡'),
|
||||
('𬺰', '𮯠'),
|
||||
('丽', '𪘀'),
|
||||
('\u{e0100}', '\u{e01ef}'),
|
||||
];
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -5,147 +5,249 @@
|
||||
// ucd-generate is available on crates.io.
|
||||
|
||||
pub const PROPERTY_NAMES: &'static [(&'static str, &'static str)] = &[
|
||||
("age", "Age"), ("ahex", "ASCII_Hex_Digit"), ("alpha", "Alphabetic"),
|
||||
("alphabetic", "Alphabetic"), ("asciihexdigit", "ASCII_Hex_Digit"),
|
||||
("bc", "Bidi_Class"), ("bidic", "Bidi_Control"),
|
||||
("bidiclass", "Bidi_Class"), ("bidicontrol", "Bidi_Control"),
|
||||
("bidim", "Bidi_Mirrored"), ("bidimirrored", "Bidi_Mirrored"),
|
||||
("bidimirroringglyph", "Bidi_Mirroring_Glyph"),
|
||||
("bidipairedbracket", "Bidi_Paired_Bracket"),
|
||||
("bidipairedbrackettype", "Bidi_Paired_Bracket_Type"), ("blk", "Block"),
|
||||
("block", "Block"), ("bmg", "Bidi_Mirroring_Glyph"),
|
||||
("bpb", "Bidi_Paired_Bracket"), ("bpt", "Bidi_Paired_Bracket_Type"),
|
||||
("canonicalcombiningclass", "Canonical_Combining_Class"),
|
||||
("cased", "Cased"), ("casefolding", "Case_Folding"),
|
||||
("caseignorable", "Case_Ignorable"), ("ccc", "Canonical_Combining_Class"),
|
||||
("ce", "Composition_Exclusion"), ("cf", "Case_Folding"),
|
||||
("changeswhencasefolded", "Changes_When_Casefolded"),
|
||||
("changeswhencasemapped", "Changes_When_Casemapped"),
|
||||
("changeswhenlowercased", "Changes_When_Lowercased"),
|
||||
("changeswhennfkccasefolded", "Changes_When_NFKC_Casefolded"),
|
||||
("changeswhentitlecased", "Changes_When_Titlecased"),
|
||||
("changeswhenuppercased", "Changes_When_Uppercased"),
|
||||
("ci", "Case_Ignorable"), ("cjkaccountingnumeric", "kAccountingNumeric"),
|
||||
("cjkcompatibilityvariant", "kCompatibilityVariant"),
|
||||
("cjkiicore", "kIICore"), ("cjkirggsource", "kIRG_GSource"),
|
||||
("cjkirghsource", "kIRG_HSource"), ("cjkirgjsource", "kIRG_JSource"),
|
||||
("cjkirgkpsource", "kIRG_KPSource"), ("cjkirgksource", "kIRG_KSource"),
|
||||
("cjkirgmsource", "kIRG_MSource"), ("cjkirgtsource", "kIRG_TSource"),
|
||||
("cjkirgusource", "kIRG_USource"), ("cjkirgvsource", "kIRG_VSource"),
|
||||
("cjkothernumeric", "kOtherNumeric"),
|
||||
("cjkprimarynumeric", "kPrimaryNumeric"), ("cjkrsunicode", "kRSUnicode"),
|
||||
("compex", "Full_Composition_Exclusion"),
|
||||
("compositionexclusion", "Composition_Exclusion"),
|
||||
("cwcf", "Changes_When_Casefolded"), ("cwcm", "Changes_When_Casemapped"),
|
||||
("cwkcf", "Changes_When_NFKC_Casefolded"),
|
||||
("cwl", "Changes_When_Lowercased"), ("cwt", "Changes_When_Titlecased"),
|
||||
("cwu", "Changes_When_Uppercased"), ("dash", "Dash"),
|
||||
("decompositionmapping", "Decomposition_Mapping"),
|
||||
("decompositiontype", "Decomposition_Type"),
|
||||
("defaultignorablecodepoint", "Default_Ignorable_Code_Point"),
|
||||
("dep", "Deprecated"), ("deprecated", "Deprecated"),
|
||||
("di", "Default_Ignorable_Code_Point"), ("dia", "Diacritic"),
|
||||
("diacritic", "Diacritic"), ("dm", "Decomposition_Mapping"),
|
||||
("dt", "Decomposition_Type"), ("ea", "East_Asian_Width"),
|
||||
("eastasianwidth", "East_Asian_Width"), ("emoji", "Emoji"),
|
||||
("emojicomponent", "Emoji_Component"), ("emojimodifier", "Emoji_Modifier"),
|
||||
("emojimodifierbase", "Emoji_Modifier_Base"),
|
||||
("emojipresentation", "Emoji_Presentation"),
|
||||
("equideo", "Equivalent_Unified_Ideograph"),
|
||||
("equivalentunifiedideograph", "Equivalent_Unified_Ideograph"),
|
||||
("expandsonnfc", "Expands_On_NFC"), ("expandsonnfd", "Expands_On_NFD"),
|
||||
("expandsonnfkc", "Expands_On_NFKC"), ("expandsonnfkd", "Expands_On_NFKD"),
|
||||
("ext", "Extender"), ("extendedpictographic", "Extended_Pictographic"),
|
||||
("extender", "Extender"), ("fcnfkc", "FC_NFKC_Closure"),
|
||||
("fcnfkcclosure", "FC_NFKC_Closure"),
|
||||
("fullcompositionexclusion", "Full_Composition_Exclusion"),
|
||||
("gc", "General_Category"), ("gcb", "Grapheme_Cluster_Break"),
|
||||
("generalcategory", "General_Category"), ("graphemebase", "Grapheme_Base"),
|
||||
("graphemeclusterbreak", "Grapheme_Cluster_Break"),
|
||||
("graphemeextend", "Grapheme_Extend"), ("graphemelink", "Grapheme_Link"),
|
||||
("grbase", "Grapheme_Base"), ("grext", "Grapheme_Extend"),
|
||||
("grlink", "Grapheme_Link"), ("hangulsyllabletype", "Hangul_Syllable_Type"),
|
||||
("hex", "Hex_Digit"), ("hexdigit", "Hex_Digit"),
|
||||
("hst", "Hangul_Syllable_Type"), ("hyphen", "Hyphen"),
|
||||
("idc", "ID_Continue"), ("idcontinue", "ID_Continue"),
|
||||
("ideo", "Ideographic"), ("ideographic", "Ideographic"),
|
||||
("ids", "ID_Start"), ("idsb", "IDS_Binary_Operator"),
|
||||
("idsbinaryoperator", "IDS_Binary_Operator"),
|
||||
("idst", "IDS_Trinary_Operator"), ("idstart", "ID_Start"),
|
||||
("idstrinaryoperator", "IDS_Trinary_Operator"),
|
||||
("indicpositionalcategory", "Indic_Positional_Category"),
|
||||
("indicsyllabiccategory", "Indic_Syllabic_Category"),
|
||||
("inpc", "Indic_Positional_Category"), ("insc", "Indic_Syllabic_Category"),
|
||||
("isc", "ISO_Comment"), ("jamoshortname", "Jamo_Short_Name"),
|
||||
("jg", "Joining_Group"), ("joinc", "Join_Control"),
|
||||
("joincontrol", "Join_Control"), ("joininggroup", "Joining_Group"),
|
||||
("joiningtype", "Joining_Type"), ("jsn", "Jamo_Short_Name"),
|
||||
("jt", "Joining_Type"), ("kaccountingnumeric", "kAccountingNumeric"),
|
||||
("kcompatibilityvariant", "kCompatibilityVariant"), ("kiicore", "kIICore"),
|
||||
("kirggsource", "kIRG_GSource"), ("kirghsource", "kIRG_HSource"),
|
||||
("kirgjsource", "kIRG_JSource"), ("kirgkpsource", "kIRG_KPSource"),
|
||||
("kirgksource", "kIRG_KSource"), ("kirgmsource", "kIRG_MSource"),
|
||||
("kirgtsource", "kIRG_TSource"), ("kirgusource", "kIRG_USource"),
|
||||
("kirgvsource", "kIRG_VSource"), ("kothernumeric", "kOtherNumeric"),
|
||||
("kprimarynumeric", "kPrimaryNumeric"), ("krsunicode", "kRSUnicode"),
|
||||
("lb", "Line_Break"), ("lc", "Lowercase_Mapping"),
|
||||
("linebreak", "Line_Break"), ("loe", "Logical_Order_Exception"),
|
||||
("logicalorderexception", "Logical_Order_Exception"),
|
||||
("lower", "Lowercase"), ("lowercase", "Lowercase"),
|
||||
("lowercasemapping", "Lowercase_Mapping"), ("math", "Math"), ("na", "Name"),
|
||||
("na1", "Unicode_1_Name"), ("name", "Name"), ("namealias", "Name_Alias"),
|
||||
("nchar", "Noncharacter_Code_Point"), ("nfcqc", "NFC_Quick_Check"),
|
||||
("nfcquickcheck", "NFC_Quick_Check"), ("nfdqc", "NFD_Quick_Check"),
|
||||
("nfdquickcheck", "NFD_Quick_Check"), ("nfkccasefold", "NFKC_Casefold"),
|
||||
("nfkccf", "NFKC_Casefold"), ("nfkcqc", "NFKC_Quick_Check"),
|
||||
("nfkcquickcheck", "NFKC_Quick_Check"), ("nfkdqc", "NFKD_Quick_Check"),
|
||||
("nfkdquickcheck", "NFKD_Quick_Check"),
|
||||
("noncharactercodepoint", "Noncharacter_Code_Point"),
|
||||
("nt", "Numeric_Type"), ("numerictype", "Numeric_Type"),
|
||||
("numericvalue", "Numeric_Value"), ("nv", "Numeric_Value"),
|
||||
("oalpha", "Other_Alphabetic"), ("ocomment", "ISO_Comment"),
|
||||
("odi", "Other_Default_Ignorable_Code_Point"),
|
||||
("ogrext", "Other_Grapheme_Extend"), ("oidc", "Other_ID_Continue"),
|
||||
("oids", "Other_ID_Start"), ("olower", "Other_Lowercase"),
|
||||
("omath", "Other_Math"), ("otheralphabetic", "Other_Alphabetic"),
|
||||
("otherdefaultignorablecodepoint", "Other_Default_Ignorable_Code_Point"),
|
||||
("othergraphemeextend", "Other_Grapheme_Extend"),
|
||||
("otheridcontinue", "Other_ID_Continue"),
|
||||
("otheridstart", "Other_ID_Start"), ("otherlowercase", "Other_Lowercase"),
|
||||
("othermath", "Other_Math"), ("otheruppercase", "Other_Uppercase"),
|
||||
("oupper", "Other_Uppercase"), ("patsyn", "Pattern_Syntax"),
|
||||
("patternsyntax", "Pattern_Syntax"),
|
||||
("patternwhitespace", "Pattern_White_Space"),
|
||||
("patws", "Pattern_White_Space"), ("pcm", "Prepended_Concatenation_Mark"),
|
||||
("prependedconcatenationmark", "Prepended_Concatenation_Mark"),
|
||||
("qmark", "Quotation_Mark"), ("quotationmark", "Quotation_Mark"),
|
||||
("radical", "Radical"), ("regionalindicator", "Regional_Indicator"),
|
||||
("ri", "Regional_Indicator"), ("sb", "Sentence_Break"), ("sc", "Script"),
|
||||
("scf", "Simple_Case_Folding"), ("script", "Script"),
|
||||
("scriptextensions", "Script_Extensions"), ("scx", "Script_Extensions"),
|
||||
("sd", "Soft_Dotted"), ("sentencebreak", "Sentence_Break"),
|
||||
("sentenceterminal", "Sentence_Terminal"), ("sfc", "Simple_Case_Folding"),
|
||||
("simplecasefolding", "Simple_Case_Folding"),
|
||||
("simplelowercasemapping", "Simple_Lowercase_Mapping"),
|
||||
("simpletitlecasemapping", "Simple_Titlecase_Mapping"),
|
||||
("simpleuppercasemapping", "Simple_Uppercase_Mapping"),
|
||||
("slc", "Simple_Lowercase_Mapping"), ("softdotted", "Soft_Dotted"),
|
||||
("space", "White_Space"), ("stc", "Simple_Titlecase_Mapping"),
|
||||
("sterm", "Sentence_Terminal"), ("suc", "Simple_Uppercase_Mapping"),
|
||||
("tc", "Titlecase_Mapping"), ("term", "Terminal_Punctuation"),
|
||||
("terminalpunctuation", "Terminal_Punctuation"),
|
||||
("titlecasemapping", "Titlecase_Mapping"), ("uc", "Uppercase_Mapping"),
|
||||
("uideo", "Unified_Ideograph"), ("unicode1name", "Unicode_1_Name"),
|
||||
("unicoderadicalstroke", "kRSUnicode"),
|
||||
("unifiedideograph", "Unified_Ideograph"), ("upper", "Uppercase"),
|
||||
("uppercase", "Uppercase"), ("uppercasemapping", "Uppercase_Mapping"),
|
||||
("urs", "kRSUnicode"), ("variationselector", "Variation_Selector"),
|
||||
("verticalorientation", "Vertical_Orientation"),
|
||||
("vo", "Vertical_Orientation"), ("vs", "Variation_Selector"),
|
||||
("wb", "Word_Break"), ("whitespace", "White_Space"),
|
||||
("wordbreak", "Word_Break"), ("wspace", "White_Space"),
|
||||
("xidc", "XID_Continue"), ("xidcontinue", "XID_Continue"),
|
||||
("xids", "XID_Start"), ("xidstart", "XID_Start"),
|
||||
("xonfc", "Expands_On_NFC"), ("xonfd", "Expands_On_NFD"),
|
||||
("xonfkc", "Expands_On_NFKC"), ("xonfkd", "Expands_On_NFKD"),
|
||||
("age", "Age"),
|
||||
("ahex", "ASCII_Hex_Digit"),
|
||||
("alpha", "Alphabetic"),
|
||||
("alphabetic", "Alphabetic"),
|
||||
("asciihexdigit", "ASCII_Hex_Digit"),
|
||||
("bc", "Bidi_Class"),
|
||||
("bidic", "Bidi_Control"),
|
||||
("bidiclass", "Bidi_Class"),
|
||||
("bidicontrol", "Bidi_Control"),
|
||||
("bidim", "Bidi_Mirrored"),
|
||||
("bidimirrored", "Bidi_Mirrored"),
|
||||
("bidimirroringglyph", "Bidi_Mirroring_Glyph"),
|
||||
("bidipairedbracket", "Bidi_Paired_Bracket"),
|
||||
("bidipairedbrackettype", "Bidi_Paired_Bracket_Type"),
|
||||
("blk", "Block"),
|
||||
("block", "Block"),
|
||||
("bmg", "Bidi_Mirroring_Glyph"),
|
||||
("bpb", "Bidi_Paired_Bracket"),
|
||||
("bpt", "Bidi_Paired_Bracket_Type"),
|
||||
("canonicalcombiningclass", "Canonical_Combining_Class"),
|
||||
("cased", "Cased"),
|
||||
("casefolding", "Case_Folding"),
|
||||
("caseignorable", "Case_Ignorable"),
|
||||
("ccc", "Canonical_Combining_Class"),
|
||||
("ce", "Composition_Exclusion"),
|
||||
("cf", "Case_Folding"),
|
||||
("changeswhencasefolded", "Changes_When_Casefolded"),
|
||||
("changeswhencasemapped", "Changes_When_Casemapped"),
|
||||
("changeswhenlowercased", "Changes_When_Lowercased"),
|
||||
("changeswhennfkccasefolded", "Changes_When_NFKC_Casefolded"),
|
||||
("changeswhentitlecased", "Changes_When_Titlecased"),
|
||||
("changeswhenuppercased", "Changes_When_Uppercased"),
|
||||
("ci", "Case_Ignorable"),
|
||||
("cjkaccountingnumeric", "kAccountingNumeric"),
|
||||
("cjkcompatibilityvariant", "kCompatibilityVariant"),
|
||||
("cjkiicore", "kIICore"),
|
||||
("cjkirggsource", "kIRG_GSource"),
|
||||
("cjkirghsource", "kIRG_HSource"),
|
||||
("cjkirgjsource", "kIRG_JSource"),
|
||||
("cjkirgkpsource", "kIRG_KPSource"),
|
||||
("cjkirgksource", "kIRG_KSource"),
|
||||
("cjkirgmsource", "kIRG_MSource"),
|
||||
("cjkirgtsource", "kIRG_TSource"),
|
||||
("cjkirgusource", "kIRG_USource"),
|
||||
("cjkirgvsource", "kIRG_VSource"),
|
||||
("cjkothernumeric", "kOtherNumeric"),
|
||||
("cjkprimarynumeric", "kPrimaryNumeric"),
|
||||
("cjkrsunicode", "kRSUnicode"),
|
||||
("compex", "Full_Composition_Exclusion"),
|
||||
("compositionexclusion", "Composition_Exclusion"),
|
||||
("cwcf", "Changes_When_Casefolded"),
|
||||
("cwcm", "Changes_When_Casemapped"),
|
||||
("cwkcf", "Changes_When_NFKC_Casefolded"),
|
||||
("cwl", "Changes_When_Lowercased"),
|
||||
("cwt", "Changes_When_Titlecased"),
|
||||
("cwu", "Changes_When_Uppercased"),
|
||||
("dash", "Dash"),
|
||||
("decompositionmapping", "Decomposition_Mapping"),
|
||||
("decompositiontype", "Decomposition_Type"),
|
||||
("defaultignorablecodepoint", "Default_Ignorable_Code_Point"),
|
||||
("dep", "Deprecated"),
|
||||
("deprecated", "Deprecated"),
|
||||
("di", "Default_Ignorable_Code_Point"),
|
||||
("dia", "Diacritic"),
|
||||
("diacritic", "Diacritic"),
|
||||
("dm", "Decomposition_Mapping"),
|
||||
("dt", "Decomposition_Type"),
|
||||
("ea", "East_Asian_Width"),
|
||||
("eastasianwidth", "East_Asian_Width"),
|
||||
("emoji", "Emoji"),
|
||||
("emojicomponent", "Emoji_Component"),
|
||||
("emojimodifier", "Emoji_Modifier"),
|
||||
("emojimodifierbase", "Emoji_Modifier_Base"),
|
||||
("emojipresentation", "Emoji_Presentation"),
|
||||
("equideo", "Equivalent_Unified_Ideograph"),
|
||||
("equivalentunifiedideograph", "Equivalent_Unified_Ideograph"),
|
||||
("expandsonnfc", "Expands_On_NFC"),
|
||||
("expandsonnfd", "Expands_On_NFD"),
|
||||
("expandsonnfkc", "Expands_On_NFKC"),
|
||||
("expandsonnfkd", "Expands_On_NFKD"),
|
||||
("ext", "Extender"),
|
||||
("extendedpictographic", "Extended_Pictographic"),
|
||||
("extender", "Extender"),
|
||||
("fcnfkc", "FC_NFKC_Closure"),
|
||||
("fcnfkcclosure", "FC_NFKC_Closure"),
|
||||
("fullcompositionexclusion", "Full_Composition_Exclusion"),
|
||||
("gc", "General_Category"),
|
||||
("gcb", "Grapheme_Cluster_Break"),
|
||||
("generalcategory", "General_Category"),
|
||||
("graphemebase", "Grapheme_Base"),
|
||||
("graphemeclusterbreak", "Grapheme_Cluster_Break"),
|
||||
("graphemeextend", "Grapheme_Extend"),
|
||||
("graphemelink", "Grapheme_Link"),
|
||||
("grbase", "Grapheme_Base"),
|
||||
("grext", "Grapheme_Extend"),
|
||||
("grlink", "Grapheme_Link"),
|
||||
("hangulsyllabletype", "Hangul_Syllable_Type"),
|
||||
("hex", "Hex_Digit"),
|
||||
("hexdigit", "Hex_Digit"),
|
||||
("hst", "Hangul_Syllable_Type"),
|
||||
("hyphen", "Hyphen"),
|
||||
("idc", "ID_Continue"),
|
||||
("idcontinue", "ID_Continue"),
|
||||
("ideo", "Ideographic"),
|
||||
("ideographic", "Ideographic"),
|
||||
("ids", "ID_Start"),
|
||||
("idsb", "IDS_Binary_Operator"),
|
||||
("idsbinaryoperator", "IDS_Binary_Operator"),
|
||||
("idst", "IDS_Trinary_Operator"),
|
||||
("idstart", "ID_Start"),
|
||||
("idstrinaryoperator", "IDS_Trinary_Operator"),
|
||||
("indicpositionalcategory", "Indic_Positional_Category"),
|
||||
("indicsyllabiccategory", "Indic_Syllabic_Category"),
|
||||
("inpc", "Indic_Positional_Category"),
|
||||
("insc", "Indic_Syllabic_Category"),
|
||||
("isc", "ISO_Comment"),
|
||||
("jamoshortname", "Jamo_Short_Name"),
|
||||
("jg", "Joining_Group"),
|
||||
("joinc", "Join_Control"),
|
||||
("joincontrol", "Join_Control"),
|
||||
("joininggroup", "Joining_Group"),
|
||||
("joiningtype", "Joining_Type"),
|
||||
("jsn", "Jamo_Short_Name"),
|
||||
("jt", "Joining_Type"),
|
||||
("kaccountingnumeric", "kAccountingNumeric"),
|
||||
("kcompatibilityvariant", "kCompatibilityVariant"),
|
||||
("kiicore", "kIICore"),
|
||||
("kirggsource", "kIRG_GSource"),
|
||||
("kirghsource", "kIRG_HSource"),
|
||||
("kirgjsource", "kIRG_JSource"),
|
||||
("kirgkpsource", "kIRG_KPSource"),
|
||||
("kirgksource", "kIRG_KSource"),
|
||||
("kirgmsource", "kIRG_MSource"),
|
||||
("kirgtsource", "kIRG_TSource"),
|
||||
("kirgusource", "kIRG_USource"),
|
||||
("kirgvsource", "kIRG_VSource"),
|
||||
("kothernumeric", "kOtherNumeric"),
|
||||
("kprimarynumeric", "kPrimaryNumeric"),
|
||||
("krsunicode", "kRSUnicode"),
|
||||
("lb", "Line_Break"),
|
||||
("lc", "Lowercase_Mapping"),
|
||||
("linebreak", "Line_Break"),
|
||||
("loe", "Logical_Order_Exception"),
|
||||
("logicalorderexception", "Logical_Order_Exception"),
|
||||
("lower", "Lowercase"),
|
||||
("lowercase", "Lowercase"),
|
||||
("lowercasemapping", "Lowercase_Mapping"),
|
||||
("math", "Math"),
|
||||
("na", "Name"),
|
||||
("na1", "Unicode_1_Name"),
|
||||
("name", "Name"),
|
||||
("namealias", "Name_Alias"),
|
||||
("nchar", "Noncharacter_Code_Point"),
|
||||
("nfcqc", "NFC_Quick_Check"),
|
||||
("nfcquickcheck", "NFC_Quick_Check"),
|
||||
("nfdqc", "NFD_Quick_Check"),
|
||||
("nfdquickcheck", "NFD_Quick_Check"),
|
||||
("nfkccasefold", "NFKC_Casefold"),
|
||||
("nfkccf", "NFKC_Casefold"),
|
||||
("nfkcqc", "NFKC_Quick_Check"),
|
||||
("nfkcquickcheck", "NFKC_Quick_Check"),
|
||||
("nfkdqc", "NFKD_Quick_Check"),
|
||||
("nfkdquickcheck", "NFKD_Quick_Check"),
|
||||
("noncharactercodepoint", "Noncharacter_Code_Point"),
|
||||
("nt", "Numeric_Type"),
|
||||
("numerictype", "Numeric_Type"),
|
||||
("numericvalue", "Numeric_Value"),
|
||||
("nv", "Numeric_Value"),
|
||||
("oalpha", "Other_Alphabetic"),
|
||||
("ocomment", "ISO_Comment"),
|
||||
("odi", "Other_Default_Ignorable_Code_Point"),
|
||||
("ogrext", "Other_Grapheme_Extend"),
|
||||
("oidc", "Other_ID_Continue"),
|
||||
("oids", "Other_ID_Start"),
|
||||
("olower", "Other_Lowercase"),
|
||||
("omath", "Other_Math"),
|
||||
("otheralphabetic", "Other_Alphabetic"),
|
||||
("otherdefaultignorablecodepoint", "Other_Default_Ignorable_Code_Point"),
|
||||
("othergraphemeextend", "Other_Grapheme_Extend"),
|
||||
("otheridcontinue", "Other_ID_Continue"),
|
||||
("otheridstart", "Other_ID_Start"),
|
||||
("otherlowercase", "Other_Lowercase"),
|
||||
("othermath", "Other_Math"),
|
||||
("otheruppercase", "Other_Uppercase"),
|
||||
("oupper", "Other_Uppercase"),
|
||||
("patsyn", "Pattern_Syntax"),
|
||||
("patternsyntax", "Pattern_Syntax"),
|
||||
("patternwhitespace", "Pattern_White_Space"),
|
||||
("patws", "Pattern_White_Space"),
|
||||
("pcm", "Prepended_Concatenation_Mark"),
|
||||
("prependedconcatenationmark", "Prepended_Concatenation_Mark"),
|
||||
("qmark", "Quotation_Mark"),
|
||||
("quotationmark", "Quotation_Mark"),
|
||||
("radical", "Radical"),
|
||||
("regionalindicator", "Regional_Indicator"),
|
||||
("ri", "Regional_Indicator"),
|
||||
("sb", "Sentence_Break"),
|
||||
("sc", "Script"),
|
||||
("scf", "Simple_Case_Folding"),
|
||||
("script", "Script"),
|
||||
("scriptextensions", "Script_Extensions"),
|
||||
("scx", "Script_Extensions"),
|
||||
("sd", "Soft_Dotted"),
|
||||
("sentencebreak", "Sentence_Break"),
|
||||
("sentenceterminal", "Sentence_Terminal"),
|
||||
("sfc", "Simple_Case_Folding"),
|
||||
("simplecasefolding", "Simple_Case_Folding"),
|
||||
("simplelowercasemapping", "Simple_Lowercase_Mapping"),
|
||||
("simpletitlecasemapping", "Simple_Titlecase_Mapping"),
|
||||
("simpleuppercasemapping", "Simple_Uppercase_Mapping"),
|
||||
("slc", "Simple_Lowercase_Mapping"),
|
||||
("softdotted", "Soft_Dotted"),
|
||||
("space", "White_Space"),
|
||||
("stc", "Simple_Titlecase_Mapping"),
|
||||
("sterm", "Sentence_Terminal"),
|
||||
("suc", "Simple_Uppercase_Mapping"),
|
||||
("tc", "Titlecase_Mapping"),
|
||||
("term", "Terminal_Punctuation"),
|
||||
("terminalpunctuation", "Terminal_Punctuation"),
|
||||
("titlecasemapping", "Titlecase_Mapping"),
|
||||
("uc", "Uppercase_Mapping"),
|
||||
("uideo", "Unified_Ideograph"),
|
||||
("unicode1name", "Unicode_1_Name"),
|
||||
("unicoderadicalstroke", "kRSUnicode"),
|
||||
("unifiedideograph", "Unified_Ideograph"),
|
||||
("upper", "Uppercase"),
|
||||
("uppercase", "Uppercase"),
|
||||
("uppercasemapping", "Uppercase_Mapping"),
|
||||
("urs", "kRSUnicode"),
|
||||
("variationselector", "Variation_Selector"),
|
||||
("verticalorientation", "Vertical_Orientation"),
|
||||
("vo", "Vertical_Orientation"),
|
||||
("vs", "Variation_Selector"),
|
||||
("wb", "Word_Break"),
|
||||
("whitespace", "White_Space"),
|
||||
("wordbreak", "Word_Break"),
|
||||
("wspace", "White_Space"),
|
||||
("xidc", "XID_Continue"),
|
||||
("xidcontinue", "XID_Continue"),
|
||||
("xids", "XID_Start"),
|
||||
("xidstart", "XID_Start"),
|
||||
("xonfc", "Expands_On_NFC"),
|
||||
("xonfd", "Expands_On_NFD"),
|
||||
("xonfkc", "Expands_On_NFKC"),
|
||||
("xonfkd", "Expands_On_NFKD"),
|
||||
];
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
+2
-1
@@ -1 +1,2 @@
|
||||
disable_all_formatting = true
|
||||
max_width = 79
|
||||
use_small_heuristics = "max"
|
||||
|
||||
+3
-8
@@ -28,7 +28,7 @@
|
||||
|
||||
use exec::ProgramCache;
|
||||
use input::{Input, InputAt};
|
||||
use prog::{Program, InstPtr};
|
||||
use prog::{InstPtr, Program};
|
||||
use re_trait::Slot;
|
||||
|
||||
type Bits = u32;
|
||||
@@ -131,8 +131,7 @@ impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> {
|
||||
// inputs/regexes in the first place.)
|
||||
let visited_len =
|
||||
(self.prog.len() * (self.input.len() + 1) + BIT_SIZE - 1)
|
||||
/
|
||||
BIT_SIZE;
|
||||
/ BIT_SIZE;
|
||||
self.m.visited.truncate(visited_len);
|
||||
for v in &mut self.m.visited {
|
||||
*v = 0;
|
||||
@@ -153,11 +152,7 @@ impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> {
|
||||
// If this is an anchored regex at the beginning of the input, then
|
||||
// we're either already done or we only need to try backtracking once.
|
||||
if self.prog.is_anchored_start {
|
||||
return if !at.is_start() {
|
||||
false
|
||||
} else {
|
||||
self.backtrack(at)
|
||||
};
|
||||
return if !at.is_start() { false } else { self.backtrack(at) };
|
||||
}
|
||||
let mut matched = false;
|
||||
loop {
|
||||
|
||||
+101
-100
@@ -13,13 +13,13 @@ use std::iter;
|
||||
use std::result;
|
||||
use std::sync::Arc;
|
||||
|
||||
use syntax::is_word_byte;
|
||||
use syntax::hir::{self, Hir};
|
||||
use syntax::is_word_byte;
|
||||
use utf8_ranges::{Utf8Range, Utf8Sequence, Utf8Sequences};
|
||||
|
||||
use prog::{
|
||||
Program, Inst, InstPtr, EmptyLook,
|
||||
InstSave, InstSplit, InstEmptyLook, InstChar, InstRanges, InstBytes,
|
||||
EmptyLook, Inst, InstBytes, InstChar, InstEmptyLook, InstPtr, InstRanges,
|
||||
InstSave, InstSplit, Program,
|
||||
};
|
||||
|
||||
use Error;
|
||||
@@ -119,10 +119,7 @@ impl Compiler {
|
||||
/// The compiler is guaranteed to succeed unless the program exceeds the
|
||||
/// specified size limit. If the size limit is exceeded, then compilation
|
||||
/// stops and returns an error.
|
||||
pub fn compile(
|
||||
mut self,
|
||||
exprs: &[Hir],
|
||||
) -> result::Result<Program, Error> {
|
||||
pub fn compile(mut self, exprs: &[Hir]) -> result::Result<Program, Error> {
|
||||
debug_assert!(exprs.len() >= 1);
|
||||
self.num_exprs = exprs.len();
|
||||
if exprs.len() == 1 {
|
||||
@@ -262,16 +259,12 @@ impl Compiler {
|
||||
self.check_size()?;
|
||||
match *expr.kind() {
|
||||
Empty => Ok(Patch { hole: Hole::None, entry: self.insts.len() }),
|
||||
Literal(hir::Literal::Unicode(c)) => {
|
||||
self.c_char(c)
|
||||
}
|
||||
Literal(hir::Literal::Unicode(c)) => self.c_char(c),
|
||||
Literal(hir::Literal::Byte(b)) => {
|
||||
assert!(self.compiled.uses_bytes());
|
||||
self.c_byte(b)
|
||||
}
|
||||
Class(hir::Class::Unicode(ref cls)) => {
|
||||
self.c_class(cls.ranges())
|
||||
}
|
||||
Class(hir::Class::Unicode(ref cls)) => self.c_class(cls.ranges()),
|
||||
Class(hir::Class::Bytes(ref cls)) => {
|
||||
if self.compiled.uses_bytes() {
|
||||
self.c_class_bytes(cls.ranges())
|
||||
@@ -331,25 +324,23 @@ impl Compiler {
|
||||
self.byte_classes.set_word_boundary();
|
||||
self.c_empty_look(prog::EmptyLook::NotWordBoundaryAscii)
|
||||
}
|
||||
Group(ref g) => {
|
||||
match g.kind {
|
||||
hir::GroupKind::NonCapturing => self.c(&g.hir),
|
||||
hir::GroupKind::CaptureIndex(index) => {
|
||||
if index as usize >= self.compiled.captures.len() {
|
||||
self.compiled.captures.push(None);
|
||||
}
|
||||
self.c_capture(2 * index as usize, &g.hir)
|
||||
}
|
||||
hir::GroupKind::CaptureName { index, ref name } => {
|
||||
if index as usize >= self.compiled.captures.len() {
|
||||
let n = name.to_string();
|
||||
self.compiled.captures.push(Some(n.clone()));
|
||||
self.capture_name_idx.insert(n, index as usize);
|
||||
}
|
||||
self.c_capture(2 * index as usize, &g.hir)
|
||||
Group(ref g) => match g.kind {
|
||||
hir::GroupKind::NonCapturing => self.c(&g.hir),
|
||||
hir::GroupKind::CaptureIndex(index) => {
|
||||
if index as usize >= self.compiled.captures.len() {
|
||||
self.compiled.captures.push(None);
|
||||
}
|
||||
self.c_capture(2 * index as usize, &g.hir)
|
||||
}
|
||||
}
|
||||
hir::GroupKind::CaptureName { index, ref name } => {
|
||||
if index as usize >= self.compiled.captures.len() {
|
||||
let n = name.to_string();
|
||||
self.compiled.captures.push(Some(n.clone()));
|
||||
self.capture_name_idx.insert(n, index as usize);
|
||||
}
|
||||
self.c_capture(2 * index as usize, &g.hir)
|
||||
}
|
||||
},
|
||||
Concat(ref es) => {
|
||||
if self.compiled.is_reverse {
|
||||
self.c_concat(es.iter().rev())
|
||||
@@ -402,10 +393,7 @@ impl Compiler {
|
||||
fn c_class(&mut self, ranges: &[hir::ClassUnicodeRange]) -> Result {
|
||||
assert!(!ranges.is_empty());
|
||||
if self.compiled.uses_bytes() {
|
||||
CompileClass {
|
||||
c: self,
|
||||
ranges: ranges,
|
||||
}.compile()
|
||||
CompileClass { c: self, ranges: ranges }.compile()
|
||||
} else {
|
||||
let ranges: Vec<(char, char)> =
|
||||
ranges.iter().map(|r| (r.start(), r.end())).collect();
|
||||
@@ -434,16 +422,17 @@ impl Compiler {
|
||||
let next = self.insts.len();
|
||||
self.byte_classes.set_range(r.start(), r.end());
|
||||
holes.push(self.push_hole(InstHole::Bytes {
|
||||
start: r.start(), end: r.end(),
|
||||
start: r.start(),
|
||||
end: r.end(),
|
||||
}));
|
||||
prev_hole = self.fill_split(split, Some(next), None);
|
||||
}
|
||||
let next = self.insts.len();
|
||||
let r = &ranges[ranges.len() - 1];
|
||||
self.byte_classes.set_range(r.start(), r.end());
|
||||
holes.push(self.push_hole(InstHole::Bytes {
|
||||
start: r.start(), end: r.end(),
|
||||
}));
|
||||
holes.push(
|
||||
self.push_hole(InstHole::Bytes { start: r.start(), end: r.end() }),
|
||||
);
|
||||
self.fill(prev_hole, next);
|
||||
Ok(Patch { hole: Hole::Many(holes), entry: first_split_entry })
|
||||
}
|
||||
@@ -454,7 +443,9 @@ impl Compiler {
|
||||
}
|
||||
|
||||
fn c_concat<'a, I>(&mut self, exprs: I) -> Result
|
||||
where I: IntoIterator<Item=&'a Hir> {
|
||||
where
|
||||
I: IntoIterator<Item = &'a Hir>,
|
||||
{
|
||||
let mut exprs = exprs.into_iter();
|
||||
let first = match exprs.next() {
|
||||
Some(expr) => expr,
|
||||
@@ -473,7 +464,9 @@ impl Compiler {
|
||||
|
||||
fn c_alternate(&mut self, exprs: &[Hir]) -> Result {
|
||||
debug_assert!(
|
||||
exprs.len() >= 2, "alternates must have at least 2 exprs");
|
||||
exprs.len() >= 2,
|
||||
"alternates must have at least 2 exprs"
|
||||
);
|
||||
|
||||
// Initial entry point is always the first split.
|
||||
let first_split_entry = self.insts.len();
|
||||
@@ -496,7 +489,9 @@ impl Compiler {
|
||||
// anyway, so don't feel too bad.
|
||||
return Err(Error::Syntax(
|
||||
"alternations cannot currently contain \
|
||||
empty sub-expressions".to_string()));
|
||||
empty sub-expressions"
|
||||
.to_string(),
|
||||
));
|
||||
}
|
||||
holes.push(hole);
|
||||
prev_hole = self.fill_split(split, Some(entry), None);
|
||||
@@ -507,7 +502,9 @@ impl Compiler {
|
||||
// TODO(burntsushi): See TODO above.
|
||||
return Err(Error::Syntax(
|
||||
"alternations cannot currently contain \
|
||||
empty sub-expressions".to_string()));
|
||||
empty sub-expressions"
|
||||
.to_string(),
|
||||
));
|
||||
}
|
||||
holes.push(hole);
|
||||
self.fill(prev_hole, entry);
|
||||
@@ -662,24 +659,24 @@ impl Compiler {
|
||||
) -> Hole {
|
||||
match hole {
|
||||
Hole::None => Hole::None,
|
||||
Hole::One(pc) => {
|
||||
match (goto1, goto2) {
|
||||
(Some(goto1), Some(goto2)) => {
|
||||
self.insts[pc].fill_split(goto1, goto2);
|
||||
Hole::None
|
||||
}
|
||||
(Some(goto1), None) => {
|
||||
self.insts[pc].half_fill_split_goto1(goto1);
|
||||
Hole::One(pc)
|
||||
}
|
||||
(None, Some(goto2)) => {
|
||||
self.insts[pc].half_fill_split_goto2(goto2);
|
||||
Hole::One(pc)
|
||||
}
|
||||
(None, None) => unreachable!("at least one of the split \
|
||||
holes must be filled"),
|
||||
Hole::One(pc) => match (goto1, goto2) {
|
||||
(Some(goto1), Some(goto2)) => {
|
||||
self.insts[pc].fill_split(goto1, goto2);
|
||||
Hole::None
|
||||
}
|
||||
}
|
||||
(Some(goto1), None) => {
|
||||
self.insts[pc].half_fill_split_goto1(goto1);
|
||||
Hole::One(pc)
|
||||
}
|
||||
(None, Some(goto2)) => {
|
||||
self.insts[pc].half_fill_split_goto2(goto2);
|
||||
Hole::One(pc)
|
||||
}
|
||||
(None, None) => unreachable!(
|
||||
"at least one of the split \
|
||||
holes must be filled"
|
||||
),
|
||||
},
|
||||
Hole::Many(holes) => {
|
||||
let mut new_holes = vec![];
|
||||
for hole in holes {
|
||||
@@ -749,8 +746,11 @@ impl MaybeInst {
|
||||
MaybeInst::Split2(goto2) => {
|
||||
Inst::Split(InstSplit { goto1: goto, goto2: goto2 })
|
||||
}
|
||||
_ => unreachable!("not all instructions were compiled! \
|
||||
found uncompiled instruction: {:?}", self),
|
||||
_ => unreachable!(
|
||||
"not all instructions were compiled! \
|
||||
found uncompiled instruction: {:?}",
|
||||
self
|
||||
),
|
||||
};
|
||||
*self = MaybeInst::Compiled(filled);
|
||||
}
|
||||
@@ -760,8 +760,11 @@ impl MaybeInst {
|
||||
MaybeInst::Split => {
|
||||
Inst::Split(InstSplit { goto1: goto1, goto2: goto2 })
|
||||
}
|
||||
_ => unreachable!("must be called on Split instruction, \
|
||||
instead it was called on: {:?}", self),
|
||||
_ => unreachable!(
|
||||
"must be called on Split instruction, \
|
||||
instead it was called on: {:?}",
|
||||
self
|
||||
),
|
||||
};
|
||||
*self = MaybeInst::Compiled(filled);
|
||||
}
|
||||
@@ -769,8 +772,11 @@ impl MaybeInst {
|
||||
fn half_fill_split_goto1(&mut self, goto1: InstPtr) {
|
||||
let half_filled = match *self {
|
||||
MaybeInst::Split => goto1,
|
||||
_ => unreachable!("must be called on Split instruction, \
|
||||
instead it was called on: {:?}", self),
|
||||
_ => unreachable!(
|
||||
"must be called on Split instruction, \
|
||||
instead it was called on: {:?}",
|
||||
self
|
||||
),
|
||||
};
|
||||
*self = MaybeInst::Split1(half_filled);
|
||||
}
|
||||
@@ -778,8 +784,11 @@ impl MaybeInst {
|
||||
fn half_fill_split_goto2(&mut self, goto2: InstPtr) {
|
||||
let half_filled = match *self {
|
||||
MaybeInst::Split => goto2,
|
||||
_ => unreachable!("must be called on Split instruction, \
|
||||
instead it was called on: {:?}", self),
|
||||
_ => unreachable!(
|
||||
"must be called on Split instruction, \
|
||||
instead it was called on: {:?}",
|
||||
self
|
||||
),
|
||||
};
|
||||
*self = MaybeInst::Split2(half_filled);
|
||||
}
|
||||
@@ -787,8 +796,11 @@ impl MaybeInst {
|
||||
fn unwrap(self) -> Inst {
|
||||
match self {
|
||||
MaybeInst::Compiled(inst) => inst,
|
||||
_ => unreachable!("must be called on a compiled instruction, \
|
||||
instead it was called on: {:?}", self),
|
||||
_ => unreachable!(
|
||||
"must be called on a compiled instruction, \
|
||||
instead it was called on: {:?}",
|
||||
self
|
||||
),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -805,27 +817,19 @@ enum InstHole {
|
||||
impl InstHole {
|
||||
fn fill(&self, goto: InstPtr) -> Inst {
|
||||
match *self {
|
||||
InstHole::Save { slot } => Inst::Save(InstSave {
|
||||
goto: goto,
|
||||
slot: slot,
|
||||
}),
|
||||
InstHole::EmptyLook { look } => Inst::EmptyLook(InstEmptyLook {
|
||||
goto: goto,
|
||||
look: look,
|
||||
}),
|
||||
InstHole::Char { c } => Inst::Char(InstChar {
|
||||
goto: goto,
|
||||
c: c,
|
||||
}),
|
||||
InstHole::Ranges { ref ranges } => Inst::Ranges(InstRanges {
|
||||
goto: goto,
|
||||
ranges: ranges.clone(),
|
||||
}),
|
||||
InstHole::Bytes { start, end } => Inst::Bytes(InstBytes {
|
||||
goto: goto,
|
||||
start: start,
|
||||
end: end,
|
||||
}),
|
||||
InstHole::Save { slot } => {
|
||||
Inst::Save(InstSave { goto: goto, slot: slot })
|
||||
}
|
||||
InstHole::EmptyLook { look } => {
|
||||
Inst::EmptyLook(InstEmptyLook { goto: goto, look: look })
|
||||
}
|
||||
InstHole::Char { c } => Inst::Char(InstChar { goto: goto, c: c }),
|
||||
InstHole::Ranges { ref ranges } => {
|
||||
Inst::Ranges(InstRanges { goto: goto, ranges: ranges.clone() })
|
||||
}
|
||||
InstHole::Bytes { start, end } => {
|
||||
Inst::Bytes(InstBytes { goto: goto, start: start, end: end })
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -868,15 +872,13 @@ impl<'a, 'b> CompileClass<'a, 'b> {
|
||||
last_split = self.c.push_split_hole();
|
||||
let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?;
|
||||
holes.push(hole);
|
||||
last_split = self.c.fill_split(last_split, Some(entry), None);
|
||||
last_split =
|
||||
self.c.fill_split(last_split, Some(entry), None);
|
||||
}
|
||||
}
|
||||
}
|
||||
self.c.utf8_seqs = Some(utf8_seqs);
|
||||
Ok(Patch {
|
||||
hole: Hole::Many(holes),
|
||||
entry: initial_entry.unwrap(),
|
||||
})
|
||||
Ok(Patch { hole: Hole::Many(holes), entry: initial_entry.unwrap() })
|
||||
}
|
||||
|
||||
fn c_utf8_seq(&mut self, seq: &Utf8Sequence) -> Result {
|
||||
@@ -888,7 +890,9 @@ impl<'a, 'b> CompileClass<'a, 'b> {
|
||||
}
|
||||
|
||||
fn c_utf8_seq_<'r, I>(&mut self, seq: I) -> Result
|
||||
where I: IntoIterator<Item=&'r Utf8Range> {
|
||||
where
|
||||
I: IntoIterator<Item = &'r Utf8Range>,
|
||||
{
|
||||
// The initial instruction for each UTF-8 sequence should be the same.
|
||||
let mut from_inst = ::std::usize::MAX;
|
||||
let mut last_hole = Hole::None;
|
||||
@@ -983,10 +987,7 @@ impl SuffixCache {
|
||||
}
|
||||
}
|
||||
*pos = self.dense.len();
|
||||
self.dense.push(SuffixCacheEntry {
|
||||
key: key,
|
||||
pc: pc,
|
||||
});
|
||||
self.dense.push(SuffixCacheEntry { key: key, pc: pc });
|
||||
None
|
||||
}
|
||||
|
||||
|
||||
+105
-119
@@ -293,7 +293,7 @@ struct InstPtrs<'a> {
|
||||
data: &'a [u8],
|
||||
}
|
||||
|
||||
impl <'a>Iterator for InstPtrs<'a> {
|
||||
impl<'a> Iterator for InstPtrs<'a> {
|
||||
type Item = usize;
|
||||
|
||||
fn next(&mut self) -> Option<usize> {
|
||||
@@ -316,10 +316,7 @@ impl State {
|
||||
}
|
||||
|
||||
fn inst_ptrs(&self) -> InstPtrs {
|
||||
InstPtrs {
|
||||
base: 0,
|
||||
data: &self.data[1..],
|
||||
}
|
||||
InstPtrs { base: 0, data: &self.data[1..] }
|
||||
}
|
||||
}
|
||||
|
||||
@@ -371,7 +368,7 @@ type StatePtr = u32;
|
||||
|
||||
/// An unknown state means that the state has not been computed yet, and that
|
||||
/// the only way to progress is to compute it.
|
||||
const STATE_UNKNOWN: StatePtr = 1<<31;
|
||||
const STATE_UNKNOWN: StatePtr = 1 << 31;
|
||||
|
||||
/// A dead state means that the state has been computed and it is known that
|
||||
/// once it is entered, no future match can ever occur.
|
||||
@@ -385,12 +382,12 @@ const STATE_QUIT: StatePtr = STATE_DEAD + 1;
|
||||
/// A start state is a state that the DFA can start in.
|
||||
///
|
||||
/// Note that start states have their lower bits set to a state pointer.
|
||||
const STATE_START: StatePtr = 1<<30;
|
||||
const STATE_START: StatePtr = 1 << 30;
|
||||
|
||||
/// A match state means that the regex has successfully matched.
|
||||
///
|
||||
/// Note that match states have their lower bits set to a state pointer.
|
||||
const STATE_MATCH: StatePtr = 1<<29;
|
||||
const STATE_MATCH: StatePtr = 1 << 29;
|
||||
|
||||
/// The maximum state pointer. This is useful to mask out the "valid" state
|
||||
/// pointer from a state with the "start" or "match" bits set.
|
||||
@@ -449,8 +446,7 @@ impl CacheInner {
|
||||
/// Resets the cache size to account for fixed costs, such as the program
|
||||
/// and stack sizes.
|
||||
fn reset_size(&mut self) {
|
||||
self.size =
|
||||
(self.start_states.len() * mem::size_of::<StatePtr>())
|
||||
self.size = (self.start_states.len() * mem::size_of::<StatePtr>())
|
||||
+ (self.stack.len() * mem::size_of::<InstPtr>());
|
||||
}
|
||||
}
|
||||
@@ -476,15 +472,12 @@ impl<'a> Fsm<'a> {
|
||||
cache: &mut cache.inner,
|
||||
};
|
||||
let (empty_flags, state_flags) = dfa.start_flags(text, at);
|
||||
dfa.start = match dfa.start_state(
|
||||
&mut cache.qcur,
|
||||
empty_flags,
|
||||
state_flags,
|
||||
) {
|
||||
None => return Result::Quit,
|
||||
Some(STATE_DEAD) => return Result::NoMatch(at),
|
||||
Some(si) => si,
|
||||
};
|
||||
dfa.start =
|
||||
match dfa.start_state(&mut cache.qcur, empty_flags, state_flags) {
|
||||
None => return Result::Quit,
|
||||
Some(STATE_DEAD) => return Result::NoMatch(at),
|
||||
Some(si) => si,
|
||||
};
|
||||
debug_assert!(dfa.start != STATE_UNKNOWN);
|
||||
dfa.exec_at(&mut cache.qcur, &mut cache.qnext, text)
|
||||
}
|
||||
@@ -509,15 +502,12 @@ impl<'a> Fsm<'a> {
|
||||
cache: &mut cache.inner,
|
||||
};
|
||||
let (empty_flags, state_flags) = dfa.start_flags_reverse(text, at);
|
||||
dfa.start = match dfa.start_state(
|
||||
&mut cache.qcur,
|
||||
empty_flags,
|
||||
state_flags,
|
||||
) {
|
||||
None => return Result::Quit,
|
||||
Some(STATE_DEAD) => return Result::NoMatch(at),
|
||||
Some(si) => si,
|
||||
};
|
||||
dfa.start =
|
||||
match dfa.start_state(&mut cache.qcur, empty_flags, state_flags) {
|
||||
None => return Result::Quit,
|
||||
Some(STATE_DEAD) => return Result::NoMatch(at),
|
||||
Some(si) => si,
|
||||
};
|
||||
debug_assert!(dfa.start != STATE_UNKNOWN);
|
||||
dfa.exec_at_reverse(&mut cache.qcur, &mut cache.qnext, text)
|
||||
}
|
||||
@@ -543,15 +533,12 @@ impl<'a> Fsm<'a> {
|
||||
cache: &mut cache.inner,
|
||||
};
|
||||
let (empty_flags, state_flags) = dfa.start_flags(text, at);
|
||||
dfa.start = match dfa.start_state(
|
||||
&mut cache.qcur,
|
||||
empty_flags,
|
||||
state_flags,
|
||||
) {
|
||||
None => return Result::Quit,
|
||||
Some(STATE_DEAD) => return Result::NoMatch(at),
|
||||
Some(si) => si,
|
||||
};
|
||||
dfa.start =
|
||||
match dfa.start_state(&mut cache.qcur, empty_flags, state_flags) {
|
||||
None => return Result::Quit,
|
||||
Some(STATE_DEAD) => return Result::NoMatch(at),
|
||||
Some(si) => si,
|
||||
};
|
||||
debug_assert!(dfa.start != STATE_UNKNOWN);
|
||||
let result = dfa.exec_at(&mut cache.qcur, &mut cache.qnext, text);
|
||||
if result.is_match() {
|
||||
@@ -675,8 +662,8 @@ impl<'a> Fsm<'a> {
|
||||
// match states are final. Therefore, we can quit.
|
||||
if self.prog.matches.len() > 1 {
|
||||
let state = self.state(next_si);
|
||||
let just_matches = state.inst_ptrs()
|
||||
.all(|ip| self.prog[ip].is_match());
|
||||
let just_matches =
|
||||
state.inst_ptrs().all(|ip| self.prog[ip].is_match());
|
||||
if just_matches {
|
||||
return result;
|
||||
}
|
||||
@@ -688,7 +675,8 @@ impl<'a> Fsm<'a> {
|
||||
// we've left this particular state.
|
||||
let cur = at;
|
||||
while (next_si & !STATE_MATCH) == prev_si
|
||||
&& at + 2 < text.len() {
|
||||
&& at + 2 < text.len()
|
||||
{
|
||||
// Argument for safety is in the definition of next_si.
|
||||
next_si = unsafe {
|
||||
self.next_si(next_si & !STATE_MATCH, text, at)
|
||||
@@ -809,7 +797,7 @@ impl<'a> Fsm<'a> {
|
||||
next_si &= !STATE_MATCH;
|
||||
result = Result::Match(at + 1);
|
||||
if self.quit_after_match {
|
||||
return result
|
||||
return result;
|
||||
}
|
||||
self.last_match_si = next_si;
|
||||
prev_si = next_si;
|
||||
@@ -987,7 +975,8 @@ impl<'a> Fsm<'a> {
|
||||
if !self.continue_past_first_match() {
|
||||
break;
|
||||
} else if self.prog.matches.len() > 1
|
||||
&& !qnext.contains(ip as usize) {
|
||||
&& !qnext.contains(ip as usize)
|
||||
{
|
||||
// If we are continuing on to find other matches,
|
||||
// then keep a record of the match states we've seen.
|
||||
qnext.insert(ip);
|
||||
@@ -996,24 +985,26 @@ impl<'a> Fsm<'a> {
|
||||
Bytes(ref inst) => {
|
||||
if b.as_byte().map_or(false, |b| inst.matches(b)) {
|
||||
self.follow_epsilons(
|
||||
inst.goto as InstPtr, qnext, empty_flags);
|
||||
inst.goto as InstPtr,
|
||||
qnext,
|
||||
empty_flags,
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let cache =
|
||||
if b.is_eof() && self.prog.matches.len() > 1 {
|
||||
// If we're processing the last byte of the input and we're
|
||||
// matching a regex set, then make the next state contain the
|
||||
// previous states transitions. We do this so that the main
|
||||
// matching loop can extract all of the match instructions.
|
||||
mem::swap(qcur, qnext);
|
||||
// And don't cache this state because it's totally bunk.
|
||||
false
|
||||
} else {
|
||||
true
|
||||
};
|
||||
let cache = if b.is_eof() && self.prog.matches.len() > 1 {
|
||||
// If we're processing the last byte of the input and we're
|
||||
// matching a regex set, then make the next state contain the
|
||||
// previous states transitions. We do this so that the main
|
||||
// matching loop can extract all of the match instructions.
|
||||
mem::swap(qcur, qnext);
|
||||
// And don't cache this state because it's totally bunk.
|
||||
false
|
||||
} else {
|
||||
true
|
||||
};
|
||||
|
||||
// We've now built up the set of NFA states that ought to comprise the
|
||||
// next DFA state, so try to find it in the cache, and if it doesn't
|
||||
@@ -1022,14 +1013,11 @@ impl<'a> Fsm<'a> {
|
||||
// N.B. We pass `&mut si` here because the cache may clear itself if
|
||||
// it has gotten too full. When that happens, the location of the
|
||||
// current state may change.
|
||||
let mut next = match self.cached_state(
|
||||
qnext,
|
||||
state_flags,
|
||||
Some(&mut si),
|
||||
) {
|
||||
None => return None,
|
||||
Some(next) => next,
|
||||
};
|
||||
let mut next =
|
||||
match self.cached_state(qnext, state_flags, Some(&mut si)) {
|
||||
None => return None,
|
||||
Some(next) => next,
|
||||
};
|
||||
if (self.start & !STATE_START) == next {
|
||||
// Start states can never be match states since all matches are
|
||||
// delayed by one byte.
|
||||
@@ -1077,8 +1065,8 @@ impl<'a> Fsm<'a> {
|
||||
q: &mut SparseSet,
|
||||
flags: EmptyFlags,
|
||||
) {
|
||||
use prog::Inst::*;
|
||||
use prog::EmptyLook::*;
|
||||
use prog::Inst::*;
|
||||
|
||||
// We need to traverse the NFA to follow epsilon transitions, so avoid
|
||||
// recursion with an explicit stack.
|
||||
@@ -1116,7 +1104,9 @@ impl<'a> Fsm<'a> {
|
||||
WordBoundaryAscii if flags.word_boundary => {
|
||||
ip = inst.goto as InstPtr;
|
||||
}
|
||||
NotWordBoundaryAscii if flags.not_word_boundary => {
|
||||
NotWordBoundaryAscii
|
||||
if flags.not_word_boundary =>
|
||||
{
|
||||
ip = inst.goto as InstPtr;
|
||||
}
|
||||
WordBoundary if flags.word_boundary => {
|
||||
@@ -1180,20 +1170,16 @@ impl<'a> Fsm<'a> {
|
||||
Some(v) => v,
|
||||
};
|
||||
// In the cache? Cool. Done.
|
||||
if let Some(si) = self
|
||||
.cache
|
||||
.compiled
|
||||
.get_ptr(&key)
|
||||
{
|
||||
if let Some(si) = self.cache.compiled.get_ptr(&key) {
|
||||
return Some(si);
|
||||
}
|
||||
// If the cache has gotten too big, wipe it.
|
||||
if self.approximate_size() > self.prog.dfa_size_limit
|
||||
&& !self.clear_cache_and_save(current_state)
|
||||
{
|
||||
// Ooops. DFA is giving up.
|
||||
return None;
|
||||
}
|
||||
{
|
||||
// Ooops. DFA is giving up.
|
||||
return None;
|
||||
}
|
||||
// Allocate room for our state and add it.
|
||||
self.add_state(key)
|
||||
}
|
||||
@@ -1224,10 +1210,8 @@ impl<'a> Fsm<'a> {
|
||||
// are conditional, we need to make them part of a state's key in the
|
||||
// cache.
|
||||
|
||||
let mut insts = mem::replace(
|
||||
&mut self.cache.insts_scratch_space,
|
||||
vec![],
|
||||
);
|
||||
let mut insts =
|
||||
mem::replace(&mut self.cache.insts_scratch_space, vec![]);
|
||||
insts.clear();
|
||||
// Reserve 1 byte for flags.
|
||||
insts.push(0);
|
||||
@@ -1255,14 +1239,13 @@ impl<'a> Fsm<'a> {
|
||||
// see a match when expanding NFA states previously, then this is a
|
||||
// dead state and no amount of additional input can transition out
|
||||
// of this state.
|
||||
let opt_state =
|
||||
if insts.len() == 1 && !state_flags.is_match() {
|
||||
None
|
||||
} else {
|
||||
let StateFlags(f) = *state_flags;
|
||||
insts[0] = f;
|
||||
Some(State { data: Arc::from(&*insts) })
|
||||
};
|
||||
let opt_state = if insts.len() == 1 && !state_flags.is_match() {
|
||||
None
|
||||
} else {
|
||||
let StateFlags(f) = *state_flags;
|
||||
insts[0] = f;
|
||||
Some(State { data: Arc::from(&*insts) })
|
||||
};
|
||||
self.cache.insts_scratch_space = insts;
|
||||
opt_state
|
||||
}
|
||||
@@ -1312,7 +1295,8 @@ impl<'a> Fsm<'a> {
|
||||
let nstates = self.cache.compiled.len();
|
||||
if self.cache.flush_count >= 3
|
||||
&& self.at >= self.last_cache_flush
|
||||
&& (self.at - self.last_cache_flush) <= 10 * nstates {
|
||||
&& (self.at - self.last_cache_flush) <= 10 * nstates
|
||||
{
|
||||
return false;
|
||||
}
|
||||
// Update statistics tracking cache flushes.
|
||||
@@ -1402,14 +1386,13 @@ impl<'a> Fsm<'a> {
|
||||
// matches are delayed by one byte, start states can never be match
|
||||
// states.
|
||||
let flagi = {
|
||||
(((empty_flags.start as u8) << 0) |
|
||||
((empty_flags.end as u8) << 1) |
|
||||
((empty_flags.start_line as u8) << 2) |
|
||||
((empty_flags.end_line as u8) << 3) |
|
||||
((empty_flags.word_boundary as u8) << 4) |
|
||||
((empty_flags.not_word_boundary as u8) << 5) |
|
||||
((state_flags.is_word() as u8) << 6))
|
||||
as usize
|
||||
(((empty_flags.start as u8) << 0)
|
||||
| ((empty_flags.end as u8) << 1)
|
||||
| ((empty_flags.start_line as u8) << 2)
|
||||
| ((empty_flags.end_line as u8) << 3)
|
||||
| ((empty_flags.word_boundary as u8) << 4)
|
||||
| ((empty_flags.not_word_boundary as u8) << 5)
|
||||
| ((state_flags.is_word() as u8) << 6)) as usize
|
||||
};
|
||||
match self.cache.start_states[flagi] {
|
||||
STATE_UNKNOWN => {}
|
||||
@@ -1519,15 +1502,15 @@ impl<'a> Fsm<'a> {
|
||||
}
|
||||
// Finally, put our actual state on to our heap of states and index it
|
||||
// so we can find it later.
|
||||
self.cache.size +=
|
||||
self.cache.trans.state_heap_size()
|
||||
self.cache.size += self.cache.trans.state_heap_size()
|
||||
+ state.data.len()
|
||||
+ (2 * mem::size_of::<State>())
|
||||
+ mem::size_of::<StatePtr>();
|
||||
self.cache.compiled.insert(state, si);
|
||||
// Transition table and set of states and map should all be in sync.
|
||||
debug_assert!(self.cache.compiled.len()
|
||||
== self.cache.trans.num_states());
|
||||
debug_assert!(
|
||||
self.cache.compiled.len() == self.cache.trans.num_states()
|
||||
);
|
||||
Some(si)
|
||||
}
|
||||
|
||||
@@ -1581,8 +1564,8 @@ impl<'a> Fsm<'a> {
|
||||
/// Returns true if there is a prefix we can quickly search for.
|
||||
fn has_prefix(&self) -> bool {
|
||||
!self.prog.is_reverse
|
||||
&& !self.prog.prefixes.is_empty()
|
||||
&& !self.prog.is_anchored_start
|
||||
&& !self.prog.prefixes.is_empty()
|
||||
&& !self.prog.is_anchored_start
|
||||
}
|
||||
|
||||
/// Sets the STATE_START bit in the given state pointer if and only if
|
||||
@@ -1674,10 +1657,7 @@ impl Transitions {
|
||||
/// The number of byte classes corresponds to the stride. Every state will
|
||||
/// have `num_byte_classes` slots for transitions.
|
||||
fn new(num_byte_classes: usize) -> Transitions {
|
||||
Transitions {
|
||||
table: vec![],
|
||||
num_byte_classes: num_byte_classes,
|
||||
}
|
||||
Transitions { table: vec![], num_byte_classes: num_byte_classes }
|
||||
}
|
||||
|
||||
/// Returns the total number of states currently in this table.
|
||||
@@ -1752,9 +1732,15 @@ impl StateFlags {
|
||||
}
|
||||
|
||||
impl Byte {
|
||||
fn byte(b: u8) -> Self { Byte(b as u16) }
|
||||
fn eof() -> Self { Byte(256) }
|
||||
fn is_eof(&self) -> bool { self.0 == 256 }
|
||||
fn byte(b: u8) -> Self {
|
||||
Byte(b as u16)
|
||||
}
|
||||
fn eof() -> Self {
|
||||
Byte(256)
|
||||
}
|
||||
fn is_eof(&self) -> bool {
|
||||
self.0 == 256
|
||||
}
|
||||
|
||||
fn is_ascii_word(&self) -> bool {
|
||||
let b = match self.as_byte() {
|
||||
@@ -1780,9 +1766,9 @@ impl fmt::Debug for State {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
let ips: Vec<usize> = self.inst_ptrs().collect();
|
||||
f.debug_struct("State")
|
||||
.field("flags", &self.flags())
|
||||
.field("insts", &ips)
|
||||
.finish()
|
||||
.field("flags", &self.flags())
|
||||
.field("insts", &ips)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1821,10 +1807,10 @@ impl<'a> fmt::Debug for TransitionsRow<'a> {
|
||||
impl fmt::Debug for StateFlags {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
f.debug_struct("StateFlags")
|
||||
.field("is_match", &self.is_match())
|
||||
.field("is_word", &self.is_word())
|
||||
.field("has_empty", &self.has_empty())
|
||||
.finish()
|
||||
.field("is_match", &self.is_match())
|
||||
.field("is_word", &self.is_word())
|
||||
.field("has_empty", &self.has_empty())
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1914,12 +1900,12 @@ fn read_varu32(data: &[u8]) -> (u32, usize) {
|
||||
mod tests {
|
||||
extern crate rand;
|
||||
|
||||
use std::sync::Arc;
|
||||
use quickcheck::{QuickCheck, StdGen, quickcheck};
|
||||
use super::{
|
||||
StateFlags, State, push_inst_ptr,
|
||||
write_varu32, read_varu32, write_vari32, read_vari32,
|
||||
push_inst_ptr, read_vari32, read_varu32, write_vari32, write_varu32,
|
||||
State, StateFlags,
|
||||
};
|
||||
use quickcheck::{quickcheck, QuickCheck, StdGen};
|
||||
use std::sync::Arc;
|
||||
|
||||
#[test]
|
||||
fn prop_state_encode_decode() {
|
||||
|
||||
+6
-7
@@ -42,10 +42,11 @@ impl fmt::Display for Error {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
match *self {
|
||||
Error::Syntax(ref err) => err.fmt(f),
|
||||
Error::CompiledTooBig(limit) => {
|
||||
write!(f, "Compiled regex exceeds size limit of {} bytes.",
|
||||
limit)
|
||||
}
|
||||
Error::CompiledTooBig(limit) => write!(
|
||||
f,
|
||||
"Compiled regex exceeds size limit of {} bytes.",
|
||||
limit
|
||||
),
|
||||
Error::__Nonexhaustive => unreachable!(),
|
||||
}
|
||||
}
|
||||
@@ -68,9 +69,7 @@ impl fmt::Debug for Error {
|
||||
Ok(())
|
||||
}
|
||||
Error::CompiledTooBig(limit) => {
|
||||
f.debug_tuple("CompiledTooBig")
|
||||
.field(&limit)
|
||||
.finish()
|
||||
f.debug_tuple("CompiledTooBig").field(&limit).finish()
|
||||
}
|
||||
Error::__Nonexhaustive => {
|
||||
f.debug_tuple("__Nonexhaustive").finish()
|
||||
|
||||
+138
-119
@@ -13,10 +13,10 @@ use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
|
||||
use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};
|
||||
use thread_local::CachedThreadLocal;
|
||||
use syntax::ParserBuilder;
|
||||
use syntax::hir::Hir;
|
||||
use syntax::hir::literal::Literals;
|
||||
use syntax::hir::Hir;
|
||||
use syntax::ParserBuilder;
|
||||
use thread_local::CachedThreadLocal;
|
||||
|
||||
use backtrack;
|
||||
use compile::Compiler;
|
||||
@@ -29,7 +29,7 @@ use prog::Program;
|
||||
use re_builder::RegexOptions;
|
||||
use re_bytes;
|
||||
use re_set;
|
||||
use re_trait::{RegularExpression, Slot, Locations};
|
||||
use re_trait::{Locations, RegularExpression, Slot};
|
||||
use re_unicode;
|
||||
use utf8::next_utf8;
|
||||
|
||||
@@ -136,7 +136,10 @@ impl ExecBuilder {
|
||||
/// are completely unsupported. (This means both `find` and `captures`
|
||||
/// wont work.)
|
||||
pub fn new_many<I, S>(res: I) -> Self
|
||||
where S: AsRef<str>, I: IntoIterator<Item=S> {
|
||||
where
|
||||
S: AsRef<str>,
|
||||
I: IntoIterator<Item = S>,
|
||||
{
|
||||
let mut opts = RegexOptions::default();
|
||||
opts.pats = res.into_iter().map(|s| s.as_ref().to_owned()).collect();
|
||||
Self::new_options(opts)
|
||||
@@ -226,21 +229,19 @@ impl ExecBuilder {
|
||||
// If we're compiling a regex set and that set has any anchored
|
||||
// expressions, then disable all literal optimizations.
|
||||
for pat in &self.options.pats {
|
||||
let mut parser =
|
||||
ParserBuilder::new()
|
||||
.octal(self.options.octal)
|
||||
.case_insensitive(self.options.case_insensitive)
|
||||
.multi_line(self.options.multi_line)
|
||||
.dot_matches_new_line(self.options.dot_matches_new_line)
|
||||
.swap_greed(self.options.swap_greed)
|
||||
.ignore_whitespace(self.options.ignore_whitespace)
|
||||
.unicode(self.options.unicode)
|
||||
.allow_invalid_utf8(!self.only_utf8)
|
||||
.nest_limit(self.options.nest_limit)
|
||||
.build();
|
||||
let expr = parser
|
||||
.parse(pat)
|
||||
.map_err(|e| Error::Syntax(e.to_string()))?;
|
||||
let mut parser = ParserBuilder::new()
|
||||
.octal(self.options.octal)
|
||||
.case_insensitive(self.options.case_insensitive)
|
||||
.multi_line(self.options.multi_line)
|
||||
.dot_matches_new_line(self.options.dot_matches_new_line)
|
||||
.swap_greed(self.options.swap_greed)
|
||||
.ignore_whitespace(self.options.ignore_whitespace)
|
||||
.unicode(self.options.unicode)
|
||||
.allow_invalid_utf8(!self.only_utf8)
|
||||
.nest_limit(self.options.nest_limit)
|
||||
.build();
|
||||
let expr =
|
||||
parser.parse(pat).map_err(|e| Error::Syntax(e.to_string()))?;
|
||||
bytes = bytes || !expr.is_always_utf8();
|
||||
|
||||
if !expr.is_anchored_start() && expr.is_any_anchored_start() {
|
||||
@@ -303,25 +304,22 @@ impl ExecBuilder {
|
||||
return Ok(Exec { ro: ro, cache: CachedThreadLocal::new() });
|
||||
}
|
||||
let parsed = self.parse()?;
|
||||
let mut nfa =
|
||||
Compiler::new()
|
||||
.size_limit(self.options.size_limit)
|
||||
.bytes(self.bytes || parsed.bytes)
|
||||
.only_utf8(self.only_utf8)
|
||||
.compile(&parsed.exprs)?;
|
||||
let mut dfa =
|
||||
Compiler::new()
|
||||
.size_limit(self.options.size_limit)
|
||||
.dfa(true)
|
||||
.only_utf8(self.only_utf8)
|
||||
.compile(&parsed.exprs)?;
|
||||
let mut dfa_reverse =
|
||||
Compiler::new()
|
||||
.size_limit(self.options.size_limit)
|
||||
.dfa(true)
|
||||
.only_utf8(self.only_utf8)
|
||||
.reverse(true)
|
||||
.compile(&parsed.exprs)?;
|
||||
let mut nfa = Compiler::new()
|
||||
.size_limit(self.options.size_limit)
|
||||
.bytes(self.bytes || parsed.bytes)
|
||||
.only_utf8(self.only_utf8)
|
||||
.compile(&parsed.exprs)?;
|
||||
let mut dfa = Compiler::new()
|
||||
.size_limit(self.options.size_limit)
|
||||
.dfa(true)
|
||||
.only_utf8(self.only_utf8)
|
||||
.compile(&parsed.exprs)?;
|
||||
let mut dfa_reverse = Compiler::new()
|
||||
.size_limit(self.options.size_limit)
|
||||
.dfa(true)
|
||||
.only_utf8(self.only_utf8)
|
||||
.reverse(true)
|
||||
.compile(&parsed.exprs)?;
|
||||
|
||||
nfa.prefixes = LiteralSearcher::prefixes(parsed.prefixes);
|
||||
dfa.prefixes = nfa.prefixes.clone();
|
||||
@@ -366,7 +364,9 @@ impl ExecBuilder {
|
||||
impl<'c> RegularExpression for ExecNoSyncStr<'c> {
|
||||
type Text = str;
|
||||
|
||||
fn slots_len(&self) -> usize { self.0.slots_len() }
|
||||
fn slots_len(&self) -> usize {
|
||||
self.0.slots_len()
|
||||
}
|
||||
|
||||
fn next_after_empty(&self, text: &str, i: usize) -> usize {
|
||||
next_utf8(text.as_bytes(), i)
|
||||
@@ -511,18 +511,14 @@ impl<'c> RegularExpression for ExecNoSync<'c> {
|
||||
return None;
|
||||
}
|
||||
match self.ro.match_type {
|
||||
MatchType::Literal(ty) => {
|
||||
self.find_literals(ty, text, start)
|
||||
}
|
||||
MatchType::Dfa => {
|
||||
match self.find_dfa_forward(text, start) {
|
||||
dfa::Result::Match((s, e)) => Some((s, e)),
|
||||
dfa::Result::NoMatch(_) => None,
|
||||
dfa::Result::Quit => {
|
||||
self.find_nfa(MatchNfaType::Auto, text, start)
|
||||
}
|
||||
MatchType::Literal(ty) => self.find_literals(ty, text, start),
|
||||
MatchType::Dfa => match self.find_dfa_forward(text, start) {
|
||||
dfa::Result::Match((s, e)) => Some((s, e)),
|
||||
dfa::Result::NoMatch(_) => None,
|
||||
dfa::Result::Quit => {
|
||||
self.find_nfa(MatchNfaType::Auto, text, start)
|
||||
}
|
||||
}
|
||||
},
|
||||
MatchType::DfaAnchoredReverse => {
|
||||
match self.find_dfa_anchored_reverse(text, start) {
|
||||
dfa::Result::Match((s, e)) => Some((s, e)),
|
||||
@@ -587,7 +583,12 @@ impl<'c> RegularExpression for ExecNoSync<'c> {
|
||||
MatchType::Literal(ty) => {
|
||||
self.find_literals(ty, text, start).and_then(|(s, e)| {
|
||||
self.captures_nfa_type(
|
||||
MatchNfaType::Auto, slots, text, s, e)
|
||||
MatchNfaType::Auto,
|
||||
slots,
|
||||
text,
|
||||
s,
|
||||
e,
|
||||
)
|
||||
})
|
||||
}
|
||||
MatchType::Dfa => {
|
||||
@@ -595,10 +596,13 @@ impl<'c> RegularExpression for ExecNoSync<'c> {
|
||||
self.captures_nfa(slots, text, start)
|
||||
} else {
|
||||
match self.find_dfa_forward(text, start) {
|
||||
dfa::Result::Match((s, e)) => {
|
||||
self.captures_nfa_type(
|
||||
MatchNfaType::Auto, slots, text, s, e)
|
||||
}
|
||||
dfa::Result::Match((s, e)) => self.captures_nfa_type(
|
||||
MatchNfaType::Auto,
|
||||
slots,
|
||||
text,
|
||||
s,
|
||||
e,
|
||||
),
|
||||
dfa::Result::NoMatch(_) => None,
|
||||
dfa::Result::Quit => {
|
||||
self.captures_nfa(slots, text, start)
|
||||
@@ -608,20 +612,26 @@ impl<'c> RegularExpression for ExecNoSync<'c> {
|
||||
}
|
||||
MatchType::DfaAnchoredReverse => {
|
||||
match self.find_dfa_anchored_reverse(text, start) {
|
||||
dfa::Result::Match((s, e)) => {
|
||||
self.captures_nfa_type(
|
||||
MatchNfaType::Auto, slots, text, s, e)
|
||||
}
|
||||
dfa::Result::Match((s, e)) => self.captures_nfa_type(
|
||||
MatchNfaType::Auto,
|
||||
slots,
|
||||
text,
|
||||
s,
|
||||
e,
|
||||
),
|
||||
dfa::Result::NoMatch(_) => None,
|
||||
dfa::Result::Quit => self.captures_nfa(slots, text, start),
|
||||
}
|
||||
}
|
||||
MatchType::DfaSuffix => {
|
||||
match self.find_dfa_reverse_suffix(text, start) {
|
||||
dfa::Result::Match((s, e)) => {
|
||||
self.captures_nfa_type(
|
||||
MatchNfaType::Auto, slots, text, s, e)
|
||||
}
|
||||
dfa::Result::Match((s, e)) => self.captures_nfa_type(
|
||||
MatchNfaType::Auto,
|
||||
slots,
|
||||
text,
|
||||
s,
|
||||
e,
|
||||
),
|
||||
dfa::Result::NoMatch(_) => None,
|
||||
dfa::Result::Quit => self.captures_nfa(slots, text, start),
|
||||
}
|
||||
@@ -650,13 +660,13 @@ impl<'c> ExecNoSync<'c> {
|
||||
match ty {
|
||||
Unanchored => {
|
||||
let lits = &self.ro.nfa.prefixes;
|
||||
lits.find(&text[start..])
|
||||
.map(|(s, e)| (start + s, start + e))
|
||||
lits.find(&text[start..]).map(|(s, e)| (start + s, start + e))
|
||||
}
|
||||
AnchoredStart => {
|
||||
let lits = &self.ro.nfa.prefixes;
|
||||
if !self.ro.nfa.is_anchored_start
|
||||
|| (self.ro.nfa.is_anchored_start && start == 0) {
|
||||
|| (self.ro.nfa.is_anchored_start && start == 0)
|
||||
{
|
||||
lits.find_start(&text[start..])
|
||||
.map(|(s, e)| (start + s, start + e))
|
||||
} else {
|
||||
@@ -668,11 +678,13 @@ impl<'c> ExecNoSync<'c> {
|
||||
lits.find_end(&text[start..])
|
||||
.map(|(s, e)| (start + s, start + e))
|
||||
}
|
||||
AhoCorasick => {
|
||||
self.ro.ac.as_ref().unwrap()
|
||||
.find(&text[start..])
|
||||
.map(|m| (start + m.start(), start + m.end()))
|
||||
}
|
||||
AhoCorasick => self
|
||||
.ro
|
||||
.ac
|
||||
.as_ref()
|
||||
.unwrap()
|
||||
.find(&text[start..])
|
||||
.map(|m| (start + m.start(), start + m.end())),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -855,11 +867,7 @@ impl<'c> ExecNoSync<'c> {
|
||||
/// Ideally, we could use shortest_nfa(...).is_some() and get the same
|
||||
/// performance characteristics, but regex sets don't have captures, which
|
||||
/// shortest_nfa depends on.
|
||||
fn match_nfa(
|
||||
&self,
|
||||
text: &[u8],
|
||||
start: usize,
|
||||
) -> bool {
|
||||
fn match_nfa(&self, text: &[u8], start: usize) -> bool {
|
||||
self.match_nfa_type(MatchNfaType::Auto, text, start)
|
||||
}
|
||||
|
||||
@@ -893,7 +901,7 @@ impl<'c> ExecNoSync<'c> {
|
||||
true,
|
||||
text,
|
||||
start,
|
||||
text.len()
|
||||
text.len(),
|
||||
) {
|
||||
slots[1]
|
||||
} else {
|
||||
@@ -916,7 +924,7 @@ impl<'c> ExecNoSync<'c> {
|
||||
false,
|
||||
text,
|
||||
start,
|
||||
text.len()
|
||||
text.len(),
|
||||
) {
|
||||
match (slots[0], slots[1]) {
|
||||
(Some(s), Some(e)) => Some((s, e)),
|
||||
@@ -937,7 +945,12 @@ impl<'c> ExecNoSync<'c> {
|
||||
start: usize,
|
||||
) -> Option<(usize, usize)> {
|
||||
self.captures_nfa_type(
|
||||
MatchNfaType::Auto, slots, text, start, text.len())
|
||||
MatchNfaType::Auto,
|
||||
slots,
|
||||
text,
|
||||
start,
|
||||
text.len(),
|
||||
)
|
||||
}
|
||||
|
||||
/// Like captures_nfa, but allows specification of type of NFA engine.
|
||||
@@ -980,10 +993,14 @@ impl<'c> ExecNoSync<'c> {
|
||||
match ty {
|
||||
Auto => unreachable!(),
|
||||
Backtrack => self.exec_backtrack(matches, slots, text, start, end),
|
||||
PikeVM => {
|
||||
self.exec_pikevm(
|
||||
matches, slots, quit_after_match, text, start, end)
|
||||
}
|
||||
PikeVM => self.exec_pikevm(
|
||||
matches,
|
||||
slots,
|
||||
quit_after_match,
|
||||
text,
|
||||
start,
|
||||
end,
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1006,7 +1023,8 @@ impl<'c> ExecNoSync<'c> {
|
||||
quit_after_match,
|
||||
ByteInput::new(text, self.ro.nfa.only_utf8),
|
||||
start,
|
||||
end)
|
||||
end,
|
||||
)
|
||||
} else {
|
||||
pikevm::Fsm::exec(
|
||||
&self.ro.nfa,
|
||||
@@ -1016,7 +1034,8 @@ impl<'c> ExecNoSync<'c> {
|
||||
quit_after_match,
|
||||
CharInput::new(text),
|
||||
start,
|
||||
end)
|
||||
end,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1037,7 +1056,8 @@ impl<'c> ExecNoSync<'c> {
|
||||
slots,
|
||||
ByteInput::new(text, self.ro.nfa.only_utf8),
|
||||
start,
|
||||
end)
|
||||
end,
|
||||
)
|
||||
} else {
|
||||
backtrack::Bounded::exec(
|
||||
&self.ro.nfa,
|
||||
@@ -1046,7 +1066,8 @@ impl<'c> ExecNoSync<'c> {
|
||||
slots,
|
||||
CharInput::new(text),
|
||||
start,
|
||||
end)
|
||||
end,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1083,22 +1104,26 @@ impl<'c> ExecNoSync<'c> {
|
||||
) {
|
||||
dfa::Result::Match(_) => true,
|
||||
dfa::Result::NoMatch(_) => false,
|
||||
dfa::Result::Quit => {
|
||||
self.exec_nfa(
|
||||
MatchNfaType::Auto,
|
||||
matches,
|
||||
&mut [],
|
||||
false,
|
||||
text,
|
||||
start,
|
||||
text.len())
|
||||
}
|
||||
dfa::Result::Quit => self.exec_nfa(
|
||||
MatchNfaType::Auto,
|
||||
matches,
|
||||
&mut [],
|
||||
false,
|
||||
text,
|
||||
start,
|
||||
text.len(),
|
||||
),
|
||||
}
|
||||
}
|
||||
Nfa(ty) => {
|
||||
self.exec_nfa(
|
||||
ty, matches, &mut [], false, text, start, text.len())
|
||||
}
|
||||
Nfa(ty) => self.exec_nfa(
|
||||
ty,
|
||||
matches,
|
||||
&mut [],
|
||||
false,
|
||||
text,
|
||||
start,
|
||||
text.len(),
|
||||
),
|
||||
Nothing => false,
|
||||
}
|
||||
}
|
||||
@@ -1106,7 +1131,7 @@ impl<'c> ExecNoSync<'c> {
|
||||
#[inline(always)] // reduces constant overhead
|
||||
fn is_anchor_end_match(&self, text: &[u8]) -> bool {
|
||||
// Only do this check if the haystack is big (>1MB).
|
||||
if text.len() > (1<<20) && self.ro.nfa.is_anchored_end {
|
||||
if text.len() > (1 << 20) && self.ro.nfa.is_anchored_end {
|
||||
let lcs = self.ro.suffixes.lcs();
|
||||
if lcs.len() >= 1 && !lcs.is_suffix(text) {
|
||||
return false;
|
||||
@@ -1130,9 +1155,8 @@ impl Exec {
|
||||
/// Get a searcher that isn't Sync.
|
||||
#[inline(always)] // reduces constant overhead
|
||||
pub fn searcher(&self) -> ExecNoSync {
|
||||
let create = || {
|
||||
Box::new(RefCell::new(ProgramCacheInner::new(&self.ro)))
|
||||
};
|
||||
let create =
|
||||
|| Box::new(RefCell::new(ProgramCacheInner::new(&self.ro)));
|
||||
ExecNoSync {
|
||||
ro: &self.ro, // a clone is too expensive here! (and not needed)
|
||||
cache: self.cache.get_or(create),
|
||||
@@ -1187,10 +1211,7 @@ impl Exec {
|
||||
|
||||
impl Clone for Exec {
|
||||
fn clone(&self) -> Exec {
|
||||
Exec {
|
||||
ro: self.ro.clone(),
|
||||
cache: CachedThreadLocal::new(),
|
||||
}
|
||||
Exec { ro: self.ro.clone(), cache: CachedThreadLocal::new() }
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1373,15 +1394,13 @@ fn alternation_literals(expr: &Hir) -> Option<Vec<Vec<u8>>> {
|
||||
_ => return None, // one literal isn't worth it
|
||||
};
|
||||
|
||||
let extendlit = |lit: &Literal, dst: &mut Vec<u8>| {
|
||||
match *lit {
|
||||
Literal::Unicode(c) => {
|
||||
let mut buf = [0; 4];
|
||||
dst.extend_from_slice(c.encode_utf8(&mut buf).as_bytes());
|
||||
}
|
||||
Literal::Byte(b) => {
|
||||
dst.push(b);
|
||||
}
|
||||
let extendlit = |lit: &Literal, dst: &mut Vec<u8>| match *lit {
|
||||
Literal::Unicode(c) => {
|
||||
let mut buf = [0; 4];
|
||||
dst.extend_from_slice(c.encode_utf8(&mut buf).as_bytes());
|
||||
}
|
||||
Literal::Byte(b) => {
|
||||
dst.push(b);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
+14
-14
@@ -35,12 +35,12 @@ pub fn expand_str(
|
||||
replacement = &replacement[cap_ref.end..];
|
||||
match cap_ref.cap {
|
||||
Ref::Number(i) => {
|
||||
dst.push_str(
|
||||
caps.get(i).map(|m| m.as_str()).unwrap_or(""));
|
||||
dst.push_str(caps.get(i).map(|m| m.as_str()).unwrap_or(""));
|
||||
}
|
||||
Ref::Named(name) => {
|
||||
dst.push_str(
|
||||
caps.name(name).map(|m| m.as_str()).unwrap_or(""));
|
||||
caps.name(name).map(|m| m.as_str()).unwrap_or(""),
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -77,12 +77,12 @@ pub fn expand_bytes(
|
||||
replacement = &replacement[cap_ref.end..];
|
||||
match cap_ref.cap {
|
||||
Ref::Number(i) => {
|
||||
dst.extend(
|
||||
caps.get(i).map(|m| m.as_bytes()).unwrap_or(b""));
|
||||
dst.extend(caps.get(i).map(|m| m.as_bytes()).unwrap_or(b""));
|
||||
}
|
||||
Ref::Named(name) => {
|
||||
dst.extend(
|
||||
caps.name(name).map(|m| m.as_bytes()).unwrap_or(b""));
|
||||
caps.name(name).map(|m| m.as_bytes()).unwrap_or(b""),
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -149,8 +149,8 @@ fn find_cap_ref<T: ?Sized + AsRef<[u8]>>(
|
||||
// We just verified that the range 0..cap_end is valid ASCII, so it must
|
||||
// therefore be valid UTF-8. If we really cared, we could avoid this UTF-8
|
||||
// check with either unsafe or by parsing the number straight from &[u8].
|
||||
let cap = str::from_utf8(&rep[i..cap_end])
|
||||
.expect("valid UTF-8 capture name");
|
||||
let cap =
|
||||
str::from_utf8(&rep[i..cap_end]).expect("valid UTF-8 capture name");
|
||||
if brace {
|
||||
if !rep.get(cap_end).map_or(false, |&b| b == b'}') {
|
||||
return None;
|
||||
@@ -169,14 +169,14 @@ fn find_cap_ref<T: ?Sized + AsRef<[u8]>>(
|
||||
/// Returns true if and only if the given byte is allowed in a capture name.
|
||||
fn is_valid_cap_letter(b: &u8) -> bool {
|
||||
match *b {
|
||||
b'0' ..= b'9' | b'a' ..= b'z' | b'A' ..= b'Z' | b'_' => true,
|
||||
b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' | b'_' => true,
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::{CaptureRef, find_cap_ref};
|
||||
use super::{find_cap_ref, CaptureRef};
|
||||
|
||||
macro_rules! find {
|
||||
($name:ident, $text:expr) => {
|
||||
@@ -213,8 +213,8 @@ mod tests {
|
||||
find!(find_cap_ref11, "$");
|
||||
find!(find_cap_ref12, " ");
|
||||
find!(find_cap_ref13, "");
|
||||
find!(find_cap_ref14, "$1-$2", c!(1,2));
|
||||
find!(find_cap_ref15, "$1_$2", c!("1_",3));
|
||||
find!(find_cap_ref16, "$x-$y", c!("x",2));
|
||||
find!(find_cap_ref17, "$x_$y", c!("x_",3));
|
||||
find!(find_cap_ref14, "$1-$2", c!(1, 2));
|
||||
find!(find_cap_ref15, "$1_$2", c!("1_", 3));
|
||||
find!(find_cap_ref16, "$x-$y", c!("x", 2));
|
||||
find!(find_cap_ref17, "$x_$y", c!("x_", 3));
|
||||
}
|
||||
|
||||
+43
-43
@@ -12,38 +12,38 @@
|
||||
// edit directly
|
||||
|
||||
pub const BYTE_FREQUENCIES: [u8; 256] = [
|
||||
55, // '\x00'
|
||||
52, // '\x01'
|
||||
51, // '\x02'
|
||||
50, // '\x03'
|
||||
49, // '\x04'
|
||||
48, // '\x05'
|
||||
47, // '\x06'
|
||||
46, // '\x07'
|
||||
45, // '\x08'
|
||||
55, // '\x00'
|
||||
52, // '\x01'
|
||||
51, // '\x02'
|
||||
50, // '\x03'
|
||||
49, // '\x04'
|
||||
48, // '\x05'
|
||||
47, // '\x06'
|
||||
46, // '\x07'
|
||||
45, // '\x08'
|
||||
103, // '\t'
|
||||
242, // '\n'
|
||||
66, // '\x0b'
|
||||
67, // '\x0c'
|
||||
66, // '\x0b'
|
||||
67, // '\x0c'
|
||||
229, // '\r'
|
||||
44, // '\x0e'
|
||||
43, // '\x0f'
|
||||
42, // '\x10'
|
||||
41, // '\x11'
|
||||
40, // '\x12'
|
||||
39, // '\x13'
|
||||
38, // '\x14'
|
||||
37, // '\x15'
|
||||
36, // '\x16'
|
||||
35, // '\x17'
|
||||
34, // '\x18'
|
||||
33, // '\x19'
|
||||
56, // '\x1a'
|
||||
32, // '\x1b'
|
||||
31, // '\x1c'
|
||||
30, // '\x1d'
|
||||
29, // '\x1e'
|
||||
28, // '\x1f'
|
||||
44, // '\x0e'
|
||||
43, // '\x0f'
|
||||
42, // '\x10'
|
||||
41, // '\x11'
|
||||
40, // '\x12'
|
||||
39, // '\x13'
|
||||
38, // '\x14'
|
||||
37, // '\x15'
|
||||
36, // '\x16'
|
||||
35, // '\x17'
|
||||
34, // '\x18'
|
||||
33, // '\x19'
|
||||
56, // '\x1a'
|
||||
32, // '\x1b'
|
||||
31, // '\x1c'
|
||||
30, // '\x1d'
|
||||
29, // '\x1e'
|
||||
28, // '\x1f'
|
||||
255, // ' '
|
||||
148, // '!'
|
||||
164, // '"'
|
||||
@@ -139,7 +139,7 @@ pub const BYTE_FREQUENCIES: [u8; 256] = [
|
||||
205, // '|'
|
||||
181, // '}'
|
||||
127, // '~'
|
||||
27, // '\x7f'
|
||||
27, // '\x7f'
|
||||
212, // '\x80'
|
||||
211, // '\x81'
|
||||
210, // '\x82'
|
||||
@@ -151,11 +151,11 @@ pub const BYTE_FREQUENCIES: [u8; 256] = [
|
||||
131, // '\x88'
|
||||
172, // '\x89'
|
||||
105, // '\x8a'
|
||||
80, // '\x8b'
|
||||
98, // '\x8c'
|
||||
96, // '\x8d'
|
||||
97, // '\x8e'
|
||||
81, // '\x8f'
|
||||
80, // '\x8b'
|
||||
98, // '\x8c'
|
||||
96, // '\x8d'
|
||||
97, // '\x8e'
|
||||
81, // '\x8f'
|
||||
207, // '\x90'
|
||||
145, // '\x91'
|
||||
116, // '\x92'
|
||||
@@ -170,7 +170,7 @@ pub const BYTE_FREQUENCIES: [u8; 256] = [
|
||||
110, // '\x9b'
|
||||
124, // '\x9c'
|
||||
111, // '\x9d'
|
||||
82, // '\x9e'
|
||||
82, // '\x9e'
|
||||
108, // '\x9f'
|
||||
118, // '\xa0'
|
||||
141, // '¡'
|
||||
@@ -180,14 +180,14 @@ pub const BYTE_FREQUENCIES: [u8; 256] = [
|
||||
125, // '¥'
|
||||
165, // '¦'
|
||||
117, // '§'
|
||||
92, // '¨'
|
||||
92, // '¨'
|
||||
106, // '©'
|
||||
83, // 'ª'
|
||||
72, // '«'
|
||||
99, // '¬'
|
||||
93, // '\xad'
|
||||
65, // '®'
|
||||
79, // '¯'
|
||||
83, // 'ª'
|
||||
72, // '«'
|
||||
99, // '¬'
|
||||
93, // '\xad'
|
||||
65, // '®'
|
||||
79, // '¯'
|
||||
166, // '°'
|
||||
237, // '±'
|
||||
163, // '²'
|
||||
|
||||
+34
-27
@@ -18,7 +18,7 @@ use syntax;
|
||||
|
||||
use literal::LiteralSearcher;
|
||||
use prog::InstEmptyLook;
|
||||
use utf8::{decode_utf8, decode_last_utf8};
|
||||
use utf8::{decode_last_utf8, decode_utf8};
|
||||
|
||||
/// Represents a location in the input.
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
@@ -105,18 +105,26 @@ pub trait Input {
|
||||
fn len(&self) -> usize;
|
||||
|
||||
/// Whether the input is empty.
|
||||
fn is_empty(&self) -> bool { self.len() == 0 }
|
||||
fn is_empty(&self) -> bool {
|
||||
self.len() == 0
|
||||
}
|
||||
|
||||
/// Return the given input as a sequence of bytes.
|
||||
fn as_bytes(&self) -> &[u8];
|
||||
}
|
||||
|
||||
impl<'a, T: Input> Input for &'a T {
|
||||
fn at(&self, i: usize) -> InputAt { (**self).at(i) }
|
||||
fn at(&self, i: usize) -> InputAt {
|
||||
(**self).at(i)
|
||||
}
|
||||
|
||||
fn next_char(&self, at: InputAt) -> Char { (**self).next_char(at) }
|
||||
fn next_char(&self, at: InputAt) -> Char {
|
||||
(**self).next_char(at)
|
||||
}
|
||||
|
||||
fn previous_char(&self, at: InputAt) -> Char { (**self).previous_char(at) }
|
||||
fn previous_char(&self, at: InputAt) -> Char {
|
||||
(**self).previous_char(at)
|
||||
}
|
||||
|
||||
fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool {
|
||||
(**self).is_empty_match(at, empty)
|
||||
@@ -130,9 +138,13 @@ impl<'a, T: Input> Input for &'a T {
|
||||
(**self).prefix_at(prefixes, at)
|
||||
}
|
||||
|
||||
fn len(&self) -> usize { (**self).len() }
|
||||
fn len(&self) -> usize {
|
||||
(**self).len()
|
||||
}
|
||||
|
||||
fn as_bytes(&self) -> &[u8] { (**self).as_bytes() }
|
||||
fn as_bytes(&self) -> &[u8] {
|
||||
(**self).as_bytes()
|
||||
}
|
||||
}
|
||||
|
||||
/// An input reader over characters.
|
||||
@@ -157,12 +169,7 @@ impl<'t> ops::Deref for CharInput<'t> {
|
||||
impl<'t> Input for CharInput<'t> {
|
||||
fn at(&self, i: usize) -> InputAt {
|
||||
let c = decode_utf8(&self[i..]).map(|(c, _)| c).into();
|
||||
InputAt {
|
||||
pos: i,
|
||||
c: c,
|
||||
byte: None,
|
||||
len: c.len_utf8(),
|
||||
}
|
||||
InputAt { pos: i, c: c, byte: None, len: c.len_utf8() }
|
||||
}
|
||||
|
||||
fn next_char(&self, at: InputAt) -> Char {
|
||||
@@ -232,10 +239,7 @@ pub struct ByteInput<'t> {
|
||||
impl<'t> ByteInput<'t> {
|
||||
/// Return a new byte-based input reader for the given string.
|
||||
pub fn new(text: &'t [u8], only_utf8: bool) -> ByteInput<'t> {
|
||||
ByteInput {
|
||||
text: text,
|
||||
only_utf8: only_utf8,
|
||||
}
|
||||
ByteInput { text: text, only_utf8: only_utf8 }
|
||||
}
|
||||
}
|
||||
|
||||
@@ -249,12 +253,7 @@ impl<'t> ops::Deref for ByteInput<'t> {
|
||||
|
||||
impl<'t> Input for ByteInput<'t> {
|
||||
fn at(&self, i: usize) -> InputAt {
|
||||
InputAt {
|
||||
pos: i,
|
||||
c: None.into(),
|
||||
byte: self.get(i).cloned(),
|
||||
len: 1,
|
||||
}
|
||||
InputAt { pos: i, c: None.into(), byte: self.get(i).cloned(), len: 1 }
|
||||
}
|
||||
|
||||
fn next_char(&self, at: InputAt) -> Char {
|
||||
@@ -357,7 +356,9 @@ impl fmt::Debug for Char {
|
||||
impl Char {
|
||||
/// Returns true iff the character is absent.
|
||||
#[inline]
|
||||
pub fn is_none(self) -> bool { self.0 == u32::MAX }
|
||||
pub fn is_none(self) -> bool {
|
||||
self.0 == u32::MAX
|
||||
}
|
||||
|
||||
/// Returns the length of the character's UTF-8 encoding.
|
||||
///
|
||||
@@ -386,7 +387,9 @@ impl Char {
|
||||
}
|
||||
|
||||
impl From<char> for Char {
|
||||
fn from(c: char) -> Char { Char(c as u32) }
|
||||
fn from(c: char) -> Char {
|
||||
Char(c as u32)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Option<char>> for Char {
|
||||
@@ -397,12 +400,16 @@ impl From<Option<char>> for Char {
|
||||
|
||||
impl PartialEq<char> for Char {
|
||||
#[inline]
|
||||
fn eq(&self, other: &char) -> bool { self.0 == *other as u32 }
|
||||
fn eq(&self, other: &char) -> bool {
|
||||
self.0 == *other as u32
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialEq<Char> for char {
|
||||
#[inline]
|
||||
fn eq(&self, other: &Char) -> bool { *self as u32 == other.0 }
|
||||
fn eq(&self, other: &Char) -> bool {
|
||||
*self as u32 == other.0
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialOrd<char> for Char {
|
||||
|
||||
+10
-12
@@ -530,10 +530,10 @@ extern crate thread_local;
|
||||
#[cfg(test)]
|
||||
#[macro_use]
|
||||
extern crate quickcheck;
|
||||
extern crate regex_syntax as syntax;
|
||||
extern crate utf8_ranges;
|
||||
#[cfg(test)]
|
||||
extern crate doc_comment;
|
||||
extern crate regex_syntax as syntax;
|
||||
extern crate utf8_ranges;
|
||||
|
||||
#[cfg(test)]
|
||||
doc_comment::doctest!("../README.md");
|
||||
@@ -541,19 +541,17 @@ doc_comment::doctest!("../README.md");
|
||||
#[cfg(feature = "use_std")]
|
||||
pub use error::Error;
|
||||
#[cfg(feature = "use_std")]
|
||||
pub use re_builder::unicode::*;
|
||||
#[cfg(feature = "use_std")]
|
||||
pub use re_builder::set_unicode::*;
|
||||
#[cfg(feature = "use_std")]
|
||||
pub use re_builder::unicode::*;
|
||||
#[cfg(feature = "use_std")]
|
||||
pub use re_set::unicode::*;
|
||||
#[cfg(feature = "use_std")]
|
||||
#[cfg(feature = "use_std")]
|
||||
pub use re_unicode::{
|
||||
Regex, Match, Captures,
|
||||
CaptureNames, Matches, CaptureMatches, SubCaptureMatches,
|
||||
CaptureLocations, Locations,
|
||||
Replacer, ReplacerRef, NoExpand, Split, SplitN,
|
||||
escape,
|
||||
escape, CaptureLocations, CaptureMatches, CaptureNames, Captures,
|
||||
Locations, Match, Matches, NoExpand, Regex, Replacer, ReplacerRef, Split,
|
||||
SplitN, SubCaptureMatches,
|
||||
};
|
||||
|
||||
/**
|
||||
@@ -652,7 +650,6 @@ pub mod bytes {
|
||||
}
|
||||
|
||||
mod backtrack;
|
||||
mod utf8;
|
||||
mod compile;
|
||||
mod dfa;
|
||||
mod error;
|
||||
@@ -671,6 +668,7 @@ mod re_set;
|
||||
mod re_trait;
|
||||
mod re_unicode;
|
||||
mod sparse;
|
||||
mod utf8;
|
||||
mod vector;
|
||||
|
||||
/// The `internal` module exists to support suspicious activity, such as
|
||||
@@ -681,7 +679,7 @@ mod vector;
|
||||
pub mod internal {
|
||||
pub use compile::Compiler;
|
||||
pub use exec::{Exec, ExecBuilder};
|
||||
pub use input::{Char, Input, CharInput, InputAt};
|
||||
pub use input::{Char, CharInput, Input, InputAt};
|
||||
pub use literal::LiteralSearcher;
|
||||
pub use prog::{Program, Inst, EmptyLook, InstRanges};
|
||||
pub use prog::{EmptyLook, Inst, InstRanges, Program};
|
||||
}
|
||||
|
||||
+57
-36
@@ -15,9 +15,9 @@ use aho_corasick::{self, AhoCorasick, AhoCorasickBuilder};
|
||||
use memchr::{memchr, memchr2, memchr3};
|
||||
use syntax::hir::literal::{Literal, Literals};
|
||||
|
||||
use self::teddy_avx2::Teddy as TeddyAVX2;
|
||||
use self::teddy_ssse3::Teddy as TeddySSSE3;
|
||||
use freqs::BYTE_FREQUENCIES;
|
||||
use self::teddy_avx2::{Teddy as TeddyAVX2};
|
||||
use self::teddy_ssse3::{Teddy as TeddySSSE3};
|
||||
|
||||
mod teddy_avx2;
|
||||
mod teddy_ssse3;
|
||||
@@ -409,7 +409,7 @@ impl SingleByteSet {
|
||||
|
||||
fn approximate_size(&self) -> usize {
|
||||
(self.dense.len() * mem::size_of::<u8>())
|
||||
+ (self.sparse.len() * mem::size_of::<bool>())
|
||||
+ (self.sparse.len() * mem::size_of::<bool>())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -652,12 +652,14 @@ impl BoyerMooreSearch {
|
||||
|
||||
if haystack.len() > short_circut {
|
||||
// just 1 for the md2 shift
|
||||
let backstop = haystack.len() - ((NUM_UNROLL + 1) * self.pattern.len());
|
||||
let backstop =
|
||||
haystack.len() - ((NUM_UNROLL + 1) * self.pattern.len());
|
||||
loop {
|
||||
window_end = match self.skip_loop(haystack, window_end, backstop) {
|
||||
Some(i) => i,
|
||||
None => return None,
|
||||
};
|
||||
window_end =
|
||||
match self.skip_loop(haystack, window_end, backstop) {
|
||||
Some(i) => i,
|
||||
None => return None,
|
||||
};
|
||||
if window_end >= backstop {
|
||||
break;
|
||||
}
|
||||
@@ -690,7 +692,7 @@ impl BoyerMooreSearch {
|
||||
}
|
||||
|
||||
fn len(&self) -> usize {
|
||||
return self.pattern.len()
|
||||
return self.pattern.len();
|
||||
}
|
||||
|
||||
/// The key heuristic behind which the BoyerMooreSearch lives.
|
||||
@@ -774,7 +776,8 @@ impl BoyerMooreSearch {
|
||||
/// if it never reappears. If `skip_loop` hits the backstop
|
||||
/// it will leave early.
|
||||
#[inline]
|
||||
fn skip_loop(&self,
|
||||
fn skip_loop(
|
||||
&self,
|
||||
haystack: &[u8],
|
||||
mut window_end: usize,
|
||||
backstop: usize,
|
||||
@@ -787,25 +790,35 @@ impl BoyerMooreSearch {
|
||||
};
|
||||
|
||||
loop {
|
||||
let mut skip = skip_of(window_end); window_end += skip;
|
||||
skip = skip_of(window_end); window_end += skip;
|
||||
let mut skip = skip_of(window_end);
|
||||
window_end += skip;
|
||||
skip = skip_of(window_end);
|
||||
window_end += skip;
|
||||
if skip != 0 {
|
||||
skip = skip_of(window_end); window_end += skip;
|
||||
skip = skip_of(window_end); window_end += skip;
|
||||
skip = skip_of(window_end); window_end += skip;
|
||||
skip = skip_of(window_end);
|
||||
window_end += skip;
|
||||
skip = skip_of(window_end);
|
||||
window_end += skip;
|
||||
skip = skip_of(window_end);
|
||||
window_end += skip;
|
||||
if skip != 0 {
|
||||
skip = skip_of(window_end); window_end += skip;
|
||||
skip = skip_of(window_end); window_end += skip;
|
||||
skip = skip_of(window_end); window_end += skip;
|
||||
skip = skip_of(window_end);
|
||||
window_end += skip;
|
||||
skip = skip_of(window_end);
|
||||
window_end += skip;
|
||||
skip = skip_of(window_end);
|
||||
window_end += skip;
|
||||
if skip != 0 {
|
||||
skip = skip_of(window_end); window_end += skip;
|
||||
skip = skip_of(window_end); window_end += skip;
|
||||
skip = skip_of(window_end);
|
||||
window_end += skip;
|
||||
skip = skip_of(window_end);
|
||||
window_end += skip;
|
||||
|
||||
// If ten iterations did not make at least 16 words
|
||||
// worth of progress, we just fall back on memchr.
|
||||
if window_end - window_end_snapshot >
|
||||
16 * mem::size_of::<usize>() {
|
||||
|
||||
if window_end - window_end_snapshot
|
||||
> 16 * mem::size_of::<usize>()
|
||||
{
|
||||
// Returning a window_end >= backstop will immediatly
|
||||
// break us out of the inner loop in `find`.
|
||||
if window_end >= backstop {
|
||||
@@ -823,7 +836,11 @@ impl BoyerMooreSearch {
|
||||
match memchr(self.guard, &haystack[window_end..]) {
|
||||
None => return None,
|
||||
Some(g_idx) => {
|
||||
return Some(window_end + g_idx + self.guard_reverse_idx);
|
||||
return Some(
|
||||
window_end
|
||||
+ g_idx
|
||||
+ self.guard_reverse_idx,
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -961,11 +978,12 @@ mod tests {
|
||||
#[test]
|
||||
fn bm_memchr_fallback_indexing_bug() {
|
||||
let mut haystack = vec![
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 87, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 87, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
];
|
||||
let needle = vec![1, 1, 1, 1, 32, 32, 87];
|
||||
let needle_start = haystack.len();
|
||||
haystack.extend(needle.clone());
|
||||
@@ -981,7 +999,8 @@ mod tests {
|
||||
e_data.clone_created(entity_id, entity_to_add.entity_id);
|
||||
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
|
||||
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
|
||||
".to_vec();
|
||||
"
|
||||
.to_vec();
|
||||
let needle = b"clone_created".to_vec();
|
||||
|
||||
let searcher = BoyerMooreSearch::new(needle);
|
||||
@@ -992,10 +1011,11 @@ aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
|
||||
#[test]
|
||||
fn bm_win_gnu_indexing_bug() {
|
||||
let haystack_raw = vec![
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
];
|
||||
let needle = vec![1, 1, 1, 1, 1, 1, 1];
|
||||
let haystack = haystack_raw.as_slice();
|
||||
|
||||
@@ -1013,8 +1033,9 @@ aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
|
||||
|
||||
for i in 0..(haystack.len() - (needle.len() - 1)) {
|
||||
if haystack[i] == needle[0]
|
||||
&& &haystack[i..(i+needle.len())] == needle {
|
||||
return Some(i)
|
||||
&& &haystack[i..(i + needle.len())] == needle
|
||||
{
|
||||
return Some(i);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -11,10 +11,22 @@ pub struct Match {
|
||||
}
|
||||
|
||||
impl Teddy {
|
||||
pub fn available() -> bool { false }
|
||||
pub fn new(_pats: &Literals) -> Option<Teddy> { None }
|
||||
pub fn patterns(&self) -> &[Vec<u8>] { &[] }
|
||||
pub fn len(&self) -> usize { 0 }
|
||||
pub fn approximate_size(&self) -> usize { 0 }
|
||||
pub fn find(&self, _haystack: &[u8]) -> Option<Match> { None }
|
||||
pub fn available() -> bool {
|
||||
false
|
||||
}
|
||||
pub fn new(_pats: &Literals) -> Option<Teddy> {
|
||||
None
|
||||
}
|
||||
pub fn patterns(&self) -> &[Vec<u8>] {
|
||||
&[]
|
||||
}
|
||||
pub fn len(&self) -> usize {
|
||||
0
|
||||
}
|
||||
pub fn approximate_size(&self) -> usize {
|
||||
0
|
||||
}
|
||||
pub fn find(&self, _haystack: &[u8]) -> Option<Match> {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
@@ -12,7 +12,7 @@ use std::cmp;
|
||||
use aho_corasick::{self, AhoCorasick, AhoCorasickBuilder};
|
||||
use syntax::hir::literal::Literals;
|
||||
|
||||
use vector::avx2::{AVX2VectorBuilder, u8x32};
|
||||
use vector::avx2::{u8x32, AVX2VectorBuilder};
|
||||
|
||||
/// Corresponds to the number of bytes read at a time in the haystack.
|
||||
const BLOCK_SIZE: usize = 32;
|
||||
@@ -70,7 +70,8 @@ impl Teddy {
|
||||
return None;
|
||||
}
|
||||
|
||||
let pats: Vec<_> = pats.literals().iter().map(|p|p.to_vec()).collect();
|
||||
let pats: Vec<_> =
|
||||
pats.literals().iter().map(|p| p.to_vec()).collect();
|
||||
let min_len = pats.iter().map(|p| p.len()).min().unwrap_or(0);
|
||||
// Don't allow any empty patterns and require that we have at
|
||||
// least one pattern.
|
||||
@@ -347,12 +348,10 @@ impl Teddy {
|
||||
/// block based approach.
|
||||
#[inline(never)]
|
||||
fn slow(&self, haystack: &[u8], pos: usize) -> Option<Match> {
|
||||
self.ac.find(&haystack[pos..]).map(|m| {
|
||||
Match {
|
||||
pat: m.pattern(),
|
||||
start: pos + m.start(),
|
||||
end: pos + m.end(),
|
||||
}
|
||||
self.ac.find(&haystack[pos..]).map(|m| Match {
|
||||
pat: m.pattern(),
|
||||
start: pos + m.start(),
|
||||
end: pos + m.end(),
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -450,10 +449,7 @@ struct Mask {
|
||||
impl Mask {
|
||||
/// Create a new mask with no members.
|
||||
fn new(vb: AVX2VectorBuilder) -> Mask {
|
||||
Mask {
|
||||
lo: vb.u8x32_splat(0),
|
||||
hi: vb.u8x32_splat(0),
|
||||
}
|
||||
Mask { lo: vb.u8x32_splat(0), hi: vb.u8x32_splat(0) }
|
||||
}
|
||||
|
||||
/// Adds the given byte to the given bucket.
|
||||
|
||||
@@ -11,10 +11,22 @@ pub struct Match {
|
||||
}
|
||||
|
||||
impl Teddy {
|
||||
pub fn available() -> bool { false }
|
||||
pub fn new(_pats: &Literals) -> Option<Teddy> { None }
|
||||
pub fn patterns(&self) -> &[Vec<u8>] { &[] }
|
||||
pub fn len(&self) -> usize { 0 }
|
||||
pub fn approximate_size(&self) -> usize { 0 }
|
||||
pub fn find(&self, _haystack: &[u8]) -> Option<Match> { None }
|
||||
pub fn available() -> bool {
|
||||
false
|
||||
}
|
||||
pub fn new(_pats: &Literals) -> Option<Teddy> {
|
||||
None
|
||||
}
|
||||
pub fn patterns(&self) -> &[Vec<u8>] {
|
||||
&[]
|
||||
}
|
||||
pub fn len(&self) -> usize {
|
||||
0
|
||||
}
|
||||
pub fn approximate_size(&self) -> usize {
|
||||
0
|
||||
}
|
||||
pub fn find(&self, _haystack: &[u8]) -> Option<Match> {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
@@ -323,7 +323,7 @@ use std::cmp;
|
||||
use aho_corasick::{self, AhoCorasick, AhoCorasickBuilder};
|
||||
use syntax::hir::literal::Literals;
|
||||
|
||||
use vector::ssse3::{SSSE3VectorBuilder, u8x16};
|
||||
use vector::ssse3::{u8x16, SSSE3VectorBuilder};
|
||||
|
||||
/// Corresponds to the number of bytes read at a time in the haystack.
|
||||
const BLOCK_SIZE: usize = 16;
|
||||
@@ -381,7 +381,8 @@ impl Teddy {
|
||||
return None;
|
||||
}
|
||||
|
||||
let pats: Vec<_> = pats.literals().iter().map(|p|p.to_vec()).collect();
|
||||
let pats: Vec<_> =
|
||||
pats.literals().iter().map(|p| p.to_vec()).collect();
|
||||
let min_len = pats.iter().map(|p| p.len()).min().unwrap_or(0);
|
||||
// Don't allow any empty patterns and require that we have at
|
||||
// least one pattern.
|
||||
@@ -657,12 +658,10 @@ impl Teddy {
|
||||
/// block based approach.
|
||||
#[inline(never)]
|
||||
fn slow(&self, haystack: &[u8], pos: usize) -> Option<Match> {
|
||||
self.ac.find(&haystack[pos..]).map(|m| {
|
||||
Match {
|
||||
pat: m.pattern(),
|
||||
start: pos + m.start(),
|
||||
end: pos + m.end(),
|
||||
}
|
||||
self.ac.find(&haystack[pos..]).map(|m| Match {
|
||||
pat: m.pattern(),
|
||||
start: pos + m.start(),
|
||||
end: pos + m.end(),
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -760,10 +759,7 @@ struct Mask {
|
||||
impl Mask {
|
||||
/// Create a new mask with no members.
|
||||
fn new(vb: SSSE3VectorBuilder) -> Mask {
|
||||
Mask {
|
||||
lo: vb.u8x16_splat(0),
|
||||
hi: vb.u8x16_splat(0),
|
||||
}
|
||||
Mask { lo: vb.u8x16_splat(0), hi: vb.u8x16_splat(0) }
|
||||
}
|
||||
|
||||
/// Adds the given byte to the given bucket.
|
||||
|
||||
+2
-2
@@ -1,6 +1,6 @@
|
||||
use std::str::pattern::{Pattern, Searcher, SearchStep};
|
||||
use std::str::pattern::{Pattern, SearchStep, Searcher};
|
||||
|
||||
use re_unicode::{Regex, Matches};
|
||||
use re_unicode::{Matches, Regex};
|
||||
|
||||
pub struct RegexSearcher<'r, 't> {
|
||||
haystack: &'t str,
|
||||
|
||||
+9
-19
@@ -29,7 +29,7 @@ use std::mem;
|
||||
|
||||
use exec::ProgramCache;
|
||||
use input::{Input, InputAt};
|
||||
use prog::{Program, InstPtr};
|
||||
use prog::{InstPtr, Program};
|
||||
use re_trait::Slot;
|
||||
use sparse::SparseSet;
|
||||
|
||||
@@ -86,11 +86,7 @@ impl Cache {
|
||||
/// Create a new allocation used by the NFA machine to record execution
|
||||
/// and captures.
|
||||
pub fn new(_prog: &Program) -> Self {
|
||||
Cache {
|
||||
clist: Threads::new(),
|
||||
nlist: Threads::new(),
|
||||
stack: vec![],
|
||||
}
|
||||
Cache { clist: Threads::new(), nlist: Threads::new(), stack: vec![] }
|
||||
}
|
||||
}
|
||||
|
||||
@@ -114,11 +110,7 @@ impl<'r, I: Input> Fsm<'r, I> {
|
||||
cache.clist.resize(prog.len(), prog.captures.len());
|
||||
cache.nlist.resize(prog.len(), prog.captures.len());
|
||||
let at = input.at(start);
|
||||
Fsm {
|
||||
prog: prog,
|
||||
stack: &mut cache.stack,
|
||||
input: input,
|
||||
}.exec_(
|
||||
Fsm { prog: prog, stack: &mut cache.stack, input: input }.exec_(
|
||||
&mut cache.clist,
|
||||
&mut cache.nlist,
|
||||
matches,
|
||||
@@ -143,7 +135,7 @@ impl<'r, I: Input> Fsm<'r, I> {
|
||||
let mut all_matched = false;
|
||||
clist.set.clear();
|
||||
nlist.set.clear();
|
||||
'LOOP: loop {
|
||||
'LOOP: loop {
|
||||
if clist.set.is_empty() {
|
||||
// Three ways to bail out when our current set of threads is
|
||||
// empty.
|
||||
@@ -157,7 +149,8 @@ impl<'r, I: Input> Fsm<'r, I> {
|
||||
// soon as the last thread dies.
|
||||
if (matched && matches.len() <= 1)
|
||||
|| all_matched
|
||||
|| (!at.is_start() && self.prog.is_anchored_start) {
|
||||
|| (!at.is_start() && self.prog.is_anchored_start)
|
||||
{
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -176,7 +169,8 @@ impl<'r, I: Input> Fsm<'r, I> {
|
||||
// a state starting at the current position in the input for the
|
||||
// beginning of the program only if we don't already have a match.
|
||||
if clist.set.is_empty()
|
||||
|| (!self.prog.is_anchored_start && !all_matched) {
|
||||
|| (!self.prog.is_anchored_start && !all_matched)
|
||||
{
|
||||
self.add(&mut clist, slots, 0, at);
|
||||
}
|
||||
// The previous call to "add" actually inspects the position just
|
||||
@@ -357,11 +351,7 @@ impl<'r, I: Input> Fsm<'r, I> {
|
||||
|
||||
impl Threads {
|
||||
fn new() -> Self {
|
||||
Threads {
|
||||
set: SparseSet::new(0),
|
||||
caps: vec![],
|
||||
slots_per_thread: 0,
|
||||
}
|
||||
Threads { set: SparseSet::new(0), caps: vec![], slots_per_thread: 0 }
|
||||
}
|
||||
|
||||
fn resize(&mut self, num_insts: usize, ncaps: usize) {
|
||||
|
||||
+40
-29
@@ -1,8 +1,8 @@
|
||||
use std::collections::HashMap;
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::HashMap;
|
||||
use std::fmt;
|
||||
use std::ops::Deref;
|
||||
use std::mem;
|
||||
use std::ops::Deref;
|
||||
use std::slice;
|
||||
use std::sync::Arc;
|
||||
|
||||
@@ -93,7 +93,7 @@ impl Program {
|
||||
is_anchored_end: false,
|
||||
has_unicode_word_boundary: false,
|
||||
prefixes: LiteralSearcher::empty(),
|
||||
dfa_size_limit: 2 * (1<<20),
|
||||
dfa_size_limit: 2 * (1 << 20),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -149,12 +149,12 @@ impl Program {
|
||||
// Unicode codepoint programs) to store non-overlapping codepoint
|
||||
// ranges. To keep this operation constant time, we ignore them.
|
||||
(self.len() * mem::size_of::<Inst>())
|
||||
+ (self.matches.len() * mem::size_of::<InstPtr>())
|
||||
+ (self.captures.len() * mem::size_of::<Option<String>>())
|
||||
+ (self.capture_name_idx.len() *
|
||||
(mem::size_of::<String>() + mem::size_of::<usize>()))
|
||||
+ (self.byte_classes.len() * mem::size_of::<u8>())
|
||||
+ self.prefixes.approximate_size()
|
||||
+ (self.matches.len() * mem::size_of::<InstPtr>())
|
||||
+ (self.captures.len() * mem::size_of::<Option<String>>())
|
||||
+ (self.capture_name_idx.len()
|
||||
* (mem::size_of::<String>() + mem::size_of::<usize>()))
|
||||
+ (self.byte_classes.len() * mem::size_of::<u8>())
|
||||
+ self.prefixes.approximate_size()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -187,16 +187,17 @@ impl fmt::Debug for Program {
|
||||
|
||||
for (pc, inst) in self.iter().enumerate() {
|
||||
match *inst {
|
||||
Match(slot) => {
|
||||
write!(f, "{:04} Match({:?})", pc, slot)?
|
||||
}
|
||||
Match(slot) => write!(f, "{:04} Match({:?})", pc, slot)?,
|
||||
Save(ref inst) => {
|
||||
let s = format!("{:04} Save({})", pc, inst.slot);
|
||||
write!(f, "{}", with_goto(pc, inst.goto, s))?;
|
||||
}
|
||||
Split(ref inst) => {
|
||||
write!(
|
||||
f, "{:04} Split({}, {})", pc, inst.goto1, inst.goto2)?;
|
||||
f,
|
||||
"{:04} Split({}, {})",
|
||||
pc, inst.goto1, inst.goto2
|
||||
)?;
|
||||
}
|
||||
EmptyLook(ref inst) => {
|
||||
let s = format!("{:?}", inst.look);
|
||||
@@ -207,19 +208,25 @@ impl fmt::Debug for Program {
|
||||
write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?;
|
||||
}
|
||||
Ranges(ref inst) => {
|
||||
let ranges = inst.ranges
|
||||
let ranges = inst
|
||||
.ranges
|
||||
.iter()
|
||||
.map(|r| format!("{:?}-{:?}", r.0, r.1))
|
||||
.collect::<Vec<String>>()
|
||||
.join(", ");
|
||||
write!(
|
||||
f, "{:04} {}", pc, with_goto(pc, inst.goto, ranges))?;
|
||||
f,
|
||||
"{:04} {}",
|
||||
pc,
|
||||
with_goto(pc, inst.goto, ranges)
|
||||
)?;
|
||||
}
|
||||
Bytes(ref inst) => {
|
||||
let s = format!(
|
||||
"Bytes({}, {})",
|
||||
visible_byte(inst.start),
|
||||
visible_byte(inst.end));
|
||||
visible_byte(inst.end)
|
||||
);
|
||||
write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?;
|
||||
}
|
||||
}
|
||||
@@ -235,7 +242,9 @@ impl fmt::Debug for Program {
|
||||
impl<'a> IntoIterator for &'a Program {
|
||||
type Item = &'a Inst;
|
||||
type IntoIter = slice::Iter<'a, Inst>;
|
||||
fn into_iter(self) -> Self::IntoIter { self.iter() }
|
||||
fn into_iter(self) -> Self::IntoIter {
|
||||
self.iter()
|
||||
}
|
||||
}
|
||||
|
||||
/// Inst is an instruction code in a Regex program.
|
||||
@@ -382,24 +391,26 @@ impl InstRanges {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
self.ranges.binary_search_by(|r| {
|
||||
if r.1 < c {
|
||||
Ordering::Less
|
||||
} else if r.0 > c {
|
||||
Ordering::Greater
|
||||
} else {
|
||||
Ordering::Equal
|
||||
}
|
||||
}).is_ok()
|
||||
self.ranges
|
||||
.binary_search_by(|r| {
|
||||
if r.1 < c {
|
||||
Ordering::Less
|
||||
} else if r.0 > c {
|
||||
Ordering::Greater
|
||||
} else {
|
||||
Ordering::Equal
|
||||
}
|
||||
})
|
||||
.is_ok()
|
||||
}
|
||||
|
||||
/// Return the number of distinct characters represented by all of the
|
||||
/// ranges.
|
||||
pub fn num_chars(&self) -> usize {
|
||||
self.ranges.iter()
|
||||
self.ranges
|
||||
.iter()
|
||||
.map(|&(s, e)| 1 + (e as u32) - (s as u32))
|
||||
.fold(0, |acc, len| acc + len)
|
||||
as usize
|
||||
.fold(0, |acc, len| acc + len) as usize
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
+346
-305
@@ -29,8 +29,8 @@ impl Default for RegexOptions {
|
||||
fn default() -> Self {
|
||||
RegexOptions {
|
||||
pats: vec![],
|
||||
size_limit: 10 * (1<<20),
|
||||
dfa_size_limit: 2 * (1<<20),
|
||||
size_limit: 10 * (1 << 20),
|
||||
dfa_size_limit: 2 * (1 << 20),
|
||||
nest_limit: 250,
|
||||
case_insensitive: false,
|
||||
multi_line: false,
|
||||
@@ -46,180 +46,195 @@ impl Default for RegexOptions {
|
||||
macro_rules! define_builder {
|
||||
($name:ident, $regex_mod:ident, $only_utf8:expr) => {
|
||||
pub mod $name {
|
||||
use super::RegexOptions;
|
||||
use error::Error;
|
||||
use exec::ExecBuilder;
|
||||
use super::RegexOptions;
|
||||
|
||||
use $regex_mod::Regex;
|
||||
|
||||
/// A configurable builder for a regular expression.
|
||||
///
|
||||
/// A builder can be used to configure how the regex is built, for example, by
|
||||
/// setting the default flags (which can be overridden in the expression
|
||||
/// itself) or setting various limits.
|
||||
pub struct RegexBuilder(RegexOptions);
|
||||
/// A configurable builder for a regular expression.
|
||||
///
|
||||
/// A builder can be used to configure how the regex is built, for example, by
|
||||
/// setting the default flags (which can be overridden in the expression
|
||||
/// itself) or setting various limits.
|
||||
pub struct RegexBuilder(RegexOptions);
|
||||
|
||||
impl RegexBuilder {
|
||||
/// Create a new regular expression builder with the given pattern.
|
||||
///
|
||||
/// If the pattern is invalid, then an error will be returned when
|
||||
/// `build` is called.
|
||||
pub fn new(pattern: &str) -> RegexBuilder {
|
||||
let mut builder = RegexBuilder(RegexOptions::default());
|
||||
builder.0.pats.push(pattern.to_owned());
|
||||
builder
|
||||
}
|
||||
impl RegexBuilder {
|
||||
/// Create a new regular expression builder with the given pattern.
|
||||
///
|
||||
/// If the pattern is invalid, then an error will be returned when
|
||||
/// `build` is called.
|
||||
pub fn new(pattern: &str) -> RegexBuilder {
|
||||
let mut builder = RegexBuilder(RegexOptions::default());
|
||||
builder.0.pats.push(pattern.to_owned());
|
||||
builder
|
||||
}
|
||||
|
||||
/// Consume the builder and compile the regular expression.
|
||||
///
|
||||
/// Note that calling `as_str` on the resulting `Regex` will produce the
|
||||
/// pattern given to `new` verbatim. Notably, it will not incorporate any
|
||||
/// of the flags set on this builder.
|
||||
pub fn build(&self) -> Result<Regex, Error> {
|
||||
ExecBuilder::new_options(self.0.clone())
|
||||
.only_utf8($only_utf8)
|
||||
.build()
|
||||
.map(Regex::from)
|
||||
}
|
||||
/// Consume the builder and compile the regular expression.
|
||||
///
|
||||
/// Note that calling `as_str` on the resulting `Regex` will produce the
|
||||
/// pattern given to `new` verbatim. Notably, it will not incorporate any
|
||||
/// of the flags set on this builder.
|
||||
pub fn build(&self) -> Result<Regex, Error> {
|
||||
ExecBuilder::new_options(self.0.clone())
|
||||
.only_utf8($only_utf8)
|
||||
.build()
|
||||
.map(Regex::from)
|
||||
}
|
||||
|
||||
/// Set the value for the case insensitive (`i`) flag.
|
||||
///
|
||||
/// When enabled, letters in the pattern will match both upper case and
|
||||
/// lower case variants.
|
||||
pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexBuilder {
|
||||
self.0.case_insensitive = yes;
|
||||
self
|
||||
}
|
||||
/// Set the value for the case insensitive (`i`) flag.
|
||||
///
|
||||
/// When enabled, letters in the pattern will match both upper case and
|
||||
/// lower case variants.
|
||||
pub fn case_insensitive(
|
||||
&mut self,
|
||||
yes: bool,
|
||||
) -> &mut RegexBuilder {
|
||||
self.0.case_insensitive = yes;
|
||||
self
|
||||
}
|
||||
|
||||
/// Set the value for the multi-line matching (`m`) flag.
|
||||
///
|
||||
/// When enabled, `^` matches the beginning of lines and `$` matches the
|
||||
/// end of lines.
|
||||
///
|
||||
/// By default, they match beginning/end of the input.
|
||||
pub fn multi_line(&mut self, yes: bool) -> &mut RegexBuilder {
|
||||
self.0.multi_line = yes;
|
||||
self
|
||||
}
|
||||
/// Set the value for the multi-line matching (`m`) flag.
|
||||
///
|
||||
/// When enabled, `^` matches the beginning of lines and `$` matches the
|
||||
/// end of lines.
|
||||
///
|
||||
/// By default, they match beginning/end of the input.
|
||||
pub fn multi_line(&mut self, yes: bool) -> &mut RegexBuilder {
|
||||
self.0.multi_line = yes;
|
||||
self
|
||||
}
|
||||
|
||||
/// Set the value for the any character (`s`) flag, where in `.` matches
|
||||
/// anything when `s` is set and matches anything except for new line when
|
||||
/// it is not set (the default).
|
||||
///
|
||||
/// N.B. "matches anything" means "any byte" when Unicode is disabled and
|
||||
/// means "any valid UTF-8 encoding of any Unicode scalar value" when
|
||||
/// Unicode is enabled.
|
||||
pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut RegexBuilder {
|
||||
self.0.dot_matches_new_line = yes;
|
||||
self
|
||||
}
|
||||
/// Set the value for the any character (`s`) flag, where in `.` matches
|
||||
/// anything when `s` is set and matches anything except for new line when
|
||||
/// it is not set (the default).
|
||||
///
|
||||
/// N.B. "matches anything" means "any byte" when Unicode is disabled and
|
||||
/// means "any valid UTF-8 encoding of any Unicode scalar value" when
|
||||
/// Unicode is enabled.
|
||||
pub fn dot_matches_new_line(
|
||||
&mut self,
|
||||
yes: bool,
|
||||
) -> &mut RegexBuilder {
|
||||
self.0.dot_matches_new_line = yes;
|
||||
self
|
||||
}
|
||||
|
||||
/// Set the value for the greedy swap (`U`) flag.
|
||||
///
|
||||
/// When enabled, a pattern like `a*` is lazy (tries to find shortest
|
||||
/// match) and `a*?` is greedy (tries to find longest match).
|
||||
///
|
||||
/// By default, `a*` is greedy and `a*?` is lazy.
|
||||
pub fn swap_greed(&mut self, yes: bool) -> &mut RegexBuilder {
|
||||
self.0.swap_greed = yes;
|
||||
self
|
||||
}
|
||||
/// Set the value for the greedy swap (`U`) flag.
|
||||
///
|
||||
/// When enabled, a pattern like `a*` is lazy (tries to find shortest
|
||||
/// match) and `a*?` is greedy (tries to find longest match).
|
||||
///
|
||||
/// By default, `a*` is greedy and `a*?` is lazy.
|
||||
pub fn swap_greed(&mut self, yes: bool) -> &mut RegexBuilder {
|
||||
self.0.swap_greed = yes;
|
||||
self
|
||||
}
|
||||
|
||||
/// Set the value for the ignore whitespace (`x`) flag.
|
||||
///
|
||||
/// When enabled, whitespace such as new lines and spaces will be ignored
|
||||
/// between expressions of the pattern, and `#` can be used to start a
|
||||
/// comment until the next new line.
|
||||
pub fn ignore_whitespace(&mut self, yes: bool) -> &mut RegexBuilder {
|
||||
self.0.ignore_whitespace = yes;
|
||||
self
|
||||
}
|
||||
/// Set the value for the ignore whitespace (`x`) flag.
|
||||
///
|
||||
/// When enabled, whitespace such as new lines and spaces will be ignored
|
||||
/// between expressions of the pattern, and `#` can be used to start a
|
||||
/// comment until the next new line.
|
||||
pub fn ignore_whitespace(
|
||||
&mut self,
|
||||
yes: bool,
|
||||
) -> &mut RegexBuilder {
|
||||
self.0.ignore_whitespace = yes;
|
||||
self
|
||||
}
|
||||
|
||||
/// Set the value for the Unicode (`u`) flag.
|
||||
///
|
||||
/// Enabled by default. When disabled, character classes such as `\w` only
|
||||
/// match ASCII word characters instead of all Unicode word characters.
|
||||
pub fn unicode(&mut self, yes: bool) -> &mut RegexBuilder {
|
||||
self.0.unicode = yes;
|
||||
self
|
||||
}
|
||||
/// Set the value for the Unicode (`u`) flag.
|
||||
///
|
||||
/// Enabled by default. When disabled, character classes such as `\w` only
|
||||
/// match ASCII word characters instead of all Unicode word characters.
|
||||
pub fn unicode(&mut self, yes: bool) -> &mut RegexBuilder {
|
||||
self.0.unicode = yes;
|
||||
self
|
||||
}
|
||||
|
||||
/// Whether to support octal syntax or not.
|
||||
///
|
||||
/// Octal syntax is a little-known way of uttering Unicode codepoints in
|
||||
/// a regular expression. For example, `a`, `\x61`, `\u0061` and
|
||||
/// `\141` are all equivalent regular expressions, where the last example
|
||||
/// shows octal syntax.
|
||||
///
|
||||
/// While supporting octal syntax isn't in and of itself a problem, it does
|
||||
/// make good error messages harder. That is, in PCRE based regex engines,
|
||||
/// syntax like `\0` invokes a backreference, which is explicitly
|
||||
/// unsupported in Rust's regex engine. However, many users expect it to
|
||||
/// be supported. Therefore, when octal support is disabled, the error
|
||||
/// message will explicitly mention that backreferences aren't supported.
|
||||
///
|
||||
/// Octal syntax is disabled by default.
|
||||
pub fn octal(&mut self, yes: bool) -> &mut RegexBuilder {
|
||||
self.0.octal = yes;
|
||||
self
|
||||
}
|
||||
/// Whether to support octal syntax or not.
|
||||
///
|
||||
/// Octal syntax is a little-known way of uttering Unicode codepoints in
|
||||
/// a regular expression. For example, `a`, `\x61`, `\u0061` and
|
||||
/// `\141` are all equivalent regular expressions, where the last example
|
||||
/// shows octal syntax.
|
||||
///
|
||||
/// While supporting octal syntax isn't in and of itself a problem, it does
|
||||
/// make good error messages harder. That is, in PCRE based regex engines,
|
||||
/// syntax like `\0` invokes a backreference, which is explicitly
|
||||
/// unsupported in Rust's regex engine. However, many users expect it to
|
||||
/// be supported. Therefore, when octal support is disabled, the error
|
||||
/// message will explicitly mention that backreferences aren't supported.
|
||||
///
|
||||
/// Octal syntax is disabled by default.
|
||||
pub fn octal(&mut self, yes: bool) -> &mut RegexBuilder {
|
||||
self.0.octal = yes;
|
||||
self
|
||||
}
|
||||
|
||||
/// Set the approximate size limit of the compiled regular expression.
|
||||
///
|
||||
/// This roughly corresponds to the number of bytes occupied by a single
|
||||
/// compiled program. If the program exceeds this number, then a
|
||||
/// compilation error is returned.
|
||||
pub fn size_limit(&mut self, limit: usize) -> &mut RegexBuilder {
|
||||
self.0.size_limit = limit;
|
||||
self
|
||||
}
|
||||
/// Set the approximate size limit of the compiled regular expression.
|
||||
///
|
||||
/// This roughly corresponds to the number of bytes occupied by a single
|
||||
/// compiled program. If the program exceeds this number, then a
|
||||
/// compilation error is returned.
|
||||
pub fn size_limit(
|
||||
&mut self,
|
||||
limit: usize,
|
||||
) -> &mut RegexBuilder {
|
||||
self.0.size_limit = limit;
|
||||
self
|
||||
}
|
||||
|
||||
/// Set the approximate size of the cache used by the DFA.
|
||||
///
|
||||
/// This roughly corresponds to the number of bytes that the DFA will
|
||||
/// use while searching.
|
||||
///
|
||||
/// Note that this is a *per thread* limit. There is no way to set a global
|
||||
/// limit. In particular, if a regex is used from multiple threads
|
||||
/// simultaneously, then each thread may use up to the number of bytes
|
||||
/// specified here.
|
||||
pub fn dfa_size_limit(&mut self, limit: usize) -> &mut RegexBuilder {
|
||||
self.0.dfa_size_limit = limit;
|
||||
self
|
||||
}
|
||||
/// Set the approximate size of the cache used by the DFA.
|
||||
///
|
||||
/// This roughly corresponds to the number of bytes that the DFA will
|
||||
/// use while searching.
|
||||
///
|
||||
/// Note that this is a *per thread* limit. There is no way to set a global
|
||||
/// limit. In particular, if a regex is used from multiple threads
|
||||
/// simultaneously, then each thread may use up to the number of bytes
|
||||
/// specified here.
|
||||
pub fn dfa_size_limit(
|
||||
&mut self,
|
||||
limit: usize,
|
||||
) -> &mut RegexBuilder {
|
||||
self.0.dfa_size_limit = limit;
|
||||
self
|
||||
}
|
||||
|
||||
/// Set the nesting limit for this parser.
|
||||
///
|
||||
/// The nesting limit controls how deep the abstract syntax tree is allowed
|
||||
/// to be. If the AST exceeds the given limit (e.g., with too many nested
|
||||
/// groups), then an error is returned by the parser.
|
||||
///
|
||||
/// The purpose of this limit is to act as a heuristic to prevent stack
|
||||
/// overflow for consumers that do structural induction on an `Ast` using
|
||||
/// explicit recursion. While this crate never does this (instead using
|
||||
/// constant stack space and moving the call stack to the heap), other
|
||||
/// crates may.
|
||||
///
|
||||
/// This limit is not checked until the entire Ast is parsed. Therefore,
|
||||
/// if callers want to put a limit on the amount of heap space used, then
|
||||
/// they should impose a limit on the length, in bytes, of the concrete
|
||||
/// pattern string. In particular, this is viable since this parser
|
||||
/// implementation will limit itself to heap space proportional to the
|
||||
/// length of the pattern string.
|
||||
///
|
||||
/// Note that a nest limit of `0` will return a nest limit error for most
|
||||
/// patterns but not all. For example, a nest limit of `0` permits `a` but
|
||||
/// not `ab`, since `ab` requires a concatenation, which results in a nest
|
||||
/// depth of `1`. In general, a nest limit is not something that manifests
|
||||
/// in an obvious way in the concrete syntax, therefore, it should not be
|
||||
/// used in a granular way.
|
||||
pub fn nest_limit(&mut self, limit: u32) -> &mut RegexBuilder {
|
||||
self.0.nest_limit = limit;
|
||||
self
|
||||
}
|
||||
}
|
||||
/// Set the nesting limit for this parser.
|
||||
///
|
||||
/// The nesting limit controls how deep the abstract syntax tree is allowed
|
||||
/// to be. If the AST exceeds the given limit (e.g., with too many nested
|
||||
/// groups), then an error is returned by the parser.
|
||||
///
|
||||
/// The purpose of this limit is to act as a heuristic to prevent stack
|
||||
/// overflow for consumers that do structural induction on an `Ast` using
|
||||
/// explicit recursion. While this crate never does this (instead using
|
||||
/// constant stack space and moving the call stack to the heap), other
|
||||
/// crates may.
|
||||
///
|
||||
/// This limit is not checked until the entire Ast is parsed. Therefore,
|
||||
/// if callers want to put a limit on the amount of heap space used, then
|
||||
/// they should impose a limit on the length, in bytes, of the concrete
|
||||
/// pattern string. In particular, this is viable since this parser
|
||||
/// implementation will limit itself to heap space proportional to the
|
||||
/// length of the pattern string.
|
||||
///
|
||||
/// Note that a nest limit of `0` will return a nest limit error for most
|
||||
/// patterns but not all. For example, a nest limit of `0` permits `a` but
|
||||
/// not `ab`, since `ab` requires a concatenation, which results in a nest
|
||||
/// depth of `1`. In general, a nest limit is not something that manifests
|
||||
/// in an obvious way in the concrete syntax, therefore, it should not be
|
||||
/// used in a granular way.
|
||||
pub fn nest_limit(&mut self, limit: u32) -> &mut RegexBuilder {
|
||||
self.0.nest_limit = limit;
|
||||
self
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
define_builder!(bytes, re_bytes, false);
|
||||
@@ -228,160 +243,186 @@ define_builder!(unicode, re_unicode, true);
|
||||
macro_rules! define_set_builder {
|
||||
($name:ident, $regex_mod:ident, $only_utf8:expr) => {
|
||||
pub mod $name {
|
||||
use super::RegexOptions;
|
||||
use error::Error;
|
||||
use exec::ExecBuilder;
|
||||
use super::RegexOptions;
|
||||
|
||||
use re_set::$regex_mod::RegexSet;
|
||||
|
||||
/// A configurable builder for a set of regular expressions.
|
||||
///
|
||||
/// A builder can be used to configure how the regexes are built, for example,
|
||||
/// by setting the default flags (which can be overridden in the expression
|
||||
/// itself) or setting various limits.
|
||||
pub struct RegexSetBuilder(RegexOptions);
|
||||
/// A configurable builder for a set of regular expressions.
|
||||
///
|
||||
/// A builder can be used to configure how the regexes are built, for example,
|
||||
/// by setting the default flags (which can be overridden in the expression
|
||||
/// itself) or setting various limits.
|
||||
pub struct RegexSetBuilder(RegexOptions);
|
||||
|
||||
impl RegexSetBuilder {
|
||||
/// Create a new regular expression builder with the given pattern.
|
||||
///
|
||||
/// If the pattern is invalid, then an error will be returned when
|
||||
/// `build` is called.
|
||||
pub fn new<I, S>(patterns: I) -> RegexSetBuilder
|
||||
where S: AsRef<str>, I: IntoIterator<Item=S> {
|
||||
let mut builder = RegexSetBuilder(RegexOptions::default());
|
||||
for pat in patterns {
|
||||
builder.0.pats.push(pat.as_ref().to_owned());
|
||||
impl RegexSetBuilder {
|
||||
/// Create a new regular expression builder with the given pattern.
|
||||
///
|
||||
/// If the pattern is invalid, then an error will be returned when
|
||||
/// `build` is called.
|
||||
pub fn new<I, S>(patterns: I) -> RegexSetBuilder
|
||||
where
|
||||
S: AsRef<str>,
|
||||
I: IntoIterator<Item = S>,
|
||||
{
|
||||
let mut builder = RegexSetBuilder(RegexOptions::default());
|
||||
for pat in patterns {
|
||||
builder.0.pats.push(pat.as_ref().to_owned());
|
||||
}
|
||||
builder
|
||||
}
|
||||
|
||||
/// Consume the builder and compile the regular expressions into a set.
|
||||
pub fn build(&self) -> Result<RegexSet, Error> {
|
||||
ExecBuilder::new_options(self.0.clone())
|
||||
.only_utf8($only_utf8)
|
||||
.build()
|
||||
.map(RegexSet::from)
|
||||
}
|
||||
|
||||
/// Set the value for the case insensitive (`i`) flag.
|
||||
pub fn case_insensitive(
|
||||
&mut self,
|
||||
yes: bool,
|
||||
) -> &mut RegexSetBuilder {
|
||||
self.0.case_insensitive = yes;
|
||||
self
|
||||
}
|
||||
|
||||
/// Set the value for the multi-line matching (`m`) flag.
|
||||
pub fn multi_line(
|
||||
&mut self,
|
||||
yes: bool,
|
||||
) -> &mut RegexSetBuilder {
|
||||
self.0.multi_line = yes;
|
||||
self
|
||||
}
|
||||
|
||||
/// Set the value for the any character (`s`) flag, where in `.` matches
|
||||
/// anything when `s` is set and matches anything except for new line when
|
||||
/// it is not set (the default).
|
||||
///
|
||||
/// N.B. "matches anything" means "any byte" for `regex::bytes::RegexSet`
|
||||
/// expressions and means "any Unicode scalar value" for `regex::RegexSet`
|
||||
/// expressions.
|
||||
pub fn dot_matches_new_line(
|
||||
&mut self,
|
||||
yes: bool,
|
||||
) -> &mut RegexSetBuilder {
|
||||
self.0.dot_matches_new_line = yes;
|
||||
self
|
||||
}
|
||||
|
||||
/// Set the value for the greedy swap (`U`) flag.
|
||||
pub fn swap_greed(
|
||||
&mut self,
|
||||
yes: bool,
|
||||
) -> &mut RegexSetBuilder {
|
||||
self.0.swap_greed = yes;
|
||||
self
|
||||
}
|
||||
|
||||
/// Set the value for the ignore whitespace (`x`) flag.
|
||||
pub fn ignore_whitespace(
|
||||
&mut self,
|
||||
yes: bool,
|
||||
) -> &mut RegexSetBuilder {
|
||||
self.0.ignore_whitespace = yes;
|
||||
self
|
||||
}
|
||||
|
||||
/// Set the value for the Unicode (`u`) flag.
|
||||
pub fn unicode(&mut self, yes: bool) -> &mut RegexSetBuilder {
|
||||
self.0.unicode = yes;
|
||||
self
|
||||
}
|
||||
|
||||
/// Whether to support octal syntax or not.
|
||||
///
|
||||
/// Octal syntax is a little-known way of uttering Unicode codepoints in
|
||||
/// a regular expression. For example, `a`, `\x61`, `\u0061` and
|
||||
/// `\141` are all equivalent regular expressions, where the last example
|
||||
/// shows octal syntax.
|
||||
///
|
||||
/// While supporting octal syntax isn't in and of itself a problem, it does
|
||||
/// make good error messages harder. That is, in PCRE based regex engines,
|
||||
/// syntax like `\0` invokes a backreference, which is explicitly
|
||||
/// unsupported in Rust's regex engine. However, many users expect it to
|
||||
/// be supported. Therefore, when octal support is disabled, the error
|
||||
/// message will explicitly mention that backreferences aren't supported.
|
||||
///
|
||||
/// Octal syntax is disabled by default.
|
||||
pub fn octal(&mut self, yes: bool) -> &mut RegexSetBuilder {
|
||||
self.0.octal = yes;
|
||||
self
|
||||
}
|
||||
|
||||
/// Set the approximate size limit of the compiled regular expression.
|
||||
///
|
||||
/// This roughly corresponds to the number of bytes occupied by a single
|
||||
/// compiled program. If the program exceeds this number, then a
|
||||
/// compilation error is returned.
|
||||
pub fn size_limit(
|
||||
&mut self,
|
||||
limit: usize,
|
||||
) -> &mut RegexSetBuilder {
|
||||
self.0.size_limit = limit;
|
||||
self
|
||||
}
|
||||
|
||||
/// Set the approximate size of the cache used by the DFA.
|
||||
///
|
||||
/// This roughly corresponds to the number of bytes that the DFA will
|
||||
/// use while searching.
|
||||
///
|
||||
/// Note that this is a *per thread* limit. There is no way to set a global
|
||||
/// limit. In particular, if a regex is used from multiple threads
|
||||
/// simultaneously, then each thread may use up to the number of bytes
|
||||
/// specified here.
|
||||
pub fn dfa_size_limit(
|
||||
&mut self,
|
||||
limit: usize,
|
||||
) -> &mut RegexSetBuilder {
|
||||
self.0.dfa_size_limit = limit;
|
||||
self
|
||||
}
|
||||
|
||||
/// Set the nesting limit for this parser.
|
||||
///
|
||||
/// The nesting limit controls how deep the abstract syntax tree is allowed
|
||||
/// to be. If the AST exceeds the given limit (e.g., with too many nested
|
||||
/// groups), then an error is returned by the parser.
|
||||
///
|
||||
/// The purpose of this limit is to act as a heuristic to prevent stack
|
||||
/// overflow for consumers that do structural induction on an `Ast` using
|
||||
/// explicit recursion. While this crate never does this (instead using
|
||||
/// constant stack space and moving the call stack to the heap), other
|
||||
/// crates may.
|
||||
///
|
||||
/// This limit is not checked until the entire Ast is parsed. Therefore,
|
||||
/// if callers want to put a limit on the amount of heap space used, then
|
||||
/// they should impose a limit on the length, in bytes, of the concrete
|
||||
/// pattern string. In particular, this is viable since this parser
|
||||
/// implementation will limit itself to heap space proportional to the
|
||||
/// length of the pattern string.
|
||||
///
|
||||
/// Note that a nest limit of `0` will return a nest limit error for most
|
||||
/// patterns but not all. For example, a nest limit of `0` permits `a` but
|
||||
/// not `ab`, since `ab` requires a concatenation, which results in a nest
|
||||
/// depth of `1`. In general, a nest limit is not something that manifests
|
||||
/// in an obvious way in the concrete syntax, therefore, it should not be
|
||||
/// used in a granular way.
|
||||
pub fn nest_limit(
|
||||
&mut self,
|
||||
limit: u32,
|
||||
) -> &mut RegexSetBuilder {
|
||||
self.0.nest_limit = limit;
|
||||
self
|
||||
}
|
||||
}
|
||||
}
|
||||
builder
|
||||
}
|
||||
|
||||
/// Consume the builder and compile the regular expressions into a set.
|
||||
pub fn build(&self) -> Result<RegexSet, Error> {
|
||||
ExecBuilder::new_options(self.0.clone())
|
||||
.only_utf8($only_utf8)
|
||||
.build()
|
||||
.map(RegexSet::from)
|
||||
}
|
||||
|
||||
/// Set the value for the case insensitive (`i`) flag.
|
||||
pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexSetBuilder {
|
||||
self.0.case_insensitive = yes;
|
||||
self
|
||||
}
|
||||
|
||||
/// Set the value for the multi-line matching (`m`) flag.
|
||||
pub fn multi_line(&mut self, yes: bool) -> &mut RegexSetBuilder {
|
||||
self.0.multi_line = yes;
|
||||
self
|
||||
}
|
||||
|
||||
/// Set the value for the any character (`s`) flag, where in `.` matches
|
||||
/// anything when `s` is set and matches anything except for new line when
|
||||
/// it is not set (the default).
|
||||
///
|
||||
/// N.B. "matches anything" means "any byte" for `regex::bytes::RegexSet`
|
||||
/// expressions and means "any Unicode scalar value" for `regex::RegexSet`
|
||||
/// expressions.
|
||||
pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut RegexSetBuilder {
|
||||
self.0.dot_matches_new_line = yes;
|
||||
self
|
||||
}
|
||||
|
||||
/// Set the value for the greedy swap (`U`) flag.
|
||||
pub fn swap_greed(&mut self, yes: bool) -> &mut RegexSetBuilder {
|
||||
self.0.swap_greed = yes;
|
||||
self
|
||||
}
|
||||
|
||||
/// Set the value for the ignore whitespace (`x`) flag.
|
||||
pub fn ignore_whitespace(&mut self, yes: bool) -> &mut RegexSetBuilder {
|
||||
self.0.ignore_whitespace = yes;
|
||||
self
|
||||
}
|
||||
|
||||
/// Set the value for the Unicode (`u`) flag.
|
||||
pub fn unicode(&mut self, yes: bool) -> &mut RegexSetBuilder {
|
||||
self.0.unicode = yes;
|
||||
self
|
||||
}
|
||||
|
||||
/// Whether to support octal syntax or not.
|
||||
///
|
||||
/// Octal syntax is a little-known way of uttering Unicode codepoints in
|
||||
/// a regular expression. For example, `a`, `\x61`, `\u0061` and
|
||||
/// `\141` are all equivalent regular expressions, where the last example
|
||||
/// shows octal syntax.
|
||||
///
|
||||
/// While supporting octal syntax isn't in and of itself a problem, it does
|
||||
/// make good error messages harder. That is, in PCRE based regex engines,
|
||||
/// syntax like `\0` invokes a backreference, which is explicitly
|
||||
/// unsupported in Rust's regex engine. However, many users expect it to
|
||||
/// be supported. Therefore, when octal support is disabled, the error
|
||||
/// message will explicitly mention that backreferences aren't supported.
|
||||
///
|
||||
/// Octal syntax is disabled by default.
|
||||
pub fn octal(&mut self, yes: bool) -> &mut RegexSetBuilder {
|
||||
self.0.octal = yes;
|
||||
self
|
||||
}
|
||||
|
||||
/// Set the approximate size limit of the compiled regular expression.
|
||||
///
|
||||
/// This roughly corresponds to the number of bytes occupied by a single
|
||||
/// compiled program. If the program exceeds this number, then a
|
||||
/// compilation error is returned.
|
||||
pub fn size_limit(&mut self, limit: usize) -> &mut RegexSetBuilder {
|
||||
self.0.size_limit = limit;
|
||||
self
|
||||
}
|
||||
|
||||
/// Set the approximate size of the cache used by the DFA.
|
||||
///
|
||||
/// This roughly corresponds to the number of bytes that the DFA will
|
||||
/// use while searching.
|
||||
///
|
||||
/// Note that this is a *per thread* limit. There is no way to set a global
|
||||
/// limit. In particular, if a regex is used from multiple threads
|
||||
/// simultaneously, then each thread may use up to the number of bytes
|
||||
/// specified here.
|
||||
pub fn dfa_size_limit(&mut self, limit: usize) -> &mut RegexSetBuilder {
|
||||
self.0.dfa_size_limit = limit;
|
||||
self
|
||||
}
|
||||
|
||||
/// Set the nesting limit for this parser.
|
||||
///
|
||||
/// The nesting limit controls how deep the abstract syntax tree is allowed
|
||||
/// to be. If the AST exceeds the given limit (e.g., with too many nested
|
||||
/// groups), then an error is returned by the parser.
|
||||
///
|
||||
/// The purpose of this limit is to act as a heuristic to prevent stack
|
||||
/// overflow for consumers that do structural induction on an `Ast` using
|
||||
/// explicit recursion. While this crate never does this (instead using
|
||||
/// constant stack space and moving the call stack to the heap), other
|
||||
/// crates may.
|
||||
///
|
||||
/// This limit is not checked until the entire Ast is parsed. Therefore,
|
||||
/// if callers want to put a limit on the amount of heap space used, then
|
||||
/// they should impose a limit on the length, in bytes, of the concrete
|
||||
/// pattern string. In particular, this is viable since this parser
|
||||
/// implementation will limit itself to heap space proportional to the
|
||||
/// length of the pattern string.
|
||||
///
|
||||
/// Note that a nest limit of `0` will return a nest limit error for most
|
||||
/// patterns but not all. For example, a nest limit of `0` permits `a` but
|
||||
/// not `ab`, since `ab` requires a concatenation, which results in a nest
|
||||
/// depth of `1`. In general, a nest limit is not something that manifests
|
||||
/// in an obvious way in the concrete syntax, therefore, it should not be
|
||||
/// used in a granular way.
|
||||
pub fn nest_limit(&mut self, limit: u32) -> &mut RegexSetBuilder {
|
||||
self.0.nest_limit = limit;
|
||||
self
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
define_set_builder!(set_bytes, bytes, false);
|
||||
|
||||
+28
-28
@@ -17,9 +17,9 @@ use std::sync::Arc;
|
||||
|
||||
use memchr::memchr;
|
||||
|
||||
use error::Error;
|
||||
use exec::{Exec, ExecNoSync};
|
||||
use expand::expand_bytes;
|
||||
use error::Error;
|
||||
use re_builder::bytes::RegexBuilder;
|
||||
use re_trait::{self, RegularExpression, SubCapturesPosIter};
|
||||
|
||||
@@ -55,11 +55,7 @@ impl<'t> Match<'t> {
|
||||
/// Creates a new match from the given haystack and byte offsets.
|
||||
#[inline]
|
||||
fn new(haystack: &'t [u8], start: usize, end: usize) -> Match<'t> {
|
||||
Match {
|
||||
text: haystack,
|
||||
start: start,
|
||||
end: end,
|
||||
}
|
||||
Match { text: haystack, start: start, end: end }
|
||||
}
|
||||
}
|
||||
|
||||
@@ -314,10 +310,7 @@ impl Regex {
|
||||
/// # }
|
||||
/// ```
|
||||
pub fn split<'r, 't>(&'r self, text: &'t [u8]) -> Split<'r, 't> {
|
||||
Split {
|
||||
finder: self.find_iter(text),
|
||||
last: 0,
|
||||
}
|
||||
Split { finder: self.find_iter(text), last: 0 }
|
||||
}
|
||||
|
||||
/// Returns an iterator of at most `limit` substrings of `text` delimited
|
||||
@@ -345,10 +338,7 @@ impl Regex {
|
||||
text: &'t [u8],
|
||||
limit: usize,
|
||||
) -> SplitN<'r, 't> {
|
||||
SplitN {
|
||||
splits: self.split(text),
|
||||
n: limit,
|
||||
}
|
||||
SplitN { splits: self.split(text), n: limit }
|
||||
}
|
||||
|
||||
/// Replaces the leftmost-first match with the replacement provided. The
|
||||
@@ -502,7 +492,7 @@ impl Regex {
|
||||
let mut last_match = 0;
|
||||
for (i, m) in it {
|
||||
if limit > 0 && i >= limit {
|
||||
break
|
||||
break;
|
||||
}
|
||||
new.extend_from_slice(&text[last_match..m.start()]);
|
||||
new.extend_from_slice(&rep);
|
||||
@@ -522,7 +512,7 @@ impl Regex {
|
||||
let mut last_match = 0;
|
||||
for (i, cap) in it {
|
||||
if limit > 0 && i >= limit {
|
||||
break
|
||||
break;
|
||||
}
|
||||
// unwrap on 0 is OK because captures only reports matches
|
||||
let m = cap.get(0).unwrap();
|
||||
@@ -597,7 +587,9 @@ impl Regex {
|
||||
text: &'t [u8],
|
||||
start: usize,
|
||||
) -> Option<Match<'t>> {
|
||||
self.0.searcher().find_at(text, start)
|
||||
self.0
|
||||
.searcher()
|
||||
.find_at(text, start)
|
||||
.map(|(s, e)| Match::new(text, s, e))
|
||||
}
|
||||
|
||||
@@ -712,7 +704,9 @@ impl<'r, 't> Iterator for Matches<'r, 't> {
|
||||
///
|
||||
/// `'r` is the lifetime of the compiled regular expression and `'t` is the
|
||||
/// lifetime of the matched byte string.
|
||||
pub struct CaptureMatches<'r, 't>(re_trait::CaptureMatches<'t, ExecNoSync<'r>>);
|
||||
pub struct CaptureMatches<'r, 't>(
|
||||
re_trait::CaptureMatches<'t, ExecNoSync<'r>>,
|
||||
);
|
||||
|
||||
impl<'r, 't> Iterator for CaptureMatches<'r, 't> {
|
||||
type Item = Captures<'t>;
|
||||
@@ -775,7 +769,7 @@ impl<'r, 't> Iterator for SplitN<'r, 't> {
|
||||
|
||||
fn next(&mut self) -> Option<&'t [u8]> {
|
||||
if self.n == 0 {
|
||||
return None
|
||||
return None;
|
||||
}
|
||||
self.n -= 1;
|
||||
if self.n == 0 {
|
||||
@@ -799,7 +793,9 @@ impl<'r> Iterator for CaptureNames<'r> {
|
||||
type Item = Option<&'r str>;
|
||||
|
||||
fn next(&mut self) -> Option<Option<&'r str>> {
|
||||
self.0.next().as_ref()
|
||||
self.0
|
||||
.next()
|
||||
.as_ref()
|
||||
.map(|slot| slot.as_ref().map(|name| name.as_ref()))
|
||||
}
|
||||
|
||||
@@ -918,10 +914,7 @@ impl<'t> Captures<'t> {
|
||||
///
|
||||
/// The first match always corresponds to the overall match of the regex.
|
||||
pub fn iter<'c>(&'c self) -> SubCaptureMatches<'c, 't> {
|
||||
SubCaptureMatches {
|
||||
caps: self,
|
||||
it: self.locs.iter(),
|
||||
}
|
||||
SubCaptureMatches { caps: self, it: self.locs.iter() }
|
||||
}
|
||||
|
||||
/// Expands all instances of `$name` in `replacement` to the corresponding
|
||||
@@ -1011,7 +1004,8 @@ impl<'t> Index<usize> for Captures<'t> {
|
||||
type Output = [u8];
|
||||
|
||||
fn index(&self, i: usize) -> &[u8] {
|
||||
self.get(i).map(|m| m.as_bytes())
|
||||
self.get(i)
|
||||
.map(|m| m.as_bytes())
|
||||
.unwrap_or_else(|| panic!("no group at index '{}'", i))
|
||||
}
|
||||
}
|
||||
@@ -1032,7 +1026,8 @@ impl<'t, 'i> Index<&'i str> for Captures<'t> {
|
||||
type Output = [u8];
|
||||
|
||||
fn index<'a>(&'a self, name: &'i str) -> &'a [u8] {
|
||||
self.name(name).map(|m| m.as_bytes())
|
||||
self.name(name)
|
||||
.map(|m| m.as_bytes())
|
||||
.unwrap_or_else(|| panic!("no group named '{}'", name))
|
||||
}
|
||||
}
|
||||
@@ -1055,7 +1050,8 @@ impl<'c, 't> Iterator for SubCaptureMatches<'c, 't> {
|
||||
type Item = Option<Match<'t>>;
|
||||
|
||||
fn next(&mut self) -> Option<Option<Match<'t>>> {
|
||||
self.it.next()
|
||||
self.it
|
||||
.next()
|
||||
.map(|cap| cap.map(|(s, e)| Match::new(self.caps.text, s, e)))
|
||||
}
|
||||
}
|
||||
@@ -1142,7 +1138,11 @@ impl<'a> Replacer for &'a [u8] {
|
||||
}
|
||||
}
|
||||
|
||||
impl<F, T> Replacer for F where F: FnMut(&Captures) -> T, T: AsRef<[u8]> {
|
||||
impl<F, T> Replacer for F
|
||||
where
|
||||
F: FnMut(&Captures) -> T,
|
||||
T: AsRef<[u8]>,
|
||||
{
|
||||
fn replace_append(&mut self, caps: &Captures, dst: &mut Vec<u8>) {
|
||||
dst.extend_from_slice((*self)(caps).as_ref());
|
||||
}
|
||||
|
||||
+5
-5
@@ -219,9 +219,9 @@ impl RegexSet {
|
||||
|
||||
/// Returns the patterns that this set will match on.
|
||||
///
|
||||
/// This function can be used to determine the pattern for a match. The
|
||||
/// slice returned has exactly as many patterns givens to this regex set,
|
||||
/// and the order of the slice is the same as the order of the patterns
|
||||
/// This function can be used to determine the pattern for a match. The
|
||||
/// slice returned has exactly as many patterns givens to this regex set,
|
||||
/// and the order of the slice is the same as the order of the patterns
|
||||
/// provided to the set.
|
||||
///
|
||||
/// # Example
|
||||
@@ -328,7 +328,7 @@ impl Iterator for SetMatchesIntoIter {
|
||||
}
|
||||
|
||||
fn size_hint(&self) -> (usize, Option<usize>) {
|
||||
self.0.size_hint()
|
||||
self.0.size_hint()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -368,7 +368,7 @@ impl<'a> Iterator for SetMatchesIter<'a> {
|
||||
}
|
||||
|
||||
fn size_hint(&self) -> (usize, Option<usize>) {
|
||||
self.0.size_hint()
|
||||
self.0.size_hint()
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
+32
-29
@@ -75,9 +75,7 @@ impl<'c> Iterator for SubCapturesPosIter<'c> {
|
||||
}
|
||||
let x = match self.locs.pos(self.idx) {
|
||||
None => Some(None),
|
||||
Some((s, e)) => {
|
||||
Some(Some((s, e)))
|
||||
}
|
||||
Some((s, e)) => Some(Some((s, e))),
|
||||
};
|
||||
self.idx += 1;
|
||||
x
|
||||
@@ -124,11 +122,7 @@ pub trait RegularExpression: Sized {
|
||||
) -> Option<usize>;
|
||||
|
||||
/// Returns whether the regex matches the text given.
|
||||
fn is_match_at(
|
||||
&self,
|
||||
text: &Self::Text,
|
||||
start: usize,
|
||||
) -> bool;
|
||||
fn is_match_at(&self, text: &Self::Text, start: usize) -> bool;
|
||||
|
||||
/// Returns the leftmost-first match location if one exists.
|
||||
fn find_at(
|
||||
@@ -148,37 +142,34 @@ pub trait RegularExpression: Sized {
|
||||
|
||||
/// Returns an iterator over all non-overlapping successive leftmost-first
|
||||
/// matches.
|
||||
fn find_iter (
|
||||
self,
|
||||
text: &Self::Text,
|
||||
) -> Matches<Self> {
|
||||
Matches {
|
||||
re: self,
|
||||
text: text,
|
||||
last_end: 0,
|
||||
last_match: None,
|
||||
}
|
||||
fn find_iter(self, text: &Self::Text) -> Matches<Self> {
|
||||
Matches { re: self, text: text, last_end: 0, last_match: None }
|
||||
}
|
||||
|
||||
/// Returns an iterator over all non-overlapping successive leftmost-first
|
||||
/// matches with captures.
|
||||
fn captures_iter(
|
||||
self,
|
||||
text: &Self::Text,
|
||||
) -> CaptureMatches<Self> {
|
||||
fn captures_iter(self, text: &Self::Text) -> CaptureMatches<Self> {
|
||||
CaptureMatches(self.find_iter(text))
|
||||
}
|
||||
}
|
||||
|
||||
/// An iterator over all non-overlapping successive leftmost-first matches.
|
||||
pub struct Matches<'t, R> where R: RegularExpression, R::Text: 't {
|
||||
pub struct Matches<'t, R>
|
||||
where
|
||||
R: RegularExpression,
|
||||
R::Text: 't,
|
||||
{
|
||||
re: R,
|
||||
text: &'t R::Text,
|
||||
last_end: usize,
|
||||
last_match: Option<usize>,
|
||||
}
|
||||
|
||||
impl<'t, R> Matches<'t, R> where R: RegularExpression, R::Text: 't {
|
||||
impl<'t, R> Matches<'t, R>
|
||||
where
|
||||
R: RegularExpression,
|
||||
R::Text: 't,
|
||||
{
|
||||
/// Return the text being searched.
|
||||
pub fn text(&self) -> &'t R::Text {
|
||||
self.text
|
||||
@@ -191,7 +182,10 @@ impl<'t, R> Matches<'t, R> where R: RegularExpression, R::Text: 't {
|
||||
}
|
||||
|
||||
impl<'t, R> Iterator for Matches<'t, R>
|
||||
where R: RegularExpression, R::Text: 't + AsRef<[u8]> {
|
||||
where
|
||||
R: RegularExpression,
|
||||
R::Text: 't + AsRef<[u8]>,
|
||||
{
|
||||
type Item = (usize, usize);
|
||||
|
||||
fn next(&mut self) -> Option<(usize, usize)> {
|
||||
@@ -223,9 +217,15 @@ impl<'t, R> Iterator for Matches<'t, R>
|
||||
/// An iterator over all non-overlapping successive leftmost-first matches with
|
||||
/// captures.
|
||||
pub struct CaptureMatches<'t, R>(Matches<'t, R>)
|
||||
where R: RegularExpression, R::Text: 't;
|
||||
where
|
||||
R: RegularExpression,
|
||||
R::Text: 't;
|
||||
|
||||
impl<'t, R> CaptureMatches<'t, R> where R: RegularExpression, R::Text: 't {
|
||||
impl<'t, R> CaptureMatches<'t, R>
|
||||
where
|
||||
R: RegularExpression,
|
||||
R::Text: 't,
|
||||
{
|
||||
/// Return the text being searched.
|
||||
pub fn text(&self) -> &'t R::Text {
|
||||
self.0.text()
|
||||
@@ -238,12 +238,15 @@ impl<'t, R> CaptureMatches<'t, R> where R: RegularExpression, R::Text: 't {
|
||||
}
|
||||
|
||||
impl<'t, R> Iterator for CaptureMatches<'t, R>
|
||||
where R: RegularExpression, R::Text: 't + AsRef<[u8]> {
|
||||
where
|
||||
R: RegularExpression,
|
||||
R::Text: 't + AsRef<[u8]>,
|
||||
{
|
||||
type Item = Locations;
|
||||
|
||||
fn next(&mut self) -> Option<Locations> {
|
||||
if self.0.last_end > self.0.text.as_ref().len() {
|
||||
return None
|
||||
return None;
|
||||
}
|
||||
let mut locs = self.0.re.locations();
|
||||
let (s, e) = match self.0.re.captures_read_at(
|
||||
|
||||
+30
-30
@@ -64,11 +64,7 @@ impl<'t> Match<'t> {
|
||||
/// Creates a new match from the given haystack and byte offsets.
|
||||
#[inline]
|
||||
fn new(haystack: &'t str, start: usize, end: usize) -> Match<'t> {
|
||||
Match {
|
||||
text: haystack,
|
||||
start: start,
|
||||
end: end,
|
||||
}
|
||||
Match { text: haystack, start: start, end: end }
|
||||
}
|
||||
}
|
||||
|
||||
@@ -368,10 +364,7 @@ impl Regex {
|
||||
/// # }
|
||||
/// ```
|
||||
pub fn split<'r, 't>(&'r self, text: &'t str) -> Split<'r, 't> {
|
||||
Split {
|
||||
finder: self.find_iter(text),
|
||||
last: 0,
|
||||
}
|
||||
Split { finder: self.find_iter(text), last: 0 }
|
||||
}
|
||||
|
||||
/// Returns an iterator of at most `limit` substrings of `text` delimited
|
||||
@@ -394,12 +387,12 @@ impl Regex {
|
||||
/// assert_eq!(fields, vec!("Hey", "How", "are you?"));
|
||||
/// # }
|
||||
/// ```
|
||||
pub fn splitn<'r, 't>(&'r self, text: &'t str, limit: usize)
|
||||
-> SplitN<'r, 't> {
|
||||
SplitN {
|
||||
splits: self.split(text),
|
||||
n: limit,
|
||||
}
|
||||
pub fn splitn<'r, 't>(
|
||||
&'r self,
|
||||
text: &'t str,
|
||||
limit: usize,
|
||||
) -> SplitN<'r, 't> {
|
||||
SplitN { splits: self.split(text), n: limit }
|
||||
}
|
||||
|
||||
/// Replaces the leftmost-first match with the replacement provided.
|
||||
@@ -558,7 +551,7 @@ impl Regex {
|
||||
let mut last_match = 0;
|
||||
for (i, m) in it {
|
||||
if limit > 0 && i >= limit {
|
||||
break
|
||||
break;
|
||||
}
|
||||
new.push_str(&text[last_match..m.start()]);
|
||||
new.push_str(&rep);
|
||||
@@ -578,7 +571,7 @@ impl Regex {
|
||||
let mut last_match = 0;
|
||||
for (i, cap) in it {
|
||||
if limit > 0 && i >= limit {
|
||||
break
|
||||
break;
|
||||
}
|
||||
// unwrap on 0 is OK because captures only reports matches
|
||||
let m = cap.get(0).unwrap();
|
||||
@@ -653,9 +646,10 @@ impl Regex {
|
||||
text: &'t str,
|
||||
start: usize,
|
||||
) -> Option<Match<'t>> {
|
||||
self.0.searcher_str().find_at(text, start).map(|(s, e)| {
|
||||
Match::new(text, s, e)
|
||||
})
|
||||
self.0
|
||||
.searcher_str()
|
||||
.find_at(text, start)
|
||||
.map(|(s, e)| Match::new(text, s, e))
|
||||
}
|
||||
|
||||
/// This is like `captures`, but uses
|
||||
@@ -815,7 +809,7 @@ impl<'r, 't> Iterator for SplitN<'r, 't> {
|
||||
|
||||
fn next(&mut self) -> Option<&'t str> {
|
||||
if self.n == 0 {
|
||||
return None
|
||||
return None;
|
||||
}
|
||||
self.n -= 1;
|
||||
if self.n == 0 {
|
||||
@@ -937,10 +931,7 @@ impl<'t> Captures<'t> {
|
||||
///
|
||||
/// The first match always corresponds to the overall match of the regex.
|
||||
pub fn iter<'c>(&'c self) -> SubCaptureMatches<'c, 't> {
|
||||
SubCaptureMatches {
|
||||
caps: self,
|
||||
it: self.locs.iter(),
|
||||
}
|
||||
SubCaptureMatches { caps: self, it: self.locs.iter() }
|
||||
}
|
||||
|
||||
/// Expands all instances of `$name` in `replacement` to the corresponding
|
||||
@@ -1015,7 +1006,8 @@ impl<'t> Index<usize> for Captures<'t> {
|
||||
type Output = str;
|
||||
|
||||
fn index(&self, i: usize) -> &str {
|
||||
self.get(i).map(|m| m.as_str())
|
||||
self.get(i)
|
||||
.map(|m| m.as_str())
|
||||
.unwrap_or_else(|| panic!("no group at index '{}'", i))
|
||||
}
|
||||
}
|
||||
@@ -1036,7 +1028,8 @@ impl<'t, 'i> Index<&'i str> for Captures<'t> {
|
||||
type Output = str;
|
||||
|
||||
fn index<'a>(&'a self, name: &'i str) -> &'a str {
|
||||
self.name(name).map(|m| m.as_str())
|
||||
self.name(name)
|
||||
.map(|m| m.as_str())
|
||||
.unwrap_or_else(|| panic!("no group named '{}'", name))
|
||||
}
|
||||
}
|
||||
@@ -1059,7 +1052,8 @@ impl<'c, 't> Iterator for SubCaptureMatches<'c, 't> {
|
||||
type Item = Option<Match<'t>>;
|
||||
|
||||
fn next(&mut self) -> Option<Option<Match<'t>>> {
|
||||
self.it.next()
|
||||
self.it
|
||||
.next()
|
||||
.map(|cap| cap.map(|(s, e)| Match::new(self.caps.text, s, e)))
|
||||
}
|
||||
}
|
||||
@@ -1071,7 +1065,9 @@ impl<'c, 't> Iterator for SubCaptureMatches<'c, 't> {
|
||||
///
|
||||
/// `'r` is the lifetime of the compiled regular expression and `'t` is the
|
||||
/// lifetime of the matched string.
|
||||
pub struct CaptureMatches<'r, 't>(re_trait::CaptureMatches<'t, ExecNoSyncStr<'r>>);
|
||||
pub struct CaptureMatches<'r, 't>(
|
||||
re_trait::CaptureMatches<'t, ExecNoSyncStr<'r>>,
|
||||
);
|
||||
|
||||
impl<'r, 't> Iterator for CaptureMatches<'r, 't> {
|
||||
type Item = Captures<'t>;
|
||||
@@ -1184,7 +1180,11 @@ impl<'a> Replacer for &'a str {
|
||||
}
|
||||
}
|
||||
|
||||
impl<F, T> Replacer for F where F: FnMut(&Captures) -> T, T: AsRef<str> {
|
||||
impl<F, T> Replacer for F
|
||||
where
|
||||
F: FnMut(&Captures) -> T,
|
||||
T: AsRef<str>,
|
||||
{
|
||||
fn replace_append(&mut self, caps: &Captures, dst: &mut String) {
|
||||
dst.push_str((*self)(caps).as_ref());
|
||||
}
|
||||
|
||||
+3
-1
@@ -71,5 +71,7 @@ impl Deref for SparseSet {
|
||||
impl<'a> IntoIterator for &'a SparseSet {
|
||||
type Item = &'a usize;
|
||||
type IntoIter = slice::Iter<'a, usize>;
|
||||
fn into_iter(self) -> Self::IntoIter { self.iter() }
|
||||
fn into_iter(self) -> Self::IntoIter {
|
||||
self.iter()
|
||||
}
|
||||
}
|
||||
|
||||
+35
-30
@@ -11,7 +11,6 @@
|
||||
/// Should this be factored out into a separate crate? It seems independently
|
||||
/// useful. There are other crates that already exist (e.g., `utf-8`) that have
|
||||
/// overlapping use cases. Not sure what to do.
|
||||
|
||||
use std::char;
|
||||
|
||||
const TAG_CONT: u8 = 0b1000_0000;
|
||||
@@ -56,7 +55,7 @@ pub fn decode_utf8(src: &[u8]) -> Option<(char, usize)> {
|
||||
Some(&b) => b,
|
||||
};
|
||||
match b0 {
|
||||
0b110_00000 ..= 0b110_11111 => {
|
||||
0b110_00000..=0b110_11111 => {
|
||||
if src.len() < 2 {
|
||||
return None;
|
||||
}
|
||||
@@ -64,14 +63,13 @@ pub fn decode_utf8(src: &[u8]) -> Option<(char, usize)> {
|
||||
if 0b11_000000 & b1 != TAG_CONT {
|
||||
return None;
|
||||
}
|
||||
let cp = ((b0 & !TAG_TWO) as u32) << 6
|
||||
| ((b1 & !TAG_CONT) as u32);
|
||||
let cp = ((b0 & !TAG_TWO) as u32) << 6 | ((b1 & !TAG_CONT) as u32);
|
||||
match cp {
|
||||
0x80 ..= 0x7FF => char::from_u32(cp).map(|cp| (cp, 2)),
|
||||
0x80..=0x7FF => char::from_u32(cp).map(|cp| (cp, 2)),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
0b1110_0000 ..= 0b1110_1111 => {
|
||||
0b1110_0000..=0b1110_1111 => {
|
||||
if src.len() < 3 {
|
||||
return None;
|
||||
}
|
||||
@@ -83,15 +81,15 @@ pub fn decode_utf8(src: &[u8]) -> Option<(char, usize)> {
|
||||
return None;
|
||||
}
|
||||
let cp = ((b0 & !TAG_THREE) as u32) << 12
|
||||
| ((b1 & !TAG_CONT) as u32) << 6
|
||||
| ((b2 & !TAG_CONT) as u32);
|
||||
| ((b1 & !TAG_CONT) as u32) << 6
|
||||
| ((b2 & !TAG_CONT) as u32);
|
||||
match cp {
|
||||
// char::from_u32 will disallow surrogate codepoints.
|
||||
0x800 ..= 0xFFFF => char::from_u32(cp).map(|cp| (cp, 3)),
|
||||
0x800..=0xFFFF => char::from_u32(cp).map(|cp| (cp, 3)),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
0b11110_000 ..= 0b11110_111 => {
|
||||
0b11110_000..=0b11110_111 => {
|
||||
if src.len() < 4 {
|
||||
return None;
|
||||
}
|
||||
@@ -106,11 +104,11 @@ pub fn decode_utf8(src: &[u8]) -> Option<(char, usize)> {
|
||||
return None;
|
||||
}
|
||||
let cp = ((b0 & !TAG_FOUR) as u32) << 18
|
||||
| ((b1 & !TAG_CONT) as u32) << 12
|
||||
| ((b2 & !TAG_CONT) as u32) << 6
|
||||
| ((b3 & !TAG_CONT) as u32);
|
||||
| ((b1 & !TAG_CONT) as u32) << 12
|
||||
| ((b2 & !TAG_CONT) as u32) << 6
|
||||
| ((b3 & !TAG_CONT) as u32);
|
||||
match cp {
|
||||
0x10000 ..= 0x10FFFF => char::from_u32(cp).map(|cp| (cp, 4)),
|
||||
0x10000..=0x10FFFF => char::from_u32(cp).map(|cp| (cp, 4)),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
@@ -152,8 +150,7 @@ mod tests {
|
||||
use quickcheck::quickcheck;
|
||||
|
||||
use super::{
|
||||
TAG_CONT, TAG_TWO, TAG_THREE, TAG_FOUR,
|
||||
decode_utf8, decode_last_utf8,
|
||||
decode_last_utf8, decode_utf8, TAG_CONT, TAG_FOUR, TAG_THREE, TAG_TWO,
|
||||
};
|
||||
|
||||
#[test]
|
||||
@@ -209,9 +206,12 @@ mod tests {
|
||||
let mut tmp = [0; 4];
|
||||
let n = given_cp.encode_utf8(&mut tmp).len();
|
||||
let (got_cp, _) = decode_last_utf8(&tmp[..n]).unwrap();
|
||||
let expected_cp =
|
||||
str::from_utf8(&tmp[..n]).unwrap()
|
||||
.chars().rev().next().unwrap();
|
||||
let expected_cp = str::from_utf8(&tmp[..n])
|
||||
.unwrap()
|
||||
.chars()
|
||||
.rev()
|
||||
.next()
|
||||
.unwrap();
|
||||
got_cp == expected_cp
|
||||
}
|
||||
quickcheck(p as fn(char) -> bool)
|
||||
@@ -229,12 +229,13 @@ mod tests {
|
||||
assert_eq!(decode_utf8(&[0xC3]), None); // 2 bytes
|
||||
assert_eq!(decode_utf8(&[0xEF, 0xBF]), None); // 3 bytes
|
||||
assert_eq!(decode_utf8(&[0xF4, 0x8F, 0xBF]), None); // 4 bytes
|
||||
// Not a minimal UTF-8 sequence
|
||||
// Not a minimal UTF-8 sequence
|
||||
assert_eq!(decode_utf8(&[TAG_TWO, TAG_CONT | b'a']), None);
|
||||
assert_eq!(decode_utf8(&[TAG_THREE, TAG_CONT, TAG_CONT | b'a']), None);
|
||||
assert_eq!(decode_utf8(&[
|
||||
TAG_FOUR, TAG_CONT, TAG_CONT, TAG_CONT | b'a',
|
||||
]), None);
|
||||
assert_eq!(
|
||||
decode_utf8(&[TAG_FOUR, TAG_CONT, TAG_CONT, TAG_CONT | b'a',]),
|
||||
None
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -247,13 +248,17 @@ mod tests {
|
||||
assert_eq!(decode_last_utf8(&[0xC3]), None); // 2 bytes
|
||||
assert_eq!(decode_last_utf8(&[0xEF, 0xBF]), None); // 3 bytes
|
||||
assert_eq!(decode_last_utf8(&[0xF4, 0x8F, 0xBF]), None); // 4 bytes
|
||||
// Not a minimal UTF-8 sequence
|
||||
// Not a minimal UTF-8 sequence
|
||||
assert_eq!(decode_last_utf8(&[TAG_TWO, TAG_CONT | b'a']), None);
|
||||
assert_eq!(decode_last_utf8(&[
|
||||
TAG_THREE, TAG_CONT, TAG_CONT | b'a',
|
||||
]), None);
|
||||
assert_eq!(decode_last_utf8(&[
|
||||
TAG_FOUR, TAG_CONT, TAG_CONT, TAG_CONT | b'a',
|
||||
]), None);
|
||||
assert_eq!(
|
||||
decode_last_utf8(&[TAG_THREE, TAG_CONT, TAG_CONT | b'a',]),
|
||||
None
|
||||
);
|
||||
assert_eq!(
|
||||
decode_last_utf8(
|
||||
&[TAG_FOUR, TAG_CONT, TAG_CONT, TAG_CONT | b'a',]
|
||||
),
|
||||
None
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
+3
-7
@@ -59,7 +59,7 @@ impl AVX2VectorBuilder {
|
||||
#[allow(non_camel_case_types)]
|
||||
#[repr(transparent)]
|
||||
pub struct u8x32 {
|
||||
vector: __m256i
|
||||
vector: __m256i,
|
||||
}
|
||||
|
||||
impl u8x32 {
|
||||
@@ -122,9 +122,7 @@ impl u8x32 {
|
||||
#[inline]
|
||||
pub fn movemask(self) -> u32 {
|
||||
// Safe because we know AVX2 is enabled.
|
||||
unsafe {
|
||||
_mm256_movemask_epi8(self.vector) as u32
|
||||
}
|
||||
unsafe { _mm256_movemask_epi8(self.vector) as u32 }
|
||||
}
|
||||
|
||||
#[inline]
|
||||
@@ -162,9 +160,7 @@ impl u8x32 {
|
||||
#[inline]
|
||||
pub fn bit_shift_right_4(self) -> u8x32 {
|
||||
// Safe because we know AVX2 is enabled.
|
||||
unsafe {
|
||||
u8x32 { vector: _mm256_srli_epi16(self.vector, 4) }
|
||||
}
|
||||
unsafe { u8x32 { vector: _mm256_srli_epi16(self.vector, 4) } }
|
||||
}
|
||||
|
||||
#[inline]
|
||||
|
||||
+4
-10
@@ -80,7 +80,7 @@ impl SSSE3VectorBuilder {
|
||||
#[allow(non_camel_case_types)]
|
||||
#[repr(transparent)]
|
||||
pub struct u8x16 {
|
||||
vector: __m128i
|
||||
vector: __m128i,
|
||||
}
|
||||
|
||||
impl u8x16 {
|
||||
@@ -135,17 +135,13 @@ impl u8x16 {
|
||||
#[inline]
|
||||
pub fn and(self, other: u8x16) -> u8x16 {
|
||||
// Safe because we know SSSE3 is enabled.
|
||||
unsafe {
|
||||
u8x16 { vector: _mm_and_si128(self.vector, other.vector) }
|
||||
}
|
||||
unsafe { u8x16 { vector: _mm_and_si128(self.vector, other.vector) } }
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn movemask(self) -> u32 {
|
||||
// Safe because we know SSSE3 is enabled.
|
||||
unsafe {
|
||||
_mm_movemask_epi8(self.vector) as u32
|
||||
}
|
||||
unsafe { _mm_movemask_epi8(self.vector) as u32 }
|
||||
}
|
||||
|
||||
#[inline]
|
||||
@@ -167,9 +163,7 @@ impl u8x16 {
|
||||
#[inline]
|
||||
pub fn bit_shift_right_4(self) -> u8x16 {
|
||||
// Safe because we know SSSE3 is enabled.
|
||||
unsafe {
|
||||
u8x16 { vector: _mm_srli_epi16(self.vector, 4) }
|
||||
}
|
||||
unsafe { u8x16 { vector: _mm_srli_epi16(self.vector, 4) } }
|
||||
}
|
||||
|
||||
#[inline]
|
||||
|
||||
+54
-26
@@ -19,15 +19,19 @@ fn one_zero_length_match() {
|
||||
#[test]
|
||||
fn many_zero_length_match() {
|
||||
let re = regex!(r"\d*");
|
||||
assert_eq!(vec![(0, 0), (1, 2), (3, 3), (4, 4), (5, 6)],
|
||||
findall!(re, "a1bbb2"));
|
||||
assert_eq!(
|
||||
vec![(0, 0), (1, 2), (3, 3), (4, 4), (5, 6)],
|
||||
findall!(re, "a1bbb2")
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn many_sequential_zero_length_match() {
|
||||
let re = regex!(r"\d?");
|
||||
assert_eq!(vec![(0, 0), (1, 2), (2, 3), (4, 5), (6, 6)],
|
||||
findall!(re, "a12b3c"));
|
||||
assert_eq!(
|
||||
vec![(0, 0), (1, 2), (2, 3), (4, 5), (6, 6)],
|
||||
findall!(re, "a12b3c")
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -59,10 +63,11 @@ fn empty_match_find_iter() {
|
||||
#[test]
|
||||
fn empty_match_captures_iter() {
|
||||
let re = regex!(r".*?");
|
||||
let ms: Vec<_> = re.captures_iter(text!("abc"))
|
||||
.map(|c| c.get(0).unwrap())
|
||||
.map(|m| (m.start(), m.end()))
|
||||
.collect();
|
||||
let ms: Vec<_> = re
|
||||
.captures_iter(text!("abc"))
|
||||
.map(|c| c.get(0).unwrap())
|
||||
.map(|m| (m.start(), m.end()))
|
||||
.collect();
|
||||
assert_eq!(ms, vec![(0, 0), (1, 1), (2, 2), (3, 3)]);
|
||||
}
|
||||
|
||||
@@ -71,8 +76,10 @@ fn capture_names() {
|
||||
let re = regex!(r"(.)(?P<a>.)");
|
||||
assert_eq!(3, re.captures_len());
|
||||
assert_eq!((3, Some(3)), re.capture_names().size_hint());
|
||||
assert_eq!(vec![None, None, Some("a")],
|
||||
re.capture_names().collect::<Vec<_>>());
|
||||
assert_eq!(
|
||||
vec![None, None, Some("a")],
|
||||
re.capture_names().collect::<Vec<_>>()
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -128,9 +135,15 @@ fn capture_misc() {
|
||||
|
||||
assert_eq!(5, cap.len());
|
||||
|
||||
assert_eq!((0, 3), { let m = cap.get(0).unwrap(); (m.start(), m.end()) });
|
||||
assert_eq!((0, 3), {
|
||||
let m = cap.get(0).unwrap();
|
||||
(m.start(), m.end())
|
||||
});
|
||||
assert_eq!(None, cap.get(2));
|
||||
assert_eq!((2, 3), { let m = cap.get(4).unwrap(); (m.start(), m.end()) });
|
||||
assert_eq!((2, 3), {
|
||||
let m = cap.get(4).unwrap();
|
||||
(m.start(), m.end())
|
||||
});
|
||||
|
||||
assert_eq!(t!("abc"), match_text!(cap.get(0).unwrap()));
|
||||
assert_eq!(None, cap.get(2));
|
||||
@@ -164,19 +177,34 @@ expand!(expand2, r"(?P<foo>\w+)", "abc", "$0", "abc");
|
||||
expand!(expand3, r"(?P<foo>\w+)", "abc", "$1", "abc");
|
||||
expand!(expand4, r"(?P<foo>\w+)", "abc", "$$1", "$1");
|
||||
expand!(expand5, r"(?P<foo>\w+)", "abc", "$$foo", "$foo");
|
||||
expand!(expand6, r"(?P<a>\w+)\s+(?P<b>\d+)",
|
||||
"abc 123", "$b$a", "123abc");
|
||||
expand!(expand7, r"(?P<a>\w+)\s+(?P<b>\d+)",
|
||||
"abc 123", "z$bz$az", "z");
|
||||
expand!(expand8, r"(?P<a>\w+)\s+(?P<b>\d+)",
|
||||
"abc 123", ".$b.$a.", ".123.abc.");
|
||||
expand!(expand9, r"(?P<a>\w+)\s+(?P<b>\d+)",
|
||||
"abc 123", " $b $a ", " 123 abc ");
|
||||
expand!(expand10, r"(?P<a>\w+)\s+(?P<b>\d+)",
|
||||
"abc 123", "$bz$az", "");
|
||||
expand!(expand6, r"(?P<a>\w+)\s+(?P<b>\d+)", "abc 123", "$b$a", "123abc");
|
||||
expand!(expand7, r"(?P<a>\w+)\s+(?P<b>\d+)", "abc 123", "z$bz$az", "z");
|
||||
expand!(
|
||||
expand8,
|
||||
r"(?P<a>\w+)\s+(?P<b>\d+)",
|
||||
"abc 123",
|
||||
".$b.$a.",
|
||||
".123.abc."
|
||||
);
|
||||
expand!(
|
||||
expand9,
|
||||
r"(?P<a>\w+)\s+(?P<b>\d+)",
|
||||
"abc 123",
|
||||
" $b $a ",
|
||||
" 123 abc "
|
||||
);
|
||||
expand!(expand10, r"(?P<a>\w+)\s+(?P<b>\d+)", "abc 123", "$bz$az", "");
|
||||
|
||||
split!(split1, r"\s+", "a b\nc\td\n\t e",
|
||||
&[t!("a"), t!("b"), t!("c"), t!("d"), t!("e")]);
|
||||
split!(split2, r"\b", "a b c",
|
||||
&[t!(""), t!("a"), t!(" "), t!("b"), t!(" "), t!("c")]);
|
||||
split!(
|
||||
split1,
|
||||
r"\s+",
|
||||
"a b\nc\td\n\t e",
|
||||
&[t!("a"), t!("b"), t!("c"), t!("d"), t!("e")]
|
||||
);
|
||||
split!(
|
||||
split2,
|
||||
r"\b",
|
||||
"a b c",
|
||||
&[t!(""), t!("a"), t!(" "), t!("b"), t!(" "), t!("c")]
|
||||
);
|
||||
split!(split3, r"a$", "a", &[t!("")]);
|
||||
|
||||
+9
-6
@@ -6,18 +6,21 @@ fn empty_match_unicode_find_iter() {
|
||||
// Tests that we still yield byte ranges at valid UTF-8 sequence boundaries
|
||||
// even when we're susceptible to empty width matches.
|
||||
let re = regex!(r".*?");
|
||||
assert_eq!(vec![(0, 0), (3, 3), (4, 4), (7, 7), (8, 8)],
|
||||
findall!(re, "Ⅰ1Ⅱ2"));
|
||||
assert_eq!(
|
||||
vec![(0, 0), (3, 3), (4, 4), (7, 7), (8, 8)],
|
||||
findall!(re, "Ⅰ1Ⅱ2")
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn empty_match_unicode_captures_iter() {
|
||||
// Same as empty_match_unicode_find_iter, but tests capture iteration.
|
||||
let re = regex!(r".*?");
|
||||
let ms: Vec<_> = re.captures_iter(text!("Ⅰ1Ⅱ2"))
|
||||
.map(|c| c.get(0).unwrap())
|
||||
.map(|m| (m.start(), m.end()))
|
||||
.collect();
|
||||
let ms: Vec<_> = re
|
||||
.captures_iter(text!("Ⅰ1Ⅱ2"))
|
||||
.map(|c| c.get(0).unwrap())
|
||||
.map(|m| (m.start(), m.end()))
|
||||
.collect();
|
||||
assert_eq!(vec![(0, 0), (3, 3), (4, 4), (7, 7), (8, 8)], ms);
|
||||
}
|
||||
|
||||
|
||||
+42
-15
@@ -3,7 +3,11 @@
|
||||
|
||||
// A silly wrapper to make it possible to write and match raw bytes.
|
||||
struct R<'a>(&'a [u8]);
|
||||
impl<'a> R<'a> { fn as_bytes(&self) -> &'a [u8] { self.0 } }
|
||||
impl<'a> R<'a> {
|
||||
fn as_bytes(&self) -> &'a [u8] {
|
||||
self.0
|
||||
}
|
||||
}
|
||||
|
||||
mat!(word_boundary, r"(?-u) \b", " δ", None);
|
||||
mat!(word_boundary_unicode, r" \b", " δ", Some((0, 1)));
|
||||
@@ -20,8 +24,14 @@ mat!(perl_s_unicode, r"\s+", " \u{1680}", Some((0, 4)));
|
||||
// The first `(.+)` matches two Unicode codepoints, but can't match the 5th
|
||||
// byte, which isn't valid UTF-8. The second (byte based) `(.+)` takes over and
|
||||
// matches.
|
||||
mat!(mixed1, r"(.+)(?-u)(.+)", R(b"\xCE\x93\xCE\x94\xFF"),
|
||||
Some((0, 5)), Some((0, 4)), Some((4, 5)));
|
||||
mat!(
|
||||
mixed1,
|
||||
r"(.+)(?-u)(.+)",
|
||||
R(b"\xCE\x93\xCE\x94\xFF"),
|
||||
Some((0, 5)),
|
||||
Some((0, 4)),
|
||||
Some((4, 5))
|
||||
);
|
||||
|
||||
mat!(case_ascii_one, r"(?i-u)a", "A", Some((0, 1)));
|
||||
mat!(case_ascii_class, r"(?i-u)[a-z]+", "AaAaA", Some((0, 5)));
|
||||
@@ -37,32 +47,49 @@ mat!(dotstar_prefix_not_unicode1, r"(?-u)a", R(b"\xFFa"), Some((1, 2)));
|
||||
mat!(dotstar_prefix_not_unicode2, r"a", R(b"\xFFa"), Some((1, 2)));
|
||||
|
||||
// Have fun with null bytes.
|
||||
mat!(null_bytes, r"(?-u)(?P<cstr>[^\x00]+)\x00",
|
||||
R(b"foo\x00"), Some((0, 4)), Some((0, 3)));
|
||||
mat!(
|
||||
null_bytes,
|
||||
r"(?-u)(?P<cstr>[^\x00]+)\x00",
|
||||
R(b"foo\x00"),
|
||||
Some((0, 4)),
|
||||
Some((0, 3))
|
||||
);
|
||||
|
||||
// Test that lookahead operators work properly in the face of invalid UTF-8.
|
||||
// See: https://github.com/rust-lang/regex/issues/277
|
||||
matiter!(invalidutf8_anchor1,
|
||||
r"(?-u)\xcc?^",
|
||||
R(b"\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4"),
|
||||
(0, 0));
|
||||
matiter!(
|
||||
invalidutf8_anchor1,
|
||||
r"(?-u)\xcc?^",
|
||||
R(b"\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4"),
|
||||
(0, 0)
|
||||
);
|
||||
matiter!(invalidutf8_anchor2,
|
||||
r"(?-u)^\xf7|4\xff\d\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a##########[] d\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a##########\[] #####\x80\S7|$",
|
||||
R(b"\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4"),
|
||||
(22, 22));
|
||||
matiter!(invalidutf8_anchor3,
|
||||
r"(?-u)^|ddp\xff\xffdddddlQd@\x80",
|
||||
R(b"\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4"),
|
||||
(0, 0));
|
||||
matiter!(
|
||||
invalidutf8_anchor3,
|
||||
r"(?-u)^|ddp\xff\xffdddddlQd@\x80",
|
||||
R(b"\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4"),
|
||||
(0, 0)
|
||||
);
|
||||
|
||||
// See https://github.com/rust-lang/regex/issues/303
|
||||
#[test]
|
||||
fn negated_full_byte_range() {
|
||||
assert!(::regex::bytes::Regex::new(r#"(?-u)[^\x00-\xff]"#).is_err());
|
||||
assert!(::regex::bytes::Regex::new(r#"(?-u)[^\x00-\xff]"#).is_err());
|
||||
}
|
||||
|
||||
matiter!(word_boundary_ascii1, r"(?-u:\B)x(?-u:\B)", "áxβ");
|
||||
matiter!(word_boundary_ascii2, r"(?-u:\B)", "0\u{7EF5E}", (2, 2), (3, 3), (4, 4), (5, 5));
|
||||
matiter!(
|
||||
word_boundary_ascii2,
|
||||
r"(?-u:\B)",
|
||||
"0\u{7EF5E}",
|
||||
(2, 2),
|
||||
(3, 3),
|
||||
(4, 4),
|
||||
(5, 5)
|
||||
);
|
||||
|
||||
// See: https://github.com/rust-lang/regex/issues/264
|
||||
mat!(ascii_boundary_no_capture, r"(?-u)\B", "\u{28f3e}", Some((0, 0)));
|
||||
|
||||
+182
-153
@@ -19,80 +19,92 @@ use regex::internal::ExecBuilder;
|
||||
/// and shrinking and whatnot.
|
||||
pub fn backends_are_consistent(re: &str) -> Result<u64, String> {
|
||||
let standard_backends = vec![
|
||||
("bounded_backtracking_re",
|
||||
(
|
||||
"bounded_backtracking_re",
|
||||
ExecBuilder::new(re)
|
||||
.bounded_backtracking()
|
||||
.build()
|
||||
.map(|exec| exec.into_regex())
|
||||
.map_err(|err| format!("{}", err))?),
|
||||
|
||||
("pikevm_re",
|
||||
.bounded_backtracking()
|
||||
.build()
|
||||
.map(|exec| exec.into_regex())
|
||||
.map_err(|err| format!("{}", err))?,
|
||||
),
|
||||
(
|
||||
"pikevm_re",
|
||||
ExecBuilder::new(re)
|
||||
.nfa()
|
||||
.build()
|
||||
.map(|exec| exec.into_regex())
|
||||
.map_err(|err| format!("{}", err))?),
|
||||
|
||||
("default_re",
|
||||
.nfa()
|
||||
.build()
|
||||
.map(|exec| exec.into_regex())
|
||||
.map_err(|err| format!("{}", err))?,
|
||||
),
|
||||
(
|
||||
"default_re",
|
||||
ExecBuilder::new(re)
|
||||
.build()
|
||||
.map(|exec| exec.into_regex())
|
||||
.map_err(|err| format!("{}", err))?),
|
||||
.build()
|
||||
.map(|exec| exec.into_regex())
|
||||
.map_err(|err| format!("{}", err))?,
|
||||
),
|
||||
];
|
||||
|
||||
let utf8bytes_backends = vec![
|
||||
("bounded_backtracking_utf8bytes_re",
|
||||
(
|
||||
"bounded_backtracking_utf8bytes_re",
|
||||
ExecBuilder::new(re)
|
||||
.bounded_backtracking()
|
||||
.bytes(true)
|
||||
.build()
|
||||
.map(|exec| exec.into_regex())
|
||||
.map_err(|err| format!("{}", err))?),
|
||||
|
||||
("pikevm_utf8bytes_re",
|
||||
.bounded_backtracking()
|
||||
.bytes(true)
|
||||
.build()
|
||||
.map(|exec| exec.into_regex())
|
||||
.map_err(|err| format!("{}", err))?,
|
||||
),
|
||||
(
|
||||
"pikevm_utf8bytes_re",
|
||||
ExecBuilder::new(re)
|
||||
.nfa()
|
||||
.bytes(true)
|
||||
.build()
|
||||
.map(|exec| exec.into_regex())
|
||||
.map_err(|err| format!("{}", err))?),
|
||||
|
||||
("default_utf8bytes_re",
|
||||
.nfa()
|
||||
.bytes(true)
|
||||
.build()
|
||||
.map(|exec| exec.into_regex())
|
||||
.map_err(|err| format!("{}", err))?,
|
||||
),
|
||||
(
|
||||
"default_utf8bytes_re",
|
||||
ExecBuilder::new(re)
|
||||
.bytes(true)
|
||||
.build()
|
||||
.map(|exec| exec.into_regex())
|
||||
.map_err(|err| format!("{}", err))?),
|
||||
.bytes(true)
|
||||
.build()
|
||||
.map(|exec| exec.into_regex())
|
||||
.map_err(|err| format!("{}", err))?,
|
||||
),
|
||||
];
|
||||
|
||||
let bytes_backends = vec![
|
||||
("bounded_backtracking_bytes_re",
|
||||
(
|
||||
"bounded_backtracking_bytes_re",
|
||||
ExecBuilder::new(re)
|
||||
.bounded_backtracking()
|
||||
.only_utf8(false)
|
||||
.build()
|
||||
.map(|exec| exec.into_byte_regex())
|
||||
.map_err(|err| format!("{}", err))?),
|
||||
|
||||
("pikevm_bytes_re",
|
||||
.bounded_backtracking()
|
||||
.only_utf8(false)
|
||||
.build()
|
||||
.map(|exec| exec.into_byte_regex())
|
||||
.map_err(|err| format!("{}", err))?,
|
||||
),
|
||||
(
|
||||
"pikevm_bytes_re",
|
||||
ExecBuilder::new(re)
|
||||
.nfa()
|
||||
.only_utf8(false)
|
||||
.build()
|
||||
.map(|exec| exec.into_byte_regex())
|
||||
.map_err(|err| format!("{}", err))?),
|
||||
|
||||
("default_bytes_re",
|
||||
.nfa()
|
||||
.only_utf8(false)
|
||||
.build()
|
||||
.map(|exec| exec.into_byte_regex())
|
||||
.map_err(|err| format!("{}", err))?,
|
||||
),
|
||||
(
|
||||
"default_bytes_re",
|
||||
ExecBuilder::new(re)
|
||||
.only_utf8(false)
|
||||
.build()
|
||||
.map(|exec| exec.into_byte_regex())
|
||||
.map_err(|err| format!("{}", err))?),
|
||||
.only_utf8(false)
|
||||
.build()
|
||||
.map(|exec| exec.into_byte_regex())
|
||||
.map_err(|err| format!("{}", err))?,
|
||||
),
|
||||
];
|
||||
|
||||
Ok(string_checker::check_backends(&standard_backends)?
|
||||
+ string_checker::check_backends(&utf8bytes_backends)?
|
||||
+ bytes_checker::check_backends(&bytes_backends)?)
|
||||
+ string_checker::check_backends(&utf8bytes_backends)?
|
||||
+ bytes_checker::check_backends(&bytes_backends)?)
|
||||
}
|
||||
|
||||
//
|
||||
@@ -101,113 +113,130 @@ pub fn backends_are_consistent(re: &str) -> Result<u64, String> {
|
||||
|
||||
macro_rules! checker {
|
||||
($module_name:ident, $regex_type:path, $mk_input:expr) => {
|
||||
mod $module_name {
|
||||
use quickcheck;
|
||||
use quickcheck::{Arbitrary, TestResult};
|
||||
|
||||
mod $module_name {
|
||||
use quickcheck;
|
||||
use quickcheck::{TestResult, Arbitrary};
|
||||
pub fn check_backends(
|
||||
backends: &[(&str, $regex_type)],
|
||||
) -> Result<u64, String> {
|
||||
let mut total_passed = 0;
|
||||
for regex in backends[1..].iter() {
|
||||
total_passed += quickcheck_regex_eq(&backends[0], regex)?;
|
||||
}
|
||||
|
||||
pub fn check_backends(
|
||||
backends: &[(&str, $regex_type)]
|
||||
) -> Result<u64, String> {
|
||||
let mut total_passed = 0;
|
||||
for regex in backends[1..].iter() {
|
||||
total_passed += quickcheck_regex_eq(&backends[0], regex)?;
|
||||
}
|
||||
|
||||
Ok(total_passed)
|
||||
}
|
||||
|
||||
fn quickcheck_regex_eq(
|
||||
&(name1, ref re1): &(&str, $regex_type),
|
||||
&(name2, ref re2): &(&str, $regex_type),
|
||||
) -> Result<u64, String> {
|
||||
quickcheck::QuickCheck::new()
|
||||
.quicktest(RegexEqualityTest::new(re1.clone(), re2.clone()))
|
||||
.map_err(|err|
|
||||
format!("{}(/{}/) and {}(/{}/) are inconsistent.\
|
||||
QuickCheck Err: {:?}",
|
||||
name1, re1, name2, re2, err))
|
||||
}
|
||||
|
||||
struct RegexEqualityTest {
|
||||
re1: $regex_type,
|
||||
re2: $regex_type,
|
||||
}
|
||||
impl RegexEqualityTest {
|
||||
fn new(re1: $regex_type, re2: $regex_type) -> Self {
|
||||
RegexEqualityTest {
|
||||
re1: re1,
|
||||
re2: re2,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl quickcheck::Testable for RegexEqualityTest {
|
||||
fn result<G: quickcheck::Gen>(&self, gen: &mut G) -> TestResult {
|
||||
let input = $mk_input(gen);
|
||||
let input = &input;
|
||||
|
||||
if self.re1.find(&input) != self.re2.find(input) {
|
||||
return TestResult::error(
|
||||
format!("find mismatch input={:?}", input));
|
||||
Ok(total_passed)
|
||||
}
|
||||
|
||||
let cap1 = self.re1.captures(input);
|
||||
let cap2 = self.re2.captures(input);
|
||||
match (cap1, cap2) {
|
||||
(None, None) => {}
|
||||
(Some(cap1), Some(cap2)) => {
|
||||
for (c1, c2) in cap1.iter().zip(cap2.iter()) {
|
||||
if c1 != c2 {
|
||||
return TestResult::error(
|
||||
format!("captures mismatch input={:?}", input));
|
||||
fn quickcheck_regex_eq(
|
||||
&(name1, ref re1): &(&str, $regex_type),
|
||||
&(name2, ref re2): &(&str, $regex_type),
|
||||
) -> Result<u64, String> {
|
||||
quickcheck::QuickCheck::new()
|
||||
.quicktest(RegexEqualityTest::new(
|
||||
re1.clone(),
|
||||
re2.clone(),
|
||||
))
|
||||
.map_err(|err| {
|
||||
format!(
|
||||
"{}(/{}/) and {}(/{}/) are inconsistent.\
|
||||
QuickCheck Err: {:?}",
|
||||
name1, re1, name2, re2, err
|
||||
)
|
||||
})
|
||||
}
|
||||
|
||||
struct RegexEqualityTest {
|
||||
re1: $regex_type,
|
||||
re2: $regex_type,
|
||||
}
|
||||
impl RegexEqualityTest {
|
||||
fn new(re1: $regex_type, re2: $regex_type) -> Self {
|
||||
RegexEqualityTest { re1: re1, re2: re2 }
|
||||
}
|
||||
}
|
||||
|
||||
impl quickcheck::Testable for RegexEqualityTest {
|
||||
fn result<G: quickcheck::Gen>(
|
||||
&self,
|
||||
gen: &mut G,
|
||||
) -> TestResult {
|
||||
let input = $mk_input(gen);
|
||||
let input = &input;
|
||||
|
||||
if self.re1.find(&input) != self.re2.find(input) {
|
||||
return TestResult::error(format!(
|
||||
"find mismatch input={:?}",
|
||||
input
|
||||
));
|
||||
}
|
||||
|
||||
let cap1 = self.re1.captures(input);
|
||||
let cap2 = self.re2.captures(input);
|
||||
match (cap1, cap2) {
|
||||
(None, None) => {}
|
||||
(Some(cap1), Some(cap2)) => {
|
||||
for (c1, c2) in cap1.iter().zip(cap2.iter()) {
|
||||
if c1 != c2 {
|
||||
return TestResult::error(format!(
|
||||
"captures mismatch input={:?}",
|
||||
input
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
return TestResult::error(format!(
|
||||
"captures mismatch input={:?}",
|
||||
input
|
||||
))
|
||||
}
|
||||
}
|
||||
}
|
||||
_ => return TestResult::error(
|
||||
format!("captures mismatch input={:?}", input)),
|
||||
}
|
||||
|
||||
let fi1 = self.re1.find_iter(input);
|
||||
let fi2 = self.re2.find_iter(input);
|
||||
for (m1, m2) in fi1.zip(fi2) {
|
||||
if m1 != m2 {
|
||||
return TestResult::error(
|
||||
format!("find_iter mismatch input={:?}", input));
|
||||
}
|
||||
}
|
||||
|
||||
let ci1 = self.re1.captures_iter(input);
|
||||
let ci2 = self.re2.captures_iter(input);
|
||||
for (cap1, cap2) in ci1.zip(ci2) {
|
||||
for (c1, c2) in cap1.iter().zip(cap2.iter()) {
|
||||
if c1 != c2 {
|
||||
return TestResult::error(
|
||||
format!("captures_iter mismatch input={:?}", input));
|
||||
let fi1 = self.re1.find_iter(input);
|
||||
let fi2 = self.re2.find_iter(input);
|
||||
for (m1, m2) in fi1.zip(fi2) {
|
||||
if m1 != m2 {
|
||||
return TestResult::error(format!(
|
||||
"find_iter mismatch input={:?}",
|
||||
input
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
let ci1 = self.re1.captures_iter(input);
|
||||
let ci2 = self.re2.captures_iter(input);
|
||||
for (cap1, cap2) in ci1.zip(ci2) {
|
||||
for (c1, c2) in cap1.iter().zip(cap2.iter()) {
|
||||
if c1 != c2 {
|
||||
return TestResult::error(format!(
|
||||
"captures_iter mismatch input={:?}",
|
||||
input
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let s1 = self.re1.split(input);
|
||||
let s2 = self.re2.split(input);
|
||||
for (chunk1, chunk2) in s1.zip(s2) {
|
||||
if chunk1 != chunk2 {
|
||||
return TestResult::error(format!(
|
||||
"split mismatch input={:?}",
|
||||
input
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
TestResult::from_bool(true)
|
||||
}
|
||||
}
|
||||
|
||||
let s1 = self.re1.split(input);
|
||||
let s2 = self.re2.split(input);
|
||||
for (chunk1, chunk2) in s1.zip(s2) {
|
||||
if chunk1 != chunk2 {
|
||||
return TestResult::error(
|
||||
format!("split mismatch input={:?}", input));
|
||||
}
|
||||
}
|
||||
|
||||
TestResult::from_bool(true)
|
||||
}
|
||||
}
|
||||
|
||||
} // mod
|
||||
} // rule case
|
||||
} // mod
|
||||
}; // rule case
|
||||
} // macro_rules!
|
||||
|
||||
checker!(string_checker,
|
||||
::regex::Regex,
|
||||
|gen| String::arbitrary(gen));
|
||||
checker!(bytes_checker,
|
||||
::regex::bytes::Regex,
|
||||
|gen| Vec::<u8>::arbitrary(gen));
|
||||
checker!(string_checker, ::regex::Regex, |gen| String::arbitrary(gen));
|
||||
checker!(bytes_checker, ::regex::bytes::Regex, |gen| Vec::<u8>::arbitrary(
|
||||
gen
|
||||
));
|
||||
|
||||
+66
-25
@@ -1,41 +1,82 @@
|
||||
mat!(ascii_literal, r"a", "a", Some((0, 1)));
|
||||
|
||||
// Some crazy expressions from regular-expressions.info.
|
||||
mat!(match_ranges,
|
||||
r"\b(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])\b",
|
||||
"num: 255", Some((5, 8)));
|
||||
mat!(match_ranges_not,
|
||||
r"\b(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])\b",
|
||||
"num: 256", None);
|
||||
mat!(
|
||||
match_ranges,
|
||||
r"\b(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])\b",
|
||||
"num: 255",
|
||||
Some((5, 8))
|
||||
);
|
||||
mat!(
|
||||
match_ranges_not,
|
||||
r"\b(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])\b",
|
||||
"num: 256",
|
||||
None
|
||||
);
|
||||
mat!(match_float1, r"[-+]?[0-9]*\.?[0-9]+", "0.1", Some((0, 3)));
|
||||
mat!(match_float2, r"[-+]?[0-9]*\.?[0-9]+", "0.1.2", Some((0, 3)));
|
||||
mat!(match_float3, r"[-+]?[0-9]*\.?[0-9]+", "a1.2", Some((1, 4)));
|
||||
mat!(match_float4, r"^[-+]?[0-9]*\.?[0-9]+$", "1.a", None);
|
||||
mat!(match_email, r"(?i)\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b",
|
||||
"mine is jam.slam@gmail.com ", Some((8, 26)));
|
||||
mat!(match_email_not, r"(?i)\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b",
|
||||
"mine is jam.slam@gmail ", None);
|
||||
mat!(
|
||||
match_email,
|
||||
r"(?i)\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b",
|
||||
"mine is jam.slam@gmail.com ",
|
||||
Some((8, 26))
|
||||
);
|
||||
mat!(
|
||||
match_email_not,
|
||||
r"(?i)\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b",
|
||||
"mine is jam.slam@gmail ",
|
||||
None
|
||||
);
|
||||
mat!(match_email_big, r"[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?",
|
||||
"mine is jam.slam@gmail.com ", Some((8, 26)));
|
||||
mat!(match_date1,
|
||||
r"^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$",
|
||||
"1900-01-01", Some((0, 10)));
|
||||
mat!(match_date2,
|
||||
r"^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$",
|
||||
"1900-00-01", None);
|
||||
mat!(match_date3,
|
||||
r"^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$",
|
||||
"1900-13-01", None);
|
||||
mat!(
|
||||
match_date1,
|
||||
r"^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$",
|
||||
"1900-01-01",
|
||||
Some((0, 10))
|
||||
);
|
||||
mat!(
|
||||
match_date2,
|
||||
r"^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$",
|
||||
"1900-00-01",
|
||||
None
|
||||
);
|
||||
mat!(
|
||||
match_date3,
|
||||
r"^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$",
|
||||
"1900-13-01",
|
||||
None
|
||||
);
|
||||
|
||||
// Do some crazy dancing with the start/end assertions.
|
||||
matiter!(match_start_end_empty, r"^$", "", (0, 0));
|
||||
matiter!(match_start_end_empty_many_1, r"^$^$^$", "", (0, 0));
|
||||
matiter!(match_start_end_empty_many_2, r"^^^$$$", "", (0, 0));
|
||||
matiter!(match_start_end_empty_rev, r"$^", "", (0, 0));
|
||||
matiter!(match_start_end_empty_rep, r"(?:^$)*", "a\nb\nc",
|
||||
(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5));
|
||||
matiter!(match_start_end_empty_rep_rev, r"(?:$^)*", "a\nb\nc",
|
||||
(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5));
|
||||
matiter!(
|
||||
match_start_end_empty_rep,
|
||||
r"(?:^$)*",
|
||||
"a\nb\nc",
|
||||
(0, 0),
|
||||
(1, 1),
|
||||
(2, 2),
|
||||
(3, 3),
|
||||
(4, 4),
|
||||
(5, 5)
|
||||
);
|
||||
matiter!(
|
||||
match_start_end_empty_rep_rev,
|
||||
r"(?:$^)*",
|
||||
"a\nb\nc",
|
||||
(0, 0),
|
||||
(1, 1),
|
||||
(2, 2),
|
||||
(3, 3),
|
||||
(4, 4),
|
||||
(5, 5)
|
||||
);
|
||||
|
||||
// Test negated character classes.
|
||||
mat!(negclass_letters, r"[^ac]", "acx", Some((2, 3)));
|
||||
@@ -80,7 +121,7 @@ matiter!(match_empty11, r"b|()+", "abc", (0, 0), (1, 2), (3, 3));
|
||||
#[test]
|
||||
fn dfa_handles_pathological_case() {
|
||||
fn ones_and_zeroes(count: usize) -> String {
|
||||
use rand::{Rng, thread_rng};
|
||||
use rand::{thread_rng, Rng};
|
||||
|
||||
let mut rng = thread_rng();
|
||||
let mut s = String::new();
|
||||
@@ -394,7 +435,7 @@ fn nest_limit_makes_it_parse() {
|
||||
[24]1
|
||||
)\d{2}
|
||||
)\d{3}
|
||||
"#
|
||||
"#,
|
||||
)
|
||||
.build()
|
||||
.unwrap();
|
||||
|
||||
+6
-1
@@ -4,7 +4,12 @@ mat!(match_flag_weird_case_not, "(?i)a(?-i)bc", "ABC", None);
|
||||
mat!(match_flag_case_dotnl, "(?is)a.", "A\n", Some((0, 2)));
|
||||
mat!(match_flag_case_dotnl_toggle, "(?is)a.(?-is)a.", "A\nab", Some((0, 4)));
|
||||
mat!(match_flag_case_dotnl_toggle_not, "(?is)a.(?-is)a.", "A\na\n", None);
|
||||
mat!(match_flag_case_dotnl_toggle_ok, "(?is)a.(?-is:a.)?", "A\na\n", Some((0, 2)));
|
||||
mat!(
|
||||
match_flag_case_dotnl_toggle_ok,
|
||||
"(?is)a.(?-is:a.)?",
|
||||
"A\na\n",
|
||||
Some((0, 2))
|
||||
);
|
||||
mat!(match_flag_multi, "(?m)(?:^\\d+$\n?)+", "123\n456\n789", Some((0, 11)));
|
||||
mat!(match_flag_ungreedy, "(?U)a+", "aa", Some((0, 1)));
|
||||
mat!(match_flag_ungreedy_greedy, "(?U)a+?", "aa", Some((0, 2)));
|
||||
|
||||
+1391
-176
File diff suppressed because it is too large
Load Diff
+139
-44
@@ -1,49 +1,144 @@
|
||||
matiter!(match_multi_1, r"(?m)^[a-z]+$", "abc\ndef\nxyz",
|
||||
(0, 3), (4, 7), (8, 11));
|
||||
matiter!(
|
||||
match_multi_1,
|
||||
r"(?m)^[a-z]+$",
|
||||
"abc\ndef\nxyz",
|
||||
(0, 3),
|
||||
(4, 7),
|
||||
(8, 11)
|
||||
);
|
||||
matiter!(match_multi_2, r"(?m)^$", "abc\ndef\nxyz");
|
||||
matiter!(match_multi_3, r"(?m)^", "abc\ndef\nxyz",
|
||||
(0, 0), (4, 4), (8, 8));
|
||||
matiter!(match_multi_4, r"(?m)$", "abc\ndef\nxyz",
|
||||
(3, 3), (7, 7), (11, 11));
|
||||
matiter!(match_multi_5, r"(?m)^[a-z]", "abc\ndef\nxyz",
|
||||
(0, 1), (4, 5), (8, 9));
|
||||
matiter!(match_multi_3, r"(?m)^", "abc\ndef\nxyz", (0, 0), (4, 4), (8, 8));
|
||||
matiter!(match_multi_4, r"(?m)$", "abc\ndef\nxyz", (3, 3), (7, 7), (11, 11));
|
||||
matiter!(
|
||||
match_multi_5,
|
||||
r"(?m)^[a-z]",
|
||||
"abc\ndef\nxyz",
|
||||
(0, 1),
|
||||
(4, 5),
|
||||
(8, 9)
|
||||
);
|
||||
matiter!(match_multi_6, r"(?m)[a-z]^", "abc\ndef\nxyz");
|
||||
matiter!(match_multi_7, r"(?m)[a-z]$", "abc\ndef\nxyz",
|
||||
(2, 3), (6, 7), (10, 11));
|
||||
matiter!(
|
||||
match_multi_7,
|
||||
r"(?m)[a-z]$",
|
||||
"abc\ndef\nxyz",
|
||||
(2, 3),
|
||||
(6, 7),
|
||||
(10, 11)
|
||||
);
|
||||
matiter!(match_multi_8, r"(?m)$[a-z]", "abc\ndef\nxyz");
|
||||
matiter!(match_multi_9, r"(?m)^$", "", (0, 0));
|
||||
|
||||
matiter!(match_multi_rep_1, r"(?m)(?:^$)*", "a\nb\nc",
|
||||
(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5));
|
||||
matiter!(match_multi_rep_2, r"(?m)(?:^|a)+", "a\naaa\n",
|
||||
(0, 0), (2, 2), (3, 5), (6, 6));
|
||||
matiter!(match_multi_rep_3, r"(?m)(?:^|a)*", "a\naaa\n",
|
||||
(0, 1), (2, 5), (6, 6));
|
||||
matiter!(match_multi_rep_4, r"(?m)(?:^[a-z])+", "abc\ndef\nxyz",
|
||||
(0, 1), (4, 5), (8, 9));
|
||||
matiter!(match_multi_rep_5, r"(?m)(?:^[a-z]{3}\n?)+", "abc\ndef\nxyz",
|
||||
(0, 11));
|
||||
matiter!(match_multi_rep_6, r"(?m)(?:^[a-z]{3}\n?)*", "abc\ndef\nxyz",
|
||||
(0, 11));
|
||||
matiter!(match_multi_rep_7, r"(?m)(?:\n?[a-z]{3}$)+", "abc\ndef\nxyz",
|
||||
(0, 11));
|
||||
matiter!(match_multi_rep_8, r"(?m)(?:\n?[a-z]{3}$)*", "abc\ndef\nxyz",
|
||||
(0, 11));
|
||||
matiter!(match_multi_rep_9, r"(?m)^*", "\naa\n",
|
||||
(0, 0), (1, 1), (2, 2), (3, 3), (4, 4));
|
||||
matiter!(match_multi_rep_10, r"(?m)^+", "\naa\n",
|
||||
(0, 0), (1, 1), (4, 4));
|
||||
matiter!(match_multi_rep_11, r"(?m)$*", "\naa\n",
|
||||
(0, 0), (1, 1), (2, 2), (3, 3), (4, 4));
|
||||
matiter!(match_multi_rep_12, r"(?m)$+", "\naa\n",
|
||||
(0, 0), (3, 3), (4, 4));
|
||||
matiter!(match_multi_rep_13, r"(?m)(?:$\n)+", "\n\naaa\n\n",
|
||||
(0, 2), (5, 7));
|
||||
matiter!(match_multi_rep_14, r"(?m)(?:$\n)*", "\n\naaa\n\n",
|
||||
(0, 2), (3, 3), (4, 4), (5, 7));
|
||||
matiter!(match_multi_rep_15, r"(?m)(?:$\n^)+", "\n\naaa\n\n",
|
||||
(0, 2), (5, 7));
|
||||
matiter!(match_multi_rep_16, r"(?m)(?:^|$)+", "\n\naaa\n\n",
|
||||
(0, 0), (1, 1), (2, 2), (5, 5), (6, 6), (7, 7));
|
||||
matiter!(match_multi_rep_17, r"(?m)(?:$\n)*", "\n\naaa\n\n",
|
||||
(0, 2), (3, 3), (4, 4), (5, 7));
|
||||
matiter!(
|
||||
match_multi_rep_1,
|
||||
r"(?m)(?:^$)*",
|
||||
"a\nb\nc",
|
||||
(0, 0),
|
||||
(1, 1),
|
||||
(2, 2),
|
||||
(3, 3),
|
||||
(4, 4),
|
||||
(5, 5)
|
||||
);
|
||||
matiter!(
|
||||
match_multi_rep_2,
|
||||
r"(?m)(?:^|a)+",
|
||||
"a\naaa\n",
|
||||
(0, 0),
|
||||
(2, 2),
|
||||
(3, 5),
|
||||
(6, 6)
|
||||
);
|
||||
matiter!(
|
||||
match_multi_rep_3,
|
||||
r"(?m)(?:^|a)*",
|
||||
"a\naaa\n",
|
||||
(0, 1),
|
||||
(2, 5),
|
||||
(6, 6)
|
||||
);
|
||||
matiter!(
|
||||
match_multi_rep_4,
|
||||
r"(?m)(?:^[a-z])+",
|
||||
"abc\ndef\nxyz",
|
||||
(0, 1),
|
||||
(4, 5),
|
||||
(8, 9)
|
||||
);
|
||||
matiter!(
|
||||
match_multi_rep_5,
|
||||
r"(?m)(?:^[a-z]{3}\n?)+",
|
||||
"abc\ndef\nxyz",
|
||||
(0, 11)
|
||||
);
|
||||
matiter!(
|
||||
match_multi_rep_6,
|
||||
r"(?m)(?:^[a-z]{3}\n?)*",
|
||||
"abc\ndef\nxyz",
|
||||
(0, 11)
|
||||
);
|
||||
matiter!(
|
||||
match_multi_rep_7,
|
||||
r"(?m)(?:\n?[a-z]{3}$)+",
|
||||
"abc\ndef\nxyz",
|
||||
(0, 11)
|
||||
);
|
||||
matiter!(
|
||||
match_multi_rep_8,
|
||||
r"(?m)(?:\n?[a-z]{3}$)*",
|
||||
"abc\ndef\nxyz",
|
||||
(0, 11)
|
||||
);
|
||||
matiter!(
|
||||
match_multi_rep_9,
|
||||
r"(?m)^*",
|
||||
"\naa\n",
|
||||
(0, 0),
|
||||
(1, 1),
|
||||
(2, 2),
|
||||
(3, 3),
|
||||
(4, 4)
|
||||
);
|
||||
matiter!(match_multi_rep_10, r"(?m)^+", "\naa\n", (0, 0), (1, 1), (4, 4));
|
||||
matiter!(
|
||||
match_multi_rep_11,
|
||||
r"(?m)$*",
|
||||
"\naa\n",
|
||||
(0, 0),
|
||||
(1, 1),
|
||||
(2, 2),
|
||||
(3, 3),
|
||||
(4, 4)
|
||||
);
|
||||
matiter!(match_multi_rep_12, r"(?m)$+", "\naa\n", (0, 0), (3, 3), (4, 4));
|
||||
matiter!(match_multi_rep_13, r"(?m)(?:$\n)+", "\n\naaa\n\n", (0, 2), (5, 7));
|
||||
matiter!(
|
||||
match_multi_rep_14,
|
||||
r"(?m)(?:$\n)*",
|
||||
"\n\naaa\n\n",
|
||||
(0, 2),
|
||||
(3, 3),
|
||||
(4, 4),
|
||||
(5, 7)
|
||||
);
|
||||
matiter!(match_multi_rep_15, r"(?m)(?:$\n^)+", "\n\naaa\n\n", (0, 2), (5, 7));
|
||||
matiter!(
|
||||
match_multi_rep_16,
|
||||
r"(?m)(?:^|$)+",
|
||||
"\n\naaa\n\n",
|
||||
(0, 0),
|
||||
(1, 1),
|
||||
(2, 2),
|
||||
(5, 5),
|
||||
(6, 6),
|
||||
(7, 7)
|
||||
);
|
||||
matiter!(
|
||||
match_multi_rep_17,
|
||||
r"(?m)(?:$\n)*",
|
||||
"\n\naaa\n\n",
|
||||
(0, 2),
|
||||
(3, 3),
|
||||
(4, 4),
|
||||
(5, 7)
|
||||
);
|
||||
|
||||
+62
-20
@@ -64,11 +64,31 @@ mat!(anchored_prefix2, r"^a\S", "foo boo a ", None);
|
||||
mat!(anchored_prefix3, r"^-[a-z]", "r-f", None);
|
||||
|
||||
// See: https://github.com/rust-lang/regex/issues/204
|
||||
split!(split_on_word_boundary, r"\b", r"Should this (work?)",
|
||||
&[t!(""), t!("Should"), t!(" "), t!("this"),
|
||||
t!(" ("), t!("work"), t!("?)")]);
|
||||
matiter!(word_boundary_dfa, r"\b", "a b c",
|
||||
(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5));
|
||||
split!(
|
||||
split_on_word_boundary,
|
||||
r"\b",
|
||||
r"Should this (work?)",
|
||||
&[
|
||||
t!(""),
|
||||
t!("Should"),
|
||||
t!(" "),
|
||||
t!("this"),
|
||||
t!(" ("),
|
||||
t!("work"),
|
||||
t!("?)")
|
||||
]
|
||||
);
|
||||
matiter!(
|
||||
word_boundary_dfa,
|
||||
r"\b",
|
||||
"a b c",
|
||||
(0, 0),
|
||||
(1, 1),
|
||||
(2, 2),
|
||||
(3, 3),
|
||||
(4, 4),
|
||||
(5, 5)
|
||||
);
|
||||
|
||||
// See: https://github.com/rust-lang/regex/issues/268
|
||||
matiter!(partial_anchor, r"^a|b", "ba", (0, 1));
|
||||
@@ -81,8 +101,16 @@ ismatch!(partial_anchor_alternate_end, r"a$|z", "ayyyyy", false);
|
||||
mat!(lits_unambiguous1, r"(ABC|CDA|BC)X", "CDAX", Some((0, 4)));
|
||||
|
||||
// See: https://github.com/rust-lang/regex/issues/291
|
||||
mat!(lits_unambiguous2, r"((IMG|CAM|MG|MB2)_|(DSCN|CIMG))(?P<n>[0-9]+)$",
|
||||
"CIMG2341", Some((0, 8)), Some((0, 4)), None, Some((0, 4)), Some((4, 8)));
|
||||
mat!(
|
||||
lits_unambiguous2,
|
||||
r"((IMG|CAM|MG|MB2)_|(DSCN|CIMG))(?P<n>[0-9]+)$",
|
||||
"CIMG2341",
|
||||
Some((0, 8)),
|
||||
Some((0, 4)),
|
||||
None,
|
||||
Some((0, 4)),
|
||||
Some((4, 8))
|
||||
);
|
||||
|
||||
// See: https://github.com/rust-lang/regex/issues/271
|
||||
mat!(endl_or_wb, r"(?m:$)|(?-u:\b)", "\u{6084e}", Some((4, 4)));
|
||||
@@ -101,36 +129,50 @@ matiter!(reverse_suffix3, r"\d\d\d000", "153.230000\n", (4, 10));
|
||||
|
||||
// See: https://github.com/rust-lang/regex/issues/334
|
||||
// See: https://github.com/rust-lang/regex/issues/557
|
||||
mat!(captures_after_dfa_premature_end1, r"a(b*(X|$))?", "abcbX",
|
||||
Some((0, 1)), None, None);
|
||||
mat!(captures_after_dfa_premature_end2, r"a(bc*(X|$))?", "abcbX",
|
||||
Some((0, 1)), None, None);
|
||||
mat!(captures_after_dfa_premature_end3, r"(aa$)?", "aaz",
|
||||
Some((0, 0)));
|
||||
mat!(
|
||||
captures_after_dfa_premature_end1,
|
||||
r"a(b*(X|$))?",
|
||||
"abcbX",
|
||||
Some((0, 1)),
|
||||
None,
|
||||
None
|
||||
);
|
||||
mat!(
|
||||
captures_after_dfa_premature_end2,
|
||||
r"a(bc*(X|$))?",
|
||||
"abcbX",
|
||||
Some((0, 1)),
|
||||
None,
|
||||
None
|
||||
);
|
||||
mat!(captures_after_dfa_premature_end3, r"(aa$)?", "aaz", Some((0, 0)));
|
||||
|
||||
// See: https://github.com/rust-lang/regex/issues/437
|
||||
ismatch!(
|
||||
literal_panic,
|
||||
r"typename type\-parameter\-\d+\-\d+::.+",
|
||||
"test",
|
||||
false);
|
||||
false
|
||||
);
|
||||
|
||||
// See: https://github.com/rust-lang/regex/issues/533
|
||||
ismatch!(
|
||||
blank_matches_nothing_between_space_and_tab,
|
||||
r"[[:blank:]]",
|
||||
"\u{a}\u{b}\u{c}\u{d}\u{e}\u{f}\
|
||||
\u{10}\u{11}\u{12}\u{13}\u{14}\u{15}\u{16}\u{17}\
|
||||
\u{18}\u{19}\u{1a}\u{1b}\u{1c}\u{1d}\u{1e}\u{1f}",
|
||||
false);
|
||||
\u{10}\u{11}\u{12}\u{13}\u{14}\u{15}\u{16}\u{17}\
|
||||
\u{18}\u{19}\u{1a}\u{1b}\u{1c}\u{1d}\u{1e}\u{1f}",
|
||||
false
|
||||
);
|
||||
|
||||
ismatch!(
|
||||
inverted_blank_matches_everything_between_space_and_tab,
|
||||
r"^[[:^blank:]]+$",
|
||||
"\u{a}\u{b}\u{c}\u{d}\u{e}\u{f}\
|
||||
\u{10}\u{11}\u{12}\u{13}\u{14}\u{15}\u{16}\u{17}\
|
||||
\u{18}\u{19}\u{1a}\u{1b}\u{1c}\u{1d}\u{1e}\u{1f}",
|
||||
true);
|
||||
\u{10}\u{11}\u{12}\u{13}\u{14}\u{15}\u{16}\u{17}\
|
||||
\u{18}\u{19}\u{1a}\u{1b}\u{1c}\u{1d}\u{1e}\u{1f}",
|
||||
true
|
||||
);
|
||||
|
||||
// Tests that our Aho-Corasick optimization works correctly. It only
|
||||
// kicks in when we have >32 literals. By "works correctly," we mean that
|
||||
|
||||
+84
-25
@@ -13,40 +13,99 @@ replace!(first, replace, r"\d", "age: 26", t!("Z"), "age: Z6");
|
||||
replace!(plus, replace, r"\d+", "age: 26", t!("Z"), "age: Z");
|
||||
replace!(all, replace_all, r"\d", "age: 26", t!("Z"), "age: ZZ");
|
||||
replace!(groups, replace, r"(\S+)\s+(\S+)", "w1 w2", t!("$2 $1"), "w2 w1");
|
||||
replace!(double_dollar, replace,
|
||||
r"(\S+)\s+(\S+)", "w1 w2", t!("$2 $$1"), "w2 $1");
|
||||
replace!(
|
||||
double_dollar,
|
||||
replace,
|
||||
r"(\S+)\s+(\S+)",
|
||||
"w1 w2",
|
||||
t!("$2 $$1"),
|
||||
"w2 $1"
|
||||
);
|
||||
// replace!(adjacent_index, replace,
|
||||
// r"([^aeiouy])ies$", "skies", t!("$1y"), "sky");
|
||||
replace!(named, replace_all,
|
||||
r"(?P<first>\S+)\s+(?P<last>\S+)(?P<space>\s*)",
|
||||
"w1 w2 w3 w4", t!("$last $first$space"), "w2 w1 w4 w3");
|
||||
replace!(trim, replace_all, "^[ \t]+|[ \t]+$", " \t trim me\t \t",
|
||||
t!(""), "trim me");
|
||||
// r"([^aeiouy])ies$", "skies", t!("$1y"), "sky");
|
||||
replace!(
|
||||
named,
|
||||
replace_all,
|
||||
r"(?P<first>\S+)\s+(?P<last>\S+)(?P<space>\s*)",
|
||||
"w1 w2 w3 w4",
|
||||
t!("$last $first$space"),
|
||||
"w2 w1 w4 w3"
|
||||
);
|
||||
replace!(
|
||||
trim,
|
||||
replace_all,
|
||||
"^[ \t]+|[ \t]+$",
|
||||
" \t trim me\t \t",
|
||||
t!(""),
|
||||
"trim me"
|
||||
);
|
||||
replace!(number_hypen, replace, r"(.)(.)", "ab", t!("$1-$2"), "a-b");
|
||||
// replace!(number_underscore, replace, r"(.)(.)", "ab", t!("$1_$2"), "a_b");
|
||||
replace!(simple_expand, replace_all, r"(\w) (\w)", "a b", t!("$2 $1"), "b a");
|
||||
replace!(literal_dollar1, replace_all,
|
||||
r"(\w+) (\w+)", "a b", t!("$$1"), "$1");
|
||||
replace!(literal_dollar2, replace_all,
|
||||
r"(\w+) (\w+)", "a b", t!("$2 $$c $1"), "b $c a");
|
||||
replace!(no_expand1, replace,
|
||||
r"(\S+)\s+(\S+)", "w1 w2", no_expand!("$2 $1"), "$2 $1");
|
||||
replace!(no_expand2, replace,
|
||||
r"(\S+)\s+(\S+)", "w1 w2", no_expand!("$$1"), "$$1");
|
||||
replace!(literal_dollar1, replace_all, r"(\w+) (\w+)", "a b", t!("$$1"), "$1");
|
||||
replace!(
|
||||
literal_dollar2,
|
||||
replace_all,
|
||||
r"(\w+) (\w+)",
|
||||
"a b",
|
||||
t!("$2 $$c $1"),
|
||||
"b $c a"
|
||||
);
|
||||
replace!(
|
||||
no_expand1,
|
||||
replace,
|
||||
r"(\S+)\s+(\S+)",
|
||||
"w1 w2",
|
||||
no_expand!("$2 $1"),
|
||||
"$2 $1"
|
||||
);
|
||||
replace!(
|
||||
no_expand2,
|
||||
replace,
|
||||
r"(\S+)\s+(\S+)",
|
||||
"w1 w2",
|
||||
no_expand!("$$1"),
|
||||
"$$1"
|
||||
);
|
||||
use_!(Captures);
|
||||
replace!(closure_returning_reference, replace, r"(\d+)", "age: 26",
|
||||
|captures: &Captures| {
|
||||
match_text!(captures.get(1).unwrap())[0..1].to_owned()
|
||||
}, "age: 2");
|
||||
replace!(closure_returning_value, replace, r"\d+", "age: 26",
|
||||
|_captures: &Captures| t!("Z").to_owned(), "age: Z");
|
||||
|
||||
replace!(
|
||||
closure_returning_reference,
|
||||
replace,
|
||||
r"(\d+)",
|
||||
"age: 26",
|
||||
|captures: &Captures| {
|
||||
match_text!(captures.get(1).unwrap())[0..1].to_owned()
|
||||
},
|
||||
"age: 2"
|
||||
);
|
||||
replace!(
|
||||
closure_returning_value,
|
||||
replace,
|
||||
r"\d+",
|
||||
"age: 26",
|
||||
|_captures: &Captures| t!("Z").to_owned(),
|
||||
"age: Z"
|
||||
);
|
||||
|
||||
// See https://github.com/rust-lang/regex/issues/314
|
||||
replace!(match_at_start_replace_with_empty, replace_all, r"foo", "foobar", t!(""), "bar");
|
||||
replace!(
|
||||
match_at_start_replace_with_empty,
|
||||
replace_all,
|
||||
r"foo",
|
||||
"foobar",
|
||||
t!(""),
|
||||
"bar"
|
||||
);
|
||||
|
||||
// See https://github.com/rust-lang/regex/issues/393
|
||||
replace!(single_empty_match, replace, r"^", "bar", t!("foo"), "foobar");
|
||||
|
||||
// See https://github.com/rust-lang/regex/issues/399
|
||||
replace!(capture_longest_possible_name, replace_all, r"(.)", "b", t!("${1}a $1a"), "ba ");
|
||||
replace!(
|
||||
capture_longest_possible_name,
|
||||
replace_all,
|
||||
r"(.)",
|
||||
"b",
|
||||
t!("${1}a $1a"),
|
||||
"ba "
|
||||
);
|
||||
|
||||
+59
-30
@@ -31,36 +31,65 @@ macro_rules! searcher {
|
||||
}
|
||||
|
||||
searcher!(searcher_empty_regex_empty_haystack, r"", "", Match(0, 0));
|
||||
searcher!(searcher_empty_regex, r"", "ab",
|
||||
Match(0, 0), Reject(0, 1), Match(1, 1), Reject(1, 2), Match(2, 2));
|
||||
searcher!(
|
||||
searcher_empty_regex,
|
||||
r"",
|
||||
"ab",
|
||||
Match(0, 0),
|
||||
Reject(0, 1),
|
||||
Match(1, 1),
|
||||
Reject(1, 2),
|
||||
Match(2, 2)
|
||||
);
|
||||
searcher!(searcher_empty_haystack, r"\d", "");
|
||||
searcher!(searcher_one_match, r"\d", "5",
|
||||
Match(0, 1));
|
||||
searcher!(searcher_no_match, r"\d", "a",
|
||||
Reject(0, 1));
|
||||
searcher!(searcher_two_adjacent_matches, r"\d", "56",
|
||||
Match(0, 1), Match(1, 2));
|
||||
searcher!(searcher_two_non_adjacent_matches, r"\d", "5a6",
|
||||
Match(0, 1), Reject(1, 2), Match(2, 3));
|
||||
searcher!(searcher_reject_first, r"\d", "a6",
|
||||
Reject(0, 1), Match(1, 2));
|
||||
searcher!(searcher_one_zero_length_matches, r"\d*", "a1b2",
|
||||
Match(0, 0), // ^
|
||||
Reject(0, 1), // a
|
||||
Match(1, 2), // a1
|
||||
Reject(2, 3), // a1b
|
||||
Match(3, 4), // a1b2
|
||||
searcher!(searcher_one_match, r"\d", "5", Match(0, 1));
|
||||
searcher!(searcher_no_match, r"\d", "a", Reject(0, 1));
|
||||
searcher!(
|
||||
searcher_two_adjacent_matches,
|
||||
r"\d",
|
||||
"56",
|
||||
Match(0, 1),
|
||||
Match(1, 2)
|
||||
);
|
||||
searcher!(searcher_many_zero_length_matches, r"\d*", "a1bbb2",
|
||||
Match(0, 0), // ^
|
||||
Reject(0, 1), // a
|
||||
Match(1, 2), // a1
|
||||
Reject(2, 3), // a1b
|
||||
Match(3, 3), // a1bb
|
||||
Reject(3, 4), // a1bb
|
||||
Match(4, 4), // a1bbb
|
||||
Reject(4, 5), // a1bbb
|
||||
Match(5, 6), // a1bbba
|
||||
searcher!(
|
||||
searcher_two_non_adjacent_matches,
|
||||
r"\d",
|
||||
"5a6",
|
||||
Match(0, 1),
|
||||
Reject(1, 2),
|
||||
Match(2, 3)
|
||||
);
|
||||
searcher!(searcher_reject_first, r"\d", "a6", Reject(0, 1), Match(1, 2));
|
||||
searcher!(
|
||||
searcher_one_zero_length_matches,
|
||||
r"\d*",
|
||||
"a1b2",
|
||||
Match(0, 0), // ^
|
||||
Reject(0, 1), // a
|
||||
Match(1, 2), // a1
|
||||
Reject(2, 3), // a1b
|
||||
Match(3, 4), // a1b2
|
||||
);
|
||||
searcher!(
|
||||
searcher_many_zero_length_matches,
|
||||
r"\d*",
|
||||
"a1bbb2",
|
||||
Match(0, 0), // ^
|
||||
Reject(0, 1), // a
|
||||
Match(1, 2), // a1
|
||||
Reject(2, 3), // a1b
|
||||
Match(3, 3), // a1bb
|
||||
Reject(3, 4), // a1bb
|
||||
Match(4, 4), // a1bbb
|
||||
Reject(4, 5), // a1bbb
|
||||
Match(5, 6), // a1bbba
|
||||
);
|
||||
searcher!(
|
||||
searcher_unicode,
|
||||
r".+?",
|
||||
"Ⅰ1Ⅱ2",
|
||||
Match(0, 3),
|
||||
Match(3, 4),
|
||||
Match(4, 7),
|
||||
Match(7, 8)
|
||||
);
|
||||
searcher!(searcher_unicode, r".+?", "Ⅰ1Ⅱ2",
|
||||
Match(0, 3), Match(3, 4), Match(4, 7), Match(7, 8));
|
||||
|
||||
+8
-1
@@ -19,7 +19,14 @@ matset!(set18, &["a", "β"], "β", 1);
|
||||
|
||||
nomatset!(nset1, &["a", "a"], "b");
|
||||
nomatset!(nset2, &["^foo", "bar$"], "bar foo");
|
||||
nomatset!(nset3, { let xs: &[&str] = &[]; xs }, "a");
|
||||
nomatset!(
|
||||
nset3,
|
||||
{
|
||||
let xs: &[&str] = &[];
|
||||
xs
|
||||
},
|
||||
"a"
|
||||
);
|
||||
nomatset!(nset4, &[r"^rooted$", r"\.log$"], "notrooted");
|
||||
|
||||
// See: https://github.com/rust-lang/regex/issues/187
|
||||
|
||||
@@ -6,7 +6,7 @@ macro_rules! shortmat {
|
||||
let re = regex!($re);
|
||||
assert_eq!($shortest_match, re.shortest_match(text));
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
shortmat!(t01, r"a+", r"aa", Some(1));
|
||||
|
||||
@@ -17,14 +17,16 @@ macro_rules! regex_new {
|
||||
($re:expr) => {{
|
||||
use regex::internal::ExecBuilder;
|
||||
ExecBuilder::new($re)
|
||||
.bounded_backtracking().build().map(|e| e.into_regex())
|
||||
}}
|
||||
.bounded_backtracking()
|
||||
.build()
|
||||
.map(|e| e.into_regex())
|
||||
}};
|
||||
}
|
||||
|
||||
macro_rules! regex {
|
||||
($re:expr) => {
|
||||
regex_new!($re).unwrap()
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
macro_rules! regex_set_new {
|
||||
@@ -34,13 +36,13 @@ macro_rules! regex_set_new {
|
||||
.bounded_backtracking()
|
||||
.build()
|
||||
.map(|e| e.into_regex_set())
|
||||
}}
|
||||
}};
|
||||
}
|
||||
|
||||
macro_rules! regex_set {
|
||||
($res:expr) => {
|
||||
regex_set_new!($res).unwrap()
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
// Must come before other module definitions.
|
||||
|
||||
@@ -19,13 +19,13 @@ macro_rules! regex_new {
|
||||
.only_utf8(false)
|
||||
.build()
|
||||
.map(|e| e.into_byte_regex())
|
||||
}}
|
||||
}};
|
||||
}
|
||||
|
||||
macro_rules! regex {
|
||||
($re:expr) => {
|
||||
regex_new!($re).unwrap()
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
macro_rules! regex_set_new {
|
||||
@@ -36,13 +36,13 @@ macro_rules! regex_set_new {
|
||||
.only_utf8(false)
|
||||
.build()
|
||||
.map(|e| e.into_byte_regex_set())
|
||||
}}
|
||||
}};
|
||||
}
|
||||
|
||||
macro_rules! regex_set {
|
||||
($res:expr) => {
|
||||
regex_set_new!($res).unwrap()
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
// Must come before other module definitions.
|
||||
|
||||
@@ -17,14 +17,17 @@ macro_rules! regex_new {
|
||||
($re:expr) => {{
|
||||
use regex::internal::ExecBuilder;
|
||||
ExecBuilder::new($re)
|
||||
.bounded_backtracking().bytes(true).build().map(|e| e.into_regex())
|
||||
}}
|
||||
.bounded_backtracking()
|
||||
.bytes(true)
|
||||
.build()
|
||||
.map(|e| e.into_regex())
|
||||
}};
|
||||
}
|
||||
|
||||
macro_rules! regex {
|
||||
($re:expr) => {
|
||||
regex_new!($re).unwrap()
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
macro_rules! regex_set_new {
|
||||
@@ -35,13 +38,13 @@ macro_rules! regex_set_new {
|
||||
.bytes(true)
|
||||
.build()
|
||||
.map(|e| e.into_regex_set())
|
||||
}}
|
||||
}};
|
||||
}
|
||||
|
||||
macro_rules! regex_set {
|
||||
($res:expr) => {
|
||||
regex_set_new!($res).unwrap()
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
// Must come before other module definitions.
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
extern crate regex;
|
||||
extern crate quickcheck;
|
||||
extern crate regex;
|
||||
|
||||
/*
|
||||
* This test is a minimal version of <rofl_0> and <subdiff_0>
|
||||
@@ -45,12 +45,12 @@ mod crates_regex {
|
||||
|
||||
if option_env!("RUST_REGEX_RANDOM_TEST").is_some() {
|
||||
match backends_are_consistent($regex_src) {
|
||||
Ok(_) => {},
|
||||
Ok(_) => {}
|
||||
Err(err) => panic!("{}", err),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
include!("crates_regex.rs");
|
||||
|
||||
@@ -24,26 +24,26 @@ macro_rules! regex_new {
|
||||
($re:expr) => {{
|
||||
use regex::Regex;
|
||||
Regex::new($re)
|
||||
}}
|
||||
}};
|
||||
}
|
||||
|
||||
macro_rules! regex {
|
||||
($re:expr) => {
|
||||
regex_new!($re).unwrap()
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
macro_rules! regex_set_new {
|
||||
($re:expr) => {{
|
||||
use regex::RegexSet;
|
||||
RegexSet::new($re)
|
||||
}}
|
||||
}};
|
||||
}
|
||||
|
||||
macro_rules! regex_set {
|
||||
($res:expr) => {
|
||||
regex_set_new!($res).unwrap()
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
// Must come before other module definitions.
|
||||
@@ -88,9 +88,9 @@ fn allow_octal() {
|
||||
|
||||
#[test]
|
||||
fn oibits() {
|
||||
use std::panic::UnwindSafe;
|
||||
use regex::{Regex, RegexBuilder};
|
||||
use regex::bytes;
|
||||
use regex::{Regex, RegexBuilder};
|
||||
use std::panic::UnwindSafe;
|
||||
|
||||
fn assert_send<T: Send>() {}
|
||||
fn assert_sync<T: Sync>() {}
|
||||
@@ -114,8 +114,8 @@ fn oibits() {
|
||||
// See: https://github.com/rust-lang/regex/issues/568
|
||||
#[test]
|
||||
fn oibits_regression() {
|
||||
use std::panic;
|
||||
use regex::Regex;
|
||||
use std::panic;
|
||||
|
||||
let _ = panic::catch_unwind(|| Regex::new("a").unwrap());
|
||||
}
|
||||
|
||||
@@ -15,26 +15,26 @@ macro_rules! regex_new {
|
||||
($re:expr) => {{
|
||||
use regex::bytes::Regex;
|
||||
Regex::new($re)
|
||||
}}
|
||||
}};
|
||||
}
|
||||
|
||||
macro_rules! regex_set_new {
|
||||
($res:expr) => {{
|
||||
use regex::bytes::RegexSet;
|
||||
RegexSet::new($res)
|
||||
}}
|
||||
}};
|
||||
}
|
||||
|
||||
macro_rules! regex {
|
||||
($re:expr) => {
|
||||
regex_new!($re).unwrap()
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
macro_rules! regex_set {
|
||||
($res:expr) => {
|
||||
regex_set_new!($res).unwrap()
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
// Must come before other module definitions.
|
||||
@@ -43,7 +43,11 @@ include!("macros.rs");
|
||||
|
||||
// A silly wrapper to make it possible to write and match raw bytes.
|
||||
struct R<'a>(&'a [u8]);
|
||||
impl<'a> R<'a> { fn as_bytes(&self) -> &'a [u8] { self.0 } }
|
||||
impl<'a> R<'a> {
|
||||
fn as_bytes(&self) -> &'a [u8] {
|
||||
self.0
|
||||
}
|
||||
}
|
||||
|
||||
// See: https://github.com/rust-lang/regex/issues/321
|
||||
//
|
||||
@@ -51,10 +55,18 @@ impl<'a> R<'a> { fn as_bytes(&self) -> &'a [u8] { self.0 } }
|
||||
// regex engine.
|
||||
mat!(invalid_utf8_nfa1, r".", R(b"\xD4\xC2\x65\x2B\x0E\xFE"), Some((2, 3)));
|
||||
mat!(invalid_utf8_nfa2, r"${2}ä", R(b"\xD4\xC2\x65\x2B\x0E\xFE"), None);
|
||||
mat!(invalid_utf8_nfa3, r".", R(b"\x0A\xDB\x82\x6E\x33\x01\xDD\x33\xCD"),
|
||||
Some((1, 3)));
|
||||
mat!(invalid_utf8_nfa4, r"${2}ä", R(b"\x0A\xDB\x82\x6E\x33\x01\xDD\x33\xCD"),
|
||||
None);
|
||||
mat!(
|
||||
invalid_utf8_nfa3,
|
||||
r".",
|
||||
R(b"\x0A\xDB\x82\x6E\x33\x01\xDD\x33\xCD"),
|
||||
Some((1, 3))
|
||||
);
|
||||
mat!(
|
||||
invalid_utf8_nfa4,
|
||||
r"${2}ä",
|
||||
R(b"\x0A\xDB\x82\x6E\x33\x01\xDD\x33\xCD"),
|
||||
None
|
||||
);
|
||||
|
||||
mod api;
|
||||
mod bytes;
|
||||
|
||||
+4
-4
@@ -17,26 +17,26 @@ macro_rules! regex_new {
|
||||
($re:expr) => {{
|
||||
use regex::internal::ExecBuilder;
|
||||
ExecBuilder::new($re).nfa().build().map(|e| e.into_regex())
|
||||
}}
|
||||
}};
|
||||
}
|
||||
|
||||
macro_rules! regex {
|
||||
($re:expr) => {
|
||||
regex_new!($re).unwrap()
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
macro_rules! regex_set_new {
|
||||
($re:expr) => {{
|
||||
use regex::internal::ExecBuilder;
|
||||
ExecBuilder::new_many($re).nfa().build().map(|e| e.into_regex_set())
|
||||
}}
|
||||
}};
|
||||
}
|
||||
|
||||
macro_rules! regex_set {
|
||||
($res:expr) => {
|
||||
regex_set_new!($res).unwrap()
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
// Must come before other module definitions.
|
||||
|
||||
@@ -19,13 +19,13 @@ macro_rules! regex_new {
|
||||
.only_utf8(false)
|
||||
.build()
|
||||
.map(|e| e.into_byte_regex())
|
||||
}}
|
||||
}};
|
||||
}
|
||||
|
||||
macro_rules! regex {
|
||||
($re:expr) => {
|
||||
regex_new!($re).unwrap()
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
macro_rules! regex_set_new {
|
||||
@@ -36,13 +36,13 @@ macro_rules! regex_set_new {
|
||||
.only_utf8(false)
|
||||
.build()
|
||||
.map(|e| e.into_byte_regex_set())
|
||||
}}
|
||||
}};
|
||||
}
|
||||
|
||||
macro_rules! regex_set {
|
||||
($res:expr) => {
|
||||
regex_set_new!($res).unwrap()
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
// Must come before other module definitions.
|
||||
|
||||
@@ -17,27 +17,30 @@ macro_rules! regex_new {
|
||||
($re:expr) => {{
|
||||
use regex::internal::ExecBuilder;
|
||||
ExecBuilder::new($re).nfa().bytes(true).build().map(|e| e.into_regex())
|
||||
}}
|
||||
}};
|
||||
}
|
||||
|
||||
macro_rules! regex {
|
||||
($re:expr) => {
|
||||
regex_new!($re).unwrap()
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
macro_rules! regex_set_new {
|
||||
($re:expr) => {{
|
||||
use regex::internal::ExecBuilder;
|
||||
ExecBuilder::new_many($re)
|
||||
.nfa().bytes(true).build().map(|e| e.into_regex_set())
|
||||
}}
|
||||
.nfa()
|
||||
.bytes(true)
|
||||
.build()
|
||||
.map(|e| e.into_regex_set())
|
||||
}};
|
||||
}
|
||||
|
||||
macro_rules! regex_set {
|
||||
($res:expr) => {
|
||||
regex_set_new!($res).unwrap()
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
// Must come before other module definitions.
|
||||
|
||||
+186
-92
@@ -34,106 +34,200 @@ mat!(uni_not_boundary_ogham, r"\d\B", "6 ", None);
|
||||
//
|
||||
// We should test more, but there's a lot. Write a script to generate more of
|
||||
// these tests.
|
||||
mat!(uni_class_gencat_cased_letter,
|
||||
r"\p{Cased_Letter}", "A", Some((0, 3)));
|
||||
mat!(uni_class_gencat_close_punctuation,
|
||||
r"\p{Close_Punctuation}", "❯", Some((0, 3)));
|
||||
mat!(uni_class_gencat_connector_punctuation,
|
||||
r"\p{Connector_Punctuation}", "⁀", Some((0, 3)));
|
||||
mat!(uni_class_gencat_control,
|
||||
r"\p{Control}", "\u{9f}", Some((0, 2)));
|
||||
mat!(uni_class_gencat_currency_symbol,
|
||||
r"\p{Currency_Symbol}", "£", Some((0, 3)));
|
||||
mat!(uni_class_gencat_dash_punctuation,
|
||||
r"\p{Dash_Punctuation}", "〰", Some((0, 3)));
|
||||
mat!(uni_class_gencat_decimal_numer,
|
||||
r"\p{Decimal_Number}", "𑓙", Some((0, 4)));
|
||||
mat!(uni_class_gencat_enclosing_mark,
|
||||
r"\p{Enclosing_Mark}", "\u{A672}", Some((0, 3)));
|
||||
mat!(uni_class_gencat_final_punctuation,
|
||||
r"\p{Final_Punctuation}", "⸡", Some((0, 3)));
|
||||
mat!(uni_class_gencat_format,
|
||||
r"\p{Format}", "\u{E007F}", Some((0, 4)));
|
||||
mat!(uni_class_gencat_initial_punctuation,
|
||||
r"\p{Initial_Punctuation}", "⸜", Some((0, 3)));
|
||||
mat!(uni_class_gencat_letter,
|
||||
r"\p{Letter}", "Έ", Some((0, 2)));
|
||||
mat!(uni_class_gencat_letter_number,
|
||||
r"\p{Letter_Number}", "ↂ", Some((0, 3)));
|
||||
mat!(uni_class_gencat_line_separator,
|
||||
r"\p{Line_Separator}", "\u{2028}", Some((0, 3)));
|
||||
mat!(uni_class_gencat_lowercase_letter,
|
||||
r"\p{Lowercase_Letter}", "ϛ", Some((0, 2)));
|
||||
mat!(uni_class_gencat_mark,
|
||||
r"\p{Mark}", "\u{E01EF}", Some((0, 4)));
|
||||
mat!(uni_class_gencat_math,
|
||||
r"\p{Math}", "⋿", Some((0, 3)));
|
||||
mat!(uni_class_gencat_modifier_letter,
|
||||
r"\p{Modifier_Letter}", "𖭃", Some((0, 4)));
|
||||
mat!(uni_class_gencat_modifier_symbol,
|
||||
r"\p{Modifier_Symbol}", "🏿", Some((0, 4)));
|
||||
mat!(uni_class_gencat_nonspacing_mark,
|
||||
r"\p{Nonspacing_Mark}", "\u{1E94A}", Some((0, 4)));
|
||||
mat!(uni_class_gencat_number,
|
||||
r"\p{Number}", "⓿", Some((0, 3)));
|
||||
mat!(uni_class_gencat_open_punctuation,
|
||||
r"\p{Open_Punctuation}", "⦅", Some((0, 3)));
|
||||
mat!(uni_class_gencat_other,
|
||||
r"\p{Other}", "\u{bc9}", Some((0, 3)));
|
||||
mat!(uni_class_gencat_other_letter,
|
||||
r"\p{Other_Letter}", "ꓷ", Some((0, 3)));
|
||||
mat!(uni_class_gencat_other_number,
|
||||
r"\p{Other_Number}", "㉏", Some((0, 3)));
|
||||
mat!(uni_class_gencat_other_punctuation,
|
||||
r"\p{Other_Punctuation}", "𞥞", Some((0, 4)));
|
||||
mat!(uni_class_gencat_other_symbol,
|
||||
r"\p{Other_Symbol}", "⅌", Some((0, 3)));
|
||||
mat!(uni_class_gencat_paragraph_separator,
|
||||
r"\p{Paragraph_Separator}", "\u{2029}", Some((0, 3)));
|
||||
mat!(uni_class_gencat_private_use,
|
||||
r"\p{Private_Use}", "\u{10FFFD}", Some((0, 4)));
|
||||
mat!(uni_class_gencat_punctuation,
|
||||
r"\p{Punctuation}", "𑁍", Some((0, 4)));
|
||||
mat!(uni_class_gencat_separator,
|
||||
r"\p{Separator}", "\u{3000}", Some((0, 3)));
|
||||
mat!(uni_class_gencat_space_separator,
|
||||
r"\p{Space_Separator}", "\u{205F}", Some((0, 3)));
|
||||
mat!(uni_class_gencat_spacing_mark,
|
||||
r"\p{Spacing_Mark}", "\u{16F7E}", Some((0, 4)));
|
||||
mat!(uni_class_gencat_symbol,
|
||||
r"\p{Symbol}", "⯈", Some((0, 3)));
|
||||
mat!(uni_class_gencat_titlecase_letter,
|
||||
r"\p{Titlecase_Letter}", "ῼ", Some((0, 3)));
|
||||
mat!(uni_class_gencat_unassigned,
|
||||
r"\p{Unassigned}", "\u{10FFFF}", Some((0, 4)));
|
||||
mat!(uni_class_gencat_uppercase_letter,
|
||||
r"\p{Uppercase_Letter}", "Ꝋ", Some((0, 3)));
|
||||
mat!(uni_class_gencat_cased_letter, r"\p{Cased_Letter}", "A", Some((0, 3)));
|
||||
mat!(
|
||||
uni_class_gencat_close_punctuation,
|
||||
r"\p{Close_Punctuation}",
|
||||
"❯",
|
||||
Some((0, 3))
|
||||
);
|
||||
mat!(
|
||||
uni_class_gencat_connector_punctuation,
|
||||
r"\p{Connector_Punctuation}",
|
||||
"⁀",
|
||||
Some((0, 3))
|
||||
);
|
||||
mat!(uni_class_gencat_control, r"\p{Control}", "\u{9f}", Some((0, 2)));
|
||||
mat!(
|
||||
uni_class_gencat_currency_symbol,
|
||||
r"\p{Currency_Symbol}",
|
||||
"£",
|
||||
Some((0, 3))
|
||||
);
|
||||
mat!(
|
||||
uni_class_gencat_dash_punctuation,
|
||||
r"\p{Dash_Punctuation}",
|
||||
"〰",
|
||||
Some((0, 3))
|
||||
);
|
||||
mat!(
|
||||
uni_class_gencat_decimal_numer,
|
||||
r"\p{Decimal_Number}",
|
||||
"𑓙",
|
||||
Some((0, 4))
|
||||
);
|
||||
mat!(
|
||||
uni_class_gencat_enclosing_mark,
|
||||
r"\p{Enclosing_Mark}",
|
||||
"\u{A672}",
|
||||
Some((0, 3))
|
||||
);
|
||||
mat!(
|
||||
uni_class_gencat_final_punctuation,
|
||||
r"\p{Final_Punctuation}",
|
||||
"⸡",
|
||||
Some((0, 3))
|
||||
);
|
||||
mat!(uni_class_gencat_format, r"\p{Format}", "\u{E007F}", Some((0, 4)));
|
||||
mat!(
|
||||
uni_class_gencat_initial_punctuation,
|
||||
r"\p{Initial_Punctuation}",
|
||||
"⸜",
|
||||
Some((0, 3))
|
||||
);
|
||||
mat!(uni_class_gencat_letter, r"\p{Letter}", "Έ", Some((0, 2)));
|
||||
mat!(
|
||||
uni_class_gencat_letter_number,
|
||||
r"\p{Letter_Number}",
|
||||
"ↂ",
|
||||
Some((0, 3))
|
||||
);
|
||||
mat!(
|
||||
uni_class_gencat_line_separator,
|
||||
r"\p{Line_Separator}",
|
||||
"\u{2028}",
|
||||
Some((0, 3))
|
||||
);
|
||||
mat!(
|
||||
uni_class_gencat_lowercase_letter,
|
||||
r"\p{Lowercase_Letter}",
|
||||
"ϛ",
|
||||
Some((0, 2))
|
||||
);
|
||||
mat!(uni_class_gencat_mark, r"\p{Mark}", "\u{E01EF}", Some((0, 4)));
|
||||
mat!(uni_class_gencat_math, r"\p{Math}", "⋿", Some((0, 3)));
|
||||
mat!(
|
||||
uni_class_gencat_modifier_letter,
|
||||
r"\p{Modifier_Letter}",
|
||||
"𖭃",
|
||||
Some((0, 4))
|
||||
);
|
||||
mat!(
|
||||
uni_class_gencat_modifier_symbol,
|
||||
r"\p{Modifier_Symbol}",
|
||||
"🏿",
|
||||
Some((0, 4))
|
||||
);
|
||||
mat!(
|
||||
uni_class_gencat_nonspacing_mark,
|
||||
r"\p{Nonspacing_Mark}",
|
||||
"\u{1E94A}",
|
||||
Some((0, 4))
|
||||
);
|
||||
mat!(uni_class_gencat_number, r"\p{Number}", "⓿", Some((0, 3)));
|
||||
mat!(
|
||||
uni_class_gencat_open_punctuation,
|
||||
r"\p{Open_Punctuation}",
|
||||
"⦅",
|
||||
Some((0, 3))
|
||||
);
|
||||
mat!(uni_class_gencat_other, r"\p{Other}", "\u{bc9}", Some((0, 3)));
|
||||
mat!(uni_class_gencat_other_letter, r"\p{Other_Letter}", "ꓷ", Some((0, 3)));
|
||||
mat!(uni_class_gencat_other_number, r"\p{Other_Number}", "㉏", Some((0, 3)));
|
||||
mat!(
|
||||
uni_class_gencat_other_punctuation,
|
||||
r"\p{Other_Punctuation}",
|
||||
"𞥞",
|
||||
Some((0, 4))
|
||||
);
|
||||
mat!(uni_class_gencat_other_symbol, r"\p{Other_Symbol}", "⅌", Some((0, 3)));
|
||||
mat!(
|
||||
uni_class_gencat_paragraph_separator,
|
||||
r"\p{Paragraph_Separator}",
|
||||
"\u{2029}",
|
||||
Some((0, 3))
|
||||
);
|
||||
mat!(
|
||||
uni_class_gencat_private_use,
|
||||
r"\p{Private_Use}",
|
||||
"\u{10FFFD}",
|
||||
Some((0, 4))
|
||||
);
|
||||
mat!(uni_class_gencat_punctuation, r"\p{Punctuation}", "𑁍", Some((0, 4)));
|
||||
mat!(uni_class_gencat_separator, r"\p{Separator}", "\u{3000}", Some((0, 3)));
|
||||
mat!(
|
||||
uni_class_gencat_space_separator,
|
||||
r"\p{Space_Separator}",
|
||||
"\u{205F}",
|
||||
Some((0, 3))
|
||||
);
|
||||
mat!(
|
||||
uni_class_gencat_spacing_mark,
|
||||
r"\p{Spacing_Mark}",
|
||||
"\u{16F7E}",
|
||||
Some((0, 4))
|
||||
);
|
||||
mat!(uni_class_gencat_symbol, r"\p{Symbol}", "⯈", Some((0, 3)));
|
||||
mat!(
|
||||
uni_class_gencat_titlecase_letter,
|
||||
r"\p{Titlecase_Letter}",
|
||||
"ῼ",
|
||||
Some((0, 3))
|
||||
);
|
||||
mat!(
|
||||
uni_class_gencat_unassigned,
|
||||
r"\p{Unassigned}",
|
||||
"\u{10FFFF}",
|
||||
Some((0, 4))
|
||||
);
|
||||
mat!(
|
||||
uni_class_gencat_uppercase_letter,
|
||||
r"\p{Uppercase_Letter}",
|
||||
"Ꝋ",
|
||||
Some((0, 3))
|
||||
);
|
||||
|
||||
// Test a smattering of properties.
|
||||
mat!(uni_class_prop_emoji1, r"\p{Emoji}", "\u{23E9}", Some((0, 3)));
|
||||
mat!(uni_class_prop_emoji2, r"\p{emoji}", "\u{1F21A}", Some((0, 4)));
|
||||
mat!(uni_class_prop_picto1,
|
||||
r"\p{extendedpictographic}", "\u{1FA6E}", Some((0, 4)));
|
||||
mat!(uni_class_prop_picto2,
|
||||
r"\p{extendedpictographic}", "\u{1FFFD}", Some((0, 4)));
|
||||
mat!(
|
||||
uni_class_prop_picto1,
|
||||
r"\p{extendedpictographic}",
|
||||
"\u{1FA6E}",
|
||||
Some((0, 4))
|
||||
);
|
||||
mat!(
|
||||
uni_class_prop_picto2,
|
||||
r"\p{extendedpictographic}",
|
||||
"\u{1FFFD}",
|
||||
Some((0, 4))
|
||||
);
|
||||
|
||||
// grapheme_cluster_break
|
||||
mat!(uni_class_gcb_prepend,
|
||||
r"\p{grapheme_cluster_break=prepend}", "\u{11D46}", Some((0, 4)));
|
||||
mat!(uni_class_gcb_ri1,
|
||||
r"\p{gcb=regional_indicator}", "\u{1F1E6}", Some((0, 4)));
|
||||
mat!(uni_class_gcb_ri2,
|
||||
r"\p{gcb=ri}", "\u{1F1E7}", Some((0, 4)));
|
||||
mat!(uni_class_gcb_ri3,
|
||||
r"\p{gcb=regionalindicator}", "\u{1F1FF}", Some((0, 4)));
|
||||
mat!(uni_class_gcb_lvt,
|
||||
r"\p{gcb=lvt}", "\u{C989}", Some((0, 3)));
|
||||
mat!(uni_class_gcb_zwj,
|
||||
r"\p{gcb=zwj}", "\u{200D}", Some((0, 3)));
|
||||
mat!(
|
||||
uni_class_gcb_prepend,
|
||||
r"\p{grapheme_cluster_break=prepend}",
|
||||
"\u{11D46}",
|
||||
Some((0, 4))
|
||||
);
|
||||
mat!(
|
||||
uni_class_gcb_ri1,
|
||||
r"\p{gcb=regional_indicator}",
|
||||
"\u{1F1E6}",
|
||||
Some((0, 4))
|
||||
);
|
||||
mat!(uni_class_gcb_ri2, r"\p{gcb=ri}", "\u{1F1E7}", Some((0, 4)));
|
||||
mat!(
|
||||
uni_class_gcb_ri3,
|
||||
r"\p{gcb=regionalindicator}",
|
||||
"\u{1F1FF}",
|
||||
Some((0, 4))
|
||||
);
|
||||
mat!(uni_class_gcb_lvt, r"\p{gcb=lvt}", "\u{C989}", Some((0, 3)));
|
||||
mat!(uni_class_gcb_zwj, r"\p{gcb=zwj}", "\u{200D}", Some((0, 3)));
|
||||
|
||||
// word_break
|
||||
mat!(uni_class_wb1,
|
||||
r"\p{word_break=Hebrew_Letter}", "\u{FB46}", Some((0, 3)));
|
||||
mat!(uni_class_wb1, r"\p{word_break=Hebrew_Letter}", "\u{FB46}", Some((0, 3)));
|
||||
mat!(uni_class_wb2, r"\p{wb=hebrewletter}", "\u{FB46}", Some((0, 3)));
|
||||
mat!(uni_class_wb3, r"\p{wb=ExtendNumLet}", "\u{FF3F}", Some((0, 3)));
|
||||
mat!(uni_class_wb4, r"\p{wb=WSegSpace}", "\u{3000}", Some((0, 3)));
|
||||
|
||||
Reference in New Issue
Block a user