Bug 1669162 - Update mapped_hyph to 0.4.2 so that .dic parse errors are non-fatal. r=heycam

Differential Revision: https://phabricator.services.mozilla.com/D92435
This commit is contained in:
Jonathan Kew 2020-10-12 10:23:27 +00:00
parent 6259946a20
commit 1c54e5ffd7
9 changed files with 51 additions and 19 deletions

View File

@ -40,7 +40,7 @@ rev = "21c26326f5f45f415c49eac4ba5bc41a2f961321"
[source."https://github.com/jfkthame/mapped_hyph.git"]
git = "https://github.com/jfkthame/mapped_hyph.git"
replace-with = "vendored-sources"
tag = "v0.4.0"
tag = "v0.4.2"
[source."https://github.com/hsivonen/packed_simd"]
git = "https://github.com/hsivonen/packed_simd"

6
Cargo.lock generated
View File

@ -2890,10 +2890,12 @@ dependencies = [
[[package]]
name = "mapped_hyph"
version = "0.4.0"
source = "git+https://github.com/jfkthame/mapped_hyph.git?tag=v0.4.0#c7737af5ebe9b404c6b7eed6006785ea41337ca1"
version = "0.4.2"
source = "git+https://github.com/jfkthame/mapped_hyph.git?tag=v0.4.2#d0d2e862cea33c262d8173a2dddbe0b50fdd6775"
dependencies = [
"arrayref",
"env_logger 0.7.1",
"log",
"memmap",
]

0
third_party/rust/ahash/smhasher/clone_smhasher.sh vendored Normal file → Executable file
View File

View File

@ -1 +1 @@
{"files":{".travis.yml":"4d1af7257c9619f7ae66fc271ba2c1be5f063640ae8ceaa235c8c8aaf32f44ea","COPYRIGHT":"4df931055b82b96e13ad475c4cee3de5afa69a54a4c611c9d7dc6252d858d9c8","Cargo.toml":"16e0ee523f5c3b1bc1f85771d2102b8bb1cffa3bde3631293cfd441387e9f881","LICENSE-APACHE":"cfc7749b96f63bd31c3c42b5c471bf756814053e847c10f3eb003417bc523d30","LICENSE-MIT":"4ad721b5b6a3d39ca3e2202f403d897c4a1d42896486dd58963a81f8e64ef61d","README.md":"2c91137faee83f0805a9b9123e105670bf60c2fe45ce6536fb92df7ef85017a5","benches/bench.rs":"ed7143e66ecf8bfb12c87d1f9344157d97696b8194de9132d061129bc80d8d52","cbindgen.toml":"452e79bea00e2a0c16a03ac04e454a0c5955becf2d0306ccce7d1c13d3bcc51a","doc/mapped_hyph_format.md":"2f2487cf536fe4b03db6e4b384be06744ec30b3f299519492288306a93127fbb","hyph_en_US.hyf":"6262b4c5118fe277ab4add8689d9524ca72097564652baec67a8fcd5029ec9b0","src/bin/hyf_compile.rs":"04f8d4d9e47cbc1793d3b23a6cf840b37f3989d3817846ea0e45be3a08cafb29","src/builder.rs":"b6200c19ea24c1b3defbf3b6b4ded350b4d45e170a7b8798d9063c47cfd45cc3","src/ffi.rs":"652ad1b1f450af6afa4b04e3e3e73da1ada294d1c82eda117db87c9e0b9b73ac","src/lib.rs":"d9fc9daad71cda70570ed61538001d46ac204a62a72d4d4faa43be70c62d6faa","src/main.rs":"666befeb39cb1a7dfb66c6b9218d5f7b6c4ed09dbbbc8cfff6b749a33a99ebcf","tests/base.hyf":"d8bf57c6280cfa1d357d3fdba156ce64afbd9df58e28eeb084dfe3f80972b73f","tests/base.hyph":"a3f1fab24c101701fdf21e8359685d80611ab970304e2bd89ef024768b3700c8","tests/base.word":"1136c9a421b242262661b9a65723f87a5ecf77ae38eabcea057832d036d567fd","tests/compound.hyf":"929c1ba6676e4c43bc649d0abf4275ea9e8b02bffaa5acdf704a710813a7a13c","tests/compound4.hyf":"2093287bc41ee30ff9bdbf278f1f8209cb1d1a78236b46e9060af2a881572b8e","tests/compound5.hyf":"0942a5dfbb8d0ef3a937ab9da0418abb41300357cde49f4c477a59a11b2cb6bd","tests/compound6.hyf":"ebad958c2692a5b439b31e324020ed27c42dc05bd5b8c6a6dea4669e6ccf76b4","tests/hyphen.hyf":"92b8a5c86aac6a0b9f0eb7330a057065d6985fd047e851cae47039995c682d4d","tests/lhmin.hyf":"23c886704fafee7d9c54b2478029cf69a5fa946c2f2442bd86697bca5933c88d","tests/num.hyf":"4834fabe78b5c81815434d4562ce3322541649e1ea1edc555a498574bc8b237e","tests/rhmin.hyf":"239cb3d4d7f904abb43b57241e12cc1396e636220c3806e64666aca7ca46cc42","tests/settings2.hyf":"9fc4855e0b952a3593db1efef080b93ce7f1c6fe6798db0440e2bf0cc986ffa2","tests/settings3.hyf":"867db207b485a06e7d60ad10735c9111f10516ee3a5afd6306c683ace3454491","tests/test.rs":"5c81ae59b9384b70d9461407999dac1fde9214398876c4433fbbde9571cc1d94"},"package":null}
{"files":{".travis.yml":"4d1af7257c9619f7ae66fc271ba2c1be5f063640ae8ceaa235c8c8aaf32f44ea","COPYRIGHT":"4df931055b82b96e13ad475c4cee3de5afa69a54a4c611c9d7dc6252d858d9c8","Cargo.toml":"1ae148acc03da96f02dd7ed1b0c5757056df59f47af1cdb0ec261a1ca859637e","LICENSE-APACHE":"cfc7749b96f63bd31c3c42b5c471bf756814053e847c10f3eb003417bc523d30","LICENSE-MIT":"4ad721b5b6a3d39ca3e2202f403d897c4a1d42896486dd58963a81f8e64ef61d","README.md":"2c91137faee83f0805a9b9123e105670bf60c2fe45ce6536fb92df7ef85017a5","benches/bench.rs":"ed7143e66ecf8bfb12c87d1f9344157d97696b8194de9132d061129bc80d8d52","cbindgen.toml":"452e79bea00e2a0c16a03ac04e454a0c5955becf2d0306ccce7d1c13d3bcc51a","doc/mapped_hyph_format.md":"2f2487cf536fe4b03db6e4b384be06744ec30b3f299519492288306a93127fbb","hyph_en_US.hyf":"6262b4c5118fe277ab4add8689d9524ca72097564652baec67a8fcd5029ec9b0","src/bin/hyf_compile.rs":"69a1c9c9124d4c4d6e8bb2fe3946547a1395723b247f7f7234e1b60941f202bd","src/builder.rs":"4169a89fb3a5025b06edeb8a6435a18814d58799d15861c3639a2ed9c63c628b","src/ffi.rs":"652ad1b1f450af6afa4b04e3e3e73da1ada294d1c82eda117db87c9e0b9b73ac","src/lib.rs":"bfee464e22d4e13057a9eebe968847195c528b73c229047ef67dfd084c45f6b7","src/main.rs":"666befeb39cb1a7dfb66c6b9218d5f7b6c4ed09dbbbc8cfff6b749a33a99ebcf","tests/base.hyf":"d8bf57c6280cfa1d357d3fdba156ce64afbd9df58e28eeb084dfe3f80972b73f","tests/base.hyph":"a3f1fab24c101701fdf21e8359685d80611ab970304e2bd89ef024768b3700c8","tests/base.word":"1136c9a421b242262661b9a65723f87a5ecf77ae38eabcea057832d036d567fd","tests/compound.hyf":"929c1ba6676e4c43bc649d0abf4275ea9e8b02bffaa5acdf704a710813a7a13c","tests/compound4.hyf":"2093287bc41ee30ff9bdbf278f1f8209cb1d1a78236b46e9060af2a881572b8e","tests/compound5.hyf":"0942a5dfbb8d0ef3a937ab9da0418abb41300357cde49f4c477a59a11b2cb6bd","tests/compound6.hyf":"ebad958c2692a5b439b31e324020ed27c42dc05bd5b8c6a6dea4669e6ccf76b4","tests/hyphen.hyf":"92b8a5c86aac6a0b9f0eb7330a057065d6985fd047e851cae47039995c682d4d","tests/lhmin.hyf":"23c886704fafee7d9c54b2478029cf69a5fa946c2f2442bd86697bca5933c88d","tests/num.hyf":"4834fabe78b5c81815434d4562ce3322541649e1ea1edc555a498574bc8b237e","tests/rhmin.hyf":"239cb3d4d7f904abb43b57241e12cc1396e636220c3806e64666aca7ca46cc42","tests/settings2.hyf":"9fc4855e0b952a3593db1efef080b93ce7f1c6fe6798db0440e2bf0cc986ffa2","tests/settings3.hyf":"867db207b485a06e7d60ad10735c9111f10516ee3a5afd6306c683ace3454491","tests/test.rs":"5c81ae59b9384b70d9461407999dac1fde9214398876c4433fbbde9571cc1d94"},"package":null}

View File

@ -1,7 +1,7 @@
[package]
name = "mapped_hyph"
description = "Hyphenation using precompiled memory-mapped tables"
version = "0.4.0"
version = "0.4.2"
authors = ["Jonathan Kew <jfkthame@gmail.com>"]
license = "MIT/Apache-2.0"
edition = "2018"
@ -9,6 +9,8 @@ edition = "2018"
[dependencies]
memmap = "0.7.0"
arrayref = "0.3.5"
log = "0.4"
env_logger = "0.7.1"
[dev-dependencies]
criterion = "0.3"

View File

@ -8,11 +8,13 @@
// except according to those terms.
extern crate mapped_hyph;
extern crate env_logger;
use std::env;
use std::fs::File;
fn main() -> std::io::Result<()> {
env_logger::init();
let args: Vec<String> = env::args().collect();
if args.len() == 3 {
let in_file = File::open(&args[1])?;

View File

@ -132,7 +132,10 @@ impl LevelBuilder {
let mut got_digit = false;
for byte in bytes {
if *byte <= b'9' && *byte >= b'0' {
assert!(!got_digit, "invalid pattern \"{}\": consecutive digits", pattern);
if got_digit {
warn!("invalid pattern \"{}\": consecutive digits", pattern);
return;
}
digits.push(*byte);
got_digit = true;
} else {
@ -157,7 +160,10 @@ impl LevelBuilder {
// Convert repl_index and repl_cut from Unicode char to byte indexing.
let start = if text[0] == b'.' { 1 } else { 0 };
if start == 1 {
assert_eq!(digits[0], b'0', "unexpected digit before start of word");
if digits[0] != b'0' {
warn!("invalid pattern \"{}\": unexpected digit before start of word", pattern);
return;
}
digits.remove(0);
}
let word = std::str::from_utf8(&text[start..]).unwrap();
@ -171,7 +177,10 @@ impl LevelBuilder {
// (which should not already have a match_string).
let mut state_num = self.find_state_number_for(&text);
let mut state = &mut self.states[state_num as usize];
assert!(state.match_string.is_none(), "duplicate pattern?");
if state.match_string.is_some() {
warn!("duplicate pattern \"{}\" discarded", pattern);
return;
}
if !digits.is_empty() {
state.match_string = Some(digits);
}
@ -188,7 +197,7 @@ impl LevelBuilder {
text.truncate(text.len() - 1);
state_num = self.find_state_number_for(&text);
if let Some(exists) = self.states[state_num as usize].transitions.0.insert(ch, last_state) {
assert_eq!(exists, last_state, "overwriting existing transition?");
assert_eq!(exists, last_state, "overwriting existing transition at pattern \"{}\"", pattern);
break;
}
}
@ -349,7 +358,7 @@ impl LevelBuilder {
/// machine transitions, etc.
/// The returned Vec can be passed to write_hyf_file() to generate a flattened
/// representation of the state machine in mapped_hyph's binary format.
fn read_dic_file<T: Read>(dic_file: T, compress: bool) -> Vec<LevelBuilder> {
fn read_dic_file<T: Read>(dic_file: T, compress: bool) -> Result<Vec<LevelBuilder>, &'static str> {
let reader = BufReader::new(dic_file);
let mut builders = Vec::<LevelBuilder>::new();
@ -370,14 +379,19 @@ fn read_dic_file<T: Read>(dic_file: T, compress: bool) -> Vec<LevelBuilder> {
if trimmed.as_bytes()[0] >= b'A' && trimmed.as_bytes()[0] <= b'Z' {
// First line is encoding; we only support UTF-8.
if builder.encoding.is_none() {
assert_eq!(trimmed, "UTF-8", "Only UTF-8 patterns are accepted!");
if trimmed != "UTF-8" {
return Err("Only UTF-8 patterns are accepted!");
};
builder.encoding = Some(trimmed);
continue;
}
// Check for valid keyword-value pairs.
if trimmed.contains(' ') {
let parts: Vec<&str> = trimmed.split(' ').collect();
assert!(parts.len() == 2);
if parts.len() != 2 {
warn!("unrecognized keyword/values: {}", trimmed);
continue;
}
let keyword = parts[0];
let value = parts[1];
match keyword {
@ -386,7 +400,7 @@ fn read_dic_file<T: Read>(dic_file: T, compress: bool) -> Vec<LevelBuilder> {
"COMPOUNDLEFTHYPHENMIN" => builder.clh_min = value.parse::<u8>().unwrap(),
"COMPOUNDRIGHTHYPHENMIN" => builder.crh_min = value.parse::<u8>().unwrap(),
"NOHYPHEN" => builder.nohyphen = Some(trimmed),
_ => println!("unknown keyword: {}", trimmed),
_ => warn!("unknown keyword: {}", trimmed),
}
continue;
}
@ -396,11 +410,15 @@ fn read_dic_file<T: Read>(dic_file: T, compress: bool) -> Vec<LevelBuilder> {
builder = builders.last_mut().unwrap();
continue;
}
println!("unknown keyword: {}", trimmed);
warn!("unknown keyword: {}", trimmed);
continue;
}
// Patterns should always be provided in lowercase; complain if not, and discard
// the bad pattern.
if trimmed != trimmed.to_lowercase() {
warn!("pattern \"{}\" not lowercased at line {}", trimmed, index);
continue;
}
// Patterns should always be provided in lowercase; complain if not.
assert_eq!(trimmed, trimmed.to_lowercase(), "pattern \"{}\" not lowercased at line {}", trimmed, index);
builder.add_pattern(&trimmed);
}
@ -446,7 +464,7 @@ fn read_dic_file<T: Read>(dic_file: T, compress: bool) -> Vec<LevelBuilder> {
}
}
builders
Ok(builders)
}
/// Write out the state machines representing a set of hyphenation rules
@ -481,5 +499,11 @@ fn write_hyf_file<T: Write>(hyf_file: &mut T, levels: Vec<LevelBuilder>) -> std:
/// to `hyf_file`. The `compress` param determines whether extra processing to reduce the
/// size of the output is performed.
pub fn compile<T1: Read, T2: Write>(dic_file: T1, hyf_file: &mut T2, compress: bool) -> std::io::Result<()> {
write_hyf_file(hyf_file, read_dic_file(dic_file, compress))
match read_dic_file(dic_file, compress) {
Ok(dic) => write_hyf_file(hyf_file, dic),
Err(e) => {
warn!("parse error: {}", e);
return Err(Error::from(ErrorKind::InvalidData))
}
}
}

View File

@ -10,6 +10,8 @@
#[macro_use]
extern crate arrayref;
extern crate memmap;
#[macro_use]
extern crate log;
use std::slice;
use std::str;

View File

@ -46,7 +46,7 @@ mdns_service = { path="../../../../dom/media/webrtc/transport/mdns_service", opt
neqo_glue = { path = "../../../../netwerk/socket/neqo_glue" }
rlbox_lucet_sandbox = { version = "0.1.0", optional = true }
wgpu_bindings = { path = "../../../../gfx/wgpu_bindings", optional = true }
mapped_hyph = { git = "https://github.com/jfkthame/mapped_hyph.git", tag = "v0.4.0" }
mapped_hyph = { git = "https://github.com/jfkthame/mapped_hyph.git", tag = "v0.4.2" }
remote = { path = "../../../../remote", optional = true }
fog_control = { path = "../../../components/glean", optional = true }
app_services_logger = { path = "../../../../services/common/app_services_logger" }