From 34c0dd1fe4f1efe3f9c6eb7296fe7bd6f673c6c0 Mon Sep 17 00:00:00 2001
From: David Tolnay
Date: Thu, 3 Nov 2022 23:05:57 -0700
Subject: [PATCH] Replace ucd-parse dependency

---
 generate/Cargo.toml   |  1 -
 generate/src/parse.rs | 44 +++++++++++++++++++++++++++++++++++--------
 2 files changed, 36 insertions(+), 9 deletions(-)

diff --git a/generate/Cargo.toml b/generate/Cargo.toml
index f7e8455..5d7f92f 100644
--- a/generate/Cargo.toml
+++ b/generate/Cargo.toml
@@ -7,4 +7,3 @@ publish = false
 
 [dependencies]
 anyhow = "1"
-ucd-parse = "0.1.10"
diff --git a/generate/src/parse.rs b/generate/src/parse.rs
index 39109aa..6a3b9b7 100644
--- a/generate/src/parse.rs
+++ b/generate/src/parse.rs
@@ -1,7 +1,7 @@
-use anyhow::Result;
+use anyhow::{bail, Result};
 use std::collections::BTreeSet as Set;
+use std::fs;
 use std::path::Path;
-use ucd_parse::CoreProperty;
 
 pub struct Properties {
     xid_start: Set<u32>,
@@ -24,17 +24,45 @@ pub fn parse_xid_properties(ucd_dir: &Path) -> Result<Properties> {
         xid_continue: Set::new(),
     };
 
-    let prop_list: Vec<CoreProperty> = ucd_parse::parse(ucd_dir)?;
-    for core in prop_list {
-        let set = match core.property.as_str() {
+    let filename = "DerivedCoreProperties.txt";
+    let path = ucd_dir.join(filename);
+    let contents = fs::read_to_string(path)?;
+    for (i, line) in contents.lines().enumerate() {
+        if line.starts_with('#') || line.trim().is_empty() {
+            continue;
+        }
+        let (lo, hi, name) = match parse_line(line) {
+            Some(line) => line,
+            None => bail!("{} line {} is unexpected:\n{}", filename, i, line),
+        };
+        let set = match name {
             "XID_Start" => &mut properties.xid_start,
             "XID_Continue" => &mut properties.xid_continue,
             _ => continue,
         };
-        for codepoint in core.codepoints {
-            set.insert(codepoint.value());
-        }
+        set.extend(lo..=hi);
     }
 
     Ok(properties)
 }
+
+fn parse_line(line: &str) -> Option<(u32, u32, &str)> {
+    let (mut codepoint, rest) = line.split_once(';')?;
+
+    let (lo, hi);
+    codepoint = codepoint.trim();
+    if let Some((a, b)) = codepoint.split_once("..") {
+        lo = parse_codepoint(a)?;
+        hi = parse_codepoint(b)?;
+    } else {
+        lo = parse_codepoint(codepoint)?;
+        hi = lo;
+    }
+
+    let name = rest.trim().split('#').next()?.trim_end();
+    Some((lo, hi, name))
+}
+
+fn parse_codepoint(s: &str) -> Option<u32> {
+    u32::from_str_radix(s, 16).ok()
+}