diff --git a/.travis.yml b/.travis.yml index 1258356..6b8519f 100644 --- a/.travis.yml +++ b/.travis.yml @@ -5,7 +5,7 @@ os: - windows rust: - - 1.27.0 + - 1.31.0 - nightly - beta - stable diff --git a/Cargo.toml b/Cargo.toml index 74921cd..7f5826c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,6 +14,10 @@ repository = "https://github.com/dguo/strsim-rs" documentation = "https://docs.rs/strsim/" exclude = ["/.travis.yml", "/appveyor.yml", "/dev"] +[dependencies] +hashbrown = "0.1.8" +ndarray = "0.12.1" + [badges] travis-ci = { repository = "dguo/strsim-rs" } appveyor = { repository = "dguo/strsim-rs" } diff --git a/src/lib.rs b/src/lib.rs index 41c5789..e49531a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,13 +1,18 @@ //! This library implements string similarity metrics. +extern crate hashbrown; +extern crate ndarray; + use std::char; use std::cmp::{max, min}; -use std::collections::HashMap; use std::error::Error; use std::fmt::{self, Display, Formatter}; use std::hash::Hash; use std::str::Chars; +use hashbrown::HashMap; +use ndarray::Array2; + #[derive(Debug, PartialEq)] pub enum StrSimError { DifferentLengthArgs, @@ -103,7 +108,7 @@ pub fn generic_jaro<'a, 'b, Iter1, Iter2, Elem1, Elem2>(a: &'a Iter1, b: &'b Ite for (j, b_elem) in b.into_iter().enumerate() { if min_bound <= j && j <= max_bound && a_elem == b_elem && - !b_consumed[j] { + !b_consumed[j] { b_consumed[j] = true; matches += 1.0; @@ -121,8 +126,8 @@ pub fn generic_jaro<'a, 'b, Iter1, Iter2, Elem1, Elem2>(a: &'a Iter1, b: &'b Ite 0.0 } else { (1.0 / 3.0) * ((matches / a_len as f64) + - (matches / b_len as f64) + - ((matches - transpositions) / matches)) + (matches / b_len as f64) + + ((matches - transpositions) / matches)) } } @@ -161,7 +166,7 @@ pub fn generic_jaro_winkler<'a, 'b, Iter1, Iter2, Elem1, Elem2>(a: &'a Iter1, b: let prefix_length = a.into_iter() .zip(b.into_iter()) .take_while(|&(ref a_elem, ref b_elem)| a_elem == b_elem) - .count(); + .count(); let jaro_winkler_distance = jaro_distance + (0.1 * prefix_length as f64 * (1.0 - jaro_distance)); @@ -197,25 +202,21 @@ pub fn generic_levenshtein<'a, 'b, Iter1, Iter2, Elem1, Elem2>(a: &'a Iter1, b: where &'a Iter1: IntoIterator, &'b Iter2: IntoIterator, Elem1: PartialEq { - let a_len = a.into_iter().count(); let b_len = b.into_iter().count(); - if a_len == 0 { return b_len; } - if b_len == 0 { return a_len; } + if a.into_iter().next().is_none() { return b_len; } let mut cache: Vec = (1..b_len+1).collect(); let mut result = 0; - let mut distance_a; - let mut distance_b; for (i, a_elem) in a.into_iter().enumerate() { - result = i; - distance_b = i; + result = i + 1; + let mut distance_b = i; for (j, b_elem) in b.into_iter().enumerate() { - let cost = if a_elem == b_elem { 0 } else { 1 }; - distance_a = distance_b + cost; + let cost = if a_elem == b_elem { 0usize } else { 1usize }; + let distance_a = distance_b + cost; distance_b = cache[j]; result = min(result + 1, min(distance_a, distance_b + 1)); cache[j] = result; @@ -293,7 +294,7 @@ pub fn osa_distance(a: &str, b: &str) -> usize { min(prev_distances[j + 1] + 1, prev_distances[j] + cost)); if i > 0 && j > 0 && a_char != b_char && - a_char == prev_b_char && b_char == prev_a_char { + a_char == prev_b_char && b_char == prev_a_char { curr_distances[j + 1] = min(curr_distances[j + 1], prev_two_distances[j - 1] + 1); } @@ -326,55 +327,53 @@ pub fn generic_damerau_levenshtein(a_elems: &[Elem], b_elems: &[Elem]) -> if a_len == 0 { return b_len; } if b_len == 0 { return a_len; } - let mut distances = vec![vec![0; b_len + 2]; a_len + 2]; + let mut distances = Array2::::zeros((a_len + 2, b_len + 2)); let max_distance = a_len + b_len; - distances[0][0] = max_distance; + distances[[0, 0]] = max_distance; for i in 0..(a_len + 1) { - distances[i + 1][0] = max_distance; - distances[i + 1][1] = i; + distances[[i + 1, 0]] = max_distance; + distances[[i + 1, 1]] = i; } for j in 0..(b_len + 1) { - distances[0][j + 1] = max_distance; - distances[1][j + 1] = j; + distances[[0, j + 1]] = max_distance; + distances[[1, j + 1]] = j; } - let mut elems: HashMap = HashMap::new(); + let mut elems: HashMap = HashMap::with_capacity(64); for i in 1..(a_len + 1) { let mut db = 0; for j in 1..(b_len + 1) { let k = match elems.get(&b_elems[j - 1]) { - Some(value) => value.clone(), + Some(&value) => value, None => 0 }; - let l = db; - let mut cost = 1; + let insertion_cost = distances[[i, j + 1]] + 1; + let deletion_cost = distances[[i + 1, j]] + 1; + let transposition_cost = distances[[k, db]] + (i - k - 1) + 1 + + (j - db - 1); + + let mut substitution_cost = distances[[i, j]] + 1; if a_elems[i - 1] == b_elems[j - 1] { - cost = 0; db = j; + substitution_cost -= 1; } - let substitution_cost = distances[i][j] + cost; - let insertion_cost = distances[i][j + 1] + 1; - let deletion_cost = distances[i + 1][j] + 1; - let transposition_cost = distances[k][l] + (i - k - 1) + 1 + - (j - l - 1); - - distances[i + 1][j + 1] = min(substitution_cost, - min(insertion_cost, - min(deletion_cost, - transposition_cost))); + distances[[i + 1, j + 1]] = min(substitution_cost, + min(insertion_cost, + min(deletion_cost, + transposition_cost))); } elems.insert(a_elems[i - 1].clone(), i); } - distances[a_len + 1][b_len + 1] + distances[[a_len + 1, b_len + 1]] } /// Like optimal string alignment, but substrings can be edited an unlimited @@ -457,7 +456,7 @@ mod tests { #[test] fn jaro_both_empty() { - assert_eq!(1.0, jaro("", "")); + assert_eq!(1.0, jaro("", "")); } #[test] @@ -545,9 +544,9 @@ mod tests { #[test] fn jaro_winkler_multibyte() { assert!((0.89 - jaro_winkler("testabctest", "testöঙ香test")).abs() < - 0.001); + 0.001); assert!((0.89 - jaro_winkler("testöঙ香test", "testabctest")).abs() < - 0.001); + 0.001); } #[test] @@ -580,7 +579,7 @@ mod tests { #[test] fn jaro_winkler_long_prefix() { assert!((0.911 - jaro_winkler("cheeseburger", "cheese fries")).abs() < - 0.001); + 0.001); } #[test] @@ -597,7 +596,7 @@ mod tests { fn jaro_winkler_very_long_prefix() { assert!((1.0 - jaro_winkler("thequickbrownfoxjumpedoverx", "thequickbrownfoxjumpedovery")).abs() < - 0.001); + 0.001); } #[test]