diff --git a/LICENSE b/LICENSE index 1aacdb8..8d1fbe1 100644 --- a/LICENSE +++ b/LICENSE @@ -2,6 +2,7 @@ The MIT License (MIT) Copyright (c) 2015 Danny Guo Copyright (c) 2016 Titus Wormer +Copyright (c) 2018 Akash Kurdekar Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -20,4 +21,3 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - diff --git a/README.md b/README.md index 58dfaa8..23e50f8 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,7 @@ - [Optimal string alignment] - [Damerau-Levenshtein] - distance & normalized - [Jaro and Jaro-Winkler] - this implementation of Jaro-Winkler does not limit the common prefix length + - [Sørensen-Dice] The normalized versions return values between `0.0` and `1.0`, where `1.0` means an exact match. @@ -38,7 +39,7 @@ extern crate strsim; use strsim::{hamming, levenshtein, normalized_levenshtein, osa_distance, damerau_levenshtein, normalized_damerau_levenshtein, jaro, - jaro_winkler}; + jaro_winkler, sorensen_dice}; fn main() { match hamming("hamming", "hammers") { @@ -62,6 +63,9 @@ fn main() { assert!((jaro_winkler("cheeseburger", "cheese fries") - 0.911).abs() < 0.001); + + assert_eq!(sorensen_dice("web applications", "applications of the web"), + 0.7878787878787878); } ``` @@ -94,4 +98,5 @@ Benchmarks require a Nightly toolchain. Run `$ cargo +nightly bench`. [Levenshtein]:http://en.wikipedia.org/wiki/Levenshtein_distance [Hamming]:http://en.wikipedia.org/wiki/Hamming_distance [Optimal string alignment]:https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance#Optimal_string_alignment_distance +[Sørensen-Dice]:http://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient [Docker]:https://docs.docker.com/engine/installation/ diff --git a/benches/benches.rs b/benches/benches.rs index 8a2098a..b7ba829 100644 --- a/benches/benches.rs +++ b/benches/benches.rs @@ -88,4 +88,13 @@ mod benches { strsim::normalized_damerau_levenshtein(&a, &b); }) } + + #[bench] + fn bench_sorensen_dice(bencher: &mut Bencher) { + let a = "Philosopher Friedrich Nietzsche"; + let b = "Philosopher Jean-Paul Sartre"; + bencher.iter(|| { + strsim::sorensen_dice(&a, &b); + }) + } } diff --git a/src/lib.rs b/src/lib.rs index 0ddeeca..480a642 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -411,10 +411,88 @@ pub fn normalized_damerau_levenshtein(a: &str, b: &str) -> f64 { 1.0 - (damerau_levenshtein(a, b) as f64) / (a.chars().count().max(b.chars().count()) as f64) } +/// Returns an Iterator of char tuples. +fn bigrams(s: &str) -> impl Iterator + '_ { + s.chars().zip(s.chars().skip(1)) +} + + +/// Calculates a Sørensen-Dice similarity distance using bigrams. +/// See http://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient. +/// +/// ``` +/// use strsim::sorensen_dice; +/// +/// assert_eq!(1.0, sorensen_dice("", "")); +/// assert_eq!(0.0, sorensen_dice("", "a")); +/// assert_eq!(0.0, sorensen_dice("french", "quebec")); +/// assert_eq!(1.0, sorensen_dice("ferris", "ferris")); +/// assert_eq!(1.0, sorensen_dice("ferris", "ferris")); +/// assert_eq!(0.8888888888888888, sorensen_dice("feris", "ferris")); +/// ``` +pub fn sorensen_dice(a: &str, b: &str) -> f64 { + // implementation guided by + // https://github.com/aceakash/string-similarity/blob/f83ba3cd7bae874c20c429774e911ae8cff8bced/src/index.js#L6 + + let a: String = a.chars().filter(|&x| !char::is_whitespace(x)).collect(); + let b: String = b.chars().filter(|&x| !char::is_whitespace(x)).collect(); + + if a.len() == 0 && b.len() == 0 { + return 1.0; + } + + if a.len() == 0 || b.len() == 0 { + return 0.0; + } + + if a == b { + return 1.0; + } + + if a.len() == 1 && b.len() == 1 { + return 0.0; + } + + if a.len() < 2 || b.len() < 2 { + return 0.0; + } + + let mut a_bigrams: HashMap<(char, char), usize> = HashMap::new(); + + for bigram in bigrams(&a) { + *a_bigrams.entry(bigram).or_insert(0) += 1; + } + + let mut intersection_size = 0; + + for bigram in bigrams(&b) { + a_bigrams.entry(bigram).and_modify(|bi| { + if *bi > 0 { + *bi -= 1; + intersection_size += 1; + } + }); + } + + (2 * intersection_size) as f64 / (a.len() + b.len() - 2) as f64 +} + + #[cfg(test)] mod tests { use super::*; + #[test] + fn bigrams_iterator() { + let mut bi = bigrams("abcde"); + + assert_eq!(Some(('a', 'b')), bi.next()); + assert_eq!(Some(('b', 'c')), bi.next()); + assert_eq!(Some(('c', 'd')), bi.next()); + assert_eq!(Some(('d', 'e')), bi.next()); + assert_eq!(None, bi.next()); + } + fn assert_hamming_dist(dist: usize, str1: &str, str2: &str) { assert_eq!(Ok(dist), hamming(str1, str2)); } @@ -870,4 +948,58 @@ mod tests { fn normalized_damerau_levenshtein_identical_strings() { assert!((normalized_damerau_levenshtein("sunglasses", "sunglasses") - 1.0).abs() < 0.00001); } + + #[test] + fn sorensen_dice_all() { + // test cases taken from + // https://github.com/aceakash/string-similarity/blob/f83ba3cd7bae874c20c429774e911ae8cff8bced/src/spec/index.spec.js#L11 + + assert_eq!(1.0, sorensen_dice("a", "a")); + assert_eq!(0.0, sorensen_dice("a", "b")); + assert_eq!(1.0, sorensen_dice("", "")); + assert_eq!(0.0, sorensen_dice("a", "")); + assert_eq!(0.0, sorensen_dice("", "a")); + assert_eq!(1.0, sorensen_dice("apple event", "apple event")); + assert_eq!(0.9090909090909091, sorensen_dice("iphone", "iphone x")); + assert_eq!(0.0, sorensen_dice("french", "quebec")); + assert_eq!(1.0, sorensen_dice("france", "france")); + assert_eq!(0.2, sorensen_dice("fRaNce", "france")); + assert_eq!(0.8, sorensen_dice("healed", "sealed")); + assert_eq!( + 0.7878787878787878, + sorensen_dice("web applications", "applications of the web") + ); + assert_eq!( + 0.92, + sorensen_dice( + "this will have a typo somewhere", + "this will huve a typo somewhere" + ) + ); + assert_eq!( + 0.6060606060606061, + sorensen_dice( + "Olive-green table for sale, in extremely good condition.", + "For sale: table in very good condition, olive green in colour." + ) + ); + assert_eq!( + 0.2558139534883721, + sorensen_dice( + "Olive-green table for sale, in extremely good condition.", + "For sale: green Subaru Impreza, 210,000 miles" + ) + ); + assert_eq!( + 0.1411764705882353, + sorensen_dice( + "Olive-green table for sale, in extremely good condition.", + "Wanted: mountain bike with at least 21 gears." + ) + ); + assert_eq!( + 0.7741935483870968, + sorensen_dice("this has one extra word", "this has one word") + ); + } }