mirror of
https://gitee.com/openharmony/third_party_rust_strsim-rs
synced 2024-11-23 07:39:51 +00:00
add sorensen-dice distance function
This commit is contained in:
parent
74ab292905
commit
6f8a8cb49f
2
LICENSE
2
LICENSE
@ -2,6 +2,7 @@ The MIT License (MIT)
|
||||
|
||||
Copyright (c) 2015 Danny Guo
|
||||
Copyright (c) 2016 Titus Wormer <tituswormer@gmail.com>
|
||||
Copyright (c) 2018 Akash Kurdekar
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
@ -20,4 +21,3 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
|
||||
|
@ -11,6 +11,7 @@
|
||||
- [Optimal string alignment]
|
||||
- [Damerau-Levenshtein] - distance & normalized
|
||||
- [Jaro and Jaro-Winkler] - this implementation of Jaro-Winkler does not limit the common prefix length
|
||||
- [Sørensen-Dice]
|
||||
|
||||
The normalized versions return values between `0.0` and `1.0`, where `1.0` means
|
||||
an exact match.
|
||||
@ -38,7 +39,7 @@ extern crate strsim;
|
||||
|
||||
use strsim::{hamming, levenshtein, normalized_levenshtein, osa_distance,
|
||||
damerau_levenshtein, normalized_damerau_levenshtein, jaro,
|
||||
jaro_winkler};
|
||||
jaro_winkler, sorensen_dice};
|
||||
|
||||
fn main() {
|
||||
match hamming("hamming", "hammers") {
|
||||
@ -62,6 +63,9 @@ fn main() {
|
||||
|
||||
assert!((jaro_winkler("cheeseburger", "cheese fries") - 0.911).abs() <
|
||||
0.001);
|
||||
|
||||
assert_eq!(sorensen_dice("web applications", "applications of the web"),
|
||||
0.7878787878787878);
|
||||
}
|
||||
```
|
||||
|
||||
@ -94,4 +98,5 @@ Benchmarks require a Nightly toolchain. Run `$ cargo +nightly bench`.
|
||||
[Levenshtein]:http://en.wikipedia.org/wiki/Levenshtein_distance
|
||||
[Hamming]:http://en.wikipedia.org/wiki/Hamming_distance
|
||||
[Optimal string alignment]:https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance#Optimal_string_alignment_distance
|
||||
[Sørensen-Dice]:http://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient
|
||||
[Docker]:https://docs.docker.com/engine/installation/
|
||||
|
@ -88,4 +88,13 @@ mod benches {
|
||||
strsim::normalized_damerau_levenshtein(&a, &b);
|
||||
})
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_sorensen_dice(bencher: &mut Bencher) {
|
||||
let a = "Philosopher Friedrich Nietzsche";
|
||||
let b = "Philosopher Jean-Paul Sartre";
|
||||
bencher.iter(|| {
|
||||
strsim::sorensen_dice(&a, &b);
|
||||
})
|
||||
}
|
||||
}
|
||||
|
132
src/lib.rs
132
src/lib.rs
@ -411,10 +411,88 @@ pub fn normalized_damerau_levenshtein(a: &str, b: &str) -> f64 {
|
||||
1.0 - (damerau_levenshtein(a, b) as f64) / (a.chars().count().max(b.chars().count()) as f64)
|
||||
}
|
||||
|
||||
/// Returns an Iterator of char tuples.
|
||||
fn bigrams(s: &str) -> impl Iterator<Item=(char, char)> + '_ {
|
||||
s.chars().zip(s.chars().skip(1))
|
||||
}
|
||||
|
||||
|
||||
/// Calculates a Sørensen-Dice similarity distance using bigrams.
|
||||
/// See http://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient.
|
||||
///
|
||||
/// ```
|
||||
/// use strsim::sorensen_dice;
|
||||
///
|
||||
/// assert_eq!(1.0, sorensen_dice("", ""));
|
||||
/// assert_eq!(0.0, sorensen_dice("", "a"));
|
||||
/// assert_eq!(0.0, sorensen_dice("french", "quebec"));
|
||||
/// assert_eq!(1.0, sorensen_dice("ferris", "ferris"));
|
||||
/// assert_eq!(1.0, sorensen_dice("ferris", "ferris"));
|
||||
/// assert_eq!(0.8888888888888888, sorensen_dice("feris", "ferris"));
|
||||
/// ```
|
||||
pub fn sorensen_dice(a: &str, b: &str) -> f64 {
|
||||
// implementation guided by
|
||||
// https://github.com/aceakash/string-similarity/blob/f83ba3cd7bae874c20c429774e911ae8cff8bced/src/index.js#L6
|
||||
|
||||
let a: String = a.chars().filter(|&x| !char::is_whitespace(x)).collect();
|
||||
let b: String = b.chars().filter(|&x| !char::is_whitespace(x)).collect();
|
||||
|
||||
if a.len() == 0 && b.len() == 0 {
|
||||
return 1.0;
|
||||
}
|
||||
|
||||
if a.len() == 0 || b.len() == 0 {
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
if a == b {
|
||||
return 1.0;
|
||||
}
|
||||
|
||||
if a.len() == 1 && b.len() == 1 {
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
if a.len() < 2 || b.len() < 2 {
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
let mut a_bigrams: HashMap<(char, char), usize> = HashMap::new();
|
||||
|
||||
for bigram in bigrams(&a) {
|
||||
*a_bigrams.entry(bigram).or_insert(0) += 1;
|
||||
}
|
||||
|
||||
let mut intersection_size = 0;
|
||||
|
||||
for bigram in bigrams(&b) {
|
||||
a_bigrams.entry(bigram).and_modify(|bi| {
|
||||
if *bi > 0 {
|
||||
*bi -= 1;
|
||||
intersection_size += 1;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
(2 * intersection_size) as f64 / (a.len() + b.len() - 2) as f64
|
||||
}
|
||||
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn bigrams_iterator() {
|
||||
let mut bi = bigrams("abcde");
|
||||
|
||||
assert_eq!(Some(('a', 'b')), bi.next());
|
||||
assert_eq!(Some(('b', 'c')), bi.next());
|
||||
assert_eq!(Some(('c', 'd')), bi.next());
|
||||
assert_eq!(Some(('d', 'e')), bi.next());
|
||||
assert_eq!(None, bi.next());
|
||||
}
|
||||
|
||||
fn assert_hamming_dist(dist: usize, str1: &str, str2: &str) {
|
||||
assert_eq!(Ok(dist), hamming(str1, str2));
|
||||
}
|
||||
@ -870,4 +948,58 @@ mod tests {
|
||||
fn normalized_damerau_levenshtein_identical_strings() {
|
||||
assert!((normalized_damerau_levenshtein("sunglasses", "sunglasses") - 1.0).abs() < 0.00001);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn sorensen_dice_all() {
|
||||
// test cases taken from
|
||||
// https://github.com/aceakash/string-similarity/blob/f83ba3cd7bae874c20c429774e911ae8cff8bced/src/spec/index.spec.js#L11
|
||||
|
||||
assert_eq!(1.0, sorensen_dice("a", "a"));
|
||||
assert_eq!(0.0, sorensen_dice("a", "b"));
|
||||
assert_eq!(1.0, sorensen_dice("", ""));
|
||||
assert_eq!(0.0, sorensen_dice("a", ""));
|
||||
assert_eq!(0.0, sorensen_dice("", "a"));
|
||||
assert_eq!(1.0, sorensen_dice("apple event", "apple event"));
|
||||
assert_eq!(0.9090909090909091, sorensen_dice("iphone", "iphone x"));
|
||||
assert_eq!(0.0, sorensen_dice("french", "quebec"));
|
||||
assert_eq!(1.0, sorensen_dice("france", "france"));
|
||||
assert_eq!(0.2, sorensen_dice("fRaNce", "france"));
|
||||
assert_eq!(0.8, sorensen_dice("healed", "sealed"));
|
||||
assert_eq!(
|
||||
0.7878787878787878,
|
||||
sorensen_dice("web applications", "applications of the web")
|
||||
);
|
||||
assert_eq!(
|
||||
0.92,
|
||||
sorensen_dice(
|
||||
"this will have a typo somewhere",
|
||||
"this will huve a typo somewhere"
|
||||
)
|
||||
);
|
||||
assert_eq!(
|
||||
0.6060606060606061,
|
||||
sorensen_dice(
|
||||
"Olive-green table for sale, in extremely good condition.",
|
||||
"For sale: table in very good condition, olive green in colour."
|
||||
)
|
||||
);
|
||||
assert_eq!(
|
||||
0.2558139534883721,
|
||||
sorensen_dice(
|
||||
"Olive-green table for sale, in extremely good condition.",
|
||||
"For sale: green Subaru Impreza, 210,000 miles"
|
||||
)
|
||||
);
|
||||
assert_eq!(
|
||||
0.1411764705882353,
|
||||
sorensen_dice(
|
||||
"Olive-green table for sale, in extremely good condition.",
|
||||
"Wanted: mountain bike with at least 21 gears."
|
||||
)
|
||||
);
|
||||
assert_eq!(
|
||||
0.7741935483870968,
|
||||
sorensen_dice("this has one extra word", "this has one word")
|
||||
);
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user