Add normalized Levenshtein and Damerau-Levenshtein (#20)

* Add tests for 'normalized_levenshtein'

* Implement 'normalized_levenshtein'

* Implement 'normalized_damerau_levenshtein'

* Add benchmarking of new functions

* Move test cases from integration tests to unit tests

* Use 'is_empty' instead of 'len'

* Count chars instead of bytes

* Update Readme
This commit is contained in:
Viktor Lazarev 2018-08-19 23:30:28 +02:00 committed by Danny Guo
parent e178e6995f
commit ab17c1e9ab
4 changed files with 124 additions and 6 deletions

View File

@ -2,9 +2,9 @@
[Rust](https://www.rust-lang.org) implementations of [string similarity metrics]:
- [Hamming]
- [Levenshtein]
- [Levenshtein] - distance & normalized
- [Optimal string alignment]
- [Damerau-Levenshtein]
- [Damerau-Levenshtein] - distance & normalized
- [Jaro and Jaro-Winkler] - this implementation of Jaro-Winkler does not limit the common prefix length
### Installation
@ -23,8 +23,8 @@ version in the
```rust
extern crate strsim;
use strsim::{hamming, levenshtein, osa_distance, damerau_levenshtein, jaro,
jaro_winkler};
use strsim::{hamming, levenshtein, normalized_levenshtein, osa_distance, damerau_levenshtein,
normalized_damerau_levenshtein, jaro, jaro_winkler};
fn main() {
match hamming("hamming", "hammers") {
@ -34,10 +34,14 @@ fn main() {
assert_eq!(3, levenshtein("kitten", "sitting"));
assert!((normalized_levenshtein("kitten", "sitting") - 0.57142).abs() < 0.00001);
assert_eq!(3, osa_distance("ac", "cba"));
assert_eq!(2, damerau_levenshtein("ac", "cba"));
assert!((normalized_damerau_levenshtein("levenshtein", "löwenbräu") - 0.27272).abs() < 0.00001);
assert!((0.392 - jaro("Friedrich Nietzsche", "Jean-Paul Sartre")).abs() <
0.001);

View File

@ -46,6 +46,15 @@ mod benches {
})
}
#[bench]
fn bench_normalized_levenshtein(bencher: &mut Bencher) {
    // Fixed pair of phrases; the closure discards the score, so only the
    // call itself is timed.
    let (lhs, rhs) = (
        "Philosopher Friedrich Nietzsche",
        "Philosopher Jean-Paul Sartre",
    );
    bencher.iter(|| {
        strsim::normalized_levenshtein(lhs, rhs);
    })
}
#[bench]
fn bench_osa_distance(bencher: &mut Bencher) {
let a = "Philosopher Friedrich Nietzsche";
@ -63,4 +72,13 @@ mod benches {
strsim::damerau_levenshtein(&a, &b);
})
}
#[bench]
fn bench_normalized_damerau_levenshtein(bencher: &mut Bencher) {
    // Fixed pair of phrases; the closure discards the score, so only the
    // call itself is timed.
    let (lhs, rhs) = (
        "Philosopher Friedrich Nietzsche",
        "Philosopher Jean-Paul Sartre",
    );
    bencher.iter(|| {
        strsim::normalized_damerau_levenshtein(lhs, rhs);
    })
}
}

View File

@ -171,6 +171,24 @@ pub fn levenshtein(a: &str, b: &str) -> usize {
result
}
/// Computes a similarity score derived from the Levenshtein distance.
///
/// The distance between `a` and `b` is divided by the character count (not
/// the byte length) of the longer input and the quotient is subtracted from
/// `1.0`. Two empty strings are defined to score `1.0`, which also avoids a
/// division by zero.
///
/// ```
/// use strsim::normalized_levenshtein;
///
/// assert!((normalized_levenshtein("kitten", "sitting") - 0.57142).abs() < 0.00001);
/// assert!((normalized_levenshtein("", "") - 1.0).abs() < 0.00001);
/// assert!(normalized_levenshtein("", "second").abs() < 0.00001);
/// assert!(normalized_levenshtein("first", "").abs() < 0.00001);
/// assert!((normalized_levenshtein("string", "string") - 1.0).abs() < 0.00001);
/// ```
pub fn normalized_levenshtein(a: &str, b: &str) -> f64 {
    match std::cmp::max(a.chars().count(), b.chars().count()) {
        // Both inputs are empty: nothing differs, so they count as identical.
        0 => 1.0,
        longest => 1.0 - levenshtein(a, b) as f64 / longest as f64,
    }
}
/// Like Levenshtein but allows for adjacent transpositions. Each substring can
/// only be edited once.
///
@ -295,6 +313,24 @@ pub fn damerau_levenshtein(a: &str, b: &str) -> usize {
distances[a_len + 1][b_len + 1]
}
/// Computes a similarity score derived from the Damerau-Levenshtein distance.
///
/// The distance between `a` and `b` is divided by the character count (not
/// the byte length) of the longer input and the quotient is subtracted from
/// `1.0`. Two empty strings are defined to score `1.0`, which also avoids a
/// division by zero.
///
/// ```
/// use strsim::normalized_damerau_levenshtein;
///
/// assert!((normalized_damerau_levenshtein("levenshtein", "löwenbräu") - 0.27272).abs() < 0.00001);
/// assert!((normalized_damerau_levenshtein("", "") - 1.0).abs() < 0.00001);
/// assert!(normalized_damerau_levenshtein("", "flower").abs() < 0.00001);
/// assert!(normalized_damerau_levenshtein("tree", "").abs() < 0.00001);
/// assert!((normalized_damerau_levenshtein("sunglasses", "sunglasses") - 1.0).abs() < 0.00001);
/// ```
pub fn normalized_damerau_levenshtein(a: &str, b: &str) -> f64 {
    let longest = a.chars().count().max(b.chars().count());
    if longest == 0 {
        // Both inputs are empty: nothing differs, so they count as identical.
        return 1.0;
    }
    let distance = damerau_levenshtein(a, b) as f64;
    1.0 - distance / longest as f64
}
#[cfg(test)]
mod tests {
use super::*;
@ -530,6 +566,31 @@ mod tests {
assert_eq!(6, levenshtein("kitten", ""));
}
#[test]
fn normalized_levenshtein_diff_short() {
    // Expected score is roughly 1 - 3/7 for this pair.
    let score = normalized_levenshtein("kitten", "sitting");
    assert!((score - 0.57142).abs() < 0.00001);
}
#[test]
fn normalized_levenshtein_for_empty_strings() {
    // Two empty inputs are expected to score 1.0.
    let score = normalized_levenshtein("", "");
    assert!((score - 1.0).abs() < 0.00001);
}
#[test]
fn normalized_levenshtein_first_empty() {
    // An empty first argument against a non-empty second should score 0.0.
    let score = normalized_levenshtein("", "second");
    assert!(score.abs() < 0.00001);
}
#[test]
fn normalized_levenshtein_second_empty() {
    // A non-empty first argument against an empty second should score 0.0.
    let score = normalized_levenshtein("first", "");
    assert!(score.abs() < 0.00001);
}
#[test]
fn normalized_levenshtein_identical_strings() {
    // Identical inputs are expected to score 1.0.
    let score = normalized_levenshtein("identical", "identical");
    assert!((score - 1.0).abs() < 0.00001);
}
#[test]
fn osa_distance_empty() {
assert_eq!(0, osa_distance("", ""));
@ -695,4 +756,29 @@ mod tests {
fn damerau_levenshtein_unrestricted_edit() {
    // Pins the distance for this pair at 3.
    let distance = damerau_levenshtein("a cat", "an abct");
    assert_eq!(3, distance);
}
#[test]
fn normalized_damerau_levenshtein_diff_short() {
    // Expected score is roughly 1 - 8/11 for this pair.
    let score = normalized_damerau_levenshtein("levenshtein", "löwenbräu");
    assert!((score - 0.27272).abs() < 0.00001);
}
#[test]
fn normalized_damerau_levenshtein_for_empty_strings() {
    // Two empty inputs are expected to score 1.0.
    let score = normalized_damerau_levenshtein("", "");
    assert!((score - 1.0).abs() < 0.00001);
}
#[test]
fn normalized_damerau_levenshtein_first_empty() {
    // An empty first argument against a non-empty second should score 0.0.
    let score = normalized_damerau_levenshtein("", "flower");
    assert!(score.abs() < 0.00001);
}
#[test]
fn normalized_damerau_levenshtein_second_empty() {
    // A non-empty first argument against an empty second should score 0.0.
    let score = normalized_damerau_levenshtein("tree", "");
    assert!(score.abs() < 0.00001);
}
#[test]
fn normalized_damerau_levenshtein_identical_strings() {
    // Identical inputs are expected to score 1.0.
    let score = normalized_damerau_levenshtein("sunglasses", "sunglasses");
    assert!((score - 1.0).abs() < 0.00001);
}
}

View File

@ -1,7 +1,7 @@
extern crate strsim;
use strsim::{hamming, levenshtein, osa_distance, damerau_levenshtein, jaro,
jaro_winkler};
use strsim::{hamming, levenshtein, normalized_levenshtein, osa_distance,damerau_levenshtein,
normalized_damerau_levenshtein, jaro, jaro_winkler};
#[test]
fn hamming_works() {
@ -16,6 +16,11 @@ fn levenshtein_works() {
assert_eq!(3, levenshtein("kitten", "sitting"));
}
#[test]
fn normalized_levenshtein_works() {
    // Smoke test through the public crate API; expected score is roughly 1 - 3/7.
    let score = normalized_levenshtein("kitten", "sitting");
    assert!((score - 0.57142).abs() < 0.00001);
}
#[test]
fn osa_distance_works() {
assert_eq!(3, osa_distance("ac", "cba"));
@ -26,6 +31,11 @@ fn damerau_levenshtein_works() {
assert_eq!(2, damerau_levenshtein("ac", "cba"));
}
#[test]
fn normalized_damerau_levenshtein_works() {
    // Smoke test through the public crate API; expected score is roughly 1 - 8/11.
    let score = normalized_damerau_levenshtein("levenshtein", "löwenbräu");
    assert!((score - 0.27272).abs() < 0.00001);
}
#[test]
fn jaro_works() {
assert!((0.392 - jaro("Friedrich Nietzsche", "Jean-Paul Sartre")).abs() <