mirror of
https://gitee.com/openharmony/third_party_rust_strsim-rs
synced 2024-11-22 23:29:43 +00:00
Add normalized Levenshtein and Damerau-Levenshtein (#20)
* Add tests for 'normalized_levenshtein' * Implement 'normalized_levenshtein' * Implement 'normalized_damerau_levenshtein' * Add benchmarking of new functions * Move test cases from integration tests to unit tests * Use 'is_empty' instead of 'len' * Count chars instead of bytes * Update Readme
This commit is contained in:
parent
e178e6995f
commit
ab17c1e9ab
12
README.md
12
README.md
@ -2,9 +2,9 @@
|
||||
|
||||
[Rust](https://www.rust-lang.org) implementations of [string similarity metrics]:
|
||||
- [Hamming]
|
||||
- [Levenshtein]
|
||||
- [Levenshtein] - distance & normalized
|
||||
- [Optimal string alignment]
|
||||
- [Damerau-Levenshtein]
|
||||
- [Damerau-Levenshtein] - distance & normalized
|
||||
- [Jaro and Jaro-Winkler] - this implementation of Jaro-Winkler does not limit the common prefix length
|
||||
|
||||
### Installation
|
||||
@ -23,8 +23,8 @@ version in the
|
||||
```rust
|
||||
extern crate strsim;
|
||||
|
||||
use strsim::{hamming, levenshtein, osa_distance, damerau_levenshtein, jaro,
|
||||
jaro_winkler};
|
||||
use strsim::{hamming, levenshtein, normalized_levenshtein, osa_distance, damerau_levenshtein,
|
||||
normalized_damerau_levenshtein, jaro, jaro_winkler};
|
||||
|
||||
fn main() {
|
||||
match hamming("hamming", "hammers") {
|
||||
@ -34,10 +34,14 @@ fn main() {
|
||||
|
||||
assert_eq!(3, levenshtein("kitten", "sitting"));
|
||||
|
||||
assert!((normalized_levenshtein("kitten", "sitting") - 0.57142).abs() < 0.00001);
|
||||
|
||||
assert_eq!(3, osa_distance("ac", "cba"));
|
||||
|
||||
assert_eq!(2, damerau_levenshtein("ac", "cba"));
|
||||
|
||||
assert!((normalized_damerau_levenshtein("levenshtein", "löwenbräu") - 0.27272).abs() < 0.00001);
|
||||
|
||||
assert!((0.392 - jaro("Friedrich Nietzsche", "Jean-Paul Sartre")).abs() <
|
||||
0.001);
|
||||
|
||||
|
@ -46,6 +46,15 @@ mod benches {
|
||||
})
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_normalized_levenshtein(bencher: &mut Bencher) {
|
||||
let a = "Philosopher Friedrich Nietzsche";
|
||||
let b = "Philosopher Jean-Paul Sartre";
|
||||
bencher.iter(|| {
|
||||
strsim::normalized_levenshtein(&a, &b);
|
||||
})
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_osa_distance(bencher: &mut Bencher) {
|
||||
let a = "Philosopher Friedrich Nietzsche";
|
||||
@ -63,4 +72,13 @@ mod benches {
|
||||
strsim::damerau_levenshtein(&a, &b);
|
||||
})
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_normalized_damerau_levenshtein(bencher: &mut Bencher) {
|
||||
let a = "Philosopher Friedrich Nietzsche";
|
||||
let b = "Philosopher Jean-Paul Sartre";
|
||||
bencher.iter(|| {
|
||||
strsim::normalized_damerau_levenshtein(&a, &b);
|
||||
})
|
||||
}
|
||||
}
|
||||
|
86
src/lib.rs
86
src/lib.rs
@ -171,6 +171,24 @@ pub fn levenshtein(a: &str, b: &str) -> usize {
|
||||
result
|
||||
}
|
||||
|
||||
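/// Calculates a normalized Levenshtein similarity score between 0.0 and 1.0 (inclusive), where 1.0 means the strings are identical and 0.0 means they share nothing. Two empty strings score 1.0.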
|
||||
///
|
||||
/// ```
|
||||
/// use strsim::normalized_levenshtein;
|
||||
///
|
||||
/// assert!((normalized_levenshtein("kitten", "sitting") - 0.57142).abs() < 0.00001);
|
||||
/// assert!((normalized_levenshtein("", "") - 1.0).abs() < 0.00001);
|
||||
/// assert!(normalized_levenshtein("", "second").abs() < 0.00001);
|
||||
/// assert!(normalized_levenshtein("first", "").abs() < 0.00001);
|
||||
/// assert!((normalized_levenshtein("string", "string") - 1.0).abs() < 0.00001);
|
||||
/// ```
|
||||
pub fn normalized_levenshtein(a: &str, b: &str) -> f64 {
|
||||
if a.is_empty() && b.is_empty() {
|
||||
return 1.0;
|
||||
}
|
||||
1.0 - (levenshtein(a, b) as f64) / (a.chars().count().max(b.chars().count()) as f64)
|
||||
}
|
||||
|
||||
/// Like Levenshtein but allows for adjacent transpositions. Each substring can
|
||||
/// only be edited once.
|
||||
///
|
||||
@ -295,6 +313,24 @@ pub fn damerau_levenshtein(a: &str, b: &str) -> usize {
|
||||
distances[a_len + 1][b_len + 1]
|
||||
}
|
||||
|
||||
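/// Calculates a normalized Damerau–Levenshtein similarity score between 0.0 and 1.0 (inclusive), where 1.0 means the strings are identical and 0.0 means they share nothing. Two empty strings score 1.0.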
|
||||
///
|
||||
/// ```
|
||||
/// use strsim::normalized_damerau_levenshtein;
|
||||
///
|
||||
/// assert!((normalized_damerau_levenshtein("levenshtein", "löwenbräu") - 0.27272).abs() < 0.00001);
|
||||
/// assert!((normalized_damerau_levenshtein("", "") - 1.0).abs() < 0.00001);
|
||||
/// assert!(normalized_damerau_levenshtein("", "flower").abs() < 0.00001);
|
||||
/// assert!(normalized_damerau_levenshtein("tree", "").abs() < 0.00001);
|
||||
/// assert!((normalized_damerau_levenshtein("sunglasses", "sunglasses") - 1.0).abs() < 0.00001);
|
||||
/// ```
|
||||
pub fn normalized_damerau_levenshtein(a: &str, b: &str) -> f64 {
|
||||
if a.is_empty() && b.is_empty() {
|
||||
return 1.0;
|
||||
}
|
||||
1.0 - (damerau_levenshtein(a, b) as f64) / (a.chars().count().max(b.chars().count()) as f64)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
@ -530,6 +566,31 @@ mod tests {
|
||||
assert_eq!(6, levenshtein("kitten", ""));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn normalized_levenshtein_diff_short() {
|
||||
assert!((normalized_levenshtein("kitten", "sitting") - 0.57142).abs() < 0.00001);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn normalized_levenshtein_for_empty_strings() {
|
||||
assert!((normalized_levenshtein("", "") - 1.0).abs() < 0.00001);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn normalized_levenshtein_first_empty() {
|
||||
assert!(normalized_levenshtein("", "second").abs() < 0.00001);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn normalized_levenshtein_second_empty() {
|
||||
assert!(normalized_levenshtein("first", "").abs() < 0.00001);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn normalized_levenshtein_identical_strings() {
|
||||
assert!((normalized_levenshtein("identical", "identical") - 1.0).abs() < 0.00001);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn osa_distance_empty() {
|
||||
assert_eq!(0, osa_distance("", ""));
|
||||
@ -695,4 +756,29 @@ mod tests {
|
||||
fn damerau_levenshtein_unrestricted_edit() {
|
||||
assert_eq!(3, damerau_levenshtein("a cat", "an abct"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn normalized_damerau_levenshtein_diff_short() {
|
||||
assert!((normalized_damerau_levenshtein("levenshtein", "löwenbräu") - 0.27272).abs() < 0.00001);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn normalized_damerau_levenshtein_for_empty_strings() {
|
||||
assert!((normalized_damerau_levenshtein("", "") - 1.0).abs() < 0.00001);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn normalized_damerau_levenshtein_first_empty() {
|
||||
assert!(normalized_damerau_levenshtein("", "flower").abs() < 0.00001);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn normalized_damerau_levenshtein_second_empty() {
|
||||
assert!(normalized_damerau_levenshtein("tree", "").abs() < 0.00001);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn normalized_damerau_levenshtein_identical_strings() {
|
||||
assert!((normalized_damerau_levenshtein("sunglasses", "sunglasses") - 1.0).abs() < 0.00001);
|
||||
}
|
||||
}
|
||||
|
14
tests/lib.rs
14
tests/lib.rs
@ -1,7 +1,7 @@
|
||||
extern crate strsim;
|
||||
|
||||
use strsim::{hamming, levenshtein, osa_distance, damerau_levenshtein, jaro,
|
||||
jaro_winkler};
|
||||
use strsim::{hamming, levenshtein, normalized_levenshtein, osa_distance, damerau_levenshtein,
|
||||
normalized_damerau_levenshtein, jaro, jaro_winkler};
|
||||
|
||||
#[test]
|
||||
fn hamming_works() {
|
||||
@ -16,6 +16,11 @@ fn levenshtein_works() {
|
||||
assert_eq!(3, levenshtein("kitten", "sitting"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn normalized_levenshtein_works() {
|
||||
assert!((normalized_levenshtein("kitten", "sitting") - 0.57142).abs() < 0.00001);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn osa_distance_works() {
|
||||
assert_eq!(3, osa_distance("ac", "cba"));
|
||||
@ -26,6 +31,11 @@ fn damerau_levenshtein_works() {
|
||||
assert_eq!(2, damerau_levenshtein("ac", "cba"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn normalized_damerau_levenshtein_works() {
|
||||
assert!((normalized_damerau_levenshtein("levenshtein", "löwenbräu") - 0.27272).abs() < 0.00001);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn jaro_works() {
|
||||
assert!((0.392 - jaro("Friedrich Nietzsche", "Jean-Paul Sartre")).abs() <
|
||||
|
Loading…
Reference in New Issue
Block a user