Add distance calculations against a vector of strings

Add levenshtein_against_vec, damerau_levenshtein_against_vec,
jaro_against_vec, jaro_winkler_against_vec
to calculate the distances against a vector of strings.
This commit is contained in:
Olivier Varène 2015-06-09 16:54:20 +02:00
parent 0125b88c41
commit a0226ea984
2 changed files with 109 additions and 2 deletions

View File

@ -10,7 +10,7 @@ Rust implementations of [string similarity metrics]. Should compile cleanly on b
```toml
# Cargo.toml
[dependencies]
strsim = "0.3.0"
strsim = "0.4.0"
```
### Usage
@ -18,7 +18,7 @@ strsim = "0.3.0"
```rust
extern crate strsim;
use strsim::{hamming, levenshtein, damerau_levenshtein, jaro, jaro_winkler};
use strsim::{hamming, levenshtein, damerau_levenshtein, jaro, jaro_winkler, levenshtein_against_vec, damerau_levenshtein_against_vec, jaro_against_vec, jaro_winkler_against_vec};
fn main() {
match hamming("hamming", "hammers") {
@ -35,6 +35,33 @@ fn main() {
assert!((0.911 - jaro_winkler("cheeseburger", "cheese fries")).abs() <
0.001);
//
// Vector to calculate distances against
let v = vec!["test","test1","test12","test123","","tset"];
// levenshtein
let mut distances = levenshtein_against_vec("test",&v);
let mut expected = vec![0,1,2,3,4,2];
assert_eq!(distances,expected);
// damerau_levenshtein
distances = damerau_levenshtein_against_vec("test",&v);
expected = vec![0,1,2,3,4,1];
assert_eq!(distances,expected);
// jaro (shadow `distances`/`expected`: the similarities are f64, not usize)
let distances = jaro_against_vec("test",&v);
let expected = vec![1.0, 0.933333, 0.888889, 0.857143, 0.0, 0.916667];
let mut delta: f64 = distances.iter().zip(expected.iter()).map(|(x,y)| (x-y).abs() as f64 ).fold(0.0, |x,y| x+y as f64);
assert!(delta < 0.0001);
// jaro_winkler
let distances = jaro_winkler_against_vec("test",&v);
let expected = vec![1.0, 0.96, 0.933333, 0.914286, 0.0, 0.925];
delta = distances.iter().zip(expected.iter()).map(|(x,y)| (x-y).abs() as f64 ).fold(0.0, |x,y| x+y as f64);
assert!(delta < 0.0001);
}
```

View File

@ -99,6 +99,27 @@ pub fn jaro(a: &str, b: &str) -> f64 {
}
}
/// Calculates the Jaro similarity between `a` and every string in `v`.
///
/// Returns one value per element of `v`, in the same order as `v`. Each value
/// is between 0.0 and 1.0 (higher value means more similar). An empty `v`
/// yields an empty vector.
///
/// `v` is taken as a slice, so a `&Vec<&str>`, an array reference, or a
/// sub-slice can all be passed.
///
/// ```
/// use strsim::jaro_against_vec;
///
/// let v = vec!["test", "test1", "test12", "test123", "", "tset"];
/// let res = jaro_against_vec("test", &v);
/// let expected: Vec<f64> = vec![1.0, 0.933333, 0.888889, 0.857143, 0.0, 0.916667];
/// assert_eq!(expected.len(), res.len());
/// for (got, want) in res.iter().zip(expected.iter()) {
///     assert!((got - want).abs() < 0.0001);
/// }
/// ```
pub fn jaro_against_vec(a: &str, v: &[&str]) -> Vec<f64> {
    // `collect` sizes the result from the slice iterator's exact length, so
    // no manual (and previously oversized) capacity reservation is needed.
    v.iter().map(|&b| jaro(a, b)).collect()
}
/// Like Jaro but gives a boost to strings that have a common prefix.
///
/// ```
@ -126,6 +147,25 @@ pub fn jaro_winkler(a: &str, b: &str) -> f64 {
}
}
/// Calculates the Jaro-Winkler similarity between `a` and every string in `v`.
///
/// Jaro-Winkler is like Jaro but gives a boost to strings that have a common
/// prefix. Returns one value per element of `v`, in the same order as `v`
/// (higher value means more similar). An empty `v` yields an empty vector.
///
/// `v` is taken as a slice, so a `&Vec<&str>`, an array reference, or a
/// sub-slice can all be passed.
///
/// ```
/// use strsim::jaro_winkler_against_vec;
///
/// let v = vec!["test", "test1", "test12", "test123", "", "tset"];
/// let res = jaro_winkler_against_vec("test", &v);
/// let expected: Vec<f64> = vec![1.0, 0.96, 0.933333, 0.914286, 0.0, 0.925];
/// assert_eq!(expected.len(), res.len());
/// for (got, want) in res.iter().zip(expected.iter()) {
///     assert!((got - want).abs() < 0.0001);
/// }
/// ```
pub fn jaro_winkler_against_vec(a: &str, v: &[&str]) -> Vec<f64> {
    // `collect` sizes the result from the slice iterator's exact length, so
    // no manual (and previously oversized) capacity reservation is needed.
    v.iter().map(|&b| jaro_winkler(a, b)).collect()
}
/// Calculates the minimum number of insertions, deletions, and substitutions
/// required to change one string into the other.
///
@ -163,6 +203,26 @@ pub fn levenshtein(a: &str, b: &str) -> usize {
curr_distances[b.len()]
}
/// Calculates the Levenshtein distance between `a` and every string in `v`.
///
/// The Levenshtein distance is the minimum number of insertions, deletions,
/// and substitutions required to change one string into the other. Returns
/// one distance per element of `v`, in the same order as `v`. An empty `v`
/// yields an empty vector.
///
/// `v` is taken as a slice, so a `&Vec<&str>`, an array reference, or a
/// sub-slice can all be passed.
///
/// ```
/// use strsim::levenshtein_against_vec;
///
/// let v = vec!["test", "test1", "test12", "test123", "", "tset"];
/// let res = levenshtein_against_vec("test", &v);
/// let expected = vec![0, 1, 2, 3, 4, 2];
/// assert_eq!(expected, res);
/// ```
pub fn levenshtein_against_vec(a: &str, v: &[&str]) -> Vec<usize> {
    // `collect` sizes the result from the slice iterator's exact length, so
    // no manual (and previously oversized) capacity reservation is needed.
    v.iter().map(|&b| levenshtein(a, b)).collect()
}
/// Same as Levenshtein but allows for adjacent transpositions.
///
/// ```
@ -213,6 +273,26 @@ pub fn damerau_levenshtein(a: &str, b: &str) -> usize {
curr_distances[b.len()]
}
/// Calculates the Damerau-Levenshtein distance between `a` and every string
/// in `v`.
///
/// Same as Levenshtein but allows for adjacent transpositions. Returns one
/// distance per element of `v`, in the same order as `v`. An empty `v` yields
/// an empty vector.
///
/// `v` is taken as a slice, so a `&Vec<&str>`, an array reference, or a
/// sub-slice can all be passed.
///
/// ```
/// use strsim::damerau_levenshtein_against_vec;
///
/// let v = vec!["test", "test1", "test12", "test123", "", "tset"];
/// let res = damerau_levenshtein_against_vec("test", &v);
/// let expected = vec![0, 1, 2, 3, 4, 1];
/// assert_eq!(expected, res);
/// ```
pub fn damerau_levenshtein_against_vec(a: &str, v: &[&str]) -> Vec<usize> {
    // `collect` sizes the result from the slice iterator's exact length, so
    // no manual (and previously oversized) capacity reservation is needed.
    v.iter().map(|&b| damerau_levenshtein(a, b)).collect()
}
#[cfg(test)]
mod tests {
use super::*;