mirror of
https://gitee.com/openharmony/third_party_rust_strsim-rs
synced 2024-11-23 07:39:51 +00:00
Fix Damerau-Levenshtein (#12)
Rename the original implementation to osa_distance, and implement the actual metric.
This commit is contained in:
parent
b2fc338541
commit
f8d8c1fff8
13
CHANGELOG.md
13
CHANGELOG.md
@ -1,6 +1,14 @@
|
||||
# Change Log
|
||||
This project attempts to adhere to [Semantic Versioning](http://semver.org).
|
||||
|
||||
## [0.6.0] - (2016-12-26)
|
||||
### Added
|
||||
- Add optimal string alignment distance
|
||||
|
||||
### Fixed
|
||||
- Fix Damerau-Levenshtein implementation (previous implementation was actually
|
||||
optimal string alignment; see this [Damerau-Levenshtein explanation])
|
||||
|
||||
## [0.5.2] - (2016-11-21)
|
||||
### Changed
|
||||
- Remove Cargo generated documentation in favor of a [docs.rs] link
|
||||
@ -73,7 +81,8 @@ vector of results (thanks @ovarene)
|
||||
### Added
|
||||
- Implement Hamming, Jaro, Jaro-Winkler, and Levenshtein
|
||||
|
||||
[Unreleased]: https://github.com/dguo/strsim-rs/compare/0.5.2...HEAD
|
||||
[Unreleased]: https://github.com/dguo/strsim-rs/compare/0.6.0...HEAD
|
||||
[0.6.0]: https://github.com/dguo/strsim-rs/compare/0.5.2...0.6.0
|
||||
[0.5.2]: https://github.com/dguo/strsim-rs/compare/0.5.1...0.5.2
|
||||
[0.5.1]: https://github.com/dguo/strsim-rs/compare/0.5.0...0.5.1
|
||||
[0.5.0]: https://github.com/dguo/strsim-rs/compare/0.4.1...0.5.0
|
||||
@ -89,4 +98,6 @@ vector of results (thanks @ovarene)
|
||||
[0.1.1]: https://github.com/dguo/strsim-rs/compare/0.1.0...0.1.1
|
||||
[0.1.0]: https://github.com/dguo/strsim-rs/compare/fabad4...0.1.0
|
||||
[docs.rs]: https://docs.rs/strsim/
|
||||
[Damerau-Levenshtein explanation]:
|
||||
http://scarcitycomputing.blogspot.com/2013/04/damerau-levenshtein-edit-distance.html
|
||||
|
||||
|
@ -1,7 +1,7 @@
|
||||
[package]
|
||||
|
||||
name = "strsim"
|
||||
version = "0.5.2"
|
||||
version = "0.6.0"
|
||||
authors = ["Danny Guo <dannyguo91@gmail.com>"]
|
||||
description = """
|
||||
Implementations of string similarity metrics.
|
||||
|
38
README.md
38
README.md
@ -2,31 +2,31 @@
|
||||
|
||||
[Rust](https://www.rust-lang.org) implementations of [string similarity metrics]:
|
||||
- [Hamming]
|
||||
- [Levenshtein] and [Damerau-Levenshtein]
|
||||
- [Levenshtein]
|
||||
- [Optimal string alignment]
|
||||
- [Damerau-Levenshtein]
|
||||
- [Jaro and Jaro-Winkler] - this implementation of Jaro-Winkler does not limit the common prefix length
|
||||
|
||||
### Installation
|
||||
|
||||
```toml
|
||||
# Cargo.toml
|
||||
[dependencies]
|
||||
strsim = "0.5.2"
|
||||
strsim = "0.6.0"
|
||||
```
|
||||
|
||||
### [Documentation](https://docs.rs/strsim/)
|
||||
|
||||
You can change the version in the URL to see the documentation for an older
|
||||
version. Released versions are listed in the
|
||||
[changelog](https://github.com/dguo/strsim-rs/blob/master/CHANGELOG.md).
|
||||
|
||||
### Usage
|
||||
|
||||
```rust
|
||||
extern crate strsim;
|
||||
|
||||
use strsim::{hamming, levenshtein, damerau_levenshtein, jaro, jaro_winkler,
|
||||
levenshtein_against_vec, damerau_levenshtein_against_vec,
|
||||
jaro_against_vec, jaro_winkler_against_vec};
|
||||
use strsim::{hamming, levenshtein, osa_distance, damerau_levenshtein, jaro,
|
||||
jaro_winkler, levenshtein_against_vec, osa_distance_against_vec,
|
||||
damerau_levenshtein_against_vec, jaro_against_vec,
|
||||
jaro_winkler_against_vec};
|
||||
|
||||
fn main() {
|
||||
match hamming("hamming", "hammers") {
|
||||
@ -36,7 +36,9 @@ fn main() {
|
||||
|
||||
assert_eq!(3, levenshtein("kitten", "sitting"));
|
||||
|
||||
assert_eq!(1, damerau_levenshtein("specter", "spectre"));
|
||||
assert_eq!(3, osa_distance("ac", "cba"));
|
||||
|
||||
assert_eq!(2, damerau_levenshtein("ac", "cba"));
|
||||
|
||||
assert!((0.392 - jaro("Friedrich Nietzsche", "Jean-Paul Sartre")).abs() <
|
||||
0.001);
|
||||
@ -45,13 +47,16 @@ fn main() {
|
||||
0.001);
|
||||
|
||||
// get vectors of values back
|
||||
let v = vec!["test", "test1", "test12", "test123", "", "tset"];
|
||||
let v = vec!["test", "test1", "test12", "test123", "", "tset", "tsvet"];
|
||||
|
||||
assert_eq!(levenshtein_against_vec("test", &v),
|
||||
vec![0, 1, 2, 3, 4, 2]);
|
||||
vec![0, 1, 2, 3, 4, 2, 3]);
|
||||
|
||||
assert_eq!(osa_distance_against_vec("test", &v),
|
||||
vec![0, 1, 2, 3, 4, 1, 3]);
|
||||
|
||||
assert_eq!(damerau_levenshtein_against_vec("test", &v),
|
||||
vec![0, 1, 2, 3, 4, 1]);
|
||||
vec![0, 1, 2, 3, 4, 1, 2]);
|
||||
|
||||
let jaro_distances = jaro_against_vec("test", &v);
|
||||
let jaro_expected = vec![1.0, 0.933333, 0.888889, 0.857143, 0.0, 0.916667];
|
||||
@ -72,11 +77,11 @@ fn main() {
|
||||
```
|
||||
|
||||
### Development
|
||||
|
||||
Install [Vagrant](https://www.vagrantup.com), and run `vagrant up`.
|
||||
If you don't want to install Rust itself, you can install [Docker], and run
|
||||
`$ ./dev`. This should bring up a temporary container from which you can run
|
||||
[cargo] commands.
|
||||
|
||||
### License
|
||||
|
||||
[MIT](https://github.com/dguo/strsim-rs/blob/master/LICENSE)
|
||||
|
||||
[string similarity metrics]:http://en.wikipedia.org/wiki/String_metric
|
||||
@ -84,4 +89,7 @@ Install [Vagrant](https://www.vagrantup.com), and run `vagrant up`.
|
||||
[Jaro and Jaro-Winkler]:http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance
|
||||
[Levenshtein]:http://en.wikipedia.org/wiki/Levenshtein_distance
|
||||
[Hamming]:http://en.wikipedia.org/wiki/Hamming_distance
|
||||
[Optimal string alignment]:https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance#Optimal_string_alignment_distance
|
||||
[Docker]:https://docs.docker.com/engine/installation/
|
||||
[cargo]:https://github.com/rust-lang/cargo
|
||||
|
||||
|
222
src/lib.rs
222
src/lib.rs
@ -1,8 +1,8 @@
|
||||
//! This library implements string similarity metrics. Includes Hamming,
|
||||
//! Levenshtein, Jaro, and Jaro-Winkler.
|
||||
//! This library implements string similarity metrics.
|
||||
|
||||
use std::char;
|
||||
use std::cmp::{max, min};
|
||||
use std::collections::HashMap;
|
||||
|
||||
#[derive(Debug, PartialEq)]
|
||||
pub enum StrSimError {
|
||||
@ -225,14 +225,15 @@ pub fn levenshtein_against_vec(a: &str, v: &[&str]) -> Vec<usize> {
|
||||
v.iter().map(|b| levenshtein(a, b)).collect()
|
||||
}
|
||||
|
||||
/// Same as Levenshtein but allows for adjacent transpositions.
|
||||
/// Like Levenshtein but allows for adjacent transpositions. Each substring can
|
||||
/// only be edited once.
|
||||
///
|
||||
/// ```
|
||||
/// use strsim::damerau_levenshtein;
|
||||
/// use strsim::osa_distance;
|
||||
///
|
||||
/// assert_eq!(3, damerau_levenshtein("damerau", "aderua"));
|
||||
/// assert_eq!(3, osa_distance("ab", "bca"));
|
||||
/// ```
|
||||
pub fn damerau_levenshtein(a: &str, b: &str) -> usize {
|
||||
pub fn osa_distance(a: &str, b: &str) -> usize {
|
||||
let a_len = a.chars().count();
|
||||
let b_len = b.chars().count();
|
||||
if a == b { return 0; }
|
||||
@ -275,7 +276,93 @@ pub fn damerau_levenshtein(a: &str, b: &str) -> usize {
|
||||
}
|
||||
|
||||
curr_distances[b_len]
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/// Calculates the optimal string alignment distance between a string and each
|
||||
/// string in a vector. Returns a vector of corresponding values.
|
||||
///
|
||||
/// ```
|
||||
/// use strsim::osa_distance_against_vec;
|
||||
///
|
||||
/// let v = vec!["test", "test1", "test12", "test123", "", "tset"];
|
||||
/// let result = osa_distance_against_vec("test", &v);
|
||||
/// let expected = vec![0, 1, 2, 3, 4, 1];
|
||||
/// assert_eq!(expected, result);
|
||||
/// ```
|
||||
pub fn osa_distance_against_vec(a: &str, v: &[&str]) -> Vec<usize> {
|
||||
v.iter().map(|b| osa_distance(a, b)).collect()
|
||||
}
|
||||
|
||||
/// Like optimal string alignment, but substrings can be edited an unlimited
/// number of times, and the triangle inequality holds.
///
/// The comparison is done on `char`s, not bytes, so multi-byte characters
/// count as a single symbol. A full `(a_len + 2) x (b_len + 2)` matrix is
/// allocated, where the lengths are measured in `char`s.
///
/// ```
/// use strsim::damerau_levenshtein;
///
/// assert_eq!(2, damerau_levenshtein("ab", "bca"));
/// ```
pub fn damerau_levenshtein(a: &str, b: &str) -> usize {
    if a == b { return 0; }

    let a_chars: Vec<char> = a.chars().collect();
    let b_chars: Vec<char> = b.chars().collect();
    let a_len = a_chars.len();
    let b_len = b_chars.len();

    if a_len == 0 { return b_len; }
    if b_len == 0 { return a_len; }

    // Matrix layout: row/column 0 hold a sentinel that is larger than any real
    // distance, row/column 1 correspond to the empty prefix, and cell
    // [i + 1][j + 1] is the distance between the first `i` chars of `a` and
    // the first `j` chars of `b`.
    let mut distances = vec![vec![0; b_len + 2]; a_len + 2];
    let max_distance = a_len + b_len;
    distances[0][0] = max_distance;

    for i in 0..(a_len + 1) {
        distances[i + 1][0] = max_distance;
        distances[i + 1][1] = i;
    }

    for j in 0..(b_len + 1) {
        distances[0][j + 1] = max_distance;
        distances[1][j + 1] = j;
    }

    // Maps each char of `a` to the last (1-based) position at which it was
    // seen among the rows that have already been fully processed.
    let mut chars: HashMap<char, usize> = HashMap::new();

    for i in 1..(a_len + 1) {
        // Last (1-based) column in this row where the chars matched.
        let mut db = 0;

        for j in 1..(b_len + 1) {
            // 0 means "not seen yet"; it selects the sentinel row, which keeps
            // the transposition candidate from ever winning the `min` below.
            let k = chars.get(&b_chars[j - 1]).cloned().unwrap_or(0);
            let l = db;

            let cost = if a_chars[i - 1] == b_chars[j - 1] {
                db = j;
                0
            } else {
                1
            };

            let substitution_cost = distances[i][j] + cost;
            let insertion_cost = distances[i][j + 1] + 1;
            let deletion_cost = distances[i + 1][j] + 1;
            // Distance up to the earlier matching pair, plus the chars skipped
            // in `a`, plus one for the swap itself, plus the chars skipped in
            // `b`.
            let transposition_cost = distances[k][l] + (i - k - 1) + 1 +
                                     (j - l - 1);

            distances[i + 1][j + 1] = min(substitution_cost,
                                          min(insertion_cost,
                                              min(deletion_cost,
                                                  transposition_cost)));
        }

        chars.insert(a_chars[i - 1], i);
    }

    distances[a_len + 1][b_len + 1]
}
|
||||
|
||||
/// Calculates the Damerau-Levenshtein distance between a string and each string
|
||||
/// in a vector. Returns a vector of corresponding values.
|
||||
@ -527,6 +614,89 @@ mod tests {
|
||||
assert_eq!(6, levenshtein("kitten", ""));
|
||||
}
|
||||
|
||||
#[test]
fn osa_distance_empty() {
    // Two empty strings are equal, so the distance is zero.
    assert_eq!(0, osa_distance("", ""));
}
|
||||
|
||||
#[test]
fn osa_distance_same() {
    // Identical non-empty strings have distance zero.
    assert_eq!(0, osa_distance("damerau", "damerau"));
}
|
||||
|
||||
#[test]
fn osa_distance_first_empty() {
    // With an empty first argument the distance is the length of the second.
    assert_eq!(7, osa_distance("", "damerau"));
}
|
||||
|
||||
#[test]
fn osa_distance_second_empty() {
    // With an empty second argument the distance is the length of the first.
    assert_eq!(7, osa_distance("damerau", ""));
}
|
||||
|
||||
#[test]
fn osa_distance_diff() {
    // Expected to be 3 under optimal string alignment; the same pair is
    // expected to be 2 for `damerau_levenshtein`.
    assert_eq!(3, osa_distance("ca", "abc"));
}
|
||||
|
||||
#[test]
fn osa_distance_diff_short() {
    // Longer first argument, shorter second argument.
    assert_eq!(3, osa_distance("damerau", "aderua"));
}
|
||||
|
||||
#[test]
fn osa_distance_diff_reversed() {
    // Same pair as `osa_distance_diff_short` with the arguments swapped.
    assert_eq!(3, osa_distance("aderua", "damerau"));
}
|
||||
|
||||
#[test]
fn osa_distance_diff_multibyte() {
    // Three multi-byte characters against three ASCII characters: the
    // expected distance is 3 in both argument orders.
    assert_eq!(3, osa_distance("öঙ香", "abc"));
    assert_eq!(3, osa_distance("abc", "öঙ香"));
}
|
||||
|
||||
#[test]
fn osa_distance_diff_unequal_length() {
    // The second argument is longer than the first.
    assert_eq!(6, osa_distance("damerau", "aderuaxyz"));
}
|
||||
|
||||
#[test]
fn osa_distance_diff_unequal_length_reversed() {
    // Same pair as `osa_distance_diff_unequal_length` with arguments swapped.
    assert_eq!(6, osa_distance("aderuaxyz", "damerau"));
}
|
||||
|
||||
#[test]
fn osa_distance_diff_comedians() {
    // Two same-length names that differ in most positions.
    assert_eq!(5, osa_distance("Stewart", "Colbert"));
}
|
||||
|
||||
#[test]
fn osa_distance_many_transpositions() {
    // Four independent adjacent swaps (ab, de, hi, kl), one edit each.
    assert_eq!(4, osa_distance("abcdefghijkl", "bacedfgihjlk"));
}
|
||||
|
||||
#[test]
fn osa_distance_diff_longer() {
    // Two unrelated sentence-length inputs.
    let a = "The quick brown fox jumped over the angry dog.";
    let b = "Lehem ipsum dolor sit amet, dicta latine an eam.";
    assert_eq!(36, osa_distance(a, b));
}
|
||||
|
||||
#[test]
fn osa_distance_beginning_transposition() {
    // The first two characters are swapped; that counts as a single edit.
    assert_eq!(1, osa_distance("foobar", "ofobar"));
}
|
||||
|
||||
#[test]
fn osa_distance_end_transposition() {
    // The last two characters are swapped; that counts as a single edit.
    assert_eq!(1, osa_distance("specter", "spectre"));
}
|
||||
|
||||
#[test]
fn osa_distance_restricted_edit() {
    // Expected to be 4 under optimal string alignment; the same pair is
    // expected to be 3 for `damerau_levenshtein`.
    assert_eq!(4, osa_distance("a cat", "an abct"));
}
|
||||
|
||||
#[test]
|
||||
fn damerau_levenshtein_empty() {
|
||||
assert_eq!(0, damerau_levenshtein("", ""));
|
||||
@ -547,6 +717,11 @@ mod tests {
|
||||
assert_eq!(7, damerau_levenshtein("damerau", ""));
|
||||
}
|
||||
|
||||
#[test]
fn damerau_levenshtein_diff() {
    // Expected to be 2 here; the same pair is expected to be 3 for
    // `osa_distance`.
    assert_eq!(2, damerau_levenshtein("ca", "abc"));
}
|
||||
|
||||
#[test]
|
||||
fn damerau_levenshtein_diff_short() {
|
||||
assert_eq!(3, damerau_levenshtein("damerau", "aderua"));
|
||||
@ -600,6 +775,11 @@ mod tests {
|
||||
assert_eq!(1, damerau_levenshtein("specter", "spectre"));
|
||||
}
|
||||
|
||||
#[test]
fn damerau_levenshtein_unrestricted_edit() {
    // Expected to be 3 here; the same pair is expected to be 4 for
    // `osa_distance`.
    assert_eq!(3, damerau_levenshtein("a cat", "an abct"));
}
|
||||
|
||||
#[test]
|
||||
fn levenshtein_against_vec_empty() {
|
||||
let v = Vec::new();
|
||||
@ -624,6 +804,30 @@ mod tests {
|
||||
assert_eq!(expected, result);
|
||||
}
|
||||
|
||||
#[test]
fn osa_distance_against_vec_empty() {
    // No candidates in, no distances out.
    let v = Vec::new();
    let result = osa_distance_against_vec("test", &v);
    let expected: Vec<usize> = Vec::new();
    assert_eq!(expected, result);
}
|
||||
|
||||
#[test]
fn osa_distance_against_vec_one() {
    // Single candidate that differs from "test" by one adjacent swap.
    let v = vec!["etst"];
    let result = osa_distance_against_vec("test", &v);
    let expected = vec![1];
    assert_eq!(expected, result);
}
|
||||
|
||||
#[test]
fn osa_distance_against_vec_many() {
    // Results come back in the same order as the candidates.
    let v = vec!["test", "test1", "test12", "test123", "", "tsvet"];
    let result = osa_distance_against_vec("test", &v);
    let expected = vec![0, 1, 2, 3, 4, 3];
    assert_eq!(expected, result);
}
|
||||
|
||||
#[test]
|
||||
fn damerau_levenshtein_against_vec_empty() {
|
||||
let v = Vec::new();
|
||||
@ -642,9 +846,9 @@ mod tests {
|
||||
|
||||
#[test]
fn damerau_levenshtein_against_vec_many() {
    // The last candidate is "tsvet", expected distance 2. The earlier
    // "tset" / 1 pair was only a shadowed leftover and is dropped.
    let v = vec!["test", "test1", "test12", "test123", "", "tsvet"];
    let result = damerau_levenshtein_against_vec("test", &v);
    let expected = vec![0, 1, 2, 3, 4, 2];
    assert_eq!(expected, result);
}
|
||||
|
||||
|
30
tests/lib.rs
30
tests/lib.rs
@ -1,8 +1,9 @@
|
||||
extern crate strsim;
|
||||
|
||||
use strsim::{hamming, levenshtein, damerau_levenshtein, jaro, jaro_winkler,
|
||||
levenshtein_against_vec, damerau_levenshtein_against_vec,
|
||||
jaro_against_vec, jaro_winkler_against_vec};
|
||||
use strsim::{hamming, levenshtein, osa_distance, damerau_levenshtein, jaro,
|
||||
jaro_winkler, levenshtein_against_vec, osa_distance_against_vec,
|
||||
damerau_levenshtein_against_vec, jaro_against_vec,
|
||||
jaro_winkler_against_vec};
|
||||
|
||||
#[test]
|
||||
fn hamming_works() {
|
||||
@ -17,9 +18,14 @@ fn levenshtein_works() {
|
||||
assert_eq!(3, levenshtein("kitten", "sitting"));
|
||||
}
|
||||
|
||||
#[test]
fn osa_distance_works() {
    // Smoke test through the public API, using the README example pair.
    assert_eq!(3, osa_distance("ac", "cba"));
}
|
||||
|
||||
#[test]
fn damerau_levenshtein_works() {
    // Smoke test through the public API.
    assert_eq!(3, damerau_levenshtein("damerau", "aderua"));
    assert_eq!(2, damerau_levenshtein("ac", "cba"));
}
|
||||
|
||||
#[test]
|
||||
@ -36,17 +42,25 @@ fn jaro_winkler_works() {
|
||||
|
||||
#[test]
fn levenshtein_against_vec_works() {
    // Uses the seven-candidate list; the shorter six-candidate list and its
    // expectation were only shadowed leftovers and are dropped.
    let v = vec!["test", "test1", "test12", "test123", "", "tset", "tsvet"];
    let result = levenshtein_against_vec("test", &v);
    let expected = vec![0, 1, 2, 3, 4, 2, 3];
    assert_eq!(expected, result);
}
|
||||
|
||||
#[test]
fn osa_distance_against_vec_works() {
    // Smoke test through the public API; results follow candidate order.
    let v = vec!["test", "test1", "test12", "test123", "", "tset", "tsvet"];
    let result = osa_distance_against_vec("test", &v);
    let expected = vec![0, 1, 2, 3, 4, 1, 3];
    assert_eq!(expected, result);
}
|
||||
|
||||
#[test]
fn damerau_levenshtein_against_vec_works() {
    // Uses the seven-candidate list; the shorter six-candidate list and its
    // expectation were only shadowed leftovers and are dropped.
    let v = vec!["test", "test1", "test12", "test123", "", "tset", "tsvet"];
    let result = damerau_levenshtein_against_vec("test", &v);
    let expected = vec![0, 1, 2, 3, 4, 1, 2];
    assert_eq!(expected, result);
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user