Fix Damerau-Levenshtein (#12)

Rename the original implementation to osa_distance, and implement the actual
metric.
This commit is contained in:
Danny Guo 2016-12-26 22:54:32 -05:00 committed by GitHub
parent b2fc338541
commit f8d8c1fff8
5 changed files with 271 additions and 34 deletions

View File

@ -1,6 +1,14 @@
# Change Log
This project attempts to adhere to [Semantic Versioning](http://semver.org).
## [0.6.0] - (2016-12-26)
### Added
- Add optimal string alignment distance
### Fixed
- Fix Damerau-Levenshtein implementation (previous implementation was actually
optimal string alignment; see this [Damerau-Levenshtein explanation])
## [0.5.2] - (2016-11-21)
### Changed
- Remove Cargo generated documentation in favor of a [docs.rs] link
@ -73,7 +81,8 @@ vector of results (thanks @ovarene)
### Added
- Implement Hamming, Jaro, Jaro-Winkler, and Levenshtein
[Unreleased]: https://github.com/dguo/strsim-rs/compare/0.5.2...HEAD
[Unreleased]: https://github.com/dguo/strsim-rs/compare/0.6.0...HEAD
[0.6.0]: https://github.com/dguo/strsim-rs/compare/0.5.2...0.6.0
[0.5.2]: https://github.com/dguo/strsim-rs/compare/0.5.1...0.5.2
[0.5.1]: https://github.com/dguo/strsim-rs/compare/0.5.0...0.5.1
[0.5.0]: https://github.com/dguo/strsim-rs/compare/0.4.1...0.5.0
@ -89,4 +98,6 @@ vector of results (thanks @ovarene)
[0.1.1]: https://github.com/dguo/strsim-rs/compare/0.1.0...0.1.1
[0.1.0]: https://github.com/dguo/strsim-rs/compare/fabad4...0.1.0
[docs.rs]: https://docs.rs/strsim/
[Damerau-Levenshtein explanation]:
http://scarcitycomputing.blogspot.com/2013/04/damerau-levenshtein-edit-distance.html

View File

@ -1,7 +1,7 @@
[package]
name = "strsim"
version = "0.5.2"
version = "0.6.0"
authors = ["Danny Guo <dannyguo91@gmail.com>"]
description = """
Implementations of string similarity metrics.

View File

@ -2,31 +2,31 @@
[Rust](https://www.rust-lang.org) implementations of [string similarity metrics]:
- [Hamming]
- [Levenshtein] and [Damerau-Levenshtein]
- [Levenshtein]
- [Optimal string alignment]
- [Damerau-Levenshtein]
- [Jaro and Jaro-Winkler] - this implementation of Jaro-Winkler does not limit the common prefix length
### Installation
```toml
# Cargo.toml
[dependencies]
strsim = "0.5.2"
strsim = "0.6.0"
```
### [Documentation](https://docs.rs/strsim/)
You can change the version in the URL to see the documentation for an older
version listed in the
[changelog](https://github.com/dguo/strsim-rs/blob/master/CHANGELOG.md).
### Usage
```rust
extern crate strsim;
use strsim::{hamming, levenshtein, damerau_levenshtein, jaro, jaro_winkler,
levenshtein_against_vec, damerau_levenshtein_against_vec,
jaro_against_vec, jaro_winkler_against_vec};
use strsim::{hamming, levenshtein, osa_distance, damerau_levenshtein, jaro,
jaro_winkler, levenshtein_against_vec, osa_distance_against_vec,
damerau_levenshtein_against_vec, jaro_against_vec,
jaro_winkler_against_vec};
fn main() {
match hamming("hamming", "hammers") {
@ -36,7 +36,9 @@ fn main() {
assert_eq!(3, levenshtein("kitten", "sitting"));
assert_eq!(1, damerau_levenshtein("specter", "spectre"));
assert_eq!(3, osa_distance("ac", "cba"));
assert_eq!(2, damerau_levenshtein("ac", "cba"));
assert!((0.392 - jaro("Friedrich Nietzsche", "Jean-Paul Sartre")).abs() <
0.001);
@ -45,13 +47,16 @@ fn main() {
0.001);
// get vectors of values back
let v = vec!["test", "test1", "test12", "test123", "", "tset"];
let v = vec!["test", "test1", "test12", "test123", "", "tset", "tsvet"];
assert_eq!(levenshtein_against_vec("test", &v),
vec![0, 1, 2, 3, 4, 2]);
vec![0, 1, 2, 3, 4, 2, 3]);
assert_eq!(osa_distance_against_vec("test", &v),
vec![0, 1, 2, 3, 4, 1, 3]);
assert_eq!(damerau_levenshtein_against_vec("test", &v),
vec![0, 1, 2, 3, 4, 1]);
vec![0, 1, 2, 3, 4, 1, 2]);
let jaro_distances = jaro_against_vec("test", &v);
let jaro_expected = vec![1.0, 0.933333, 0.888889, 0.857143, 0.0, 0.916667];
@ -72,11 +77,11 @@ fn main() {
```
### Development
Install [Vagrant](https://www.vagrantup.com), and run `vagrant up`.
If you don't want to install Rust itself, you can install [Docker], and run
`$ ./dev`. This should bring up a temporary container from which you can run
[cargo] commands.
### License
[MIT](https://github.com/dguo/strsim-rs/blob/master/LICENSE)
[string similarity metrics]:http://en.wikipedia.org/wiki/String_metric
@ -84,4 +89,7 @@ Install [Vagrant](https://www.vagrantup.com), and run `vagrant up`.
[Jaro and Jaro-Winkler]:http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance
[Levenshtein]:http://en.wikipedia.org/wiki/Levenshtein_distance
[Hamming]:http://en.wikipedia.org/wiki/Hamming_distance
[Optimal string alignment]:https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance#Optimal_string_alignment_distance
[Docker]:https://docs.docker.com/engine/installation/
[cargo]:https://github.com/rust-lang/cargo

View File

@ -1,8 +1,8 @@
//! This library implements string similarity metrics. Includes Hamming,
//! Levenshtein, Jaro, and Jaro-Winkler.
//! This library implements string similarity metrics.
use std::char;
use std::cmp::{max, min};
use std::collections::HashMap;
#[derive(Debug, PartialEq)]
pub enum StrSimError {
@ -225,14 +225,15 @@ pub fn levenshtein_against_vec(a: &str, v: &[&str]) -> Vec<usize> {
v.iter().map(|b| levenshtein(a, b)).collect()
}
/// Same as Levenshtein but allows for adjacent transpositions.
/// Like Levenshtein but allows for adjacent transpositions. Each substring can
/// only be edited once.
///
/// ```
/// use strsim::damerau_levenshtein;
/// use strsim::osa_distance;
///
/// assert_eq!(3, damerau_levenshtein("damerau", "aderua"));
/// assert_eq!(3, osa_distance("ab", "bca"));
/// ```
pub fn damerau_levenshtein(a: &str, b: &str) -> usize {
pub fn osa_distance(a: &str, b: &str) -> usize {
let a_len = a.chars().count();
let b_len = b.chars().count();
if a == b { return 0; }
@ -275,7 +276,93 @@ pub fn damerau_levenshtein(a: &str, b: &str) -> usize {
}
curr_distances[b_len]
}
}
/// Computes the optimal string alignment distance from `a` to every string in
/// `v`. The returned vector has one entry per element of `v`, in the same
/// order.
///
/// ```
/// use strsim::osa_distance_against_vec;
///
/// let v = vec!["test", "test1", "test12", "test123", "", "tset"];
/// let result = osa_distance_against_vec("test", &v);
/// let expected = vec![0, 1, 2, 3, 4, 1];
/// assert_eq!(expected, result);
/// ```
pub fn osa_distance_against_vec(a: &str, v: &[&str]) -> Vec<usize> {
    let mut distances = Vec::with_capacity(v.len());
    for candidate in v {
        distances.push(osa_distance(a, candidate));
    }
    distances
}
/// Computes the unrestricted Damerau-Levenshtein distance between `a` and `b`.
///
/// In contrast to optimal string alignment, a substring may be edited any
/// number of times, and the triangle inequality holds.
///
/// ```
/// use strsim::damerau_levenshtein;
///
/// assert_eq!(2, damerau_levenshtein("ab", "bca"));
/// ```
pub fn damerau_levenshtein(a: &str, b: &str) -> usize {
    if a == b {
        return 0;
    }

    let source: Vec<char> = a.chars().collect();
    let target: Vec<char> = b.chars().collect();
    let (rows, cols) = (source.len(), target.len());

    if rows == 0 {
        return cols;
    }
    if cols == 0 {
        return rows;
    }

    // The table has an extra leading row and column filled with `sentinel`
    // (the combined length of both inputs). The second row and column hold
    // the distances from the empty prefix: 0, 1, 2, ...
    let sentinel = rows + cols;
    let mut table = vec![vec![0; cols + 2]; rows + 2];
    table[0][0] = sentinel;
    for (index, row) in table.iter_mut().enumerate().skip(1) {
        row[0] = sentinel;
        row[1] = index - 1;
    }
    for col in 1..(cols + 2) {
        table[0][col] = sentinel;
        table[1][col] = col - 1;
    }

    // For every character of `a` handled so far, the 1-based row in which it
    // was most recently seen. A missing entry is treated as row 0.
    let mut last_row: HashMap<char, usize> = HashMap::new();

    for (i, &a_char) in source.iter().enumerate() {
        let row = i + 1;
        // 1-based column of the latest match found in this row (0 if none).
        let mut last_match_col = 0;

        for (j, &b_char) in target.iter().enumerate() {
            let col = j + 1;
            let prev_row = last_row.get(&b_char).map_or(0, |&seen| seen);
            let prev_col = last_match_col;

            let cost = if a_char == b_char {
                last_match_col = col;
                0
            } else {
                1
            };

            let substitution = table[row][col] + cost;
            let insertion = table[row][col + 1] + 1;
            let deletion = table[row + 1][col] + 1;
            // Jump back to the cell before the earlier occurrences, then pay
            // for the characters skipped on both sides plus one transposition.
            let transposition = table[prev_row][prev_col] + (row - prev_row - 1) + 1 +
                                (col - prev_col - 1);

            table[row + 1][col + 1] = min(min(substitution, insertion),
                                          min(deletion, transposition));
        }

        last_row.insert(a_char, row);
    }

    table[rows + 1][cols + 1]
}
/// Calculates the Damerau-Levenshtein distance between a string and each string
/// in a vector. Returns a vector of corresponding values.
@ -527,6 +614,89 @@ mod tests {
assert_eq!(6, levenshtein("kitten", ""));
}
// Two empty strings are expected to be at distance 0.
#[test]
fn osa_distance_empty() {
    let distance = osa_distance("", "");
    assert_eq!(0, distance);
}
// Identical inputs are expected to be at distance 0.
#[test]
fn osa_distance_same() {
    let distance = osa_distance("damerau", "damerau");
    assert_eq!(0, distance);
}
// With an empty first argument the expected distance is the length of the
// second one.
#[test]
fn osa_distance_first_empty() {
    let distance = osa_distance("", "damerau");
    assert_eq!(7, distance);
}
// With an empty second argument the expected distance is the length of the
// first one.
#[test]
fn osa_distance_second_empty() {
    let distance = osa_distance("damerau", "");
    assert_eq!(7, distance);
}
// "ca" -> "abc" is expected to cost 3 under optimal string alignment.
#[test]
fn osa_distance_diff() {
    let distance = osa_distance("ca", "abc");
    assert_eq!(3, distance);
}
// Short inputs that differ in several places.
#[test]
fn osa_distance_diff_short() {
    let distance = osa_distance("damerau", "aderua");
    assert_eq!(3, distance);
}
// Same pair as the short-diff case with the arguments swapped.
#[test]
fn osa_distance_diff_reversed() {
    let distance = osa_distance("aderua", "damerau");
    assert_eq!(3, distance);
}
// Multi-byte characters are expected to count as one unit each, in either
// argument position.
#[test]
fn osa_distance_diff_multibyte() {
    let multibyte = "öঙ香";
    let ascii = "abc";
    assert_eq!(3, osa_distance(multibyte, ascii));
    assert_eq!(3, osa_distance(ascii, multibyte));
}
// Inputs of different lengths, shorter one first.
#[test]
fn osa_distance_diff_unequal_length() {
    let distance = osa_distance("damerau", "aderuaxyz");
    assert_eq!(6, distance);
}
// Inputs of different lengths, longer one first.
#[test]
fn osa_distance_diff_unequal_length_reversed() {
    let distance = osa_distance("aderuaxyz", "damerau");
    assert_eq!(6, distance);
}
// Two capitalised names of equal length.
#[test]
fn osa_distance_diff_comedians() {
    let distance = osa_distance("Stewart", "Colbert");
    assert_eq!(5, distance);
}
// Several independent adjacent swaps within one pair of strings.
#[test]
fn osa_distance_many_transpositions() {
    let distance = osa_distance("abcdefghijkl", "bacedfgihjlk");
    assert_eq!(4, distance);
}
// Two sentence-length inputs.
#[test]
fn osa_distance_diff_longer() {
    let distance = osa_distance(
        "The quick brown fox jumped over the angry dog.",
        "Lehem ipsum dolor sit amet, dicta latine an eam.",
    );
    assert_eq!(36, distance);
}
// A swap of the first two characters is expected to cost 1.
#[test]
fn osa_distance_beginning_transposition() {
    let distance = osa_distance("foobar", "ofobar");
    assert_eq!(1, distance);
}
// A swap of the last two characters is expected to cost 1.
#[test]
fn osa_distance_end_transposition() {
    let distance = osa_distance("specter", "spectre");
    assert_eq!(1, distance);
}
// Expected value for this pair under the restricted (optimal string
// alignment) metric.
#[test]
fn osa_distance_restricted_edit() {
    let distance = osa_distance("a cat", "an abct");
    assert_eq!(4, distance);
}
#[test]
fn damerau_levenshtein_empty() {
assert_eq!(0, damerau_levenshtein("", ""));
@ -547,6 +717,11 @@ mod tests {
assert_eq!(7, damerau_levenshtein("damerau", ""));
}
// "ca" -> "abc" is expected to cost 2 under Damerau-Levenshtein.
#[test]
fn damerau_levenshtein_diff() {
    let distance = damerau_levenshtein("ca", "abc");
    assert_eq!(2, distance);
}
#[test]
fn damerau_levenshtein_diff_short() {
assert_eq!(3, damerau_levenshtein("damerau", "aderua"));
@ -600,6 +775,11 @@ mod tests {
assert_eq!(1, damerau_levenshtein("specter", "spectre"));
}
// Expected value for this pair under the unrestricted metric.
#[test]
fn damerau_levenshtein_unrestricted_edit() {
    let distance = damerau_levenshtein("a cat", "an abct");
    assert_eq!(3, distance);
}
#[test]
fn levenshtein_against_vec_empty() {
let v = Vec::new();
@ -624,6 +804,30 @@ mod tests {
assert_eq!(expected, result);
}
// An empty candidate list is expected to yield an empty result.
#[test]
fn osa_distance_against_vec_empty() {
    let candidates: Vec<&str> = Vec::new();
    let distances = osa_distance_against_vec("test", &candidates);
    assert_eq!(Vec::<usize>::new(), distances);
}
// A single candidate is expected to yield a single distance.
#[test]
fn osa_distance_against_vec_one() {
    let candidates = ["etst"];
    let distances = osa_distance_against_vec("test", &candidates);
    assert_eq!(vec![1], distances);
}
// Several candidates; results are expected in candidate order.
#[test]
fn osa_distance_against_vec_many() {
    let candidates = ["test", "test1", "test12", "test123", "", "tsvet"];
    let distances = osa_distance_against_vec("test", &candidates);
    assert_eq!(vec![0, 1, 2, 3, 4, 3], distances);
}
#[test]
fn damerau_levenshtein_against_vec_empty() {
let v = Vec::new();
@ -642,9 +846,9 @@ mod tests {
#[test]
fn damerau_levenshtein_against_vec_many() {
let v = vec!["test", "test1", "test12", "test123", "", "tset"];
let v = vec!["test", "test1", "test12", "test123", "", "tsvet"];
let result = damerau_levenshtein_against_vec("test", &v);
let expected = vec![0, 1, 2, 3, 4, 1];
let expected = vec![0, 1, 2, 3, 4, 2];
assert_eq!(expected, result);
}

View File

@ -1,8 +1,9 @@
extern crate strsim;
use strsim::{hamming, levenshtein, damerau_levenshtein, jaro, jaro_winkler,
levenshtein_against_vec, damerau_levenshtein_against_vec,
jaro_against_vec, jaro_winkler_against_vec};
use strsim::{hamming, levenshtein, osa_distance, damerau_levenshtein, jaro,
jaro_winkler, levenshtein_against_vec, osa_distance_against_vec,
damerau_levenshtein_against_vec, jaro_against_vec,
jaro_winkler_against_vec};
#[test]
fn hamming_works() {
@ -17,9 +18,14 @@ fn levenshtein_works() {
assert_eq!(3, levenshtein("kitten", "sitting"));
}
// Check of `osa_distance` through the crate's public API.
#[test]
fn osa_distance_works() {
    let distance = osa_distance("ac", "cba");
    assert_eq!(3, distance);
}
// Check of `damerau_levenshtein` through the crate's public API.
#[test]
fn damerau_levenshtein_works() {
    let longer_pair = damerau_levenshtein("damerau", "aderua");
    assert_eq!(3, longer_pair);

    let shorter_pair = damerau_levenshtein("ac", "cba");
    assert_eq!(2, shorter_pair);
}
#[test]
@ -36,17 +42,25 @@ fn jaro_winkler_works() {
#[test]
fn levenshtein_against_vec_works() {
let v = vec!["test", "test1", "test12", "test123", "", "tset"];
let v = vec!["test", "test1", "test12", "test123", "", "tset", "tsvet"];
let result = levenshtein_against_vec("test", &v);
let expected = vec![0, 1, 2, 3, 4, 2];
let expected = vec![0, 1, 2, 3, 4, 2, 3];
assert_eq!(expected, result);
}
// Check of `osa_distance_against_vec` through the crate's public API.
#[test]
fn osa_distance_against_vec_works() {
    let candidates = ["test", "test1", "test12", "test123", "", "tset", "tsvet"];
    let distances = osa_distance_against_vec("test", &candidates);
    assert_eq!(vec![0, 1, 2, 3, 4, 1, 3], distances);
}
#[test]
fn damerau_levenshtein_against_vec_works() {
let v = vec!["test", "test1", "test12", "test123", "", "tset"];
let v = vec!["test", "test1", "test12", "test123", "", "tset", "tsvet"];
let result = damerau_levenshtein_against_vec("test", &v);
let expected = vec![0, 1, 2, 3, 4, 1];
let expected = vec![0, 1, 2, 3, 4, 1, 2];
assert_eq!(expected, result);
}