Skip to content

Commit

Permalink
Revert "[performance] Improve the performance of damerau_levenshtein (#…
Browse files Browse the repository at this point in the history
…32)"

This reverts commit d6717db.
  • Loading branch information
dguo authored Apr 30, 2019
1 parent 7aacbc5 commit 1a56487
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 46 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ os:
- windows

rust:
- 1.31.0
- 1.27.0
- nightly
- beta
- stable
Expand Down
4 changes: 0 additions & 4 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,5 @@ repository = "https://github.com/dguo/strsim-rs"
documentation = "https://docs.rs/strsim/"
exclude = ["/.travis.yml", "/dev"]

[dependencies]
hashbrown = "0.1.8"
ndarray = "0.12.1"

[badges]
travis-ci = { repository = "dguo/strsim-rs" }
83 changes: 42 additions & 41 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,18 +1,13 @@
//! This library implements string similarity metrics.

extern crate hashbrown;
extern crate ndarray;

use std::char;
use std::cmp::{max, min};
use std::collections::HashMap;
use std::error::Error;
use std::fmt::{self, Display, Formatter};
use std::hash::Hash;
use std::str::Chars;

use hashbrown::HashMap;
use ndarray::Array2;

#[derive(Debug, PartialEq)]
pub enum StrSimError {
DifferentLengthArgs,
Expand Down Expand Up @@ -108,7 +103,7 @@ pub fn generic_jaro<'a, 'b, Iter1, Iter2, Elem1, Elem2>(a: &'a Iter1, b: &'b Ite

for (j, b_elem) in b.into_iter().enumerate() {
if min_bound <= j && j <= max_bound && a_elem == b_elem &&
!b_consumed[j] {
!b_consumed[j] {
b_consumed[j] = true;
matches += 1.0;

Expand All @@ -126,8 +121,8 @@ pub fn generic_jaro<'a, 'b, Iter1, Iter2, Elem1, Elem2>(a: &'a Iter1, b: &'b Ite
0.0
} else {
(1.0 / 3.0) * ((matches / a_len as f64) +
(matches / b_len as f64) +
((matches - transpositions) / matches))
(matches / b_len as f64) +
((matches - transpositions) / matches))
}
}

Expand Down Expand Up @@ -166,7 +161,7 @@ pub fn generic_jaro_winkler<'a, 'b, Iter1, Iter2, Elem1, Elem2>(a: &'a Iter1, b:
let prefix_length = a.into_iter()
.zip(b.into_iter())
.take_while(|&(ref a_elem, ref b_elem)| a_elem == b_elem)
.count();
.count();

let jaro_winkler_distance =
jaro_distance + (0.1 * prefix_length as f64 * (1.0 - jaro_distance));
Expand Down Expand Up @@ -202,21 +197,25 @@ pub fn generic_levenshtein<'a, 'b, Iter1, Iter2, Elem1, Elem2>(a: &'a Iter1, b:
where &'a Iter1: IntoIterator<Item=Elem1>,
&'b Iter2: IntoIterator<Item=Elem2>,
Elem1: PartialEq<Elem2> {
let a_len = a.into_iter().count();
let b_len = b.into_iter().count();

if a.into_iter().next().is_none() { return b_len; }
if a_len == 0 { return b_len; }
if b_len == 0 { return a_len; }

let mut cache: Vec<usize> = (1..b_len+1).collect();

let mut result = 0;
let mut distance_a;
let mut distance_b;

for (i, a_elem) in a.into_iter().enumerate() {
result = i + 1;
let mut distance_b = i;
result = i;
distance_b = i;

for (j, b_elem) in b.into_iter().enumerate() {
let cost = if a_elem == b_elem { 0usize } else { 1usize };
let distance_a = distance_b + cost;
let cost = if a_elem == b_elem { 0 } else { 1 };
distance_a = distance_b + cost;
distance_b = cache[j];
result = min(result + 1, min(distance_a, distance_b + 1));
cache[j] = result;
Expand Down Expand Up @@ -294,7 +293,7 @@ pub fn osa_distance(a: &str, b: &str) -> usize {
min(prev_distances[j + 1] + 1,
prev_distances[j] + cost));
if i > 0 && j > 0 && a_char != b_char &&
a_char == prev_b_char && b_char == prev_a_char {
a_char == prev_b_char && b_char == prev_a_char {
curr_distances[j + 1] = min(curr_distances[j + 1],
prev_two_distances[j - 1] + 1);
}
Expand Down Expand Up @@ -327,53 +326,55 @@ pub fn generic_damerau_levenshtein<Elem>(a_elems: &[Elem], b_elems: &[Elem]) ->
if a_len == 0 { return b_len; }
if b_len == 0 { return a_len; }

let mut distances = Array2::<usize>::zeros((a_len + 2, b_len + 2));
let mut distances = vec![vec![0; b_len + 2]; a_len + 2];
let max_distance = a_len + b_len;
distances[[0, 0]] = max_distance;
distances[0][0] = max_distance;

for i in 0..(a_len + 1) {
distances[[i + 1, 0]] = max_distance;
distances[[i + 1, 1]] = i;
distances[i + 1][0] = max_distance;
distances[i + 1][1] = i;
}

for j in 0..(b_len + 1) {
distances[[0, j + 1]] = max_distance;
distances[[1, j + 1]] = j;
distances[0][j + 1] = max_distance;
distances[1][j + 1] = j;
}

let mut elems: HashMap<Elem, usize> = HashMap::with_capacity(64);
let mut elems: HashMap<Elem, usize> = HashMap::new();

for i in 1..(a_len + 1) {
let mut db = 0;

for j in 1..(b_len + 1) {
let k = match elems.get(&b_elems[j - 1]) {
Some(&value) => value,
Some(value) => value.clone(),
None => 0
};

let l = db;

let insertion_cost = distances[[i, j + 1]] + 1;
let deletion_cost = distances[[i + 1, j]] + 1;
let transposition_cost = distances[[k, db]] + (i - k - 1) + 1 +
(j - db - 1);

let mut substitution_cost = distances[[i, j]] + 1;
let mut cost = 1;
if a_elems[i - 1] == b_elems[j - 1] {
cost = 0;
db = j;
substitution_cost -= 1;
}

distances[[i + 1, j + 1]] = min(substitution_cost,
min(insertion_cost,
min(deletion_cost,
transposition_cost)));
let substitution_cost = distances[i][j] + cost;
let insertion_cost = distances[i][j + 1] + 1;
let deletion_cost = distances[i + 1][j] + 1;
let transposition_cost = distances[k][l] + (i - k - 1) + 1 +
(j - l - 1);

distances[i + 1][j + 1] = min(substitution_cost,
min(insertion_cost,
min(deletion_cost,
transposition_cost)));
}

elems.insert(a_elems[i - 1].clone(), i);
}

distances[[a_len + 1, b_len + 1]]
distances[a_len + 1][b_len + 1]
}

/// Like optimal string alignment, but substrings can be edited an unlimited
Expand Down Expand Up @@ -456,7 +457,7 @@ mod tests {

#[test]
fn jaro_both_empty() {
assert_eq!(1.0, jaro("", ""));
assert_eq!(1.0, jaro("", ""));
}

#[test]
Expand Down Expand Up @@ -544,9 +545,9 @@ mod tests {
#[test]
fn jaro_winkler_multibyte() {
assert!((0.89 - jaro_winkler("testabctest", "testöঙ香test")).abs() <
0.001);
0.001);
assert!((0.89 - jaro_winkler("testöঙ香test", "testabctest")).abs() <
0.001);
0.001);
}

#[test]
Expand Down Expand Up @@ -579,7 +580,7 @@ mod tests {
#[test]
fn jaro_winkler_long_prefix() {
assert!((0.911 - jaro_winkler("cheeseburger", "cheese fries")).abs() <
0.001);
0.001);
}

#[test]
Expand All @@ -596,7 +597,7 @@ mod tests {
fn jaro_winkler_very_long_prefix() {
assert!((1.0 - jaro_winkler("thequickbrownfoxjumpedoverx",
"thequickbrownfoxjumpedovery")).abs() <
0.001);
0.001);
}

#[test]
Expand Down

0 comments on commit 1a56487

Please sign in to comment.