Skip to content

Commit

Permalink
only boost similarity in Jaro-Winkler once the Jaro similarity exceeds 0.7
Browse files Browse the repository at this point in the history
  • Loading branch information
maxbachmann committed Jan 4, 2024
1 parent 438e2c6 commit 1d2d6b8
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 9 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ This project attempts to adhere to [Semantic Versioning](http://semver.org).
- reduce runtime in our own benchmark by more than `70%`
- reduce binary size by more than `25%`

- only boost similarity in Jaro-Winkler once the Jaro similarity exceeds 0.7

### Fixed

- Fix transposition counting in Jaro and Jaro-Winkler.
Expand Down
22 changes: 13 additions & 9 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -179,16 +179,20 @@ where
&'b Iter2: IntoIterator<Item = Elem2>,
Elem1: PartialEq<Elem2>,
{
let jaro_distance = generic_jaro(a, b);
let sim = generic_jaro(a, b);

let prefix_length = a
.into_iter()
.take(4)
.zip(b.into_iter())
.take_while(|(a_elem, b_elem)| a_elem == b_elem)
.count();
if sim > 0.7 {
let prefix_length = a
.into_iter()
.take(4)
.zip(b.into_iter())
.take_while(|(a_elem, b_elem)| a_elem == b_elem)
.count();

return jaro_distance + 0.1 * prefix_length as f64 * (1.0 - jaro_distance);
sim + 0.1 * prefix_length as f64 * (1.0 - sim)
} else {
sim
}
}

/// Like Jaro but gives a boost to strings that have a common prefix.
Expand Down Expand Up @@ -918,7 +922,7 @@ mod tests {

#[test]
fn jaro_winkler_names() {
assert!((0.562 - jaro_winkler("Friedrich Nietzsche", "Fran-Paul Sartre")).abs() < 0.001);
assert!((0.452 - jaro_winkler("Friedrich Nietzsche", "Fran-Paul Sartre")).abs() < 0.001);
}

#[test]
Expand Down

0 comments on commit 1d2d6b8

Please sign in to comment.