Skip to content

Commit

Permalink
Added support for Damerau-Levenshtein
Browse files Browse the repository at this point in the history
  • Loading branch information
Toflar committed Sep 28, 2023
1 parent 8f64ef2 commit ab617b9
Show file tree
Hide file tree
Showing 7 changed files with 138 additions and 13 deletions.
8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,14 @@ You can configure the maximum index length and maximum alphabet size with the `C
paper for details on what they do. There's no such thing as a recommended size as it very much depends on what
you want to index and/or search.

This library deviates from the research paper in one aspect: it supports not only regular Levenshtein but also
Damerau-Levenshtein, which additionally allows transpositions. Whereas the Levenshtein algorithm calculates a distance of
`2` between `Muster` and `Mustre`, Damerau-Levenshtein calculates only `1` because `er` and `re` are swapped/transposed.
You can enable Damerau-Levenshtein using the third constructor argument of `Config`: `new Config(6, 4,
true)`.

Note: TODO.

## Customization

This library ships with the algorithm readily prepared for you to use. The main customization areas will be
Expand Down
2 changes: 1 addition & 1 deletion src/Alphabet/Utf8Alphabet.php
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ class Utf8Alphabet implements AlphabetInterface

public function map(string $char, int $alphabetSize): int
{
if (!isset($this->cache[$alphabetSize][$char])) {
if (! isset($this->cache[$alphabetSize][$char])) {
// +1 in order to never assign 0
$this->cache[$alphabetSize][$char] = (mb_ord($char, 'UTF-8') % $alphabetSize) + 1;
}
Expand Down
8 changes: 7 additions & 1 deletion src/Config.php
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@ class Config
{
public function __construct(
private int $indexLength,
private int $alphabetSize
private int $alphabetSize,
private bool $useDamerauLevenshtein = false
) {
}

Expand All @@ -19,4 +20,9 @@ public function getAlphabetSize(): int
{
return $this->alphabetSize;
}

/**
 * Tells whether Damerau-Levenshtein (which additionally allows transpositions)
 * should be used instead of regular Levenshtein.
 *
 * @return bool The value of the `$useDamerauLevenshtein` property.
 */
public function useDamerauLevenshtein(): bool
{
return $this->useDamerauLevenshtein;
}
}
57 changes: 56 additions & 1 deletion src/Levenshtein.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

class Levenshtein
{
public static function distance(string $string1, string $string2, int $insertionCost = 1, $replacementCost = 1, $deletionCost = 1)
public static function distance(string $string1, string $string2, int $insertionCost = 1, $replacementCost = 1, $deletionCost = 1): int
{
$string1 = mb_convert_encoding($string1, 'ASCII', 'utf8');
$string2 = mb_convert_encoding($string2, 'ASCII', 'utf8');
Expand All @@ -15,4 +15,59 @@ public static function distance(string $string1, string $string2, int $insertion

return levenshtein($string1, $string2, $insertionCost, $replacementCost, $deletionCost);
}

/**
 * Calculates the Damerau-Levenshtein distance between two UTF-8 strings.
 *
 * In addition to insertions, deletions and replacements, swapping two
 * adjacent characters counts as a single edit. This is the restricted
 * variant (a.k.a. "optimal string alignment"): a transposition is only
 * looked up two rows/columns back in the matrix.
 *
 * Both strings are split into characters exactly once using an explicit
 * UTF-8 encoding, so the lengths and the characters being compared are always
 * derived from the same encoding and no substring extraction happens inside
 * the nested loops.
 *
 * @param string    $string1           The source string
 * @param string    $string2           The target string
 * @param int       $insertionCost     Cost of inserting one character
 * @param int|mixed $replacementCost   Cost of replacing one character
 * @param int|mixed $deletionCost      Cost of deleting one character
 * @param int|mixed $transpositionCost Cost of swapping two adjacent characters
 *
 * @return int The accumulated cost of the cheapest edit sequence
 */
public static function distanceDamerau(string $string1, string $string2, int $insertionCost = 1, $replacementCost = 1, $deletionCost = 1, $transpositionCost = 1): int
{
    // Split once up front instead of calling mb_substr() for every matrix cell.
    $chars1 = mb_str_split($string1, 1, 'UTF-8');
    $chars2 = mb_str_split($string2, 1, 'UTF-8');
    $string1Length = count($chars1);
    $string2Length = count($chars2);

    $matrix = [[0]];

    // First column: turning a prefix of $string1 into the empty string (deletions only)
    for ($i = 1; $i <= $string1Length; ++$i) {
        $matrix[$i][0] = $matrix[$i - 1][0] + $deletionCost;
    }

    // First row: turning the empty string into a prefix of $string2 (insertions only)
    for ($j = 1; $j <= $string2Length; ++$j) {
        $matrix[0][$j] = $matrix[0][$j - 1] + $insertionCost;
    }

    for ($i = 1; $i <= $string1Length; ++$i) {
        $cOne = $chars1[$i - 1];

        for ($j = 1; $j <= $string2Length; ++$j) {
            $cTwo = $chars2[$j - 1];
            $isMatch = $cOne === $cTwo;

            // Deletion cost
            $del = $matrix[$i - 1][$j] + $deletionCost;

            // Insertion cost
            $ins = $matrix[$i][$j - 1] + $insertionCost;

            // Substitution cost, 0 if same
            $sub = $matrix[$i - 1][$j - 1] + ($isMatch ? 0 : $replacementCost);

            // Compute optimal
            $best = min($del, $ins, $sub);

            // Transposition: the current and the previous character appear swapped in the other string
            if ($i > 1 && $j > 1 && $cOne === $chars2[$j - 2] && $chars1[$i - 2] === $cTwo) {
                // Swapping two identical characters is free, just like a matching substitution
                $best = min($best, $matrix[$i - 2][$j - 2] + ($isMatch ? 0 : $transpositionCost));
            }

            $matrix[$i][$j] = $best;
        }
    }

    return $matrix[$string1Length][$string2Length];
}
}
22 changes: 19 additions & 3 deletions src/StateSetIndex.php
Original file line number Diff line number Diff line change
Expand Up @@ -81,12 +81,21 @@ public function index(array $strings): array
public function find(string $string, int $editDistance): array
{
$acceptedStringsPerState = $this->findAcceptedStrings($string, $editDistance);

$stringLength = mb_strlen($string);
$filtered = [];

foreach ($acceptedStringsPerState as $acceptedStrings) {
foreach ($acceptedStrings as $acceptedString) {
if (Levenshtein::distance($string, $acceptedString) <= $editDistance) {
// Early aborts (cheaper) for cases we know are absolutely never going to match
if (abs($stringLength - mb_strlen($acceptedString)) > $editDistance) {
continue;
}

$distance = $this->config->useDamerauLevenshtein() ?
Levenshtein::distanceDamerau($string, $acceptedString) :
Levenshtein::distance($string, $acceptedString);

if ($distance <= $editDistance) {
$filtered[] = $acceptedString;
}
}
Expand Down Expand Up @@ -121,7 +130,9 @@ public function findMatchingStates(string $string, int $editDistance): array
// Initial states
$states = $this->getReachableStates(0, $editDistance);

$this->loopOverEveryCharacter($string, function (int $mappedChar, $char) use (&$states, $editDistance) {
$prevChar = null;

$this->loopOverEveryCharacter($string, function (int $mappedChar, $char) use (&$states, &$prevChar, $editDistance) {
$nextStates = new CostAnnotatedStateSet();

foreach ($states->all() as $state => $cost) {
Expand All @@ -140,6 +151,10 @@ public function findMatchingStates(string $string, int $editDistance): array
if ($i === $this->getAlphabet()->map($char, $this->config->getAlphabetSize())) {
// Match
$newStates->add($newState, $cost);
} elseif (null !== $prevChar && $i === $this->getAlphabet()->map($prevChar, $this->config->getAlphabetSize())) {
// Transposition according to Damerau-Levenshtein (this is not part of the research paper and
// the only deviation from it)
$newStates->add($newState, $cost);
} elseif ($cost + 1 <= $editDistance) {
// Substitution
$newStates->add($newState, $cost + 1);
Expand All @@ -157,6 +172,7 @@ public function findMatchingStates(string $string, int $editDistance): array
}
}

$prevChar = $this->config->useDamerauLevenshtein() ? $char : null;
$states = $nextStates;
});

Expand Down
15 changes: 15 additions & 0 deletions tests/LevenshteinTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,20 @@ public function testLevenshtein(): void
$this->assertSame(1, Levenshtein::distance('héllo', 'hello'));
$this->assertSame(2, Levenshtein::distance('garçonnière', 'garconniere'));
$this->assertSame(1, Levenshtein::distance('garçonnière', 'garçonniere'));

// Transposition (o and ç are swapped = distance of 2 in regular Levenshtein)
$this->assertSame(2, Levenshtein::distance('garçonnière', 'garoçnnière'));
}

/**
 * Verifies Damerau-Levenshtein distances for deletions, replacements and
 * a transposition of two adjacent characters.
 */
public function testDamerauLevenshtein(): void
{
    // Each entry: [expected distance, first string, second string]
    $expectations = [
        [1, 'hello', 'helo'],
        [2, 'hello', 'heo'],
        [1, 'héllo', 'hello'],
        [2, 'garçonnière', 'garconniere'],
        [1, 'garçonnière', 'garçonniere'],
        // Transposition (o and ç are swapped = distance of 1 in Damerau-Levenshtein)
        [1, 'garçonnière', 'garoçnnière'],
    ];

    foreach ($expectations as [$expected, $first, $second]) {
        $this->assertSame($expected, Levenshtein::distanceDamerau($first, $second));
    }
}
}
39 changes: 32 additions & 7 deletions tests/StateSetIndexTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,19 +4,17 @@

use PHPUnit\Framework\TestCase;
use Toflar\StateSetIndex\Alphabet\InMemoryAlphabet;
use Toflar\StateSetIndex\Alphabet\Utf8Alphabet;
use Toflar\StateSetIndex\Config;
use Toflar\StateSetIndex\DataStore\InMemoryDataStore;
use Toflar\StateSetIndex\DataStore\NullDataStore;
use Toflar\StateSetIndex\StateSet\InMemoryStateSet;
use Toflar\StateSetIndex\StateSetIndex;

class StateSetIndexTest extends TestCase
{
public function testResultsMatchResearchPaper(): void
{
$stateSet = new InMemoryStateSet();
$dataStore = new InMemoryDataStore();
$stringSet = ['Mueller', 'Müller', 'Muentner', 'Muster', 'Mustermann'];

$stateSetIndex = new StateSetIndex(
new Config(6, 4),
new InMemoryAlphabet([
Expand All @@ -32,14 +30,41 @@ public function testResultsMatchResearchPaper(): void
'm' => 2,
'a' => 3,
]),
$stateSet,
$dataStore
new InMemoryStateSet(),
new InMemoryDataStore()
);

$stateSetIndex->index($stringSet);
$stateSetIndex->index(['Mueller', 'Müller', 'Muentner', 'Muster', 'Mustermann']);

$this->assertSame([104, 419, 467, 1677, 1811], $stateSetIndex->findMatchingStates('Mustre', 2));
$this->assertSame([1811 => ['Mueller'], 1677 => ['Muster', 'Mustermann']], $stateSetIndex->findAcceptedStrings('Mustre', 2));
$this->assertSame(['Muster'], $stateSetIndex->find('Mustre', 2));

// Should consider transposition (Damerau-Levenshtein) as distance of 2
$this->assertSame([104, 419, 467, 1677, 1811], $stateSetIndex->findMatchingStates('Mustremann', 2));
$this->assertSame(['Mustermann'], $stateSetIndex->find('Mustremann', 2));
$this->assertSame([419], $stateSetIndex->findMatchingStates('Mustremann', 1));
$this->assertSame([], $stateSetIndex->find('Mustremann', 1));
}

/**
 * Runs the index with the Utf8Alphabet and checks matching states,
 * accepted strings and filtered results for "Mustre" at edit distance 2.
 */
public function testWithUtf8Alphabet(): void
{
    $index = new StateSetIndex(
        new Config(6, 4),
        new Utf8Alphabet(),
        new InMemoryStateSet(),
        new InMemoryDataStore()
    );
    $index->index(['Mueller', 'Müller', 'Muentner', 'Muster', 'Mustermann']);

    $needle = 'Mustre';
    $maxDistance = 2;

    $expectedStates = [177, 710, 2710, 2843];
    $this->assertSame($expectedStates, $index->findMatchingStates($needle, $maxDistance));

    $expectedAccepted = [2710 => ['Mueller'], 2843 => ['Muster', 'Mustermann']];
    $this->assertSame($expectedAccepted, $index->findAcceptedStrings($needle, $maxDistance));

    $this->assertSame(['Muster'], $index->find($needle, $maxDistance));
}

/**
 * Runs the index with Damerau-Levenshtein enabled (third Config argument)
 * and checks that the transposition in "Mustremann" counts as distance 1.
 */
public function testDamerauLevenshtein(): void
{
    $index = new StateSetIndex(
        new Config(6, 4, true),
        new Utf8Alphabet(),
        new InMemoryStateSet(),
        new InMemoryDataStore()
    );
    $index->index(['Mueller', 'Müller', 'Muentner', 'Muster', 'Mustermann']);

    $needle = 'Mustremann';
    $maxDistance = 1;

    // Should consider transposition (Damerau-Levenshtein) as distance of 1
    $expectedStates = [677, 710, 2710, 2743, 2843];
    $this->assertSame($expectedStates, $index->findMatchingStates($needle, $maxDistance));

    $expectedAccepted = [2710 => ['Mueller'], 2743 => ['Muentner'], 2843 => ['Muster', 'Mustermann']];
    $this->assertSame($expectedAccepted, $index->findAcceptedStrings($needle, $maxDistance));

    $this->assertSame(['Mustermann'], $index->find($needle, $maxDistance));
}
}

0 comments on commit ab617b9

Please sign in to comment.