From a14fc43507c1366d9761f17f8551150b59645280 Mon Sep 17 00:00:00 2001
From: Egor
Date: Mon, 18 Mar 2024 22:40:37 +0800
Subject: [PATCH] Process non ascii symbols (#6)

---
 src/Levenshtein.php       | 46 +++++++++++++++++++++++++++++++++++++++++-----
 tests/LevenshteinTest.php | 11 +++++++++++
 2 files changed, 52 insertions(+), 5 deletions(-)

diff --git a/src/Levenshtein.php b/src/Levenshtein.php
index 2bac390..7fed48f 100644
--- a/src/Levenshtein.php
+++ b/src/Levenshtein.php
@@ -6,13 +6,49 @@ class Levenshtein
 {
     public static function distance(string $string1, string $string2, int $insertionCost = 1, $replacementCost = 1, $deletionCost = 1): int
     {
-        $string1 = mb_convert_encoding($string1, 'ASCII', 'utf8');
-        $string2 = mb_convert_encoding($string2, 'ASCII', 'utf8');
+        // a single map is shared by both strings so that equal characters get equal bytes
+        $map = [];
+        $string1 = self::utf8_to_extended_ascii($string1, $map);
+        $string2 = self::utf8_to_extended_ascii($string2, $map);
 
-        if (false === $string1 || false === $string2) {
-            throw new \InvalidArgumentException('Both, string1 and string2 have to be valid utf-8 strings.');
+        return levenshtein($string1, $string2, $insertionCost, $replacementCost, $deletionCost);
+    }
+
+    /**
+     * Replaces each multibyte UTF-8 character of $str with a single byte in the
+     * 128-255 range, so that the byte-oriented levenshtein() counts characters.
+     *
+     * @param array<string, string> $map multibyte character => replacement byte;
+     *                                   extended in place so both strings share it
+     *
+     * @throws \InvalidArgumentException on malformed UTF-8, or once more than 128
+     *                                   distinct multibyte characters have been met
+     */
+    private static function utf8_to_extended_ascii(string $str, array &$map): string
+    {
+        // bytes >= 0x80 that the pattern below does not match would pass through strtr()
+        // untouched and collide with the replacement bytes, so reject malformed UTF-8 early
+        if (1 !== preg_match('//u', $str)) {
+            throw new \InvalidArgumentException('Both, string1 and string2 have to be valid utf-8 strings.');
+        }
+
+        // find all multibyte characters (cf. utf-8 encoding specs)
+        $matches = [];
+        if (!preg_match_all('/[\xC0-\xF7][\x80-\xBF]+/', $str, $matches)) {
+            return $str; // plain ascii string
+        }
+
+        // update the encoding map with the characters not already met
+        foreach ($matches[0] as $mbc) {
+            if (!isset($map[$mbc])) {
+                if (\count($map) >= 128) {
+                    throw new \InvalidArgumentException('Strings with more than 128 individual unicode characters are not supported.');
+                }
+                $map[$mbc] = \chr(128 + \count($map));
+            }
         }
 
-        return levenshtein($string1, $string2, $insertionCost, $replacementCost, $deletionCost);
+        // finally remap non-ascii characters
+        return strtr($str, $map);
     }
 }
diff --git a/tests/LevenshteinTest.php b/tests/LevenshteinTest.php
index 718cbe5..3765a37 100644
--- a/tests/LevenshteinTest.php
+++ b/tests/LevenshteinTest.php
@@ -12,7 +12,18 @@ public function testLevenshtein(): void
         $this->assertSame(1, Levenshtein::distance('hello', 'helo'));
         $this->assertSame(2, Levenshtein::distance('hello', 'heo'));
         $this->assertSame(1, Levenshtein::distance('héllo', 'hello'));
+        $this->assertSame(2, Levenshtein::distance('Ñörbärm', 'Üörbarm'));
         $this->assertSame(2, Levenshtein::distance('garçonnière', 'garconniere'));
         $this->assertSame(1, Levenshtein::distance('garçonnière', 'garçonniere'));
+        $this->assertSame(1, Levenshtein::distance('пожар', 'пажар'));
+        $this->assertSame(1, Levenshtein::distance('пожар', 'пожаr'));
+        $this->assertSame(2, Levenshtein::distance('слово', 'слива'));
+        $this->assertNotSame(0, Levenshtein::distance('стул', 'вода'));
     }
+
+    public function testRejectsMalformedUtf8(): void
+    {
+        $this->expectException(\InvalidArgumentException::class);
+        Levenshtein::distance("\xE9", 'é');
+    }
 }
\ No newline at end of file