From 47565f7d6fc51add0773b71005158bfcc0dedd4a Mon Sep 17 00:00:00 2001
From: Mauro Cassani
Date: Wed, 5 Jan 2022 10:56:05 +0100
Subject: [PATCH] WholeTextFinder::find multi byte safe

---
 README.md                                | 33 +++++++++++++++--
 src/Helper/Replacer.php                  |  8 ++---
 src/Helper/Strings.php                   | 10 ++++++
 src/WholeTextFinder.php                  | 46 ++++++++++++++++++++++++
 tests/StringsTest.php                    | 12 +++++++
 tests/WholeTextFinderReplacementTest.php |  6 ++--
 tests/WholeTextFinderTest.php            | 36 +++++++++++++++++--
 7 files changed, 139 insertions(+), 12 deletions(-)

diff --git a/README.md b/README.md
index f655599..01a44ec 100644
--- a/README.md
+++ b/README.md
@@ -48,6 +48,35 @@ $matches = WholeTextFinder::find($haystack, $needle);
 
 ```
 
+### Multibyte strings
+
+Please note that the `WholeTextFinder::find` function is multibyte safe: it returns the character (not byte) positions of the matches in the original phrase. For example:
+
+```php
+//..
+use Finder\WholeTextFinder;
+
+$haystack = "La casa è bella bella";
+$needle = "bella";
+
+$matches = WholeTextFinder::find($haystack, $needle, true, true, true);
+
+// $matches is equal to:
+// array (
+//   0 =>
+//   array (
+//     0 => 'bella',
+//     1 => 10,
+//   ),
+//   1 =>
+//   array (
+//     0 => 'bella',
+//     1 => 16,
+//   ),
+//)
+
+```
+
 ## Find and Replace
 
 There is also available a `findAndReplace` method:
@@ -74,14 +103,14 @@ $matches = WholeTextFinder::findAndReplace($haystack, $needle, $replacement);
 //       [0]=>
 //       string(6) "και"
 //       [1]=>
-//       int(122)
+//       int(66)
 //     }
 //     [1]=>
 //     array(2) {
 //       [0]=>
 //       string(6) "και"
 //       [1]=>
-//       int(213)
+//       int(123)
 //     }
 //   }
 // }
diff --git a/src/Helper/Replacer.php b/src/Helper/Replacer.php
index 7255087..165f206 100644
--- a/src/Helper/Replacer.php
+++ b/src/Helper/Replacer.php
@@ -14,9 +14,9 @@ class Replacer
      *
      * $expected = "Beauty -> test Anti-Akne Gesichtsreiniger Schlankmacher XXX";
      *
-     * @param $pattern
-     * @param $replacement
-     * @param $haystack
+     * @param string $pattern
+     * @param string $replacement
+     * @param string $haystack
      *
      * @return string|string[]
      */
@@ -38,7 +38,7 @@ public static function replace($pattern, $replacement, $haystack)
      *
      * /(\|\|\|\||<.*?>|%{.*?})(*SKIP)(*FAIL)|ciao/iu
      *
-     * @param $pattern
+     * @param string $pattern
      *
      * @return string
      */
diff --git a/src/Helper/Strings.php b/src/Helper/Strings.php
index e2024e8..6cf18af 100644
--- a/src/Helper/Strings.php
+++ b/src/Helper/Strings.php
@@ -50,4 +50,14 @@ public static function token($length = 8)
 
         return $key;
     }
+
+    /**
+     * @param string $string UTF-8 text
+     *
+     * @return bool true when at least one character takes more than one byte
+     */
+    public static function isMultibyte($string)
+    {
+        return strlen($string) !== mb_strlen($string, 'UTF-8');
+    }
 }
\ No newline at end of file
diff --git a/src/WholeTextFinder.php b/src/WholeTextFinder.php
index 034ad8a..64c9ab1 100644
--- a/src/WholeTextFinder.php
+++ b/src/WholeTextFinder.php
@@ -24,9 +24,55 @@ public static function find($haystack, $needle, $skipHtmlEntities = true, $exact
 
         preg_match_all($patternAndHaystack['pattern'], $patternAndHaystack['haystack'], $matches, PREG_OFFSET_CAPTURE);
 
+        self::mbCorrectMatchPositions($patternAndHaystack['haystack'], $matches);
+
         return $matches[0];
     }
 
+    /**
+     * Convert the byte offsets stored by PREG_OFFSET_CAPTURE into character offsets.
+     *
+     * preg_match_all() reports offsets in bytes, so they drift as soon as a
+     * multibyte character precedes a match.
+     *
+     * @param string $haystack
+     * @param array  $matches  preg_match_all() result, corrected in place
+     *
+     * @return void
+     */
+    private static function mbCorrectMatchPositions($haystack, &$matches)
+    {
+        if (empty($matches[0]) || !Strings::isMultibyte($haystack)) {
+            return;
+        }
+
+        foreach ($matches[0] as $index => $match) {
+            $position = $match[1];
+
+            $matches[0][$index][1] = self::mbFindTheCorrectPosition($haystack, $match[0], $position);
+        }
+    }
+
+    /**
+     * Translate a byte offset of $haystack into the equivalent character offset.
+     *
+     * The characters preceding the offset are counted directly. Stepping back until
+     * $word shows up again is not reliable: in "èèèèèè bella bella" the first match
+     * (byte 13, character 7) would stop on the second "bella" and report 13.
+     *
+     * @param string $haystack UTF-8 text that was searched
+     * @param string $word     matched text; unused, kept so the signature is unchanged
+     * @param int    $position byte offset on input, character offset on output
+     *
+     * @return int
+     */
+    private static function mbFindTheCorrectPosition($haystack, $word, &$position)
+    {
+        $position = mb_strlen(substr($haystack, 0, $position), 'UTF-8');
+
+        return $position;
+    }
+
     /**
      * @param string $haystack
      * @param string $needle
diff --git a/tests/StringsTest.php b/tests/StringsTest.php
index 73ab3d1..bee3b7b 100644
--- a/tests/StringsTest.php
+++ b/tests/StringsTest.php
@@ -17,4 +17,16 @@ public function html_entity_decode()
 
         $this->assertEquals(Strings::htmlEntityDecode($input), $output);
     }
+
+    /**
+     * @test
+     */
+    public function is_multibyte()
+    {
+        $string = "La casa e bella";
+        $string2 = "La casa è bella";
+
+        $this->assertFalse(Strings::isMultibyte($string));
+        $this->assertTrue(Strings::isMultibyte($string2));
+    }
 }
\ No newline at end of file
diff --git a/tests/WholeTextFinderReplacementTest.php b/tests/WholeTextFinderReplacementTest.php
index 5029028..d27afbf 100644
--- a/tests/WholeTextFinderReplacementTest.php
+++ b/tests/WholeTextFinderReplacementTest.php
@@ -19,8 +19,8 @@ public function find_and_replace_test_on_greek_text()
         $expected = [
             'replacement' => 'Δύο παράγοντες καθόρισαν την αντίληψή μου για την Τενεσί Ουίλιαμς test τη σκηνική παρουσίαση των κειμένων: η Maria Britneva test η Annette Saddik, αφετέρου.',
             'occurrencies' => [
-                [$needle, 122],
-                [$needle, 213],
+                [$needle, 66],
+                [$needle, 123],
             ],
         ];
         $matches = WholeTextFinder::findAndReplace($haystack, $needle, $replacement);
@@ -64,7 +64,7 @@ public function find_and_replace_must_skip_matecat_ph_tags()
     public function find_and_replace_must_skip_matecat_html_tags()
     {
         $haystack = "Beauty -> 0 Anti-Akne Gesichtsreiniger Schlankmacher XXX";
-        $needle = 0;
+        $needle = "0";
         $replacement = "test";
 
         $expected = "Beauty -> test Anti-Akne Gesichtsreiniger Schlankmacher XXX";
diff --git a/tests/WholeTextFinderTest.php b/tests/WholeTextFinderTest.php
index 29feb14..11a2f30 100644
--- a/tests/WholeTextFinderTest.php
+++ b/tests/WholeTextFinderTest.php
@@ -7,6 +7,36 @@
 
 class WholeTextFinderTest extends TestCase
 {
+    /**
+     * @test
+     */
+    public function can_detect_positions()
+    {
+        $haystack = "La casa è bella bella";
+        $needle = "bella";
+
+        $matches = WholeTextFinder::find($haystack, $needle, true, true, true);
+
+        $this->assertCount(2, $matches);
+        $this->assertEquals(10, $matches[0][1]);
+        $this->assertEquals(16, $matches[1][1]);
+    }
+
+    /**
+     * @test
+     */
+    public function can_detect_positions_when_the_needle_repeats_within_the_byte_drift()
+    {
+        $haystack = "èèèèèè bella bella";
+        $needle = "bella";
+
+        $matches = WholeTextFinder::find($haystack, $needle, true, true, true);
+
+        $this->assertCount(2, $matches);
+        $this->assertEquals(7, $matches[0][1]);
+        $this->assertEquals(13, $matches[1][1]);
+    }
+
     /**
      * @test
      */
@@ -138,17 +168,17 @@ public function find_should_return_correct_matches()
         $expected = [
             [
                 0 => 'ggio',
-                1 => 23
+                1 => 22
             ]
         ];
         $expected2 = [
             [
                 0 => 'ggio',
-                1 => 18
+                1 => 17
             ],
             [
                 0 => 'ggio',
-                1 => 23
+                1 => 22
             ]
         ];
 