From d8e326722d43440c9e2dde2e8a7cdac16a2b1c51 Mon Sep 17 00:00:00 2001 From: Patrick Ziegler Date: Thu, 12 Dec 2024 22:30:08 +0100 Subject: [PATCH] Move TextMatcher from Platform UI to Equinox Common The TextMatcher class is used in the UI component, despite not depending on any UI classes. By moving it to Equinox, it can be used anywhere in the Eclipse Platform. --- .../common/tests/text/StringMatcherTests.java | 2 +- .../common/tests/text/TextMatcherTest.java | 108 ++++++++++++ .../META-INF/MANIFEST.MF | 2 +- .../org/eclipse/core/text/StringMatcher.java | 158 +++++++++++++++++- 4 files changed, 261 insertions(+), 9 deletions(-) create mode 100644 bundles/org.eclipse.equinox.common.tests/src/org/eclipse/equinox/common/tests/text/TextMatcherTest.java diff --git a/bundles/org.eclipse.equinox.common.tests/src/org/eclipse/equinox/common/tests/text/StringMatcherTests.java b/bundles/org.eclipse.equinox.common.tests/src/org/eclipse/equinox/common/tests/text/StringMatcherTests.java index 503936a998d..30986ffb641 100644 --- a/bundles/org.eclipse.equinox.common.tests/src/org/eclipse/equinox/common/tests/text/StringMatcherTests.java +++ b/bundles/org.eclipse.equinox.common.tests/src/org/eclipse/equinox/common/tests/text/StringMatcherTests.java @@ -16,7 +16,7 @@ @RunWith(Suite.class) @SuiteClasses({ StringMatcherFindTest.class, StringMatcherPlainTest.class, StringMatcherWildcardTest.class, - StringMatcherPrefixTest.class, StringMatcherOtherTest.class }) + StringMatcherPrefixTest.class, StringMatcherOtherTest.class, TextMatcherTest.class }) public class StringMatcherTests { // empty } diff --git a/bundles/org.eclipse.equinox.common.tests/src/org/eclipse/equinox/common/tests/text/TextMatcherTest.java b/bundles/org.eclipse.equinox.common.tests/src/org/eclipse/equinox/common/tests/text/TextMatcherTest.java new file mode 100644 index 00000000000..65c278536fd --- /dev/null +++ b/bundles/org.eclipse.equinox.common.tests/src/org/eclipse/equinox/common/tests/text/TextMatcherTest.java 
@@ -0,0 +1,108 @@
+/*******************************************************************************
+ * Copyright (c) 2020, 2024 Thomas Wolf and others.
+ *
+ * This program and the accompanying materials
+ * are made available under the terms of the Eclipse Public License 2.0
+ * which accompanies this distribution, and is available at
+ * https://www.eclipse.org/legal/epl-2.0/
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *******************************************************************************/
+package org.eclipse.equinox.common.tests.text;
+
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+import org.eclipse.core.text.StringMatcher;
+import org.junit.Test;
+
+/**
+ * Tests for the word-matching mode ({@code ignoreWords == false}) of {@link StringMatcher}.
+ */
+public class TextMatcherTest {
+
+	@Test
+	public void testEmpty() {
+		assertTrue(new StringMatcher("", false, false, false).match(""));
+		assertFalse(new StringMatcher("", false, false, false).match("foo"));
+		assertFalse(new StringMatcher("", false, false, false).match("foo bar baz"));
+		assertTrue(new StringMatcher("", false, true, false).match(""));
+		assertFalse(new StringMatcher("", false, true, false).match("foo"));
+		assertFalse(new StringMatcher("", false, true, false).match("foo bar baz"));
+	}
+
+	@Test
+	public void testSuffixes() {
+		assertFalse(new StringMatcher("fo*ar", false, false, false).match("foobar_123"));
+		assertFalse(new StringMatcher("fo*ar", false, false, false).match("foobar_baz"));
+	}
+
+	@Test
+	public void testChinese() {
+		assertTrue(new StringMatcher("喜欢", false, false, false).match("我 喜欢 吃 苹果。"));
+		// This test would work only if word-splitting used the ICU BreakIterator.
+		// "Words" are as shown above.
+		// assertTrue(new StringMatcher("喜欢", false, false, false).match("我喜欢吃苹果。"));
+	}
+
+	@Test
+	public void testSingleWords() {
+		assertTrue(new StringMatcher("huhn", false, false, false).match("hahn henne hühner küken huhn"));
+		assertTrue(new StringMatcher("h?hner", false, false, false).match("hahn henne hühner küken huhn"));
+		assertTrue(new StringMatcher("h*hner", false, false, false).match("hahn henne hühner küken huhn"));
+		assertTrue(new StringMatcher("hühner", false, false, false).match("hahn henne hühner küken huhn"));
+		// Full pattern must match word fully
+		assertFalse(new StringMatcher("h?hner", false, false, false).match("hahn henne hühnerhof küken huhn"));
+		assertFalse(new StringMatcher("h*hner", false, false, false).match("hahn henne hühnerhof küken huhn"));
+		assertFalse(new StringMatcher("hühner", false, false, false).match("hahn henne hühnerhof küken huhn"));
+
+		assertTrue(new StringMatcher("huhn", false, true, false).match("hahn henne hühner küken huhn"));
+		assertFalse(new StringMatcher("h?hner", false, true, false).match("hahn henne hühner küken huhn"));
+		assertFalse(new StringMatcher("h*hner", false, true, false).match("hahn henne hühner küken huhn"));
+		assertTrue(new StringMatcher("hühner", false, true, false).match("hahn henne hühner küken huhn"));
+		// Full pattern must match word fully
+		assertFalse(new StringMatcher("h?hner", false, true, false).match("hahn henne hühnerhof küken huhn"));
+		assertFalse(new StringMatcher("h*hner", false, true, false).match("hahn henne hühnerhof küken huhn"));
+		assertFalse(new StringMatcher("hühner", false, true, false).match("hahn henne hühnerhof küken huhn"));
+
+		// Bug 570390: Pattern starting/ending with whitespace should still match
+		assertTrue(new StringMatcher("hahn ", false, false, false).match("hahn henne hühnerhof küken huhn"));
+		assertTrue(new StringMatcher("huhn ", false, false, false).match("hahn henne hühnerhof küken huhn"));
+		assertTrue(new StringMatcher(" hahn", false, false, false).match("hahn henne hühnerhof küken huhn"));
+		assertTrue(new StringMatcher(" huhn", false, false, false).match("hahn henne hühnerhof küken huhn"));
+	}
+
+	@Test
+	public void testMultipleWords() {
+		assertTrue(new StringMatcher("huhn h?hner", false, false, false).match("hahn henne hühner küken huhn"));
+		assertTrue(new StringMatcher("huhn h?hner", false, false, false).match("hahn henne hühnerhof küken huhn"));
+		assertFalse(new StringMatcher("huhn h?hner", false, true, false).match("hahn henne hühner küken huhn"));
+		assertFalse(new StringMatcher("huhn h?hner", false, true, false).match("hahn henne hühnerhof küken huhn"));
+		assertTrue(new StringMatcher("huhn h*hner", false, false, false).match("hahn henne hühner küken huhn"));
+		assertTrue(new StringMatcher("huhn h*hner", false, false, false).match("hahn henne hühnerhof küken huhn"));
+		assertFalse(new StringMatcher("huhn h*hner", false, true, false).match("hahn henne hühner küken huhn"));
+		assertFalse(new StringMatcher("huhn h*hner", false, true, false).match("hahn henne hühnerhof küken huhn"));
+		assertTrue(new StringMatcher("huhn hühner", false, false, false).match("hahn henne hühner küken huhn"));
+		assertTrue(new StringMatcher("huhn hühner", false, false, false).match("hahn henne hühnerhof küken huhn"));
+		assertTrue(new StringMatcher("huhn hühner", false, true, false).match("hahn henne hühner küken huhn"));
+		assertTrue(new StringMatcher("huhn hühner", false, true, false).match("hahn henne hühnerhof küken huhn"));
+
+		// Bug 570390: Pattern starting/ending with whitespace should still match
+		assertTrue(new StringMatcher("huhn hahn ", false, false, false).match("hahn henne hühnerhof küken huhn"));
+		assertTrue(new StringMatcher("hahn huhn ", false, false, false).match("hahn henne hühnerhof küken huhn"));
+		assertTrue(new StringMatcher(" huhn hahn", false, false, false).match("hahn henne hühnerhof küken huhn"));
+		assertTrue(new StringMatcher(" hahn huhn", false, false, false).match("hahn henne hühnerhof küken huhn"));
+	}
+
+	@Test
+	public void testCaseInsensitivity() {
+		assertTrue(new StringMatcher("Huhn HÜHNER", true, false, false).match("hahn henne hühner küken huhn"));
+		assertTrue(new StringMatcher("Huhn HÜHNER", true, false, false).match("hahn henne hühnerhof küken huhn"));
+		assertTrue(new StringMatcher("Huhn HÜHNER", true, true, false).match("hahn henne hühner küken huhn"));
+		assertTrue(new StringMatcher("Huhn HÜHNER", true, true, false).match("hahn henne hühnerhof küken huhn"));
+		assertTrue(new StringMatcher("HüHnEr", true, false, false).match("hahn henne hühner küken huhn"));
+		assertFalse(new StringMatcher("HüHnEr", true, false, false).match("hahn henne hühnerhof küken huhn"));
+		assertTrue(new StringMatcher("HüHnEr", true, true, false).match("hahn henne hühner küken huhn"));
+		assertFalse(new StringMatcher("HüHnEr", true, true, false).match("hahn henne hühnerhof küken huhn"));
+	}
+}
diff --git a/bundles/org.eclipse.equinox.common/META-INF/MANIFEST.MF b/bundles/org.eclipse.equinox.common/META-INF/MANIFEST.MF
index 47c4a0c9639..77de180a278 100644
--- a/bundles/org.eclipse.equinox.common/META-INF/MANIFEST.MF
+++ b/bundles/org.eclipse.equinox.common/META-INF/MANIFEST.MF
@@ -2,7 +2,7 @@ Manifest-Version: 1.0
 Bundle-ManifestVersion: 2
 Bundle-Name: %pluginName
 Bundle-SymbolicName: org.eclipse.equinox.common; singleton:=true
-Bundle-Version: 3.19.200.qualifier
+Bundle-Version: 3.20.0.qualifier
 Bundle-Localization: plugin
 Export-Package: org.eclipse.core.internal.boot;x-friends:="org.eclipse.core.resources,org.eclipse.pde.build",
  org.eclipse.core.internal.runtime;common=split;mandatory:=common;
diff --git a/bundles/org.eclipse.equinox.common/src/org/eclipse/core/text/StringMatcher.java b/bundles/org.eclipse.equinox.common/src/org/eclipse/core/text/StringMatcher.java
index d4a32298660..a67785662bd 100644
--- a/bundles/org.eclipse.equinox.common/src/org/eclipse/core/text/StringMatcher.java
+++
b/bundles/org.eclipse.equinox.common/src/org/eclipse/core/text/StringMatcher.java @@ -1,5 +1,5 @@ /******************************************************************************* - * Copyright (c) 2000, 2020 IBM Corporation and others. + * Copyright (c) 2000, 2024 IBM Corporation and others. * * This program and the accompanying materials * are made available under the terms of the Eclipse Public License 2.0 @@ -10,11 +10,12 @@ * * Contributors: * IBM Corporation - initial API and implementation + * Thomas Wolf, Patrick Ziegler - support for matching individual words *******************************************************************************/ package org.eclipse.core.text; -import java.util.ArrayList; -import java.util.List; +import java.util.*; +import java.util.regex.Pattern; /** * A string pattern matcher. Supports '*' and '?' wildcards. @@ -23,6 +24,10 @@ */ public final class StringMatcher { + private static final Pattern NON_WORD = Pattern.compile("\\W+", Pattern.UNICODE_CHARACTER_CLASS); //$NON-NLS-1$ + + private static final StringMatcher[] NO_MATCHERS = new StringMatcher[0]; + private final String fPattern; private final int fLength; // pattern length @@ -31,10 +36,14 @@ public final class StringMatcher { private boolean fIgnoreWildCards; + private final boolean fIgnoreWords; + private boolean fHasLeadingStar; private boolean fHasTrailingStar; + private final StringMatcher fParts[]; // the given pattern is split into space-separated sub-patterns + private String fSegments[]; // the given pattern is split into * separated segments /* Minimum length required for a match: shorter texts cannot possibly match. */ @@ -112,6 +121,25 @@ public String toString() { } } + /** + * Splits a given text into words. + * + * @param text to split + * @return the words of the text + * @since 3.20 + */ + public static String[] getWords(String text) { + // Previous implementations (in the removed StringMatcher) used the ICU + // BreakIterator to split the text. 
That worked well, but in 2020 it was decided + // to drop the dependency to the ICU library due to its size. The JDK + // BreakIterator splits differently, causing e.g. + // https://bugs.eclipse.org/bugs/show_bug.cgi?id=563121 . The NON_WORD regexp + // appears to work well for programming language text, but may give sub-optimal + // results for natural languages. See also + // https://bugs.eclipse.org/bugs/show_bug.cgi?id=90579 . + return NON_WORD.split(text); + } + /** * StringMatcher constructor takes in a String object that is a simple pattern. * The pattern may contain '*' for 0 and many characters and '?' for exactly one @@ -141,14 +169,75 @@ public String toString() { * @throws IllegalArgumentException if {@code pattern == null} */ public StringMatcher(String pattern, boolean ignoreCase, boolean ignoreWildCards) { + this(pattern, ignoreCase, ignoreWildCards, true); + } + + /** + * StringMatcher constructor takes in a String object that is a simple pattern. + * The pattern may contain '*' for 0 and many characters and '?' for exactly one + * character. + *

<p>
+ * Literal '*' and '?' characters must be escaped in the pattern e.g., "\*" + * means literal "*", etc. + *
</p>
+ *
<p>
+ * The escape character '\' is an escape only if followed by '*', '?', or '\'. + * All other occurrences are taken literally. + *
</p>
+ *
<p>
+ * If invoking the StringMatcher with string literals in Java, don't forget + * escape characters are represented by "\\". + *
</p>
+ *
<p>
+ * If {@code ignoreWords} is false, this {@code StringMatcher} matches a pattern + * that may contain the wildcards '?' or '*' against a text. However, the + * matching is not only done on the full text, but also on individual words from + * the text, and if the pattern contains whitespace, the pattern is split into + * sub-patterns and those are matched, too. + *
</p>
+ *
<p>
+ * The precise rules are: + *
</p>
+ * <ul> + * <li>If the full pattern matches the full text, the match succeeds.</li> + * <li>If the full pattern matches a single word of the text, the match + * succeeds.</li> + * <li>If the pattern contains whitespace and every sub-pattern matches a + * prefix of the whole text or a prefix of any word, the match + * succeeds.</li> + * <li>Otherwise, the match fails.</li> + * </ul> + *
<p>
+ * An empty pattern matches only an empty text, unless {@link #usePrefixMatch()} + * is used, in which case it always matches. + *
</p>

+ * + * @param pattern the pattern to match text against, must not be + * {@code null} + * @param ignoreCase if true, case is ignored + * @param ignoreWildCards if true, wild cards and their escape sequences are + * ignored (everything is taken literally). + * @param ignoreWords if true, only matches against the whole text but not + * individual words + * @throws IllegalArgumentException if {@code pattern == null} + * @since 3.20 + */ + public StringMatcher(String pattern, boolean ignoreCase, boolean ignoreWildCards, boolean ignoreWords) { if (pattern == null) { throw new IllegalArgumentException(); } fIgnoreCase = ignoreCase; fIgnoreWildCards = ignoreWildCards; - fPattern = pattern; - fLength = pattern.length(); - + fIgnoreWords = ignoreWords; + if (fIgnoreWords) { + fPattern = pattern; + fLength = fPattern.length(); + fParts = NO_MATCHERS; + } else { + fPattern = pattern.trim(); + fLength = fPattern.length(); + fParts = splitPattern(); + } if (fIgnoreWildCards) { parseNoWildCards(); } else { @@ -256,7 +345,27 @@ public boolean match(String text) { if (text == null) { throw new IllegalArgumentException(); } - return match(text, 0, text.length()); + // match the whole text + if (match(text, 0, text.length())) { + return true; + } + // match individual words + if (!fIgnoreWords) { + String[] words = StringMatcher.getWords(text); + if (match(this, words)) { + return true; + } + if (fParts.length == 0) { + return false; + } + for (StringMatcher subMatcher : fParts) { + if (!subMatcher.match(text) && !match(subMatcher, words)) { + return false; + } + } + return true; + } + return false; } /** @@ -353,6 +462,41 @@ public boolean match(String text, int start, int end) { return i == segCount; } + /** + * Determines whether the given {@code matcher} matches at least one of the + * given {@code words}. 
+	 *
+	 * @param matcher either this or a sub-matcher; must not be {@code null}
+	 * @param words words to match; must not be {@code null} and not contain
+	 * {@code null} words.
+	 * @return {@code true} if at least one word is matched by the pattern;
+	 * {@code false} otherwise
+	 */
+	private static boolean match(StringMatcher matcher, String[] words) {
+		return Arrays.stream(words).anyMatch(word -> matcher.match(word, 0, word.length()));
+	}
+
+	private StringMatcher[] splitPattern() {
+		String pattern = fPattern.trim();
+		if (pattern.isEmpty()) {
+			return NO_MATCHERS;
+		}
+		String[] subPatterns = pattern.split("\\s+"); //$NON-NLS-1$
+		if (subPatterns.length <= 1) {
+			return NO_MATCHERS; // a single sub-pattern is the full pattern itself
+		}
+		List<StringMatcher> matchers = new ArrayList<>();
+		for (String s : subPatterns) {
+			if (s == null || s.isEmpty()) {
+				continue;
+			}
+			StringMatcher m = new StringMatcher(s, fIgnoreCase, fIgnoreWildCards);
+			m.usePrefixMatch(); // a sub-pattern only has to match the start of the text or of a word
+			matchers.add(m);
+		}
+		return matchers.toArray(StringMatcher[]::new);
+	}
+
 	/**
 	 * Returns the single segment for a matcher ignoring wildcards.
 	 */