From 55526ec46575f0e0245887dfd338b97dfb7451c7 Mon Sep 17 00:00:00 2001 From: Russ Cam Date: Tue, 22 Aug 2023 21:57:12 +1000 Subject: [PATCH 1/7] Expose DelimitedTermFrequencyTokenFilter Relates: #9413 This commit exposes Lucene's delimited term frequency token filter to be able to provide term frequencies along with terms. Signed-off-by: Russ Cam --- .../common/CommonAnalysisModulePlugin.java | 12 +++++ ...imitedTermFrequencyTokenFilterFactory.java | 46 +++++++++++++++++++ .../common/CommonAnalysisFactoryTests.java | 1 + .../test/analysis-common/40_token_filters.yml | 37 +++++++++++++++ .../analysis/AnalysisFactoryTestCase.java | 4 +- 5 files changed, 97 insertions(+), 3 deletions(-) create mode 100644 modules/analysis-common/src/main/java/org/opensearch/analysis/common/DelimitedTermFrequencyTokenFilterFactory.java diff --git a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/CommonAnalysisModulePlugin.java b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/CommonAnalysisModulePlugin.java index 46220f5369d16..8e68874a2c049 100644 --- a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/CommonAnalysisModulePlugin.java +++ b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/CommonAnalysisModulePlugin.java @@ -89,6 +89,7 @@ import org.apache.lucene.analysis.lt.LithuanianAnalyzer; import org.apache.lucene.analysis.lv.LatvianAnalyzer; import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter; +import org.apache.lucene.analysis.miscellaneous.DelimitedTermFrequencyTokenFilter; import org.apache.lucene.analysis.miscellaneous.DisableGraphAttribute; import org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilter; import org.apache.lucene.analysis.miscellaneous.LengthFilter; @@ -265,6 +266,7 @@ public Map> getTokenFilters() { ); filters.put("decimal_digit", DecimalDigitFilterFactory::new); filters.put("delimited_payload", DelimitedPayloadTokenFilterFactory::new); + 
filters.put("delimited_termfreq", DelimitedTermFrequencyTokenFilterFactory::new); filters.put("dictionary_decompounder", requiresAnalysisSettings(DictionaryCompoundWordTokenFilterFactory::new)); filters.put("dutch_stem", DutchStemTokenFilterFactory::new); filters.put("edge_ngram", EdgeNGramTokenFilterFactory::new); @@ -500,6 +502,16 @@ public List getPreConfiguredTokenFilters() { ) ) ); + filters.add( + PreConfiguredTokenFilter.singleton( + "delimited_termfreq", + false, + input -> new DelimitedTermFrequencyTokenFilter( + input, + DelimitedTermFrequencyTokenFilterFactory.DEFAULT_DELIMITER + ) + ) + ); filters.add(PreConfiguredTokenFilter.singleton("dutch_stem", false, input -> new SnowballFilter(input, new DutchStemmer()))); filters.add(PreConfiguredTokenFilter.singleton("edge_ngram", false, false, input -> new EdgeNGramTokenFilter(input, 1))); filters.add(PreConfiguredTokenFilter.openSearchVersion("edgeNGram", false, false, (reader, version) -> { diff --git a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/DelimitedTermFrequencyTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/DelimitedTermFrequencyTokenFilterFactory.java new file mode 100644 index 0000000000000..a35515c9694d3 --- /dev/null +++ b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/DelimitedTermFrequencyTokenFilterFactory.java @@ -0,0 +1,46 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. 
+ */ + +package org.opensearch.analysis.common; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.miscellaneous.DelimitedTermFrequencyTokenFilter; +import org.opensearch.common.settings.Settings; +import org.opensearch.env.Environment; +import org.opensearch.index.IndexSettings; +import org.opensearch.index.analysis.AbstractTokenFilterFactory; + +public class DelimitedTermFrequencyTokenFilterFactory extends AbstractTokenFilterFactory { + public static final char DEFAULT_DELIMITER = '|'; + private static final String DELIMITER = "delimiter"; + private final char delimiter; + + DelimitedTermFrequencyTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { + super(indexSettings, name, settings); + delimiter = parseDelimiter(settings); + } + + @Override + public TokenStream create(TokenStream tokenStream) { + return new DelimitedTermFrequencyTokenFilter(tokenStream, delimiter); + } + + private static char parseDelimiter(Settings settings) throws IllegalArgumentException { + String delimiter = settings.get(DELIMITER); + if (delimiter == null) { + return DEFAULT_DELIMITER; + } else if (delimiter.length() == 1) { + return delimiter.charAt(0); + } + + throw new IllegalArgumentException( + "Setting [" + DELIMITER + "] must be a single, non-null character. [" + delimiter + "] was provided." 
+ ); + } +} + diff --git a/modules/analysis-common/src/test/java/org/opensearch/analysis/common/CommonAnalysisFactoryTests.java b/modules/analysis-common/src/test/java/org/opensearch/analysis/common/CommonAnalysisFactoryTests.java index 1c4db089565ff..b9c47b9a24acc 100644 --- a/modules/analysis-common/src/test/java/org/opensearch/analysis/common/CommonAnalysisFactoryTests.java +++ b/modules/analysis-common/src/test/java/org/opensearch/analysis/common/CommonAnalysisFactoryTests.java @@ -145,6 +145,7 @@ protected Map> getTokenFilters() { filters.put("cjkwidth", CJKWidthFilterFactory.class); filters.put("cjkbigram", CJKBigramFilterFactory.class); filters.put("delimitedpayload", DelimitedPayloadTokenFilterFactory.class); + filters.put("delimitedtermfreq", DelimitedTermFrequencyTokenFilterFactory.class); filters.put("keepword", KeepWordFilterFactory.class); filters.put("type", KeepTypesFilterFactory.class); filters.put("classic", ClassicFilterFactory.class); diff --git a/modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/40_token_filters.yml b/modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/40_token_filters.yml index 40c82ff185661..836e9b550f52a 100644 --- a/modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/40_token_filters.yml +++ b/modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/40_token_filters.yml @@ -1198,6 +1198,43 @@ - match: { tokens.0.token: foo } --- +"delimited_termfreq": + - do: + indices.create: + index: test + body: + settings: + analysis: + filter: + my_delimited_termfreq: + type: delimited_termfreq + delimiter: ^ + - do: + indices.analyze: + index: test + body: + text: foo^3 + tokenizer: keyword + filter: [my_delimited_termfreq] + attributes: termFrequency + explain: true + - length: { detail.tokenfilters: 1 } + - match: { detail.tokenfilters.0.tokens.0.token: foo } + - match: { 
detail.tokenfilters.0.tokens.0.termFrequency: 3 } + + # Test pre-configured token filter too: + - do: + indices.analyze: + body: + text: foo|100 + tokenizer: keyword + filter: [delimited_termfreq] + attributes: termFrequency + explain: true + - length: { detail.tokenfilters: 1 } + - match: { detail.tokenfilters.0.tokens.0.token: foo } + - match: { detail.tokenfilters.0.tokens.0.termFrequency: 100 } +--- "keep_filter": - do: indices.create: diff --git a/test/framework/src/main/java/org/opensearch/indices/analysis/AnalysisFactoryTestCase.java b/test/framework/src/main/java/org/opensearch/indices/analysis/AnalysisFactoryTestCase.java index b93cb64e32cfe..c412ae8317f24 100644 --- a/test/framework/src/main/java/org/opensearch/indices/analysis/AnalysisFactoryTestCase.java +++ b/test/framework/src/main/java/org/opensearch/indices/analysis/AnalysisFactoryTestCase.java @@ -98,6 +98,7 @@ public abstract class AnalysisFactoryTestCase extends OpenSearchTestCase { .put("czechstem", MovedToAnalysisCommon.class) .put("decimaldigit", MovedToAnalysisCommon.class) .put("delimitedpayload", MovedToAnalysisCommon.class) + .put("delimitedtermfrequency", MovedToAnalysisCommon.class) .put("dictionarycompoundword", MovedToAnalysisCommon.class) .put("edgengram", MovedToAnalysisCommon.class) .put("elision", MovedToAnalysisCommon.class) @@ -201,9 +202,6 @@ public abstract class AnalysisFactoryTestCase extends OpenSearchTestCase { .put("daterecognizer", Void.class) // for token filters that generate bad offsets, which are now rejected since Lucene 7 .put("fixbrokenoffsets", Void.class) - // should we expose it, or maybe think about higher level integration of the - // fake term frequency feature (LUCENE-7854) - .put("delimitedtermfrequency", Void.class) // LUCENE-8273: ProtectedTermFilterFactory allows analysis chains to skip // particular token filters based on the attributes of the current token. 
.put("protectedterm", Void.class) From b529953ea44f18590f19349c95001871182260bb Mon Sep 17 00:00:00 2001 From: Russ Cam Date: Tue, 22 Aug 2023 22:09:53 +1000 Subject: [PATCH 2/7] fix format violations Signed-off-by: Russ Cam --- .../analysis/common/CommonAnalysisModulePlugin.java | 5 +---- .../common/DelimitedTermFrequencyTokenFilterFactory.java | 1 - 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/CommonAnalysisModulePlugin.java b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/CommonAnalysisModulePlugin.java index 8e68874a2c049..230307012e7ad 100644 --- a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/CommonAnalysisModulePlugin.java +++ b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/CommonAnalysisModulePlugin.java @@ -506,10 +506,7 @@ public List getPreConfiguredTokenFilters() { PreConfiguredTokenFilter.singleton( "delimited_termfreq", false, - input -> new DelimitedTermFrequencyTokenFilter( - input, - DelimitedTermFrequencyTokenFilterFactory.DEFAULT_DELIMITER - ) + input -> new DelimitedTermFrequencyTokenFilter(input, DelimitedTermFrequencyTokenFilterFactory.DEFAULT_DELIMITER) ) ); filters.add(PreConfiguredTokenFilter.singleton("dutch_stem", false, input -> new SnowballFilter(input, new DutchStemmer()))); diff --git a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/DelimitedTermFrequencyTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/DelimitedTermFrequencyTokenFilterFactory.java index a35515c9694d3..81f52a3badacd 100644 --- a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/DelimitedTermFrequencyTokenFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/DelimitedTermFrequencyTokenFilterFactory.java @@ -43,4 +43,3 @@ private static char parseDelimiter(Settings settings) throws 
IllegalArgumentExce ); } } - From 3ec15f7886b7583726583fd930e821b5905b7ae4 Mon Sep 17 00:00:00 2001 From: Russ Cam Date: Wed, 23 Aug 2023 08:54:03 +1000 Subject: [PATCH 3/7] fix test and add to changelog Signed-off-by: Russ Cam --- CHANGELOG.md | 3 ++- .../opensearch/analysis/common/CommonAnalysisFactoryTests.java | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 06b47e7453b3c..d16669f0d4430 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -87,6 +87,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - Make SearchTemplateRequest implement IndicesRequest.Replaceable ([#9122]()https://github.com/opensearch-project/OpenSearch/pull/9122) - [BWC and API enforcement] Define the initial set of annotations, their meaning and relations between them ([#9223](https://github.com/opensearch-project/OpenSearch/pull/9223)) - [Segment Replication] Support realtime reads for GET requests ([#9212](https://github.com/opensearch-project/OpenSearch/pull/9212)) +- Expose DelimitedTermFrequencyTokenFilter to allow providing term frequencies along with terms ([#9479](https://github.com/opensearch-project/OpenSearch/pull/9479)) ### Dependencies - Bump `org.apache.logging.log4j:log4j-core` from 2.17.1 to 2.20.0 ([#8307](https://github.com/opensearch-project/OpenSearch/pull/8307)) @@ -162,4 +163,4 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), ### Security [Unreleased 3.0]: https://github.com/opensearch-project/OpenSearch/compare/2.x...HEAD -[Unreleased 2.x]: https://github.com/opensearch-project/OpenSearch/compare/2.10...2.x \ No newline at end of file +[Unreleased 2.x]: https://github.com/opensearch-project/OpenSearch/compare/2.10...2.x diff --git a/modules/analysis-common/src/test/java/org/opensearch/analysis/common/CommonAnalysisFactoryTests.java b/modules/analysis-common/src/test/java/org/opensearch/analysis/common/CommonAnalysisFactoryTests.java index 
b9c47b9a24acc..d2e45257e81d5 100644 --- a/modules/analysis-common/src/test/java/org/opensearch/analysis/common/CommonAnalysisFactoryTests.java +++ b/modules/analysis-common/src/test/java/org/opensearch/analysis/common/CommonAnalysisFactoryTests.java @@ -145,7 +145,7 @@ protected Map> getTokenFilters() { filters.put("cjkwidth", CJKWidthFilterFactory.class); filters.put("cjkbigram", CJKBigramFilterFactory.class); filters.put("delimitedpayload", DelimitedPayloadTokenFilterFactory.class); - filters.put("delimitedtermfreq", DelimitedTermFrequencyTokenFilterFactory.class); + filters.put("delimitedtermfrequency", DelimitedTermFrequencyTokenFilterFactory.class); filters.put("keepword", KeepWordFilterFactory.class); filters.put("type", KeepTypesFilterFactory.class); filters.put("classic", ClassicFilterFactory.class); @@ -203,6 +203,7 @@ protected Map> getPreConfiguredTokenFilters() { filters.put("decimal_digit", null); filters.put("delimited_payload_filter", org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilterFactory.class); filters.put("delimited_payload", org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilterFactory.class); + filters.put("delimited_termfreq", org.apache.lucene.analysis.miscellaneous.DelimitedTermFrequencyTokenFilterFactory.class); filters.put("dutch_stem", SnowballPorterFilterFactory.class); filters.put("edge_ngram", null); filters.put("edgeNGram", null); From 000eeac7a8665e1d1c67f215d88da94d30a8eb45 Mon Sep 17 00:00:00 2001 From: Russ Cam Date: Mon, 28 Aug 2023 09:08:39 +1000 Subject: [PATCH 4/7] Address PR feedback - Add unit tests for DelimitedTermFrequencyTokenFilterFactory - Remove the redundant `throws IllegalArgumentException` clause from parseDelimiter - Skip the YAML REST test on versions before 2.10.0 Signed-off-by: Russ Cam --- ...imitedTermFrequencyTokenFilterFactory.java | 2 +- ...dTermFrequencyTokenFilterFactoryTests.java | 85 +++++++++++++++++++ .../test/analysis-common/40_token_filters.yml | 3 + 3 files changed, 89 insertions(+), 1 deletion(-)
create mode 100644 modules/analysis-common/src/test/java/org/opensearch/analysis/common/DelimitedTermFrequencyTokenFilterFactoryTests.java diff --git a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/DelimitedTermFrequencyTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/DelimitedTermFrequencyTokenFilterFactory.java index 81f52a3badacd..8929a7c54ef4c 100644 --- a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/DelimitedTermFrequencyTokenFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/DelimitedTermFrequencyTokenFilterFactory.java @@ -30,7 +30,7 @@ public TokenStream create(TokenStream tokenStream) { return new DelimitedTermFrequencyTokenFilter(tokenStream, delimiter); } - private static char parseDelimiter(Settings settings) throws IllegalArgumentException { + private static char parseDelimiter(Settings settings) { String delimiter = settings.get(DELIMITER); if (delimiter == null) { return DEFAULT_DELIMITER; diff --git a/modules/analysis-common/src/test/java/org/opensearch/analysis/common/DelimitedTermFrequencyTokenFilterFactoryTests.java b/modules/analysis-common/src/test/java/org/opensearch/analysis/common/DelimitedTermFrequencyTokenFilterFactoryTests.java new file mode 100644 index 0000000000000..b1470482f107b --- /dev/null +++ b/modules/analysis-common/src/test/java/org/opensearch/analysis/common/DelimitedTermFrequencyTokenFilterFactoryTests.java @@ -0,0 +1,85 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. 
+ */ + +package org.opensearch.analysis.common; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute; +import org.opensearch.common.settings.Settings; +import org.opensearch.env.Environment; +import org.opensearch.index.analysis.AnalysisTestsHelper; +import org.opensearch.index.analysis.TokenFilterFactory; +import org.opensearch.test.OpenSearchTestCase; +import org.opensearch.test.OpenSearchTokenStreamTestCase; + +import java.io.StringReader; + +public class DelimitedTermFrequencyTokenFilterFactoryTests extends OpenSearchTokenStreamTestCase { + + public void testDefault() throws Exception { + OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings( + Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .put("index.analysis.filter.my_delimited_termfreq.type", "delimited_termfreq") + .build(), + new CommonAnalysisModulePlugin() + ); + doTest(analysis, "cat|4 dog|5"); + } + + public void testDelimiter() throws Exception { + OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings( + Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .put("index.analysis.filter.my_delimited_termfreq.type", "delimited_termfreq") + .put("index.analysis.filter.my_delimited_termfreq.delimiter", ":") + .build(), + new CommonAnalysisModulePlugin() + ); + doTest(analysis, "cat:4 dog:5"); + } + + public void testDelimiterLongerThanOneCharThrows() { + IllegalArgumentException ex = expectThrows(IllegalArgumentException.class, () -> AnalysisTestsHelper.createTestAnalysisFromSettings( + Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + 
.put("index.analysis.filter.my_delimited_termfreq.type", "delimited_termfreq") + .put("index.analysis.filter.my_delimited_termfreq.delimiter", "^^") + .build(), + new CommonAnalysisModulePlugin() + )); + + assertEquals("Setting [delimiter] must be a single, non-null character. [^^] was provided.", ex.getMessage()); + } + + private void doTest(OpenSearchTestCase.TestAnalysis analysis, String source) throws Exception { + TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_delimited_termfreq"); + Tokenizer tokenizer = new WhitespaceTokenizer(); + tokenizer.setReader(new StringReader(source)); + + TokenStream stream = tokenFilter.create(tokenizer); + + CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class); + TermFrequencyAttribute tfAtt = stream.getAttribute(TermFrequencyAttribute.class); + stream.reset(); + assertTermEquals("cat", stream, termAtt, tfAtt, 4); + assertTermEquals("dog", stream, termAtt, tfAtt, 5); + assertFalse(stream.incrementToken()); + stream.end(); + stream.close(); + } + + void assertTermEquals(String expected, TokenStream stream, CharTermAttribute termAtt, TermFrequencyAttribute tfAtt, int expectedTf) throws Exception { + assertTrue(stream.incrementToken()); + assertEquals(expected, termAtt.toString()); + assertEquals(expectedTf, tfAtt.getTermFrequency()); + } +} diff --git a/modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/40_token_filters.yml b/modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/40_token_filters.yml index 836e9b550f52a..526b739feeb66 100644 --- a/modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/40_token_filters.yml +++ b/modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/40_token_filters.yml @@ -1199,6 +1199,9 @@ --- "delimited_termfreq": + - skip: + version: " - 2.9.99" + reason: "delimited_termfreq token filter was added in v2.10.0" - do: indices.create: 
index: test From a24903fb543950652e988ea76e302c2bc9ee3276 Mon Sep 17 00:00:00 2001 From: Russ Cam Date: Mon, 28 Aug 2023 09:28:14 +1000 Subject: [PATCH 5/7] formatting Signed-off-by: Russ Cam --- ...dTermFrequencyTokenFilterFactoryTests.java | 22 +++++++++++-------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/modules/analysis-common/src/test/java/org/opensearch/analysis/common/DelimitedTermFrequencyTokenFilterFactoryTests.java b/modules/analysis-common/src/test/java/org/opensearch/analysis/common/DelimitedTermFrequencyTokenFilterFactoryTests.java index b1470482f107b..f66ed422ae89c 100644 --- a/modules/analysis-common/src/test/java/org/opensearch/analysis/common/DelimitedTermFrequencyTokenFilterFactoryTests.java +++ b/modules/analysis-common/src/test/java/org/opensearch/analysis/common/DelimitedTermFrequencyTokenFilterFactoryTests.java @@ -48,14 +48,17 @@ public void testDelimiter() throws Exception { } public void testDelimiterLongerThanOneCharThrows() { - IllegalArgumentException ex = expectThrows(IllegalArgumentException.class, () -> AnalysisTestsHelper.createTestAnalysisFromSettings( - Settings.builder() - .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) - .put("index.analysis.filter.my_delimited_termfreq.type", "delimited_termfreq") - .put("index.analysis.filter.my_delimited_termfreq.delimiter", "^^") - .build(), - new CommonAnalysisModulePlugin() - )); + IllegalArgumentException ex = expectThrows( + IllegalArgumentException.class, + () -> AnalysisTestsHelper.createTestAnalysisFromSettings( + Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .put("index.analysis.filter.my_delimited_termfreq.type", "delimited_termfreq") + .put("index.analysis.filter.my_delimited_termfreq.delimiter", "^^") + .build(), + new CommonAnalysisModulePlugin() + ) + ); assertEquals("Setting [delimiter] must be a single, non-null character. 
[^^] was provided.", ex.getMessage()); } @@ -77,7 +80,8 @@ private void doTest(OpenSearchTestCase.TestAnalysis analysis, String source) thr stream.close(); } - void assertTermEquals(String expected, TokenStream stream, CharTermAttribute termAtt, TermFrequencyAttribute tfAtt, int expectedTf) throws Exception { + void assertTermEquals(String expected, TokenStream stream, CharTermAttribute termAtt, TermFrequencyAttribute tfAtt, int expectedTf) + throws Exception { assertTrue(stream.incrementToken()); assertEquals(expected, termAtt.toString()); assertEquals(expectedTf, tfAtt.getTermFrequency()); From 76fe75d5bdd64ffb7f751507e67643be0556a9e3 Mon Sep 17 00:00:00 2001 From: Russ Cam Date: Thu, 31 Aug 2023 08:18:46 +1000 Subject: [PATCH 6/7] Rename filter Signed-off-by: Russ Cam --- .../analysis/common/CommonAnalysisModulePlugin.java | 4 ++-- .../analysis/common/CommonAnalysisFactoryTests.java | 2 +- ...elimitedTermFrequencyTokenFilterFactoryTests.java | 12 ++++++------ 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/CommonAnalysisModulePlugin.java b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/CommonAnalysisModulePlugin.java index 230307012e7ad..b0d9c1765190a 100644 --- a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/CommonAnalysisModulePlugin.java +++ b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/CommonAnalysisModulePlugin.java @@ -266,7 +266,7 @@ public Map> getTokenFilters() { ); filters.put("decimal_digit", DecimalDigitFilterFactory::new); filters.put("delimited_payload", DelimitedPayloadTokenFilterFactory::new); - filters.put("delimited_termfreq", DelimitedTermFrequencyTokenFilterFactory::new); + filters.put("delimited_term_freq", DelimitedTermFrequencyTokenFilterFactory::new); filters.put("dictionary_decompounder", requiresAnalysisSettings(DictionaryCompoundWordTokenFilterFactory::new)); filters.put("dutch_stem", 
DutchStemTokenFilterFactory::new); filters.put("edge_ngram", EdgeNGramTokenFilterFactory::new); @@ -504,7 +504,7 @@ public List getPreConfiguredTokenFilters() { ); filters.add( PreConfiguredTokenFilter.singleton( - "delimited_termfreq", + "delimited_term_freq", false, input -> new DelimitedTermFrequencyTokenFilter(input, DelimitedTermFrequencyTokenFilterFactory.DEFAULT_DELIMITER) ) diff --git a/modules/analysis-common/src/test/java/org/opensearch/analysis/common/CommonAnalysisFactoryTests.java b/modules/analysis-common/src/test/java/org/opensearch/analysis/common/CommonAnalysisFactoryTests.java index d2e45257e81d5..11713f52f5b18 100644 --- a/modules/analysis-common/src/test/java/org/opensearch/analysis/common/CommonAnalysisFactoryTests.java +++ b/modules/analysis-common/src/test/java/org/opensearch/analysis/common/CommonAnalysisFactoryTests.java @@ -203,7 +203,7 @@ protected Map> getPreConfiguredTokenFilters() { filters.put("decimal_digit", null); filters.put("delimited_payload_filter", org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilterFactory.class); filters.put("delimited_payload", org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilterFactory.class); - filters.put("delimited_termfreq", org.apache.lucene.analysis.miscellaneous.DelimitedTermFrequencyTokenFilterFactory.class); + filters.put("delimited_term_freq", org.apache.lucene.analysis.miscellaneous.DelimitedTermFrequencyTokenFilterFactory.class); filters.put("dutch_stem", SnowballPorterFilterFactory.class); filters.put("edge_ngram", null); filters.put("edgeNGram", null); diff --git a/modules/analysis-common/src/test/java/org/opensearch/analysis/common/DelimitedTermFrequencyTokenFilterFactoryTests.java b/modules/analysis-common/src/test/java/org/opensearch/analysis/common/DelimitedTermFrequencyTokenFilterFactoryTests.java index f66ed422ae89c..fab83a75387de 100644 --- a/modules/analysis-common/src/test/java/org/opensearch/analysis/common/DelimitedTermFrequencyTokenFilterFactoryTests.java +++ 
b/modules/analysis-common/src/test/java/org/opensearch/analysis/common/DelimitedTermFrequencyTokenFilterFactoryTests.java @@ -28,7 +28,7 @@ public void testDefault() throws Exception { OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings( Settings.builder() .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) - .put("index.analysis.filter.my_delimited_termfreq.type", "delimited_termfreq") + .put("index.analysis.filter.my_delimited_term_freq.type", "delimited_term_freq") .build(), new CommonAnalysisModulePlugin() ); @@ -39,8 +39,8 @@ public void testDelimiter() throws Exception { OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings( Settings.builder() .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) - .put("index.analysis.filter.my_delimited_termfreq.type", "delimited_termfreq") - .put("index.analysis.filter.my_delimited_termfreq.delimiter", ":") + .put("index.analysis.filter.my_delimited_term_freq.type", "delimited_term_freq") + .put("index.analysis.filter.my_delimited_term_freq.delimiter", ":") .build(), new CommonAnalysisModulePlugin() ); @@ -53,8 +53,8 @@ public void testDelimiterLongerThanOneCharThrows() { () -> AnalysisTestsHelper.createTestAnalysisFromSettings( Settings.builder() .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) - .put("index.analysis.filter.my_delimited_termfreq.type", "delimited_termfreq") - .put("index.analysis.filter.my_delimited_termfreq.delimiter", "^^") + .put("index.analysis.filter.my_delimited_term_freq.type", "delimited_term_freq") + .put("index.analysis.filter.my_delimited_term_freq.delimiter", "^^") .build(), new CommonAnalysisModulePlugin() ) @@ -64,7 +64,7 @@ public void testDelimiterLongerThanOneCharThrows() { } private void doTest(OpenSearchTestCase.TestAnalysis analysis, String source) throws Exception { - TokenFilterFactory tokenFilter = 
analysis.tokenFilter.get("my_delimited_termfreq"); + TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_delimited_term_freq"); Tokenizer tokenizer = new WhitespaceTokenizer(); tokenizer.setReader(new StringReader(source)); From a80e9a08e1df521b1a7f12a494317d7cd2808e34 Mon Sep 17 00:00:00 2001 From: Russ Cam Date: Thu, 31 Aug 2023 17:56:57 +1000 Subject: [PATCH 7/7] update naming in REST tests Signed-off-by: Russ Cam --- .../test/analysis-common/40_token_filters.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/40_token_filters.yml b/modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/40_token_filters.yml index 526b739feeb66..e92cc0c4838c7 100644 --- a/modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/40_token_filters.yml +++ b/modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/40_token_filters.yml @@ -1198,10 +1198,10 @@ - match: { tokens.0.token: foo } --- -"delimited_termfreq": +"delimited_term_freq": - skip: version: " - 2.9.99" - reason: "delimited_termfreq token filter was added in v2.10.0" + reason: "delimited_term_freq token filter was added in v2.10.0" - do: indices.create: index: test @@ -1209,8 +1209,8 @@ settings: analysis: filter: - my_delimited_termfreq: - type: delimited_termfreq + my_delimited_term_freq: + type: delimited_term_freq delimiter: ^ - do: indices.analyze: @@ -1218,7 +1218,7 @@ body: text: foo^3 tokenizer: keyword - filter: [my_delimited_termfreq] + filter: [my_delimited_term_freq] attributes: termFrequency explain: true - length: { detail.tokenfilters: 1 } @@ -1231,7 +1231,7 @@ body: text: foo|100 tokenizer: keyword - filter: [delimited_termfreq] + filter: [delimited_term_freq] attributes: termFrequency explain: true - length: { detail.tokenfilters: 1 }