Expose DelimitedTermFrequencyTokenFilter
Relates: #9413

This commit exposes Lucene's delimited term frequency token filter so that
term frequencies can be supplied along with terms.

Signed-off-by: Russ Cam <[email protected]>
russcam committed Aug 22, 2023
1 parent dd75a22 commit 55526ec
Showing 5 changed files with 97 additions and 3 deletions.
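
For context, the Lucene filter being exposed splits each token at a delimiter and records the numeric suffix in the token stream's TermFrequencyAttribute. Below is a minimal standalone sketch of that behavior using plain Lucene; the class name, input text, and printing are illustrative and not part of this change:

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.DelimitedTermFrequencyTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute;

public class DelimitedTermFreqDemo {
    public static void main(String[] args) throws Exception {
        WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("apple|3 banana|5"));

        // Wrap the tokenizer with the filter, using the same '|' delimiter
        // that the pre-configured delimited_termfreq filter defaults to.
        try (TokenStream stream = new DelimitedTermFrequencyTokenFilter(tokenizer, '|')) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            TermFrequencyAttribute freq = stream.addAttribute(TermFrequencyAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                // Prints: apple -> 3, then banana -> 5
                System.out.println(term + " -> " + freq.getTermFrequency());
            }
            stream.end();
        }
    }
}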
@@ -89,6 +89,7 @@
import org.apache.lucene.analysis.lt.LithuanianAnalyzer;
import org.apache.lucene.analysis.lv.LatvianAnalyzer;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import org.apache.lucene.analysis.miscellaneous.DelimitedTermFrequencyTokenFilter;
import org.apache.lucene.analysis.miscellaneous.DisableGraphAttribute;
import org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilter;
import org.apache.lucene.analysis.miscellaneous.LengthFilter;
@@ -265,6 +266,7 @@ public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
);
filters.put("decimal_digit", DecimalDigitFilterFactory::new);
filters.put("delimited_payload", DelimitedPayloadTokenFilterFactory::new);
filters.put("delimited_termfreq", DelimitedTermFrequencyTokenFilterFactory::new);
filters.put("dictionary_decompounder", requiresAnalysisSettings(DictionaryCompoundWordTokenFilterFactory::new));
filters.put("dutch_stem", DutchStemTokenFilterFactory::new);
filters.put("edge_ngram", EdgeNGramTokenFilterFactory::new);
@@ -500,6 +502,16 @@ public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
)
)
);
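        // Pre-configured "delimited_termfreq": usable without custom settings,
        // always with the default '|' delimiter.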
filters.add(
PreConfiguredTokenFilter.singleton(
"delimited_termfreq",
false,
input -> new DelimitedTermFrequencyTokenFilter(
input,
DelimitedTermFrequencyTokenFilterFactory.DEFAULT_DELIMITER
)
)
);
filters.add(PreConfiguredTokenFilter.singleton("dutch_stem", false, input -> new SnowballFilter(input, new DutchStemmer())));
filters.add(PreConfiguredTokenFilter.singleton("edge_ngram", false, false, input -> new EdgeNGramTokenFilter(input, 1)));
filters.add(PreConfiguredTokenFilter.openSearchVersion("edgeNGram", false, false, (reader, version) -> {
@@ -0,0 +1,46 @@
/*
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*/

package org.opensearch.analysis.common;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.DelimitedTermFrequencyTokenFilter;
import org.opensearch.common.settings.Settings;
import org.opensearch.env.Environment;
import org.opensearch.index.IndexSettings;
import org.opensearch.index.analysis.AbstractTokenFilterFactory;

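/**
 * Factory for {@link DelimitedTermFrequencyTokenFilter}, which splits a token at the
 * configured delimiter and treats the suffix as that token's term frequency, e.g.
 * {@code foo|3}. The {@code delimiter} setting must be a single character and
 * defaults to {@code |}.
 */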
public class DelimitedTermFrequencyTokenFilterFactory extends AbstractTokenFilterFactory {
public static final char DEFAULT_DELIMITER = '|';
private static final String DELIMITER = "delimiter";
private final char delimiter;

DelimitedTermFrequencyTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
super(indexSettings, name, settings);
delimiter = parseDelimiter(settings);
}

@Override
public TokenStream create(TokenStream tokenStream) {
return new DelimitedTermFrequencyTokenFilter(tokenStream, delimiter);
}

private static char parseDelimiter(Settings settings) throws IllegalArgumentException {
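    // An absent setting falls back to the default '|'; a one-character value is
    // used as-is; anything else (including an empty string) is rejected.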
String delimiter = settings.get(DELIMITER);
if (delimiter == null) {
return DEFAULT_DELIMITER;
} else if (delimiter.length() == 1) {
return delimiter.charAt(0);
}

throw new IllegalArgumentException(
"Setting [" + DELIMITER + "] must be a single, non-null character. [" + delimiter + "] was provided."
);
}
}

@@ -145,6 +145,7 @@ protected Map<String, Class<?>> getTokenFilters() {
filters.put("cjkwidth", CJKWidthFilterFactory.class);
filters.put("cjkbigram", CJKBigramFilterFactory.class);
filters.put("delimitedpayload", DelimitedPayloadTokenFilterFactory.class);
filters.put("delimitedtermfreq", DelimitedTermFrequencyTokenFilterFactory.class);
filters.put("keepword", KeepWordFilterFactory.class);
filters.put("type", KeepTypesFilterFactory.class);
filters.put("classic", ClassicFilterFactory.class);
@@ -1198,6 +1198,43 @@
- match: { tokens.0.token: foo }

---
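# Custom filter configured with a non-default delimiter (^):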
"delimited_termfreq":
- do:
indices.create:
index: test
body:
settings:
analysis:
filter:
my_delimited_termfreq:
type: delimited_termfreq
delimiter: ^
- do:
indices.analyze:
index: test
body:
text: foo^3
tokenizer: keyword
filter: [my_delimited_termfreq]
attributes: termFrequency
explain: true
- length: { detail.tokenfilters: 1 }
- match: { detail.tokenfilters.0.tokens.0.token: foo }
- match: { detail.tokenfilters.0.tokens.0.termFrequency: 3 }

# Test pre-configured token filter too:
- do:
indices.analyze:
body:
text: foo|100
tokenizer: keyword
filter: [delimited_termfreq]
attributes: termFrequency
explain: true
- length: { detail.tokenfilters: 1 }
- match: { detail.tokenfilters.0.tokens.0.token: foo }
- match: { detail.tokenfilters.0.tokens.0.termFrequency: 100 }
---
"keep_filter":
- do:
indices.create:
@@ -98,6 +98,7 @@ public abstract class AnalysisFactoryTestCase extends OpenSearchTestCase {
.put("czechstem", MovedToAnalysisCommon.class)
.put("decimaldigit", MovedToAnalysisCommon.class)
.put("delimitedpayload", MovedToAnalysisCommon.class)
.put("delimitedtermfrequency", MovedToAnalysisCommon.class)
.put("dictionarycompoundword", MovedToAnalysisCommon.class)
.put("edgengram", MovedToAnalysisCommon.class)
.put("elision", MovedToAnalysisCommon.class)
@@ -201,9 +202,6 @@ public abstract class AnalysisFactoryTestCase extends OpenSearchTestCase {
.put("daterecognizer", Void.class)
// for token filters that generate bad offsets, which are now rejected since Lucene 7
.put("fixbrokenoffsets", Void.class)
// should we expose it, or maybe think about higher level integration of the
// fake term frequency feature (LUCENE-7854)
.put("delimitedtermfrequency", Void.class)
// LUCENE-8273: ProtectedTermFilterFactory allows analysis chains to skip
// particular token filters based on the attributes of the current token.
.put("protectedterm", Void.class)
