From e437959da924d36fadb905b97f4ca465dc35f9f3 Mon Sep 17 00:00:00 2001 From: Rishabh Maurya Date: Tue, 31 Oct 2023 14:57:52 -0700 Subject: [PATCH] Implementation for match_only_text field --- .../index/mapper/MappedFieldType.java | 13 +- .../mapper/MatchOnlyTextFieldMapper.java | 167 ++++++++++++++++++ .../index/mapper/TextFieldMapper.java | 10 +- .../index/query/SourceFieldMatchQuery.java | 142 +++++++++++++++ .../opensearch/index/search/MatchQuery.java | 14 +- .../index/search/MultiMatchQuery.java | 4 +- .../org/opensearch/indices/IndicesModule.java | 2 + 7 files changed, 341 insertions(+), 11 deletions(-) create mode 100644 server/src/main/java/org/opensearch/index/mapper/MatchOnlyTextFieldMapper.java create mode 100644 server/src/main/java/org/opensearch/index/query/SourceFieldMatchQuery.java diff --git a/server/src/main/java/org/opensearch/index/mapper/MappedFieldType.java b/server/src/main/java/org/opensearch/index/mapper/MappedFieldType.java index 62acad99074c2..111d3c11fe253 100644 --- a/server/src/main/java/org/opensearch/index/mapper/MappedFieldType.java +++ b/server/src/main/java/org/opensearch/index/mapper/MappedFieldType.java @@ -352,23 +352,34 @@ public Query existsQuery(QueryShardContext context) { } public Query phraseQuery(TokenStream stream, int slop, boolean enablePositionIncrements) throws IOException { + return phraseQuery(stream, slop, enablePositionIncrements, null); + } + + public Query phraseQuery(TokenStream stream, int slop, boolean enablePositionIncrements, QueryShardContext context) throws IOException { throw new IllegalArgumentException( "Can only use phrase queries on text fields - not on [" + name + "] which is of type [" + typeName() + "]" ); } public Query multiPhraseQuery(TokenStream stream, int slop, boolean enablePositionIncrements) throws IOException { + return multiPhraseQuery(stream, slop, enablePositionIncrements, null); + } + + public Query multiPhraseQuery(TokenStream stream, int slop, boolean enablePositionIncrements, QueryShardContext context) throws IOException { throw new IllegalArgumentException( "Can only use phrase queries on text fields - not on [" + name + "] which is of type [" + typeName() + "]" ); } public Query phrasePrefixQuery(TokenStream stream, int slop, int maxExpansions) throws IOException { + return phrasePrefixQuery(stream, slop, maxExpansions, null); + } + + public Query phrasePrefixQuery(TokenStream stream, int slop, int maxExpansions, QueryShardContext context) throws IOException { throw new IllegalArgumentException( "Can only use phrase prefix queries on text fields - not on [" + name + "] which is of type [" + typeName() + "]" ); } - public SpanQuery spanPrefixQuery(String value, SpanMultiTermQueryWrapper.SpanRewriteMethod method, QueryShardContext context) { throw new IllegalArgumentException( "Can only use span prefix queries on text fields - not on [" + name + "] which is of type [" + typeName() + "]" diff --git a/server/src/main/java/org/opensearch/index/mapper/MatchOnlyTextFieldMapper.java b/server/src/main/java/org/opensearch/index/mapper/MatchOnlyTextFieldMapper.java new file mode 100644 index 0000000000000..b379cafaad5a0 --- /dev/null +++ b/server/src/main/java/org/opensearch/index/mapper/MatchOnlyTextFieldMapper.java @@ -0,0 +1,167 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.index.mapper; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; +import org.apache.lucene.document.FieldType; +import org.apache.lucene.index.IndexOptions; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.MultiPhraseQuery; +import org.apache.lucene.search.PhraseQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; +import org.opensearch.Version; +import org.opensearch.common.lucene.search.MultiPhrasePrefixQuery; +import org.opensearch.index.analysis.IndexAnalyzers; +import org.opensearch.index.query.QueryShardContext; +import org.opensearch.index.query.SourceFieldMatchQuery; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +public class MatchOnlyTextFieldMapper extends TextFieldMapper { + + public static final FieldType FIELD_TYPE = new FieldType(); + public static final String CONTENT_TYPE = "match_only_text"; + + @Override + protected String contentType() { + return CONTENT_TYPE; + } + + static { + FIELD_TYPE.setTokenized(true); + FIELD_TYPE.setStored(false); + FIELD_TYPE.setStoreTermVectors(false); + FIELD_TYPE.setOmitNorms(true); + FIELD_TYPE.setIndexOptions(IndexOptions.DOCS); + FIELD_TYPE.freeze(); + } + + public static final TypeParser PARSER = new TypeParser((n, c) -> new Builder(n, c.indexVersionCreated(), c.getIndexAnalyzers())); + + protected MatchOnlyTextFieldMapper(String simpleName, FieldType fieldType, MatchOnlyTextFieldType mappedFieldType, + TextFieldMapper.PrefixFieldMapper prefixFieldMapper, + TextFieldMapper.PhraseFieldMapper phraseFieldMapper, + MultiFields multiFields, CopyTo copyTo, Builder builder) { + + super(simpleName, fieldType, mappedFieldType, prefixFieldMapper, phraseFieldMapper, multiFields, copyTo, builder); + } + + public static class Builder extends TextFieldMapper.Builder { + + public Builder(String name, IndexAnalyzers indexAnalyzers) { + super(name, indexAnalyzers); + } + + public Builder(String name, Version indexCreatedVersion, IndexAnalyzers indexAnalyzers) { + super(name, indexCreatedVersion, indexAnalyzers); + } + + @Override + public MatchOnlyTextFieldMapper build(BuilderContext context) { + FieldType fieldType = FIELD_TYPE; + MatchOnlyTextFieldType tft = new MatchOnlyTextFieldType(buildFieldType(fieldType, context)); + return new MatchOnlyTextFieldMapper( + name, + fieldType, + tft, + buildPrefixMapper(context, fieldType, tft), + buildPhraseMapper(fieldType, tft), + multiFieldsBuilder.build(this, context), + copyTo.build(), + this + ); + } + } + + public static final class MatchOnlyTextFieldType extends TextFieldMapper.TextFieldType { + + @Override + public String typeName() { + return CONTENT_TYPE; + } + + public MatchOnlyTextFieldType(TextFieldMapper.TextFieldType tft) { + super(tft.name(), tft.isSearchable(), tft.isStored(), tft.getTextSearchInfo(), tft.meta()); + } + + @Override + public Query phraseQuery(TokenStream stream, int slop, boolean enablePosIncrements, QueryShardContext context) throws IOException { + PhraseQuery phraseQuery = (PhraseQuery) super.phraseQuery(stream, slop, enablePosIncrements); + BooleanQuery.Builder builder = new BooleanQuery.Builder(); + for (Term term: phraseQuery.getTerms()) { + builder.add(new TermQuery(term), BooleanClause.Occur.FILTER); + } + return new SourceFieldMatchQuery(builder.build(), phraseQuery, this, + (SourceValueFetcher) this.valueFetcher(context, context.lookup(), null), context.lookup()); + } + + @Override + public Query multiPhraseQuery(TokenStream stream, int slop, boolean enablePositionIncrements, QueryShardContext context) throws IOException { + MultiPhraseQuery multiPhraseQuery = (MultiPhraseQuery) super.multiPhraseQuery(stream, slop, enablePositionIncrements); + BooleanQuery.Builder builder = new BooleanQuery.Builder(); + for (Term[] terms : multiPhraseQuery.getTermArrays()) { + BooleanQuery.Builder disjunctions = new BooleanQuery.Builder(); + for (Term term: terms) { + disjunctions.add(new TermQuery(term), BooleanClause.Occur.SHOULD); + } + builder.add(disjunctions.build(), BooleanClause.Occur.FILTER); + } + return new SourceFieldMatchQuery(builder.build(), multiPhraseQuery, this, + (SourceValueFetcher) this.valueFetcher(context, context.lookup(), null), context.lookup()); + } + + @Override + public Query phrasePrefixQuery(TokenStream stream, int slop, int maxExpansions, QueryShardContext context) throws IOException { + Query phrasePrefixQuery = super.phrasePrefixQuery(stream, slop, maxExpansions); + List> termArray = getTermsFromTokenStream(stream); + BooleanQuery.Builder builder = new BooleanQuery.Builder(); + for (int i = 0; i < termArray.size(); i++) { + BooleanQuery.Builder disjunctions = new BooleanQuery.Builder(); + for (Term term: termArray.get(i)) { + if (i == termArray.size() - 1) { + MultiPhrasePrefixQuery mqb = new MultiPhrasePrefixQuery(name()); + mqb.add(term); + disjunctions.add(mqb, BooleanClause.Occur.SHOULD); + } else { + disjunctions.add(new TermQuery(term), BooleanClause.Occur.SHOULD); + } + } + builder.add(disjunctions.build(), BooleanClause.Occur.FILTER); + } + return new SourceFieldMatchQuery(builder.build(), phrasePrefixQuery, this, + (SourceValueFetcher) this.valueFetcher(context, context.lookup(), null), context.lookup()); + } + + private List> getTermsFromTokenStream(TokenStream stream) throws IOException { + final List> termArray = new ArrayList<>(); + TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class); + PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class); + List currentTerms = new ArrayList<>(); + stream.reset(); + while (stream.incrementToken()) { + if (posIncrAtt.getPositionIncrement() != 0) { + if (currentTerms.isEmpty() == false) { + termArray.add(List.copyOf(currentTerms)); + } + currentTerms.clear(); + } + currentTerms.add(new Term(name(), termAtt.getBytesRef())); + } + termArray.add(List.copyOf(currentTerms)); + return termArray; + } + } +} diff --git a/server/src/main/java/org/opensearch/index/mapper/TextFieldMapper.java b/server/src/main/java/org/opensearch/index/mapper/TextFieldMapper.java index 1d0d1ae2bd899..5780e42105fe5 100644 --- a/server/src/main/java/org/opensearch/index/mapper/TextFieldMapper.java +++ b/server/src/main/java/org/opensearch/index/mapper/TextFieldMapper.java @@ -395,7 +395,7 @@ protected List> getParameters() { ); } - private TextFieldType buildFieldType(FieldType fieldType, BuilderContext context) { + protected TextFieldType buildFieldType(FieldType fieldType, BuilderContext context) { NamedAnalyzer indexAnalyzer = analyzers.getIndexAnalyzer(); NamedAnalyzer searchAnalyzer = analyzers.getSearchAnalyzer(); NamedAnalyzer searchQuoteAnalyzer = analyzers.getSearchQuoteAnalyzer(); @@ -420,7 +420,7 @@ private TextFieldType buildFieldType(FieldType fieldType, BuilderContext context return ft; } - private PrefixFieldMapper buildPrefixMapper(BuilderContext context, FieldType fieldType, TextFieldType tft) { + protected PrefixFieldMapper buildPrefixMapper(BuilderContext context, FieldType fieldType, TextFieldType tft) { if (indexPrefixes.get() == null) { return null; } @@ -454,7 +454,7 @@ private PrefixFieldMapper buildPrefixMapper(BuilderContext context, FieldType fi return new PrefixFieldMapper(pft, prefixFieldType); } - private PhraseFieldMapper buildPhraseMapper(FieldType fieldType, TextFieldType parent) { + protected PhraseFieldMapper buildPhraseMapper(FieldType fieldType, TextFieldType parent) { if (indexPhrases.get() == false) { return null; } @@ -683,7 +683,7 @@ public Query existsQuery(QueryShardContext context) { * * @opensearch.internal */ - private static final class PhraseFieldMapper extends FieldMapper { + protected static final class PhraseFieldMapper extends FieldMapper { PhraseFieldMapper(FieldType fieldType, PhraseFieldType mappedFieldType) { super(mappedFieldType.name(), fieldType, mappedFieldType, MultiFields.empty(), CopyTo.empty()); @@ -710,7 +710,7 @@ protected String contentType() { * * @opensearch.internal */ - private static final class PrefixFieldMapper extends FieldMapper { + protected static final class PrefixFieldMapper extends FieldMapper { protected PrefixFieldMapper(FieldType fieldType, PrefixFieldType mappedFieldType) { super(mappedFieldType.name(), fieldType, mappedFieldType, MultiFields.empty(), CopyTo.empty()); diff --git a/server/src/main/java/org/opensearch/index/query/SourceFieldMatchQuery.java b/server/src/main/java/org/opensearch/index/query/SourceFieldMatchQuery.java new file mode 100644 index 0000000000000..7eb95e4f1b855 --- /dev/null +++ b/server/src/main/java/org/opensearch/index/query/SourceFieldMatchQuery.java @@ -0,0 +1,142 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.index.query; + +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.memory.MemoryIndex; +import org.apache.lucene.search.ConstantScoreScorer; +import org.apache.lucene.search.ConstantScoreWeight; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.QueryVisitor; +import org.apache.lucene.search.ScoreMode; +import org.apache.lucene.search.Scorer; +import org.apache.lucene.search.TwoPhaseIterator; +import org.apache.lucene.search.Weight; +import org.opensearch.index.mapper.MappedFieldType; +import org.opensearch.index.mapper.SourceValueFetcher; +import org.opensearch.search.lookup.LeafSearchLookup; +import org.opensearch.search.lookup.SearchLookup; + +import java.io.IOException; +import java.util.List; +import java.util.Objects; + +/** + * A query that matches against each document from the parent query by filtering using the source field values. + * Useful to query against field type which doesn't store positional data and field is not stored/computed dynamically. + */ +public class SourceFieldMatchQuery extends Query { + final private Query delegateQuery; + final private Query filter; + final private SearchLookup lookup; + final private MappedFieldType fieldType; + final private SourceValueFetcher valueFetcher; + + /** + * Constructs a SourceFieldMatchQuery. + * + * @param delegateQuery The parent query to use to find matches. + * @param filter The query used to filter further by running against field value computed using _source field. + * @param fieldType The mapped field type. + * @param valueFetcher The source value fetcher. + * @param lookup The search lookup. + */ + public SourceFieldMatchQuery(Query delegateQuery, Query filter, MappedFieldType fieldType, + SourceValueFetcher valueFetcher, SearchLookup lookup) { + this.delegateQuery = delegateQuery; + this.filter = filter; + this.fieldType = fieldType; + this.valueFetcher = valueFetcher; + this.lookup = lookup; + } + + @Override + public void visit(QueryVisitor visitor) { + delegateQuery.visit(visitor); + } + + @Override + public Query rewrite(IndexSearcher searcher) throws IOException { + return delegateQuery.rewrite(searcher); + } + + @Override + public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException { + + Weight weight = delegateQuery.createWeight(searcher, scoreMode, boost); + + return new ConstantScoreWeight(this, boost) { + + @Override + public Scorer scorer(LeafReaderContext context) throws IOException { + + Scorer scorer = weight.scorer(context); + DocIdSetIterator approximation = scorer.iterator(); + LeafSearchLookup leafSearchLookup = lookup.getLeafSearchLookup(context); + TwoPhaseIterator twoPhase = new TwoPhaseIterator(approximation) { + + @Override + public boolean matches() { + leafSearchLookup.setDocument(approximation.docID()); + List values = valueFetcher.fetchValues(leafSearchLookup.source()); + MemoryIndex memoryIndex = new MemoryIndex(); + for (Object value : values) { + memoryIndex.addField(fieldType.name(), (String) value, fieldType.indexAnalyzer()); + } + float score = memoryIndex.search(delegateQuery); + return score > 0.0f; + } + + @Override + public float matchCost() { + // arbitrary cost + return 1000f; + } + }; + return new ConstantScoreScorer(this, score(), scoreMode, twoPhase); + } + + @Override + public boolean isCacheable(LeafReaderContext ctx) { + // It is fine to cache if delegate query weight is cacheable since additional logic here + // is just a filter on top of delegate query matches + return weight.isCacheable(ctx); + } + }; + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (sameClassAs(o) == false) { + return false; + } + SourceFieldMatchQuery other = (SourceFieldMatchQuery) o; + return Objects.equals(this.delegateQuery, other.delegateQuery) + && this.filter == other.filter + && Objects.equals(this.lookup, other.lookup) + && Objects.equals(this.fieldType, other.fieldType) + && Objects.equals(this.valueFetcher, other.valueFetcher); + } + + @Override + public int hashCode() { + return Objects.hash(classHash(), delegateQuery, filter, lookup, fieldType, valueFetcher); + } + + @Override + public String toString(String f) { + return "SourceFieldMatchQuery (delegate query: [ " + delegateQuery.toString(f) + + " ], filter query: [ " + filter.toString(f) + "])"; + } +} diff --git a/server/src/main/java/org/opensearch/index/search/MatchQuery.java b/server/src/main/java/org/opensearch/index/search/MatchQuery.java index 9e2b79971369d..69588c944ce06 100644 --- a/server/src/main/java/org/opensearch/index/search/MatchQuery.java +++ b/server/src/main/java/org/opensearch/index/search/MatchQuery.java @@ -67,6 +67,7 @@ import org.opensearch.core.common.io.stream.Writeable; import org.opensearch.index.mapper.KeywordFieldMapper; import org.opensearch.index.mapper.MappedFieldType; +import org.opensearch.index.mapper.MatchOnlyTextFieldMapper; import org.opensearch.index.mapper.TextFieldMapper; import org.opensearch.index.query.QueryShardContext; import org.opensearch.index.query.support.QueryParsers; @@ -701,7 +702,7 @@ private Query analyzeMultiBoolean(String field, TokenStream stream, BooleanClaus protected Query analyzePhrase(String field, TokenStream stream, int slop) throws IOException { try { checkForPositions(field); - return fieldType.phraseQuery(stream, slop, enablePositionIncrements); + return fieldType.phraseQuery(stream, slop, enablePositionIncrements, context); } catch (IllegalArgumentException | IllegalStateException e) { if (lenient) { return newLenientFieldQuery(field, e); @@ -713,8 +714,12 @@ protected Query analyzePhrase(String field, TokenStream stream, int slop) throws @Override protected Query analyzeMultiPhrase(String field, TokenStream stream, int slop) throws IOException { try { + if (fieldType instanceof MatchOnlyTextFieldMapper.MatchOnlyTextFieldType) { + return ((MatchOnlyTextFieldMapper.MatchOnlyTextFieldType) fieldType) + .multiPhraseQuery(stream, slop, enablePositionIncrements, context); + } checkForPositions(field); - return fieldType.multiPhraseQuery(stream, slop, enablePositionIncrements); + return fieldType.multiPhraseQuery(stream, slop, enablePositionIncrements, context); } catch (IllegalArgumentException | IllegalStateException e) { if (lenient) { return newLenientFieldQuery(field, e); @@ -728,7 +733,7 @@ private Query analyzePhrasePrefix(String field, TokenStream stream, int slop, in if (positionCount > 1) { checkForPositions(field); } - return fieldType.phrasePrefixQuery(stream, slop, maxExpansions); + return fieldType.phrasePrefixQuery(stream, slop, maxExpansions, context); } catch (IllegalArgumentException | IllegalStateException e) { if (lenient) { return newLenientFieldQuery(field, e); @@ -887,6 +892,9 @@ private Query analyzeGraphPhrase(TokenStream source, String field, Type type, in private void checkForPositions(String field) { if (fieldType.getTextSearchInfo().hasPositions() == false) { + if (fieldType instanceof MatchOnlyTextFieldMapper.MatchOnlyTextFieldType) { + return; + } throw new IllegalStateException("field:[" + field + "] was indexed without position data; cannot run PhraseQuery"); } } diff --git a/server/src/main/java/org/opensearch/index/search/MultiMatchQuery.java b/server/src/main/java/org/opensearch/index/search/MultiMatchQuery.java index 241f05af2c512..8c0c87e8c9d0c 100644 --- a/server/src/main/java/org/opensearch/index/search/MultiMatchQuery.java +++ b/server/src/main/java/org/opensearch/index/search/MultiMatchQuery.java @@ -248,7 +248,7 @@ protected Query newPrefixQuery(Term term) { protected Query analyzePhrase(String field, TokenStream stream, int slop) throws IOException { List disjunctions = new ArrayList<>(); for (FieldAndBoost fieldType : blendedFields) { - Query query = fieldType.fieldType.phraseQuery(stream, slop, enablePositionIncrements); + Query query = fieldType.fieldType.phraseQuery(stream, slop, enablePositionIncrements, context); if (fieldType.boost != 1f) { query = new BoostQuery(query, fieldType.boost); } @@ -261,7 +261,7 @@ protected Query analyzePhrase(String field, TokenStream stream, int slop) throws protected Query analyzeMultiPhrase(String field, TokenStream stream, int slop) throws IOException { List disjunctions = new ArrayList<>(); for (FieldAndBoost fieldType : blendedFields) { - Query query = fieldType.fieldType.multiPhraseQuery(stream, slop, enablePositionIncrements); + Query query = fieldType.fieldType.multiPhraseQuery(stream, slop, enablePositionIncrements, context); if (fieldType.boost != 1f) { query = new BoostQuery(query, fieldType.boost); } diff --git a/server/src/main/java/org/opensearch/indices/IndicesModule.java b/server/src/main/java/org/opensearch/indices/IndicesModule.java index 5c2137ec742a4..eea5dbbf57f6c 100644 --- a/server/src/main/java/org/opensearch/indices/IndicesModule.java +++ b/server/src/main/java/org/opensearch/indices/IndicesModule.java @@ -59,6 +59,7 @@ import org.opensearch.index.mapper.IpFieldMapper; import org.opensearch.index.mapper.KeywordFieldMapper; import org.opensearch.index.mapper.Mapper; +import org.opensearch.index.mapper.MatchOnlyTextFieldMapper; import org.opensearch.index.mapper.MetadataFieldMapper; import org.opensearch.index.mapper.NestedPathFieldMapper; import org.opensearch.index.mapper.NumberFieldMapper; @@ -158,6 +159,7 @@ public static Map getMappers(List mappe mappers.put(nanoseconds.type(), DateFieldMapper.NANOS_PARSER); mappers.put(IpFieldMapper.CONTENT_TYPE, IpFieldMapper.PARSER); mappers.put(TextFieldMapper.CONTENT_TYPE, TextFieldMapper.PARSER); + mappers.put(MatchOnlyTextFieldMapper.CONTENT_TYPE, MatchOnlyTextFieldMapper.PARSER); mappers.put(KeywordFieldMapper.CONTENT_TYPE, KeywordFieldMapper.PARSER); mappers.put(ObjectMapper.CONTENT_TYPE, new ObjectMapper.TypeParser()); mappers.put(ObjectMapper.NESTED_CONTENT_TYPE, new ObjectMapper.TypeParser());