From b01cf56ce9d7c43703f3dd481cb2e493a19e8542 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 28 Mar 2024 18:49:46 +0000 Subject: [PATCH] [DerivedFields] DerivedFieldScript and query execution logic (#12746) First in a series of commits to support derived fields, a form of schema-on-read. This commit adds: 1. DerivedFieldScript factory: This script factory will be used to execute scripts defined against derived fields of any type. 2. DerivedFieldValueFetcher: The value fetcher contains logic to execute script and fetch the value in form of List. It expects DerivedFieldScript.LeafFactory as an input and sets the contract with consumer to call setNextReader() whenever a segment is switched. 3. DerivedFieldQuery: This query will be used by any of the derived fields. It expects an input query and DerivedFieldValueFetcher. It uses 2-phase iterator approach with approximation iterator set to match all docs. On a match, it creates a lucene MemoryIndex for a given doc, fetches the value of the derived field from _source using DerivedFieldValueFetcher and executes the input query against. --------- Signed-off-by: Rishabh Maurya (cherry picked from commit 70711cfde74a00249e323ccc4198db657b9d9303) Signed-off-by: github-actions[bot] --- .../test/painless/71_context_api.yml | 2 +- .../mapper/DerivedFieldValueFetcher.java | 45 ++++++ .../index/query/DerivedFieldQuery.java | 146 ++++++++++++++++++ .../opensearch/script/DerivedFieldScript.java | 104 +++++++++++++ .../org/opensearch/script/ScriptModule.java | 3 +- .../index/query/DerivedFieldQueryTests.java | 104 +++++++++++++ .../opensearch/script/MockScriptEngine.java | 15 ++ 7 files changed, 417 insertions(+), 2 deletions(-) create mode 100644 server/src/main/java/org/opensearch/index/mapper/DerivedFieldValueFetcher.java create mode 100644 server/src/main/java/org/opensearch/index/query/DerivedFieldQuery.java create mode 100644 server/src/main/java/org/opensearch/script/DerivedFieldScript.java create mode 100644 server/src/test/java/org/opensearch/index/query/DerivedFieldQueryTests.java diff --git a/modules/lang-painless/src/yamlRestTest/resources/rest-api-spec/test/painless/71_context_api.yml b/modules/lang-painless/src/yamlRestTest/resources/rest-api-spec/test/painless/71_context_api.yml index 478ca9ae8abf4..20e6fd351a4b9 100644 --- a/modules/lang-painless/src/yamlRestTest/resources/rest-api-spec/test/painless/71_context_api.yml +++ b/modules/lang-painless/src/yamlRestTest/resources/rest-api-spec/test/painless/71_context_api.yml @@ -2,7 +2,7 @@ - do: scripts_painless_context: {} - match: { contexts.0: aggregation_selector} - - match: { contexts.23: update} + - match: { contexts.24: update} --- "Action to get all API values for score context": diff --git a/server/src/main/java/org/opensearch/index/mapper/DerivedFieldValueFetcher.java b/server/src/main/java/org/opensearch/index/mapper/DerivedFieldValueFetcher.java new file mode 100644 index 0000000000000..f3bf0c613415a --- /dev/null +++ b/server/src/main/java/org/opensearch/index/mapper/DerivedFieldValueFetcher.java @@ -0,0 +1,45 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.index.mapper; + +import org.apache.lucene.index.LeafReaderContext; +import org.opensearch.script.DerivedFieldScript; +import org.opensearch.search.lookup.SourceLookup; + +import java.io.IOException; +import java.util.List; + +/** + * The value fetcher contains logic to execute script and fetch the value in form of list of object. + * It expects DerivedFieldScript.LeafFactory as an input and sets the contract with consumer to call + * {@link #setNextReader(LeafReaderContext)} whenever a segment is switched. + */ +public final class DerivedFieldValueFetcher implements ValueFetcher { + private DerivedFieldScript derivedFieldScript; + private final DerivedFieldScript.LeafFactory derivedFieldScriptFactory; + + public DerivedFieldValueFetcher(DerivedFieldScript.LeafFactory derivedFieldScriptFactory) { + this.derivedFieldScriptFactory = derivedFieldScriptFactory; + } + + @Override + public List fetchValues(SourceLookup lookup) { + derivedFieldScript.setDocument(lookup.docId()); + // TODO: remove List.of() when derivedFieldScript.execute() returns list of objects. + return List.of(derivedFieldScript.execute()); + } + + public void setNextReader(LeafReaderContext context) { + try { + derivedFieldScript = derivedFieldScriptFactory.newInstance(context); + } catch (IOException e) { + throw new RuntimeException(e); + } + } +} diff --git a/server/src/main/java/org/opensearch/index/query/DerivedFieldQuery.java b/server/src/main/java/org/opensearch/index/query/DerivedFieldQuery.java new file mode 100644 index 0000000000000..8beef0bf46be0 --- /dev/null +++ b/server/src/main/java/org/opensearch/index/query/DerivedFieldQuery.java @@ -0,0 +1,146 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.index.query; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.index.IndexableField; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.memory.MemoryIndex; +import org.apache.lucene.search.ConstantScoreScorer; +import org.apache.lucene.search.ConstantScoreWeight; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.QueryVisitor; +import org.apache.lucene.search.ScoreMode; +import org.apache.lucene.search.Scorer; +import org.apache.lucene.search.TwoPhaseIterator; +import org.apache.lucene.search.Weight; +import org.opensearch.index.mapper.DerivedFieldValueFetcher; +import org.opensearch.search.lookup.LeafSearchLookup; +import org.opensearch.search.lookup.SearchLookup; + +import java.io.IOException; +import java.util.List; +import java.util.Objects; +import java.util.function.Function; + +/** + * DerivedFieldQuery used for querying derived fields. It contains the logic to execute an input lucene query against + * DerivedField. It also accepts DerivedFieldValueFetcher and SearchLookup as an input. + */ +public final class DerivedFieldQuery extends Query { + private final Query query; + private final DerivedFieldValueFetcher valueFetcher; + private final SearchLookup searchLookup; + private final Function indexableFieldGenerator; + private final Analyzer indexAnalyzer; + + /** + * @param query lucene query to be executed against the derived field + * @param valueFetcher DerivedFieldValueFetcher ValueFetcher to fetch the value of a derived field from _source + * using LeafSearchLookup + * @param searchLookup SearchLookup to get the LeafSearchLookup look used by valueFetcher to fetch the _source + * @param indexableFieldGenerator used to generate lucene IndexableField from a given object fetched by valueFetcher + * to be used in lucene memory index. + */ + public DerivedFieldQuery( + Query query, + DerivedFieldValueFetcher valueFetcher, + SearchLookup searchLookup, + Function indexableFieldGenerator, + Analyzer indexAnalyzer + ) { + this.query = query; + this.valueFetcher = valueFetcher; + this.searchLookup = searchLookup; + this.indexableFieldGenerator = indexableFieldGenerator; + this.indexAnalyzer = indexAnalyzer; + } + + @Override + public void visit(QueryVisitor visitor) { + query.visit(visitor); + } + + @Override + public Query rewrite(IndexSearcher indexSearcher) throws IOException { + Query rewritten = indexSearcher.rewrite(query); + if (rewritten == query) { + return this; + } + return new DerivedFieldQuery(rewritten, valueFetcher, searchLookup, indexableFieldGenerator, indexAnalyzer); + } + + @Override + public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException { + + return new ConstantScoreWeight(this, boost) { + @Override + public Scorer scorer(LeafReaderContext context) { + DocIdSetIterator approximation = DocIdSetIterator.all(context.reader().maxDoc()); + valueFetcher.setNextReader(context); + LeafSearchLookup leafSearchLookup = searchLookup.getLeafSearchLookup(context); + TwoPhaseIterator twoPhase = new TwoPhaseIterator(approximation) { + @Override + public boolean matches() { + leafSearchLookup.source().setSegmentAndDocument(context, approximation.docID()); + List values = valueFetcher.fetchValues(leafSearchLookup.source()); + // TODO: in case of errors from script, should it be ignored and treated as missing field + // by using a configurable setting? + MemoryIndex memoryIndex = new MemoryIndex(); + for (Object value : values) { + memoryIndex.addField(indexableFieldGenerator.apply(value), indexAnalyzer); + } + float score = memoryIndex.search(query); + return score > 0.0f; + } + + @Override + public float matchCost() { + // TODO: how can we compute this? + return 1000f; + } + }; + return new ConstantScoreScorer(this, score(), scoreMode, twoPhase); + } + + @Override + public boolean isCacheable(LeafReaderContext ctx) { + return false; + } + }; + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (sameClassAs(o) == false) { + return false; + } + DerivedFieldQuery other = (DerivedFieldQuery) o; + return Objects.equals(this.query, other.query) + && Objects.equals(this.valueFetcher, other.valueFetcher) + && Objects.equals(this.searchLookup, other.searchLookup) + && Objects.equals(this.indexableFieldGenerator, other.indexableFieldGenerator) + && Objects.equals(this.indexAnalyzer, other.indexAnalyzer); + } + + @Override + public int hashCode() { + return Objects.hash(classHash(), query, valueFetcher, searchLookup, indexableFieldGenerator, indexableFieldGenerator); + } + + @Override + public String toString(String f) { + return "DerivedFieldQuery (Query: [ " + query.toString(f) + "])"; + } +} diff --git a/server/src/main/java/org/opensearch/script/DerivedFieldScript.java b/server/src/main/java/org/opensearch/script/DerivedFieldScript.java new file mode 100644 index 0000000000000..7f5b991950ec6 --- /dev/null +++ b/server/src/main/java/org/opensearch/script/DerivedFieldScript.java @@ -0,0 +1,104 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.script; + +import org.apache.lucene.index.LeafReaderContext; +import org.opensearch.common.logging.DeprecationLogger; +import org.opensearch.index.fielddata.ScriptDocValues; +import org.opensearch.search.lookup.LeafSearchLookup; +import org.opensearch.search.lookup.SearchLookup; +import org.opensearch.search.lookup.SourceLookup; + +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; +import java.util.function.Function; + +/** + * Definition of Script for DerivedField. + * It will be used to execute scripts defined against derived fields of any type + * + * @opensearch.internal + */ +public abstract class DerivedFieldScript { + + public static final String[] PARAMETERS = {}; + public static final ScriptContext CONTEXT = new ScriptContext<>("derived_field", Factory.class); + private static final DeprecationLogger deprecationLogger = DeprecationLogger.getLogger(DynamicMap.class); + + private static final Map> PARAMS_FUNCTIONS = Map.of( + "doc", + value -> value, + "_source", + value -> ((SourceLookup) value).loadSourceIfNeeded() + ); + + /** + * The generic runtime parameters for the script. + */ + private final Map params; + + /** + * A leaf lookup for the bound segment this script will operate on. + */ + private final LeafSearchLookup leafLookup; + + public DerivedFieldScript(Map params, SearchLookup lookup, LeafReaderContext leafContext) { + Map parameters = new HashMap<>(params); + this.leafLookup = lookup.getLeafSearchLookup(leafContext); + parameters.putAll(leafLookup.asMap()); + this.params = new DynamicMap(parameters, PARAMS_FUNCTIONS); + } + + protected DerivedFieldScript() { + params = null; + leafLookup = null; + } + + /** + * Return the parameters for this script. + */ + public Map getParams() { + return params; + } + + /** + * The doc lookup for the Lucene segment this script was created for. + */ + public Map> getDoc() { + return leafLookup.doc(); + } + + /** + * Set the current document to run the script on next. + */ + public void setDocument(int docid) { + leafLookup.setDocument(docid); + } + + public abstract Object execute(); + + /** + * A factory to construct {@link DerivedFieldScript} instances. + * + * @opensearch.internal + */ + public interface LeafFactory { + DerivedFieldScript newInstance(LeafReaderContext ctx) throws IOException; + } + + /** + * A factory to construct stateful {@link DerivedFieldScript} factories for a specific index. + * + * @opensearch.internal + */ + public interface Factory extends ScriptFactory { + LeafFactory newFactory(Map params, SearchLookup lookup); + } +} diff --git a/server/src/main/java/org/opensearch/script/ScriptModule.java b/server/src/main/java/org/opensearch/script/ScriptModule.java index a192e9553016b..c83a6e64d53eb 100644 --- a/server/src/main/java/org/opensearch/script/ScriptModule.java +++ b/server/src/main/java/org/opensearch/script/ScriptModule.java @@ -78,7 +78,8 @@ public class ScriptModule { ScriptedMetricAggContexts.MapScript.CONTEXT, ScriptedMetricAggContexts.CombineScript.CONTEXT, ScriptedMetricAggContexts.ReduceScript.CONTEXT, - IntervalFilterScript.CONTEXT + IntervalFilterScript.CONTEXT, + DerivedFieldScript.CONTEXT ).collect(Collectors.toMap(c -> c.name, Function.identity())); } diff --git a/server/src/test/java/org/opensearch/index/query/DerivedFieldQueryTests.java b/server/src/test/java/org/opensearch/index/query/DerivedFieldQueryTests.java new file mode 100644 index 0000000000000..18d117fa8c0f5 --- /dev/null +++ b/server/src/test/java/org/opensearch/index/query/DerivedFieldQueryTests.java @@ -0,0 +1,104 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.index.query; + +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.KeywordField; +import org.apache.lucene.document.TextField; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.store.Directory; +import org.opensearch.common.lucene.Lucene; +import org.opensearch.index.mapper.DerivedFieldValueFetcher; +import org.opensearch.script.DerivedFieldScript; +import org.opensearch.script.Script; +import org.opensearch.search.lookup.LeafSearchLookup; +import org.opensearch.search.lookup.SearchLookup; +import org.opensearch.search.lookup.SourceLookup; +import org.opensearch.test.OpenSearchTestCase; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +public class DerivedFieldQueryTests extends OpenSearchTestCase { + + private static final String[][] raw_requests = new String[][] { + { "40.135.0.0 GET /images/hm_bg.jpg HTTP/1.0", "200", "40.135.0.0" }, + { "232.0.0.0 GET /images/hm_bg.jpg HTTP/1.0", "400", "232.0.0.0" }, + { "26.1.0.0 GET /images/hm_bg.jpg HTTP/1.0", "200", "26.1.0.0" }, + { "247.37.0.0 GET /french/splash_inet.html HTTP/1.0", "400", "247.37.0.0" }, + { "247.37.0.0 GET /french/splash_inet.html HTTP/1.0", "400", "247.37.0.0" } }; + + public void testDerivedField() throws IOException { + // Create lucene documents + List docs = new ArrayList<>(); + for (String[] request : raw_requests) { + Document document = new Document(); + document.add(new TextField("raw_request", request[0], Field.Store.YES)); + document.add(new KeywordField("status", request[1], Field.Store.YES)); + docs.add(document); + } + + // Mock SearchLookup + SearchLookup searchLookup = mock(SearchLookup.class); + SourceLookup sourceLookup = new SourceLookup(); + LeafSearchLookup leafLookup = mock(LeafSearchLookup.class); + when(leafLookup.source()).thenReturn(sourceLookup); + + // Mock DerivedFieldScript.Factory + DerivedFieldScript.Factory factory = (params, lookup) -> (DerivedFieldScript.LeafFactory) ctx -> { + when(searchLookup.getLeafSearchLookup(ctx)).thenReturn(leafLookup); + return new DerivedFieldScript(params, lookup, ctx) { + @Override + public Object execute() { + return raw_requests[sourceLookup.docId()][2]; + } + }; + }; + + // Create ValueFetcher from mocked DerivedFieldScript.Factory + DerivedFieldScript.LeafFactory leafFactory = factory.newFactory((new Script("")).getParams(), searchLookup); + DerivedFieldValueFetcher valueFetcher = new DerivedFieldValueFetcher(leafFactory); + + // Create DerivedFieldQuery + DerivedFieldQuery derivedFieldQuery = new DerivedFieldQuery( + new TermQuery(new Term("ip_from_raw_request", "247.37.0.0")), + valueFetcher, + searchLookup, + (o -> new KeywordField("ip_from_raw_request", (String) o, Field.Store.NO)), + Lucene.STANDARD_ANALYZER + ); + + // Index and Search + + try (Directory dir = newDirectory()) { + IndexWriter iw = new IndexWriter(dir, new IndexWriterConfig(Lucene.STANDARD_ANALYZER)); + for (Document d : docs) { + iw.addDocument(d); + } + try (IndexReader reader = DirectoryReader.open(iw)) { + iw.close(); + IndexSearcher searcher = new IndexSearcher(reader); + TopDocs topDocs = searcher.search(derivedFieldQuery, 10); + assertEquals(2, topDocs.totalHits.value); + } + } + } +} diff --git a/test/framework/src/main/java/org/opensearch/script/MockScriptEngine.java b/test/framework/src/main/java/org/opensearch/script/MockScriptEngine.java index 83b245a1bcecb..8c7e9718eb0cd 100644 --- a/test/framework/src/main/java/org/opensearch/script/MockScriptEngine.java +++ b/test/framework/src/main/java/org/opensearch/script/MockScriptEngine.java @@ -281,7 +281,22 @@ public double execute(Map params1, double[] values) { } else if (context.instanceClazz.equals(IntervalFilterScript.class)) { IntervalFilterScript.Factory factory = mockCompiled::createIntervalFilterScript; return context.factoryClazz.cast(factory); + } else if (context.instanceClazz.equals(DerivedFieldScript.class)) { + DerivedFieldScript.Factory factory = (derivedFieldsParams, lookup) -> ctx -> new DerivedFieldScript( + derivedFieldsParams, + lookup, + ctx + ) { + @Override + public Object execute() { + Map vars = new HashMap<>(derivedFieldsParams); + vars.put("params", derivedFieldsParams); + return script.apply(vars); + } + }; + return context.factoryClazz.cast(factory); } + ContextCompiler compiler = contexts.get(context); if (compiler != null) { return context.factoryClazz.cast(compiler.compile(script::apply, params));