[DerivedFields] DerivedFieldScript and query execution logic (#12746) (…

…#12968) First in a series of commits to support derived fields, a form of schema-on-read. This commit adds: 1. DerivedFieldScript factory: This script factory will be used to execute scripts defined against derived fields of any type. 2. DerivedFieldValueFetcher: The value fetcher contains logic to execute script and fetch the value in form of List<Object>. It expects DerivedFieldScript.LeafFactory as an input and sets the contract with consumer to call setNextReader() whenever a segment is switched. 3. DerivedFieldQuery: This query will be used by any of the derived fields. It expects an input query and DerivedFieldValueFetcher. It uses 2-phase iterator approach with approximation iterator set to match all docs. On a match, it creates a lucene MemoryIndex for a given doc, fetches the value of the derived field from _source using DerivedFieldValueFetcher and executes the input query against. --------- (cherry picked from commit 70711cf) Signed-off-by: Rishabh Maurya <[email protected]> Signed-off-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com> Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
opensearch-project · Apr 2, 2024 · 993927a · 993927a
1 parent 6919ee1
commit 993927a
Show file tree

Hide file tree

Showing 7 changed files with 417 additions and 2 deletions.
diff --git a/...s/lang-painless/src/yamlRestTest/resources/rest-api-spec/test/painless/71_context_api.yml b/...s/lang-painless/src/yamlRestTest/resources/rest-api-spec/test/painless/71_context_api.yml
@@ -2,7 +2,7 @@
     - do:
         scripts_painless_context: {}
     - match: { contexts.0: aggregation_selector}
-    - match: { contexts.23: update}
+    - match: { contexts.24: update}
 ---
 
 "Action to get all API values for score context":

diff --git a/server/src/main/java/org/opensearch/index/mapper/DerivedFieldValueFetcher.java b/server/src/main/java/org/opensearch/index/mapper/DerivedFieldValueFetcher.java
@@ -0,0 +1,45 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+
+package org.opensearch.index.mapper;
+
+import org.apache.lucene.index.LeafReaderContext;
+import org.opensearch.script.DerivedFieldScript;
+import org.opensearch.search.lookup.SourceLookup;
+
+import java.io.IOException;
+import java.util.List;
+
+/**
+ * The value fetcher contains logic to execute script and fetch the value in form of list of object.
+ * It expects DerivedFieldScript.LeafFactory as an input and sets the contract with consumer to call
+ * {@link #setNextReader(LeafReaderContext)} whenever a segment is switched.
+ */
+public final class DerivedFieldValueFetcher implements ValueFetcher {
+    private DerivedFieldScript derivedFieldScript;
+    private final DerivedFieldScript.LeafFactory derivedFieldScriptFactory;
+
+    public DerivedFieldValueFetcher(DerivedFieldScript.LeafFactory derivedFieldScriptFactory) {
+        this.derivedFieldScriptFactory = derivedFieldScriptFactory;
+    }
+
+    @Override
+    public List<Object> fetchValues(SourceLookup lookup) {
+        derivedFieldScript.setDocument(lookup.docId());
+        // TODO: remove List.of() when derivedFieldScript.execute() returns list of objects.
+        return List.of(derivedFieldScript.execute());
+    }
+
+    public void setNextReader(LeafReaderContext context) {
+        try {
+            derivedFieldScript = derivedFieldScriptFactory.newInstance(context);
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+    }
+}
diff --git a/server/src/main/java/org/opensearch/index/query/DerivedFieldQuery.java b/server/src/main/java/org/opensearch/index/query/DerivedFieldQuery.java
@@ -0,0 +1,146 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+
+package org.opensearch.index.query;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.index.IndexableField;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.memory.MemoryIndex;
+import org.apache.lucene.search.ConstantScoreScorer;
+import org.apache.lucene.search.ConstantScoreWeight;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.QueryVisitor;
+import org.apache.lucene.search.ScoreMode;
+import org.apache.lucene.search.Scorer;
+import org.apache.lucene.search.TwoPhaseIterator;
+import org.apache.lucene.search.Weight;
+import org.opensearch.index.mapper.DerivedFieldValueFetcher;
+import org.opensearch.search.lookup.LeafSearchLookup;
+import org.opensearch.search.lookup.SearchLookup;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Objects;
+import java.util.function.Function;
+
+/**
+ * DerivedFieldQuery used for querying derived fields. It contains the logic to execute an input lucene query against
+ * DerivedField. It also accepts DerivedFieldValueFetcher and SearchLookup as an input.
+ */
+public final class DerivedFieldQuery extends Query {
+    private final Query query;
+    private final DerivedFieldValueFetcher valueFetcher;
+    private final SearchLookup searchLookup;
+    private final Function<Object, IndexableField> indexableFieldGenerator;
+    private final Analyzer indexAnalyzer;
+
+    /**
+     * @param query lucene query to be executed against the derived field
+     * @param valueFetcher DerivedFieldValueFetcher ValueFetcher to fetch the value of a derived field from _source
+     *                     using LeafSearchLookup
+     * @param searchLookup SearchLookup to get the LeafSearchLookup look used by valueFetcher to fetch the _source
+     * @param indexableFieldGenerator used to generate lucene IndexableField from a given object fetched by valueFetcher
+     *                                to be used in lucene memory index.
+     */
+    public DerivedFieldQuery(
+        Query query,
+        DerivedFieldValueFetcher valueFetcher,
+        SearchLookup searchLookup,
+        Function<Object, IndexableField> indexableFieldGenerator,
+        Analyzer indexAnalyzer
+    ) {
+        this.query = query;
+        this.valueFetcher = valueFetcher;
+        this.searchLookup = searchLookup;
+        this.indexableFieldGenerator = indexableFieldGenerator;
+        this.indexAnalyzer = indexAnalyzer;
+    }
+
+    @Override
+    public void visit(QueryVisitor visitor) {
+        query.visit(visitor);
+    }
+
+    @Override
+    public Query rewrite(IndexSearcher indexSearcher) throws IOException {
+        Query rewritten = indexSearcher.rewrite(query);
+        if (rewritten == query) {
+            return this;
+        }
+        return new DerivedFieldQuery(rewritten, valueFetcher, searchLookup, indexableFieldGenerator, indexAnalyzer);
+    }
+
+    @Override
+    public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
+
+        return new ConstantScoreWeight(this, boost) {
+            @Override
+            public Scorer scorer(LeafReaderContext context) {
+                DocIdSetIterator approximation = DocIdSetIterator.all(context.reader().maxDoc());
+                valueFetcher.setNextReader(context);
+                LeafSearchLookup leafSearchLookup = searchLookup.getLeafSearchLookup(context);
+                TwoPhaseIterator twoPhase = new TwoPhaseIterator(approximation) {
+                    @Override
+                    public boolean matches() {
+                        leafSearchLookup.source().setSegmentAndDocument(context, approximation.docID());
+                        List<Object> values = valueFetcher.fetchValues(leafSearchLookup.source());
+                        // TODO: in case of errors from script, should it be ignored and treated as missing field
+                        // by using a configurable setting?
+                        MemoryIndex memoryIndex = new MemoryIndex();
+                        for (Object value : values) {
+                            memoryIndex.addField(indexableFieldGenerator.apply(value), indexAnalyzer);
+                        }
+                        float score = memoryIndex.search(query);
+                        return score > 0.0f;
+                    }
+
+                    @Override
+                    public float matchCost() {
+                        // TODO: how can we compute this?
+                        return 1000f;
+                    }
+                };
+                return new ConstantScoreScorer(this, score(), scoreMode, twoPhase);
+            }
+
+            @Override
+            public boolean isCacheable(LeafReaderContext ctx) {
+                return false;
+            }
+        };
+    }
+
+    @Override
+    public boolean equals(Object o) {
+        if (this == o) {
+            return true;
+        }
+        if (sameClassAs(o) == false) {
+            return false;
+        }
+        DerivedFieldQuery other = (DerivedFieldQuery) o;
+        return Objects.equals(this.query, other.query)
+            && Objects.equals(this.valueFetcher, other.valueFetcher)
+            && Objects.equals(this.searchLookup, other.searchLookup)
+            && Objects.equals(this.indexableFieldGenerator, other.indexableFieldGenerator)
+            && Objects.equals(this.indexAnalyzer, other.indexAnalyzer);
+    }
+
+    @Override
+    public int hashCode() {
+        return Objects.hash(classHash(), query, valueFetcher, searchLookup, indexableFieldGenerator, indexableFieldGenerator);
+    }
+
+    @Override
+    public String toString(String f) {
+        return "DerivedFieldQuery (Query: [ " + query.toString(f) + "])";
+    }
+}
diff --git a/server/src/main/java/org/opensearch/script/DerivedFieldScript.java b/server/src/main/java/org/opensearch/script/DerivedFieldScript.java
@@ -0,0 +1,104 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+
+package org.opensearch.script;
+
+import org.apache.lucene.index.LeafReaderContext;
+import org.opensearch.common.logging.DeprecationLogger;
+import org.opensearch.index.fielddata.ScriptDocValues;
+import org.opensearch.search.lookup.LeafSearchLookup;
+import org.opensearch.search.lookup.SearchLookup;
+import org.opensearch.search.lookup.SourceLookup;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.function.Function;
+
+/**
+ * Definition of Script for DerivedField.
+ * It will be used to execute scripts defined against derived fields of any type
+ *
+ * @opensearch.internal
+ */
+public abstract class DerivedFieldScript {
+
+    public static final String[] PARAMETERS = {};
+    public static final ScriptContext<Factory> CONTEXT = new ScriptContext<>("derived_field", Factory.class);
+    private static final DeprecationLogger deprecationLogger = DeprecationLogger.getLogger(DynamicMap.class);
+
+    private static final Map<String, Function<Object, Object>> PARAMS_FUNCTIONS = Map.of(
+        "doc",
+        value -> value,
+        "_source",
+        value -> ((SourceLookup) value).loadSourceIfNeeded()
+    );
+
+    /**
+     * The generic runtime parameters for the script.
+     */
+    private final Map<String, Object> params;
+
+    /**
+     * A leaf lookup for the bound segment this script will operate on.
+     */
+    private final LeafSearchLookup leafLookup;
+
+    public DerivedFieldScript(Map<String, Object> params, SearchLookup lookup, LeafReaderContext leafContext) {
+        Map<String, Object> parameters = new HashMap<>(params);
+        this.leafLookup = lookup.getLeafSearchLookup(leafContext);
+        parameters.putAll(leafLookup.asMap());
+        this.params = new DynamicMap(parameters, PARAMS_FUNCTIONS);
+    }
+
+    protected DerivedFieldScript() {
+        params = null;
+        leafLookup = null;
+    }
+
+    /**
+     * Return the parameters for this script.
+     */
+    public Map<String, Object> getParams() {
+        return params;
+    }
+
+    /**
+     * The doc lookup for the Lucene segment this script was created for.
+     */
+    public Map<String, ScriptDocValues<?>> getDoc() {
+        return leafLookup.doc();
+    }
+
+    /**
+     * Set the current document to run the script on next.
+     */
+    public void setDocument(int docid) {
+        leafLookup.setDocument(docid);
+    }
+
+    public abstract Object execute();
+
+    /**
+     * A factory to construct {@link DerivedFieldScript} instances.
+     *
+     * @opensearch.internal
+     */
+    public interface LeafFactory {
+        DerivedFieldScript newInstance(LeafReaderContext ctx) throws IOException;
+    }
+
+    /**
+     * A factory to construct stateful {@link DerivedFieldScript} factories for a specific index.
+     *
+     * @opensearch.internal
+     */
+    public interface Factory extends ScriptFactory {
+        LeafFactory newFactory(Map<String, Object> params, SearchLookup lookup);
+    }
+}
diff --git a/server/src/main/java/org/opensearch/script/ScriptModule.java b/server/src/main/java/org/opensearch/script/ScriptModule.java
@@ -78,7 +78,8 @@ public class ScriptModule {
             ScriptedMetricAggContexts.MapScript.CONTEXT,
             ScriptedMetricAggContexts.CombineScript.CONTEXT,
             ScriptedMetricAggContexts.ReduceScript.CONTEXT,
-            IntervalFilterScript.CONTEXT
+            IntervalFilterScript.CONTEXT,
+            DerivedFieldScript.CONTEXT
         ).collect(Collectors.toMap(c -> c.name, Function.identity()));
     }