From e1304593b2115e4b233b8287a564d245c1f381eb Mon Sep 17 00:00:00 2001 From: Jim Ferenczi Date: Wed, 4 Dec 2024 22:55:29 +0000 Subject: [PATCH] Add option to store `sparse_vector` outside `_source` (#117917) (#118018) This PR introduces an option for `sparse_vector` to store its values separately from `_source` by using term vectors. This capability is primarily needed by the semantic text field. --- docs/changelog/117917.yaml | 5 + .../mapping/types/sparse-vector.asciidoc | 17 ++ .../test/search.vectors/90_sparse_vector.yml | 117 ++++++++++++ .../index/mapper/MapperFeatures.java | 4 +- .../vectors/SparseVectorFieldMapper.java | 155 ++++++++++++++- .../index/mapper/vectors/XFeatureField.java | 177 ++++++++++++++++++ .../vectors/SparseVectorFieldMapperTests.java | 135 +++++++++++-- .../vectors/SparseVectorFieldTypeTests.java | 4 +- .../mapper/SemanticTextFieldMapperTests.java | 4 +- 9 files changed, 589 insertions(+), 29 deletions(-) create mode 100644 docs/changelog/117917.yaml create mode 100644 server/src/main/java/org/elasticsearch/index/mapper/vectors/XFeatureField.java diff --git a/docs/changelog/117917.yaml b/docs/changelog/117917.yaml new file mode 100644 index 000000000000..b6dc90f6b903 --- /dev/null +++ b/docs/changelog/117917.yaml @@ -0,0 +1,5 @@ +pr: 117917 +summary: Add option to store `sparse_vector` outside `_source` +area: Mapping +type: feature +issues: [] diff --git a/docs/reference/mapping/types/sparse-vector.asciidoc b/docs/reference/mapping/types/sparse-vector.asciidoc index b24f65fcf97c..22d4644ede49 100644 --- a/docs/reference/mapping/types/sparse-vector.asciidoc +++ b/docs/reference/mapping/types/sparse-vector.asciidoc @@ -26,6 +26,23 @@ PUT my-index See <<semantic-search-elser, semantic search with ELSER>> for a complete example on adding documents to a `sparse_vector` mapped field using ELSER. 
+[[sparse-vectors-params]] +==== Parameters for `sparse_vector` fields + +The following parameters are accepted by `sparse_vector` fields: + +[horizontal] + +<<mapping-store,store>>:: + +Indicates whether the field value should be stored and retrievable independently of the <<mapping-source-field,_source>> field. +Accepted values: true or false (default). +The field's data is stored using term vectors, a disk-efficient structure compared to the original JSON input. +The input map can be retrieved during a search request via the <<search-fields-param,`fields` parameter>>. +To benefit from reduced disk usage, you must either: + * Exclude the field from <<source-filtering,_source>>. + * Use <<synthetic-source,synthetic `_source`>>. + [[index-multi-value-sparse-vectors]] ==== Multi-value sparse vectors diff --git a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/90_sparse_vector.yml b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/90_sparse_vector.yml index 2505e6d7e353..0b65a69bf500 100644 --- a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/90_sparse_vector.yml +++ b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/90_sparse_vector.yml @@ -472,3 +472,120 @@ - match: _source.ml.tokens: {} + +--- +"stored sparse_vector": + + - requires: + cluster_features: [ "mapper.sparse_vector.store_support" ] + reason: "sparse_vector supports store parameter" + + - do: + indices.create: + index: test + body: + mappings: + properties: + ml.tokens: + type: sparse_vector + store: true + + - match: { acknowledged: true } + - do: + index: + index: test + id: "1" + body: + ml: + tokens: + running: 2 + good: 3 + run: 5 + race: 7 + for: 9 + + - match: { result: "created" } + + - do: + indices.refresh: { } + + - do: + search: + index: test + body: + fields: [ "ml.tokens" ] + + - length: { hits.hits.0.fields.ml\\.tokens: 1 } + - length: { hits.hits.0.fields.ml\\.tokens.0: 5 } + - match: { hits.hits.0.fields.ml\\.tokens.0.running: 2.0 } + - match: { hits.hits.0.fields.ml\\.tokens.0.good: 3.0 } + - match: { 
hits.hits.0.fields.ml\\.tokens.0.run: 5.0 } + - match: { hits.hits.0.fields.ml\\.tokens.0.race: 7.0 } + - match: { hits.hits.0.fields.ml\\.tokens.0.for: 9.0 } + +--- +"stored sparse_vector synthetic source": + + - requires: + cluster_features: [ "mapper.source.mode_from_index_setting", "mapper.sparse_vector.store_support" ] + reason: "sparse_vector supports store parameter" + + - do: + indices.create: + index: test + body: + settings: + index: + mapping.source.mode: synthetic + mappings: + properties: + ml.tokens: + type: sparse_vector + store: true + + - match: { acknowledged: true } + + - do: + index: + index: test + id: "1" + body: + ml: + tokens: + running: 2 + good: 3 + run: 5 + race: 7 + for: 9 + + - match: { result: "created" } + + - do: + indices.refresh: { } + + - do: + search: + index: test + body: + fields: [ "ml.tokens" ] + + - match: + hits.hits.0._source: { + ml: { + tokens: { + running: 2.0, + good: 3.0, + run: 5.0, + race: 7.0, + for: 9.0 + } + } + } + + - length: { hits.hits.0.fields.ml\\.tokens: 1 } + - length: { hits.hits.0.fields.ml\\.tokens.0: 5 } + - match: { hits.hits.0.fields.ml\\.tokens.0.running: 2.0 } + - match: { hits.hits.0.fields.ml\\.tokens.0.good: 3.0 } + - match: { hits.hits.0.fields.ml\\.tokens.0.run: 5.0 } + - match: { hits.hits.0.fields.ml\\.tokens.0.race: 7.0 } + - match: { hits.hits.0.fields.ml\\.tokens.0.for: 9.0 } diff --git a/server/src/main/java/org/elasticsearch/index/mapper/MapperFeatures.java b/server/src/main/java/org/elasticsearch/index/mapper/MapperFeatures.java index 1f310287ef74..193312477dd0 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/MapperFeatures.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/MapperFeatures.java @@ -56,6 +56,7 @@ public Set getFeatures() { ); public static final NodeFeature META_FETCH_FIELDS_ERROR_CODE_CHANGED = new NodeFeature("meta_fetch_fields_error_code_changed"); + public static final NodeFeature SPARSE_VECTOR_STORE_SUPPORT = new 
NodeFeature("mapper.sparse_vector.store_support"); @Override public Set getTestFeatures() { @@ -68,7 +69,8 @@ public Set getTestFeatures() { MapperService.LOGSDB_DEFAULT_IGNORE_DYNAMIC_BEYOND_LIMIT, DocumentParser.FIX_PARSING_SUBOBJECTS_FALSE_DYNAMIC_FALSE, CONSTANT_KEYWORD_SYNTHETIC_SOURCE_WRITE_FIX, - META_FETCH_FIELDS_ERROR_CODE_CHANGED + META_FETCH_FIELDS_ERROR_CODE_CHANGED, + SPARSE_VECTOR_STORE_SUPPORT ); } } diff --git a/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java index d0a8dfae4f24..62740b0fc380 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java @@ -11,6 +11,12 @@ import org.apache.lucene.document.FeatureField; import org.apache.lucene.index.IndexableField; +import org.apache.lucene.index.LeafReader; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.PostingsEnum; +import org.apache.lucene.index.TermVectors; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.MatchNoDocsQuery; import org.apache.lucene.search.Query; import org.apache.lucene.util.BytesRef; @@ -25,14 +31,22 @@ import org.elasticsearch.index.mapper.FieldMapper; import org.elasticsearch.index.mapper.MappedFieldType; import org.elasticsearch.index.mapper.MapperBuilderContext; +import org.elasticsearch.index.mapper.SourceLoader; import org.elasticsearch.index.mapper.SourceValueFetcher; import org.elasticsearch.index.mapper.TextSearchInfo; import org.elasticsearch.index.mapper.ValueFetcher; import org.elasticsearch.index.query.SearchExecutionContext; +import org.elasticsearch.search.fetch.StoredFieldsSpec; +import org.elasticsearch.search.lookup.Source; +import org.elasticsearch.xcontent.XContentBuilder; import 
org.elasticsearch.xcontent.XContentParser.Token; import java.io.IOException; +import java.io.UncheckedIOException; +import java.util.LinkedHashMap; +import java.util.List; import java.util.Map; +import java.util.stream.Stream; import static org.elasticsearch.index.query.AbstractQueryBuilder.DEFAULT_BOOST; @@ -52,8 +66,12 @@ public class SparseVectorFieldMapper extends FieldMapper { static final IndexVersion NEW_SPARSE_VECTOR_INDEX_VERSION = IndexVersions.NEW_SPARSE_VECTOR; static final IndexVersion SPARSE_VECTOR_IN_FIELD_NAMES_INDEX_VERSION = IndexVersions.SPARSE_VECTOR_IN_FIELD_NAMES_SUPPORT; - public static class Builder extends FieldMapper.Builder { + private static SparseVectorFieldMapper toType(FieldMapper in) { + return (SparseVectorFieldMapper) in; + } + public static class Builder extends FieldMapper.Builder { + private final Parameter stored = Parameter.storeParam(m -> toType(m).fieldType().isStored(), false); private final Parameter> meta = Parameter.metaParam(); public Builder(String name) { @@ -62,14 +80,14 @@ public Builder(String name) { @Override protected Parameter[] getParameters() { - return new Parameter[] { meta }; + return new Parameter[] { stored, meta }; } @Override public SparseVectorFieldMapper build(MapperBuilderContext context) { return new SparseVectorFieldMapper( leafName(), - new SparseVectorFieldType(context.buildFullName(leafName()), meta.getValue()), + new SparseVectorFieldType(context.buildFullName(leafName()), stored.getValue(), meta.getValue()), builderParams(this, context) ); } @@ -87,8 +105,8 @@ public SparseVectorFieldMapper build(MapperBuilderContext context) { public static final class SparseVectorFieldType extends MappedFieldType { - public SparseVectorFieldType(String name, Map meta) { - super(name, true, false, false, TextSearchInfo.SIMPLE_MATCH_ONLY, meta); + public SparseVectorFieldType(String name, boolean isStored, Map meta) { + super(name, true, isStored, false, TextSearchInfo.SIMPLE_MATCH_ONLY, meta); } @Override @@ 
-103,6 +121,9 @@ public IndexFieldData.Builder fielddataBuilder(FieldDataContext fieldDataContext @Override public ValueFetcher valueFetcher(SearchExecutionContext context, String format) { + if (isStored()) { + return new SparseVectorValueFetcher(name()); + } return SourceValueFetcher.identity(name(), context, format); } @@ -135,6 +156,14 @@ private SparseVectorFieldMapper(String simpleName, MappedFieldType mappedFieldTy super(simpleName, mappedFieldType, builderParams); } + @Override + protected SyntheticSourceSupport syntheticSourceSupport() { + if (fieldType().isStored()) { + return new SyntheticSourceSupport.Native(new SparseVectorSyntheticFieldLoader(fullPath(), leafName())); + } + return super.syntheticSourceSupport(); + } + @Override public Map indexAnalyzers() { return Map.of(mappedFieldType.name(), Lucene.KEYWORD_ANALYZER); @@ -189,9 +218,9 @@ public void parse(DocumentParserContext context) throws IOException { // based on recommendations from this paper: https://arxiv.org/pdf/2305.18494.pdf IndexableField currentField = context.doc().getByKey(key); if (currentField == null) { - context.doc().addWithKey(key, new FeatureField(fullPath(), feature, value)); - } else if (currentField instanceof FeatureField && ((FeatureField) currentField).getFeatureValue() < value) { - ((FeatureField) currentField).setFeatureValue(value); + context.doc().addWithKey(key, new XFeatureField(fullPath(), feature, value, fieldType().isStored())); + } else if (currentField instanceof XFeatureField && ((XFeatureField) currentField).getFeatureValue() < value) { + ((XFeatureField) currentField).setFeatureValue(value); } } else { throw new IllegalArgumentException( @@ -219,4 +248,114 @@ protected String contentType() { return CONTENT_TYPE; } + private static class SparseVectorValueFetcher implements ValueFetcher { + private final String fieldName; + private TermVectors termVectors; + + private SparseVectorValueFetcher(String fieldName) { + this.fieldName = fieldName; + } + + @Override 
+ public void setNextReader(LeafReaderContext context) { + try { + termVectors = context.reader().termVectors(); + } catch (IOException exc) { + throw new UncheckedIOException(exc); + } + } + + @Override + public List<Object> fetchValues(Source source, int doc, List<Object> ignoredValues) throws IOException { + if (termVectors == null) { + return List.of(); + } + var terms = termVectors.get(doc, fieldName); + if (terms == null) { + return List.of(); + } + + var termsEnum = terms.iterator(); + PostingsEnum postingsScratch = null; + Map<String, Float> result = new LinkedHashMap<>(); + while (termsEnum.next() != null) { + postingsScratch = termsEnum.postings(postingsScratch); + postingsScratch.nextDoc(); + result.put(termsEnum.term().utf8ToString(), XFeatureField.decodeFeatureValue(postingsScratch.freq())); + assert postingsScratch.nextDoc() == DocIdSetIterator.NO_MORE_DOCS; + } + return List.of(result); + } + + @Override + public StoredFieldsSpec storedFieldsSpec() { + return StoredFieldsSpec.NO_REQUIREMENTS; + } + } + + private static class SparseVectorSyntheticFieldLoader implements SourceLoader.SyntheticFieldLoader { + private final String fullPath; + private final String leafName; + + private TermsEnum termsDocEnum; + + private SparseVectorSyntheticFieldLoader(String fullPath, String leafName) { + this.fullPath = fullPath; + this.leafName = leafName; + } + + @Override + public Stream<Map.Entry<String, StoredFieldLoader>> storedFieldLoaders() { + return Stream.of(); + } + + @Override + public DocValuesLoader docValuesLoader(LeafReader leafReader, int[] docIdsInLeaf) throws IOException { + var fieldInfos = leafReader.getFieldInfos().fieldInfo(fullPath); + if (fieldInfos == null || fieldInfos.hasVectors() == false) { + return null; + } + return docId -> { + var terms = leafReader.termVectors().get(docId, fullPath); + if (terms == null) { + return false; + } + termsDocEnum = terms.iterator(); + if (termsDocEnum.next() == null) { + termsDocEnum = null; + return false; + } + return true; + }; + } + + @Override + public boolean 
hasValue() { + return termsDocEnum != null; + } + + @Override + public void write(XContentBuilder b) throws IOException { + assert termsDocEnum != null; + PostingsEnum reuse = null; + b.startObject(leafName); + do { + reuse = termsDocEnum.postings(reuse); + reuse.nextDoc(); + b.field(termsDocEnum.term().utf8ToString(), XFeatureField.decodeFeatureValue(reuse.freq())); + } while (termsDocEnum.next() != null); + b.endObject(); + } + + @Override + public String fieldName() { + return leafName; + } + + @Override + public void reset() { + termsDocEnum = null; + } + } + } diff --git a/server/src/main/java/org/elasticsearch/index/mapper/vectors/XFeatureField.java b/server/src/main/java/org/elasticsearch/index/mapper/vectors/XFeatureField.java new file mode 100644 index 000000000000..5f4afb4a86ac --- /dev/null +++ b/server/src/main/java/org/elasticsearch/index/mapper/vectors/XFeatureField.java @@ -0,0 +1,177 @@ +/* + * @notice + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.elasticsearch.index.mapper.vectors; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute; +import org.apache.lucene.document.FeatureField; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.FieldType; +import org.apache.lucene.index.IndexOptions; + +/** + * This class is forked from the Lucene {@link FeatureField} implementation to enable support for storing term vectors. + * It should be removed once apache/lucene#14034 becomes available. + */ +public final class XFeatureField extends Field { + private static final FieldType FIELD_TYPE = new FieldType(); + private static final FieldType FIELD_TYPE_STORE_TERM_VECTORS = new FieldType(); + + static { + FIELD_TYPE.setTokenized(false); + FIELD_TYPE.setOmitNorms(true); + FIELD_TYPE.setIndexOptions(IndexOptions.DOCS_AND_FREQS); + + FIELD_TYPE_STORE_TERM_VECTORS.setTokenized(false); + FIELD_TYPE_STORE_TERM_VECTORS.setOmitNorms(true); + FIELD_TYPE_STORE_TERM_VECTORS.setIndexOptions(IndexOptions.DOCS_AND_FREQS); + FIELD_TYPE_STORE_TERM_VECTORS.setStoreTermVectors(true); + } + + private float featureValue; + + /** + * Create a feature. + * + * @param fieldName The name of the field to store the information into. All features may be + * stored in the same field. + * @param featureName The name of the feature, eg. 'pagerank`. It will be indexed as a term. + * @param featureValue The value of the feature, must be a positive, finite, normal float. + */ + public XFeatureField(String fieldName, String featureName, float featureValue) { + this(fieldName, featureName, featureValue, false); + } + + /** + * Create a feature. + * + * @param fieldName The name of the field to store the information into. All features may be + * stored in the same field. + * @param featureName The name of the feature, eg. 
'pagerank`. It will be indexed as a term. + * @param featureValue The value of the feature, must be a positive, finite, normal float. + */ + public XFeatureField(String fieldName, String featureName, float featureValue, boolean storeTermVectors) { + super(fieldName, featureName, storeTermVectors ? FIELD_TYPE_STORE_TERM_VECTORS : FIELD_TYPE); + setFeatureValue(featureValue); + } + + /** + * Update the feature value of this field. + */ + public void setFeatureValue(float featureValue) { + if (Float.isFinite(featureValue) == false) { + throw new IllegalArgumentException( + "featureValue must be finite, got: " + featureValue + " for feature " + fieldsData + " on field " + name + ); + } + if (featureValue < Float.MIN_NORMAL) { + throw new IllegalArgumentException( + "featureValue must be a positive normal float, got: " + + featureValue + + " for feature " + + fieldsData + + " on field " + + name + + " which is less than the minimum positive normal float: " + + Float.MIN_NORMAL + ); + } + this.featureValue = featureValue; + } + + @Override + public TokenStream tokenStream(Analyzer analyzer, TokenStream reuse) { + FeatureTokenStream stream; + if (reuse instanceof FeatureTokenStream) { + stream = (FeatureTokenStream) reuse; + } else { + stream = new FeatureTokenStream(); + } + + int freqBits = Float.floatToIntBits(featureValue); + stream.setValues((String) fieldsData, freqBits >>> 15); + return stream; + } + + /** + * This is useful if you have multiple features sharing a name and you want to take action to + * deduplicate them. + * + * @return the feature value of this field. 
+ */ + public float getFeatureValue() { + return featureValue; + } + + private static final class FeatureTokenStream extends TokenStream { + private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class); + private final TermFrequencyAttribute freqAttribute = addAttribute(TermFrequencyAttribute.class); + private boolean used = true; + private String value = null; + private int freq = 0; + + private FeatureTokenStream() {} + + /** + * Sets the values + */ + void setValues(String value, int freq) { + this.value = value; + this.freq = freq; + } + + @Override + public boolean incrementToken() { + if (used) { + return false; + } + clearAttributes(); + termAttribute.append(value); + freqAttribute.setTermFrequency(freq); + used = true; + return true; + } + + @Override + public void reset() { + used = false; + } + + @Override + public void close() { + value = null; + } + } + + static final int MAX_FREQ = Float.floatToIntBits(Float.MAX_VALUE) >>> 15; + + static float decodeFeatureValue(float freq) { + if (freq > MAX_FREQ) { + // This is never used in practice but callers of the SimScorer API might + // occasionally call it on eg. Float.MAX_VALUE to compute the max score + // so we need to be consistent. 
+ return Float.MAX_VALUE; + } + int tf = (int) freq; // lossless + int featureBits = tf << 15; + return Float.intBitsToFloat(featureBits); + } +} diff --git a/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java b/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java index 8caab46ef33a..2bb351ae6495 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java @@ -11,19 +11,26 @@ import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute; -import org.apache.lucene.document.FeatureField; +import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexableField; +import org.apache.lucene.index.LeafReader; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.store.Directory; +import org.apache.lucene.tests.index.RandomIndexWriter; import org.elasticsearch.common.Strings; import org.elasticsearch.common.bytes.BytesReference; +import org.elasticsearch.common.compress.CompressedXContent; import org.elasticsearch.index.IndexVersion; import org.elasticsearch.index.IndexVersions; import org.elasticsearch.index.mapper.DocumentMapper; import org.elasticsearch.index.mapper.DocumentParsingException; import org.elasticsearch.index.mapper.MappedFieldType; import org.elasticsearch.index.mapper.MapperParsingException; +import org.elasticsearch.index.mapper.MapperService; import org.elasticsearch.index.mapper.MapperTestCase; import org.elasticsearch.index.mapper.ParsedDocument; import org.elasticsearch.index.mapper.SourceToParse; +import org.elasticsearch.search.lookup.Source; import org.elasticsearch.test.index.IndexVersionUtils; import org.elasticsearch.xcontent.XContentBuilder; import org.elasticsearch.xcontent.XContentFactory; @@ -33,18 +40,25 @@ import 
java.io.IOException; import java.util.Arrays; +import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import static org.elasticsearch.index.mapper.vectors.SparseVectorFieldMapper.NEW_SPARSE_VECTOR_INDEX_VERSION; import static org.elasticsearch.index.mapper.vectors.SparseVectorFieldMapper.PREVIOUS_SPARSE_VECTOR_INDEX_VERSION; +import static org.elasticsearch.xcontent.XContentFactory.jsonBuilder; import static org.hamcrest.Matchers.containsString; +import static org.hamcrest.Matchers.equalTo; +import static org.hamcrest.Matchers.instanceOf; public class SparseVectorFieldMapperTests extends MapperTestCase { @Override protected Object getSampleValueForDocument() { - return Map.of("ten", 10, "twenty", 20); + Map map = new LinkedHashMap<>(); + map.put("ten", 10f); + map.put("twenty", 20f); + return map; } @Override @@ -92,14 +106,18 @@ public void testDefaults() throws Exception { List fields = doc1.rootDoc().getFields("field"); assertEquals(2, fields.size()); - assertThat(fields.get(0), Matchers.instanceOf(FeatureField.class)); - FeatureField featureField1 = null; - FeatureField featureField2 = null; + if (IndexVersion.current().luceneVersion().major == 10) { + // TODO: Update to use Lucene's FeatureField after upgrading to Lucene 10.1. 
+ assertThat(IndexVersion.current().luceneVersion().minor, equalTo(0)); + } + assertThat(fields.get(0), Matchers.instanceOf(XFeatureField.class)); + XFeatureField featureField1 = null; + XFeatureField featureField2 = null; for (IndexableField field : fields) { if (field.stringValue().equals("ten")) { - featureField1 = (FeatureField) field; + featureField1 = (XFeatureField) field; } else if (field.stringValue().equals("twenty")) { - featureField2 = (FeatureField) field; + featureField2 = (XFeatureField) field; } else { throw new UnsupportedOperationException(); } @@ -116,14 +134,14 @@ public void testDotInFieldName() throws Exception { List fields = parsedDocument.rootDoc().getFields("field"); assertEquals(2, fields.size()); - assertThat(fields.get(0), Matchers.instanceOf(FeatureField.class)); - FeatureField featureField1 = null; - FeatureField featureField2 = null; + assertThat(fields.get(0), Matchers.instanceOf(XFeatureField.class)); + XFeatureField featureField1 = null; + XFeatureField featureField2 = null; for (IndexableField field : fields) { if (field.stringValue().equals("foo.bar")) { - featureField1 = (FeatureField) field; + featureField1 = (XFeatureField) field; } else if (field.stringValue().equals("foobar")) { - featureField2 = (FeatureField) field; + featureField2 = (XFeatureField) field; } else { throw new UnsupportedOperationException(); } @@ -171,13 +189,13 @@ public void testHandlesMultiValuedFields() throws MapperParsingException, IOExce })); // then validate that the generate document stored both values appropriately and we have only the max value stored - FeatureField barField = ((FeatureField) doc1.rootDoc().getByKey("foo.field\\.bar")); + XFeatureField barField = ((XFeatureField) doc1.rootDoc().getByKey("foo.field\\.bar")); assertEquals(20, barField.getFeatureValue(), 1); - FeatureField storedBarField = ((FeatureField) doc1.rootDoc().getFields("foo.field").get(1)); + XFeatureField storedBarField = ((XFeatureField) 
doc1.rootDoc().getFields("foo.field").get(1)); assertEquals(20, storedBarField.getFeatureValue(), 1); - assertEquals(3, doc1.rootDoc().getFields().stream().filter((f) -> f instanceof FeatureField).count()); + assertEquals(3, doc1.rootDoc().getFields().stream().filter((f) -> f instanceof XFeatureField).count()); } public void testCannotBeUsedInMultiFields() { @@ -192,6 +210,53 @@ public void testCannotBeUsedInMultiFields() { assertThat(e.getMessage(), containsString("Field [feature] of type [sparse_vector] can't be used in multifields")); } + public void testStoreIsNotUpdateable() throws IOException { + var mapperService = createMapperService(fieldMapping(this::minimalMapping)); + XContentBuilder mapping = jsonBuilder().startObject() + .startObject("_doc") + .startObject("properties") + .startObject("field") + .field("type", "sparse_vector") + .field("store", true) + .endObject() + .endObject() + .endObject() + .endObject(); + var exc = expectThrows( + Exception.class, + () -> mapperService.merge("_doc", new CompressedXContent(Strings.toString(mapping)), MapperService.MergeReason.MAPPING_UPDATE) + ); + assertThat(exc.getMessage(), containsString("Cannot update parameter [store]")); + } + + @SuppressWarnings("unchecked") + public void testValueFetcher() throws Exception { + for (boolean store : new boolean[] { true, false }) { + var mapperService = createMapperService(fieldMapping(store ? 
this::minimalStoreMapping : this::minimalMapping)); + var mapper = mapperService.documentMapper(); + try (Directory directory = newDirectory()) { + RandomIndexWriter iw = new RandomIndexWriter(random(), directory); + var sourceToParse = source(this::writeField); + ParsedDocument doc1 = mapper.parse(sourceToParse); + iw.addDocument(doc1.rootDoc()); + iw.close(); + try (DirectoryReader reader = wrapInMockESDirectoryReader(DirectoryReader.open(directory))) { + LeafReader leafReader = getOnlyLeafReader(reader); + var searchContext = createSearchExecutionContext(mapperService, new IndexSearcher(leafReader)); + var fieldType = mapper.mappers().getFieldType("field"); + var valueFetcher = fieldType.valueFetcher(searchContext, null); + valueFetcher.setNextReader(leafReader.getContext()); + + var source = Source.fromBytes(sourceToParse.source()); + var result = valueFetcher.fetchValues(source, 0, List.of()); + assertThat(result.size(), equalTo(1)); + assertThat(result.get(0), instanceOf(Map.class)); + assertThat(toFloats((Map) result.get(0)), equalTo(toFloats((Map) source.source().get("field")))); + } + } + } + } + @Override protected Object generateRandomInputValue(MappedFieldType ft) { assumeFalse("Test implemented in a follow up", true); @@ -205,7 +270,29 @@ protected boolean allowsNullValues() { @Override protected SyntheticSourceSupport syntheticSourceSupport(boolean syntheticSource) { - throw new AssumptionViolatedException("not supported"); + boolean withStore = randomBoolean(); + return new SyntheticSourceSupport() { + @Override + public boolean preservesExactSource() { + return withStore == false; + } + + @Override + public SyntheticSourceExample example(int maxValues) { + return new SyntheticSourceExample(getSampleValueForDocument(), getSampleValueForDocument(), b -> { + if (withStore) { + minimalStoreMapping(b); + } else { + minimalMapping(b); + } + }); + } + + @Override + public List invalidExample() { + return List.of(); + } + }; } @Override @@ -276,4 +363,20 @@ 
public void testSparseVectorUnsupportedIndex() throws Exception { }))); assertThat(e.getMessage(), containsString(SparseVectorFieldMapper.ERROR_MESSAGE_8X)); } + + /** + * Handles float/double conversion when reading/writing with xcontent by converting all numbers to floats. + */ + private Map<String, Float> toFloats(Map<String, Object> value) { + // preserve order + Map<String, Float> result = new LinkedHashMap<>(); + for (var entry : value.entrySet()) { + if (entry.getValue() instanceof Number num) { + result.put(entry.getKey(), num.floatValue()); + } else { + throw new IllegalArgumentException("Expected Number, got: " + (entry.getValue() == null ? "null" : entry.getValue().getClass().getSimpleName())); + } + } + return result; + } } diff --git a/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldTypeTests.java b/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldTypeTests.java index 4627d4d87195..0dbe3817c3e8 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldTypeTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldTypeTests.java @@ -18,13 +18,13 @@ public class SparseVectorFieldTypeTests extends FieldTypeTestCase { public void testDocValuesDisabled() { - MappedFieldType fieldType = new SparseVectorFieldMapper.SparseVectorFieldType("field", Collections.emptyMap()); + MappedFieldType fieldType = new SparseVectorFieldMapper.SparseVectorFieldType("field", false, Collections.emptyMap()); assertFalse(fieldType.hasDocValues()); expectThrows(IllegalArgumentException.class, () -> fieldType.fielddataBuilder(FieldDataContext.noRuntimeFields("test"))); } public void testIsNotAggregatable() { - MappedFieldType fieldType = new SparseVectorFieldMapper.SparseVectorFieldType("field", Collections.emptyMap()); + MappedFieldType fieldType = new SparseVectorFieldMapper.SparseVectorFieldType("field", false, Collections.emptyMap()); assertFalse(fieldType.isAggregatable()); } } diff --git 
a/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldMapperTests.java b/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldMapperTests.java index 18096ebee4f0..ef70dbe401e5 100644 --- a/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldMapperTests.java +++ b/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldMapperTests.java @@ -7,7 +7,6 @@ package org.elasticsearch.xpack.inference.mapper; -import org.apache.lucene.document.FeatureField; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfos; import org.apache.lucene.index.IndexableField; @@ -47,6 +46,7 @@ import org.elasticsearch.index.mapper.SourceToParse; import org.elasticsearch.index.mapper.vectors.DenseVectorFieldMapper; import org.elasticsearch.index.mapper.vectors.SparseVectorFieldMapper; +import org.elasticsearch.index.mapper.vectors.XFeatureField; import org.elasticsearch.index.query.SearchExecutionContext; import org.elasticsearch.index.search.ESToParentBlockJoinQuery; import org.elasticsearch.inference.Model; @@ -1130,7 +1130,7 @@ private static void assertChildLeafNestedDocument( private static void assertSparseFeatures(LuceneDocument doc, String fieldName, int expectedCount) { int count = 0; for (IndexableField field : doc.getFields()) { - if (field instanceof FeatureField featureField) { + if (field instanceof XFeatureField featureField) { assertThat(featureField.name(), equalTo(fieldName)); ++count; }