Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add option to store sparse_vector outside _source #117917

Merged
merged 13 commits into from
Dec 4, 2024
Merged
5 changes: 5 additions & 0 deletions docs/changelog/117917.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 117917
summary: Add option to store `sparse_vector` outside `_source`
area: Mapping
type: feature
issues: []
17 changes: 17 additions & 0 deletions docs/reference/mapping/types/sparse-vector.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,23 @@ PUT my-index

See <<semantic-search-elser, semantic search with ELSER>> for a complete example on adding documents to a `sparse_vector` mapped field using ELSER.

[[sparse-vectors-params]]
==== Parameters for `sparse_vector` fields

The following parameters are accepted by `sparse_vector` fields:

[horizontal]

<<mapping-store,store>>::

Indicates whether the field value should be stored and retrievable independently of the <<mapping-source-field,_source>> field.
Accepted values: true or false (default).
The field's data is stored using term vectors, a disk-efficient structure compared to the original JSON input.
The input map can be retrieved during a search request via the <<search-fields-param,`fields` parameter>>.
To benefit from reduced disk usage, you must either:

* Exclude the field from <<source-filtering,`_source`>>.
* Use <<synthetic-source,synthetic `_source`>>.

jimczi marked this conversation as resolved.
Show resolved Hide resolved
[[index-multi-value-sparse-vectors]]
==== Multi-value sparse vectors

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -472,3 +472,120 @@

- match:
_source.ml.tokens: {}

---
"stored sparse_vector":

- requires:
cluster_features: [ "mapper.sparse_vector.store_support" ]
reason: "sparse_vector supports store parameter"

- do:
indices.create:
index: test
body:
mappings:
properties:
ml.tokens:
type: sparse_vector
store: true

- match: { acknowledged: true }
- do:
index:
index: test
id: "1"
body:
ml:
tokens:
running: 2
good: 3
run: 5
race: 7
for: 9

- match: { result: "created" }

- do:
indices.refresh: { }

- do:
search:
index: test
body:
fields: [ "ml.tokens" ]

- length: { hits.hits.0.fields.ml\\.tokens: 1 }
- length: { hits.hits.0.fields.ml\\.tokens.0: 5 }
- match: { hits.hits.0.fields.ml\\.tokens.0.running: 2.0 }
- match: { hits.hits.0.fields.ml\\.tokens.0.good: 3.0 }
- match: { hits.hits.0.fields.ml\\.tokens.0.run: 5.0 }
- match: { hits.hits.0.fields.ml\\.tokens.0.race: 7.0 }
- match: { hits.hits.0.fields.ml\\.tokens.0.for: 9.0 }

---
"stored sparse_vector synthetic source":

- requires:
cluster_features: [ "mapper.source.mode_from_index_setting", "mapper.sparse_vector.store_support" ]
reason: "sparse_vector supports store parameter"

- do:
indices.create:
index: test
body:
settings:
index:
mapping.source.mode: synthetic
mappings:
properties:
ml.tokens:
type: sparse_vector
store: true

- match: { acknowledged: true }

- do:
index:
index: test
id: "1"
body:
ml:
tokens:
running: 2
good: 3
run: 5
race: 7
for: 9

- match: { result: "created" }

- do:
indices.refresh: { }

- do:
search:
index: test
body:
fields: [ "ml.tokens" ]

- match:
hits.hits.0._source: {
ml: {
tokens: {
running: 2.0,
good: 3.0,
run: 5.0,
race: 7.0,
for: 9.0
}
}
}

- length: { hits.hits.0.fields.ml\\.tokens: 1 }
- length: { hits.hits.0.fields.ml\\.tokens.0: 5 }
- match: { hits.hits.0.fields.ml\\.tokens.0.running: 2.0 }
- match: { hits.hits.0.fields.ml\\.tokens.0.good: 3.0 }
- match: { hits.hits.0.fields.ml\\.tokens.0.run: 5.0 }
- match: { hits.hits.0.fields.ml\\.tokens.0.race: 7.0 }
- match: { hits.hits.0.fields.ml\\.tokens.0.for: 9.0 }
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,18 @@

import org.apache.lucene.document.FeatureField;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.TermVectors;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.MatchNoDocsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.logging.DeprecationCategory;
import org.elasticsearch.common.lucene.Lucene;
import org.elasticsearch.features.NodeFeature;
import org.elasticsearch.index.IndexVersion;
import org.elasticsearch.index.IndexVersions;
import org.elasticsearch.index.analysis.NamedAnalyzer;
Expand All @@ -25,14 +32,22 @@
import org.elasticsearch.index.mapper.FieldMapper;
import org.elasticsearch.index.mapper.MappedFieldType;
import org.elasticsearch.index.mapper.MapperBuilderContext;
import org.elasticsearch.index.mapper.SourceLoader;
import org.elasticsearch.index.mapper.SourceValueFetcher;
import org.elasticsearch.index.mapper.TextSearchInfo;
import org.elasticsearch.index.mapper.ValueFetcher;
import org.elasticsearch.index.query.SearchExecutionContext;
import org.elasticsearch.search.fetch.StoredFieldsSpec;
import org.elasticsearch.search.lookup.Source;
import org.elasticsearch.xcontent.XContentBuilder;
import org.elasticsearch.xcontent.XContentParser.Token;

import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Stream;

import static org.elasticsearch.index.query.AbstractQueryBuilder.DEFAULT_BOOST;

Expand All @@ -44,6 +59,8 @@ public class SparseVectorFieldMapper extends FieldMapper {

public static final String CONTENT_TYPE = "sparse_vector";

public static final NodeFeature SPARSE_VECTOR_STORE_SUPPORT = new NodeFeature("mapper.sparse_vector.store_support");
jimczi marked this conversation as resolved.
Show resolved Hide resolved

static final String ERROR_MESSAGE_7X = "[sparse_vector] field type in old 7.x indices is allowed to "
+ "contain [sparse_vector] fields, but they cannot be indexed or searched.";
static final String ERROR_MESSAGE_8X = "The [sparse_vector] field type is not supported on indices created on versions 8.0 to 8.10.";
Expand All @@ -52,8 +69,12 @@ public class SparseVectorFieldMapper extends FieldMapper {
static final IndexVersion NEW_SPARSE_VECTOR_INDEX_VERSION = IndexVersions.NEW_SPARSE_VECTOR;
static final IndexVersion SPARSE_VECTOR_IN_FIELD_NAMES_INDEX_VERSION = IndexVersions.SPARSE_VECTOR_IN_FIELD_NAMES_SUPPORT;

public static class Builder extends FieldMapper.Builder {
private static SparseVectorFieldMapper toType(FieldMapper in) {
return (SparseVectorFieldMapper) in;
}

public static class Builder extends FieldMapper.Builder {
private final Parameter<Boolean> stored = Parameter.storeParam(m -> toType(m).fieldType().isStored(), false);
private final Parameter<Map<String, String>> meta = Parameter.metaParam();

public Builder(String name) {
Expand All @@ -62,14 +83,14 @@ public Builder(String name) {

@Override
protected Parameter<?>[] getParameters() {
return new Parameter<?>[] { meta };
return new Parameter<?>[] { stored, meta };
}

@Override
public SparseVectorFieldMapper build(MapperBuilderContext context) {
return new SparseVectorFieldMapper(
leafName(),
new SparseVectorFieldType(context.buildFullName(leafName()), meta.getValue()),
new SparseVectorFieldType(context.buildFullName(leafName()), stored.getValue(), meta.getValue()),
builderParams(this, context)
);
}
Expand All @@ -87,8 +108,8 @@ public SparseVectorFieldMapper build(MapperBuilderContext context) {

public static final class SparseVectorFieldType extends MappedFieldType {

public SparseVectorFieldType(String name, Map<String, String> meta) {
super(name, true, false, false, TextSearchInfo.SIMPLE_MATCH_ONLY, meta);
public SparseVectorFieldType(String name, boolean isStored, Map<String, String> meta) {
super(name, true, isStored, false, TextSearchInfo.SIMPLE_MATCH_ONLY, meta);
}

@Override
Expand All @@ -103,6 +124,9 @@ public IndexFieldData.Builder fielddataBuilder(FieldDataContext fieldDataContext

@Override
public ValueFetcher valueFetcher(SearchExecutionContext context, String format) {
if (isStored()) {
return new SparseVectorValueFetcher(name());
}
return SourceValueFetcher.identity(name(), context, format);
}

Expand Down Expand Up @@ -135,6 +159,14 @@ private SparseVectorFieldMapper(String simpleName, MappedFieldType mappedFieldTy
super(simpleName, mappedFieldType, builderParams);
}

@Override
protected SyntheticSourceSupport syntheticSourceSupport() {
if (fieldType().isStored()) {
return new SyntheticSourceSupport.Native(new SparseVectorSyntheticFieldLoader(fullPath(), leafName()));
}
return super.syntheticSourceSupport();
}

@Override
public Map<String, NamedAnalyzer> indexAnalyzers() {
return Map.of(mappedFieldType.name(), Lucene.KEYWORD_ANALYZER);
Expand Down Expand Up @@ -189,9 +221,9 @@ public void parse(DocumentParserContext context) throws IOException {
// based on recommendations from this paper: https://arxiv.org/pdf/2305.18494.pdf
IndexableField currentField = context.doc().getByKey(key);
if (currentField == null) {
context.doc().addWithKey(key, new FeatureField(fullPath(), feature, value));
} else if (currentField instanceof FeatureField && ((FeatureField) currentField).getFeatureValue() < value) {
((FeatureField) currentField).setFeatureValue(value);
context.doc().addWithKey(key, new XFeatureField(fullPath(), feature, value, fieldType().isStored()));
} else if (currentField instanceof XFeatureField && ((XFeatureField) currentField).getFeatureValue() < value) {
((XFeatureField) currentField).setFeatureValue(value);
}
} else {
throw new IllegalArgumentException(
Expand Down Expand Up @@ -219,4 +251,114 @@ protected String contentType() {
return CONTENT_TYPE;
}

/**
 * {@link ValueFetcher} for stored {@code sparse_vector} fields. The field's
 * tokens are persisted as term vectors, so fetching a value means re-reading
 * the per-document term vector and decoding each term's frequency back into
 * the original feature weight.
 */
private static class SparseVectorValueFetcher implements ValueFetcher {
    private final String fieldName;
    private TermVectors termVectors;

    private SparseVectorValueFetcher(String fieldName) {
        this.fieldName = fieldName;
    }

    @Override
    public void setNextReader(LeafReaderContext context) {
        try {
            // Grab the term-vectors reader for this segment; fetchValues reads from it per doc.
            this.termVectors = context.reader().termVectors();
        } catch (IOException e) {
            throw new UncheckedIOException(e);
        }
    }

    @Override
    public List<Object> fetchValues(Source source, int doc, List<Object> ignoredValues) throws IOException {
        if (termVectors == null) {
            return List.of();
        }
        var fieldTerms = termVectors.get(doc, fieldName);
        if (fieldTerms == null) {
            // No term vector stored for this document/field: nothing to fetch.
            return List.of();
        }
        // Preserve term order as stored, mapping token -> decoded weight.
        Map<String, Float> tokenWeights = new LinkedHashMap<>();
        var it = fieldTerms.iterator();
        PostingsEnum postings = null;
        for (BytesRef term = it.next(); term != null; term = it.next()) {
            postings = it.postings(postings);
            postings.nextDoc();
            // The feature weight was encoded into the term frequency at index time.
            tokenWeights.put(term.utf8ToString(), XFeatureField.decodeFeatureValue(postings.freq()));
            // A term vector only ever covers this single document.
            assert postings.nextDoc() == DocIdSetIterator.NO_MORE_DOCS;
        }
        return List.of(tokenWeights);
    }

    @Override
    public StoredFieldsSpec storedFieldsSpec() {
        // Values come from term vectors, not stored fields, so nothing extra to load.
        return StoredFieldsSpec.NO_REQUIREMENTS;
    }
}

/**
 * Synthetic-source loader for stored {@code sparse_vector} fields. Rebuilds the
 * original (token -&gt; weight) object from the field's term vectors: for each
 * document the term vector is iterated and every term's frequency is decoded
 * back into the feature weight via {@link XFeatureField#decodeFeatureValue}.
 *
 * Protocol note: {@code docValuesLoader} positions {@code termsDocEnum} on the
 * FIRST term of the current document; {@code write} then consumes it with a
 * do/while loop. The two methods must be kept in sync.
 */
private static class SparseVectorSyntheticFieldLoader implements SourceLoader.SyntheticFieldLoader {
    private final String fullPath;
    private final String leafName;

    // Non-null only between a successful advance in docValuesLoader and reset();
    // when non-null it is already positioned on the first term (see write()).
    private TermsEnum termsDocEnum;

    private SparseVectorSyntheticFieldLoader(String fullPath, String leafName) {
        this.fullPath = fullPath;
        this.leafName = leafName;
    }

    @Override
    public Stream<Map.Entry<String, StoredFieldLoader>> storedFieldLoaders() {
        // No stored fields are used; everything is reconstructed from term vectors.
        return Stream.of();
    }

    @Override
    public DocValuesLoader docValuesLoader(LeafReader leafReader, int[] docIdsInLeaf) throws IOException {
        var fieldInfos = leafReader.getFieldInfos().fieldInfo(fullPath);
        if (fieldInfos == null || fieldInfos.hasTermVectors() == false) {
            // Field absent in this segment (or indexed without term vectors): no values to load.
            return null;
        }
        return docId -> {
            var terms = leafReader.termVectors().get(docId, fullPath);
            if (terms == null) {
                return false;
            }
            termsDocEnum = terms.iterator();
            if (termsDocEnum.next() == null) {
                // Empty term vector: treat as "no value" so hasValue() stays false.
                termsDocEnum = null;
                return false;
            }
            // termsDocEnum is now positioned on the first term, ready for write().
            return true;
        };
    }

    @Override
    public boolean hasValue() {
        return termsDocEnum != null;
    }

    @Override
    public void write(XContentBuilder b) throws IOException {
        assert termsDocEnum != null;
        PostingsEnum reuse = null;
        b.startObject(leafName);
        // do/while because docValuesLoader already advanced to the first term.
        do {
            reuse = termsDocEnum.postings(reuse);
            reuse.nextDoc();
            // Frequency encodes the feature weight (see XFeatureField).
            b.field(termsDocEnum.term().utf8ToString(), XFeatureField.decodeFeatureValue(reuse.freq()));
        } while (termsDocEnum.next() != null);
        b.endObject();
    }

    @Override
    public String fieldName() {
        // NOTE(review): returns the leaf name rather than the full dotted path —
        // confirm this matches the SyntheticFieldLoader.fieldName() contract used
        // by other loaders before relying on it.
        return leafName;
    }

    @Override
    public void reset() {
        termsDocEnum = null;
    }
}

}
Loading