diff --git a/server/src/main/java/org/opensearch/index/codec/fuzzy/AbstractFuzzySet.java b/server/src/main/java/org/opensearch/index/codec/fuzzy/AbstractFuzzySet.java index 680dd92e693de..09976297361fa 100644 --- a/server/src/main/java/org/opensearch/index/codec/fuzzy/AbstractFuzzySet.java +++ b/server/src/main/java/org/opensearch/index/codec/fuzzy/AbstractFuzzySet.java @@ -10,6 +10,7 @@ import org.apache.lucene.util.BytesRef; import org.opensearch.common.CheckedSupplier; +import org.opensearch.common.hash.T1ha1; import java.io.IOException; import java.util.Iterator; @@ -37,8 +38,14 @@ protected void addAll(CheckedSupplier, IOException> valuesIte } } + public Result contains(BytesRef val) { + return containsHash(generateKey(val)); + } + + protected abstract Result containsHash(long hash); + protected long generateKey(BytesRef value) { - return MurmurHash64.INSTANCE.hash(value); + return T1ha1.hash(value.bytes, value.offset, value.length, 0L); } protected void assertAllElementsExist(CheckedSupplier, IOException> iteratorProvider) throws IOException { diff --git a/server/src/main/java/org/opensearch/index/codec/fuzzy/BloomFilter.java b/server/src/main/java/org/opensearch/index/codec/fuzzy/BloomFilter.java index c769e88ef02be..e36c75064b83b 100644 --- a/server/src/main/java/org/opensearch/index/codec/fuzzy/BloomFilter.java +++ b/server/src/main/java/org/opensearch/index/codec/fuzzy/BloomFilter.java @@ -78,7 +78,7 @@ public void writeTo(DataOutput out) throws IOException { bitset.writeTo(out); } - static int getNearestSetSize(int maxNumberOfBits) { + private static int getNearestSetSize(int maxNumberOfBits) { int result = usableBitSetSizes[0]; for (int i = 0; i < usableBitSetSizes.length; i++) { if (usableBitSetSizes[i] <= maxNumberOfBits) { @@ -94,8 +94,7 @@ public SetType setType() { } @Override - public Result contains(BytesRef value) { - long hash = generateKey(value); + public Result containsHash(long hash) { int msb = (int) (hash >>> Integer.SIZE); int lsb = (int) hash; for (int i = 0; i < hashCount; i++) { diff --git a/server/src/main/java/org/opensearch/index/codec/fuzzy/FuzzyFilterPostingsFormat.java b/server/src/main/java/org/opensearch/index/codec/fuzzy/FuzzyFilterPostingsFormat.java index 5d3b2de1235af..f481b25d3fbc5 100644 --- a/server/src/main/java/org/opensearch/index/codec/fuzzy/FuzzyFilterPostingsFormat.java +++ b/server/src/main/java/org/opensearch/index/codec/fuzzy/FuzzyFilterPostingsFormat.java @@ -25,7 +25,6 @@ import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.util.BytesRef; @@ -98,7 +97,8 @@ public FuzzySetFieldsProducer(SegmentReadState state) throws IOException { IndexInput filterIn = null; boolean success = false; try { - filterIn = state.directory.openInput(fuzzySetFileName, IOContext.READONCE); + filterIn = state.directory.openInput(fuzzySetFileName, state.context); + CodecUtil.checkIndexHeader( filterIn, FUZZY_SET_CODEC_NAME, @@ -107,8 +107,6 @@ public FuzzySetFieldsProducer(SegmentReadState state) throws IOException { state.segmentInfo.getId(), state.segmentSuffix ); - // // Load the hash function used in the Fuzzy filter - // hashFunction = HashFunction.forName(filterIn.readString()); // Load the delegate postings format PostingsFormat delegatePostingsFormat = PostingsFormat.forName(filterIn.readString()); this.delegateFieldsProducer = delegatePostingsFormat.fieldsProducer(state); @@ -232,6 +230,7 @@ public BytesRef getMax() throws IOException { } static final class FilterAppliedTermsEnum extends BaseTermsEnum { + private Terms delegateTerms; private TermsEnum delegateTermsEnum; private final FuzzySet filter; @@ -250,7 +249,7 @@ private TermsEnum delegate() throws IOException { if (delegateTermsEnum == null) { /* pull the iterator only if we really need it - * this can be a relativly heavy operation depending on the - * delegate postings format and they underlying directory + * delegate postings format and the underlying directory * (clone IndexInput) */ delegateTermsEnum = delegateTerms.iterator(); } @@ -439,6 +438,7 @@ public void close() throws IOException { state.segmentInfo.getId(), state.segmentSuffix ); + // remember the name of the postings format we will delegate to fuzzyFilterFileOutput.writeString(delegatePostingsFormat.getName()); @@ -447,7 +447,6 @@ public void close() throws IOException { for (Map.Entry entry : nonSaturatedSets) { FieldInfo fieldInfo = entry.getKey(); FuzzySet fuzzySet = entry.getValue(); - fuzzyFilterFileOutput.writeInt(fieldInfo.number); saveAppropriatelySizedFuzzySet(fuzzyFilterFileOutput, fuzzySet, fieldInfo); } CodecUtil.writeFooter(fuzzyFilterFileOutput); @@ -458,6 +457,7 @@ public void close() throws IOException { } private void saveAppropriatelySizedFuzzySet(IndexOutput fileOutput, FuzzySet fuzzySet, FieldInfo fieldInfo) throws IOException { + fileOutput.writeInt(fieldInfo.number); fileOutput.writeString(fuzzySet.setType().getSetName()); fuzzySet.writeTo(fileOutput); } diff --git a/server/src/main/java/org/opensearch/index/codec/fuzzy/IndexInputLongArray.java b/server/src/main/java/org/opensearch/index/codec/fuzzy/IndexInputLongArray.java index 003c8cba31a34..8840fb5b8e90c 100644 --- a/server/src/main/java/org/opensearch/index/codec/fuzzy/IndexInputLongArray.java +++ b/server/src/main/java/org/opensearch/index/codec/fuzzy/IndexInputLongArray.java @@ -9,8 +9,8 @@ package org.opensearch.index.codec.fuzzy; import org.apache.lucene.store.RandomAccessInput; +import org.apache.lucene.util.RamUsageEstimator; import org.opensearch.OpenSearchException; -import org.opensearch.common.CheckedSupplier; import org.opensearch.common.util.LongArray; import java.io.IOException; @@ -18,12 +18,12 @@ /** * A Long array backed by RandomAccessInput. */ -public class IndexInputLongArray implements LongArray { +class IndexInputLongArray implements LongArray { - public RandomAccessInput input; - private long size; + private final RandomAccessInput input; + private final long size; - public IndexInputLongArray(long size, RandomAccessInput input) { + IndexInputLongArray(long size, RandomAccessInput input) { this.size = size; this.input = input; } @@ -37,8 +37,12 @@ public long size() { } @Override - public long get(long index) { - return wrapException(() -> input.readLong(index << 3)); + public synchronized long get(long index) { + try { + return input.readLong(index << 3); + } catch (IOException ex) { + throw new OpenSearchException(ex); + } } @Override @@ -58,14 +62,6 @@ public void fill(long fromIndex, long toIndex, long value) { @Override public long ramBytesUsed() { - return 128; - } - - private T wrapException(CheckedSupplier supplier) { - try { - return supplier.get(); - } catch (IOException ex) { - throw new OpenSearchException(ex); - } + return RamUsageEstimator.shallowSizeOfInstance(IndexInputLongArray.class); } } diff --git a/server/src/main/java/org/opensearch/index/codec/fuzzy/LongArrayBackedBitSet.java b/server/src/main/java/org/opensearch/index/codec/fuzzy/LongArrayBackedBitSet.java index 48495c6600b37..ef0b9b27cabd1 100644 --- a/server/src/main/java/org/opensearch/index/codec/fuzzy/LongArrayBackedBitSet.java +++ b/server/src/main/java/org/opensearch/index/codec/fuzzy/LongArrayBackedBitSet.java @@ -21,12 +21,13 @@ /** * A bitset backed by a long-indexed array. */ -public class LongArrayBackedBitSet implements Accountable, Closeable { +class LongArrayBackedBitSet implements Accountable, Closeable { private long underlyingArrayLength = 0L; private LongArray longArray; LongArrayBackedBitSet(long capacity) { + // Since the bitset is backed by a long array, we only need 1 element for every 64 bits in the underlying array. underlyingArrayLength = ((capacity - 1L) >> 6) + 1; this.longArray = BigArrays.NON_RECYCLING_INSTANCE.withCircuitBreaking().newLongArray(underlyingArrayLength); } @@ -56,8 +57,6 @@ public long cardinality() { public boolean isSet(long index) { long i = index >> 6; // div 64 long val = longArray.get(i); - // signed shift will keep a negative index and force an - // array-index-out-of-bounds-exception, removing the need for an explicit check. long bitmask = 1L << index; return (val & bitmask) != 0; } diff --git a/server/src/test/java/org/opensearch/index/codec/fuzzy/FuzzyFilterPostingsFormatTests.java b/server/src/test/java/org/opensearch/index/codec/fuzzy/FuzzyFilterPostingsFormatTests.java index 425211d7fbb46..868c2175d0689 100644 --- a/server/src/test/java/org/opensearch/index/codec/fuzzy/FuzzyFilterPostingsFormatTests.java +++ b/server/src/test/java/org/opensearch/index/codec/fuzzy/FuzzyFilterPostingsFormatTests.java @@ -19,7 +19,7 @@ public class FuzzyFilterPostingsFormatTests extends BasePostingsFormatTestCase { private TreeMap params = new TreeMap<>() { @Override public FuzzySetParameters get(Object k) { - return new FuzzySetParameters(() -> 0.2047); + return new FuzzySetParameters(() -> FuzzySetParameters.DEFAULT_FALSE_POSITIVE_PROBABILITY); } };