From 95d348f06c839f4de99ed50a38d67468eeb61df2 Mon Sep 17 00:00:00 2001 From: mgodwan Date: Wed, 8 Nov 2023 22:40:04 +0530 Subject: [PATCH] More changes for fuzzy filter Signed-off-by: mgodwan --- .../index/codec/fuzzy/AbstractFuzzySet.java | 9 ++- .../index/codec/fuzzy/BloomFilter.java | 5 +- .../fuzzy/FuzzyFilterPostingsFormat.java | 12 ++-- .../codec/fuzzy/IndexInputLongArray.java | 28 ++++---- .../codec/fuzzy/LongArrayBackedBitSet.java | 5 +- .../index/codec/fuzzy/MurmurHash64.java | 67 ------------------- .../fuzzy/FuzzyFilterPostingsFormatTests.java | 2 +- 7 files changed, 31 insertions(+), 97 deletions(-) delete mode 100644 server/src/main/java/org/opensearch/index/codec/fuzzy/MurmurHash64.java diff --git a/server/src/main/java/org/opensearch/index/codec/fuzzy/AbstractFuzzySet.java b/server/src/main/java/org/opensearch/index/codec/fuzzy/AbstractFuzzySet.java index 680dd92e693de..09976297361fa 100644 --- a/server/src/main/java/org/opensearch/index/codec/fuzzy/AbstractFuzzySet.java +++ b/server/src/main/java/org/opensearch/index/codec/fuzzy/AbstractFuzzySet.java @@ -10,6 +10,7 @@ import org.apache.lucene.util.BytesRef; import org.opensearch.common.CheckedSupplier; +import org.opensearch.common.hash.T1ha1; import java.io.IOException; import java.util.Iterator; @@ -37,8 +38,14 @@ protected void addAll(CheckedSupplier, IOException> valuesIte } } + public Result contains(BytesRef val) { + return containsHash(generateKey(val)); + } + + protected abstract Result containsHash(long hash); + protected long generateKey(BytesRef value) { - return MurmurHash64.INSTANCE.hash(value); + return T1ha1.hash(value.bytes, value.offset, value.length, 0L); } protected void assertAllElementsExist(CheckedSupplier, IOException> iteratorProvider) throws IOException { diff --git a/server/src/main/java/org/opensearch/index/codec/fuzzy/BloomFilter.java b/server/src/main/java/org/opensearch/index/codec/fuzzy/BloomFilter.java index c769e88ef02be..e36c75064b83b 100644 --- a/server/src/main/java/org/opensearch/index/codec/fuzzy/BloomFilter.java +++ b/server/src/main/java/org/opensearch/index/codec/fuzzy/BloomFilter.java @@ -78,7 +78,7 @@ public void writeTo(DataOutput out) throws IOException { bitset.writeTo(out); } - static int getNearestSetSize(int maxNumberOfBits) { + private static int getNearestSetSize(int maxNumberOfBits) { int result = usableBitSetSizes[0]; for (int i = 0; i < usableBitSetSizes.length; i++) { if (usableBitSetSizes[i] <= maxNumberOfBits) { @@ -94,8 +94,7 @@ public SetType setType() { } @Override - public Result contains(BytesRef value) { - long hash = generateKey(value); + public Result containsHash(long hash) { int msb = (int) (hash >>> Integer.SIZE); int lsb = (int) hash; for (int i = 0; i < hashCount; i++) { diff --git a/server/src/main/java/org/opensearch/index/codec/fuzzy/FuzzyFilterPostingsFormat.java b/server/src/main/java/org/opensearch/index/codec/fuzzy/FuzzyFilterPostingsFormat.java index 5d3b2de1235af..f481b25d3fbc5 100644 --- a/server/src/main/java/org/opensearch/index/codec/fuzzy/FuzzyFilterPostingsFormat.java +++ b/server/src/main/java/org/opensearch/index/codec/fuzzy/FuzzyFilterPostingsFormat.java @@ -25,7 +25,6 @@ import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.util.BytesRef; @@ -98,7 +97,8 @@ public FuzzySetFieldsProducer(SegmentReadState state) throws IOException { IndexInput filterIn = null; boolean success = false; try { - filterIn = state.directory.openInput(fuzzySetFileName, IOContext.READONCE); + filterIn = state.directory.openInput(fuzzySetFileName, state.context); + CodecUtil.checkIndexHeader( filterIn, FUZZY_SET_CODEC_NAME, @@ -107,8 +107,6 @@ public FuzzySetFieldsProducer(SegmentReadState state) throws IOException { state.segmentInfo.getId(), state.segmentSuffix ); - // // Load the hash function used in the Fuzzy filter - // hashFunction = HashFunction.forName(filterIn.readString()); // Load the delegate postings format PostingsFormat delegatePostingsFormat = PostingsFormat.forName(filterIn.readString()); this.delegateFieldsProducer = delegatePostingsFormat.fieldsProducer(state); @@ -232,6 +230,7 @@ public BytesRef getMax() throws IOException { } static final class FilterAppliedTermsEnum extends BaseTermsEnum { + private Terms delegateTerms; private TermsEnum delegateTermsEnum; private final FuzzySet filter; @@ -250,7 +249,7 @@ private TermsEnum delegate() throws IOException { if (delegateTermsEnum == null) { /* pull the iterator only if we really need it - * this can be a relativly heavy operation depending on the - * delegate postings format and they underlying directory + * delegate postings format and the underlying directory * (clone IndexInput) */ delegateTermsEnum = delegateTerms.iterator(); } @@ -439,6 +438,7 @@ public void close() throws IOException { state.segmentInfo.getId(), state.segmentSuffix ); + // remember the name of the postings format we will delegate to fuzzyFilterFileOutput.writeString(delegatePostingsFormat.getName()); @@ -447,7 +447,6 @@ public void close() throws IOException { for (Map.Entry entry : nonSaturatedSets) { FieldInfo fieldInfo = entry.getKey(); FuzzySet fuzzySet = entry.getValue(); - fuzzyFilterFileOutput.writeInt(fieldInfo.number); saveAppropriatelySizedFuzzySet(fuzzyFilterFileOutput, fuzzySet, fieldInfo); } CodecUtil.writeFooter(fuzzyFilterFileOutput); @@ -458,6 +457,7 @@ public void close() throws IOException { } private void saveAppropriatelySizedFuzzySet(IndexOutput fileOutput, FuzzySet fuzzySet, FieldInfo fieldInfo) throws IOException { + fileOutput.writeInt(fieldInfo.number); fileOutput.writeString(fuzzySet.setType().getSetName()); fuzzySet.writeTo(fileOutput); } diff --git a/server/src/main/java/org/opensearch/index/codec/fuzzy/IndexInputLongArray.java b/server/src/main/java/org/opensearch/index/codec/fuzzy/IndexInputLongArray.java index 003c8cba31a34..8840fb5b8e90c 100644 --- a/server/src/main/java/org/opensearch/index/codec/fuzzy/IndexInputLongArray.java +++ b/server/src/main/java/org/opensearch/index/codec/fuzzy/IndexInputLongArray.java @@ -9,8 +9,8 @@ package org.opensearch.index.codec.fuzzy; import org.apache.lucene.store.RandomAccessInput; +import org.apache.lucene.util.RamUsageEstimator; import org.opensearch.OpenSearchException; -import org.opensearch.common.CheckedSupplier; import org.opensearch.common.util.LongArray; import java.io.IOException; @@ -18,12 +18,12 @@ /** * A Long array backed by RandomAccessInput. */ -public class IndexInputLongArray implements LongArray { +class IndexInputLongArray implements LongArray { - public RandomAccessInput input; - private long size; + private final RandomAccessInput input; + private final long size; - public IndexInputLongArray(long size, RandomAccessInput input) { + IndexInputLongArray(long size, RandomAccessInput input) { this.size = size; this.input = input; } @@ -37,8 +37,12 @@ public long size() { } @Override - public long get(long index) { - return wrapException(() -> input.readLong(index << 3)); + public synchronized long get(long index) { + try { + return input.readLong(index << 3); + } catch (IOException ex) { + throw new OpenSearchException(ex); + } } @Override @@ -58,14 +62,6 @@ public void fill(long fromIndex, long toIndex, long value) { @Override public long ramBytesUsed() { - return 128; - } - - private T wrapException(CheckedSupplier supplier) { - try { - return supplier.get(); - } catch (IOException ex) { - throw new OpenSearchException(ex); - } + return RamUsageEstimator.shallowSizeOfInstance(IndexInputLongArray.class); } } diff --git a/server/src/main/java/org/opensearch/index/codec/fuzzy/LongArrayBackedBitSet.java b/server/src/main/java/org/opensearch/index/codec/fuzzy/LongArrayBackedBitSet.java index 48495c6600b37..ef0b9b27cabd1 100644 --- a/server/src/main/java/org/opensearch/index/codec/fuzzy/LongArrayBackedBitSet.java +++ b/server/src/main/java/org/opensearch/index/codec/fuzzy/LongArrayBackedBitSet.java @@ -21,12 +21,13 @@ /** * A bitset backed by a long-indexed array. */ -public class LongArrayBackedBitSet implements Accountable, Closeable { +class LongArrayBackedBitSet implements Accountable, Closeable { private long underlyingArrayLength = 0L; private LongArray longArray; LongArrayBackedBitSet(long capacity) { + // Since the bitset is backed by a long array, we only need 1 element for every 64 bits in the underlying array. underlyingArrayLength = ((capacity - 1L) >> 6) + 1; this.longArray = BigArrays.NON_RECYCLING_INSTANCE.withCircuitBreaking().newLongArray(underlyingArrayLength); } @@ -56,8 +57,6 @@ public long cardinality() { public boolean isSet(long index) { long i = index >> 6; // div 64 long val = longArray.get(i); - // signed shift will keep a negative index and force an - // array-index-out-of-bounds-exception, removing the need for an explicit check. long bitmask = 1L << index; return (val & bitmask) != 0; } diff --git a/server/src/main/java/org/opensearch/index/codec/fuzzy/MurmurHash64.java b/server/src/main/java/org/opensearch/index/codec/fuzzy/MurmurHash64.java deleted file mode 100644 index 33e263a172777..0000000000000 --- a/server/src/main/java/org/opensearch/index/codec/fuzzy/MurmurHash64.java +++ /dev/null @@ -1,67 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * - * The OpenSearch Contributors require contributions made to - * this file be licensed under the Apache-2.0 license or a - * compatible open source license. - */ - -package org.opensearch.index.codec.fuzzy; - -import org.apache.lucene.util.BitUtil; -import org.apache.lucene.util.BytesRef; - -/** - * Utility to calculate hash for incoming bytes. - */ -public class MurmurHash64 { - private static final long M64 = 0xc6a4a7935bd1e995L; - private static final int R64 = 47; - public static final MurmurHash64 INSTANCE = new MurmurHash64(); - - /** - * Generates a 64-bit hash from byte array of the given length and seed. - * - * @param data The input byte array - * @param seed The initial seed value - * @param length The length of the array - * @return The 64-bit hash of the given array - */ - public static long hash64(byte[] data, int seed, int offset, int length) { - long h = (seed & 0xffffffffL) ^ (length * M64); - - final int nblocks = length >> 3; - - // body - for (int i = 0; i < nblocks; i++) { - - long k = (long) BitUtil.VH_LE_LONG.get(data, offset); - k *= M64; - k ^= k >>> R64; - k *= M64; - - h ^= k; - h *= M64; - - offset += Long.BYTES; - } - - int remaining = length & 0x07; - if (0 < remaining) { - for (int i = 0; i < remaining; i++) { - h ^= ((long) data[offset + i] & 0xff) << (Byte.SIZE * i); - } - h *= M64; - } - - h ^= h >>> R64; - h *= M64; - h ^= h >>> R64; - - return h; - } - - public final long hash(BytesRef br) { - return hash64(br.bytes, 0xe17a1465, br.offset, br.length); - } -} diff --git a/server/src/test/java/org/opensearch/index/codec/fuzzy/FuzzyFilterPostingsFormatTests.java b/server/src/test/java/org/opensearch/index/codec/fuzzy/FuzzyFilterPostingsFormatTests.java index 425211d7fbb46..868c2175d0689 100644 --- a/server/src/test/java/org/opensearch/index/codec/fuzzy/FuzzyFilterPostingsFormatTests.java +++ b/server/src/test/java/org/opensearch/index/codec/fuzzy/FuzzyFilterPostingsFormatTests.java @@ -19,7 +19,7 @@ public class FuzzyFilterPostingsFormatTests extends BasePostingsFormatTestCase { private TreeMap params = new TreeMap<>() { @Override public FuzzySetParameters get(Object k) { - return new FuzzySetParameters(() -> 0.2047); + return new FuzzySetParameters(() -> FuzzySetParameters.DEFAULT_FALSE_POSITIVE_PROBABILITY); } };