Skip to content

Commit

Permalink
More changes for fuzzy filter
Browse files Browse the repository at this point in the history
Signed-off-by: mgodwan <[email protected]>
  • Loading branch information
mgodwan committed Nov 14, 2023
1 parent 6fe9a15 commit 95d348f
Show file tree
Hide file tree
Showing 7 changed files with 31 additions and 97 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

import org.apache.lucene.util.BytesRef;
import org.opensearch.common.CheckedSupplier;
import org.opensearch.common.hash.T1ha1;

import java.io.IOException;
import java.util.Iterator;
Expand Down Expand Up @@ -37,8 +38,14 @@ protected void addAll(CheckedSupplier<Iterator<BytesRef>, IOException> valuesIte
}
}

/**
 * Checks whether the given value may be present in this set.
 *
 * <p>The value is first reduced to a 64-bit key via {@code generateKey}; the actual
 * membership decision is left to {@code containsHash}.
 *
 * @param val the value to look up
 * @return the {@code Result} reported by {@code containsHash} for the value's hash
 */
public Result contains(BytesRef val) {
    final long hash = generateKey(val);
    return containsHash(hash);
}

/**
 * Membership check against an already-computed 64-bit hash. {@code contains(BytesRef)} calls this
 * with the key produced by {@code generateKey(BytesRef)}.
 *
 * @param hash hash key of the value being looked up
 * @return the lookup {@code Result}, as decided by the concrete implementation
 */
protected abstract Result containsHash(long hash);

protected long generateKey(BytesRef value) {
return MurmurHash64.INSTANCE.hash(value);
return T1ha1.hash(value.bytes, value.offset, value.length, 0L);
}

protected void assertAllElementsExist(CheckedSupplier<Iterator<BytesRef>, IOException> iteratorProvider) throws IOException {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ public void writeTo(DataOutput out) throws IOException {
bitset.writeTo(out);
}

static int getNearestSetSize(int maxNumberOfBits) {
private static int getNearestSetSize(int maxNumberOfBits) {
int result = usableBitSetSizes[0];
for (int i = 0; i < usableBitSetSizes.length; i++) {
if (usableBitSetSizes[i] <= maxNumberOfBits) {
Expand All @@ -94,8 +94,7 @@ public SetType setType() {
}

@Override
public Result contains(BytesRef value) {
long hash = generateKey(value);
public Result containsHash(long hash) {
int msb = (int) (hash >>> Integer.SIZE);
int lsb = (int) hash;
for (int i = 0; i < hashCount; i++) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
Expand Down Expand Up @@ -98,7 +97,8 @@ public FuzzySetFieldsProducer(SegmentReadState state) throws IOException {
IndexInput filterIn = null;
boolean success = false;
try {
filterIn = state.directory.openInput(fuzzySetFileName, IOContext.READONCE);
filterIn = state.directory.openInput(fuzzySetFileName, state.context);

CodecUtil.checkIndexHeader(
filterIn,
FUZZY_SET_CODEC_NAME,
Expand All @@ -107,8 +107,6 @@ public FuzzySetFieldsProducer(SegmentReadState state) throws IOException {
state.segmentInfo.getId(),
state.segmentSuffix
);
// // Load the hash function used in the Fuzzy filter
// hashFunction = HashFunction.forName(filterIn.readString());
// Load the delegate postings format
PostingsFormat delegatePostingsFormat = PostingsFormat.forName(filterIn.readString());
this.delegateFieldsProducer = delegatePostingsFormat.fieldsProducer(state);
Expand Down Expand Up @@ -232,6 +230,7 @@ public BytesRef getMax() throws IOException {
}

static final class FilterAppliedTermsEnum extends BaseTermsEnum {

private Terms delegateTerms;
private TermsEnum delegateTermsEnum;
private final FuzzySet filter;
Expand All @@ -250,7 +249,7 @@ private TermsEnum delegate() throws IOException {
if (delegateTermsEnum == null) {
/* pull the iterator only if we really need it -
* this can be a relatively heavy operation depending on the
* delegate postings format and they underlying directory
* delegate postings format and the underlying directory
* (clone IndexInput) */
delegateTermsEnum = delegateTerms.iterator();
}
Expand Down Expand Up @@ -439,6 +438,7 @@ public void close() throws IOException {
state.segmentInfo.getId(),
state.segmentSuffix
);

// remember the name of the postings format we will delegate to
fuzzyFilterFileOutput.writeString(delegatePostingsFormat.getName());

Expand All @@ -447,7 +447,6 @@ public void close() throws IOException {
for (Map.Entry<FieldInfo, FuzzySet> entry : nonSaturatedSets) {
FieldInfo fieldInfo = entry.getKey();
FuzzySet fuzzySet = entry.getValue();
fuzzyFilterFileOutput.writeInt(fieldInfo.number);
saveAppropriatelySizedFuzzySet(fuzzyFilterFileOutput, fuzzySet, fieldInfo);
}
CodecUtil.writeFooter(fuzzyFilterFileOutput);
Expand All @@ -458,6 +457,7 @@ public void close() throws IOException {
}

/**
 * Serializes one field's fuzzy set to the given output.
 *
 * <p>The record is written in this order: the field number, the name of the set type,
 * then the set's own payload via {@code FuzzySet#writeTo}.
 *
 * @param fileOutput output the record is appended to
 * @param fuzzySet   the set to persist
 * @param fieldInfo  the field the set belongs to; only its {@code number} is written
 * @throws IOException if any of the underlying writes fails
 */
private void saveAppropriatelySizedFuzzySet(IndexOutput fileOutput, FuzzySet fuzzySet, FieldInfo fieldInfo) throws IOException {
    // Field number goes first.
    fileOutput.writeInt(fieldInfo.number);

    // The set type name is written ahead of the payload.
    final String setTypeName = fuzzySet.setType().getSetName();
    fileOutput.writeString(setTypeName);

    fuzzySet.writeTo(fileOutput);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,21 +9,21 @@
package org.opensearch.index.codec.fuzzy;

import org.apache.lucene.store.RandomAccessInput;
import org.apache.lucene.util.RamUsageEstimator;
import org.opensearch.OpenSearchException;
import org.opensearch.common.CheckedSupplier;
import org.opensearch.common.util.LongArray;

import java.io.IOException;

/**
* A Long array backed by RandomAccessInput.
*/
public class IndexInputLongArray implements LongArray {
class IndexInputLongArray implements LongArray {

public RandomAccessInput input;
private long size;
private final RandomAccessInput input;
private final long size;

public IndexInputLongArray(long size, RandomAccessInput input) {
IndexInputLongArray(long size, RandomAccessInput input) {
this.size = size;
this.input = input;
}
Expand All @@ -37,8 +37,12 @@ public long size() {
}

@Override
public long get(long index) {
return wrapException(() -> input.readLong(index << 3));
public synchronized long get(long index) {
try {
return input.readLong(index << 3);
} catch (IOException ex) {
throw new OpenSearchException(ex);
}
}

@Override
Expand All @@ -58,14 +62,6 @@ public void fill(long fromIndex, long toIndex, long value) {

@Override
public long ramBytesUsed() {
return 128;
}

private <T> T wrapException(CheckedSupplier<T, IOException> supplier) {
try {
return supplier.get();
} catch (IOException ex) {
throw new OpenSearchException(ex);
}
return RamUsageEstimator.shallowSizeOfInstance(IndexInputLongArray.class);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,13 @@
/**
* A bitset backed by a long-indexed array.
*/
public class LongArrayBackedBitSet implements Accountable, Closeable {
class LongArrayBackedBitSet implements Accountable, Closeable {

private long underlyingArrayLength = 0L;
private LongArray longArray;

LongArrayBackedBitSet(long capacity) {
// Since the bitset is backed by a long array, we only need 1 element for every 64 bits in the underlying array.
underlyingArrayLength = ((capacity - 1L) >> 6) + 1;
this.longArray = BigArrays.NON_RECYCLING_INSTANCE.withCircuitBreaking().newLongArray(underlyingArrayLength);
}
Expand Down Expand Up @@ -56,8 +57,6 @@ public long cardinality() {
/**
 * Reports whether the bit at the given position is set.
 *
 * @param index position of the bit to test
 * @return {@code true} if the bit is set, {@code false} otherwise
 */
public boolean isSet(long index) {
    // index / 64 selects the word of the backing array that holds the bit.
    final long word = longArray.get(index >> 6);
    // A long shift uses only the low six bits of its count, so this mask picks bit (index mod 64).
    return (word & (1L << index)) != 0;
}
Expand Down

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ public class FuzzyFilterPostingsFormatTests extends BasePostingsFormatTestCase {
private TreeMap<String, FuzzySetParameters> params = new TreeMap<>() {
@Override
public FuzzySetParameters get(Object k) {
return new FuzzySetParameters(() -> 0.2047);
return new FuzzySetParameters(() -> FuzzySetParameters.DEFAULT_FALSE_POSITIVE_PROBABILITY);
}
};

Expand Down

0 comments on commit 95d348f

Please sign in to comment.