forked from opensearch-project/OpenSearch
-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Enable Fuzzy codec for doc id fields using a bloom filter
Signed-off-by: mgodwan <[email protected]>
- Loading branch information
Showing
17 changed files
with
1,239 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
52 changes: 52 additions & 0 deletions
52
server/src/main/java/org/opensearch/index/codec/fuzzy/AbstractFuzzySet.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
/* | ||
* SPDX-License-Identifier: Apache-2.0 | ||
* | ||
* The OpenSearch Contributors require contributions made to | ||
* this file be licensed under the Apache-2.0 license or a | ||
* compatible open source license. | ||
*/ | ||
|
||
package org.opensearch.index.codec.fuzzy; | ||
|
||
import org.apache.lucene.util.BytesRef; | ||
import org.opensearch.common.CheckedSupplier; | ||
|
||
import java.io.IOException; | ||
import java.util.ArrayList; | ||
import java.util.Iterator; | ||
import java.util.List; | ||
|
||
public abstract class AbstractFuzzySet implements FuzzySet { | ||
|
||
/** | ||
* Add an item to this fuzzy set. | ||
* @param value The value to be added | ||
*/ | ||
protected abstract void add(BytesRef value); | ||
|
||
/** | ||
* Add all items to the underlying set. | ||
* Implementations can choose to perform this using an optimized strategy based on the type of set. | ||
* @param valuesIteratorProvider Supplier for an iterator over All values which should be added to the set. | ||
*/ | ||
protected void addAll(CheckedSupplier<Iterator<BytesRef>, IOException> valuesIteratorProvider) throws IOException { | ||
Iterator<BytesRef> values = valuesIteratorProvider.get(); | ||
while (values.hasNext()) { | ||
add(values.next()); | ||
} | ||
} | ||
|
||
protected long generateKey(BytesRef value) { | ||
return MurmurHash64.INSTANCE.hash(value); | ||
} | ||
|
||
protected void assertAllElementsExist(CheckedSupplier<Iterator<BytesRef>, IOException> iteratorProvider) throws IOException { | ||
Iterator<BytesRef> iter = iteratorProvider.get(); | ||
int cnt = 0; | ||
while (iter.hasNext()) { | ||
BytesRef item = iter.next(); | ||
assert contains(item) == Result.MAYBE : "Expected Filter to return positive response for elements added to it. Elements matched: " + cnt; | ||
cnt ++; | ||
} | ||
} | ||
} |
147 changes: 147 additions & 0 deletions
147
server/src/main/java/org/opensearch/index/codec/fuzzy/BloomFilter.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,147 @@ | ||
/* | ||
* SPDX-License-Identifier: Apache-2.0 | ||
* | ||
* The OpenSearch Contributors require contributions made to | ||
* this file be licensed under the Apache-2.0 license or a | ||
* compatible open source license. | ||
*/ | ||
|
||
package org.opensearch.index.codec.fuzzy; | ||
|
||
import org.apache.logging.log4j.LogManager; | ||
import org.apache.logging.log4j.Logger; | ||
import org.apache.lucene.store.DataOutput; | ||
import org.apache.lucene.store.IndexInput; | ||
import org.apache.lucene.util.BytesRef; | ||
import org.apache.lucene.util.IOUtils; | ||
import org.apache.lucene.util.RamUsageEstimator; | ||
import org.opensearch.common.CheckedSupplier; | ||
import org.opensearch.core.Assertions; | ||
|
||
import java.io.IOException; | ||
import java.util.Iterator; | ||
import java.util.Optional; | ||
|
||
/** | ||
* The code is based on Lucene's implementation of Bloom Filter. | ||
* It represents a subset of the Lucene implementation needed for OpenSearch use cases. | ||
* Since the Lucene implementation is marked experimental, | ||
* this aims to ensure we can provide a bwc implementation during upgrades. | ||
*/ | ||
public class BloomFilter extends AbstractFuzzySet { | ||
|
||
private static final Logger logger = LogManager.getLogger(BloomFilter.class); | ||
|
||
// The sizes of BitSet used are all numbers that, when expressed in binary form, | ||
// are all ones. This is to enable fast downsizing from one bitset to another | ||
// by simply ANDing each set index in one bitset with the size of the target bitset | ||
// - this provides a fast modulo of the number. Values previously accumulated in | ||
// a large bitset and then mapped to a smaller set can be looked up using a single | ||
// AND operation of the query term's hash rather than needing to perform a 2-step | ||
// translation of the query term that mirrors the stored content's reprojections. | ||
static final int[] usableBitSetSizes; | ||
|
||
static { | ||
usableBitSetSizes = new int[26]; | ||
for (int i = 0; i < usableBitSetSizes.length; i++) { | ||
usableBitSetSizes[i] = (1 << (i + 6)) - 1; | ||
} | ||
} | ||
|
||
private final LongArrayBackedBitSet bitset; | ||
private final int setSize; | ||
private final int hashCount; | ||
|
||
BloomFilter(long maxDocs, double maxFpp, CheckedSupplier<Iterator<BytesRef>, IOException> fieldIteratorProvider) throws IOException { | ||
int setSize = | ||
(int) | ||
Math.ceil( | ||
(maxDocs * Math.log(maxFpp)) | ||
/ Math.log(1 / Math.pow(2, Math.log(2)))); | ||
setSize = getNearestSetSize(2 * setSize); | ||
int optimalK = (int) Math.round(((double) setSize / maxDocs) * Math.log(2)); | ||
this.bitset = new LongArrayBackedBitSet(setSize + 1); | ||
this.setSize = setSize; | ||
this.hashCount = optimalK; | ||
addAll(fieldIteratorProvider); | ||
if (Assertions.ENABLED) { | ||
assertAllElementsExist(fieldIteratorProvider); | ||
} | ||
logger.trace("Bloom filter created with fpp: {}, setSize: {}, hashCount: {}", maxFpp, setSize, hashCount); | ||
} | ||
|
||
BloomFilter(IndexInput in) throws IOException { | ||
hashCount = in.readInt(); | ||
setSize = in.readInt(); | ||
this.bitset = new LongArrayBackedBitSet(in); | ||
} | ||
|
||
@Override | ||
public void writeTo(DataOutput out) throws IOException { | ||
out.writeInt(hashCount); | ||
out.writeInt(setSize); | ||
bitset.writeTo(out); | ||
} | ||
|
||
static int getNearestSetSize(int maxNumberOfBits) { | ||
int result = usableBitSetSizes[0]; | ||
for (int i = 0; i < usableBitSetSizes.length; i++) { | ||
if (usableBitSetSizes[i] <= maxNumberOfBits) { | ||
result = usableBitSetSizes[i]; | ||
} | ||
} | ||
return result; | ||
} | ||
|
||
@Override | ||
public SetType setType() { | ||
return SetType.BLOOM_FILTER_V1; | ||
} | ||
|
||
@Override | ||
public Result contains(BytesRef value) { | ||
long hash = generateKey(value); | ||
int msb = (int) (hash >>> Integer.SIZE); | ||
int lsb = (int) hash; | ||
for (int i = 0; i < hashCount; i++) { | ||
int bloomPos = (lsb + i * msb); | ||
if (!mayContainValue(bloomPos)) { | ||
return Result.NO; | ||
} | ||
} | ||
return Result.MAYBE; | ||
} | ||
|
||
protected void add(BytesRef value) { | ||
long hash = generateKey(value); | ||
int msb = (int) (hash >>> Integer.SIZE); | ||
int lsb = (int) hash; | ||
for (int i = 0; i < hashCount; i++) { | ||
// Bitmasking using bloomSize is effectively a modulo operation since set sizes are always power of 2 | ||
int bloomPos = (lsb + i * msb) & setSize; | ||
bitset.set(bloomPos); | ||
} | ||
} | ||
|
||
@Override | ||
public boolean isSaturated() { | ||
long numBitsSet = bitset.cardinality(); | ||
return (float) numBitsSet / (float) setSize > 0.9f; | ||
} | ||
|
||
@Override | ||
public long ramBytesUsed() { | ||
return RamUsageEstimator.sizeOf(bitset.ramBytesUsed()); | ||
} | ||
|
||
private boolean mayContainValue(int aHash) { | ||
// Bloom sizes are always base 2 and so can be ANDed for a fast modulo | ||
int pos = aHash & setSize; | ||
return bitset.isSet(pos); | ||
} | ||
|
||
@Override | ||
public void close() throws IOException { | ||
IOUtils.close(bitset); | ||
} | ||
} |
Oops, something went wrong.