From 665bc12bb1083a4c222d39aeea75cc44a1073367 Mon Sep 17 00:00:00 2001 From: tanjialiang Date: Wed, 4 Dec 2024 14:18:02 +0800 Subject: [PATCH] [core] Use min/max to fast skip the bsi index comparison --- .../utils/BitSliceIndexRoaringBitmap.java | 104 +++++++- .../utils/BitSliceIndexRoaringBitmapTest.java | 230 ++++++++++++++---- 2 files changed, 281 insertions(+), 53 deletions(-) diff --git a/paimon-common/src/main/java/org/apache/paimon/utils/BitSliceIndexRoaringBitmap.java b/paimon-common/src/main/java/org/apache/paimon/utils/BitSliceIndexRoaringBitmap.java index 662d791d12325..72e64cc1510ad 100644 --- a/paimon-common/src/main/java/org/apache/paimon/utils/BitSliceIndexRoaringBitmap.java +++ b/paimon-common/src/main/java/org/apache/paimon/utils/BitSliceIndexRoaringBitmap.java @@ -18,11 +18,15 @@ package org.apache.paimon.utils; +import org.apache.paimon.annotation.VisibleForTesting; + import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; import java.util.Arrays; import java.util.Objects; +import java.util.Optional; +import java.util.function.Supplier; /* This file is based on source code from the RoaringBitmap Project (http://roaringbitmap.org/), licensed by the Apache * Software Foundation (ASF) under the Apache License, Version 2.0. 
See the NOTICE file distributed with this work for @@ -34,36 +38,38 @@ public class BitSliceIndexRoaringBitmap { public static final byte VERSION_1 = 1; public static final BitSliceIndexRoaringBitmap EMPTY = - new BitSliceIndexRoaringBitmap(0, new RoaringBitmap32(), new RoaringBitmap32[] {}); + new BitSliceIndexRoaringBitmap(0, 0, new RoaringBitmap32(), new RoaringBitmap32[]{}); private final long min; + private final long max; private final RoaringBitmap32 ebm; private final RoaringBitmap32[] slices; - private BitSliceIndexRoaringBitmap(long min, RoaringBitmap32 ebm, RoaringBitmap32[] slices) { + private BitSliceIndexRoaringBitmap(long min, long max, RoaringBitmap32 ebm, RoaringBitmap32[] slices) { this.min = min; + this.max = max; this.ebm = ebm; this.slices = slices; } public RoaringBitmap32 eq(long predicate) { - return oNeilCompare(Operation.EQ, predicate - min, null); + return compare(Operation.EQ, predicate, null); } public RoaringBitmap32 lt(long predicate) { - return oNeilCompare(Operation.LT, predicate - min, null); + return compare(Operation.LT, predicate, null); } public RoaringBitmap32 lte(long predicate) { - return oNeilCompare(Operation.LTE, predicate - min, null); + return compare(Operation.LTE, predicate, null); } public RoaringBitmap32 gt(long predicate) { - return oNeilCompare(Operation.GT, predicate - min, null); + return compare(Operation.GT, predicate, null); } public RoaringBitmap32 gte(long predicate) { - return oNeilCompare(Operation.GTE, predicate - min, null); + return compare(Operation.GTE, predicate, null); } public RoaringBitmap32 isNotNull() { @@ -84,6 +90,79 @@ public boolean equals(Object o) { && Arrays.equals(slices, that.slices); } + private RoaringBitmap32 compare(Operation operation, long predicate, RoaringBitmap32 foundSet) { + // using min/max to fast skip + return compareUsingMinMax(operation, predicate, foundSet) + .orElseGet(() -> oNeilCompare(operation, predicate - min, foundSet)); + } + + @VisibleForTesting + protected 
Optional<RoaringBitmap32> compareUsingMinMax( + Operation operation, long predicate, RoaringBitmap32 foundSet) { + Supplier<Optional<RoaringBitmap32>> empty = () -> Optional.of(new RoaringBitmap32()); + Supplier<Optional<RoaringBitmap32>> all = () -> { + if (foundSet == null) { + return Optional.of(isNotNull()); + } else { + return Optional.of(RoaringBitmap32.and(foundSet, ebm)); + } + }; + + switch (operation) { + case EQ: { + if (min == max && min == predicate) { + return all.get(); + } else if (predicate < min || predicate > max) { + return empty.get(); + } + break; + } + case NEQ: { + if (min == max && min == predicate) { + return empty.get(); + } else if (predicate < min || predicate > max) { + return all.get(); + } + break; + } + case GTE: { + if (predicate <= min) { + return all.get(); + } else if (predicate > max) { + return empty.get(); + } + break; + } + case GT: { + if (predicate < min) { + return all.get(); + } else if (predicate >= max) { + return empty.get(); + } + break; + } + case LTE: { + if (predicate >= max) { + return all.get(); + } else if (predicate < min) { + return empty.get(); + } + break; + } + case LT: { + if (predicate > max) { + return all.get(); + } else if (predicate <= min) { + return empty.get(); + } + break; + } + default: + throw new IllegalArgumentException("not support operation: " + operation); + } + return Optional.empty(); + } + /** * O'Neil bit-sliced index compare algorithm. * @@ -133,7 +212,8 @@ private RoaringBitmap32 oNeilCompare( } /** Specifies O'Neil compare algorithm operation. 
*/ - private enum Operation { + @VisibleForTesting + protected enum Operation { EQ, NEQ, LTE, @@ -151,8 +231,9 @@ public static BitSliceIndexRoaringBitmap map(DataInput in) throws IOException { version)); } - // deserialize min + // deserialize min & max long min = in.readLong(); + long max = in.readLong(); // deserialize ebm RoaringBitmap32 ebm = new RoaringBitmap32(); @@ -166,7 +247,7 @@ public static BitSliceIndexRoaringBitmap map(DataInput in) throws IOException { slices[i] = rb; } - return new BitSliceIndexRoaringBitmap(min, ebm, slices); + return new BitSliceIndexRoaringBitmap(min, max, ebm, slices); } /** A Builder for {@link BitSliceIndexRoaringBitmap}. */ @@ -220,6 +301,7 @@ public boolean isNotEmpty() { public void serialize(DataOutput out) throws IOException { out.writeByte(VERSION_1); out.writeLong(min); + out.writeLong(max); ebm.serialize(out); out.writeInt(slices.length); for (RoaringBitmap32 slice : slices) { @@ -228,7 +310,7 @@ public void serialize(DataOutput out) throws IOException { } public BitSliceIndexRoaringBitmap build() throws IOException { - return new BitSliceIndexRoaringBitmap(min, ebm, slices); + return new BitSliceIndexRoaringBitmap(min, max, ebm, slices); } } } diff --git a/paimon-common/src/test/java/org/apache/paimon/utils/BitSliceIndexRoaringBitmapTest.java b/paimon-common/src/test/java/org/apache/paimon/utils/BitSliceIndexRoaringBitmapTest.java index 8c4a27d4351ee..a5fd120698fc9 100644 --- a/paimon-common/src/test/java/org/apache/paimon/utils/BitSliceIndexRoaringBitmapTest.java +++ b/paimon-common/src/test/java/org/apache/paimon/utils/BitSliceIndexRoaringBitmapTest.java @@ -26,34 +26,69 @@ import java.io.DataInputStream; import java.io.DataOutputStream; import java.io.IOException; -import java.util.stream.IntStream; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Objects; +import java.util.Optional; +import java.util.Random; +import static 
org.apache.paimon.utils.BitSliceIndexRoaringBitmap.Operation.EQ; +import static org.apache.paimon.utils.BitSliceIndexRoaringBitmap.Operation.GT; +import static org.apache.paimon.utils.BitSliceIndexRoaringBitmap.Operation.GTE; +import static org.apache.paimon.utils.BitSliceIndexRoaringBitmap.Operation.LT; +import static org.apache.paimon.utils.BitSliceIndexRoaringBitmap.Operation.LTE; +import static org.apache.paimon.utils.BitSliceIndexRoaringBitmap.Operation.NEQ; import static org.assertj.core.api.Assertions.assertThat; /** Test for {@link BitSliceIndexRoaringBitmap}. */ public class BitSliceIndexRoaringBitmapTest { - private long base; + public static final int NUM_OF_ROWS = 100000; + public static final int VALUE_BOUND = 1000; + public static final int VALUE_LT_MIN = 0; + public static final int VALUE_GT_MAX = VALUE_BOUND + 100; + + private Random random; + private List<Pair> pairs; private BitSliceIndexRoaringBitmap bsi; @BeforeEach public void setup() throws IOException { - this.base = System.currentTimeMillis(); + this.random = new Random(); + List<Pair> pairs = new ArrayList<>(); + long min = 0; + long max = 0; + for (int i = 0; i < NUM_OF_ROWS; i++) { + if (i % 5 == 0) { + pairs.add(new Pair(i, null)); + continue; + } + long next = generateNextValue(); + min = Math.min(min == 0 ? next : min, next); + max = Math.max(max == 0 ? 
next : max, next); + pairs.add(new Pair(i, next)); + } BitSliceIndexRoaringBitmap.Appender appender = - new BitSliceIndexRoaringBitmap.Appender(base, toPredicate(100)); - IntStream.range(0, 31).forEach(x -> appender.append(x, toPredicate(x))); - IntStream.range(51, 100).forEach(x -> appender.append(x, toPredicate(x))); - appender.append(100, toPredicate(30)); + new BitSliceIndexRoaringBitmap.Appender(min, max); + for (Pair pair : pairs) { + if (pair.value == null) { + continue; + } + appender.append(pair.index, pair.value); + } this.bsi = appender.build(); + this.pairs = Collections.unmodifiableList(pairs); } @Test public void testSerde() throws IOException { BitSliceIndexRoaringBitmap.Appender appender = - new BitSliceIndexRoaringBitmap.Appender(0, toPredicate(100)); - IntStream.range(0, 31).forEach(x -> appender.append(x, toPredicate(x))); - IntStream.range(51, 100).forEach(x -> appender.append(x, toPredicate(x))); - appender.append(100, toPredicate(30)); + new BitSliceIndexRoaringBitmap.Appender(0, 10); + appender.append(0, 0); + appender.append(1, 1); + appender.append(2, 2); + appender.append(10, 6); ByteArrayOutputStream out = new ByteArrayOutputStream(); appender.serialize(new DataOutputStream(out)); @@ -65,60 +100,171 @@ public void testSerde() throws IOException { @Test public void testEQ() { - assertThat(bsi.eq(toPredicate(1))).isEqualTo(RoaringBitmap32.bitmapOf(1)); - assertThat(bsi.eq(toPredicate(32))).isEqualTo(RoaringBitmap32.bitmapOf()); - assertThat(bsi.eq(toPredicate(30))).isEqualTo(RoaringBitmap32.bitmapOf(30, 100)); + // test predicate in the value bound + for (int i = 0; i < 10; i++) { + long predicate = generateNextValue(); + assertThat(bsi.eq(predicate)) + .isEqualTo( + pairs.stream() + .filter(x -> Objects.equals(x.value, predicate)) + .map(x -> x.index) + .collect( + RoaringBitmap32::new, + RoaringBitmap32::add, + (x, y) -> x.or(y))); + } + + // test predicate out of the value bound + assertThat(bsi.eq(VALUE_LT_MIN)).isEqualTo(new 
RoaringBitmap32()); + assertThat(bsi.eq(VALUE_GT_MAX)).isEqualTo(new RoaringBitmap32()); } @Test public void testLT() { - assertThat(bsi.lt(toPredicate(30))) - .isEqualTo(RoaringBitmap32.bitmapOf(IntStream.range(0, 30).toArray())); - assertThat(bsi.lt(toPredicate(45))) - .isEqualTo( - RoaringBitmap32.bitmapOf( - IntStream.concat(IntStream.range(0, 31), IntStream.range(100, 101)) - .toArray())); + // test predicate in the value bound + for (int i = 0; i < 10; i++) { + long predicate = generateNextValue(); + assertThat(bsi.lt(predicate)) + .isEqualTo( + pairs.stream() + .filter(x -> x.value != null) + .filter(x -> x.value < predicate) + .map(x -> x.index) + .collect( + RoaringBitmap32::new, + RoaringBitmap32::add, + (x, y) -> x.or(y))); + } + + // test predicate out of the value bound + assertThat(bsi.lt(VALUE_LT_MIN)).isEqualTo(new RoaringBitmap32()); + assertThat(bsi.lt(VALUE_GT_MAX)).isEqualTo(bsi.isNotNull()); } @Test public void testLTE() { - RoaringBitmap32 expected = - RoaringBitmap32.bitmapOf( - IntStream.concat(IntStream.range(0, 31), IntStream.range(100, 101)) - .toArray()); - assertThat(bsi.lte(toPredicate(30))).isEqualTo(expected); - assertThat(bsi.lte(toPredicate(45))).isEqualTo(expected); + // test predicate in the value bound + for (int i = 0; i < 10; i++) { + long predicate = generateNextValue(); + assertThat(bsi.lte(predicate)) + .isEqualTo( + pairs.stream() + .filter(x -> x.value != null) + .filter(x -> x.value <= predicate) + .map(x -> x.index) + .collect( + RoaringBitmap32::new, + RoaringBitmap32::add, + (x, y) -> x.or(y))); + } + + // test predicate out of the value bound + assertThat(bsi.lte(VALUE_LT_MIN)).isEqualTo(new RoaringBitmap32()); + assertThat(bsi.lte(VALUE_GT_MAX)).isEqualTo(bsi.isNotNull()); } @Test public void testGT() { - RoaringBitmap32 expected = RoaringBitmap32.bitmapOf(IntStream.range(51, 100).toArray()); - assertThat(bsi.gt(toPredicate(30))).isEqualTo(expected); - assertThat(bsi.gt(toPredicate(45))).isEqualTo(expected); + // 
test predicate in the value bound + for (int i = 0; i < 10; i++) { + long predicate = generateNextValue(); + assertThat(bsi.gt(predicate)) + .isEqualTo( + pairs.stream() + .filter(x -> x.value != null) + .filter(x -> x.value > predicate) + .map(x -> x.index) + .collect( + RoaringBitmap32::new, + RoaringBitmap32::add, + (x, y) -> x.or(y))); + } + + // test predicate out of the value bound + assertThat(bsi.gt(VALUE_LT_MIN)).isEqualTo(bsi.isNotNull()); + assertThat(bsi.gt(VALUE_GT_MAX)).isEqualTo(new RoaringBitmap32()); } @Test public void testGTE() { - assertThat(bsi.gte(toPredicate(30))) - .isEqualTo( - RoaringBitmap32.bitmapOf( - IntStream.concat(IntStream.range(30, 31), IntStream.range(51, 101)) - .toArray())); - assertThat(bsi.gte(toPredicate(45))) - .isEqualTo(RoaringBitmap32.bitmapOf(IntStream.range(51, 100).toArray())); + // test predicate in the value bound + for (int i = 0; i < 10; i++) { + long predicate = generateNextValue(); + assertThat(bsi.gte(predicate)) + .isEqualTo( + pairs.stream() + .filter(x -> x.value != null) + .filter(x -> x.value >= predicate) + .map(x -> x.index) + .collect( + RoaringBitmap32::new, + RoaringBitmap32::add, + (x, y) -> x.or(y))); + } + + // test predicate out of the value bound + assertThat(bsi.gte(VALUE_LT_MIN)).isEqualTo(bsi.isNotNull()); + assertThat(bsi.gte(VALUE_GT_MAX)).isEqualTo(new RoaringBitmap32()); } @Test public void testIsNotNull() { assertThat(bsi.isNotNull()) .isEqualTo( - RoaringBitmap32.bitmapOf( - IntStream.concat(IntStream.range(0, 31), IntStream.range(51, 101)) - .toArray())); + pairs.stream() + .filter(x -> x.value != null) + .map(x -> x.index) + .collect( + RoaringBitmap32::new, + RoaringBitmap32::add, + (x, y) -> x.or(y))); } - private long toPredicate(long predicate) { - return base + predicate; + @Test + public void testCompareUsingMinMax() { + // a predicate in the value bound + final int VALUE_IN_BOUND = generateNextValue(); + final Optional<RoaringBitmap32> EMPTY = Optional.of(new RoaringBitmap32()); + final Optional<RoaringBitmap32> 
ALL = Optional.of(bsi.isNotNull()); + final Optional<RoaringBitmap32> IN_VALUE_BOUND = Optional.empty(); + + // test eq & neq + assertThat(bsi.compareUsingMinMax(EQ, VALUE_IN_BOUND, null)).isEqualTo(IN_VALUE_BOUND); + assertThat(bsi.compareUsingMinMax(EQ, VALUE_LT_MIN, null)).isEqualTo(EMPTY); + assertThat(bsi.compareUsingMinMax(EQ, VALUE_GT_MAX, null)).isEqualTo(EMPTY); + assertThat(bsi.compareUsingMinMax(NEQ, VALUE_IN_BOUND, null)).isEqualTo(IN_VALUE_BOUND); + assertThat(bsi.compareUsingMinMax(NEQ, VALUE_LT_MIN, null)).isEqualTo(ALL); + assertThat(bsi.compareUsingMinMax(NEQ, VALUE_GT_MAX, null)).isEqualTo(ALL); + + // test lt & lte + assertThat(bsi.compareUsingMinMax(LT, VALUE_IN_BOUND, null)).isEqualTo(IN_VALUE_BOUND); + assertThat(bsi.compareUsingMinMax(LTE, VALUE_IN_BOUND, null)).isEqualTo(IN_VALUE_BOUND); + assertThat(bsi.compareUsingMinMax(LT, VALUE_LT_MIN, null)).isEqualTo(EMPTY); + assertThat(bsi.compareUsingMinMax(LTE, VALUE_LT_MIN, null)).isEqualTo(EMPTY); + assertThat(bsi.compareUsingMinMax(LT, VALUE_GT_MAX, null)).isEqualTo(ALL); + assertThat(bsi.compareUsingMinMax(LTE, VALUE_GT_MAX, null)).isEqualTo(ALL); + + // test gt & gte + assertThat(bsi.compareUsingMinMax(GT, VALUE_IN_BOUND, null)).isEqualTo(IN_VALUE_BOUND); + assertThat(bsi.compareUsingMinMax(GTE, VALUE_IN_BOUND, null)).isEqualTo(IN_VALUE_BOUND); + assertThat(bsi.compareUsingMinMax(GT, VALUE_LT_MIN, null)).isEqualTo(ALL); + assertThat(bsi.compareUsingMinMax(GTE, VALUE_LT_MIN, null)).isEqualTo(ALL); + assertThat(bsi.compareUsingMinMax(GT, VALUE_GT_MAX, null)).isEqualTo(EMPTY); + assertThat(bsi.compareUsingMinMax(GTE, VALUE_GT_MAX, null)).isEqualTo(EMPTY); + } + + private int generateNextValue() { + // return a value in the range [1, VALUE_BOUND) + return random.nextInt(VALUE_BOUND) + 1; + } + + private static class Pair { + int index; + Long value; + + public Pair(int index, Long value) { + this.index = index; + this.value = value; + } } }