Skip to content

Commit

Permalink
set threshold for range-selectivity
Browse files Browse the repository at this point in the history
  • Loading branch information
englefly committed Aug 29, 2024
1 parent d7eb343 commit 054c1eb
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 34 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,9 @@
*/
public class FilterEstimation extends ExpressionVisitor<Statistics, EstimationContext> {
public static final double DEFAULT_INEQUALITY_COEFFICIENT = 0.5;
// Range selectivity is prone to producing outliers, so it is clamped to this lower bound.
// The threshold approximates selecting one month out of ten years (1/120, roughly 0.01).
public static final double RANGE_SELECTIVITY_THRESHOULD = 0.01;
public static final double DEFAULT_IN_COEFFICIENT = 1.0 / 3.0;

public static final double DEFAULT_LIKE_COMPARISON_SELECTIVITY = 0.2;
Expand Down Expand Up @@ -600,6 +603,8 @@ private Statistics estimateBinaryComparisonFilter(Expression leftExpr, DataType
double sel = leftRange.overlapPercentWith(rightRange);
if (!(dataType instanceof RangeScalable) && (sel != 0.0 && sel != 1.0)) {
sel = DEFAULT_INEQUALITY_COEFFICIENT;
} else if (sel < RANGE_SELECTIVITY_THRESHOULD) {
sel = RANGE_SELECTIVITY_THRESHOULD;
}
sel = getNotNullSelectivity(leftStats, sel);
updatedStatistics = context.statistics.withSel(sel);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
import org.apache.doris.nereids.trees.expressions.Or;
import org.apache.doris.nereids.trees.expressions.SlotReference;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Left;
import org.apache.doris.nereids.trees.expressions.literal.BigIntLiteral;
import org.apache.doris.nereids.trees.expressions.literal.DateLiteral;
import org.apache.doris.nereids.trees.expressions.literal.DoubleLiteral;
import org.apache.doris.nereids.trees.expressions.literal.IntegerLiteral;
Expand Down Expand Up @@ -281,7 +282,7 @@ public void test1() {
Statistics stat = new Statistics(1000, slotToColumnStat);
FilterEstimation filterEstimation = new FilterEstimation();
Statistics expected = filterEstimation.estimate(or, stat);
Assertions.assertEquals(51, expected.getRowCount(), 1);
Assertions.assertEquals(51.9, expected.getRowCount(), 0.1);
}

// a > 500 and b < 100 or a > c
Expand Down Expand Up @@ -1134,39 +1135,6 @@ public void testNumNullsAnd() {
Assertions.assertEquals(result.getRowCount(), 2.0, 0.01);
}

/**
 * Estimates {@code a = 1 AND b IS NOT NULL} over two separate integer columns.
 *
 * <p>Both columns are described with 10 rows, NDV 2 and values in [1, 2]; column a has
 * no nulls while column b has 8 nulls. The conjunction is expected to be estimated at
 * one row.
 */
@Test
public void testNumNullsAndTwoCol() {
    // Column a: no nulls, two distinct values.
    SlotReference a = new SlotReference("a", IntegerType.INSTANCE);
    ColumnStatisticBuilder builderA = new ColumnStatisticBuilder()
            .setNdv(2)
            .setAvgSizeByte(4)
            .setNumNulls(0)
            .setMaxValue(2)
            .setMinValue(1)
            .setCount(10);
    EqualTo equalTo = new EqualTo(a, new IntegerLiteral(1));

    // Column b: 8 of 10 rows are null. It carries its own name so that the two slots
    // are distinguishable in plans and failure output.
    SlotReference b = new SlotReference("b", IntegerType.INSTANCE);
    ColumnStatisticBuilder builderB = new ColumnStatisticBuilder()
            .setNdv(2)
            .setAvgSizeByte(4)
            .setNumNulls(8)
            .setMaxValue(2)
            .setMinValue(1)
            .setCount(10);
    Not isNotNull = new Not(new IsNull(b));

    And and = new And(equalTo, isNotNull);
    Statistics stats = new Statistics(10, new HashMap<>());
    stats.addColumnStats(a, builderA.build());
    stats.addColumnStats(b, builderB.build());

    FilterEstimation filterEstimation = new FilterEstimation();
    Statistics result = filterEstimation.estimate(and, stats);
    // JUnit's argument order is (expected, actual, delta); keeping it makes failure
    // messages report the right value as "expected".
    Assertions.assertEquals(1.0, result.getRowCount(), 0.01);
}

/**
* a >= 1 or a <= 2
*/
Expand Down Expand Up @@ -1365,4 +1333,32 @@ public void testStringRangeColToCol() {
Statistics agrtc = new FilterEstimation().estimate(new GreaterThan(a, c), baseStats);
Assertions.assertEquals(50, agrtc.getRowCount());
}

/**
 * Checks that very narrow range predicates on a wide column are estimated at
 * {@link FilterEstimation#RANGE_SELECTIVITY_THRESHOULD} of the input rows.
 *
 * <p>Column a is described as spanning [0, 1,000,000,000] over 1.6 billion rows.
 * {@code a < 50000} covers 50,000 of those values and {@code a > max - 5000} covers
 * 5,000, so both fractions are far below the 0.01 threshold.
 */
@Test
public void testLargeRange() {
    SlotReference a = new SlotReference("a", IntegerType.INSTANCE);
    long maxValue = 1_000_000_000L; // one billion: upper end of the column's value range
    long row = 1_600_000_000L;
    ColumnStatistic colStats = new ColumnStatisticBuilder()
            .setAvgSizeByte(10)
            .setCount(row)
            .setNdv(10000)
            .setMinExpr(new IntLiteral(0))
            .setMinValue(0)
            .setMaxExpr(new IntLiteral(maxValue))
            .setMaxValue(maxValue)
            .build();
    Statistics stats = new StatisticsBuilder()
            .setRowCount(row)
            .putColumnStatistics(a, colStats)
            .build();

    // Both predicates below are expected to produce the same, threshold-sized estimate.
    double expectedRows = row * FilterEstimation.RANGE_SELECTIVITY_THRESHOULD;
    // Row counts are doubles; compare with a small tolerance instead of exact equality.
    double delta = 0.01;

    FilterEstimation estimation = new FilterEstimation();

    // Narrow slice at the low end of the range.
    Expression less = new LessThan(a, new IntegerLiteral(50000));
    Statistics out = estimation.estimate(less, stats);
    Assertions.assertEquals(expectedRows, out.getRowCount(), delta);

    // Narrow slice at the high end of the range.
    Expression greater = new GreaterThan(a, new BigIntLiteral(maxValue - 5000L));
    out = estimation.estimate(greater, stats);
    Assertions.assertEquals(expectedRows, out.getRowCount(), delta);
}
}

0 comments on commit 054c1eb

Please sign in to comment.