From 054c1eb3c0924a3f495233df64f8ff469433327d Mon Sep 17 00:00:00 2001 From: englefly Date: Thu, 29 Aug 2024 09:59:28 +0800 Subject: [PATCH] set threshold for range-selectivity --- .../doris/nereids/stats/FilterEstimation.java | 5 ++ .../nereids/stats/FilterEstimationTest.java | 64 +++++++++---------- 2 files changed, 35 insertions(+), 34 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java index 2839a8177007946..44b2a87ce8c9548 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java @@ -67,6 +67,9 @@ */ public class FilterEstimation extends ExpressionVisitor { public static final double DEFAULT_INEQUALITY_COEFFICIENT = 0.5; + // Range selectivity is prone to producing outliers, so we add this threshold as a lower limit. + // The threshold is estimated from the selectivity of selecting one month out of ten years.
+ public static final double RANGE_SELECTIVITY_THRESHOULD = 0.01; public static final double DEFAULT_IN_COEFFICIENT = 1.0 / 3.0; public static final double DEFAULT_LIKE_COMPARISON_SELECTIVITY = 0.2; @@ -600,6 +603,8 @@ private Statistics estimateBinaryComparisonFilter(Expression leftExpr, DataType double sel = leftRange.overlapPercentWith(rightRange); if (!(dataType instanceof RangeScalable) && (sel != 0.0 && sel != 1.0)) { sel = DEFAULT_INEQUALITY_COEFFICIENT; + } else if (sel < RANGE_SELECTIVITY_THRESHOULD) { + sel = RANGE_SELECTIVITY_THRESHOULD; } sel = getNotNullSelectivity(leftStats, sel); updatedStatistics = context.statistics.withSel(sel); diff --git a/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java b/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java index d7c44e082cf52e3..4ff6d57e762144e 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java @@ -34,6 +34,7 @@ import org.apache.doris.nereids.trees.expressions.Or; import org.apache.doris.nereids.trees.expressions.SlotReference; import org.apache.doris.nereids.trees.expressions.functions.scalar.Left; +import org.apache.doris.nereids.trees.expressions.literal.BigIntLiteral; import org.apache.doris.nereids.trees.expressions.literal.DateLiteral; import org.apache.doris.nereids.trees.expressions.literal.DoubleLiteral; import org.apache.doris.nereids.trees.expressions.literal.IntegerLiteral; @@ -281,7 +282,7 @@ public void test1() { Statistics stat = new Statistics(1000, slotToColumnStat); FilterEstimation filterEstimation = new FilterEstimation(); Statistics expected = filterEstimation.estimate(or, stat); - Assertions.assertEquals(51, expected.getRowCount(), 1); + Assertions.assertEquals(51.9, expected.getRowCount(), 0.1); } // a > 500 and b < 100 or a > c @@ -1134,39 +1135,6 @@ public void testNumNullsAnd() { 
Assertions.assertEquals(result.getRowCount(), 2.0, 0.01); } - /** - * a = 1 and b is not null - */ - @Test - public void testNumNullsAndTwoCol() { - SlotReference a = new SlotReference("a", IntegerType.INSTANCE); - ColumnStatisticBuilder builderA = new ColumnStatisticBuilder() - .setNdv(2) - .setAvgSizeByte(4) - .setNumNulls(0) - .setMaxValue(2) - .setMinValue(1) - .setCount(10); - IntegerLiteral int1 = new IntegerLiteral(1); - EqualTo equalTo = new EqualTo(a, int1); - SlotReference b = new SlotReference("a", IntegerType.INSTANCE); - ColumnStatisticBuilder builderB = new ColumnStatisticBuilder() - .setNdv(2) - .setAvgSizeByte(4) - .setNumNulls(8) - .setMaxValue(2) - .setMinValue(1) - .setCount(10); - Not isNotNull = new Not(new IsNull(b)); - And and = new And(equalTo, isNotNull); - Statistics stats = new Statistics(10, new HashMap<>()); - stats.addColumnStats(a, builderA.build()); - stats.addColumnStats(b, builderB.build()); - FilterEstimation filterEstimation = new FilterEstimation(); - Statistics result = filterEstimation.estimate(and, stats); - Assertions.assertEquals(result.getRowCount(), 1.0, 0.01); - } - /** * a >= 1 or a <= 2 */ @@ -1365,4 +1333,32 @@ public void testStringRangeColToCol() { Statistics agrtc = new FilterEstimation().estimate(new GreaterThan(a, c), baseStats); Assertions.assertEquals(50, agrtc.getRowCount()); } + + @Test + public void testLargeRange() { + SlotReference a = new SlotReference("a", IntegerType.INSTANCE); + long tenB = 1000000000; + long row = 1600000000; + ColumnStatistic colStats = new ColumnStatisticBuilder() + .setAvgSizeByte(10) + .setCount(row) + .setNdv(10000) + .setMinExpr(new IntLiteral(0)) + .setMinValue(0) + .setMaxExpr(new IntLiteral(tenB)) + .setMaxValue(tenB) + .build(); + Statistics stats = new StatisticsBuilder() + .setRowCount(row) + .putColumnStatistics(a, colStats) + .build(); + Expression less = new LessThan(a, new IntegerLiteral(50000)); + FilterEstimation estimation = new FilterEstimation(); + Statistics out 
= estimation.estimate(less, stats); + Assertions.assertEquals(out.getRowCount(), row * FilterEstimation.RANGE_SELECTIVITY_THRESHOULD); + + Expression greater = new GreaterThan(a, new BigIntLiteral(tenB - 5000L)); + out = estimation.estimate(greater, stats); + Assertions.assertEquals(out.getRowCount(), row * FilterEstimation.RANGE_SELECTIVITY_THRESHOULD); + } }