From 2a258f561a60f12fce498d1c6163bfb313b23e45 Mon Sep 17 00:00:00 2001 From: "zhongjian.xzj" Date: Thu, 12 Sep 2024 19:54:52 +0800 Subject: [PATCH] [opt](nereids) refine operator estimation --- .../doris/nereids/stats/FilterEstimation.java | 96 +++++++++---------- .../doris/statistics/StatisticRange.java | 56 +++++------ .../apache/doris/statistics/Statistics.java | 10 +- 3 files changed, 75 insertions(+), 87 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java index a65a07fea30bdfc..b6d08f0a7b1a831 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java @@ -171,42 +171,32 @@ public Statistics visitComparisonPredicate(ComparisonPredicate cp, EstimationCon ColumnStatistic statsForLeft = ExpressionEstimation.estimate(left, context.statistics); ColumnStatistic statsForRight = ExpressionEstimation.estimate(right, context.statistics); if (!left.isConstant() && !right.isConstant()) { - return calculateWhenBothColumn(cp, context, statsForLeft, statsForRight); + return estimateColumnToColumn(cp, context, statsForLeft, statsForRight); } else { - // For literal, it's max min is same value. 
-            return calculateWhenLiteralRight(cp,
-                    statsForLeft,
-                    statsForRight,
-                    context);
+            return estimateColumnToConstant(cp, statsForLeft, statsForRight, context);
         }
     }
 
-    private Statistics updateLessThanLiteral(Expression leftExpr, DataType dataType, ColumnStatistic statsForLeft,
-            ColumnStatistic statsForRight, EstimationContext context) {
-        StatisticRange rightRange = new StatisticRange(statsForLeft.minValue, statsForLeft.minExpr,
-                statsForRight.maxValue, statsForRight.maxExpr,
-                statsForLeft.ndv, dataType);
-        return estimateBinaryComparisonFilter(leftExpr, dataType,
-                statsForLeft,
-                rightRange, context);
+    private Statistics estimateColumnLessThanConstant(Expression leftExpr, DataType dataType,
+            ColumnStatistic statsForLeft, ColumnStatistic statsForRight, EstimationContext context) {
+        StatisticRange constantRange = new StatisticRange(statsForLeft.minValue, statsForLeft.minExpr,
+                statsForRight.maxValue, statsForRight.maxExpr, statsForLeft.ndv, dataType);
+        return estimateColumnToConstantRange(leftExpr, dataType, statsForLeft, constantRange, context);
     }
 
-    private Statistics updateGreaterThanLiteral(Expression leftExpr, DataType dataType, ColumnStatistic statsForLeft,
-            ColumnStatistic statsForRight, EstimationContext context) {
-        StatisticRange rightRange = new StatisticRange(statsForRight.minValue, statsForRight.minExpr,
-                statsForLeft.maxValue, statsForLeft.maxExpr,
-                statsForLeft.ndv, dataType);
-        return estimateBinaryComparisonFilter(leftExpr, dataType, statsForLeft, rightRange, context);
+    private Statistics estimateColumnGreaterThanConstant(Expression leftExpr, DataType dataType,
+            ColumnStatistic statsForLeft, ColumnStatistic statsForRight, EstimationContext context) {
+        StatisticRange constantRange = new StatisticRange(statsForRight.minValue, statsForRight.minExpr,
+                statsForLeft.maxValue, statsForLeft.maxExpr, statsForLeft.ndv, dataType);
+        return estimateColumnToConstantRange(leftExpr, dataType, statsForLeft, constantRange, context);
     }
 
-    private Statistics
calculateWhenLiteralRight(ComparisonPredicate cp, + private Statistics estimateColumnToConstant(ComparisonPredicate cp, ColumnStatistic statsForLeft, ColumnStatistic statsForRight, EstimationContext context) { if (statsForLeft.isUnKnown) { return context.statistics.withSel(DEFAULT_INEQUALITY_COEFFICIENT); - } - - if (cp instanceof EqualPredicate) { - return estimateEqualTo(cp, statsForLeft, statsForRight, context); + } else if (cp instanceof EqualPredicate) { + return estimateColumnEqualToConstant(cp, statsForLeft, statsForRight, context); } else { // literal Map used to covert dateLiteral back to stringLiteral Map literalMap = new HashMap<>(); @@ -229,12 +219,13 @@ private Statistics calculateWhenLiteralRight(ComparisonPredicate cp, statsForLeftMayConverted = statsForLeftMayConvertedOpt.get(); statsForRightMayConverted = statsForRightMayConvertedOpt.get(); } + Statistics result = null; if (cp instanceof LessThan || cp instanceof LessThanEqual) { - result = updateLessThanLiteral(cp.left(), compareType, statsForLeftMayConverted, + result = estimateColumnLessThanConstant(cp.left(), compareType, statsForLeftMayConverted, statsForRightMayConverted, context); } else if (cp instanceof GreaterThan || cp instanceof GreaterThanEqual) { - result = updateGreaterThanLiteral(cp.left(), compareType, statsForLeftMayConverted, + result = estimateColumnGreaterThanConstant(cp.left(), compareType, statsForLeftMayConverted, statsForRightMayConverted, context); } else { throw new RuntimeException(String.format("Unexpected expression : %s", cp.toSql())); @@ -315,7 +306,7 @@ private Optional tryConvertStrLiteralToDateLiteral(LiteralExpr lite return dt == null ? 
Optional.empty() : Optional.of(dt); } - private Statistics estimateEqualTo(ComparisonPredicate cp, ColumnStatistic statsForLeft, + private Statistics estimateColumnEqualToConstant(ComparisonPredicate cp, ColumnStatistic statsForLeft, ColumnStatistic statsForRight, EstimationContext context) { double selectivity; @@ -351,21 +342,20 @@ private Statistics estimateEqualTo(ComparisonPredicate cp, ColumnStatistic stats return equalStats; } - private Statistics calculateWhenBothColumn(ComparisonPredicate cp, EstimationContext context, + private Statistics estimateColumnToColumn(ComparisonPredicate cp, EstimationContext context, ColumnStatistic statsForLeft, ColumnStatistic statsForRight) { Expression left = cp.left(); Expression right = cp.right(); if (cp instanceof EqualPredicate) { return estimateColumnEqualToColumn(left, statsForLeft, right, statsForRight, cp instanceof NullSafeEqual, context); - } - if (cp instanceof GreaterThan || cp instanceof GreaterThanEqual) { + } else if (cp instanceof GreaterThan || cp instanceof GreaterThanEqual) { return estimateColumnLessThanColumn(right, statsForRight, left, statsForLeft, context); - } - if (cp instanceof LessThan || cp instanceof LessThanEqual) { + } else if (cp instanceof LessThan || cp instanceof LessThanEqual) { return estimateColumnLessThanColumn(left, statsForLeft, right, statsForRight, context); + } else { + return context.statistics; } - return context.statistics; } @Override @@ -580,15 +570,14 @@ public boolean isKeySlot(Expression expr) { } } - private Statistics estimateBinaryComparisonFilter(Expression leftExpr, DataType dataType, ColumnStatistic leftStats, + private Statistics estimateColumnToConstantRange(Expression leftExpr, DataType dataType, ColumnStatistic leftStats, StatisticRange rightRange, EstimationContext context) { - StatisticRange leftRange = - new StatisticRange(leftStats.minValue, leftStats.minExpr, leftStats.maxValue, leftStats.maxExpr, - leftStats.ndv, dataType); - StatisticRange intersectRange 
= leftRange.cover(rightRange); - + StatisticRange leftRange = new StatisticRange(leftStats.minValue, leftStats.minExpr, + leftStats.maxValue, leftStats.maxExpr, leftStats.ndv, dataType); + StatisticRange intersectRange = leftRange.intersect(rightRange); ColumnStatisticBuilder leftColumnStatisticBuilder; Statistics updatedStatistics; + if (intersectRange.isEmpty()) { updatedStatistics = context.statistics.withRowCount(0); leftColumnStatisticBuilder = new ColumnStatisticBuilder(leftStats) @@ -629,21 +618,28 @@ private Statistics estimateColumnEqualToColumn(Expression leftExpr, ColumnStatis Expression rightExpr, ColumnStatistic rightStats, boolean keepNull, EstimationContext context) { StatisticRange leftRange = StatisticRange.from(leftStats, leftExpr.getDataType()); StatisticRange rightRange = StatisticRange.from(rightStats, rightExpr.getDataType()); - StatisticRange leftIntersectRight = leftRange.intersect(rightRange); - StatisticRange intersect = rightRange.intersect(leftIntersectRight); + StatisticRange intersect = leftRange.intersect(rightRange); + ColumnStatisticBuilder intersectBuilder = new ColumnStatisticBuilder(leftStats); intersectBuilder.setNdv(intersect.getDistinctValues()); intersectBuilder.setMinValue(intersect.getLow()); intersectBuilder.setMaxValue(intersect.getHigh()); - double numNull = 0; - if (keepNull) { - numNull = Math.min(leftStats.numNulls, rightStats.numNulls); - } + double numNull = keepNull ? 
Math.min(leftStats.numNulls, rightStats.numNulls) : 0; intersectBuilder.setNumNulls(numNull); - double sel = 1 / StatsMathUtil.nonZeroDivisor(Math.max(leftStats.ndv, rightStats.ndv)); - Statistics updatedStatistics = context.statistics.withSel(sel, numNull); - updatedStatistics.addColumnStats(leftExpr, intersectBuilder.build()); - updatedStatistics.addColumnStats(rightExpr, intersectBuilder.build()); + + double origRowCount = context.statistics.getRowCount(); + double leftNotNullSel = Statistics.getValidSelectivity(1 - (leftStats.numNulls / origRowCount)); + double rightNotNullSel = Statistics.getValidSelectivity(1 - (rightStats.numNulls / origRowCount)); + double notNullSel = 1 / StatsMathUtil.nonZeroDivisor(Math.max(leftStats.ndv, rightStats.ndv)) + * (keepNull ? 1 : leftNotNullSel * rightNotNullSel); + + Statistics updatedStatistics = context.statistics.withSel(notNullSel, numNull); + ColumnStatistic newLeftStatistics = intersectBuilder + .setAvgSizeByte(leftStats.avgSizeByte).build(); + ColumnStatistic newRightStatistics = intersectBuilder + .setAvgSizeByte(rightStats.avgSizeByte).build(); + updatedStatistics.addColumnStats(leftExpr, newLeftStatistics); + updatedStatistics.addColumnStats(rightExpr, newRightStatistics); context.addKeyIfSlot(leftExpr); context.addKeyIfSlot(rightExpr); diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticRange.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticRange.java index ca9735b56654b12..b7639186c403e5a 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticRange.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticRange.java @@ -158,8 +158,9 @@ public StatisticRange intersect(StatisticRange other) { double newHigh = smallerHigh.first; LiteralExpr newHighExpr = smallerHigh.second; if (newLow <= newHigh) { + double distinctValues = overlappingDistinctValues(other); return new StatisticRange(newLow, newLowExpr, newHigh, newHighExpr, - 
overlappingDistinctValues(other), dataType); + distinctValues, dataType); } return empty(dataType); } @@ -178,33 +179,6 @@ public Pair maxPair(double r1, LiteralExpr e1, double r2, L return Pair.of(r2, e2); } - public StatisticRange cover(StatisticRange other) { - StatisticRange resultRange; - Pair biggerLow = maxPair(low, lowExpr, other.low, other.lowExpr); - double newLow = biggerLow.first; - LiteralExpr newLowExpr = biggerLow.second; - Pair smallerHigh = minPair(high, highExpr, other.high, other.highExpr); - double newHigh = smallerHigh.first; - LiteralExpr newHighExpr = smallerHigh.second; - - if (newLow <= newHigh) { - double overlapPercentOfLeft = overlapPercentWith(other); - double overlapDistinctValuesLeft = overlapPercentOfLeft * distinctValues; - double coveredDistinctValues = minExcludeNaN(distinctValues, overlapDistinctValuesLeft); - if (this.isBothInfinite() && other.isOneSideInfinite()) { - resultRange = new StatisticRange(newLow, newLowExpr, newHigh, newHighExpr, - distinctValues * INFINITE_TO_INFINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR, - dataType); - } else { - resultRange = new StatisticRange(newLow, newLowExpr, newHigh, newHighExpr, coveredDistinctValues, - dataType); - } - } else { - resultRange = empty(dataType); - } - return resultRange; - } - public StatisticRange union(StatisticRange other) { double overlapPercentThis = this.overlapPercentWith(other); double overlapPercentOther = other.overlapPercentWith(this); @@ -220,10 +194,28 @@ public StatisticRange union(StatisticRange other) { } private double overlappingDistinctValues(StatisticRange other) { - double overlapPercentOfLeft = overlapPercentWith(other); - double overlapPercentOfRight = other.overlapPercentWith(this); - double overlapDistinctValuesLeft = overlapPercentOfLeft * distinctValues; - double overlapDistinctValuesRight = overlapPercentOfRight * other.distinctValues; + double overlapDistinctValuesLeft; + double overlapDistinctValuesRight; + // FIXME: what does it mean? 
+ if (this.isBothInfinite() && other.isOneSideInfinite()) { + overlapDistinctValuesRight = distinctValues * INFINITE_TO_INFINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR; + } else if (Math.abs(other.low - other.high) < 1e-6) { + // other is constant + overlapDistinctValuesRight = distinctValues; + } else { + double overlapPercentOfRight = other.overlapPercentWith(this); + overlapDistinctValuesRight = overlapPercentOfRight * other.distinctValues; + } + + if (other.isBothInfinite() && this.isOneSideInfinite()) { + overlapDistinctValuesLeft = distinctValues * INFINITE_TO_INFINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR; + } else if (Math.abs(this.low - this.high) < 1e-6) { + overlapDistinctValuesLeft = distinctValues; + } else { + double overlapPercentOfLeft = this.overlapPercentWith(other); + overlapDistinctValuesLeft = overlapPercentOfLeft * distinctValues; + } + return minExcludeNaN(overlapDistinctValuesLeft, overlapDistinctValuesRight); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/Statistics.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/Statistics.java index 6883eb0b54208a5..de78e4ae813920b 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/Statistics.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/Statistics.java @@ -126,12 +126,12 @@ public Statistics withSel(double sel) { return withSel(sel, 0); } - public Statistics withSel(double sel, double numNull) { - sel = StatsMathUtil.minNonNaN(sel, 1); + public Statistics withSel(double notNullSel, double numNull) { + notNullSel = StatsMathUtil.minNonNaN(notNullSel, 1); if (Double.isNaN(rowCount)) { return this; } - double newCount = rowCount * sel + numNull; + double newCount = rowCount * notNullSel + numNull; return new Statistics(newCount, widthInJoinCluster, new HashMap<>(expressionToColumnStats)); } @@ -236,8 +236,8 @@ public static Statistics zero(Statistics statistics) { return zero; } - public static double getValidSelectivity(double nullSel) 
{ - return nullSel < 0 ? 0 : (nullSel > 1 ? 1 : nullSel); + public static double getValidSelectivity(double selectivity) { + return selectivity < 0 ? 0 : (selectivity > 1 ? 1 : selectivity); } /**