Skip to content

Commit

Permalink
[opt](nereids) refine operator estimation
Browse files Browse the repository at this point in the history
  • Loading branch information
zhongjian.xzj authored and zhongjian.xzj committed Sep 19, 2024
1 parent 83f899b commit a7eb34b
Show file tree
Hide file tree
Showing 10 changed files with 455 additions and 337 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -409,7 +409,7 @@ public Void visitLogicalJoin(LogicalJoin<? extends Plan, ? extends Plan> join,
if (joinType.isInnerJoin() || joinType.isCrossJoin()) {
return visit(join, context);
} else if ((joinType.isLeftJoin()
|| joinType.isLefSemiJoin()
|| joinType.isLeftSemiJoin()
|| joinType.isLeftAntiJoin()) && useLeft) {
return visit(join.left(), context);
} else if ((joinType.isRightJoin()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -242,9 +242,9 @@ public ColumnStatistic visitLiteral(Literal literal, Statistics context) {
return new ColumnStatisticBuilder()
.setMaxValue(literalVal)
.setMinValue(literalVal)
.setNdv(1)
.setNumNulls(1)
.setAvgSizeByte(1)
.setNdv(literal.isNullLiteral() ? 0 : 1)
.setNumNulls(literal.isNullLiteral() ? 1 : 0)
.setAvgSizeByte(literal.getDataType().width())
.setMinExpr(literal.toLegacyLiteral())
.setMaxExpr(literal.toLegacyLiteral())
.build();
Expand Down Expand Up @@ -361,10 +361,11 @@ public ColumnStatistic visitMax(Max max, Statistics context) {

@Override
public ColumnStatistic visitCount(Count count, Statistics context) {
double width = count.getDataType().width();
// for scalar agg, ndv and row count will be normalized by 1 in StatsCalculator.computeAggregate()
return new ColumnStatisticBuilder(ColumnStatistic.UNKNOWN).setCount(context.getRowCount())
.setAvgSizeByte(width).build();
return new ColumnStatisticBuilder(ColumnStatistic.UNKNOWN)
.setCount(context.getRowCount())
.setAvgSizeByte(count.getDataType().width())
.build();
}

// TODO: return a proper estimated stat after supports histogram
Expand Down Expand Up @@ -580,7 +581,7 @@ public ColumnStatistic visitToDate(ToDate toDate, Statistics context) {
ColumnStatisticBuilder columnStatisticBuilder = new ColumnStatisticBuilder(childColumnStats)
.setAvgSizeByte(toDate.getDataType().width())
.setDataSize(toDate.getDataType().width() * context.getRowCount());
if (childColumnStats.minOrMaxIsInf()) {
if (childColumnStats.isMinMaxInvalid()) {
return columnStatisticBuilder.build();
}
double minValue;
Expand Down Expand Up @@ -611,7 +612,7 @@ public ColumnStatistic visitToDays(ToDays toDays, Statistics context) {
ColumnStatisticBuilder columnStatisticBuilder = new ColumnStatisticBuilder(childColumnStats)
.setAvgSizeByte(toDays.getDataType().width())
.setDataSize(toDays.getDataType().width() * context.getRowCount());
if (childColumnStats.minOrMaxIsInf()) {
if (childColumnStats.isMinMaxInvalid()) {
return columnStatisticBuilder.build();
}
double minValue;
Expand Down

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
import org.apache.doris.nereids.trees.plans.algebra.EmptyRelation;
import org.apache.doris.nereids.trees.plans.algebra.Filter;
import org.apache.doris.nereids.trees.plans.algebra.Generate;
import org.apache.doris.nereids.trees.plans.algebra.Join;
import org.apache.doris.nereids.trees.plans.algebra.Limit;
import org.apache.doris.nereids.trees.plans.algebra.OlapScan;
import org.apache.doris.nereids.trees.plans.algebra.PartitionTopN;
Expand Down Expand Up @@ -236,7 +237,7 @@ public static void estimate(GroupExpression groupExpression, CascadesContext con
private void estimate() {
Plan plan = groupExpression.getPlan();
Statistics newStats = plan.accept(this, null);
newStats.enforceValid();
newStats.normalizeColumnStatistics();

// We ensure that the rowCount remains unchanged in order to make the cost of each plan comparable.
if (groupExpression.getOwnerGroup().getStatistics() == null) {
Expand Down Expand Up @@ -404,7 +405,9 @@ private Statistics computeOlapScan(OlapScan olapScan) {
if (derivedStats.findColumnStatistics(slot) == null) {
derivedStats.addColumnStats(slot,
new ColumnStatisticBuilder(ColumnStatistic.UNKNOWN)
.setCount(derivedRowCount).build());
.setCount(derivedRowCount)
//.setAvgSizeByte(slot.getDataType().width())
.build());
}
}
return derivedStats;
Expand All @@ -431,7 +434,10 @@ private Statistics computeOlapScan(OlapScan olapScan) {
// get row count from any visible slotReference's colStats
for (Slot slot : ((Plan) olapScan).getOutput()) {
builder.putColumnStatistics(slot,
new ColumnStatisticBuilder(ColumnStatistic.UNKNOWN).setCount(tableRowCount).build());
new ColumnStatisticBuilder(ColumnStatistic.UNKNOWN)
.setCount(tableRowCount)
//.setAvgSizeByte(slot.getDataType().width())
.build());
}
setHasUnknownColStatsInStatementContext();
return builder.setRowCount(tableRowCount).build();
Expand Down Expand Up @@ -576,8 +582,9 @@ public Statistics visitLogicalPartitionTopN(LogicalPartitionTopN<? extends Plan>

@Override
public Statistics visitLogicalJoin(LogicalJoin<? extends Plan, ? extends Plan> join, Void context) {
Statistics joinStats = JoinEstimation.estimate(groupExpression.childStatistics(0),
groupExpression.childStatistics(1), join);
Statistics joinStats = computeJoin(join);
// NOTE: physical operator visiting doesn't need the following
// logic which will ONLY be used in no-stats estimation.
joinStats = new StatisticsBuilder(joinStats).setWidthInJoinCluster(
groupExpression.childStatistics(0).getWidthInJoinCluster()
+ groupExpression.childStatistics(1).getWidthInJoinCluster()).build();
Expand Down Expand Up @@ -721,16 +728,14 @@ public Statistics visitPhysicalDeferMaterializeTopN(PhysicalDeferMaterializeTopN
@Override
public Statistics visitPhysicalHashJoin(
PhysicalHashJoin<? extends Plan, ? extends Plan> hashJoin, Void context) {
return JoinEstimation.estimate(groupExpression.childStatistics(0),
groupExpression.childStatistics(1), hashJoin);
return computeJoin(hashJoin);
}

@Override
public Statistics visitPhysicalNestedLoopJoin(
PhysicalNestedLoopJoin<? extends Plan, ? extends Plan> nestedLoopJoin,
Void context) {
return JoinEstimation.estimate(groupExpression.childStatistics(0),
groupExpression.childStatistics(1), nestedLoopJoin);
return computeJoin(nestedLoopJoin);
}

// TODO: We should subtract those pruned column, and consider the expression transformations in the node.
Expand Down Expand Up @@ -847,7 +852,7 @@ private Statistics computeFilter(Filter filter) {
}
builder.setRowCount(isNullStats.getRowCount());
stats = builder.build();
stats.enforceValid();
stats.normalizeColumnStatistics();
}
}
}
Expand Down Expand Up @@ -919,7 +924,7 @@ false, getTotalColumnStatisticMap(), false,

newStats = ((Plan) newJoin).accept(statsCalculator, null);
}
newStats.enforceValid();
newStats.normalizeColumnStatistics();

double selectivity = Statistics.getValidSelectivity(
newStats.getRowCount() / (leftRowCount * rightRowCount));
Expand Down Expand Up @@ -1070,6 +1075,11 @@ private Statistics computeCatalogRelation(CatalogRelation catalogRelation) {
return builder.setRowCount(tableRowCount).build();
}

private Statistics computeJoin(Join join) {
return JoinEstimation.estimate(groupExpression.childStatistics(0),
groupExpression.childStatistics(1), join);
}

private Statistics computeTopN(TopN topN) {
Statistics stats = groupExpression.childStatistics(0);
return stats.withRowCountAndEnforceValid(Math.min(stats.getRowCount(), topN.getLimit()));
Expand Down Expand Up @@ -1173,7 +1183,7 @@ private Statistics computeAggregate(Aggregate<? extends Plan> aggregate) {
slotToColumnStats.put(outputExpression.toSlot(), columnStat);
}
Statistics aggOutputStats = new Statistics(rowCount, 1, slotToColumnStats);
aggOutputStats.enforceValid();
aggOutputStats.normalizeColumnStatistics();
return aggOutputStats;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,4 +58,9 @@ public static double divide(double a, double b) {
}
return a / nonZeroDivisor(b);
}

// TODO: add more protection at other stats estimation
public static double normalizeRowCountOrNdv(double value) {
return value >= 0 && value < 1 ? 1 : value;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ public final boolean isLeftAntiJoin() {
return this == LEFT_ANTI_JOIN;
}

public final boolean isLefSemiJoin() {
public final boolean isLeftSemiJoin() {
return this == LEFT_SEMI_JOIN;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -356,7 +356,7 @@ public static ColumnStatistic fromJson(String statJson) {
);
}

public boolean minOrMaxIsInf() {
public boolean isMinMaxInvalid() {
return Double.isInfinite(maxValue) || Double.isInfinite(minValue);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -158,8 +158,8 @@ public StatisticRange intersect(StatisticRange other) {
double newHigh = smallerHigh.first;
LiteralExpr newHighExpr = smallerHigh.second;
if (newLow <= newHigh) {
return new StatisticRange(newLow, newLowExpr, newHigh, newHighExpr,
overlappingDistinctValues(other), dataType);
double distinctValues = overlappingDistinctValues(other);
return new StatisticRange(newLow, newLowExpr, newHigh, newHighExpr, distinctValues, dataType);
}
return empty(dataType);
}
Expand All @@ -178,33 +178,6 @@ public Pair<Double, LiteralExpr> maxPair(double r1, LiteralExpr e1, double r2, L
return Pair.of(r2, e2);
}

public StatisticRange cover(StatisticRange other) {
StatisticRange resultRange;
Pair<Double, LiteralExpr> biggerLow = maxPair(low, lowExpr, other.low, other.lowExpr);
double newLow = biggerLow.first;
LiteralExpr newLowExpr = biggerLow.second;
Pair<Double, LiteralExpr> smallerHigh = minPair(high, highExpr, other.high, other.highExpr);
double newHigh = smallerHigh.first;
LiteralExpr newHighExpr = smallerHigh.second;

if (newLow <= newHigh) {
double overlapPercentOfLeft = overlapPercentWith(other);
double overlapDistinctValuesLeft = overlapPercentOfLeft * distinctValues;
double coveredDistinctValues = minExcludeNaN(distinctValues, overlapDistinctValuesLeft);
if (this.isBothInfinite() && other.isOneSideInfinite()) {
resultRange = new StatisticRange(newLow, newLowExpr, newHigh, newHighExpr,
distinctValues * INFINITE_TO_INFINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR,
dataType);
} else {
resultRange = new StatisticRange(newLow, newLowExpr, newHigh, newHighExpr, coveredDistinctValues,
dataType);
}
} else {
resultRange = empty(dataType);
}
return resultRange;
}

public StatisticRange union(StatisticRange other) {
double overlapPercentThis = this.overlapPercentWith(other);
double overlapPercentOther = other.overlapPercentWith(this);
Expand All @@ -220,10 +193,27 @@ public StatisticRange union(StatisticRange other) {
}

private double overlappingDistinctValues(StatisticRange other) {
double overlapPercentOfLeft = overlapPercentWith(other);
double overlapPercentOfRight = other.overlapPercentWith(this);
double overlapDistinctValuesLeft = overlapPercentOfLeft * distinctValues;
double overlapDistinctValuesRight = overlapPercentOfRight * other.distinctValues;
double overlapDistinctValuesLeft;
double overlapDistinctValuesRight;
if (this.isInfinite() || other.isInfinite()) {
overlapDistinctValuesRight = distinctValues * INFINITE_TO_INFINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR;
} else if (Math.abs(other.low - other.high) < 1e-6) {
// other is constant
overlapDistinctValuesRight = distinctValues;
} else {
double overlapPercentOfRight = other.overlapPercentWith(this);
overlapDistinctValuesRight = overlapPercentOfRight * other.distinctValues;
}

if (other.isInfinite() || this.isInfinite()) {
overlapDistinctValuesLeft = distinctValues * INFINITE_TO_INFINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR;
} else if (Math.abs(this.low - this.high) < 1e-6) {
overlapDistinctValuesLeft = distinctValues;
} else {
double overlapPercentOfLeft = this.overlapPercentWith(other);
overlapDistinctValuesLeft = overlapPercentOfLeft * distinctValues;
}

return minExcludeNaN(overlapDistinctValuesLeft, overlapDistinctValuesRight);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -98,40 +98,49 @@ public Statistics withExpressionToColumnStats(Map<Expression, ColumnStatistic> e
*/
public Statistics withRowCountAndEnforceValid(double rowCount) {
Statistics statistics = new Statistics(rowCount, widthInJoinCluster, expressionToColumnStats);
statistics.enforceValid();
statistics.normalizeColumnStatistics();
return statistics;
}

public void enforceValid() {
// IMPORTANT: it is suggested to do this action after each estimation critical visiting,
// since statistics will have serious deviation during the partial deriving.
public void normalizeColumnStatistics() {
normalizeColumnStatistics(this.rowCount);
}

public void normalizeColumnStatistics(double inputRowCount) {
for (Entry<Expression, ColumnStatistic> entry : expressionToColumnStats.entrySet()) {
ColumnStatistic columnStatistic = entry.getValue();
if (!checkColumnStatsValid(columnStatistic) && !columnStatistic.isUnKnown()) {
double ndv = Math.min(columnStatistic.ndv, rowCount);
double ratio = rowCount / inputRowCount;
// the following columnStatistic.isUnKnown() judgment is loop inside since current doris
// supports partial stats deriving, i.e, allowing part of tables have stats and other parts don't,
// or part of columns have stats but other parts don't, especially join and filter estimation.
if (!checkColumnStatsValid(columnStatistic, inputRowCount) && !columnStatistic.isUnKnown()) {
ColumnStatisticBuilder columnStatisticBuilder = new ColumnStatisticBuilder(columnStatistic);
columnStatisticBuilder.setNdv(ndv);
columnStatisticBuilder.setNumNulls(Math.min(columnStatistic.numNulls, rowCount - ndv));
columnStatisticBuilder.setCount(rowCount);
double ndv = Math.min(columnStatistic.ndv, inputRowCount);
double numNulls = Math.min(columnStatistic.numNulls * ratio, inputRowCount - ndv);
columnStatisticBuilder.setNumNulls(numNulls);
columnStatisticBuilder.setNdv(Math.min(ndv, rowCount - numNulls));
columnStatistic = columnStatisticBuilder.build();
expressionToColumnStats.put(entry.getKey(), columnStatistic);
}
}
}

public boolean checkColumnStatsValid(ColumnStatistic columnStatistic) {
return columnStatistic.ndv <= rowCount
&& columnStatistic.numNulls <= rowCount - columnStatistic.ndv;
public boolean checkColumnStatsValid(ColumnStatistic columnStatistic, double rowCount) {
return columnStatistic.ndv <= rowCount && columnStatistic.numNulls <= rowCount - columnStatistic.ndv;
}

public Statistics withSel(double sel) {
return withSel(sel, 0);
}

public Statistics withSel(double sel, double numNull) {
sel = StatsMathUtil.minNonNaN(sel, 1);
public Statistics withSel(double notNullSel, double numNull) {
notNullSel = StatsMathUtil.minNonNaN(notNullSel, 1);
if (Double.isNaN(rowCount)) {
return this;
}
double newCount = rowCount * sel + numNull;
double newCount = rowCount * notNullSel + numNull;
return new Statistics(newCount, widthInJoinCluster, new HashMap<>(expressionToColumnStats));
}

Expand Down Expand Up @@ -236,8 +245,8 @@ public static Statistics zero(Statistics statistics) {
return zero;
}

public static double getValidSelectivity(double nullSel) {
return nullSel < 0 ? 0 : (nullSel > 1 ? 1 : nullSel);
public static double getValidSelectivity(double selectivity) {
return selectivity < 0 ? 0 : (selectivity > 1 ? 1 : selectivity);
}

/**
Expand Down Expand Up @@ -272,24 +281,6 @@ public int getWidthInJoinCluster() {
return widthInJoinCluster;
}

public Statistics normalizeByRatio(double originRowCount) {
if (rowCount >= originRowCount || rowCount <= 0) {
return this;
}
StatisticsBuilder builder = new StatisticsBuilder(this);
double ratio = rowCount / originRowCount;
for (Entry<Expression, ColumnStatistic> entry : expressionToColumnStats.entrySet()) {
ColumnStatistic colStats = entry.getValue();
if (colStats.numNulls != 0 || colStats.ndv > rowCount) {
ColumnStatisticBuilder colStatsBuilder = new ColumnStatisticBuilder(colStats);
colStatsBuilder.setNumNulls(colStats.numNulls * ratio);
colStatsBuilder.setNdv(Math.min(rowCount - colStatsBuilder.getNumNulls(), colStats.ndv));
builder.putColumnStatistics(entry.getKey(), colStatsBuilder.build());
}
}
return builder.build();
}

public double getDeltaRowCount() {
return deltaRowCount;
}
Expand Down

0 comments on commit a7eb34b

Please sign in to comment.