From 39a0147b92ea2947a12fbf3a5ac95fa325f43a56 Mon Sep 17 00:00:00 2001 From: minghong Date: Fri, 11 Oct 2024 19:43:58 +0800 Subject: [PATCH] adjust min/max for partition key --- .../doris/nereids/stats/StatsCalculator.java | 131 ++++++++++++++++++ .../regression/action/ExplainAction.groovy | 17 +++ .../stats/partition_key_minmax.groovy | 67 +++++++++ 3 files changed, 215 insertions(+) create mode 100644 regression-test/suites/nereids_p0/stats/partition_key_minmax.groovy diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java index 3c70d4cd51868f..e9d2d0a51d90df 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java @@ -18,11 +18,18 @@ package org.apache.doris.nereids.stats; import org.apache.doris.analysis.IntLiteral; +import org.apache.doris.analysis.LiteralExpr; import org.apache.doris.catalog.Column; import org.apache.doris.catalog.Env; +import org.apache.doris.catalog.ListPartitionItem; import org.apache.doris.catalog.MTMV; import org.apache.doris.catalog.OlapTable; +import org.apache.doris.catalog.PartitionItem; +import org.apache.doris.catalog.PartitionKey; +import org.apache.doris.catalog.PartitionType; +import org.apache.doris.catalog.RangePartitionItem; import org.apache.doris.catalog.TableIf; +import org.apache.doris.common.AnalysisException; import org.apache.doris.common.FeConstants; import org.apache.doris.common.Pair; import org.apache.doris.nereids.CascadesContext; @@ -141,11 +148,13 @@ import org.apache.doris.statistics.Statistics; import org.apache.doris.statistics.StatisticsBuilder; import org.apache.doris.statistics.TableStatsMeta; +import org.apache.doris.statistics.util.StatisticsUtil; import com.google.common.base.Preconditions; import com.google.common.collect.ImmutableSet; import com.google.common.collect.Lists; import com.google.common.collect.Maps; +import com.google.common.collect.Range; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; @@ -483,6 +492,9 @@ private Statistics computeOlapScan(OlapScan olapScan) { }); for (SlotReference slot : visibleOutputSlots) { ColumnStatistic cache = getColumnStatsFromPartitionCache(olapScan, slot, selectedPartitionNames); + if (slot.getColumn().isPresent()) { + cache = updateMinMaxForPartitionKey(olapTable, selectedPartitionNames, slot, cache); + } ColumnStatisticBuilder colStatsBuilder = new ColumnStatisticBuilder(cache, selectedPartitionsRowCount); colStatsBuilder.normalizeAvgSizeByte(slot); @@ -508,6 +520,125 @@ private Statistics computeOlapScan(OlapScan olapScan) { return builder.build(); } + private ColumnStatistic updateMinMaxForPartitionKey(OlapTable olapTable, + List selectedPartitionNames, + SlotReference slot, ColumnStatistic cache) { + if (olapTable.getPartitionType() == PartitionType.LIST) { + cache = updateMinMaxForListPartitionKey(olapTable, selectedPartitionNames, slot, cache); + } else if (olapTable.getPartitionType() == PartitionType.RANGE) { + cache = updateMinMaxForTheFirstRangePartitionKey(olapTable, selectedPartitionNames, slot, cache); + } + return cache; + } + + private double convertLegacyLiteralToDouble(LiteralExpr literal) throws AnalysisException { + return StatisticsUtil.convertToDouble(literal.getType(), literal.getStringValue()); + } + + private ColumnStatistic updateMinMaxForListPartitionKey(OlapTable olapTable, + List selectedPartitionNames, + SlotReference slot, ColumnStatistic cache) { + int partitionColumnIdx = olapTable.getPartitionColumns().indexOf(slot.getColumn().get()); + if (partitionColumnIdx != -1) { + try { + LiteralExpr minExpr = null; + LiteralExpr maxExpr = null; + double minValue = 0; + double maxValue = 0; + for (String selectedPartitionName : selectedPartitionNames) { + PartitionItem item = olapTable.getPartitionItemOrAnalysisException( + selectedPartitionName); + if (item instanceof ListPartitionItem) { + ListPartitionItem lp = (ListPartitionItem) item; + for (PartitionKey key : lp.getItems()) { + if (minExpr == null) { + minExpr = key.getKeys().get(partitionColumnIdx); + minValue = convertLegacyLiteralToDouble(minExpr); + maxExpr = key.getKeys().get(partitionColumnIdx); + maxValue = convertLegacyLiteralToDouble(maxExpr); + } else { + double current = convertLegacyLiteralToDouble(key.getKeys().get(partitionColumnIdx)); + if (current > maxValue) { + maxValue = current; + maxExpr = key.getKeys().get(partitionColumnIdx); + } else if (current < minValue) { + minValue = current; + minExpr = key.getKeys().get(partitionColumnIdx); + } + } + } + } + } + if (minExpr != null) { + cache = new ColumnStatisticBuilder(cache) + .setMinExpr(minExpr) + .setMinValue(minValue) + .setMaxExpr(maxExpr) + .setMaxValue(maxValue) + .build(); + } + } catch (AnalysisException e) { + LOG.debug(e.getMessage()); + } + } + return cache; + } + + private ColumnStatistic updateMinMaxForTheFirstRangePartitionKey(OlapTable olapTable, + List selectedPartitionNames, + SlotReference slot, ColumnStatistic cache) { + int partitionColumnIdx = olapTable.getPartitionColumns().indexOf(slot.getColumn().get()); + // for multi partition keys, only the first partition key need to adjust min/max + if (partitionColumnIdx == 0) { + // update partition column min/max by partition info + try { + LiteralExpr minExpr = null; + LiteralExpr maxExpr = null; + double minValue = 0; + double maxValue = 0; + for (String selectedPartitionName : selectedPartitionNames) { + PartitionItem item = olapTable.getPartitionItemOrAnalysisException( + selectedPartitionName); + if (item instanceof RangePartitionItem) { + RangePartitionItem ri = (RangePartitionItem) item; + Range range = ri.getItems(); + PartitionKey upper = range.upperEndpoint(); + PartitionKey lower = range.lowerEndpoint(); + if (maxExpr == null) { + maxExpr = upper.getKeys().get(partitionColumnIdx); + maxValue = convertLegacyLiteralToDouble(maxExpr); + minExpr = lower.getKeys().get(partitionColumnIdx); + minValue = convertLegacyLiteralToDouble(minExpr); + } else { + double currentValue = convertLegacyLiteralToDouble(upper.getKeys() + .get(partitionColumnIdx)); + if (currentValue > maxValue) { + maxValue = currentValue; + maxExpr = upper.getKeys().get(partitionColumnIdx); + } + currentValue = convertLegacyLiteralToDouble(lower.getKeys().get(partitionColumnIdx)); + if (currentValue < minValue) { + minValue = currentValue; + minExpr = lower.getKeys().get(partitionColumnIdx); + } + } + } + } + if (minExpr != null) { + cache = new ColumnStatisticBuilder(cache) + .setMinExpr(minExpr) + .setMinValue(minValue) + .setMaxExpr(maxExpr) + .setMaxValue(maxValue) + .build(); + } + } catch (AnalysisException e) { + LOG.debug(e.getMessage()); + } + } + return cache; + } + @Override public Statistics visitLogicalOlapScan(LogicalOlapScan olapScan, Void context) { return computeOlapScan(olapScan); diff --git a/regression-test/framework/src/main/groovy/org/apache/doris/regression/action/ExplainAction.groovy b/regression-test/framework/src/main/groovy/org/apache/doris/regression/action/ExplainAction.groovy index f7a28c85acd3f0..9fc19c91a4af3f 100644 --- a/regression-test/framework/src/main/groovy/org/apache/doris/regression/action/ExplainAction.groovy +++ b/regression-test/framework/src/main/groovy/org/apache/doris/regression/action/ExplainAction.groovy @@ -31,6 +31,7 @@ class ExplainAction implements SuiteAction { private boolean verbose = false private SuiteContext context private Set containsStrings = new LinkedHashSet<>() + private Set containsAnyStrings = new LinkedHashSet<>() private Set notContainsStrings = new LinkedHashSet<>() private Map multiContainsStrings = new HashMap<>() private String coonType @@ -57,6 +58,10 @@ class ExplainAction implements SuiteAction { containsStrings.add(subString) } + void containsAny(String subString) { + containsAnyStrings.add(subString) + } + void multiContains(String subString, int n) { multiContainsStrings.put(subString, n); } @@ -124,6 +129,18 @@ class ExplainAction implements SuiteAction { throw t } } + boolean any = false; + for (String string : containsAnyStrings) { + if (explainString.contains(string)) { + any = true; + } + } + if (!containsAnyStrings.isEmpty() && !any) { + String msg = ("Explain and check failed, expect contains any '${containsAnyStrings}'," + + " but actual explain string is:\n${explainString}").toString() + def t = new IllegalStateException(msg) + throw t + } } } diff --git a/regression-test/suites/nereids_p0/stats/partition_key_minmax.groovy b/regression-test/suites/nereids_p0/stats/partition_key_minmax.groovy new file mode 100644 index 00000000000000..936df92102efba --- /dev/null +++ b/regression-test/suites/nereids_p0/stats/partition_key_minmax.groovy @@ -0,0 +1,67 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("partition_key_minmax") { + sql """ + drop table if exists rangetable; + create table rangetable (a int, + b int, + c int) + partition by range (a, b) + (partition p1 values [("1", "2"), ("10", "20")), + partition p2 values [("20", "100"), ("30", "200")), + partition p3 values [("300", "-1"), ("400", "1000")) + ) + distributed by hash(a) properties("replication_num"="1"); + + insert into rangetable values (5, 3, 0), (22, 150, 1), (333, 1, 2),(6, 1, 3); + + analyze table rangetable with sync; + """ + explain { + sql """memo plan + select * from rangetable where a < 250; + """ + containsAny("a#0 -> ndv=3.0000, min=1.000000(1), max=30.000000(30), count=3.0000") + containsAny("a#0 -> ndv=4.0000, min=5.000000(5), max=333.000000(333), count=4.0000") + } + + sql """ + drop table if exists listtable; + create table listtable(id int, city varchar(20), value int) + PARTITION BY LIST(id, city) + ( + PARTITION p1_city VALUES IN (("1", "Beijing"), ("1", "Shanghai")), + PARTITION p2_city VALUES IN (("2", "Beijing"), ("2", "Shanghai")), + PARTITION p3_city VALUES IN (("3", "Beijing"), ("3", "Shanghai")) + ) + distributed by hash(id) properties("replication_num"="1"); + + insert into listtable values (1, "Beijing", 0), (2, "Beijing", 0), (3, "Beijing", 0); + + analyze table listtable with sync; + """ + + explain { + sql """ + memo plan select * from listtable where id >=3; + """ + containsAny("id#0 -> ndv=1.0000, min=3.000000(3), max=3.000000(3), count=1.0000,") + containsAny("id#0 -> ndv=3.0000, min=1.000000(1), max=3.000000(3), count=3.0000,") + } +} +