Skip to content

Commit

Permalink
[fix](nereids) derive column stats for 'expr and A is not null' (apac…
Browse files Browse the repository at this point in the history
…he#37235) (apache#37498)

pick from apache#37235 
The algorithm for computing stats for an "expr1 and expr2" predicate is as
follows:
1. Compute the output stats of expr1 based on the input stats. The resulting
stats are denoted by leftStats.
2. Compute the stats of expr2 based on leftStats. After step 1 (and before
step 2), leftStats should be normalized to avoid abnormal cases, such as
ndv > rowCount or numNulls > rowCount.

Issue Number: close #xxx

<!--Describe your changes.-->

## Proposed changes

Issue Number: close #xxx

<!--Describe your changes.-->
  • Loading branch information
englefly authored Jul 9, 2024
1 parent 5247e0f commit 9b075bc
Show file tree
Hide file tree
Showing 5 changed files with 160 additions and 108 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ public Statistics visitCompoundPredicate(CompoundPredicate predicate, Estimation
Expression leftExpr = predicate.child(0);
Expression rightExpr = predicate.child(1);
Statistics leftStats = leftExpr.accept(this, context);
leftStats = leftStats.normalizeByRatio(context.statistics.getRowCount());
Statistics andStats = rightExpr.accept(this,
new EstimationContext(leftStats));
if (predicate instanceof And) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -204,4 +204,22 @@ public String detail(String prefix) {
/**
 * Returns the current value of the {@code widthInJoinCluster} field.
 *
 * @return the stored {@code widthInJoinCluster}
 */
public int getWidthInJoinCluster() {
    return this.widthInJoinCluster;
}

/**
 * Rescales per-column statistics so that they stay consistent with this
 * object's {@code rowCount} after the row count has shrunk from
 * {@code originRowCount}.
 *
 * <p>For every column whose {@code numNulls} is non-zero or whose {@code ndv}
 * exceeds {@code rowCount}, {@code numNulls} is multiplied by
 * {@code rowCount / originRowCount} and {@code ndv} is capped at the number of
 * remaining non-null rows. Both values are clamped so that the result always
 * satisfies {@code 0 <= ndv} and {@code numNulls <= rowCount}, even when the
 * incoming column statistics were already inconsistent (for example
 * {@code numNulls > originRowCount}).
 *
 * @param originRowCount the row count the column statistics were derived for
 * @return {@code this} when {@code rowCount >= originRowCount} or
 *         {@code rowCount <= 0} (nothing to normalize); otherwise a new
 *         {@code Statistics} built from this one with the adjusted columns
 */
public Statistics normalizeByRatio(double originRowCount) {
    if (rowCount >= originRowCount || rowCount <= 0) {
        return this;
    }
    StatisticsBuilder builder = new StatisticsBuilder(this);
    // Here 0 < rowCount < originRowCount, so the ratio lies strictly in (0, 1).
    double ratio = rowCount / originRowCount;
    for (Entry<Expression, ColumnStatistic> entry : expressionToColumnStats.entrySet()) {
        ColumnStatistic colStats = entry.getValue();
        if (colStats.numNulls != 0 || colStats.ndv > rowCount) {
            ColumnStatisticBuilder colStatsBuilder = new ColumnStatisticBuilder(colStats);
            // Never report more nulls than there are rows left.
            double scaledNumNulls = Math.min(colStats.numNulls * ratio, rowCount);
            // Distinct values can only come from non-null rows; never go negative.
            double maxNdv = Math.max(0.0, rowCount - scaledNumNulls);
            colStatsBuilder.setNumNulls(scaledNumNulls);
            colStatsBuilder.setNdv(Math.min(maxNdv, colStats.ndv));
            builder.putColumnStatistics(entry.getKey(), colStatsBuilder.build());
        }
    }
    return builder.build();
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,7 @@ public void test1() {
Statistics stat = new Statistics(1000, slotToColumnStat);
FilterEstimation filterEstimation = new FilterEstimation();
Statistics expected = filterEstimation.estimate(or, stat);
Assertions.assertEquals(51.9, expected.getRowCount(), 0.1);
Assertions.assertEquals(51, expected.getRowCount(), 1);
}

// a > 500 and b < 100 or a > c
Expand Down Expand Up @@ -1059,6 +1059,39 @@ public void testNumNullsAnd() {
Assertions.assertEquals(result.getRowCount(), 2.0, 0.01);
}

/**
 * a = 1 and b is not null
 *
 * <p>The input has 10 rows. Column a has ndv 2 and no nulls; column b has
 * ndv 2 and 8 nulls. The estimated row count is expected to be 1.
 * NOTE(review): presumably "a = 1" keeps half of the rows (5), b's 8 nulls
 * are then scaled by the same ratio to 4, and "b is not null" leaves
 * 5 - 4 = 1 row -- confirm against FilterEstimation.
 */
@Test
public void testNumNullsAndTwoCol() {
    SlotReference a = new SlotReference("a", IntegerType.INSTANCE);
    ColumnStatisticBuilder builderA = new ColumnStatisticBuilder()
            .setNdv(2)
            .setAvgSizeByte(4)
            .setNumNulls(0)
            .setMaxValue(2)
            .setMinValue(1)
            .setCount(10);
    IntegerLiteral int1 = new IntegerLiteral(1);
    EqualTo equalTo = new EqualTo(a, int1);
    // Second column: named "b" so it matches the predicate described above.
    SlotReference b = new SlotReference("b", IntegerType.INSTANCE);
    ColumnStatisticBuilder builderB = new ColumnStatisticBuilder()
            .setNdv(2)
            .setAvgSizeByte(4)
            .setNumNulls(8)
            .setMaxValue(2)
            .setMinValue(1)
            .setCount(10);
    Not isNotNull = new Not(new IsNull(b));
    And and = new And(equalTo, isNotNull);
    Statistics stats = new Statistics(10, new HashMap<>());
    stats.addColumnStats(a, builderA.build());
    stats.addColumnStats(b, builderB.build());
    FilterEstimation filterEstimation = new FilterEstimation();
    Statistics result = filterEstimation.estimate(and, stats);
    // JUnit argument order is (expected, actual, delta).
    Assertions.assertEquals(1.0, result.getRowCount(), 0.01);
}

/**
* a >= 1 or a <= 2
*/
Expand Down
102 changes: 51 additions & 51 deletions regression-test/data/nereids_tpcds_shape_sf100_p0/rf_prune/query64.out
Original file line number Diff line number Diff line change
Expand Up @@ -11,39 +11,64 @@ PhysicalCteAnchor ( cteId=CTEId#1 )
----------------PhysicalProject
------------------hashJoin[INNER_JOIN] hashCondition=((customer.c_first_sales_date_sk = d2.d_date_sk)) otherCondition=()
--------------------PhysicalProject
----------------------hashJoin[INNER_JOIN] hashCondition=((store_sales.ss_customer_sk = customer.c_customer_sk)) otherCondition=(( not (cd_marital_status = cd_marital_status)))
----------------------hashJoin[INNER_JOIN] hashCondition=((store_sales.ss_customer_sk = customer.c_customer_sk)) otherCondition=(( not (cd_marital_status = cd_marital_status))) build RFs:RF17 ss_customer_sk->[c_customer_sk]
------------------------PhysicalDistribute[DistributionSpecHash]
--------------------------PhysicalProject
----------------------------hashJoin[INNER_JOIN] hashCondition=((store_sales.ss_promo_sk = promotion.p_promo_sk)) otherCondition=()
------------------------------PhysicalProject
--------------------------------hashJoin[INNER_JOIN] hashCondition=((store_sales.ss_addr_sk = ad1.ca_address_sk)) otherCondition=() build RFs:RF15 ss_addr_sk->[ca_address_sk]
----------------------------------PhysicalProject
------------------------------------PhysicalOlapScan[customer_address] apply RFs: RF15
----------------------------------PhysicalDistribute[DistributionSpecHash]
------------------------------------PhysicalProject
--------------------------------------hashJoin[INNER_JOIN] hashCondition=((store_sales.ss_item_sk = store_returns.sr_item_sk) and (store_sales.ss_ticket_number = store_returns.sr_ticket_number)) otherCondition=() build RFs:RF13 ss_item_sk->[sr_item_sk];RF14 ss_ticket_number->[sr_ticket_number]
----------------------------------------PhysicalProject
------------------------------------------PhysicalOlapScan[store_returns] apply RFs: RF13 RF14
----------------------------------------PhysicalDistribute[DistributionSpecHash]
----------------------------hashJoin[INNER_JOIN] hashCondition=((customer.c_current_addr_sk = ad2.ca_address_sk)) otherCondition=()
------------------------------PhysicalDistribute[DistributionSpecHash]
--------------------------------PhysicalProject
----------------------------------hashJoin[INNER_JOIN] hashCondition=((customer.c_current_cdemo_sk = cd2.cd_demo_sk)) otherCondition=()
------------------------------------PhysicalDistribute[DistributionSpecHash]
--------------------------------------PhysicalProject
----------------------------------------hashJoin[INNER_JOIN] hashCondition=((customer.c_current_hdemo_sk = hd2.hd_demo_sk)) otherCondition=()
------------------------------------------PhysicalProject
--------------------------------------------hashJoin[INNER_JOIN] hashCondition=((store_sales.ss_cdemo_sk = cd1.cd_demo_sk)) otherCondition=() build RFs:RF12 ss_cdemo_sk->[cd_demo_sk]
----------------------------------------------PhysicalDistribute[DistributionSpecHash]
--------------------------------------------PhysicalOlapScan[customer] apply RFs: RF17
------------------------------------------PhysicalDistribute[DistributionSpecReplicated]
--------------------------------------------PhysicalProject
----------------------------------------------hashJoin[INNER_JOIN] hashCondition=((hd2.hd_income_band_sk = ib2.ib_income_band_sk)) otherCondition=()
------------------------------------------------PhysicalProject
--------------------------------------------------PhysicalOlapScan[customer_demographics] apply RFs: RF12
----------------------------------------------PhysicalDistribute[DistributionSpecHash]
------------------------------------------------hashJoin[INNER_JOIN] hashCondition=((store_sales.ss_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF11 i_item_sk->[cr_item_sk,cs_item_sk,ss_item_sk]
--------------------------------------------------PhysicalOlapScan[household_demographics]
------------------------------------------------PhysicalDistribute[DistributionSpecReplicated]
--------------------------------------------------PhysicalProject
----------------------------------------------------PhysicalOlapScan[income_band]
------------------------------------PhysicalDistribute[DistributionSpecHash]
--------------------------------------PhysicalProject
----------------------------------------PhysicalOlapScan[customer_demographics]
------------------------------PhysicalDistribute[DistributionSpecHash]
--------------------------------PhysicalProject
----------------------------------PhysicalOlapScan[customer_address]
------------------------PhysicalDistribute[DistributionSpecHash]
--------------------------PhysicalProject
----------------------------hashJoin[INNER_JOIN] hashCondition=((store_sales.ss_item_sk = store_returns.sr_item_sk) and (store_sales.ss_ticket_number = store_returns.sr_ticket_number)) otherCondition=() build RFs:RF11 ss_item_sk->[sr_item_sk];RF12 ss_ticket_number->[sr_ticket_number]
------------------------------PhysicalProject
--------------------------------PhysicalOlapScan[store_returns] apply RFs: RF11 RF12
------------------------------PhysicalDistribute[DistributionSpecHash]
--------------------------------PhysicalProject
----------------------------------hashJoin[INNER_JOIN] hashCondition=((store_sales.ss_cdemo_sk = cd1.cd_demo_sk)) otherCondition=() build RFs:RF10 ss_cdemo_sk->[cd_demo_sk]
------------------------------------PhysicalDistribute[DistributionSpecHash]
--------------------------------------PhysicalProject
----------------------------------------PhysicalOlapScan[customer_demographics] apply RFs: RF10
------------------------------------PhysicalDistribute[DistributionSpecHash]
--------------------------------------PhysicalProject
----------------------------------------hashJoin[INNER_JOIN] hashCondition=((store_sales.ss_addr_sk = ad1.ca_address_sk)) otherCondition=() build RFs:RF9 ss_addr_sk->[ca_address_sk]
------------------------------------------PhysicalProject
--------------------------------------------PhysicalOlapScan[customer_address] apply RFs: RF9
------------------------------------------PhysicalDistribute[DistributionSpecHash]
--------------------------------------------hashJoin[INNER_JOIN] hashCondition=((store_sales.ss_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF8 i_item_sk->[cr_item_sk,cs_item_sk,ss_item_sk]
----------------------------------------------PhysicalProject
------------------------------------------------hashJoin[INNER_JOIN] hashCondition=((store_sales.ss_promo_sk = promotion.p_promo_sk)) otherCondition=()
--------------------------------------------------PhysicalProject
----------------------------------------------------hashJoin[INNER_JOIN] hashCondition=((store_sales.ss_store_sk = store.s_store_sk)) otherCondition=()
------------------------------------------------------PhysicalProject
--------------------------------------------------------hashJoin[INNER_JOIN] hashCondition=((hd1.hd_income_band_sk = ib1.ib_income_band_sk)) otherCondition=()
----------------------------------------------------------PhysicalProject
------------------------------------------------------------hashJoin[INNER_JOIN] hashCondition=((store_sales.ss_hdemo_sk = hd1.hd_demo_sk)) otherCondition=()
--------------------------------------------------------------PhysicalProject
----------------------------------------------------------------hashJoin[INNER_JOIN] hashCondition=((store_sales.ss_sold_date_sk = d1.d_date_sk)) otherCondition=() build RFs:RF7 d_date_sk->[ss_sold_date_sk]
----------------------------------------------------------------hashJoin[INNER_JOIN] hashCondition=((store_sales.ss_sold_date_sk = d1.d_date_sk)) otherCondition=() build RFs:RF3 d_date_sk->[ss_sold_date_sk]
------------------------------------------------------------------PhysicalProject
--------------------------------------------------------------------hashJoin[INNER_JOIN] hashCondition=((store_sales.ss_item_sk = cs_ui.cs_item_sk)) otherCondition=() build RFs:RF6 cs_item_sk->[ss_item_sk]
--------------------------------------------------------------------hashJoin[INNER_JOIN] hashCondition=((store_sales.ss_item_sk = cs_ui.cs_item_sk)) otherCondition=() build RFs:RF2 cs_item_sk->[ss_item_sk]
----------------------------------------------------------------------PhysicalProject
------------------------------------------------------------------------PhysicalOlapScan[store_sales] apply RFs: RF6 RF7 RF11
------------------------------------------------------------------------PhysicalOlapScan[store_sales] apply RFs: RF2 RF3 RF8
----------------------------------------------------------------------PhysicalDistribute[DistributionSpecReplicated]
------------------------------------------------------------------------PhysicalProject
--------------------------------------------------------------------------filter((sale > (2 * refund)))
Expand All @@ -53,9 +78,9 @@ PhysicalCteAnchor ( cteId=CTEId#1 )
----------------------------------------------------------------------------------PhysicalProject
------------------------------------------------------------------------------------hashJoin[INNER_JOIN] hashCondition=((catalog_sales.cs_item_sk = catalog_returns.cr_item_sk) and (catalog_sales.cs_order_number = catalog_returns.cr_order_number)) otherCondition=()
--------------------------------------------------------------------------------------PhysicalProject
----------------------------------------------------------------------------------------PhysicalOlapScan[catalog_sales] apply RFs: RF11
----------------------------------------------------------------------------------------PhysicalOlapScan[catalog_sales] apply RFs: RF8
--------------------------------------------------------------------------------------PhysicalProject
----------------------------------------------------------------------------------------PhysicalOlapScan[catalog_returns] apply RFs: RF11
----------------------------------------------------------------------------------------PhysicalOlapScan[catalog_returns] apply RFs: RF8
------------------------------------------------------------------PhysicalDistribute[DistributionSpecReplicated]
--------------------------------------------------------------------PhysicalProject
----------------------------------------------------------------------filter(d_year IN (2001, 2002))
Expand All @@ -71,36 +96,11 @@ PhysicalCteAnchor ( cteId=CTEId#1 )
----------------------------------------------------------PhysicalOlapScan[store]
--------------------------------------------------PhysicalDistribute[DistributionSpecReplicated]
----------------------------------------------------PhysicalProject
------------------------------------------------------filter((item.i_current_price <= 33.00) and (item.i_current_price >= 24.00) and i_color IN ('blanched', 'brown', 'burlywood', 'chocolate', 'drab', 'medium'))
--------------------------------------------------------PhysicalOlapScan[item]
------------------------------PhysicalDistribute[DistributionSpecReplicated]
--------------------------------PhysicalProject
----------------------------------PhysicalOlapScan[promotion]
------------------------PhysicalDistribute[DistributionSpecHash]
--------------------------PhysicalProject
----------------------------hashJoin[INNER_JOIN] hashCondition=((customer.c_current_addr_sk = ad2.ca_address_sk)) otherCondition=()
------------------------------PhysicalDistribute[DistributionSpecHash]
--------------------------------PhysicalProject
----------------------------------hashJoin[INNER_JOIN] hashCondition=((customer.c_current_cdemo_sk = cd2.cd_demo_sk)) otherCondition=()
------------------------------------PhysicalDistribute[DistributionSpecHash]
--------------------------------------PhysicalProject
----------------------------------------hashJoin[INNER_JOIN] hashCondition=((customer.c_current_hdemo_sk = hd2.hd_demo_sk)) otherCondition=()
------------------------------------------PhysicalProject
--------------------------------------------PhysicalOlapScan[customer]
------------------------------------------PhysicalDistribute[DistributionSpecReplicated]
--------------------------------------------PhysicalProject
----------------------------------------------hashJoin[INNER_JOIN] hashCondition=((hd2.hd_income_band_sk = ib2.ib_income_band_sk)) otherCondition=()
------------------------------------------------------PhysicalOlapScan[promotion]
----------------------------------------------PhysicalDistribute[DistributionSpecReplicated]
------------------------------------------------PhysicalProject
--------------------------------------------------PhysicalOlapScan[household_demographics]
------------------------------------------------PhysicalDistribute[DistributionSpecReplicated]
--------------------------------------------------PhysicalProject
----------------------------------------------------PhysicalOlapScan[income_band]
------------------------------------PhysicalDistribute[DistributionSpecHash]
--------------------------------------PhysicalProject
----------------------------------------PhysicalOlapScan[customer_demographics]
------------------------------PhysicalDistribute[DistributionSpecHash]
--------------------------------PhysicalProject
----------------------------------PhysicalOlapScan[customer_address]
--------------------------------------------------filter((item.i_current_price <= 33.00) and (item.i_current_price >= 24.00) and i_color IN ('blanched', 'brown', 'burlywood', 'chocolate', 'drab', 'medium'))
----------------------------------------------------PhysicalOlapScan[item]
--------------------PhysicalDistribute[DistributionSpecReplicated]
----------------------PhysicalProject
------------------------PhysicalOlapScan[date_dim]
Expand Down
Loading

0 comments on commit 9b075bc

Please sign in to comment.