Skip to content

Commit

Permalink
left outer join estimation
Browse files (browse the repository at this point in the history)
  • Loading branch information
englefly committed Sep 18, 2023
1 parent c54fc82 commit 76e3e06
Show file tree
Hide file tree
Showing 17 changed files with 211 additions and 198 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,8 @@ tools/single-node-cluster/fe*
tools/tpcds-tools/TPC-DS_Tools_*.zip
tools/tpcds-tools/bin/DSGen-software*
tools/tpcds-tools/bin/tpcds-data/
tools/tpcds-tools/bin/result
tools/tpcds-tools/result.csv

# be-ut
data_test
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ private static Statistics estimateHashJoin(Statistics leftStats, Statistics righ
outputRowCount = outputRowCount * Math.pow(0.9, unTrustableCondition.size());
} else {
outputRowCount = Math.max(leftStats.getRowCount(), rightStats.getRowCount());
Optional<Double> ratio = unTrustEqualRatio.stream().max(Double::compareTo);
Optional<Double> ratio = unTrustEqualRatio.stream().min(Double::compareTo);
if (ratio.isPresent()) {
outputRowCount = Math.max(1, outputRowCount * ratio.get());
}
Expand Down
55 changes: 27 additions & 28 deletions regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query17.out
Original file line number Diff line number Diff line change
Expand Up @@ -14,36 +14,35 @@ PhysicalResultSink
----------------------hashJoin[INNER_JOIN](store_returns.sr_item_sk = catalog_sales.cs_item_sk)(store_returns.sr_customer_sk = catalog_sales.cs_bill_customer_sk)
------------------------PhysicalDistribute
--------------------------PhysicalProject
----------------------------hashJoin[INNER_JOIN](store_returns.sr_returned_date_sk = d2.d_date_sk)
------------------------------PhysicalProject
--------------------------------hashJoin[INNER_JOIN](store_sales.ss_item_sk = store_returns.sr_item_sk)(store_sales.ss_ticket_number = store_returns.sr_ticket_number)(store_sales.ss_customer_sk = store_returns.sr_customer_sk)
----------------------------PhysicalOlapScan[catalog_sales]
------------------------PhysicalDistribute
--------------------------hashJoin[INNER_JOIN](store.s_store_sk = store_sales.ss_store_sk)
----------------------------PhysicalProject
------------------------------hashJoin[INNER_JOIN](item.i_item_sk = store_sales.ss_item_sk)
--------------------------------PhysicalDistribute
----------------------------------PhysicalProject
------------------------------------PhysicalOlapScan[store_returns]
----------------------------------PhysicalDistribute
------------------------------------PhysicalProject
--------------------------------------hashJoin[INNER_JOIN](store.s_store_sk = store_sales.ss_store_sk)
----------------------------------------hashJoin[INNER_JOIN](item.i_item_sk = store_sales.ss_item_sk)
------------------------------------------PhysicalDistribute
--------------------------------------------hashJoin[INNER_JOIN](d1.d_date_sk = store_sales.ss_sold_date_sk)
----------------------------------------------PhysicalProject
------------------------------------------------PhysicalOlapScan[store_sales]
----------------------------------------------PhysicalDistribute
------------------------------------------------PhysicalProject
--------------------------------------------------filter((cast(d_quarter_name as VARCHAR(*)) = '2001Q1'))
----------------------------------------------------PhysicalOlapScan[date_dim]
------------------------------------------PhysicalDistribute
--------------------------------------------PhysicalProject
----------------------------------------------PhysicalOlapScan[item]
----------------------------------------PhysicalDistribute
------------------------------------hashJoin[INNER_JOIN](store_returns.sr_returned_date_sk = d2.d_date_sk)
--------------------------------------PhysicalProject
----------------------------------------hashJoin[INNER_JOIN](store_sales.ss_item_sk = store_returns.sr_item_sk)(store_sales.ss_ticket_number = store_returns.sr_ticket_number)(store_sales.ss_customer_sk = store_returns.sr_customer_sk)
------------------------------------------PhysicalProject
--------------------------------------------PhysicalOlapScan[store]
------------------------------PhysicalDistribute
--------------------------------PhysicalProject
----------------------------------filter(d_quarter_name IN ('2001Q1', '2001Q2', '2001Q3'))
------------------------------------PhysicalOlapScan[date_dim]
------------------------PhysicalDistribute
--------------------------PhysicalProject
----------------------------PhysicalOlapScan[catalog_sales]
--------------------------------------------PhysicalOlapScan[store_returns]
------------------------------------------hashJoin[INNER_JOIN](d1.d_date_sk = store_sales.ss_sold_date_sk)
--------------------------------------------PhysicalProject
----------------------------------------------PhysicalOlapScan[store_sales]
--------------------------------------------PhysicalDistribute
----------------------------------------------PhysicalProject
------------------------------------------------filter((cast(d_quarter_name as VARCHAR(*)) = '2001Q1'))
--------------------------------------------------PhysicalOlapScan[date_dim]
--------------------------------------PhysicalDistribute
----------------------------------------PhysicalProject
------------------------------------------filter(d_quarter_name IN ('2001Q1', '2001Q2', '2001Q3'))
--------------------------------------------PhysicalOlapScan[date_dim]
--------------------------------PhysicalDistribute
----------------------------------PhysicalProject
------------------------------------PhysicalOlapScan[item]
----------------------------PhysicalDistribute
------------------------------PhysicalProject
--------------------------------PhysicalOlapScan[store]
--------------------PhysicalDistribute
----------------------PhysicalProject
------------------------filter(d_quarter_name IN ('2001Q1', '2001Q2', '2001Q3'))
Expand Down
51 changes: 25 additions & 26 deletions regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query25.out
Original file line number Diff line number Diff line change
Expand Up @@ -15,34 +15,33 @@ PhysicalResultSink
------------------------PhysicalProject
--------------------------PhysicalOlapScan[catalog_sales]
----------------------PhysicalDistribute
------------------------PhysicalProject
--------------------------hashJoin[INNER_JOIN](store_returns.sr_returned_date_sk = d2.d_date_sk)
----------------------------PhysicalProject
------------------------------hashJoin[INNER_JOIN](store_sales.ss_item_sk = store_returns.sr_item_sk)(store_sales.ss_ticket_number = store_returns.sr_ticket_number)(store_sales.ss_customer_sk = store_returns.sr_customer_sk)
------------------------hashJoin[INNER_JOIN](store.s_store_sk = store_sales.ss_store_sk)
--------------------------PhysicalProject
----------------------------hashJoin[INNER_JOIN](item.i_item_sk = store_sales.ss_item_sk)
------------------------------PhysicalDistribute
--------------------------------PhysicalProject
----------------------------------PhysicalOlapScan[store_returns]
--------------------------------PhysicalDistribute
----------------------------------PhysicalProject
------------------------------------hashJoin[INNER_JOIN](store.s_store_sk = store_sales.ss_store_sk)
--------------------------------------hashJoin[INNER_JOIN](item.i_item_sk = store_sales.ss_item_sk)
----------------------------------------PhysicalDistribute
------------------------------------------hashJoin[INNER_JOIN](d1.d_date_sk = store_sales.ss_sold_date_sk)
--------------------------------------------PhysicalProject
----------------------------------------------PhysicalOlapScan[store_sales]
--------------------------------------------PhysicalDistribute
----------------------------------------------PhysicalProject
------------------------------------------------filter((d1.d_year = 2000)(d1.d_moy = 4))
--------------------------------------------------PhysicalOlapScan[date_dim]
----------------------------------------PhysicalDistribute
------------------------------------------PhysicalProject
--------------------------------------------PhysicalOlapScan[item]
--------------------------------------PhysicalDistribute
----------------------------------hashJoin[INNER_JOIN](store_returns.sr_returned_date_sk = d2.d_date_sk)
------------------------------------PhysicalProject
--------------------------------------hashJoin[INNER_JOIN](store_sales.ss_item_sk = store_returns.sr_item_sk)(store_sales.ss_ticket_number = store_returns.sr_ticket_number)(store_sales.ss_customer_sk = store_returns.sr_customer_sk)
----------------------------------------PhysicalProject
------------------------------------------PhysicalOlapScan[store]
----------------------------PhysicalDistribute
------------------------------PhysicalProject
--------------------------------filter((d2.d_moy <= 10)(d2.d_moy >= 4)(d2.d_year = 2000))
----------------------------------PhysicalOlapScan[date_dim]
------------------------------------------PhysicalOlapScan[store_returns]
----------------------------------------hashJoin[INNER_JOIN](d1.d_date_sk = store_sales.ss_sold_date_sk)
------------------------------------------PhysicalProject
--------------------------------------------PhysicalOlapScan[store_sales]
------------------------------------------PhysicalDistribute
--------------------------------------------PhysicalProject
----------------------------------------------filter((d1.d_year = 2000)(d1.d_moy = 4))
------------------------------------------------PhysicalOlapScan[date_dim]
------------------------------------PhysicalDistribute
--------------------------------------PhysicalProject
----------------------------------------filter((d2.d_moy <= 10)(d2.d_moy >= 4)(d2.d_year = 2000))
------------------------------------------PhysicalOlapScan[date_dim]
------------------------------PhysicalDistribute
--------------------------------PhysicalProject
----------------------------------PhysicalOlapScan[item]
--------------------------PhysicalDistribute
----------------------------PhysicalProject
------------------------------PhysicalOlapScan[store]
------------------PhysicalDistribute
--------------------PhysicalProject
----------------------filter((d3.d_year = 2000)(d3.d_moy <= 10)(d3.d_moy >= 4))
Expand Down
56 changes: 27 additions & 29 deletions regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query29.out
Original file line number Diff line number Diff line change
Expand Up @@ -11,38 +11,36 @@ PhysicalResultSink
----------------hashJoin[INNER_JOIN](catalog_sales.cs_sold_date_sk = d3.d_date_sk)
------------------PhysicalProject
--------------------hashJoin[INNER_JOIN](store_returns.sr_item_sk = catalog_sales.cs_item_sk)(store_returns.sr_customer_sk = catalog_sales.cs_bill_customer_sk)
----------------------PhysicalProject
------------------------PhysicalOlapScan[catalog_sales]
----------------------PhysicalDistribute
------------------------PhysicalProject
--------------------------PhysicalOlapScan[catalog_sales]
----------------------PhysicalDistribute
------------------------PhysicalProject
--------------------------hashJoin[INNER_JOIN](store_returns.sr_returned_date_sk = d2.d_date_sk)
----------------------------PhysicalProject
------------------------------hashJoin[INNER_JOIN](store_sales.ss_item_sk = store_returns.sr_item_sk)(store_sales.ss_ticket_number = store_returns.sr_ticket_number)(store_sales.ss_customer_sk = store_returns.sr_customer_sk)
------------------------hashJoin[INNER_JOIN](store.s_store_sk = store_sales.ss_store_sk)
--------------------------PhysicalProject
----------------------------hashJoin[INNER_JOIN](item.i_item_sk = store_sales.ss_item_sk)
------------------------------PhysicalDistribute
--------------------------------PhysicalProject
----------------------------------PhysicalOlapScan[store_returns]
--------------------------------PhysicalDistribute
----------------------------------PhysicalProject
------------------------------------hashJoin[INNER_JOIN](store.s_store_sk = store_sales.ss_store_sk)
--------------------------------------hashJoin[INNER_JOIN](item.i_item_sk = store_sales.ss_item_sk)
----------------------------------------PhysicalDistribute
------------------------------------------hashJoin[INNER_JOIN](d1.d_date_sk = store_sales.ss_sold_date_sk)
--------------------------------------------PhysicalProject
----------------------------------------------PhysicalOlapScan[store_sales]
--------------------------------------------PhysicalDistribute
----------------------------------------------PhysicalProject
------------------------------------------------filter((d1.d_year = 1999)(d1.d_moy = 4))
--------------------------------------------------PhysicalOlapScan[date_dim]
----------------------------------------PhysicalDistribute
------------------------------------------PhysicalProject
--------------------------------------------PhysicalOlapScan[item]
--------------------------------------PhysicalDistribute
----------------------------------hashJoin[INNER_JOIN](d1.d_date_sk = store_sales.ss_sold_date_sk)
------------------------------------PhysicalProject
--------------------------------------hashJoin[INNER_JOIN](store_sales.ss_item_sk = store_returns.sr_item_sk)(store_sales.ss_ticket_number = store_returns.sr_ticket_number)(store_sales.ss_customer_sk = store_returns.sr_customer_sk)
----------------------------------------PhysicalProject
------------------------------------------PhysicalOlapScan[store]
----------------------------PhysicalDistribute
------------------------------PhysicalProject
--------------------------------filter((d2.d_moy <= 7)(d2.d_moy >= 4)(d2.d_year = 1999))
----------------------------------PhysicalOlapScan[date_dim]
------------------------------------------PhysicalOlapScan[store_sales]
----------------------------------------hashJoin[INNER_JOIN](store_returns.sr_returned_date_sk = d2.d_date_sk)
------------------------------------------PhysicalProject
--------------------------------------------PhysicalOlapScan[store_returns]
------------------------------------------PhysicalDistribute
--------------------------------------------PhysicalProject
----------------------------------------------filter((d2.d_moy <= 7)(d2.d_moy >= 4)(d2.d_year = 1999))
------------------------------------------------PhysicalOlapScan[date_dim]
------------------------------------PhysicalDistribute
--------------------------------------PhysicalProject
----------------------------------------filter((d1.d_year = 1999)(d1.d_moy = 4))
------------------------------------------PhysicalOlapScan[date_dim]
------------------------------PhysicalDistribute
--------------------------------PhysicalProject
----------------------------------PhysicalOlapScan[item]
--------------------------PhysicalDistribute
----------------------------PhysicalProject
------------------------------PhysicalOlapScan[store]
------------------PhysicalDistribute
--------------------PhysicalProject
----------------------filter(d_year IN (1999, 2000, 2001))
Expand Down
35 changes: 17 additions & 18 deletions regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query40.out
Original file line number Diff line number Diff line change
Expand Up @@ -8,25 +8,24 @@ PhysicalResultSink
----------PhysicalDistribute
------------hashAgg[LOCAL]
--------------PhysicalProject
----------------hashJoin[RIGHT_OUTER_JOIN](catalog_sales.cs_item_sk = catalog_returns.cr_item_sk)(catalog_sales.cs_order_number = catalog_returns.cr_order_number)
----------------hashJoin[INNER_JOIN](catalog_sales.cs_warehouse_sk = warehouse.w_warehouse_sk)
------------------PhysicalProject
--------------------PhysicalOlapScan[catalog_returns]
--------------------hashJoin[RIGHT_OUTER_JOIN](catalog_sales.cs_item_sk = catalog_returns.cr_item_sk)(catalog_sales.cs_order_number = catalog_returns.cr_order_number)
----------------------PhysicalProject
------------------------PhysicalOlapScan[catalog_returns]
----------------------hashJoin[INNER_JOIN](catalog_sales.cs_sold_date_sk = date_dim.d_date_sk)
------------------------hashJoin[INNER_JOIN](item.i_item_sk = catalog_sales.cs_item_sk)
--------------------------PhysicalProject
----------------------------PhysicalOlapScan[catalog_sales]
--------------------------PhysicalDistribute
----------------------------PhysicalProject
------------------------------filter((item.i_current_price >= 0.99)(item.i_current_price <= 1.49))
--------------------------------PhysicalOlapScan[item]
------------------------PhysicalDistribute
--------------------------PhysicalProject
----------------------------filter((date_dim.d_date >= 2001-03-03)(date_dim.d_date <= 2001-05-02))
------------------------------PhysicalOlapScan[date_dim]
------------------PhysicalDistribute
--------------------PhysicalProject
----------------------hashJoin[INNER_JOIN](catalog_sales.cs_warehouse_sk = warehouse.w_warehouse_sk)
------------------------PhysicalProject
--------------------------PhysicalOlapScan[warehouse]
------------------------PhysicalDistribute
--------------------------hashJoin[INNER_JOIN](catalog_sales.cs_sold_date_sk = date_dim.d_date_sk)
----------------------------hashJoin[INNER_JOIN](item.i_item_sk = catalog_sales.cs_item_sk)
------------------------------PhysicalProject
--------------------------------PhysicalOlapScan[catalog_sales]
------------------------------PhysicalDistribute
--------------------------------PhysicalProject
----------------------------------filter((item.i_current_price >= 0.99)(item.i_current_price <= 1.49))
------------------------------------PhysicalOlapScan[item]
----------------------------PhysicalDistribute
------------------------------PhysicalProject
--------------------------------filter((date_dim.d_date >= 2001-03-03)(date_dim.d_date <= 2001-05-02))
----------------------------------PhysicalOlapScan[date_dim]
----------------------PhysicalOlapScan[warehouse]

Original file line number Diff line number Diff line change
Expand Up @@ -35,16 +35,15 @@ PhysicalCteAnchor ( cteId=CTEId#0 )
--------PhysicalDistribute
----------PhysicalTopN
------------PhysicalProject
--------------hashJoin[INNER_JOIN](v1.i_category = v1_lead.i_category)(v1.i_brand = v1_lead.i_brand)(v1.s_store_name = v1_lead.s_store_name)(v1.s_company_name = v1_lead.s_company_name)(v1.rn = expr_(rn - 1))
----------------PhysicalProject
------------------hashJoin[INNER_JOIN](v1.i_category = v1_lag.i_category)(v1.i_brand = v1_lag.i_brand)(v1.s_store_name = v1_lag.s_store_name)(v1.s_company_name = v1_lag.s_company_name)(v1.rn = expr_(rn + 1))
--------------------PhysicalDistribute
----------------------PhysicalProject
--------------hashJoin[INNER_JOIN](v1.i_category = v1_lag.i_category)(v1.i_brand = v1_lag.i_brand)(v1.s_store_name = v1_lag.s_store_name)(v1.s_company_name = v1_lag.s_company_name)(v1.rn = expr_(rn + 1))
----------------hashJoin[INNER_JOIN](v1.i_category = v1_lead.i_category)(v1.i_brand = v1_lead.i_brand)(v1.s_store_name = v1_lead.s_store_name)(v1.s_company_name = v1_lead.s_company_name)(v1.rn = expr_(rn - 1))
------------------PhysicalDistribute
--------------------PhysicalProject
----------------------PhysicalCteConsumer ( cteId=CTEId#0 )
------------------PhysicalDistribute
--------------------PhysicalProject
----------------------filter((if((avg_monthly_sales > 0.0000), (abs((cast(sum_sales as DOUBLE) - cast(avg_monthly_sales as DOUBLE))) / cast(avg_monthly_sales as DOUBLE)), NULL) > 0.1)(v2.d_year = 2001)(v2.avg_monthly_sales > 0.0000))
------------------------PhysicalCteConsumer ( cteId=CTEId#0 )
--------------------PhysicalDistribute
----------------------PhysicalProject
------------------------filter((if((avg_monthly_sales > 0.0000), (abs((cast(sum_sales as DOUBLE) - cast(avg_monthly_sales as DOUBLE))) / cast(avg_monthly_sales as DOUBLE)), NULL) > 0.1)(v2.d_year = 2001)(v2.avg_monthly_sales > 0.0000))
--------------------------PhysicalCteConsumer ( cteId=CTEId#0 )
----------------PhysicalDistribute
------------------PhysicalProject
--------------------PhysicalCteConsumer ( cteId=CTEId#0 )
Expand Down
Loading

0 comments on commit 76e3e06

Please sign in to comment.