From 76e3e0683bda81069a62c82aa0b315ec465d9f13 Mon Sep 17 00:00:00 2001 From: englefly Date: Fri, 15 Sep 2023 17:21:07 +0800 Subject: [PATCH] left outer join estimation --- .gitignore | 2 + .../doris/nereids/stats/JoinEstimation.java | 2 +- .../shape/query17.out | 55 +++++----- .../shape/query25.out | 51 +++++---- .../shape/query29.out | 56 +++++----- .../shape/query40.out | 35 +++--- .../shape/query47.out | 17 ++- .../shape/query54.out | 100 +++++++++--------- .../shape/query59.out | 50 ++++----- .../rf/ds_rf17.groovy | 2 +- .../rf/ds_rf25.groovy | 2 +- .../rf/ds_rf40.groovy | 2 +- .../rf/ds_rf54.groovy | 2 +- .../rf/ds_rf59.groovy | 2 +- tools/tpcds-tools/bin/load-tpcds-data.sh | 14 +++ tools/tpcds-tools/bin/run-tpcds-queries.sh | 15 ++- tools/tpcds-tools/conf/doris-cluster.conf | 2 +- 17 files changed, 211 insertions(+), 198 deletions(-) diff --git a/.gitignore b/.gitignore index b12e3eb1de6c8c4..797b9fc0ccbb829 100644 --- a/.gitignore +++ b/.gitignore @@ -107,6 +107,8 @@ tools/single-node-cluster/fe* tools/tpcds-tools/TPC-DS_Tools_*.zip tools/tpcds-tools/bin/DSGen-software* tools/tpcds-tools/bin/tpcds-data/ +tools/tpcds-tools/bin/result +tools/tpcds-tools/result.csv # be-ut data_test diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/JoinEstimation.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/JoinEstimation.java index 944397226274f20..62f650f7988de20 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/JoinEstimation.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/JoinEstimation.java @@ -137,7 +137,7 @@ private static Statistics estimateHashJoin(Statistics leftStats, Statistics righ outputRowCount = outputRowCount * Math.pow(0.9, unTrustableCondition.size()); } else { outputRowCount = Math.max(leftStats.getRowCount(), rightStats.getRowCount()); - Optional ratio = unTrustEqualRatio.stream().max(Double::compareTo); + Optional ratio = unTrustEqualRatio.stream().min(Double::compareTo); if (ratio.isPresent()) { outputRowCount = Math.max(1, outputRowCount * ratio.get()); } diff --git a/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query17.out b/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query17.out index 36141eba60dddca..dddca95770a81ac 100644 --- a/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query17.out +++ b/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query17.out @@ -14,36 +14,35 @@ PhysicalResultSink ----------------------hashJoin[INNER_JOIN](store_returns.sr_item_sk = catalog_sales.cs_item_sk)(store_returns.sr_customer_sk = catalog_sales.cs_bill_customer_sk) ------------------------PhysicalDistribute --------------------------PhysicalProject -----------------------------hashJoin[INNER_JOIN](store_returns.sr_returned_date_sk = d2.d_date_sk) -------------------------------PhysicalProject ---------------------------------hashJoin[INNER_JOIN](store_sales.ss_item_sk = store_returns.sr_item_sk)(store_sales.ss_ticket_number = store_returns.sr_ticket_number)(store_sales.ss_customer_sk = store_returns.sr_customer_sk) +----------------------------PhysicalOlapScan[catalog_sales] +------------------------PhysicalDistribute +--------------------------hashJoin[INNER_JOIN](store.s_store_sk = store_sales.ss_store_sk) +----------------------------PhysicalProject +------------------------------hashJoin[INNER_JOIN](item.i_item_sk = store_sales.ss_item_sk) +--------------------------------PhysicalDistribute ----------------------------------PhysicalProject -------------------------------------PhysicalOlapScan[store_returns] -----------------------------------PhysicalDistribute -------------------------------------PhysicalProject ---------------------------------------hashJoin[INNER_JOIN](store.s_store_sk = store_sales.ss_store_sk) -----------------------------------------hashJoin[INNER_JOIN](item.i_item_sk = store_sales.ss_item_sk) -------------------------------------------PhysicalDistribute ---------------------------------------------hashJoin[INNER_JOIN](d1.d_date_sk = store_sales.ss_sold_date_sk) -----------------------------------------------PhysicalProject -------------------------------------------------PhysicalOlapScan[store_sales] -----------------------------------------------PhysicalDistribute -------------------------------------------------PhysicalProject ---------------------------------------------------filter((cast(d_quarter_name as VARCHAR(*)) = '2001Q1')) -----------------------------------------------------PhysicalOlapScan[date_dim] -------------------------------------------PhysicalDistribute ---------------------------------------------PhysicalProject -----------------------------------------------PhysicalOlapScan[item] -----------------------------------------PhysicalDistribute +------------------------------------hashJoin[INNER_JOIN](store_returns.sr_returned_date_sk = d2.d_date_sk) +--------------------------------------PhysicalProject +----------------------------------------hashJoin[INNER_JOIN](store_sales.ss_item_sk = store_returns.sr_item_sk)(store_sales.ss_ticket_number = store_returns.sr_ticket_number)(store_sales.ss_customer_sk = store_returns.sr_customer_sk) ------------------------------------------PhysicalProject ---------------------------------------------PhysicalOlapScan[store] -------------------------------PhysicalDistribute ---------------------------------PhysicalProject -----------------------------------filter(d_quarter_name IN ('2001Q1', '2001Q2', '2001Q3')) -------------------------------------PhysicalOlapScan[date_dim] -------------------------PhysicalDistribute ---------------------------PhysicalProject -----------------------------PhysicalOlapScan[catalog_sales] +--------------------------------------------PhysicalOlapScan[store_returns] +------------------------------------------hashJoin[INNER_JOIN](d1.d_date_sk = store_sales.ss_sold_date_sk) +--------------------------------------------PhysicalProject +----------------------------------------------PhysicalOlapScan[store_sales] +--------------------------------------------PhysicalDistribute +----------------------------------------------PhysicalProject +------------------------------------------------filter((cast(d_quarter_name as VARCHAR(*)) = '2001Q1')) +--------------------------------------------------PhysicalOlapScan[date_dim] +--------------------------------------PhysicalDistribute +----------------------------------------PhysicalProject +------------------------------------------filter(d_quarter_name IN ('2001Q1', '2001Q2', '2001Q3')) +--------------------------------------------PhysicalOlapScan[date_dim] +--------------------------------PhysicalDistribute +----------------------------------PhysicalProject +------------------------------------PhysicalOlapScan[item] +----------------------------PhysicalDistribute +------------------------------PhysicalProject +--------------------------------PhysicalOlapScan[store] --------------------PhysicalDistribute ----------------------PhysicalProject ------------------------filter(d_quarter_name IN ('2001Q1', '2001Q2', '2001Q3')) diff --git a/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query25.out b/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query25.out index 1bed3183e0a1d7d..81ba434db88870f 100644 --- a/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query25.out +++ b/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query25.out @@ -15,34 +15,33 @@ PhysicalResultSink ------------------------PhysicalProject --------------------------PhysicalOlapScan[catalog_sales] ----------------------PhysicalDistribute -------------------------PhysicalProject ---------------------------hashJoin[INNER_JOIN](store_returns.sr_returned_date_sk = d2.d_date_sk) -----------------------------PhysicalProject -------------------------------hashJoin[INNER_JOIN](store_sales.ss_item_sk = store_returns.sr_item_sk)(store_sales.ss_ticket_number = store_returns.sr_ticket_number)(store_sales.ss_customer_sk = store_returns.sr_customer_sk) +------------------------hashJoin[INNER_JOIN](store.s_store_sk = store_sales.ss_store_sk) +--------------------------PhysicalProject +----------------------------hashJoin[INNER_JOIN](item.i_item_sk = store_sales.ss_item_sk) +------------------------------PhysicalDistribute --------------------------------PhysicalProject -----------------------------------PhysicalOlapScan[store_returns] ---------------------------------PhysicalDistribute -----------------------------------PhysicalProject -------------------------------------hashJoin[INNER_JOIN](store.s_store_sk = store_sales.ss_store_sk) ---------------------------------------hashJoin[INNER_JOIN](item.i_item_sk = store_sales.ss_item_sk) -----------------------------------------PhysicalDistribute -------------------------------------------hashJoin[INNER_JOIN](d1.d_date_sk = store_sales.ss_sold_date_sk) ---------------------------------------------PhysicalProject -----------------------------------------------PhysicalOlapScan[store_sales] ---------------------------------------------PhysicalDistribute -----------------------------------------------PhysicalProject -------------------------------------------------filter((d1.d_year = 2000)(d1.d_moy = 4)) ---------------------------------------------------PhysicalOlapScan[date_dim] -----------------------------------------PhysicalDistribute -------------------------------------------PhysicalProject ---------------------------------------------PhysicalOlapScan[item] ---------------------------------------PhysicalDistribute +----------------------------------hashJoin[INNER_JOIN](store_returns.sr_returned_date_sk = d2.d_date_sk) +------------------------------------PhysicalProject +--------------------------------------hashJoin[INNER_JOIN](store_sales.ss_item_sk = store_returns.sr_item_sk)(store_sales.ss_ticket_number = store_returns.sr_ticket_number)(store_sales.ss_customer_sk = store_returns.sr_customer_sk) ----------------------------------------PhysicalProject -------------------------------------------PhysicalOlapScan[store] -----------------------------PhysicalDistribute -------------------------------PhysicalProject ---------------------------------filter((d2.d_moy <= 10)(d2.d_moy >= 4)(d2.d_year = 2000)) -----------------------------------PhysicalOlapScan[date_dim] +------------------------------------------PhysicalOlapScan[store_returns] +----------------------------------------hashJoin[INNER_JOIN](d1.d_date_sk = store_sales.ss_sold_date_sk) +------------------------------------------PhysicalProject +--------------------------------------------PhysicalOlapScan[store_sales] +------------------------------------------PhysicalDistribute +--------------------------------------------PhysicalProject +----------------------------------------------filter((d1.d_year = 2000)(d1.d_moy = 4)) +------------------------------------------------PhysicalOlapScan[date_dim] +------------------------------------PhysicalDistribute +--------------------------------------PhysicalProject +----------------------------------------filter((d2.d_moy <= 10)(d2.d_moy >= 4)(d2.d_year = 2000)) +------------------------------------------PhysicalOlapScan[date_dim] +------------------------------PhysicalDistribute +--------------------------------PhysicalProject +----------------------------------PhysicalOlapScan[item] +--------------------------PhysicalDistribute +----------------------------PhysicalProject +------------------------------PhysicalOlapScan[store] ------------------PhysicalDistribute --------------------PhysicalProject ----------------------filter((d3.d_year = 2000)(d3.d_moy <= 10)(d3.d_moy >= 4)) diff --git a/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query29.out b/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query29.out index 0559297b76f3488..c7a55dfa8dfeb07 100644 --- a/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query29.out +++ b/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query29.out @@ -11,38 +11,36 @@ PhysicalResultSink ----------------hashJoin[INNER_JOIN](catalog_sales.cs_sold_date_sk = d3.d_date_sk) ------------------PhysicalProject --------------------hashJoin[INNER_JOIN](store_returns.sr_item_sk = catalog_sales.cs_item_sk)(store_returns.sr_customer_sk = catalog_sales.cs_bill_customer_sk) +----------------------PhysicalProject +------------------------PhysicalOlapScan[catalog_sales] ----------------------PhysicalDistribute -------------------------PhysicalProject ---------------------------PhysicalOlapScan[catalog_sales] -----------------------PhysicalDistribute -------------------------PhysicalProject ---------------------------hashJoin[INNER_JOIN](store_returns.sr_returned_date_sk = d2.d_date_sk) -----------------------------PhysicalProject -------------------------------hashJoin[INNER_JOIN](store_sales.ss_item_sk = store_returns.sr_item_sk)(store_sales.ss_ticket_number = store_returns.sr_ticket_number)(store_sales.ss_customer_sk = store_returns.sr_customer_sk) +------------------------hashJoin[INNER_JOIN](store.s_store_sk = store_sales.ss_store_sk) +--------------------------PhysicalProject +----------------------------hashJoin[INNER_JOIN](item.i_item_sk = store_sales.ss_item_sk) +------------------------------PhysicalDistribute --------------------------------PhysicalProject -----------------------------------PhysicalOlapScan[store_returns] ---------------------------------PhysicalDistribute -----------------------------------PhysicalProject -------------------------------------hashJoin[INNER_JOIN](store.s_store_sk = store_sales.ss_store_sk) ---------------------------------------hashJoin[INNER_JOIN](item.i_item_sk = store_sales.ss_item_sk) -----------------------------------------PhysicalDistribute -------------------------------------------hashJoin[INNER_JOIN](d1.d_date_sk = store_sales.ss_sold_date_sk) ---------------------------------------------PhysicalProject -----------------------------------------------PhysicalOlapScan[store_sales] ---------------------------------------------PhysicalDistribute -----------------------------------------------PhysicalProject -------------------------------------------------filter((d1.d_year = 1999)(d1.d_moy = 4)) ---------------------------------------------------PhysicalOlapScan[date_dim] -----------------------------------------PhysicalDistribute -------------------------------------------PhysicalProject ---------------------------------------------PhysicalOlapScan[item] ---------------------------------------PhysicalDistribute +----------------------------------hashJoin[INNER_JOIN](d1.d_date_sk = store_sales.ss_sold_date_sk) +------------------------------------PhysicalProject +--------------------------------------hashJoin[INNER_JOIN](store_sales.ss_item_sk = store_returns.sr_item_sk)(store_sales.ss_ticket_number = store_returns.sr_ticket_number)(store_sales.ss_customer_sk = store_returns.sr_customer_sk) ----------------------------------------PhysicalProject -------------------------------------------PhysicalOlapScan[store] -----------------------------PhysicalDistribute -------------------------------PhysicalProject ---------------------------------filter((d2.d_moy <= 7)(d2.d_moy >= 4)(d2.d_year = 1999)) -----------------------------------PhysicalOlapScan[date_dim] +------------------------------------------PhysicalOlapScan[store_sales] +----------------------------------------hashJoin[INNER_JOIN](store_returns.sr_returned_date_sk = d2.d_date_sk) +------------------------------------------PhysicalProject +--------------------------------------------PhysicalOlapScan[store_returns] +------------------------------------------PhysicalDistribute +--------------------------------------------PhysicalProject +----------------------------------------------filter((d2.d_moy <= 7)(d2.d_moy >= 4)(d2.d_year = 1999)) +------------------------------------------------PhysicalOlapScan[date_dim] +------------------------------------PhysicalDistribute +--------------------------------------PhysicalProject +----------------------------------------filter((d1.d_year = 1999)(d1.d_moy = 4)) +------------------------------------------PhysicalOlapScan[date_dim] +------------------------------PhysicalDistribute +--------------------------------PhysicalProject +----------------------------------PhysicalOlapScan[item] +--------------------------PhysicalDistribute +----------------------------PhysicalProject +------------------------------PhysicalOlapScan[store] ------------------PhysicalDistribute --------------------PhysicalProject ----------------------filter(d_year IN (1999, 2000, 2001)) diff --git a/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query40.out b/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query40.out index e0af770a07319df..6c1818fbfdf8e49 100644 --- a/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query40.out +++ b/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query40.out @@ -8,25 +8,24 @@ PhysicalResultSink ----------PhysicalDistribute ------------hashAgg[LOCAL] --------------PhysicalProject -----------------hashJoin[RIGHT_OUTER_JOIN](catalog_sales.cs_item_sk = catalog_returns.cr_item_sk)(catalog_sales.cs_order_number = catalog_returns.cr_order_number) +----------------hashJoin[INNER_JOIN](catalog_sales.cs_warehouse_sk = warehouse.w_warehouse_sk) ------------------PhysicalProject ---------------------PhysicalOlapScan[catalog_returns] +--------------------hashJoin[RIGHT_OUTER_JOIN](catalog_sales.cs_item_sk = catalog_returns.cr_item_sk)(catalog_sales.cs_order_number = catalog_returns.cr_order_number) +----------------------PhysicalProject +------------------------PhysicalOlapScan[catalog_returns] +----------------------hashJoin[INNER_JOIN](catalog_sales.cs_sold_date_sk = date_dim.d_date_sk) +------------------------hashJoin[INNER_JOIN](item.i_item_sk = catalog_sales.cs_item_sk) +--------------------------PhysicalProject +----------------------------PhysicalOlapScan[catalog_sales] +--------------------------PhysicalDistribute +----------------------------PhysicalProject +------------------------------filter((item.i_current_price >= 0.99)(item.i_current_price <= 1.49)) +--------------------------------PhysicalOlapScan[item] +------------------------PhysicalDistribute +--------------------------PhysicalProject +----------------------------filter((date_dim.d_date >= 2001-03-03)(date_dim.d_date <= 2001-05-02)) +------------------------------PhysicalOlapScan[date_dim] ------------------PhysicalDistribute --------------------PhysicalProject -----------------------hashJoin[INNER_JOIN](catalog_sales.cs_warehouse_sk = warehouse.w_warehouse_sk) -------------------------PhysicalProject ---------------------------PhysicalOlapScan[warehouse] -------------------------PhysicalDistribute ---------------------------hashJoin[INNER_JOIN](catalog_sales.cs_sold_date_sk = date_dim.d_date_sk) -----------------------------hashJoin[INNER_JOIN](item.i_item_sk = catalog_sales.cs_item_sk) -------------------------------PhysicalProject ---------------------------------PhysicalOlapScan[catalog_sales] -------------------------------PhysicalDistribute ---------------------------------PhysicalProject -----------------------------------filter((item.i_current_price >= 0.99)(item.i_current_price <= 1.49)) -------------------------------------PhysicalOlapScan[item] -----------------------------PhysicalDistribute -------------------------------PhysicalProject ---------------------------------filter((date_dim.d_date >= 2001-03-03)(date_dim.d_date <= 2001-05-02)) -----------------------------------PhysicalOlapScan[date_dim] +----------------------PhysicalOlapScan[warehouse] diff --git a/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query47.out b/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query47.out index ba248108818fbf9..12b9c7211f6e1fb 100644 --- a/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query47.out +++ b/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query47.out @@ -35,16 +35,15 @@ PhysicalCteAnchor ( cteId=CTEId#0 ) --------PhysicalDistribute ----------PhysicalTopN ------------PhysicalProject ---------------hashJoin[INNER_JOIN](v1.i_category = v1_lead.i_category)(v1.i_brand = v1_lead.i_brand)(v1.s_store_name = v1_lead.s_store_name)(v1.s_company_name = v1_lead.s_company_name)(v1.rn = expr_(rn - 1)) -----------------PhysicalProject -------------------hashJoin[INNER_JOIN](v1.i_category = v1_lag.i_category)(v1.i_brand = v1_lag.i_brand)(v1.s_store_name = v1_lag.s_store_name)(v1.s_company_name = v1_lag.s_company_name)(v1.rn = expr_(rn + 1)) ---------------------PhysicalDistribute -----------------------PhysicalProject +--------------hashJoin[INNER_JOIN](v1.i_category = v1_lag.i_category)(v1.i_brand = v1_lag.i_brand)(v1.s_store_name = v1_lag.s_store_name)(v1.s_company_name = v1_lag.s_company_name)(v1.rn = expr_(rn + 1)) +----------------hashJoin[INNER_JOIN](v1.i_category = v1_lead.i_category)(v1.i_brand = v1_lead.i_brand)(v1.s_store_name = v1_lead.s_store_name)(v1.s_company_name = v1_lead.s_company_name)(v1.rn = expr_(rn - 1)) +------------------PhysicalDistribute +--------------------PhysicalProject +----------------------PhysicalCteConsumer ( cteId=CTEId#0 ) +------------------PhysicalDistribute +--------------------PhysicalProject +----------------------filter((if((avg_monthly_sales > 0.0000), (abs((cast(sum_sales as DOUBLE) - cast(avg_monthly_sales as DOUBLE))) / cast(avg_monthly_sales as DOUBLE)), NULL) > 0.1)(v2.d_year = 2001)(v2.avg_monthly_sales > 0.0000)) ------------------------PhysicalCteConsumer ( cteId=CTEId#0 ) ---------------------PhysicalDistribute -----------------------PhysicalProject -------------------------filter((if((avg_monthly_sales > 0.0000), (abs((cast(sum_sales as DOUBLE) - cast(avg_monthly_sales as DOUBLE))) / cast(avg_monthly_sales as DOUBLE)), NULL) > 0.1)(v2.d_year = 2001)(v2.avg_monthly_sales > 0.0000)) ---------------------------PhysicalCteConsumer ( cteId=CTEId#0 ) ----------------PhysicalDistribute ------------------PhysicalProject --------------------PhysicalCteConsumer ( cteId=CTEId#0 ) diff --git a/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query54.out b/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query54.out index 4e259454dba15e3..bfcfb82d3e7fe55 100644 --- a/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query54.out +++ b/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query54.out @@ -13,62 +13,58 @@ PhysicalResultSink --------------------PhysicalDistribute ----------------------hashAgg[LOCAL] ------------------------PhysicalProject ---------------------------hashJoin[INNER_JOIN](customer_address.ca_county = store.s_county)(customer_address.ca_state = store.s_state) +--------------------------NestedLoopJoin[INNER_JOIN](cast(d_month_seq as BIGINT) <= (d_month_seq + 3)) ----------------------------PhysicalProject -------------------------------NestedLoopJoin[INNER_JOIN](cast(d_month_seq as BIGINT) <= (d_month_seq + 3)) +------------------------------NestedLoopJoin[INNER_JOIN](cast(d_month_seq as BIGINT) >= (d_month_seq + 1)) --------------------------------PhysicalProject -----------------------------------NestedLoopJoin[INNER_JOIN](cast(d_month_seq as BIGINT) >= (d_month_seq + 1)) -------------------------------------hashJoin[INNER_JOIN](store_sales.ss_sold_date_sk = date_dim.d_date_sk) ---------------------------------------PhysicalDistribute -----------------------------------------PhysicalProject -------------------------------------------hashJoin[INNER_JOIN](my_customers.c_customer_sk = store_sales.ss_customer_sk) +----------------------------------hashJoin[INNER_JOIN](store_sales.ss_sold_date_sk = date_dim.d_date_sk) +------------------------------------PhysicalDistribute +--------------------------------------PhysicalProject +----------------------------------------hashJoin[INNER_JOIN](my_customers.c_customer_sk = store_sales.ss_customer_sk) +------------------------------------------PhysicalProject +--------------------------------------------PhysicalOlapScan[store_sales] +------------------------------------------PhysicalDistribute --------------------------------------------PhysicalProject -----------------------------------------------PhysicalOlapScan[store_sales] ---------------------------------------------PhysicalDistribute -----------------------------------------------hashJoin[INNER_JOIN](my_customers.c_current_addr_sk = customer_address.ca_address_sk) -------------------------------------------------PhysicalProject ---------------------------------------------------PhysicalOlapScan[customer_address] +----------------------------------------------hashJoin[INNER_JOIN](customer_address.ca_county = store.s_county)(customer_address.ca_state = store.s_state) ------------------------------------------------PhysicalDistribute ---------------------------------------------------PhysicalProject -----------------------------------------------------hashAgg[GLOBAL] -------------------------------------------------------PhysicalDistribute ---------------------------------------------------------hashAgg[LOCAL] -----------------------------------------------------------PhysicalProject -------------------------------------------------------------hashJoin[INNER_JOIN](customer.c_customer_sk = cs_or_ws_sales.customer_sk) +--------------------------------------------------hashJoin[INNER_JOIN](my_customers.c_current_addr_sk = customer_address.ca_address_sk) +----------------------------------------------------PhysicalProject +------------------------------------------------------PhysicalOlapScan[customer_address] +----------------------------------------------------PhysicalDistribute +------------------------------------------------------PhysicalProject +--------------------------------------------------------hashAgg[GLOBAL] +----------------------------------------------------------PhysicalDistribute +------------------------------------------------------------hashAgg[LOCAL] --------------------------------------------------------------PhysicalProject -----------------------------------------------------------------PhysicalOlapScan[customer] ---------------------------------------------------------------PhysicalDistribute -----------------------------------------------------------------PhysicalProject -------------------------------------------------------------------hashJoin[INNER_JOIN](cs_or_ws_sales.sold_date_sk = date_dim.d_date_sk) +----------------------------------------------------------------hashJoin[INNER_JOIN](customer.c_customer_sk = cs_or_ws_sales.customer_sk) +------------------------------------------------------------------PhysicalProject +--------------------------------------------------------------------PhysicalOlapScan[customer] +------------------------------------------------------------------PhysicalDistribute --------------------------------------------------------------------PhysicalProject -----------------------------------------------------------------------hashJoin[INNER_JOIN](cs_or_ws_sales.item_sk = item.i_item_sk) -------------------------------------------------------------------------PhysicalUnion ---------------------------------------------------------------------------PhysicalDistribute -----------------------------------------------------------------------------PhysicalProject -------------------------------------------------------------------------------PhysicalOlapScan[catalog_sales] ---------------------------------------------------------------------------PhysicalDistribute -----------------------------------------------------------------------------PhysicalProject -------------------------------------------------------------------------------PhysicalOlapScan[web_sales] +----------------------------------------------------------------------hashJoin[INNER_JOIN](cs_or_ws_sales.sold_date_sk = date_dim.d_date_sk) +------------------------------------------------------------------------PhysicalProject +--------------------------------------------------------------------------hashJoin[INNER_JOIN](cs_or_ws_sales.item_sk = item.i_item_sk) +----------------------------------------------------------------------------PhysicalUnion +------------------------------------------------------------------------------PhysicalDistribute +--------------------------------------------------------------------------------PhysicalProject +----------------------------------------------------------------------------------PhysicalOlapScan[catalog_sales] +------------------------------------------------------------------------------PhysicalDistribute +--------------------------------------------------------------------------------PhysicalProject +----------------------------------------------------------------------------------PhysicalOlapScan[web_sales] +----------------------------------------------------------------------------PhysicalDistribute +------------------------------------------------------------------------------PhysicalProject +--------------------------------------------------------------------------------filter((cast(i_class as VARCHAR(*)) = 'maternity')(cast(i_category as VARCHAR(*)) = 'Women')) +----------------------------------------------------------------------------------PhysicalOlapScan[item] ------------------------------------------------------------------------PhysicalDistribute --------------------------------------------------------------------------PhysicalProject -----------------------------------------------------------------------------filter((cast(i_class as VARCHAR(*)) = 'maternity')(cast(i_category as VARCHAR(*)) = 'Women')) -------------------------------------------------------------------------------PhysicalOlapScan[item] ---------------------------------------------------------------------PhysicalDistribute -----------------------------------------------------------------------PhysicalProject -------------------------------------------------------------------------filter((date_dim.d_year = 1998)(date_dim.d_moy = 5)) ---------------------------------------------------------------------------PhysicalOlapScan[date_dim] ---------------------------------------PhysicalDistribute -----------------------------------------PhysicalProject -------------------------------------------PhysicalOlapScan[date_dim] +----------------------------------------------------------------------------filter((date_dim.d_year = 1998)(date_dim.d_moy = 5)) +------------------------------------------------------------------------------PhysicalOlapScan[date_dim] +------------------------------------------------PhysicalDistribute +--------------------------------------------------PhysicalProject +----------------------------------------------------PhysicalOlapScan[store] ------------------------------------PhysicalDistribute ---------------------------------------PhysicalAssertNumRows -----------------------------------------PhysicalDistribute -------------------------------------------hashAgg[GLOBAL] ---------------------------------------------PhysicalDistribute -----------------------------------------------hashAgg[LOCAL] -------------------------------------------------PhysicalProject ---------------------------------------------------filter((date_dim.d_year = 1998)(date_dim.d_moy = 5)) -----------------------------------------------------PhysicalOlapScan[date_dim] +--------------------------------------PhysicalProject +----------------------------------------PhysicalOlapScan[date_dim] --------------------------------PhysicalDistribute ----------------------------------PhysicalAssertNumRows ------------------------------------PhysicalDistribute @@ -79,6 +75,12 @@ PhysicalResultSink ----------------------------------------------filter((date_dim.d_year = 1998)(date_dim.d_moy = 5)) ------------------------------------------------PhysicalOlapScan[date_dim] ----------------------------PhysicalDistribute -------------------------------PhysicalProject ---------------------------------PhysicalOlapScan[store] +------------------------------PhysicalAssertNumRows +--------------------------------PhysicalDistribute +----------------------------------hashAgg[GLOBAL] +------------------------------------PhysicalDistribute +--------------------------------------hashAgg[LOCAL] +----------------------------------------PhysicalProject +------------------------------------------filter((date_dim.d_year = 1998)(date_dim.d_moy = 5)) +--------------------------------------------PhysicalOlapScan[date_dim] diff --git a/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query59.out b/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query59.out index 74f0058cc1fb921..ac089d93c3b452c 100644 --- a/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query59.out +++ b/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query59.out @@ -17,33 +17,35 @@ PhysicalCteAnchor ( cteId=CTEId#0 ) ------PhysicalDistribute --------PhysicalTopN ----------PhysicalProject -------------hashJoin[INNER_JOIN](y.s_store_id1 = x.s_store_id2)(wss.ss_store_sk = store.s_store_sk) ---------------hashJoin[INNER_JOIN](expr_cast(d_week_seq1 as BIGINT) = expr_(d_week_seq2 - 52)) -----------------PhysicalDistribute -------------------PhysicalProject ---------------------hashJoin[INNER_JOIN](wss.ss_store_sk = store.s_store_sk) -----------------------PhysicalDistribute -------------------------hashJoin[INNER_JOIN](d.d_week_seq = d_week_seq1) ---------------------------PhysicalDistribute -----------------------------PhysicalProject -------------------------------PhysicalCteConsumer ( cteId=CTEId#0 ) ---------------------------PhysicalDistribute -----------------------------PhysicalProject -------------------------------filter((d.d_month_seq <= 1207)(d.d_month_seq >= 1196)) ---------------------------------PhysicalOlapScan[date_dim] -----------------------PhysicalDistribute -------------------------PhysicalProject ---------------------------PhysicalOlapScan[store] -----------------PhysicalDistribute -------------------hashJoin[INNER_JOIN](d.d_week_seq = d_week_seq2) +------------hashJoin[INNER_JOIN](y.s_store_id1 = x.s_store_id2)(expr_cast(d_week_seq1 as BIGINT) = expr_(d_week_seq2 - 52)) +--------------PhysicalDistribute +----------------PhysicalProject +------------------hashJoin[INNER_JOIN](wss.ss_store_sk = store.s_store_sk) --------------------PhysicalDistribute -----------------------PhysicalProject -------------------------PhysicalCteConsumer ( cteId=CTEId#0 ) +----------------------hashJoin[INNER_JOIN](d.d_week_seq = d_week_seq2) +------------------------PhysicalDistribute +--------------------------PhysicalProject +----------------------------PhysicalCteConsumer ( cteId=CTEId#0 ) +------------------------PhysicalDistribute +--------------------------PhysicalProject +----------------------------filter((d.d_month_seq <= 1219)(d.d_month_seq >= 1208)) +------------------------------PhysicalOlapScan[date_dim] --------------------PhysicalDistribute ----------------------PhysicalProject -------------------------filter((d.d_month_seq <= 1219)(d.d_month_seq >= 1208)) ---------------------------PhysicalOlapScan[date_dim] +------------------------PhysicalOlapScan[store] --------------PhysicalDistribute ----------------PhysicalProject -------------------PhysicalOlapScan[store] +------------------hashJoin[INNER_JOIN](wss.ss_store_sk = store.s_store_sk) +--------------------PhysicalDistribute +----------------------hashJoin[INNER_JOIN](d.d_week_seq = d_week_seq1) +------------------------PhysicalDistribute +--------------------------PhysicalProject +----------------------------PhysicalCteConsumer ( cteId=CTEId#0 ) +------------------------PhysicalDistribute +--------------------------PhysicalProject +----------------------------filter((d.d_month_seq <= 1207)(d.d_month_seq >= 1196)) +------------------------------PhysicalOlapScan[date_dim] +--------------------PhysicalDistribute +----------------------PhysicalProject +------------------------PhysicalOlapScan[store] diff --git a/regression-test/suites/nereids_tpcds_shape_sf100_p0/rf/ds_rf17.groovy b/regression-test/suites/nereids_tpcds_shape_sf100_p0/rf/ds_rf17.groovy index 8eece8aa39b92ab..a0ec1492fdec5f8 100644 --- a/regression-test/suites/nereids_tpcds_shape_sf100_p0/rf/ds_rf17.groovy +++ b/regression-test/suites/nereids_tpcds_shape_sf100_p0/rf/ds_rf17.groovy @@ -95,5 +95,5 @@ limit 100; // File file = new File(outFile) // file.write(getRuntimeFilters(plan)) - assertEquals("RF9[d_date_sk->[cs_sold_date_sk],RF7[cs_bill_customer_sk->[sr_customer_sk],RF8[cs_item_sk->[sr_item_sk],RF6[d_date_sk->[sr_returned_date_sk],RF3[ss_customer_sk->[sr_customer_sk],RF4[ss_item_sk->[sr_item_sk],RF5[ss_ticket_number->[sr_ticket_number],RF2[s_store_sk->[ss_store_sk],RF1[i_item_sk->[ss_item_sk],RF0[d_date_sk->[ss_sold_date_sk]", getRuntimeFilters(plan)) + assertEquals("RF9[d_date_sk->[cs_sold_date_sk],RF7[sr_customer_sk->[cs_bill_customer_sk],RF8[sr_item_sk->[cs_item_sk],RF6[s_store_sk->[ss_store_sk],RF5[i_item_sk->[ss_item_sk],RF4[d_date_sk->[sr_returned_date_sk],RF1[ss_customer_sk->[sr_customer_sk],RF2[ss_item_sk->[sr_item_sk],RF3[ss_ticket_number->[sr_ticket_number],RF0[d_date_sk->[ss_sold_date_sk]", getRuntimeFilters(plan)) } diff --git a/regression-test/suites/nereids_tpcds_shape_sf100_p0/rf/ds_rf25.groovy b/regression-test/suites/nereids_tpcds_shape_sf100_p0/rf/ds_rf25.groovy index a3b43d7cb2b9ea3..ecede80079b4352 100644 --- a/regression-test/suites/nereids_tpcds_shape_sf100_p0/rf/ds_rf25.groovy +++ b/regression-test/suites/nereids_tpcds_shape_sf100_p0/rf/ds_rf25.groovy @@ -97,5 +97,5 @@ suite("ds_rf25") { // def outFile = "regression-test/suites/nereids_tpcds_shape_sf100_p0/ddl/rf/rf.25" // File file = new File(outFile) // file.write(getRuntimeFilters(plan)) - assertEquals("RF9[d_date_sk->[cs_sold_date_sk],RF7[sr_customer_sk->[cs_bill_customer_sk],RF8[sr_item_sk->[cs_item_sk],RF6[d_date_sk->[sr_returned_date_sk],RF3[ss_customer_sk->[sr_customer_sk],RF4[ss_item_sk->[sr_item_sk],RF5[ss_ticket_number->[sr_ticket_number],RF2[s_store_sk->[ss_store_sk],RF1[i_item_sk->[ss_item_sk],RF0[d_date_sk->[ss_sold_date_sk]", getRuntimeFilters(plan)) + assertEquals("RF9[d_date_sk->[cs_sold_date_sk],RF7[sr_customer_sk->[cs_bill_customer_sk],RF8[sr_item_sk->[cs_item_sk],RF6[s_store_sk->[ss_store_sk],RF5[i_item_sk->[ss_item_sk],RF4[d_date_sk->[sr_returned_date_sk],RF1[ss_customer_sk->[sr_customer_sk],RF2[ss_item_sk->[sr_item_sk],RF3[ss_ticket_number->[sr_ticket_number],RF0[d_date_sk->[ss_sold_date_sk]", getRuntimeFilters(plan)) } diff --git a/regression-test/suites/nereids_tpcds_shape_sf100_p0/rf/ds_rf40.groovy b/regression-test/suites/nereids_tpcds_shape_sf100_p0/rf/ds_rf40.groovy index 6aefa4541dfcdbb..96a557b9f6b10a2 100644 --- a/regression-test/suites/nereids_tpcds_shape_sf100_p0/rf/ds_rf40.groovy +++ b/regression-test/suites/nereids_tpcds_shape_sf100_p0/rf/ds_rf40.groovy @@ -78,5 +78,5 @@ limit 100; // File file = new File(outFile) // file.write(getRuntimeFilters(plan)) - assertEquals("RF3[cs_order_number->[cr_order_number],RF4[cs_item_sk->[cr_item_sk],RF2[cs_warehouse_sk->[w_warehouse_sk],RF1[d_date_sk->[cs_sold_date_sk],RF0[i_item_sk->[cs_item_sk]", getRuntimeFilters(plan)) + assertEquals("RF4[w_warehouse_sk->[cs_warehouse_sk],RF2[cs_order_number->[cr_order_number],RF3[cs_item_sk->[cr_item_sk],RF1[d_date_sk->[cs_sold_date_sk],RF0[i_item_sk->[cs_item_sk]", getRuntimeFilters(plan)) } diff --git a/regression-test/suites/nereids_tpcds_shape_sf100_p0/rf/ds_rf54.groovy b/regression-test/suites/nereids_tpcds_shape_sf100_p0/rf/ds_rf54.groovy index da0f7714d8444d9..88431716efc10d5 100644 --- a/regression-test/suites/nereids_tpcds_shape_sf100_p0/rf/ds_rf54.groovy +++ b/regression-test/suites/nereids_tpcds_shape_sf100_p0/rf/ds_rf54.groovy @@ -106,5 +106,5 @@ suite("ds_rf54") { // File file = new File(outFile) // file.write(getRuntimeFilters(plan)) - assertEquals("RF5[s_county->[ca_county],RF6[s_state->[ca_state],RF4[d_date_sk->[ss_sold_date_sk],RF3[c_customer_sk->[ss_customer_sk],RF2[c_current_addr_sk->[ca_address_sk],RF1[customer_sk->[c_customer_sk],RF0[i_item_sk->[cs_item_sk, ws_item_sk]", getRuntimeFilters(plan)) + assertEquals("RF6[d_date_sk->[ss_sold_date_sk],RF5[c_customer_sk->[ss_customer_sk],RF3[s_county->[ca_county],RF4[s_state->[ca_state],RF2[c_current_addr_sk->[ca_address_sk],RF1[customer_sk->[c_customer_sk],RF0[i_item_sk->[cs_item_sk, ws_item_sk]", getRuntimeFilters(plan)) } diff --git a/regression-test/suites/nereids_tpcds_shape_sf100_p0/rf/ds_rf59.groovy b/regression-test/suites/nereids_tpcds_shape_sf100_p0/rf/ds_rf59.groovy index e97af073f25d2a4..af6fea1f90cc61f 100644 --- a/regression-test/suites/nereids_tpcds_shape_sf100_p0/rf/ds_rf59.groovy +++ b/regression-test/suites/nereids_tpcds_shape_sf100_p0/rf/ds_rf59.groovy @@ -93,5 +93,5 @@ limit 100; // def outFile = "regression-test/suites/nereids_tpcds_shape_sf100_p0/ddl/rf/rf.59" // File file = new File(outFile) // file.write(getRuntimeFilters(plan)) - assertEquals("RF0[d_date_sk->[ss_sold_date_sk],RF4[s_store_id2->[s_store_id],RF5[s_store_sk->[ss_store_sk],RF3[s_store_sk->[ss_store_sk],RF2[d_week_seq->[d_week_seq],RF1[d_week_seq->[d_week_seq]", getRuntimeFilters(plan)) + assertEquals("RF0[d_date_sk->[ss_sold_date_sk],RF4[s_store_sk->[ss_store_sk],RF3[d_week_seq->[d_week_seq],RF2[s_store_sk->[ss_store_sk],RF1[d_week_seq->[d_week_seq]", getRuntimeFilters(plan)) } diff --git a/tools/tpcds-tools/bin/load-tpcds-data.sh b/tools/tpcds-tools/bin/load-tpcds-data.sh index a5fe926f652ec65..337e2aa0c7758cc 100755 --- a/tools/tpcds-tools/bin/load-tpcds-data.sh +++ b/tools/tpcds-tools/bin/load-tpcds-data.sh @@ -196,4 +196,18 @@ exec 3>&- end_time=$(date +%s) echo "End time: $(date)" + +run_sql() { + echo "$*" + mysql -h"${FE_HOST}" -u"${USER}" -P"${FE_QUERY_PORT}" -D"${DB}" -e "$*" +} + echo "Finish load tpcds data, Time taken: $((end_time - start_time)) seconds" + +echo '============================================' +start=$(date +%s) +run_sql "analyze database ${DB} with sync;" +end=$(date +%s) +analyzeTime=$((end - start)) +echo "analyze database ${DB} with sync total time: ${analyzeTime} s" +echo '============================================' diff --git a/tools/tpcds-tools/bin/run-tpcds-queries.sh b/tools/tpcds-tools/bin/run-tpcds-queries.sh index 81dbfb70f14d0cc..3f9ef38ee16cc68 100755 --- a/tools/tpcds-tools/bin/run-tpcds-queries.sh +++ b/tools/tpcds-tools/bin/run-tpcds-queries.sh @@ -131,21 +131,20 @@ run_sql "show variables;" echo '============================================' run_sql "show table status;" echo '============================================' -start=$(date +%s) -run_sql "analyze database ${DB} with sync;" -end=$(date +%s) -totalTime=$((end - start)) -echo "analyze database ${DB} with sync total time: ${totalTime} s" -echo '============================================' echo "Time Unit: ms" RESULT_DIR="${CURDIR}/result" -rm "${RESULT_DIR}" +if [ -d "${RESULT_DIR}" ]; then + rm -r "${RESULT_DIR}" +fi mkdir -p "${RESULT_DIR}" touch result.csv cold_run_sum=0 best_hot_run_sum=0 -for i in {1..99}; do +# run part of queries, set their index to query_array +# query_array=(59 17 29 25 47 40 54) +query_array=$(seq 1 99) +for i in ${query_array[@]}; do cold=0 hot1=0 hot2=0 diff --git a/tools/tpcds-tools/conf/doris-cluster.conf b/tools/tpcds-tools/conf/doris-cluster.conf index fd737356c2c103d..c073d062fd99c37 100644 --- a/tools/tpcds-tools/conf/doris-cluster.conf +++ b/tools/tpcds-tools/conf/doris-cluster.conf @@ -26,4 +26,4 @@ export USER='root' # Doris password export PASSWORD='' # The database where TPC-DS tables located -export DB='tpcds' +export DB='tpcds_sf100'