From e2d9cc3c02e76fc477db71ac1fd53e6b8794feee Mon Sep 17 00:00:00 2001 From: zhangstar333 <2561612514@qq.com> Date: Thu, 21 Sep 2023 16:13:40 +0800 Subject: [PATCH] [bug](node)fix dense_rank function in partition sort node return wrong rows --- be/src/vec/common/sort/partition_sorter.cpp | 39 ++++++----- be/src/vec/common/sort/partition_sorter.h | 1 + .../test_select_stddev_variance_window.out | 68 +++++++++++++++++++ .../test_select_stddev_variance_window.groovy | 24 +++++++ 4 files changed, 114 insertions(+), 18 deletions(-) diff --git a/be/src/vec/common/sort/partition_sorter.cpp b/be/src/vec/common/sort/partition_sorter.cpp index 1bffb5ed7603eb..083c676ba8c1ff 100644 --- a/be/src/vec/common/sort/partition_sorter.cpp +++ b/be/src/vec/common/sort/partition_sorter.cpp @@ -101,12 +101,10 @@ Status PartitionSorter::partition_sort_read(Block* output_block, bool* eos, int auto& priority_queue = _state->get_priority_queue(); bool get_enough_data = false; - bool first_compare_row = false; while (!priority_queue.empty()) { auto current = priority_queue.top(); priority_queue.pop(); if (UNLIKELY(_previous_row->impl == nullptr)) { - first_compare_row = true; *_previous_row = current; } @@ -125,34 +123,39 @@ Status PartitionSorter::partition_sort_read(Block* output_block, bool* eos, int break; } case TopNAlgorithm::DENSE_RANK: { + // dense_rank(): 1,1,1,2,2,2,2,.......,2,3,3,3, if SQL: where rk < 3, need output all 1 and 2 //3 dense_rank() maybe need distinct rows of partition_inner_limit - if ((current_output_rows + _output_total_rows) < _partition_inner_limit) { - for (size_t i = 0; i < num_columns; ++i) { - merged_columns[i]->insert_from(*current->all_columns[i], current->pos); - } - } else { + //3.1 _has_global_limit = true, so check (current_output_rows + _output_total_rows) >= _partition_inner_limit) + //3.2 _has_global_limit = false. so check have output distinct rows, not _output_total_rows + if (_has_global_limit && + (current_output_rows + _output_total_rows) >= _partition_inner_limit) { get_enough_data = true; + break; } if (_has_global_limit) { current_output_rows++; } else { - //when it's first comes, the rows are same no need compare - if (first_compare_row) { - current_output_rows++; - first_compare_row = false; - } else { - // not the first comes, so need compare those, when is distinct row - // so could current_output_rows++ - bool cmp_res = _previous_row->compare_two_rows(current); - if (cmp_res == false) { // distinct row - current_output_rows++; - *_previous_row = current; + bool cmp_res = _previous_row->compare_two_rows(current); + //get a distinct row + if (cmp_res == false) { + _output_distinct_rows++; //need rows++ firstly + if (_output_distinct_rows >= _partition_inner_limit) { + get_enough_data = true; + break; } + *_previous_row = current; } } + for (size_t i = 0; i < num_columns; ++i) { + merged_columns[i]->insert_from(*current->all_columns[i], current->pos); + } break; } case TopNAlgorithm::RANK: { + // rank(): 1,1,1,4,5,6,6,6.....,6,100,101. if SQL where rk < 7, need output all 1,1,1,4,5,6,6,....6 + //2 rank() maybe need check when have get a distinct row + //2.1 _has_global_limit = true: (current_output_rows + _output_total_rows) >= _partition_inner_limit) + //2.2 _has_global_limit = false: so when the cmp_res is get a distinct row, need check have output all rows num if (_has_global_limit && (current_output_rows + _output_total_rows) >= _partition_inner_limit) { get_enough_data = true; diff --git a/be/src/vec/common/sort/partition_sorter.h b/be/src/vec/common/sort/partition_sorter.h index ff17ac21157623..ca0cd5c49336e8 100644 --- a/be/src/vec/common/sort/partition_sorter.h +++ b/be/src/vec/common/sort/partition_sorter.h @@ -99,6 +99,7 @@ class PartitionSorter final : public Sorter { std::unique_ptr _state; const RowDescriptor& _row_desc; int64 _output_total_rows = 0; + int64 _output_distinct_rows = 0; bool _has_global_limit = false; int _partition_inner_limit = 0; TopNAlgorithm::type _top_n_algorithm = TopNAlgorithm::type::ROW_NUMBER; diff --git a/regression-test/data/query_p0/sql_functions/window_functions/test_select_stddev_variance_window.out b/regression-test/data/query_p0/sql_functions/window_functions/test_select_stddev_variance_window.out index cb3e3e4d6436fa..d8542dca566289 100644 --- a/regression-test/data/query_p0/sql_functions/window_functions/test_select_stddev_variance_window.out +++ b/regression-test/data/query_p0/sql_functions/window_functions/test_select_stddev_variance_window.out @@ -1019,3 +1019,71 @@ 14 1987.5 15 1989.0 +-- !sql_row_number_1 -- +1 -32767 false +1 255 false +1 1985 true +1 1986 false +1 1989 false +1 1991 false +1 1992 true +1 32767 false + +-- !sql_rank_1 -- +1 -32767 false +1 -32767 false +1 255 false +1 1985 true +1 1986 false +1 1989 false +1 1991 false +1 1991 false +1 1992 true +1 32767 false + +-- !sql_dense_rank_1 -- +1 -32767 false +1 -32767 false +1 255 false +1 1985 true +1 1986 false +1 1989 false +1 1991 false +1 1991 false +1 1992 true +1 32767 false + +-- !sql_row_number -- +1 -32767 false +1 255 false +1 1985 true +1 1986 false +1 1989 false +1 1991 false +1 1992 true +1 32767 false + +-- !sql_rank -- +1 -32767 false +1 -32767 false +1 255 false +1 1985 true +1 1986 false +1 1989 false +1 1991 false +1 1991 false +1 1992 true +1 32767 false + +-- !sql_dense_rank -- +1 -32767 false +1 -32767 false +1 255 false +1 1985 true +1 1986 false +1 1989 false +1 1991 false +1 1991 false +1 1992 true +1 32767 false + diff --git a/regression-test/suites/query_p0/sql_functions/window_functions/test_select_stddev_variance_window.groovy b/regression-test/suites/query_p0/sql_functions/window_functions/test_select_stddev_variance_window.groovy index 7cbad40a61dbdd..7ec02d90ae130c 100644 --- a/regression-test/suites/query_p0/sql_functions/window_functions/test_select_stddev_variance_window.groovy +++ b/regression-test/suites/query_p0/sql_functions/window_functions/test_select_stddev_variance_window.groovy @@ -147,6 +147,30 @@ suite("test_select_stddev_variance_window") { qt_select_default "select k1, percentile_approx(k2,0.5,4096) over (partition by k6 order by k1 rows between current row and current row) from ${tableName} order by k1;" qt_select_default "select k1, percentile_approx(k2,0.5,4096) over (partition by k6 order by k1 rows between current row and unbounded following) from ${tableName} order by k1;" qt_select_default "select k1, percentile_approx(k2,0.5,4096) over (partition by k6 order by k1) from ${tableName} order by k1;" + + sql "set experimental_enable_nereids_planner = false;" + + qt_sql_row_number_1 """ + select * from (select row_number() over(partition by k2 order by k6) as rk,k2,k6 from ${tableName}) as t where rk = 1 order by 1,2,3; + """ + qt_sql_rank_1 """ + select * from (select rank() over(partition by k2 order by k6) as rk,k2,k6 from ${tableName}) as t where rk = 1 order by 1,2,3; + """ + qt_sql_dense_rank_1 """ + select * from (select dense_rank() over(partition by k2 order by k6) as rk,k2,k6 from ${tableName}) as t where rk = 1 order by 1,2,3; + """ + + sql "set experimental_enable_nereids_planner = true;" + + qt_sql_row_number """ + select * from (select row_number() over(partition by k2 order by k6) as rk,k2,k6 from ${tableName}) as t where rk = 1 order by 1,2,3; + """ + qt_sql_rank """ + select * from (select rank() over(partition by k2 order by k6) as rk,k2,k6 from ${tableName}) as t where rk = 1 order by 1,2,3; + """ + qt_sql_dense_rank """ + select * from (select dense_rank() over(partition by k2 order by k6) as rk,k2,k6 from ${tableName}) as t where rk = 1 order by 1,2,3; + """ }