Skip to content

Commit

Permalink
[bug](node)fix dense_rank function in partition sort node return wron…
Browse files Browse the repository at this point in the history
…g rows
  • Loading branch information
zhangstar333 committed Sep 21, 2023
1 parent 5a0ccd7 commit e2d9cc3
Show file tree
Hide file tree
Showing 4 changed files with 114 additions and 18 deletions.
39 changes: 21 additions & 18 deletions be/src/vec/common/sort/partition_sorter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -101,12 +101,10 @@ Status PartitionSorter::partition_sort_read(Block* output_block, bool* eos, int
auto& priority_queue = _state->get_priority_queue();

bool get_enough_data = false;
bool first_compare_row = false;
while (!priority_queue.empty()) {
auto current = priority_queue.top();
priority_queue.pop();
if (UNLIKELY(_previous_row->impl == nullptr)) {
first_compare_row = true;
*_previous_row = current;
}

Expand All @@ -125,34 +123,39 @@ Status PartitionSorter::partition_sort_read(Block* output_block, bool* eos, int
break;
}
case TopNAlgorithm::DENSE_RANK: {
// dense_rank(): 1,1,1,2,2,2,2,...,2,3,3,3. If the SQL is `where rk < 3`, all rows ranked 1 and 2 must be output.
//3 dense_rank() may need to count distinct rows against _partition_inner_limit
if ((current_output_rows + _output_total_rows) < _partition_inner_limit) {
for (size_t i = 0; i < num_columns; ++i) {
merged_columns[i]->insert_from(*current->all_columns[i], current->pos);
}
} else {
//3.1 _has_global_limit = true: check (current_output_rows + _output_total_rows) >= _partition_inner_limit
//3.2 _has_global_limit = false: check the number of distinct rows output, not _output_total_rows
if (_has_global_limit &&
(current_output_rows + _output_total_rows) >= _partition_inner_limit) {
get_enough_data = true;
break;
}
if (_has_global_limit) {
current_output_rows++;
} else {
// for the first row, _previous_row was just set to current, so the rows are the same and no comparison is needed
if (first_compare_row) {
current_output_rows++;
first_compare_row = false;
} else {
// not the first row, so compare it with the previous row; when it is a distinct row,
// current_output_rows can be incremented
bool cmp_res = _previous_row->compare_two_rows(current);
if (cmp_res == false) { // distinct row
current_output_rows++;
*_previous_row = current;
bool cmp_res = _previous_row->compare_two_rows(current);
//get a distinct row
if (cmp_res == false) {
_output_distinct_rows++; //need rows++ firstly
if (_output_distinct_rows >= _partition_inner_limit) {
get_enough_data = true;
break;
}
*_previous_row = current;
}
}
for (size_t i = 0; i < num_columns; ++i) {
merged_columns[i]->insert_from(*current->all_columns[i], current->pos);
}
break;
}
case TopNAlgorithm::RANK: {
// rank(): 1,1,1,4,5,6,6,6,...,6,100,101. If the SQL is `where rk < 7`, need to output all of 1,1,1,4,5,6,6,...,6
//2 rank() may need to check the limit when a distinct row is reached
//2.1 _has_global_limit = true: check (current_output_rows + _output_total_rows) >= _partition_inner_limit
//2.2 _has_global_limit = false: when cmp_res indicates a distinct row, check the total number of rows output
if (_has_global_limit &&
(current_output_rows + _output_total_rows) >= _partition_inner_limit) {
get_enough_data = true;
Expand Down
1 change: 1 addition & 0 deletions be/src/vec/common/sort/partition_sorter.h
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ class PartitionSorter final : public Sorter {
std::unique_ptr<MergeSorterState> _state;
const RowDescriptor& _row_desc;
int64 _output_total_rows = 0;
int64 _output_distinct_rows = 0;
bool _has_global_limit = false;
int _partition_inner_limit = 0;
TopNAlgorithm::type _top_n_algorithm = TopNAlgorithm::type::ROW_NUMBER;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1019,3 +1019,71 @@
14 1987.5
15 1989.0

-- !sql_row_number_1 --
1 -32767 false
1 255 false
1 1985 true
1 1986 false
1 1989 false
1 1991 false
1 1992 true
1 32767 false

-- !sql_rank_1 --
1 -32767 false
1 -32767 false
1 255 false
1 1985 true
1 1986 false
1 1989 false
1 1991 false
1 1991 false
1 1992 true
1 32767 false

-- !sql_dense_rank_1 --
1 -32767 false
1 -32767 false
1 255 false
1 1985 true
1 1986 false
1 1989 false
1 1991 false
1 1991 false
1 1992 true
1 32767 false

-- !sql_row_number --
1 -32767 false
1 255 false
1 1985 true
1 1986 false
1 1989 false
1 1991 false
1 1992 true
1 32767 false

-- !sql_rank --
1 -32767 false
1 -32767 false
1 255 false
1 1985 true
1 1986 false
1 1989 false
1 1991 false
1 1991 false
1 1992 true
1 32767 false

-- !sql_dense_rank --
1 -32767 false
1 -32767 false
1 255 false
1 1985 true
1 1986 false
1 1989 false
1 1991 false
1 1991 false
1 1992 true
1 32767 false

Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,30 @@ suite("test_select_stddev_variance_window") {
qt_select_default "select k1, percentile_approx(k2,0.5,4096) over (partition by k6 order by k1 rows between current row and current row) from ${tableName} order by k1;"
qt_select_default "select k1, percentile_approx(k2,0.5,4096) over (partition by k6 order by k1 rows between current row and unbounded following) from ${tableName} order by k1;"
qt_select_default "select k1, percentile_approx(k2,0.5,4096) over (partition by k6 order by k1) from ${tableName} order by k1;"

sql "set experimental_enable_nereids_planner = false;"

qt_sql_row_number_1 """
select * from (select row_number() over(partition by k2 order by k6) as rk,k2,k6 from ${tableName}) as t where rk = 1 order by 1,2,3;
"""
qt_sql_rank_1 """
select * from (select rank() over(partition by k2 order by k6) as rk,k2,k6 from ${tableName}) as t where rk = 1 order by 1,2,3;
"""
qt_sql_dense_rank_1 """
select * from (select dense_rank() over(partition by k2 order by k6) as rk,k2,k6 from ${tableName}) as t where rk = 1 order by 1,2,3;
"""

sql "set experimental_enable_nereids_planner = true;"

qt_sql_row_number """
select * from (select row_number() over(partition by k2 order by k6) as rk,k2,k6 from ${tableName}) as t where rk = 1 order by 1,2,3;
"""
qt_sql_rank """
select * from (select rank() over(partition by k2 order by k6) as rk,k2,k6 from ${tableName}) as t where rk = 1 order by 1,2,3;
"""
qt_sql_dense_rank """
select * from (select dense_rank() over(partition by k2 order by k6) as rk,k2,k6 from ${tableName}) as t where rk = 1 order by 1,2,3;
"""
}


Expand Down

0 comments on commit e2d9cc3

Please sign in to comment.