Skip to content

Commit

Permalink
[fix](profile) When all preds are always_true, input rows not be coun…
Browse files Browse the repository at this point in the history
…ted in the profile. (apache#42998)

## Proposed changes
In the past, if the predicate came from a runtime filter (e.g.,
MinMaxFilter),
it could trigger the all_pred_always_true logic in
_evaluate_vectorization_predicate.
before
```
                                  -  ScanRows:  399.864K  (399864)
                                          -  RowsShortCircuitPredFiltered:  378.582K  (378582)
                                          -  RowsShortCircuitPredInput:  399.862K  (399862)
                                          -  RowsVectorPredFiltered:  2
                                          -  RowsVectorPredInput:  12.192K  (12192)

```
now
```
                                  -  ScanRows:  399.864K  (399864)
                                          -  RowsShortCircuitPredFiltered:  378.582K  (378582)
                                          -  RowsShortCircuitPredInput:  399.862K  (399862)
                                          -  RowsVectorPredFiltered:  2
                                          -  RowsVectorPredInput:  399.864K  (399864)
```

This could result in missing statistics for some input rows.
 This PR also modifies some of the previous logic.
  • Loading branch information
Mryange authored Nov 8, 2024
1 parent 09280f8 commit d3d8a21
Show file tree
Hide file tree
Showing 6 changed files with 27 additions and 13 deletions.
2 changes: 1 addition & 1 deletion be/src/olap/bitmap_filter_predicate.h
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ class BitmapFilterColumnPredicate : public ColumnPredicate {
SpecificFilter* _specific_filter; // owned by _filter

int get_filter_id() const override { return _filter->get_filter_id(); }
bool is_filter() const override { return true; }
bool is_runtime_filter() const override { return true; }
};

template <PrimitiveType T>
Expand Down
1 change: 0 additions & 1 deletion be/src/olap/bloom_filter_predicate.h
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,6 @@ class BloomFilterColumnPredicate : public ColumnPredicate {
DCHECK(filter_id != -1);
return filter_id;
}
bool is_filter() const override { return true; }

std::shared_ptr<BloomFilterFuncBase> _filter;
SpecificFilter* _specific_filter; // owned by _filter
Expand Down
9 changes: 7 additions & 2 deletions be/src/olap/column_predicate.h
Original file line number Diff line number Diff line change
Expand Up @@ -262,8 +262,6 @@ class ColumnPredicate {
}

virtual int get_filter_id() const { return -1; }
// now InListPredicateBase BloomFilterColumnPredicate BitmapFilterColumnPredicate = true
virtual bool is_filter() const { return false; }
PredicateFilterInfo get_filtered_info() const {
return PredicateFilterInfo {static_cast<int>(type()), _evaluated_rows - 1,
_evaluated_rows - 1 - _passed_rows};
Expand Down Expand Up @@ -303,6 +301,13 @@ class ColumnPredicate {
}

bool always_true() const { return _always_true; }
// Return whether the ColumnPredicate was created by a runtime filter.
// If true, it was definitely created by a runtime filter.
// If false, it may still have been created by a runtime filter,
// as certain filters like "in filter" generate key ranges instead of ColumnPredicate.
// is_runtime_filter uses _can_ignore, except for BitmapFilter,
// as BitmapFilter cannot ignore data.
virtual bool is_runtime_filter() const { return _can_ignore(); }

protected:
virtual std::string _debug_string() const = 0;
Expand Down
1 change: 0 additions & 1 deletion be/src/olap/in_list_predicate.h
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,6 @@ class InListPredicateBase : public ColumnPredicate {
}

int get_filter_id() const override { return _values->get_filter_id(); }
bool is_filter() const override { return true; }

template <bool is_and>
void _evaluate_bit(const vectorized::IColumn& column, const uint16_t* sel, uint16_t size,
Expand Down
26 changes: 18 additions & 8 deletions be/src/olap/rowset/segment_v2/segment_iterator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1333,7 +1333,7 @@ Status SegmentIterator::_vec_init_lazy_materialization() {
short_cir_pred_col_id_set.insert(cid);
_short_cir_eval_predicate.push_back(predicate);
}
if (predicate->is_filter()) {
if (predicate->is_runtime_filter()) {
_filter_info_id.push_back(predicate);
}
}
Expand Down Expand Up @@ -1773,15 +1773,17 @@ uint16_t SegmentIterator::_evaluate_vectorization_predicate(uint16_t* sel_rowid_
}
}

const uint16_t original_size = selected_size;
//If all predicates are always_true, then return directly.
if (all_pred_always_true || !_is_need_vec_eval) {
for (uint16_t i = 0; i < selected_size; ++i) {
for (uint16_t i = 0; i < original_size; ++i) {
sel_rowid_idx[i] = i;
}
return selected_size;
// All preds are always_true, so return immediately and update the profile statistics here.
_opts.stats->vec_cond_input_rows += original_size;
return original_size;
}

uint16_t original_size = selected_size;
_ret_flags.resize(original_size);
DCHECK(!_pre_eval_block_predicate.empty());
bool is_first = true;
Expand Down Expand Up @@ -1847,10 +1849,6 @@ uint16_t SegmentIterator::_evaluate_short_circuit_predicate(uint16_t* vec_sel_ro
selected_size = predicate->evaluate(*short_cir_column, vec_sel_rowid_idx, selected_size);
}

// collect profile
for (auto* p : _filter_info_id) {
_opts.stats->filter_info[p->get_filter_id()] = p->get_filtered_info();
}
_opts.stats->short_circuit_cond_input_rows += original_size;
_opts.stats->rows_short_circuit_cond_filtered += original_size - selected_size;

Expand All @@ -1862,6 +1860,17 @@ uint16_t SegmentIterator::_evaluate_short_circuit_predicate(uint16_t* vec_sel_ro
return selected_size;
}

void SegmentIterator::_collect_runtime_filter_predicate() {
// collect profile
for (auto* p : _filter_info_id) {
// There is a situation, such as with in or minmax filters,
// where intermediate conversion to a key range or other types
// prevents obtaining the filter id.
if (p->get_filter_id() >= 0) {
_opts.stats->filter_info[p->get_filter_id()] = p->get_filtered_info();
}
}
}
Status SegmentIterator::_read_columns_by_rowids(std::vector<ColumnId>& read_column_ids,
std::vector<rowid_t>& rowid_vector,
uint16_t* sel_rowid_idx, size_t select_size,
Expand Down Expand Up @@ -2114,6 +2123,7 @@ Status SegmentIterator::_next_batch_internal(vectorized::Block* block) {
// In SSB test, it make no difference; So need more scenarios to test
selected_size = _evaluate_short_circuit_predicate(_sel_rowid_idx.data(), selected_size);

_collect_runtime_filter_predicate();
if (selected_size > 0) {
// step 3.1: output short circuit and predicate column
// when lazy materialization enables, _predicate_column_ids = distinct(_short_cir_pred_column_ids + _vec_pred_column_ids)
Expand Down
1 change: 1 addition & 0 deletions be/src/olap/rowset/segment_v2/segment_iterator.h
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,7 @@ class SegmentIterator : public RowwiseIterator {
uint32_t nrows_read_limit);
uint16_t _evaluate_vectorization_predicate(uint16_t* sel_rowid_idx, uint16_t selected_size);
uint16_t _evaluate_short_circuit_predicate(uint16_t* sel_rowid_idx, uint16_t selected_size);
void _collect_runtime_filter_predicate();
void _output_non_pred_columns(vectorized::Block* block);
[[nodiscard]] Status _read_columns_by_rowids(std::vector<ColumnId>& read_column_ids,
std::vector<rowid_t>& rowid_vector,
Expand Down

0 comments on commit d3d8a21

Please sign in to comment.