From d00d029ffb84e1a62f00c6bb156c15af85e3dcf7 Mon Sep 17 00:00:00 2001 From: Pxl Date: Mon, 16 Oct 2023 11:20:30 +0800 Subject: [PATCH] Separate fixed key hash map context creator (#25438) Separate fixed key hash map context creator --- .clang-tidy | 1 + .../exec/aggregation_sink_operator.cpp | 63 +-- .../pipeline/exec/aggregation_sink_operator.h | 5 +- .../exec/aggregation_source_operator.cpp | 6 +- ...ct_streaming_aggregation_sink_operator.cpp | 5 +- be/src/pipeline/exec/hashjoin_build_sink.cpp | 75 +--- be/src/pipeline/exec/hashjoin_build_sink.h | 3 - .../pipeline/exec/hashjoin_probe_operator.h | 1 - .../exec/partition_sort_sink_operator.cpp | 59 +-- .../exec/partition_sort_sink_operator.h | 1 - .../pipeline/exec/set_probe_sink_operator.cpp | 13 +- be/src/pipeline/pipeline_x/dependency.h | 59 +-- be/src/vec/common/columns_hashing.h | 21 +- be/src/vec/common/hash_table/fixed_hash_map.h | 165 -------- .../vec/common/hash_table/fixed_hash_table.h | 372 ------------------ .../vec/common/hash_table/hash_map_context.h | 108 ++--- .../hash_table/hash_map_context_creator.h | 87 ++++ be/src/vec/common/hash_table/hash_map_util.h | 22 -- be/src/vec/common/hash_table/hash_table.h | 4 +- .../common/hash_table/hash_table_set_build.h | 9 +- .../common/hash_table/hash_table_set_probe.h | 10 +- .../common/hash_table/partitioned_hash_map.h | 3 + .../vec/common/hash_table/string_hash_table.h | 4 - be/src/vec/common/uint128.h | 2 +- .../vec/exec/distinct_vaggregation_node.cpp | 4 +- .../vec/exec/join/process_hash_table_probe.h | 1 - .../exec/join/process_hash_table_probe_impl.h | 12 +- be/src/vec/exec/join/vhash_join_node.cpp | 57 +-- be/src/vec/exec/join/vhash_join_node.h | 22 +- be/src/vec/exec/vaggregation_node.cpp | 59 +-- be/src/vec/exec/vaggregation_node.h | 32 +- be/src/vec/exec/vpartition_sort_node.cpp | 56 +-- be/src/vec/exec/vpartition_sort_node.h | 22 +- be/src/vec/exec/vset_operation_node.cpp | 67 +--- be/src/vec/exec/vset_operation_node.h | 2 - .../array/function_array_enumerate_uniq.cpp | 4 +- 36 files changed, 259 insertions(+), 1177 deletions(-) delete mode 100644 be/src/vec/common/hash_table/fixed_hash_map.h delete mode 100644 be/src/vec/common/hash_table/fixed_hash_table.h create mode 100644 be/src/vec/common/hash_table/hash_map_context_creator.h diff --git a/.clang-tidy b/.clang-tidy index aad12455701928..f9d3155e36b344 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -13,6 +13,7 @@ Checks: | readability-*, -readability-identifier-length, -readability-implicit-bool-conversion, + -readability-function-cognitive-complexity, portability-simd-intrinsics, performance-type-promotion-in-math-fn, performance-faster-string-find, diff --git a/be/src/pipeline/exec/aggregation_sink_operator.cpp b/be/src/pipeline/exec/aggregation_sink_operator.cpp index 419313dce79638..013c7854f86f76 100644 --- a/be/src/pipeline/exec/aggregation_sink_operator.cpp +++ b/be/src/pipeline/exec/aggregation_sink_operator.cpp @@ -23,6 +23,7 @@ #include "pipeline/exec/operator.h" #include "pipeline/exec/streaming_aggregation_sink_operator.h" #include "runtime/primitive_type.h" +#include "vec/common/hash_table/hash.h" namespace doris::pipeline { @@ -109,8 +110,6 @@ Status AggSinkLocalState::init(RuntimeState* state, Base::_shared_state->agg_profile_arena = std::make_unique(); if (Base::_shared_state->probe_expr_ctxs.empty()) { - _agg_data->init(vectorized::AggregatedDataVariants::Type::without_key); - _agg_data->without_key = reinterpret_cast( Base::_shared_state->agg_profile_arena->alloc(p._total_size_of_aggregate_states)); @@ -500,9 +499,8 @@ void AggSinkLocalState::_emplace_into_hash_table( SCOPED_TIMER(_hash_table_compute_timer); using HashMethodType = std::decay_t; using AggState = typename HashMethodType::State; - AggState state(key_columns, Base::_shared_state->probe_key_sz); - agg_method.init_serialized_keys(key_columns, Base::_shared_state->probe_key_sz, - num_rows); + AggState state(key_columns); + agg_method.init_serialized_keys(key_columns, num_rows); auto creator = [this](const auto& ctor, auto& key, auto& origin) { HashMethodType::try_presis_key(key, origin, *_agg_arena_pool); @@ -545,9 +543,8 @@ void AggSinkLocalState::_find_in_hash_table( [&](auto&& agg_method) -> void { using HashMethodType = std::decay_t; using AggState = typename HashMethodType::State; - AggState state(key_columns, Base::_shared_state->probe_key_sz); - agg_method.init_serialized_keys(key_columns, Base::_shared_state->probe_key_sz, - num_rows); + AggState state(key_columns); + agg_method.init_serialized_keys(key_columns, num_rows); /// For all rows. for (size_t i = 0; i < num_rows; ++i) { @@ -625,52 +622,10 @@ void AggSinkLocalState::_init_hash_method( !Base::_parent->template cast()._is_first_phase), is_nullable); } else { - bool use_fixed_key = true; - bool has_null = false; - size_t key_byte_size = 0; - size_t bitmap_size = - vectorized::get_bitmap_size(Base::_shared_state->probe_expr_ctxs.size()); - - Base::_shared_state->probe_key_sz.resize(Base::_shared_state->probe_expr_ctxs.size()); - for (int i = 0; i < Base::_shared_state->probe_expr_ctxs.size(); ++i) { - const auto& expr = Base::_shared_state->probe_expr_ctxs[i]->root(); - const auto& data_type = expr->data_type(); - - if (!data_type->have_maximum_size_of_value()) { - use_fixed_key = false; - break; - } - - auto is_null = data_type->is_nullable(); - has_null |= is_null; - Base::_shared_state->probe_key_sz[i] = - data_type->get_maximum_size_of_value_in_memory() - (is_null ? 1 : 0); - key_byte_size += Base::_shared_state->probe_key_sz[i]; - } - - if (!has_null) { - bitmap_size = 0; - } - - if (bitmap_size + key_byte_size > sizeof(vectorized::UInt256)) { - use_fixed_key = false; - } - - if (use_fixed_key) { - if (bitmap_size + key_byte_size <= sizeof(vectorized::UInt64)) { - t = Type::int64_keys; - } else if (bitmap_size + key_byte_size <= sizeof(vectorized::UInt128)) { - t = Type::int128_keys; - } else if (bitmap_size + key_byte_size <= sizeof(vectorized::UInt136)) { - t = Type::int136_keys; - } else { - t = Type::int256_keys; - } - _agg_data->init(get_hash_key_type_with_phase( - t, !Base::_parent->template cast() - ._is_first_phase), - has_null); - } else { + if (!try_get_hash_map_context_fixed( + _agg_data->method_variant, probe_exprs)) { _agg_data->init(Type::serialized); } } diff --git a/be/src/pipeline/exec/aggregation_sink_operator.h b/be/src/pipeline/exec/aggregation_sink_operator.h index de73b12874238c..e1223f5c7eb4be 100644 --- a/be/src/pipeline/exec/aggregation_sink_operator.h +++ b/be/src/pipeline/exec/aggregation_sink_operator.h @@ -127,10 +127,7 @@ class AggSinkLocalState : public PipelineXSinkLocalState { } } - { - context.insert_keys_into_columns(keys, key_columns, num_rows, - Base::_shared_state->probe_key_sz); - } + { context.insert_keys_into_columns(keys, key_columns, num_rows); } if (hash_table.has_null_key_data()) { // only one key of group by support wrap null key diff --git a/be/src/pipeline/exec/aggregation_source_operator.cpp b/be/src/pipeline/exec/aggregation_source_operator.cpp index fd192e04ab3f1d..8d959b525e47c1 100644 --- a/be/src/pipeline/exec/aggregation_source_operator.cpp +++ b/be/src/pipeline/exec/aggregation_source_operator.cpp @@ -201,8 +201,7 @@ Status AggLocalState::_serialize_with_serialized_key_result_non_spill(RuntimeSta { SCOPED_TIMER(_insert_keys_to_column_timer); - agg_method.insert_keys_into_columns(keys, key_columns, num_rows, - _shared_state->probe_key_sz); + agg_method.insert_keys_into_columns(keys, key_columns, num_rows); } if (iter == _shared_state->aggregate_data_container->end()) { @@ -358,8 +357,7 @@ Status AggLocalState::_get_result_with_serialized_key_non_spill(RuntimeState* st { SCOPED_TIMER(_insert_keys_to_column_timer); - agg_method.insert_keys_into_columns(keys, key_columns, num_rows, - _shared_state->probe_key_sz); + agg_method.insert_keys_into_columns(keys, key_columns, num_rows); } for (size_t i = 0; i < _shared_state->aggregate_evaluators.size(); ++i) { diff --git a/be/src/pipeline/exec/distinct_streaming_aggregation_sink_operator.cpp b/be/src/pipeline/exec/distinct_streaming_aggregation_sink_operator.cpp index 4651da4ecd09c5..f0a86fcbd6cc50 100644 --- a/be/src/pipeline/exec/distinct_streaming_aggregation_sink_operator.cpp +++ b/be/src/pipeline/exec/distinct_streaming_aggregation_sink_operator.cpp @@ -156,9 +156,8 @@ void DistinctStreamingAggSinkLocalState::_emplace_into_hash_table_to_distinct( SCOPED_TIMER(_hash_table_compute_timer); using HashMethodType = std::decay_t; using AggState = typename HashMethodType::State; - AggState state(key_columns, _shared_state->probe_key_sz); - agg_method.init_serialized_keys(key_columns, Base::_shared_state->probe_key_sz, - num_rows); + AggState state(key_columns); + agg_method.init_serialized_keys(key_columns, num_rows); size_t row = 0; auto creator = [&](const auto& ctor, auto& key, auto& origin) { HashMethodType::try_presis_key(key, origin, _arena); diff --git a/be/src/pipeline/exec/hashjoin_build_sink.cpp b/be/src/pipeline/exec/hashjoin_build_sink.cpp index 939363776b0917..153882075b6a66 100644 --- a/be/src/pipeline/exec/hashjoin_build_sink.cpp +++ b/be/src/pipeline/exec/hashjoin_build_sink.cpp @@ -51,7 +51,6 @@ Status HashJoinBuildSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo _shared_hash_table_dependency = SharedHashTableDependency::create_shared(_parent->id()); auto& p = _parent->cast(); _shared_state->join_op_variants = p._join_op_variants; - _shared_state->probe_key_sz = p._build_key_sz; if (p._is_broadcast_join && state->enable_share_hash_table_for_broadcast_join()) { _shared_state->build_blocks = p._shared_hash_table_context->blocks; } else { @@ -144,10 +143,6 @@ Status HashJoinBuildSinkLocalState::open(RuntimeState* state) { return Status::OK(); } -vectorized::Sizes& HashJoinBuildSinkLocalState::build_key_sz() { - return _parent->cast()._build_key_sz; -} - bool HashJoinBuildSinkLocalState::build_unique() const { return _parent->cast()._build_unique; } @@ -326,62 +321,8 @@ void HashJoinBuildSinkLocalState::_hash_table_init(RuntimeState* state) { } return; } - - bool use_fixed_key = true; - bool has_null = false; - size_t key_byte_size = 0; - size_t bitmap_size = vectorized::get_bitmap_size(_build_expr_ctxs.size()); - - for (int i = 0; i < _build_expr_ctxs.size(); ++i) { - const auto vexpr = _build_expr_ctxs[i]->root(); - const auto& data_type = vexpr->data_type(); - - if (!data_type->have_maximum_size_of_value()) { - use_fixed_key = false; - break; - } - - auto is_null = data_type->is_nullable(); - has_null |= is_null; - key_byte_size += p._build_key_sz[i]; - } - - if (bitmap_size + key_byte_size > sizeof(vectorized::UInt256)) { - use_fixed_key = false; - } - - if (use_fixed_key) { - // TODO: may we should support uint256 in the future - if (has_null) { - if (bitmap_size + key_byte_size <= sizeof(vectorized::UInt64)) { - _shared_state->hash_table_variants - ->emplace>(); - } else if (bitmap_size + key_byte_size <= sizeof(vectorized::UInt128)) { - _shared_state->hash_table_variants - ->emplace>(); - } else { - _shared_state->hash_table_variants - ->emplace>(); - } - } else { - if (key_byte_size <= sizeof(vectorized::UInt64)) { - _shared_state->hash_table_variants - ->emplace>(); - } else if (key_byte_size <= sizeof(vectorized::UInt128)) { - _shared_state->hash_table_variants - ->emplace>(); - } else { - _shared_state->hash_table_variants - ->emplace>(); - } - } - } else { + if (!try_get_hash_map_context_fixed( + *_shared_state->hash_table_variants, _build_expr_ctxs)) { _shared_state->hash_table_variants ->emplace>(); } @@ -448,18 +389,6 @@ Status HashJoinBuildSinkOperatorX::init(const TPlanNode& tnode, RuntimeState* st null_aware || (_build_expr_ctxs.back()->root()->is_nullable() && build_stores_null)); } - - for (const auto& expr : _build_expr_ctxs) { - const auto& data_type = expr->root()->data_type(); - if (!data_type->have_maximum_size_of_value()) { - break; - } - - auto is_null = data_type->is_nullable(); - _build_key_sz.push_back(data_type->get_maximum_size_of_value_in_memory() - - (is_null ? 1 : 0)); - } - return Status::OK(); } diff --git a/be/src/pipeline/exec/hashjoin_build_sink.h b/be/src/pipeline/exec/hashjoin_build_sink.h index 459b66718c2a7f..9b43f95cd3bf2a 100644 --- a/be/src/pipeline/exec/hashjoin_build_sink.h +++ b/be/src/pipeline/exec/hashjoin_build_sink.h @@ -70,7 +70,6 @@ class HashJoinBuildSinkLocalState final void init_short_circuit_for_probe(); HashJoinBuildSinkOperatorX* join_build() { return (HashJoinBuildSinkOperatorX*)_parent; } - vectorized::Sizes& build_key_sz(); bool build_unique() const; std::vector& runtime_filter_descs() const; std::shared_ptr arena() { return _shared_state->arena; } @@ -168,8 +167,6 @@ class HashJoinBuildSinkOperatorX final // mark the join column whether support null eq std::vector _is_null_safe_eq_join; - vectorized::Sizes _build_key_sz; - bool _is_broadcast_join = false; std::shared_ptr _shared_hashtable_controller = nullptr; diff --git a/be/src/pipeline/exec/hashjoin_probe_operator.h b/be/src/pipeline/exec/hashjoin_probe_operator.h index ed241141e5d468..082f45199c1306 100644 --- a/be/src/pipeline/exec/hashjoin_probe_operator.h +++ b/be/src/pipeline/exec/hashjoin_probe_operator.h @@ -92,7 +92,6 @@ class HashJoinProbeLocalState final std::shared_ptr> build_blocks() const { return _shared_state->build_blocks; } - vectorized::Sizes probe_key_sz() const { return _shared_state->probe_key_sz; } private: void _prepare_probe_block(); diff --git a/be/src/pipeline/exec/partition_sort_sink_operator.cpp b/be/src/pipeline/exec/partition_sort_sink_operator.cpp index 4bd66be9dbfca6..515beba944e6a3 100644 --- a/be/src/pipeline/exec/partition_sort_sink_operator.cpp +++ b/be/src/pipeline/exec/partition_sort_sink_operator.cpp @@ -18,6 +18,7 @@ #include "partition_sort_sink_operator.h" #include "common/status.h" +#include "vec/common/hash_table/hash.h" namespace doris { @@ -180,10 +181,9 @@ void PartitionSortSinkOperatorX::_emplace_into_hash_table( using HashMethodType = std::decay_t; using AggState = typename HashMethodType::State; - AggState state(key_columns, local_state._partition_key_sz); + AggState state(key_columns); size_t num_rows = input_block->rows(); - agg_method.init_serialized_keys(key_columns, local_state._partition_key_sz, - num_rows); + agg_method.init_serialized_keys(key_columns, num_rows); auto creator = [&](const auto& ctor, auto& key, auto& origin) { HashMethodType::try_presis_key(key, origin, *local_state._agg_arena_pool); @@ -282,56 +282,9 @@ void PartitionSortSinkLocalState::_init_hash_method() { _partitioned_data->init(vectorized::PartitionedHashMapVariants::Type::serialized); } } else { - bool use_fixed_key = true; - bool has_null = false; - size_t key_byte_size = 0; - size_t bitmap_size = vectorized::get_bitmap_size(_partition_exprs_num); - - _partition_key_sz.resize(_partition_exprs_num); - for (int i = 0; i < _partition_exprs_num; ++i) { - const auto& data_type = _partition_expr_ctxs[i]->root()->data_type(); - - if (!data_type->have_maximum_size_of_value()) { - use_fixed_key = false; - break; - } - - auto is_null = data_type->is_nullable(); - has_null |= is_null; - _partition_key_sz[i] = - data_type->get_maximum_size_of_value_in_memory() - (is_null ? 1 : 0); - key_byte_size += _partition_key_sz[i]; - } - - if (bitmap_size + key_byte_size > sizeof(vectorized::UInt256)) { - use_fixed_key = false; - } - - if (use_fixed_key) { - if (has_null) { - if (bitmap_size + key_byte_size <= sizeof(vectorized::UInt64)) { - _partitioned_data->init( - vectorized::PartitionedHashMapVariants::Type::int64_keys, has_null); - } else if (bitmap_size + key_byte_size <= sizeof(vectorized::UInt128)) { - _partitioned_data->init( - vectorized::PartitionedHashMapVariants::Type::int128_keys, has_null); - } else { - _partitioned_data->init( - vectorized::PartitionedHashMapVariants::Type::int256_keys, has_null); - } - } else { - if (key_byte_size <= sizeof(vectorized::UInt64)) { - _partitioned_data->init( - vectorized::PartitionedHashMapVariants::Type::int64_keys, has_null); - } else if (key_byte_size <= sizeof(vectorized::UInt128)) { - _partitioned_data->init( - vectorized::PartitionedHashMapVariants::Type::int128_keys, has_null); - } else { - _partitioned_data->init( - vectorized::PartitionedHashMapVariants::Type::int256_keys, has_null); - } - } - } else { + if (!try_get_hash_map_context_fixed( + _partitioned_data->method_variant, _partition_expr_ctxs)) { _partitioned_data->init(vectorized::PartitionedHashMapVariants::Type::serialized); } } diff --git a/be/src/pipeline/exec/partition_sort_sink_operator.h b/be/src/pipeline/exec/partition_sort_sink_operator.h index 59517642bf4f2a..1b76623c4fa671 100644 --- a/be/src/pipeline/exec/partition_sort_sink_operator.h +++ b/be/src/pipeline/exec/partition_sort_sink_operator.h @@ -72,7 +72,6 @@ class PartitionSortSinkLocalState : public PipelineXSinkLocalState _partition_columns; std::unique_ptr _partitioned_data; std::unique_ptr _agg_arena_pool; - std::vector _partition_key_sz; int _partition_exprs_num = 0; RuntimeProfile::Counter* _build_timer; diff --git a/be/src/pipeline/exec/set_probe_sink_operator.cpp b/be/src/pipeline/exec/set_probe_sink_operator.cpp index 055709da3a208d..81c30d45d1eb67 100644 --- a/be/src/pipeline/exec/set_probe_sink_operator.cpp +++ b/be/src/pipeline/exec/set_probe_sink_operator.cpp @@ -246,11 +246,12 @@ void SetProbeSinkOperatorX::_refresh_hash_table( if constexpr (!std::is_same_v) { if constexpr (std::is_same_v) { - HashTableCtxType tmp_hash_table; + auto tmp_hash_table = + std::make_shared(); bool is_need_shrink = arg.hash_table->should_be_shrink(valid_element_in_hash_tbl); if (is_intersect || is_need_shrink) { - tmp_hash_table.hash_table->init_buf_size( + tmp_hash_table->init_buf_size( valid_element_in_hash_tbl / arg.hash_table->get_factor() + 1); } @@ -266,15 +267,13 @@ void SetProbeSinkOperatorX::_refresh_hash_table( if constexpr (is_intersect) { //intersected if (it->visited) { it->visited = false; - tmp_hash_table.hash_table->insert( - iter->get_value()); + tmp_hash_table->insert(iter->get_value()); } ++iter; } else { //except if constexpr (is_need_shrink_const) { if (!it->visited) { - tmp_hash_table.hash_table->insert( - iter->get_value()); + tmp_hash_table->insert(iter->get_value()); } } ++iter; @@ -285,7 +284,7 @@ void SetProbeSinkOperatorX::_refresh_hash_table( arg.reset(); if (is_intersect || is_need_shrink) { - arg.hash_table = std::move(tmp_hash_table.hash_table); + arg.hash_table = std::move(tmp_hash_table); } } else { LOG(FATAL) << "FATAL: Invalid RowRefList"; diff --git a/be/src/pipeline/pipeline_x/dependency.h b/be/src/pipeline/pipeline_x/dependency.h index c1ed40747725b3..5d643f58fbd497 100644 --- a/be/src/pipeline/pipeline_x/dependency.h +++ b/be/src/pipeline/pipeline_x/dependency.h @@ -23,6 +23,7 @@ #include "pipeline/exec/data_queue.h" #include "pipeline/exec/multi_cast_data_streamer.h" +#include "vec/common/hash_table/hash_map_context_creator.h" #include "vec/common/sort/partition_sorter.h" #include "vec/common/sort/sorter.h" #include "vec/exec/join/process_hash_table_probe.h" @@ -313,7 +314,6 @@ struct AggSharedState { std::unique_ptr spill_partition_helper; // group by k1,k2 vectorized::VExprContextSPtrs probe_expr_ctxs; - std::vector probe_key_sz; size_t input_num_rows = 0; std::vector values; std::unique_ptr agg_profile_arena; @@ -565,7 +565,6 @@ struct HashJoinSharedState : public JoinSharedState { // maybe share hash table with other fragment instances std::shared_ptr hash_table_variants = std::make_shared(); - vectorized::Sizes probe_key_sz; const std::vector build_side_child_desc; size_t build_exprs_size = 0; std::shared_ptr> build_blocks = nullptr; @@ -674,8 +673,6 @@ struct SetSharedState { /// init in setup_local_states std::unique_ptr hash_table_variants; // the real data HERE. std::vector build_not_ignore_null; - std::vector probe_key_sz; - std::vector build_key_sz; /// init in both upstream side. //The i-th result expr list refers to the i-th child. @@ -735,57 +732,9 @@ struct SetSharedState { return; } - bool use_fixed_key = true; - bool has_null = false; - size_t key_byte_size = 0; - size_t bitmap_size = vectorized::get_bitmap_size(child_exprs_lists[0].size()); - - build_key_sz.resize(child_exprs_lists[0].size()); - probe_key_sz.resize(child_exprs_lists[0].size()); - for (int i = 0; i < child_exprs_lists[0].size(); ++i) { - const auto vexpr = child_exprs_lists[0][i]->root(); - const auto& data_type = vexpr->data_type(); - - if (!data_type->have_maximum_size_of_value()) { - use_fixed_key = false; - break; - } - - auto is_null = data_type->is_nullable(); - has_null |= is_null; - build_key_sz[i] = data_type->get_maximum_size_of_value_in_memory() - (is_null ? 1 : 0); - probe_key_sz[i] = build_key_sz[i]; - key_byte_size += probe_key_sz[i]; - } - - if (bitmap_size + key_byte_size > sizeof(vectorized::UInt256)) { - use_fixed_key = false; - } - if (use_fixed_key) { - if (has_null) { - if (bitmap_size + key_byte_size <= sizeof(vectorized::UInt64)) { - hash_table_variants->emplace>(); - } else if (bitmap_size + key_byte_size <= sizeof(vectorized::UInt128)) { - hash_table_variants->emplace>(); - } else { - hash_table_variants->emplace>(); - } - } else { - if (key_byte_size <= sizeof(vectorized::UInt64)) { - hash_table_variants->emplace>(); - } else if (key_byte_size <= sizeof(vectorized::UInt128)) { - hash_table_variants->emplace>(); - } else { - hash_table_variants->emplace>(); - } - } - } else { + if (!try_get_hash_map_context_fixed( + *hash_table_variants, child_exprs_lists[0])) { hash_table_variants->emplace< vectorized::SerializedHashTableContext>(); } diff --git a/be/src/vec/common/columns_hashing.h b/be/src/vec/common/columns_hashing.h index 824ac2a37cc0a1..83f01fdf4b2fc1 100644 --- a/be/src/vec/common/columns_hashing.h +++ b/be/src/vec/common/columns_hashing.h @@ -47,8 +47,7 @@ struct HashMethodOneNumber using Self = HashMethodOneNumber; using Base = columns_hashing_impl::HashMethodBase; - /// If the keys of a fixed length then key_sizes contains their lengths, empty otherwise. - HashMethodOneNumber(const ColumnRawPtrs& key_columns, const Sizes& /*key_sizes*/) {} + HashMethodOneNumber(const ColumnRawPtrs& key_columns) {} using Base::find_key_with_hash; }; @@ -61,7 +60,7 @@ struct HashMethodString using Self = HashMethodString; using Base = columns_hashing_impl::HashMethodBase; - HashMethodString(const ColumnRawPtrs& key_columns, const Sizes& /*key_sizes*/) {} + HashMethodString(const ColumnRawPtrs& key_columns) {} protected: friend class columns_hashing_impl::HashMethodBase; @@ -79,7 +78,7 @@ struct HashMethodSerialized using Self = HashMethodSerialized; using Base = columns_hashing_impl::HashMethodBase; - HashMethodSerialized(const ColumnRawPtrs& key_columns_, const Sizes& /*key_sizes*/) {} + HashMethodSerialized(const ColumnRawPtrs& key_columns) {} protected: friend class columns_hashing_impl::HashMethodBase; @@ -96,8 +95,7 @@ struct HashMethodKeysFixed using BaseHashed = columns_hashing_impl::HashMethodBase; using Base = columns_hashing_impl::BaseStateKeysFixed; - HashMethodKeysFixed(const ColumnRawPtrs& key_columns, const Sizes& key_sizes_) - : Base(key_columns) {} + HashMethodKeysFixed(const ColumnRawPtrs& key_columns) : Base(key_columns) {} }; template @@ -109,16 +107,15 @@ struct HashMethodSingleLowNullableColumn : public SingleColumnMethod { const ColumnNullable* key_column; - static const ColumnRawPtrs get_nested_column(const IColumn* col) { - auto* nullable = check_and_get_column(*col); + static ColumnRawPtrs get_nested_column(const IColumn* col) { + const auto* nullable = check_and_get_column(*col); DCHECK(nullable != nullptr); - const auto nested_col = nullable->get_nested_column_ptr().get(); + const auto* const nested_col = nullable->get_nested_column_ptr().get(); return {nested_col}; } - HashMethodSingleLowNullableColumn(const ColumnRawPtrs& key_columns_nullable, - const Sizes& key_sizes) - : Base(get_nested_column(key_columns_nullable[0]), key_sizes), + HashMethodSingleLowNullableColumn(const ColumnRawPtrs& key_columns_nullable) + : Base(get_nested_column(key_columns_nullable[0])), key_column(assert_cast(key_columns_nullable[0])) {} template diff --git a/be/src/vec/common/hash_table/fixed_hash_map.h b/be/src/vec/common/hash_table/fixed_hash_map.h deleted file mode 100644 index a43a8381a5110a..00000000000000 --- a/be/src/vec/common/hash_table/fixed_hash_map.h +++ /dev/null @@ -1,165 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. -// This file is copied from -// https://github.com/ClickHouse/ClickHouse/blob/master/src/Common/HashTable/FixedHashMap.h -// and modified by Doris - -#pragma once - -#include "vec/common/hash_table/fixed_hash_table.h" -#include "vec/common/hash_table/hash_map.h" - -template -struct FixedHashMapCell { - using Mapped = TMapped; - using State = TState; - - using value_type = PairNoInit; - using mapped_type = TMapped; - - bool full; - Mapped mapped; - - FixedHashMapCell() {} - FixedHashMapCell(const Key&, const State&) : full(true) {} - FixedHashMapCell(const value_type& value_, const State&) : full(true), mapped(value_.second) {} - - const VoidKey get_key() const { return {}; } - Mapped& get_mapped() { return mapped; } - const Mapped& get_mapped() const { return mapped; } - - bool is_zero(const State&) const { return !full; } - void set_zero() { full = false; } - - /// Similar to FixedHashSetCell except that we need to contain a pointer to the Mapped field. - /// Note that we have to assemble a continuous layout for the value_type on each call of getValue(). - struct CellExt { - CellExt() {} - CellExt(Key&& key_, const FixedHashMapCell* ptr_) - : key(key_), ptr(const_cast(ptr_)) {} - void update(Key&& key_, const FixedHashMapCell* ptr_) { - key = key_; - ptr = const_cast(ptr_); - } - Key key; - FixedHashMapCell* ptr; - - const Key& get_key() const { return key; } - Mapped& get_mapped() { return ptr->mapped; } - const Mapped& get_mapped() const { return ptr->mapped; } - const value_type get_value() const { return {key, ptr->mapped}; } - }; -}; - -/// In case when we can encode empty cells with zero mapped values. -template -struct FixedHashMapImplicitZeroCell { - using Mapped = TMapped; - using State = TState; - - using value_type = PairNoInit; - using mapped_type = TMapped; - - Mapped mapped; - - FixedHashMapImplicitZeroCell() {} - FixedHashMapImplicitZeroCell(const Key&, const State&) {} - FixedHashMapImplicitZeroCell(const Key&, const Mapped& mapped_) : mapped(mapped_) {} - FixedHashMapImplicitZeroCell(const value_type& value_, const State&) : mapped(value_.second) {} - - const VoidKey get_first() const { return {}; } - Mapped& get_second() { return mapped; } - const Mapped& get_second() const { return mapped; } - - bool is_zero(const State&) const { return !mapped; } - void set_zero() { mapped = {}; } - - /// Similar to FixedHashSetCell except that we need to contain a pointer to the Mapped field. - /// Note that we have to assemble a continuous layout for the value_type on each call of getValue(). - struct CellExt { - CellExt() {} - CellExt(Key&& key_, const FixedHashMapImplicitZeroCell* ptr_) - : key(key_), ptr(const_cast(ptr_)) {} - void update(Key&& key_, const FixedHashMapImplicitZeroCell* ptr_) { - key = key_; - ptr = const_cast(ptr_); - } - Key key; - FixedHashMapImplicitZeroCell* ptr; - - const Key& get_first() const { return key; } - Mapped& get_second() { return ptr->mapped; } - const Mapped& get_second() const { return ptr->mapped; } - const value_type get_value() const { return {key, ptr->mapped}; } - }; -}; - -template -ALWAYS_INLINE inline auto lookup_result_get_mapped( - FixedHashMapImplicitZeroCell* cell) { - return &cell->get_second(); -} - -template , - typename Size = FixedHashTableStoredSize, typename Allocator = HashTableAllocator> -class FixedHashMap : public FixedHashTable { -public: - using Base = FixedHashTable; - using Self = FixedHashMap; - using LookupResult = typename Base::LookupResult; - - using Base::Base; - - template - void for_each_value(Func&& func) { - for (auto& v : *this) func(v.get_key(), v.get_mapped()); - } - - template - void for_each_mapped(Func&& func) { - for (auto& v : *this) func(v.get_second()); - } - - Mapped& ALWAYS_INLINE operator[](const Key& x) { - LookupResult it; - bool inserted; - this->emplace(x, it, inserted); - if (inserted) new (&it->get_mapped()) Mapped(); - - return it->get_mapped(); - } - - // fixed hash map never overflow - bool add_elem_size_overflow(size_t add_size) const { return false; } - template - char* get_null_key_data() { - return nullptr; - } - bool has_null_key_data() const { return false; } -}; - -template -using FixedImplicitZeroHashMap = - FixedHashMap, - FixedHashTableStoredSize>, - Allocator>; - -template -using FixedImplicitZeroHashMapWithCalculatedSize = - FixedHashMap, - FixedHashTableCalculatedSize>, - Allocator>; \ No newline at end of file diff --git a/be/src/vec/common/hash_table/fixed_hash_table.h b/be/src/vec/common/hash_table/fixed_hash_table.h deleted file mode 100644 index a0e901d6c3c3f7..00000000000000 --- a/be/src/vec/common/hash_table/fixed_hash_table.h +++ /dev/null @@ -1,372 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. -// This file is copied from -// https://github.com/ClickHouse/ClickHouse/blob/master/src/Common/HashTable/FixedHashTable.h -// and modified by Doris - -#pragma once - -#include "vec/common/hash_table/hash_table.h" - -/// How to obtain the size of the table. - -template -struct FixedHashTableStoredSize { - size_t m_size = 0; - - size_t get_size(const Cell*, const typename Cell::State&, size_t) const { return m_size; } - bool is_empty(const Cell*, const typename Cell::State&, size_t) const { return m_size == 0; } - - void increase_size() { ++m_size; } - void clear_size() { m_size = 0; } - void set_size(size_t to) { m_size = to; } -}; - -template -struct FixedHashTableCalculatedSize { - size_t get_size(const Cell* buf, const typename Cell::State& state, size_t num_cells) const { - size_t res = 0; - for (const Cell* end = buf + num_cells; buf != end; ++buf) - if (!buf->is_zero(state)) ++res; - return res; - } - - bool isEmpty(const Cell* buf, const typename Cell::State& state, size_t num_cells) const { - for (const Cell* end = buf + num_cells; buf != end; ++buf) - if (!buf->is_zero(state)) return false; - return true; - } - - void increase_size() {} - void clear_size() {} - void set_size(size_t) {} -}; - -/** Used as a lookup table for small keys such as UInt8, UInt16. It's different - * than a HashTable in that keys are not stored in the Cell buf, but inferred - * inside each iterator. There are a bunch of to make it faster than using - * HashTable: a) It doesn't have a conflict chain; b) There is no key - * comparison; c) The number of cycles for checking cell empty is halved; d) - * Memory layout is tighter, especially the Clearable variants. - * - * NOTE: For Set variants this should always be better. For Map variants - * however, as we need to assemble the real cell inside each iterator, there - * might be some cases we fall short. - * - * TODO: Deprecate the cell API so that end users don't rely on the structure - * of cell. Instead iterator should be used for operations such as cell - * transfer, key updates (f.g. StringRef) and serde. This will allow - * TwoLevelHashSet(Map) to contain different type of sets(maps). - */ -template -class FixedHashTable : private boost::noncopyable, - protected Allocator, - protected Cell::State, - protected Size { - static constexpr size_t NUM_CELLS = 1ULL << (sizeof(Key) * 8); - -protected: - using Self = FixedHashTable; - - Cell* buf; /// A piece of memory for all elements. - - void alloc() { buf = reinterpret_cast(Allocator::alloc(NUM_CELLS * sizeof(Cell))); } - - void free() { - if (buf) { - Allocator::free(buf, get_buffer_size_in_bytes()); - buf = nullptr; - } - } - - void destroy_elements() { - if (!std::is_trivially_destructible_v) - for (iterator it = begin(), it_end = end(); it != it_end; ++it) it.ptr->~Cell(); - } - - template - class iterator_base { - using Container = std::conditional_t; - using cell_type = std::conditional_t; - - Container* container; - cell_type* ptr; - - friend class FixedHashTable; - - public: - iterator_base() {} - iterator_base(Container* container_, cell_type* ptr_) : container(container_), ptr(ptr_) { - cell.update(ptr - container->buf, ptr); - } - - bool operator==(const iterator_base& rhs) const { return ptr == rhs.ptr; } - bool operator!=(const iterator_base& rhs) const { return ptr != rhs.ptr; } - - Derived& operator++() { - ++ptr; - - /// Skip empty cells in the main buffer. - auto buf_end = container->buf + container->NUM_CELLS; - while (ptr < buf_end && ptr->is_zero(*container)) ++ptr; - - return static_cast(*this); - } - - auto& operator*() { - if (cell.key != ptr - container->buf) cell.update(ptr - container->buf, ptr); - return cell; - } - auto* operator->() { - if (cell.key != ptr - container->buf) cell.update(ptr - container->buf, ptr); - return &cell; - } - - auto get_ptr() const { return ptr; } - size_t get_hash() const { return ptr - container->buf; } - size_t get_collision_chain_length() const { return 0; } - typename cell_type::CellExt cell; - }; - -public: - using key_type = Key; - using mapped_type = typename Cell::mapped_type; - using value_type = typename Cell::value_type; - using cell_type = Cell; - - using LookupResult = Cell*; - using ConstLookupResult = const Cell*; - - size_t hash(const Key& x) const { return x; } - - FixedHashTable() { alloc(); } - - FixedHashTable(FixedHashTable&& rhs) : buf(nullptr) { *this = std::move(rhs); } - - ~FixedHashTable() { - destroy_elements(); - free(); - } - - FixedHashTable& operator=(FixedHashTable&& rhs) { - destroy_elements(); - free(); - - const auto new_size = rhs.size(); - std::swap(buf, rhs.buf); - this->set_size(new_size); - - Allocator::operator=(std::move(rhs)); - Cell::State::operator=(std::move(rhs)); - - return *this; - } - - class iterator : public iterator_base { - public: - using iterator_base::iterator_base; - }; - - class const_iterator : public iterator_base { - public: - using iterator_base::iterator_base; - }; - - const_iterator begin() const { - if (!buf) return end(); - - const Cell* ptr = buf; - auto buf_end = buf + NUM_CELLS; - while (ptr < buf_end && ptr->is_zero(*this)) ++ptr; - - return const_iterator(this, ptr); - } - - const_iterator cbegin() const { return begin(); } - - iterator begin() { - if (!buf) return end(); - - Cell* ptr = buf; - auto buf_end = buf + NUM_CELLS; - while (ptr < buf_end && ptr->is_zero(*this)) ++ptr; - - return iterator(this, ptr); - } - - const_iterator end() const { - /// Avoid UBSan warning about adding zero to nullptr. It is valid in C++20 (and earlier) but not valid in C. - return const_iterator(this, buf ? buf + NUM_CELLS : buf); - } - - const_iterator cend() const { return end(); } - - iterator end() { return iterator(this, buf ? buf + NUM_CELLS : buf); } - -public: - /// The last parameter is unused but exists for compatibility with HashTable interface. - void ALWAYS_INLINE emplace(const Key& x, LookupResult& it, bool& inserted, - size_t /* hash */ = 0) { - it = &buf[x]; - - if (!buf[x].is_zero(*this)) { - inserted = false; - return; - } - - new (&buf[x]) Cell(x, *this); - inserted = true; - this->increase_size(); - } - - class Constructor { - public: - friend class FixedHashTable; - template - void operator()(Args&&... args) const { - new (_cell) Cell(std::forward(args)...); - } - - private: - Constructor(Cell* cell) : _cell(cell) {} - Cell* _cell; - }; - - template - void ALWAYS_INLINE lazy_emplace(const Key& x, LookupResult& it, Func&& f) { - it = &buf[x]; - - if (!buf[x].is_zero(*this)) { - return; - } - - f(Constructor(&buf[x]), x, x); - this->increase_size(); - } - - template - void ALWAYS_INLINE lazy_emplace(const Key& x, LookupResult& it, size_t hash_value, Func&& f) { - lazy_emplace(x, it, std::forward(f)); - } - - template - void ALWAYS_INLINE prefetch(const Key& key, size_t hash_value) { - // Two optional arguments: - // 'rw': 1 means the memory access is write - // 'locality': 0-3. 0 means no temporal locality. 3 means high temporal locality. - __builtin_prefetch(&buf[hash_value], READ ? 0 : 1, 1); - } - - std::pair ALWAYS_INLINE insert(const value_type& x) { - std::pair res; - emplace(Cell::get_key(x), res.first, res.second); - if (res.second) insert_set_mapped(res.first->get_mapped(), x); - - return res; - } - - LookupResult ALWAYS_INLINE find(const Key& x) { - return !buf[x].is_zero(*this) ? &buf[x] : nullptr; - } - - ConstLookupResult ALWAYS_INLINE find(const Key& x) const { - return const_cast*>(this)->find(x); - } - - LookupResult ALWAYS_INLINE find(const Key&, size_t hash_value) { - return !buf[hash_value].is_zero(*this) ? &buf[hash_value] : nullptr; - } - - ConstLookupResult ALWAYS_INLINE find(const Key& key, size_t hash_value) const { - return const_cast*>(this)->find(key, hash_value); - } - - bool ALWAYS_INLINE has(const Key& x) const { return !buf[x].is_zero(*this); } - bool ALWAYS_INLINE has(const Key&, size_t hash_value) const { - return !buf[hash_value].is_zero(*this); - } - - void write(doris::vectorized::BufferWritable& wb) const { - Cell::State::write(wb); - doris::vectorized::write_var_uint(size(), wb); - - if (!buf) return; - - for (auto ptr = buf, buf_end = buf + NUM_CELLS; ptr < buf_end; ++ptr) { - if (!ptr->is_zero(*this)) { - doris::vectorized::write_var_uint(ptr - buf, wb); - ptr->write(wb); - } - } - } - - void read(doris::vectorized::BufferReadable& rb) { - Cell::State::read(rb); - destroy_elements(); - doris::vectorized::UInt64 m_size; - doris::vectorized::read_var_uint(m_size, rb); - this->set_size(m_size); - free(); - alloc(); - - for (size_t i = 0; i < m_size; ++i) { - doris::vectorized::UInt64 place_value = 0; - doris::vectorized::read_var_uint(place_value, rb); - Cell x; - x.read(rb); - new (&buf[place_value]) Cell(x, *this); - } - } - - size_t size() const { return this->get_size(buf, *this, NUM_CELLS); } - bool empty() const { return this->is_empty(buf, *this, NUM_CELLS); } - - void clear() { - destroy_elements(); - this->clear_size(); - - memset(static_cast(buf), 0, NUM_CELLS * sizeof(*buf)); - } - - /// After executing this function, the table can only be destroyed, - /// and also you can use the methods `size`, `empty`, `begin`, `end`. - void clear_and_shrink() { - destroy_elements(); - this->clear_size(); - free(); - } - - size_t get_buffer_size_in_bytes() const { return NUM_CELLS * sizeof(Cell); } - - size_t get_buffer_size_in_cells() const { return NUM_CELLS; } - - /// Return offset for result in internal buffer. - /// Result can have value up to `getBufferSizeInCells() + 1` - /// because offset for zero value considered to be 0 - /// and for other values it will be `offset in buffer + 1` - size_t offset_internal(ConstLookupResult ptr) const { - if (ptr->is_zero(*this)) return 0; - return ptr - buf + 1; - } - - const Cell* data() const { return buf; } - Cell* data() { return buf; } - -#ifdef DBMS_HASH_MAP_COUNT_COLLISIONS - size_t get_collisions() const { return 0; } -#endif -}; diff --git a/be/src/vec/common/hash_table/hash_map_context.h b/be/src/vec/common/hash_table/hash_map_context.h index 34250e6bd4f73a..0d2ad598140a8a 100644 --- a/be/src/vec/common/hash_table/hash_map_context.h +++ b/be/src/vec/common/hash_table/hash_map_context.h @@ -18,6 +18,7 @@ #pragma once #include +#include #include "runtime/descriptors.h" #include "util/stack_util.h" @@ -33,6 +34,8 @@ namespace doris::vectorized { +constexpr auto BITSIZE = 8; + template struct DataWithNullKey; @@ -42,6 +45,7 @@ struct MethodBase { using Mapped = typename HashMap::mapped_type; using Value = typename HashMap::value_type; using Iterator = typename HashMap::iterator; + using HashMapType = HashMap; std::shared_ptr hash_table; Iterator iterator; @@ -64,8 +68,8 @@ struct MethodBase { iterator = hash_table->begin(); } } - virtual void init_serialized_keys(const ColumnRawPtrs& key_columns, const Sizes& key_sizes, - size_t num_rows, const uint8_t* null_map = nullptr) = 0; + virtual void init_serialized_keys(const ColumnRawPtrs& key_columns, size_t num_rows, + const uint8_t* null_map = nullptr) = 0; void init_hash_values(size_t num_rows, const uint8_t* null_map) { if (null_map == nullptr) { @@ -127,7 +131,7 @@ struct MethodBase { } virtual void insert_keys_into_columns(std::vector& keys, MutableColumns& key_columns, - const size_t num_rows, const Sizes&) = 0; + size_t num_rows) = 0; }; template @@ -151,8 +155,8 @@ struct MethodSerialized : public MethodBase { return {begin, sum_size}; } - void init_serialized_keys(const ColumnRawPtrs& key_columns, const Sizes& key_sizes, - size_t num_rows, const uint8_t* null_map = nullptr) override { + void init_serialized_keys(const ColumnRawPtrs& key_columns, size_t num_rows, + const uint8_t* null_map = nullptr) override { Base::arena.clear(); stored_keys.resize(num_rows); @@ -170,7 +174,7 @@ struct MethodSerialized : public MethodBase { serialize_keys_to_pool_contiguous(i, keys_size, key_columns, Base::arena); } } else { - uint8_t* serialized_key_buffer = + auto* serialized_key_buffer = reinterpret_cast(Base::arena.alloc(total_bytes)); for (size_t i = 0; i < num_rows; ++i) { @@ -188,7 +192,7 @@ struct MethodSerialized : public MethodBase { } void insert_keys_into_columns(std::vector& keys, MutableColumns& key_columns, - const size_t num_rows, const Sizes&) override { + const size_t num_rows) override { for (auto& column : key_columns) { column->deserialize_vec(keys, num_rows); } @@ -196,7 +200,7 @@ struct MethodSerialized : public MethodBase { }; inline size_t get_bitmap_size(size_t key_number) { - return (key_number + 7) / 8; + return (key_number + BITSIZE - 1) / BITSIZE; } template @@ -209,15 +213,15 @@ struct MethodStringNoCache : public MethodBase { std::vector stored_keys; - void init_serialized_keys(const ColumnRawPtrs& key_columns, const Sizes& key_sizes, - size_t num_rows, const uint8_t* null_map = nullptr) override { + void init_serialized_keys(const ColumnRawPtrs& key_columns, size_t num_rows, + const uint8_t* null_map = nullptr) override { const IColumn& column = *key_columns[0]; - const ColumnString& column_string = assert_cast( + const auto& column_string = assert_cast( column.is_nullable() ? assert_cast(column).get_nested_column() : column); - auto offsets = column_string.get_offsets().data(); - auto chars = column_string.get_chars().data(); + const auto* offsets = column_string.get_offsets().data(); + const auto* chars = column_string.get_chars().data(); stored_keys.resize(column_string.size()); for (size_t row = 0; row < column_string.size(); row++) { @@ -229,7 +233,7 @@ struct MethodStringNoCache : public MethodBase { } void insert_keys_into_columns(std::vector& keys, MutableColumns& key_columns, - const size_t num_rows, const Sizes&) override { + const size_t num_rows) override { key_columns[0]->reserve(num_rows); key_columns[0]->insert_many_strings(keys.data(), num_rows); } @@ -245,8 +249,8 @@ struct MethodOneNumber : public MethodBase { using State = ColumnsHashing::HashMethodOneNumber; - void init_serialized_keys(const ColumnRawPtrs& key_columns, const Sizes& key_sizes, - size_t num_rows, const uint8_t* null_map = nullptr) override { + void init_serialized_keys(const ColumnRawPtrs& key_columns, size_t num_rows, + const uint8_t* null_map = nullptr) override { Base::keys = (FieldType*)(key_columns[0]->is_nullable() ? assert_cast(key_columns[0]) ->get_nested_column_ptr() @@ -258,8 +262,7 @@ struct MethodOneNumber : public MethodBase { } void insert_keys_into_columns(std::vector& keys, - MutableColumns& key_columns, const size_t num_rows, - const Sizes&) override { + MutableColumns& key_columns, const size_t num_rows) override { key_columns[0]->reserve(num_rows); auto* column = static_cast(key_columns[0].get()); for (size_t i = 0; i != num_rows; ++i) { @@ -282,10 +285,13 @@ struct MethodKeysFixed : public MethodBase { has_nullable_keys>; std::vector stored_keys; + Sizes key_sizes; + + MethodKeysFixed(Sizes key_sizes_) : key_sizes(std::move(key_sizes_)) {} template std::vector pack_fixeds(size_t row_numbers, const ColumnRawPtrs& key_columns, - const Sizes& key_sizes, const ColumnRawPtrs& nullmap_columns) { + const ColumnRawPtrs& nullmap_columns) { size_t bitmap_size = get_bitmap_size(nullmap_columns.size()); std::vector result(row_numbers); @@ -295,8 +301,8 @@ struct MethodKeysFixed : public MethodBase { if (!nullmap_columns[j]) { continue; } - size_t bucket = j / 8; - size_t offset = j % 8; + size_t bucket = j / BITSIZE; + size_t offset = j % BITSIZE; const auto& data = assert_cast(*nullmap_columns[j]).get_data().data(); for (size_t i = 0; i < row_numbers; ++i) { @@ -311,7 +317,7 @@ struct MethodKeysFixed : public MethodBase { auto foo = [&](Fixed zero) { CHECK_EQ(sizeof(Fixed), key_sizes[j]); - if (nullmap_columns.size() && nullmap_columns[j]) { + if (!nullmap_columns.empty() && nullmap_columns[j]) { const auto& nullmap = assert_cast(*nullmap_columns[j]).get_data().data(); for (size_t i = 0; i < row_numbers; ++i) { @@ -326,15 +332,15 @@ struct MethodKeysFixed : public MethodBase { } }; - if (key_sizes[j] == 1) { - foo(int8_t()); - } else if (key_sizes[j] == 2) { - foo(int16_t()); - } else if (key_sizes[j] == 4) { - foo(int32_t()); - } else if (key_sizes[j] == 8) { - foo(int64_t()); - } else if (key_sizes[j] == 16) { + if (key_sizes[j] == sizeof(uint8_t)) { + foo(uint8_t()); + } else if (key_sizes[j] == sizeof(uint16_t)) { + foo(uint16_t()); + } else if (key_sizes[j] == sizeof(uint32_t)) { + foo(uint32_t()); + } else if (key_sizes[j] == sizeof(uint64_t)) { + foo(uint64_t()); + } else if (key_sizes[j] == sizeof(UInt128)) { foo(UInt128()); } else { throw Exception(ErrorCode::INTERNAL_ERROR, @@ -345,15 +351,15 @@ struct MethodKeysFixed : public MethodBase { return result; } - void init_serialized_keys(const ColumnRawPtrs& key_columns, const Sizes& key_sizes, - size_t num_rows, const uint8_t* null_map = nullptr) override { + void init_serialized_keys(const ColumnRawPtrs& key_columns, size_t num_rows, + const uint8_t* null_map = nullptr) override { ColumnRawPtrs actual_columns; ColumnRawPtrs null_maps; if (has_nullable_keys) { actual_columns.reserve(key_columns.size()); null_maps.reserve(key_columns.size()); for (const auto& col : key_columns) { - if (auto* nullable_col = check_and_get_column(col)) { + if (const auto* nullable_col = check_and_get_column(col)) { actual_columns.push_back(&nullable_col->get_nested_column()); null_maps.push_back(&nullable_col->get_null_map_column()); } else { @@ -364,14 +370,13 @@ struct MethodKeysFixed : public MethodBase { } else { actual_columns = key_columns; } - stored_keys = pack_fixeds(num_rows, actual_columns, key_sizes, null_maps); + stored_keys = pack_fixeds(num_rows, actual_columns, null_maps); Base::keys = stored_keys.data(); Base::init_hash_values(num_rows, null_map); } void insert_keys_into_columns(std::vector& keys, - MutableColumns& key_columns, const size_t num_rows, - const Sizes& key_sizes) override { + MutableColumns& key_columns, const size_t num_rows) override { // In any hash key value, column values to be read start just after the bitmap, if it exists. size_t pos = has_nullable_keys ? get_bitmap_size(key_columns.size()) : 0; @@ -381,7 +386,7 @@ struct MethodKeysFixed : public MethodBase { key_columns[i]->resize(num_rows); // If we have a nullable column, get its nested column and its null map. if (is_column_nullable(*key_columns[i])) { - ColumnNullable& nullable_col = assert_cast(*key_columns[i]); + auto& nullable_col = assert_cast(*key_columns[i]); data = const_cast(nullable_col.get_nested_column().get_raw_data().data); UInt8* nullmap = assert_cast(&nullable_col.get_null_map_column()) @@ -390,8 +395,8 @@ struct MethodKeysFixed : public MethodBase { // The current column is nullable. Check if the value of the // corresponding key is nullable. Update the null map accordingly. - size_t bucket = i / 8; - size_t offset = i % 8; + size_t bucket = i / BITSIZE; + size_t offset = i % BITSIZE; for (size_t j = 0; j < num_rows; j++) { nullmap[j] = (reinterpret_cast(&keys[j])[bucket] >> offset) & 1; } @@ -406,15 +411,15 @@ struct MethodKeysFixed : public MethodBase { } }; - if (size == 1) { - foo(int8_t()); - } else if (size == 2) { - foo(int16_t()); - } else if (size == 4) { - foo(int32_t()); - } else if (size == 8) { - foo(int64_t()); - } else if (size == 16) { + if (size == sizeof(uint8_t)) { + foo(uint8_t()); + } else if (size == sizeof(uint16_t)) { + foo(uint16_t()); + } else if (size == sizeof(uint32_t)) { + foo(uint32_t()); + } else if (size == sizeof(uint64_t)) { + foo(uint64_t()); + } else if (size == sizeof(UInt128)) { foo(UInt128()); } else { throw Exception(ErrorCode::INTERNAL_ERROR, @@ -461,9 +466,8 @@ struct MethodSingleNullableColumn : public SingleColumnMethod { typename Base::Mapped>; void insert_keys_into_columns(std::vector& keys, - MutableColumns& key_columns, const size_t num_rows, - const Sizes&) override { - auto col = key_columns[0].get(); + MutableColumns& key_columns, const size_t num_rows) override { + auto* col = key_columns[0].get(); col->reserve(num_rows); if constexpr (std::is_same_v) { col->insert_many_strings(keys.data(), num_rows); diff --git a/be/src/vec/common/hash_table/hash_map_context_creator.h b/be/src/vec/common/hash_table/hash_map_context_creator.h new file mode 100644 index 00000000000000..60376acea5e2e7 --- /dev/null +++ b/be/src/vec/common/hash_table/hash_map_context_creator.h @@ -0,0 +1,87 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "vec/common/hash_table/hash_map_context.h" +#include "vec/common/hash_table/ph_hash_map.h" + +namespace doris::vectorized { + +template