Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

[Feature](hash-table) unify initialization of HashTableVariants and support set for distinct agg #42046

Merged
merged 14 commits into from
Oct 24, 2024
3 changes: 1 addition & 2 deletions be/src/olap/rowset/segment_v2/column_reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@
#include "vec/columns/column.h"
#include "vec/columns/column_array.h" // ColumnArray
#include "vec/columns/subcolumn_tree.h"
#include "vec/common/hash_table/hash_map_context_creator.h"
#include "vec/data_types/data_type.h"
#include "vec/json/path_in_data.h"

Expand Down Expand Up @@ -725,7 +724,7 @@ class DefaultValueColumnIterator : public ColumnIterator {
class DefaultNestedColumnIterator : public ColumnIterator {
public:
DefaultNestedColumnIterator(std::unique_ptr<ColumnIterator>&& sibling,
DataTypePtr file_column_type)
vectorized::DataTypePtr file_column_type)
: _sibling_iter(std::move(sibling)), _file_column_type(std::move(file_column_type)) {}

Status init(const ColumnIteratorOptions& opts) override {
Expand Down
183 changes: 75 additions & 108 deletions be/src/pipeline/common/agg_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,152 +22,112 @@

#include "vec/common/arena.h"
#include "vec/common/hash_table/hash_map_context.h"
#include "vec/common/hash_table/hash_map_context_creator.h"
#include "vec/common/hash_table/hash_map_util.h"
#include "vec/common/hash_table/ph_hash_map.h"
#include "vec/common/hash_table/string_hash_map.h"

namespace doris {
namespace pipeline {

// Hash map used for aggregation keyed by `T`: a PHHashMap from `T` to
// vectorized::AggregateDataPtr, hashed with HashCRC32<T>.
template <typename T>
using AggData = PHHashMap<T, vectorized::AggregateDataPtr, HashCRC32<T>>;
// The same map as AggData<T>, wrapped in vectorized::DataWithNullKey.
// NOTE(review): presumably the wrapper adds storage for a NULL key; confirm
// against the DataWithNullKey definition.
template <typename T>
using AggDataNullable = vectorized::DataWithNullKey<AggData<T>>;

using AggregatedDataWithoutKey = vectorized::AggregateDataPtr;
using AggregatedDataWithStringKey = PHHashMap<StringRef, vectorized::AggregateDataPtr>;
using AggregatedDataWithShortStringKey = StringHashMap<vectorized::AggregateDataPtr>;
using AggregatedDataWithUInt8Key = PHHashMap<vectorized::UInt8, vectorized::AggregateDataPtr>;
using AggregatedDataWithUInt16Key = PHHashMap<vectorized::UInt16, vectorized::AggregateDataPtr>;
using AggregatedDataWithUInt32Key =
PHHashMap<vectorized::UInt32, vectorized::AggregateDataPtr, HashCRC32<vectorized::UInt32>>;
using AggregatedDataWithUInt64Key =
PHHashMap<vectorized::UInt64, vectorized::AggregateDataPtr, HashCRC32<vectorized::UInt64>>;
using AggregatedDataWithUInt128Key = PHHashMap<vectorized::UInt128, vectorized::AggregateDataPtr,
HashCRC32<vectorized::UInt128>>;
using AggregatedDataWithUInt256Key = PHHashMap<vectorized::UInt256, vectorized::AggregateDataPtr,
HashCRC32<vectorized::UInt256>>;
using AggregatedDataWithUInt136Key = PHHashMap<vectorized::UInt136, vectorized::AggregateDataPtr,
HashCRC32<vectorized::UInt136>>;

using AggregatedDataWithUInt32KeyPhase2 =
PHHashMap<vectorized::UInt32, vectorized::AggregateDataPtr,
HashMixWrapper<vectorized::UInt32>>;
using AggregatedDataWithUInt64KeyPhase2 =
PHHashMap<vectorized::UInt64, vectorized::AggregateDataPtr,
HashMixWrapper<vectorized::UInt64>>;
using AggregatedDataWithUInt128KeyPhase2 =
PHHashMap<vectorized::UInt128, vectorized::AggregateDataPtr,
HashMixWrapper<vectorized::UInt128>>;
using AggregatedDataWithUInt256KeyPhase2 =
PHHashMap<vectorized::UInt256, vectorized::AggregateDataPtr,
HashMixWrapper<vectorized::UInt256>>;

using AggregatedDataWithUInt136KeyPhase2 =
PHHashMap<vectorized::UInt136, vectorized::AggregateDataPtr,
HashMixWrapper<vectorized::UInt136>>;

using AggregatedDataWithNullableUInt8Key = vectorized::DataWithNullKey<AggregatedDataWithUInt8Key>;
using AggregatedDataWithNullableUInt16Key =
vectorized::DataWithNullKey<AggregatedDataWithUInt16Key>;
using AggregatedDataWithNullableUInt32Key =
vectorized::DataWithNullKey<AggregatedDataWithUInt32Key>;
using AggregatedDataWithNullableUInt64Key =
vectorized::DataWithNullKey<AggregatedDataWithUInt64Key>;

using AggregatedDataWithNullableUInt32KeyPhase2 =
vectorized::DataWithNullKey<AggregatedDataWithUInt32KeyPhase2>;
using AggregatedDataWithNullableUInt64KeyPhase2 =
vectorized::DataWithNullKey<AggregatedDataWithUInt64KeyPhase2>;
using AggregatedDataWithNullableShortStringKey =
vectorized::DataWithNullKey<AggregatedDataWithShortStringKey>;
using AggregatedDataWithNullableUInt128Key =
vectorized::DataWithNullKey<AggregatedDataWithUInt128Key>;
using AggregatedDataWithNullableUInt128KeyPhase2 =
vectorized::DataWithNullKey<AggregatedDataWithUInt128KeyPhase2>;

using AggregatedMethodVariants = std::variant<
std::monostate, vectorized::MethodSerialized<AggregatedDataWithStringKey>,
vectorized::MethodOneNumber<vectorized::UInt8, AggregatedDataWithUInt8Key>,
vectorized::MethodOneNumber<vectorized::UInt16, AggregatedDataWithUInt16Key>,
vectorized::MethodOneNumber<vectorized::UInt32, AggregatedDataWithUInt32Key>,
vectorized::MethodOneNumber<vectorized::UInt64, AggregatedDataWithUInt64Key>,
vectorized::MethodOneNumber<vectorized::UInt8, AggData<vectorized::UInt8>>,
vectorized::MethodOneNumber<vectorized::UInt16, AggData<vectorized::UInt16>>,
vectorized::MethodOneNumber<vectorized::UInt32, AggData<vectorized::UInt32>>,
vectorized::MethodOneNumber<vectorized::UInt64, AggData<vectorized::UInt64>>,
vectorized::MethodStringNoCache<AggregatedDataWithShortStringKey>,
vectorized::MethodOneNumber<vectorized::UInt128, AggregatedDataWithUInt128Key>,
vectorized::MethodOneNumber<vectorized::UInt128, AggData<vectorized::UInt128>>,
vectorized::MethodOneNumber<vectorized::UInt256, AggData<vectorized::UInt256>>,
vectorized::MethodOneNumber<vectorized::UInt32, AggregatedDataWithUInt32KeyPhase2>,
vectorized::MethodOneNumber<vectorized::UInt64, AggregatedDataWithUInt64KeyPhase2>,
vectorized::MethodOneNumber<vectorized::UInt128, AggregatedDataWithUInt128KeyPhase2>,
vectorized::MethodSingleNullableColumn<
vectorized::MethodOneNumber<vectorized::UInt8, AggregatedDataWithNullableUInt8Key>>,
vectorized::MethodOneNumber<vectorized::UInt8, AggDataNullable<vectorized::UInt8>>>,
vectorized::MethodSingleNullableColumn<vectorized::MethodOneNumber<
vectorized::UInt16, AggregatedDataWithNullableUInt16Key>>,
vectorized::UInt16, AggDataNullable<vectorized::UInt16>>>,
vectorized::MethodSingleNullableColumn<vectorized::MethodOneNumber<
vectorized::UInt32, AggregatedDataWithNullableUInt32Key>>,
vectorized::UInt32, AggDataNullable<vectorized::UInt32>>>,
vectorized::MethodSingleNullableColumn<vectorized::MethodOneNumber<
vectorized::UInt64, AggregatedDataWithNullableUInt64Key>>,
vectorized::UInt64, AggDataNullable<vectorized::UInt64>>>,
vectorized::MethodSingleNullableColumn<vectorized::MethodOneNumber<
vectorized::UInt32, AggregatedDataWithNullableUInt32KeyPhase2>>,
vectorized::MethodSingleNullableColumn<vectorized::MethodOneNumber<
vectorized::UInt64, AggregatedDataWithNullableUInt64KeyPhase2>>,
vectorized::MethodSingleNullableColumn<vectorized::MethodOneNumber<
vectorized::UInt128, AggregatedDataWithNullableUInt128Key>>,
vectorized::UInt128, AggDataNullable<vectorized::UInt128>>>,
vectorized::MethodSingleNullableColumn<vectorized::MethodOneNumber<
vectorized::UInt128, AggregatedDataWithNullableUInt128KeyPhase2>>,
vectorized::UInt256, AggDataNullable<vectorized::UInt256>>>,
vectorized::MethodSingleNullableColumn<
vectorized::MethodStringNoCache<AggregatedDataWithNullableShortStringKey>>,
vectorized::MethodKeysFixed<AggregatedDataWithUInt64Key, false>,
vectorized::MethodKeysFixed<AggregatedDataWithUInt64Key, true>,
vectorized::MethodKeysFixed<AggregatedDataWithUInt128Key, false>,
vectorized::MethodKeysFixed<AggregatedDataWithUInt128Key, true>,
vectorized::MethodKeysFixed<AggregatedDataWithUInt256Key, false>,
vectorized::MethodKeysFixed<AggregatedDataWithUInt256Key, true>,
vectorized::MethodKeysFixed<AggregatedDataWithUInt136Key, false>,
vectorized::MethodKeysFixed<AggregatedDataWithUInt136Key, true>,
vectorized::MethodKeysFixed<AggregatedDataWithUInt64KeyPhase2, false>,
vectorized::MethodKeysFixed<AggregatedDataWithUInt64KeyPhase2, true>,
vectorized::MethodKeysFixed<AggregatedDataWithUInt128KeyPhase2, false>,
vectorized::MethodKeysFixed<AggregatedDataWithUInt128KeyPhase2, true>,
vectorized::MethodKeysFixed<AggregatedDataWithUInt256KeyPhase2, false>,
vectorized::MethodKeysFixed<AggregatedDataWithUInt256KeyPhase2, true>,
vectorized::MethodKeysFixed<AggregatedDataWithUInt136KeyPhase2, false>,
vectorized::MethodKeysFixed<AggregatedDataWithUInt136KeyPhase2, true>>;
vectorized::MethodKeysFixed<AggData<vectorized::UInt64>, false>,
vectorized::MethodKeysFixed<AggData<vectorized::UInt64>, true>,
vectorized::MethodKeysFixed<AggData<vectorized::UInt128>, false>,
vectorized::MethodKeysFixed<AggData<vectorized::UInt128>, true>,
vectorized::MethodKeysFixed<AggData<vectorized::UInt256>, false>,
vectorized::MethodKeysFixed<AggData<vectorized::UInt256>, true>,
vectorized::MethodKeysFixed<AggData<vectorized::UInt136>, false>,
vectorized::MethodKeysFixed<AggData<vectorized::UInt136>, true>>;

struct AggregatedDataVariants
: public vectorized::DataVariants<AggregatedMethodVariants,
vectorized::MethodSingleNullableColumn,
vectorized::MethodOneNumber, vectorized::MethodKeysFixed,
vectorized::DataWithNullKey> {
: public DataVariants<AggregatedMethodVariants, vectorized::MethodSingleNullableColumn,
vectorized::MethodOneNumber, vectorized::MethodKeysFixed,
vectorized::DataWithNullKey> {
AggregatedDataWithoutKey without_key = nullptr;

template <bool nullable>
void init(Type type) {
_type = type;
switch (_type) {
case Type::without_key:
void init(const std::vector<vectorized::DataTypePtr>& data_types, HashKeyType type) {
switch (type) {
case HashKeyType::without_key:
break;
case Type::serialized:
case HashKeyType::serialized:
method_variant.emplace<vectorized::MethodSerialized<AggregatedDataWithStringKey>>();
break;
case Type::int8_key:
emplace_single<vectorized::UInt8, AggregatedDataWithUInt8Key, nullable>();
case HashKeyType::int8_key:
emplace_single<vectorized::UInt8, AggData<vectorized::UInt8>, nullable>();
break;
case Type::int16_key:
emplace_single<vectorized::UInt16, AggregatedDataWithUInt16Key, nullable>();
case HashKeyType::int16_key:
emplace_single<vectorized::UInt16, AggData<vectorized::UInt16>, nullable>();
break;
case Type::int32_key:
emplace_single<vectorized::UInt32, AggregatedDataWithUInt32Key, nullable>();
case HashKeyType::int32_key:
emplace_single<vectorized::UInt32, AggData<vectorized::UInt32>, nullable>();
break;
case Type::int32_key_phase2:
case HashKeyType::int32_key_phase2:
emplace_single<vectorized::UInt32, AggregatedDataWithUInt32KeyPhase2, nullable>();
break;
case Type::int64_key:
emplace_single<vectorized::UInt64, AggregatedDataWithUInt64Key, nullable>();
case HashKeyType::int64_key:
emplace_single<vectorized::UInt64, AggData<vectorized::UInt64>, nullable>();
break;
case Type::int64_key_phase2:
case HashKeyType::int64_key_phase2:
emplace_single<vectorized::UInt64, AggregatedDataWithUInt64KeyPhase2, nullable>();
break;
case Type::int128_key:
emplace_single<vectorized::UInt128, AggregatedDataWithUInt128Key, nullable>();
case HashKeyType::int128_key:
emplace_single<vectorized::UInt128, AggData<vectorized::UInt128>, nullable>();
break;
case Type::int128_key_phase2:
emplace_single<vectorized::UInt128, AggregatedDataWithUInt128KeyPhase2, nullable>();
case HashKeyType::int256_key:
emplace_single<vectorized::UInt256, AggData<vectorized::UInt256>, nullable>();
break;
case Type::string_key:
case HashKeyType::string_key:
if (nullable) {
method_variant.emplace<
vectorized::MethodSingleNullableColumn<vectorized::MethodStringNoCache<
Expand All @@ -177,16 +137,29 @@ struct AggregatedDataVariants
vectorized::MethodStringNoCache<AggregatedDataWithShortStringKey>>();
}
break;
case HashKeyType::fixed64:
method_variant
.emplace<vectorized::MethodKeysFixed<AggData<vectorized::UInt64>, nullable>>(
get_key_sizes(data_types));
break;
case HashKeyType::fixed128:
method_variant
.emplace<vectorized::MethodKeysFixed<AggData<vectorized::UInt128>, nullable>>(
get_key_sizes(data_types));
break;
case HashKeyType::fixed136:
method_variant
.emplace<vectorized::MethodKeysFixed<AggData<vectorized::UInt136>, nullable>>(
get_key_sizes(data_types));
break;
case HashKeyType::fixed256:
method_variant
.emplace<vectorized::MethodKeysFixed<AggData<vectorized::UInt256>, nullable>>(
get_key_sizes(data_types));
break;
default:
throw Exception(ErrorCode::INTERNAL_ERROR, "meet invalid key type, type={}", type);
}
}

void init(Type type, bool is_nullable = false) {
if (is_nullable) {
init<true>(type);
} else {
init<false>(type);
throw Exception(ErrorCode::INTERNAL_ERROR,
"AggregatedDataVariants meet invalid key type, type={}", type);
}
}
};
Expand All @@ -210,7 +183,7 @@ struct AggregateDataContainer {
}

*reinterpret_cast<KeyType*>(_current_keys) = key;
auto aggregate_data = _current_agg_data;
auto* aggregate_data = _current_agg_data;
++_total_count;
++_index_in_sub_container;
_current_agg_data += _size_of_aggregate_states;
Expand Down Expand Up @@ -275,15 +248,15 @@ struct AggregateDataContainer {
using IteratorBase<ConstIterator, true>::IteratorBase;
};

ConstIterator begin() const { return ConstIterator(this, 0); }
ConstIterator begin() const { return {this, 0}; }

ConstIterator cbegin() const { return begin(); }

Iterator begin() { return Iterator(this, 0); }
Iterator begin() { return {this, 0}; }

ConstIterator end() const { return ConstIterator(this, _total_count); }
ConstIterator end() const { return {this, _total_count}; }
ConstIterator cend() const { return end(); }
Iterator end() { return Iterator(this, _total_count); }
Iterator end() { return {this, _total_count}; }

void init_once() {
if (_inited) {
Expand Down Expand Up @@ -331,10 +304,4 @@ struct AggregateDataContainer {
uint32_t _total_count {};
bool _inited = false;
};

} // namespace pipeline

constexpr auto init_agg_hash_method =
init_hash_method<pipeline::AggregatedDataVariants, vectorized::AggregateDataPtr>;

} // namespace doris
Loading
Loading