diff --git a/be/src/vec/functions/dictionary.cpp b/be/src/vec/functions/dictionary.cpp index 2e507852cff853..b672fb4f70d208 100644 --- a/be/src/vec/functions/dictionary.cpp +++ b/be/src/vec/functions/dictionary.cpp @@ -16,6 +16,7 @@ // under the License. #include "vec/functions/dictionary.h" +#include "vec/data_types/data_type_nullable.h" namespace doris::vectorized { @@ -27,6 +28,10 @@ IDictionary::IDictionary(std::string name, std::vector attr throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "The names of attributes should not have duplicates : {}", name); } + if (_attributes[i].type->is_nullable()) { + throw doris::Exception(ErrorCode::INVALID_ARGUMENT, + "Dictionary attribute should not be nullable : {}", name); + } _name_to_attributes_index[name] = i; } } @@ -52,4 +57,36 @@ DataTypePtr IDictionary::get_attribute_type(const std::string& name) const { size_t idx = it->second; return _attributes[idx].type; } + +void IDictionary::load_attributes(std::vector& attributes_column) { + // load att column + _attribute_data.resize(attributes_column.size()); + for (size_t i = 0; i < attributes_column.size(); i++) { + const DataTypePtr att_type = _attributes[i].type; + ColumnPtr column = attributes_column[i]; + auto remove_nullable_data_type = remove_nullable(att_type); + auto remove_nullable_column = remove_nullable(column); + + bool valid = IDictionary::cast_type(remove_nullable_data_type.get(), [&](const auto& type) { + using AttributeRealDataType = std::decay_t; + using AttributeRealColumnType = AttributeRealDataType::ColumnType; + const auto* res_real_column = + typeid_cast(remove_nullable_column.get()); + if (!res_real_column) { + return false; + } + auto& att = _attribute_data[i]; + ColumnWithType column_with_type; + column_with_type.column = AttributeRealColumnType::create(*res_real_column); + att = column_with_type; + return true; + }); + if (!valid) { + throw doris::Exception(ErrorCode::INVALID_ARGUMENT, + "Dictionary({}) att type is : {} , but input column is : {}", + dict_name(), att_type->get_name(), column->get_name()); + } + } +} + } // namespace doris::vectorized diff --git a/be/src/vec/functions/dictionary.h b/be/src/vec/functions/dictionary.h index 32b6d92cd925fd..9a6ef8ddbd53d4 100644 --- a/be/src/vec/functions/dictionary.h +++ b/be/src/vec/functions/dictionary.h @@ -18,8 +18,6 @@ #pragma once #include -#include -#include #include #include #include @@ -30,7 +28,6 @@ #include "vec/data_types/data_type_date_time.h" #include "vec/data_types/data_type_ipv4.h" #include "vec/data_types/data_type_ipv6.h" -#include "vec/data_types/data_type_nullable.h" #include "vec/data_types/data_type_number.h" #include "vec/data_types/data_type_string.h" #include "vec/data_types/data_type_time_v2.h" @@ -38,8 +35,8 @@ namespace doris::vectorized { struct DictionaryAttribute { - const std::string name; - const DataTypePtr type; + const std::string name; // attribute name + const DataTypePtr type; // should be a non-nullable type }; class IDictionary { @@ -58,6 +55,7 @@ class IDictionary { template static bool cast_type(const IDataType* type, F&& f) { + // The data types supported by cast_type must be consistent with the AttributeData below. return cast_type_to_either; + // using AttributeRealDataType = typename HashTableType::DataType; + // using AttributeRealColumnType = typename AttributeRealDataType::ColumnType; + // }, + // AttributeData); + using AttributeData = std::variant, ColumnWithType, ColumnWithType, ColumnWithType, ColumnWithType, ColumnWithType, @@ -90,8 +97,16 @@ class IDictionary { ColumnWithType>, ColumnWithType>>; + // load_attributes will remove nullable attributes. + // Any nullable-related data needs to be handled by the subclass dictionary. + void load_attributes(std::vector& attributes_column); + + // _attribute_data is used to store the data of attribute columns. + // Nullable columns are not stored here. + std::vector _attribute_data; const std::string _dict_name; std::vector _attributes; + // A mapping from attribute names to their corresponding indices. std::unordered_map _name_to_attributes_index; }; diff --git a/be/src/vec/functions/hash_map_dictionary.h b/be/src/vec/functions/hash_map_dictionary.h index 6928bd967af7ad..4f5b6d62985eca 100644 --- a/be/src/vec/functions/hash_map_dictionary.h +++ b/be/src/vec/functions/hash_map_dictionary.h @@ -29,11 +29,17 @@ #include "vec/common/assert_cast.h" #include "vec/common/string_ref.h" #include "vec/core/columns_with_type_and_name.h" +#include "vec/data_types/data_type_nullable.h" #include "vec/functions/dictionary.h" #include "vec/runtime/vdatetime_value.h" namespace doris::vectorized { +// HashMapDictionary is a regular dictionary. +// Its underlying implementation uses a hashmap. +// It maintains a map. For a given query key, it first retrieves the corresponding RowIndex, +// and then fetches the attribute value from attribute_data. +// This way, if multiple attributes are queried simultaneously, we only need to query the hashmap once. template class HashMapDictionary : public IDictionary { public: @@ -67,7 +73,8 @@ class HashMapDictionary : public IDictionary { dict_name(), _key_type->get_name(), key_type->get_name()); } MutableColumnPtr res_column = attribute_type->create_column(); - auto& attribute = _hashmap_attributes[attribute_index(attribute_name)]; + const auto& att_column = _attribute_data[attribute_index(attribute_name)]; + const auto& att_null_map = _attribute_null_maps[attribute_index(attribute_name)]; const auto* real_key_column = assert_cast(key_column.get()); std::visit( @@ -79,9 +86,9 @@ class HashMapDictionary : public IDictionary { auto* res_real_column = assert_cast(res_column.get()); const auto& attributes_column = arg.column; - if (attribute.null_map) { + if (att_null_map) { // att is nullable - const auto& null_map = attribute.null_map->get_data(); + const auto& null_map = att_null_map->get_data(); for (size_t i = 0; i < real_key_column->size(); i++) { const auto& key_value = real_key_column->get_element(i); auto it = _key_hash_map.find(key_value); @@ -112,68 +119,43 @@ class HashMapDictionary : public IDictionary { } } }, - attribute.containers); + att_column); return res_column; } private: void load_data(ColumnPtr& key_column, std::vector& attributes_column) { + // load key column const auto* key_real_column = assert_cast(key_column.get()); for (size_t i = 0; i < key_real_column->size(); i++) { auto key_str = key_real_column->get_element(i); _key_hash_map[key_str] = i; } - _hashmap_attributes.resize(attributes_column.size()); + // load att column + load_attributes(attributes_column); + + // load att nullable + _attribute_null_maps.resize(attributes_column.size()); for (size_t i = 0; i < attributes_column.size(); i++) { const DataTypePtr att_type = _attributes[i].type; ColumnPtr column = attributes_column[i]; - - auto remove_nullable_data_type = remove_nullable(att_type); - auto remove_nullable_column = remove_nullable(column); - - // Set containers - bool valid = cast_type(remove_nullable_data_type.get(), [&](const auto& type) { - using AttributeRealDataType = std::decay_t; - using AttributeRealColumnType = AttributeRealDataType::ColumnType; - const auto* res_real_column = - typeid_cast(remove_nullable_column.get()); - if (!res_real_column) { - return false; - } - auto& att = _hashmap_attributes[i]; - ColumnWithType column_with_type; - column_with_type.column = AttributeRealColumnType::create(*res_real_column); - att.containers = column_with_type; - return true; - }); - if (!valid) { - throw doris::Exception( - ErrorCode::INVALID_ARGUMENT, - "HashMapDictionary({}) att type is : {} , but input column is : {}", - dict_name(), att_type->get_name(), column->get_name()); - } - - // Set other content, such as nullmap (no need to visit). - if (column->is_nullable()) { - auto& att = _hashmap_attributes[i]; - att.null_map = ColumnUInt8::create( + auto& null_map = _attribute_null_maps[i]; + null_map = ColumnUInt8::create( assert_cast(column.get())->get_null_map_column()); } } } - struct ColumnWithNullAttribute final { - ColumnUInt8::Ptr null_map; - ColumnData containers; - }; - + /// TODO: For String Type, the KeyType is std::string, which causes additional copying. + // It should be changed to StringRef to avoid this overhead. + // map phmap::flat_hash_map _key_hash_map; - std::vector _hashmap_attributes; - + // If _attribute_null_maps[index] is not nullptr, the original attribute is a nullable column and needs to be handled. + std::vector _attribute_null_maps; DataTypePtr _key_type; }; @@ -191,8 +173,8 @@ inline DictionaryPtr create_hash_map_dict_from_column(const std::string& name, }); if (!valid) { - throw doris::Exception(ErrorCode::INVALID_ARGUMENT, " Unsupported key type : {}", - key_type->get_name()); + throw doris::Exception(ErrorCode::INVALID_ARGUMENT, + "HashMapDictionary Unsupported key type : {}", key_type->get_name()); } return dict; } diff --git a/be/src/vec/functions/ip_address_dictionary.cpp b/be/src/vec/functions/ip_address_dictionary.cpp index 75887c54abca19..a45ebb301138de 100644 --- a/be/src/vec/functions/ip_address_dictionary.cpp +++ b/be/src/vec/functions/ip_address_dictionary.cpp @@ -18,20 +18,14 @@ #include "vec/functions/ip_address_dictionary.h" #include -#include -#include #include #include -#include "gutil/strings/split.h" #include "vec/columns/column.h" #include "vec/columns/column_string.h" #include "vec/columns/columns_number.h" #include "vec/common/assert_cast.h" #include "vec/core/types.h" -#include "vec/data_types/data_type_number.h" -#include "vec/exprs/vlambda_function_call_expr.h" -#include "vec/functions/dictionary.h" #include "vec/runtime/ip_address_cidr.h" #include "vec/runtime/ipv4_value.h" @@ -49,7 +43,7 @@ ColumnPtr IPAddressDictionary::getColumn(const std::string& attribute_name, } MutableColumnPtr res_column = attribute_type->create_column(); - const auto& attribute = _column_data[attribute_index(attribute_name)]; + const auto& attribute = _attribute_data[attribute_index(attribute_name)]; if (WhichDataType {key_type}.is_ipv6()) { const auto* ipv6_column = assert_cast(key_column.get()); @@ -126,47 +120,24 @@ struct IPRecord { void IPAddressDictionary::load_data(ColumnPtr& key_column, std::vector& attributes_column) { - const auto* str_column = assert_cast(key_column.get()); + // load att column + load_attributes(attributes_column); - std::vector ip_records; + // Construct an IP trie - // load key column + // Step 1: Import the CIDR data. + // Record the parsed CIDR and the corresponding row from the original data. + const auto* str_column = assert_cast(key_column.get()); + std::vector ip_records; for (size_t i = 0; i < str_column->size(); i++) { auto ip_str = str_column->get_element(i); ip_records.push_back(IPRecord {parse_ip_with_cidr(ip_str), i}); } - // load att column - - _column_data.resize(attributes_column.size()); - - for (size_t i = 0; i < attributes_column.size(); i++) { - const DataTypePtr att_type = _attributes[i].type; - ColumnPtr column = attributes_column[i]; - bool valid = IDictionary::cast_type(att_type.get(), [&](const auto& type) { - using AttributeRealDataType = std::decay_t; - using AttributeRealColumnType = AttributeRealDataType::ColumnType; - const auto* res_real_column = typeid_cast(column.get()); - if (!res_real_column) { - return false; - } - auto& att = _column_data[i]; - ColumnWithType column_with_type; - column_with_type.column = AttributeRealColumnType::create(*res_real_column); - att = column_with_type; - return true; - }); - if (!valid) { - throw doris::Exception( - ErrorCode::INVALID_ARGUMENT, - "IPAddressDictionary({}) att type is : {} , but input column is : {}", - dict_name(), att_type->get_name(), column->get_name()); - } - } - - // build ip trie + // Step 2: Process IP data. - // sort all ip + // Step 2.1: Process all corresponding CIDRs as IPv6. + // Sort them by {the value of IPv6, the prefix length of the CIDR in IPv6}. std::sort(ip_records.begin(), ip_records.end(), [&](const IPRecord& a, const IPRecord& b) { if (a.to_ipv6() == b.to_ipv6()) { return a.prefix() < b.prefix(); @@ -174,18 +145,38 @@ void IPAddressDictionary::load_data(ColumnPtr& key_column, return a.to_ipv6() < b.to_ipv6(); }); + // Step 2.2: Remove duplicate data. auto new_end = std::unique(ip_records.begin(), ip_records.end(), [&](const IPRecord& a, const IPRecord& b) { return a.to_ipv6() == b.to_ipv6() && a.prefix() == b.prefix(); }); ip_records.erase(new_end, ip_records.end()); + // Step 3: Process the data needed for the Trie. + // You can treat ip_column, prefix_column, and origin_row_idx_column as a whole. + // struct TrieNode { + // IPv6 ip; + // UInt8 prefix; + // size_t origin_row_idx; + // }; for (const auto& record : ip_records) { ip_column.push_back(record.to_ipv6()); - mask_column.push_back(record.prefix()); - row_idx.push_back(record.row); + prefix_column.push_back(record.prefix()); + origin_row_idx_column.push_back(record.row); } + // Step 4: Construct subnet relationships. + // The CIDR at index i is a subnet of the CIDR at index parent_subnet[i], for example: + // 192.168.0.0/24 [0] + // ├── 192.168.0.0/25 [1] + // │ ├── 192.168.0.0/26 [2] + // │ └── 192.168.0.64/26 [3] + // └── 192.168.0.128/25 [4] + // parent_subnet[4] = 0 + // parent_subnet[3] = 1 + // parent_subnet[2] = 1 + // parent_subnet[1] = 0 + // parent_subnet[0] = 0 (itself) parent_subnet.resize(ip_records.size()); std::stack subnets_stack; for (auto i = 0; i < ip_records.size(); i++) { @@ -194,13 +185,12 @@ void IPAddressDictionary::load_data(ColumnPtr& key_column, size_t pi = subnets_stack.top(); auto cur_address_ip = ip_records[i].to_ipv6(); - const auto* cur_addi = reinterpret_cast(&cur_address_ip); - auto cur_subnet_ip = ip_records[pi].to_ipv6(); - const auto* cur_addip = reinterpret_cast(&cur_subnet_ip); + const auto* addr = reinterpret_cast(&cur_address_ip); + auto parent_subnet_ip = ip_records[pi].to_ipv6(); + const auto* parent_addr = reinterpret_cast(&parent_subnet_ip); bool is_mask_smaller = ip_records[pi].prefix() < ip_records[i].prefix(); - if (is_mask_smaller && - match_ipv6_subnet(cur_addi, cur_addip, ip_records[pi].prefix())) { + if (is_mask_smaller && match_ipv6_subnet(addr, parent_addr, ip_records[pi].prefix())) { parent_subnet[i] = pi; break; } @@ -212,14 +202,15 @@ void IPAddressDictionary::load_data(ColumnPtr& key_column, } IPAddressDictionary::RowIdxConstIter IPAddressDictionary::lookupIP(IPv6 target) const { - if (row_idx.empty()) { + if (origin_row_idx_column.empty()) { return ip_not_found(); } auto comp = [&](IPv6 value, auto idx) -> bool { return value < ip_column[idx]; }; - auto range = std::ranges::views::iota(0ULL, row_idx.size()); + auto range = std::ranges::views::iota(0ULL, origin_row_idx_column.size()); + // Query Step 1: First, use binary search to find a CIDR that is close to the target. auto found_it = std::ranges::upper_bound(range, target, comp); if (found_it == range.begin()) { @@ -228,10 +219,12 @@ IPAddressDictionary::RowIdxConstIter IPAddressDictionary::lookupIP(IPv6 target) --found_it; + // Query Step 2: Based on the subnet relationships, find the first matching CIDR. for (auto idx = *found_it;; idx = parent_subnet[idx]) { if (match_ipv6_subnet(reinterpret_cast(&target), - reinterpret_cast(&ip_column[idx]), mask_column[idx])) { - return row_idx.begin() + idx; + reinterpret_cast(&ip_column[idx]), + prefix_column[idx])) { + return origin_row_idx_column.begin() + idx; } if (idx == parent_subnet[idx]) { return ip_not_found(); diff --git a/be/src/vec/functions/ip_address_dictionary.h b/be/src/vec/functions/ip_address_dictionary.h index 8ab112655afdb8..d939498592104a 100644 --- a/be/src/vec/functions/ip_address_dictionary.h +++ b/be/src/vec/functions/ip_address_dictionary.h @@ -18,12 +18,9 @@ #pragma once #include -#include #include #include "vec/columns/column.h" -#include "vec/columns/columns_number.h" -#include "vec/common/string_ref.h" #include "vec/core/column_with_type_and_name.h" #include "vec/core/columns_with_type_and_name.h" #include "vec/core/types.h" @@ -54,7 +51,7 @@ class IPAddressDictionary : public IDictionary { private: using RowIdxConstIter = std::vector::const_iterator; - RowIdxConstIter ip_not_found() const { return row_idx.end(); } + RowIdxConstIter ip_not_found() const { return origin_row_idx_column.end(); } RowIdxConstIter lookupIP(IPv6 target) const; @@ -62,13 +59,11 @@ class IPAddressDictionary : public IDictionary { std::vector ip_column; - std::vector mask_column; + std::vector prefix_column; - std::vector parent_subnet; - - std::vector row_idx; + std::vector origin_row_idx_column; - std::vector _column_data; + std::vector parent_subnet; }; inline DictionaryPtr create_ip_trie_dict_from_column(const std::string& name, diff --git a/be/test/vec/function/function_ip_dict_test.h b/be/test/vec/function/function_ip_dict_test.h index 239b6115273d84..81d744e2d25a38 100644 --- a/be/test/vec/function/function_ip_dict_test.h +++ b/be/test/vec/function/function_ip_dict_test.h @@ -61,7 +61,7 @@ class MockIPAddressDictionary : public IDictionary { ColumnPtr getColumn(const std::string& attribute_name, const DataTypePtr& attribute_type, const ColumnPtr& key_column, const DataTypePtr& key_type) const override { MutableColumnPtr res_column = attribute_type->create_column(); - const auto& attribute = _column_data[attribute_index(attribute_name)]; + const auto& attribute = _attribute_data[attribute_index(attribute_name)]; if (WhichDataType {key_type}.is_ipv6()) { const auto* ipv6_column = assert_cast(key_column.get()); @@ -140,32 +140,7 @@ class MockIPAddressDictionary : public IDictionary { std::sort(ip_records.begin(), ip_records.end(), [&](const IPRecord& a, const IPRecord& b) { return a.prefix() > b.prefix(); }); - // load att column - _column_data.resize(attributes_column.size()); - for (size_t i = 0; i < attributes_column.size(); i++) { - const DataTypePtr att_type = _attributes[i].type; - ColumnPtr column = attributes_column[i]; - bool valid = IDictionary::cast_type(att_type.get(), [&](const auto& type) { - using AttributeRealDataType = std::decay_t; - using AttributeRealColumnType = AttributeRealDataType::ColumnType; - const auto* res_real_column = - typeid_cast(column.get()); - if (!res_real_column) { - return false; - } - auto& att = _column_data[i]; - ColumnWithType column_with_type; - column_with_type.column = AttributeRealColumnType::create(*res_real_column); - att = column_with_type; - return true; - }); - if (!valid) { - throw doris::Exception( - ErrorCode::INVALID_ARGUMENT, - "IPAddressDictionary({}) att type is : {} , but input column is : {}", - dict_name(), att_type->get_name(), column->get_name()); - } - } + load_attributes(attributes_column); } using RowIdxConstIter = std::vector::const_iterator; @@ -184,7 +159,6 @@ class MockIPAddressDictionary : public IDictionary { RowIdxConstIter ip_not_found() const { return ip_records.end(); } std::vector ip_records; - std::vector _column_data; }; inline DictionaryPtr create_mock_ip_trie_dict_from_column(const std::string& name,