Skip to content

Commit

Permalink
refine code
Browse files Browse the repository at this point in the history
  • Loading branch information
Mryange committed Dec 27, 2024
1 parent a9a2675 commit 52361ff
Show file tree
Hide file tree
Showing 6 changed files with 134 additions and 138 deletions.
37 changes: 37 additions & 0 deletions be/src/vec/functions/dictionary.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
// under the License.

#include "vec/functions/dictionary.h"
#include "vec/data_types/data_type_nullable.h"

namespace doris::vectorized {

Expand All @@ -27,6 +28,10 @@ IDictionary::IDictionary(std::string name, std::vector<DictionaryAttribute> attr
throw doris::Exception(ErrorCode::INVALID_ARGUMENT,
"The names of attributes should not have duplicates : {}", name);
}
if (_attributes[i].type->is_nullable()) {
throw doris::Exception(ErrorCode::INVALID_ARGUMENT,
"Dictionary attribute should not be nullable : {}", name);
}
_name_to_attributes_index[name] = i;
}
}
Expand All @@ -52,4 +57,36 @@ DataTypePtr IDictionary::get_attribute_type(const std::string& name) const {
size_t idx = it->second;
return _attributes[idx].type;
}

void IDictionary::load_attributes(std::vector<ColumnPtr>& attributes_column) {
// load att column
_attribute_data.resize(attributes_column.size());
for (size_t i = 0; i < attributes_column.size(); i++) {
const DataTypePtr att_type = _attributes[i].type;
ColumnPtr column = attributes_column[i];
auto remove_nullable_data_type = remove_nullable(att_type);
auto remove_nullable_column = remove_nullable(column);

bool valid = IDictionary::cast_type(remove_nullable_data_type.get(), [&](const auto& type) {
using AttributeRealDataType = std::decay_t<decltype(type)>;
using AttributeRealColumnType = AttributeRealDataType::ColumnType;
const auto* res_real_column =
typeid_cast<const AttributeRealColumnType*>(remove_nullable_column.get());
if (!res_real_column) {
return false;
}
auto& att = _attribute_data[i];
ColumnWithType<AttributeRealDataType> column_with_type;
column_with_type.column = AttributeRealColumnType::create(*res_real_column);
att = column_with_type;
return true;
});
if (!valid) {
throw doris::Exception(ErrorCode::INVALID_ARGUMENT,
"Dictionary({}) att type is : {} , but input column is : {}",
dict_name(), att_type->get_name(), column->get_name());
}
}
}

} // namespace doris::vectorized
27 changes: 21 additions & 6 deletions be/src/vec/functions/dictionary.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,6 @@
#pragma once

#include <memory>
#include <mutex>
#include <shared_mutex>
#include <unordered_map>
#include <utility>
#include <vector>
Expand All @@ -30,16 +28,15 @@
#include "vec/data_types/data_type_date_time.h"
#include "vec/data_types/data_type_ipv4.h"
#include "vec/data_types/data_type_ipv6.h"
#include "vec/data_types/data_type_nullable.h"
#include "vec/data_types/data_type_number.h"
#include "vec/data_types/data_type_string.h"
#include "vec/data_types/data_type_time_v2.h"
#include "vec/functions/cast_type_to_either.h"

namespace doris::vectorized {
struct DictionaryAttribute {
const std::string name;
const DataTypePtr type;
const std::string name; // attribute name
const DataTypePtr type; // should be a non-nullable type
};

class IDictionary {
Expand All @@ -58,6 +55,7 @@ class IDictionary {

template <typename F>
static bool cast_type(const IDataType* type, F&& f) {
// The data types supported by cast_type must be consistent with the AttributeData below.
return cast_type_to_either<DataTypeUInt8, DataTypeInt8, DataTypeInt16, DataTypeInt32,
DataTypeInt64, DataTypeInt128, DataTypeFloat32, DataTypeFloat64,
DataTypeIPv4, DataTypeIPv6, DataTypeString, DataTypeDateV2,
Expand All @@ -72,7 +70,16 @@ class IDictionary {
using DataType = Type;
DataType::ColumnType::Ptr column;
};
using ColumnData =

// `AttributeData` is a variant over `ColumnWithType<...>`. Use a value of it with `std::visit` in the following way:
//  std::visit(
//         [&](auto&& arg) {
//             using ColumnWithTypeT = std::decay_t<decltype(arg)>;
//             using AttributeRealDataType = typename ColumnWithTypeT::DataType;
//             using AttributeRealColumnType = typename AttributeRealDataType::ColumnType;
//             // arg.column is an AttributeRealColumnType::Ptr holding the attribute data.
//         },
//         attribute_data);
using AttributeData =
std::variant<ColumnWithType<DataTypeUInt8>, ColumnWithType<DataTypeInt8>,
ColumnWithType<DataTypeInt16>, ColumnWithType<DataTypeInt32>,
ColumnWithType<DataTypeInt64>, ColumnWithType<DataTypeInt128>,
Expand All @@ -90,8 +97,16 @@ class IDictionary {
ColumnWithType<DataTypeDecimal<Decimal128V3>>,
ColumnWithType<DataTypeDecimal<Decimal256>>>;

// load_attributes strips the nullable wrapper from each attribute column and stores only the nested data.
// Any nullable-related data (e.g. the null map) needs to be handled by the subclass dictionary.
void load_attributes(std::vector<ColumnPtr>& attributes_column);

// _attribute_data stores the data of the attribute columns, one entry per attribute column.
// For a nullable input column only the nested (non-nullable) column is stored here; its null map is not.
std::vector<AttributeData> _attribute_data;
const std::string _dict_name;
std::vector<DictionaryAttribute> _attributes;
// A mapping from attribute names to their corresponding indices.
std::unordered_map<std::string, size_t> _name_to_attributes_index;
};

Expand Down
70 changes: 26 additions & 44 deletions be/src/vec/functions/hash_map_dictionary.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,17 @@
#include "vec/common/assert_cast.h"
#include "vec/common/string_ref.h"
#include "vec/core/columns_with_type_and_name.h"
#include "vec/data_types/data_type_nullable.h"
#include "vec/functions/dictionary.h"
#include "vec/runtime/vdatetime_value.h"

namespace doris::vectorized {

// HashMapDictionary is a regular dictionary.
// Its underlying implementation uses a hashmap.
// It maintains a map<Key, RowIndex>. For a given query key, it first retrieves the corresponding RowIndex,
// and then fetches the attribute value from attribute_data.
// This way, if multiple attributes are queried simultaneously, we only need to query the hashmap once.
template <typename KeyDataType>
class HashMapDictionary : public IDictionary {
public:
Expand Down Expand Up @@ -67,7 +73,8 @@ class HashMapDictionary : public IDictionary {
dict_name(), _key_type->get_name(), key_type->get_name());
}
MutableColumnPtr res_column = attribute_type->create_column();
auto& attribute = _hashmap_attributes[attribute_index(attribute_name)];
const auto& att_column = _attribute_data[attribute_index(attribute_name)];
const auto& att_null_map = _attribute_null_maps[attribute_index(attribute_name)];
const auto* real_key_column = assert_cast<const KeyRealColumnType*>(key_column.get());

std::visit(
Expand All @@ -79,9 +86,9 @@ class HashMapDictionary : public IDictionary {
auto* res_real_column = assert_cast<AttributeRealColumnType*>(res_column.get());
const auto& attributes_column = arg.column;

if (attribute.null_map) {
if (att_null_map) {
// att is nullable
const auto& null_map = attribute.null_map->get_data();
const auto& null_map = att_null_map->get_data();
for (size_t i = 0; i < real_key_column->size(); i++) {
const auto& key_value = real_key_column->get_element(i);
auto it = _key_hash_map.find(key_value);
Expand Down Expand Up @@ -112,68 +119,43 @@ class HashMapDictionary : public IDictionary {
}
}
},
attribute.containers);
att_column);

return res_column;
}

private:
void load_data(ColumnPtr& key_column, std::vector<ColumnPtr>& attributes_column) {
// load key column
const auto* key_real_column = assert_cast<const KeyRealColumnType*>(key_column.get());
for (size_t i = 0; i < key_real_column->size(); i++) {
auto key_str = key_real_column->get_element(i);
_key_hash_map[key_str] = i;
}

_hashmap_attributes.resize(attributes_column.size());
// load att column
load_attributes(attributes_column);

// load att nullable
_attribute_null_maps.resize(attributes_column.size());
for (size_t i = 0; i < attributes_column.size(); i++) {
const DataTypePtr att_type = _attributes[i].type;
ColumnPtr column = attributes_column[i];

auto remove_nullable_data_type = remove_nullable(att_type);
auto remove_nullable_column = remove_nullable(column);

// Set containers
bool valid = cast_type(remove_nullable_data_type.get(), [&](const auto& type) {
using AttributeRealDataType = std::decay_t<decltype(type)>;
using AttributeRealColumnType = AttributeRealDataType::ColumnType;
const auto* res_real_column =
typeid_cast<const AttributeRealColumnType*>(remove_nullable_column.get());
if (!res_real_column) {
return false;
}
auto& att = _hashmap_attributes[i];
ColumnWithType<AttributeRealDataType> column_with_type;
column_with_type.column = AttributeRealColumnType::create(*res_real_column);
att.containers = column_with_type;
return true;
});
if (!valid) {
throw doris::Exception(
ErrorCode::INVALID_ARGUMENT,
"HashMapDictionary({}) att type is : {} , but input column is : {}",
dict_name(), att_type->get_name(), column->get_name());
}

// Set other content, such as nullmap (no need to visit).

if (column->is_nullable()) {
auto& att = _hashmap_attributes[i];
att.null_map = ColumnUInt8::create(
auto& null_map = _attribute_null_maps[i];
null_map = ColumnUInt8::create(
assert_cast<const ColumnNullable*>(column.get())->get_null_map_column());
}
}
}

struct ColumnWithNullAttribute final {
ColumnUInt8::Ptr null_map;
ColumnData containers;
};

/// TODO: For String Type, the KeyType is std::string, which causes additional copying.
// It should be changed to StringRef to avoid this overhead.
// map<Key, RowIndex>
phmap::flat_hash_map<KeyType, IColumn::ColumnIndex> _key_hash_map;

std::vector<ColumnWithNullAttribute> _hashmap_attributes;

// If _attribute_null_maps[index] is not nullptr, the original attribute is a nullable column and needs to be handled.
std::vector<ColumnUInt8::Ptr> _attribute_null_maps;
DataTypePtr _key_type;
};

Expand All @@ -191,8 +173,8 @@ inline DictionaryPtr create_hash_map_dict_from_column(const std::string& name,
});

if (!valid) {
throw doris::Exception(ErrorCode::INVALID_ARGUMENT, " Unsupported key type : {}",
key_type->get_name());
throw doris::Exception(ErrorCode::INVALID_ARGUMENT,
"HashMapDictionary Unsupported key type : {}", key_type->get_name());
}
return dict;
}
Expand Down
Loading

0 comments on commit 52361ff

Please sign in to comment.