Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[opt](hashset)Optimize u8/i8 distinct by using SmallFixedSizeHashSet #46542

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 34 additions & 2 deletions be/src/pipeline/common/distinct_agg_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,45 @@
#include "vec/common/hash_table/ph_hash_map.h"
#include "vec/common/hash_table/ph_hash_set.h"
#include "vec/common/hash_table/string_hash_map.h"
#include "vec/core/types.h"

namespace doris {

// Trait that selects the hash-set implementation used for distinct keys of type T.
// Primary template: a PHHashSet keyed by T and hashed with HashCRC32<T>.
template <typename T>
struct DistinctHashSetType {
using HashSet = PHHashSet<T, HashCRC32<T>>;
};

// UInt8 keys use SmallFixedSizeHashSet instead of PHHashSet.
template <>
struct DistinctHashSetType<vectorized::UInt8> {
using HashSet = SmallFixedSizeHashSet<vectorized::UInt8>;
};

// Int8 keys use SmallFixedSizeHashSet instead of PHHashSet.
template <>
struct DistinctHashSetType<vectorized::Int8> {
using HashSet = SmallFixedSizeHashSet<vectorized::Int8>;
};

// Trait that selects the hash-set implementation behind DistinctDataPhase2.
// Primary template: a PHHashSet keyed by T and hashed with HashMixWrapper<T>.
// NOTE(review): presumably "phase 2" is a second aggregation stage that needs a
// different hasher than HashCRC32 — confirm against the callers.
template <typename T>
struct DistinctPhase2HashSetType {
using HashSet = PHHashSet<T, HashMixWrapper<T>>;
};

// UInt8 keys use SmallFixedSizeHashSet; no hasher is involved for this type.
template <>
struct DistinctPhase2HashSetType<vectorized::UInt8> {
using HashSet = SmallFixedSizeHashSet<vectorized::UInt8>;
};

// Int8 keys use SmallFixedSizeHashSet; no hasher is involved for this type.
template <>
struct DistinctPhase2HashSetType<vectorized::Int8> {
using HashSet = SmallFixedSizeHashSet<vectorized::Int8>;
};

// Hash-set alias for distinct keys of type T.
// NOTE(review): this rendered diff shows two definitions back to back — one naming
// PHHashSet directly and one going through DistinctHashSetType<T>. Presumably the
// first is the removed line and the second the added one; only one can exist in
// the real header. Confirm against the actual file.
template <typename T>
using DistinctData = PHHashSet<T, HashCRC32<T>>;
using DistinctData = typename DistinctHashSetType<T>::HashSet;

// Hash-set alias for the "phase 2" variant, keyed by T.
// NOTE(review): same as above — two definitions are shown, one naming PHHashSet
// with HashMixWrapper directly and one going through DistinctPhase2HashSetType<T>.
template <typename T>
using DistinctDataPhase2 = PHHashSet<T, HashMixWrapper<T>>;
using DistinctDataPhase2 = typename DistinctPhase2HashSetType<T>::HashSet;

using DistinctDataWithStringKey = PHHashSet<StringRef>;

Expand Down
100 changes: 100 additions & 0 deletions be/src/vec/common/hash_table/ph_hash_set.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#pragma once

#include <boost/core/noncopyable.hpp>
#include <cstdint>

#include "vec/common/hash_table/hash.h"
#include "vec/common/hash_table/phmap_fwd_decl.h"
Expand Down Expand Up @@ -110,3 +111,102 @@ class PHHashSet : private boost::noncopyable {
private:
HashSetImpl _hash_set;
};

/// A set for small fixed-width integral keys (for example int8_t or int16_t).
///
/// Instead of hashing, the set keeps one byte per possible key value. A key is
/// converted to its unsigned counterpart and used directly as the index into that
/// table, so a membership test or an insert is a single array access.
///
/// The table is stored inline in the object: 2^(8 * sizeof(KeyType)) bytes plus
/// one trailing sentinel byte.
template <typename KeyType>
class SmallFixedSizeHashSet {
public:
    using key_type = KeyType;
    using mapped_type = void;
    using value_type = void;
    using Value = void*;
    static_assert(std::is_integral_v<KeyType>);
    // The table has one slot per representable key; anything wider than 16 bits
    // would overflow hash_table_size and could not be stored inline anyway.
    static_assert(sizeof(KeyType) <= 2,
                  "SmallFixedSizeHashSet only supports 1- and 2-byte integral keys");
    using search_key_type = std::make_unsigned_t<KeyType>;
    using SetType = uint8_t;

    /// Number of addressable slots: one per distinct value of KeyType.
    static constexpr int hash_table_size = 1 << sizeof(KeyType) * 8;

    using LookupResult = void*;
    /// Value written to the extra slot that follows the last addressable slot.
    /// Spelled as 0xFF because SetType is a single byte.
    static constexpr auto End = static_cast<SetType>(0xFF);

    /// Marker stored in a slot whose key is present.
    static constexpr SetType set_flag = 1;

    /// Creates an empty set: every slot is cleared and the sentinel is written.
    SmallFixedSizeHashSet() {
        memset(_hash_table, 0, sizeof(SetType) * hash_table_size);
        _hash_table[hash_table_size] = End;
    }

    /// Creates an empty set. The capacity is fixed, so the hint is ignored; the
    /// table is still cleared by delegating to the default constructor.
    SmallFixedSizeHashSet(size_t reserve_for_num_elements) : SmallFixedSizeHashSet() {}

    /// Move construction copies the source's table and size (see operator=).
    SmallFixedSizeHashSet(SmallFixedSizeHashSet&& other) noexcept { *this = std::move(other); }

    /// Move assignment. The table lives inline, so this is a byte copy; the source
    /// keeps its contents.
    SmallFixedSizeHashSet& operator=(SmallFixedSizeHashSet&& rhs) noexcept {
        // memcpy requires non-overlapping ranges, so self-assignment is skipped.
        if (this != &rhs) {
            _size = rhs._size;
            memcpy(_hash_table, rhs._hash_table, sizeof(SetType) * hash_table_size);
            _hash_table[hash_table_size] = End;
        }
        return *this;
    }

    /// Returns the key converted to size_t. The lazy_emplace overloads below do
    /// not use hash values; they index the table by the key itself.
    size_t hash(const KeyType& x) const { return x; }

    /// Inserts key_holder if it is not present yet.
    ///
    /// When the key is absent, f(ctor, key_holder) is called; f is expected to
    /// call ctor(key), which marks the slot and increments the size. When the key
    /// is already present, f is not called. `it` is left untouched.
    template <typename KeyHolder, typename Func>
    void ALWAYS_INLINE lazy_emplace(KeyHolder&& key_holder, LookupResult& it, Func&& f) {
        if (_hash_table[static_cast<search_key_type>(key_holder)] != set_flag) {
            auto ctor = [&](auto& key_value) {
                _size++;
                _hash_table[static_cast<search_key_type>(key_value)] = set_flag;
            };
            f(ctor, key_holder);
        }
    }

    /// Same as above for callers that pass a precomputed hash. hash_value is
    /// ignored, and f is invoked as f(ctor, key, key).
    template <typename KeyHolder, typename Func>
    void ALWAYS_INLINE lazy_emplace(KeyHolder&& key, LookupResult& it, size_t hash_value,
                                    Func&& f) {
        if (_hash_table[static_cast<search_key_type>(key)] != set_flag) {
            auto ctor = [&](auto& key_value) {
                _size++;
                _hash_table[static_cast<search_key_type>(key_value)] = set_flag;
            };
            f(ctor, key, key);
        }
    }

    /// No-op: the whole table is a small inline array, nothing is prefetched.
    template <bool read>
    void ALWAYS_INLINE prefetch(const KeyType& key, size_t hash_value) {}

    /// Calls func once for every occupied slot, in ascending slot order.
    /// NOTE(review): func receives the slot index as an int (the unsigned form of
    /// the key), not a KeyType; for signed keys a negative key shows up as its
    /// unsigned value. Confirm this is what callers expect.
    template <typename Func>
    void for_each_mapped(Func&& func) {
        for (int i = 0; i < hash_table_size; i++) {
            if (_hash_table[i] == set_flag) {
                func(i);
            }
        }
    }

    /// Size of the addressable table in bytes (the sentinel slot is not counted).
    size_t get_buffer_size_in_bytes() const { return sizeof(SetType) * hash_table_size; }

    /// Number of addressable slots.
    size_t get_buffer_size_in_cells() const { return hash_table_size; }

    /// Always false: the table already has a slot for every possible key.
    bool add_elem_size_overflow(size_t row) const { return false; }

    /// Number of distinct keys inserted so far.
    size_t size() const { return _size; }

    /// This set has no null-key storage; always returns nullptr.
    template <typename MappedType>
    void* get_null_key_data() {
        return nullptr;
    }

    bool has_null_key_data() const { return false; }

    bool empty() const { return _size == 0; }

    /// No-op: the storage is inline and fixed-size.
    void clear_and_shrink() {}

    /// No-op: the storage is inline and fixed-size.
    void reserve(size_t num_elem) {}

private:
    size_t _size = 0;
    // One byte per possible key, plus one trailing slot that always holds End.
    SetType _hash_table[hash_table_size + 1];
};
98 changes: 98 additions & 0 deletions be/test/vec/utils/small_size_hash_set_test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include <gtest/gtest-message.h>
#include <gtest/gtest-test-part.h>
#include <gtest/gtest.h>

#include <cstdint>
#include <vector>

#include "vec/common/hash_table/ph_hash_set.h"

namespace doris::vectorized {

// Feeds the same positive and negative int8_t keys to SmallFixedSizeHashSet and to
// a PHHashSet used as the reference, and checks that both report the same size
// before inserting, after inserting, and after re-inserting. The second pass also
// checks that the construction callback is never invoked for existing keys.
TEST(SmallSizeHashSetTest, testint8) {
    SmallFixedSizeHashSet<int8_t> direct_set;
    PHHashSet<int8_t, HashCRC32<int8_t>> reference_set;

    // Both sets start out empty.
    EXPECT_EQ(direct_set.size(), reference_set.size());
    EXPECT_EQ(0, direct_set.size());
    EXPECT_EQ(0, reference_set.size());

    void* lookup_slot = nullptr;

    std::vector<int8_t> keys = {1,  2,  3,  4,  5,  6,  7,  8,  9,  10,
                                -1, -2, -3, -4, -5, -6, -7, -8, -9, -10};

    // First pass: every key is new, so the callback constructs it.
    for (auto pos = keys.begin(); pos != keys.end(); ++pos) {
        direct_set.lazy_emplace(*pos, lookup_slot,
                                [&](auto& ctor, auto& key_holder) { ctor(key_holder); });
        reference_set.lazy_emplace(*pos, lookup_slot,
                                   [&](auto& ctor, auto& key_holder) { ctor(key_holder); });
    }

    EXPECT_EQ(direct_set.size(), reference_set.size());

    // Second pass: every key already exists, so reaching the callback is a failure.
    for (auto pos = keys.begin(); pos != keys.end(); ++pos) {
        direct_set.lazy_emplace(*pos, lookup_slot, [&](auto& ctor, auto& key_holder) {
            EXPECT_TRUE(false);
            ctor(key_holder);
        });
        reference_set.lazy_emplace(*pos, lookup_slot, [&](auto& ctor, auto& key_holder) {
            EXPECT_TRUE(false);
            ctor(key_holder);
        });
    }
    EXPECT_EQ(direct_set.size(), reference_set.size());

    std::cout << "hash set size : " << direct_set.size() << "\t" << reference_set.size();
}

// Feeds the same uint8_t keys (including values above 127) to SmallFixedSizeHashSet
// and to a PHHashSet used as the reference, and checks that both report the same
// size before inserting, after inserting, and after re-inserting. The second pass
// also checks that the construction callback is never invoked for existing keys.
TEST(SmallSizeHashSetTest, testuint8) {
    SmallFixedSizeHashSet<uint8_t> small_hash_set;
    // The reference set is keyed by uint8_t as well, so both sets see the same
    // key type and the hasher matches the key.
    PHHashSet<uint8_t, HashCRC32<uint8_t>> ph_hash_set;
    EXPECT_EQ(small_hash_set.size(), ph_hash_set.size());
    EXPECT_EQ(0, small_hash_set.size());
    EXPECT_EQ(0, ph_hash_set.size());

    void* lookup_result_get_mapped = nullptr;

    std::vector<uint8_t> input_data = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 233, 24, 51, 231};

    // First pass: every key is new, so the callback constructs it.
    for (auto& data : input_data) {
        small_hash_set.lazy_emplace(data, lookup_result_get_mapped,
                                    [&](auto& ctor, auto& key_holder) { ctor(key_holder); });
        ph_hash_set.lazy_emplace(data, lookup_result_get_mapped,
                                 [&](auto& ctor, auto& key_holder) { ctor(key_holder); });
    }

    EXPECT_EQ(small_hash_set.size(), ph_hash_set.size());

    // Second pass: every key already exists, so reaching the callback is a failure.
    for (auto& data : input_data) {
        small_hash_set.lazy_emplace(data, lookup_result_get_mapped,
                                    [&](auto& ctor, auto& key_holder) {
                                        EXPECT_TRUE(false);
                                        ctor(key_holder);
                                    });
        ph_hash_set.lazy_emplace(data, lookup_result_get_mapped, [&](auto& ctor, auto& key_holder) {
            EXPECT_TRUE(false);
            ctor(key_holder);
        });
    }
    EXPECT_EQ(small_hash_set.size(), ph_hash_set.size());
    std::cout << "hash set size : " << small_hash_set.size() << "\t" << ph_hash_set.size();
}
} // namespace doris::vectorized
Loading