Skip to content

Commit

Permalink
[opt](hashset)Optimize u8/i8 distinct by using SmallFixedSizeHashSet (#…
Browse files Browse the repository at this point in the history
…46542)

### What problem does this PR solve?
Optimize u8/i8 distinct by using SmallFixedSizeHashSet
Before this change:
```
mysql [test]>select count(distinct u8) from nums;
+--------------------+
| count(DISTINCT u8) |
+--------------------+
|                256 |
+--------------------+
1 row in set (0.17 sec)
```
After this change:
```
mysql [test]>select count(distinct u8) from nums;
+--------------------+
| count(DISTINCT u8) |
+--------------------+
|                256 |
+--------------------+
1 row in set (0.04 sec)
```
  • Loading branch information
Mryange authored Jan 13, 2025
1 parent 77aadf1 commit a64be25
Show file tree
Hide file tree
Showing 3 changed files with 258 additions and 2 deletions.
36 changes: 34 additions & 2 deletions be/src/pipeline/common/distinct_agg_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,45 @@
#include "vec/common/hash_table/ph_hash_map.h"
#include "vec/common/hash_table/ph_hash_set.h"
#include "vec/common/hash_table/string_hash_map.h"
#include "vec/core/types.h"

namespace doris {

// Type selector: maps a key type to the set implementation that DistinctData
// resolves to. The generic case is a PHHashSet hashed with HashCRC32.
template <typename Key>
struct DistinctHashSetType {
    using HashSet = PHHashSet<Key, HashCRC32<Key>>;
};

// vectorized::UInt8 keys are routed to SmallFixedSizeHashSet instead of PHHashSet.
template <>
struct DistinctHashSetType<vectorized::UInt8> {
    using HashSet = SmallFixedSizeHashSet<vectorized::UInt8>;
};

// vectorized::Int8 keys are routed to SmallFixedSizeHashSet instead of PHHashSet.
template <>
struct DistinctHashSetType<vectorized::Int8> {
    using HashSet = SmallFixedSizeHashSet<vectorized::Int8>;
};

// Type selector for the "phase 2" distinct set. The generic case is a PHHashSet
// that uses HashMixWrapper as its hasher (rather than HashCRC32).
template <typename Key>
struct DistinctPhase2HashSetType {
    using HashSet = PHHashSet<Key, HashMixWrapper<Key>>;
};

// vectorized::UInt8 keys are routed to SmallFixedSizeHashSet instead of PHHashSet.
template <>
struct DistinctPhase2HashSetType<vectorized::UInt8> {
    using HashSet = SmallFixedSizeHashSet<vectorized::UInt8>;
};

// vectorized::Int8 keys are routed to SmallFixedSizeHashSet instead of PHHashSet.
template <>
struct DistinctPhase2HashSetType<vectorized::Int8> {
    using HashSet = SmallFixedSizeHashSet<vectorized::Int8>;
};

// Set type used to collect distinct values of T. It is resolved through
// DistinctHashSetType<T>, so the UInt8/Int8 specializations yield
// SmallFixedSizeHashSet and every other T yields PHHashSet<T, HashCRC32<T>>.
// (The previous direct alias to PHHashSet<T, HashCRC32<T>> is superseded; an
// alias template can only be declared once.)
template <typename T>
using DistinctData = typename DistinctHashSetType<T>::HashSet;

// Set type for the "phase 2" distinct step. It is resolved through
// DistinctPhase2HashSetType<T>, so the UInt8/Int8 specializations yield
// SmallFixedSizeHashSet and every other T yields PHHashSet<T, HashMixWrapper<T>>.
// (The previous direct alias to PHHashSet<T, HashMixWrapper<T>> is superseded; an
// alias template can only be declared once.)
template <typename T>
using DistinctDataPhase2 = typename DistinctPhase2HashSetType<T>::HashSet;

// Distinct set keyed by StringRef. Only the key type is given here, so PHHashSet's
// remaining template arguments take their defaults.
using DistinctDataWithStringKey = PHHashSet<StringRef>;

Expand Down
101 changes: 101 additions & 0 deletions be/src/vec/common/hash_table/ph_hash_set.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#pragma once

#include <boost/core/noncopyable.hpp>
#include <cstdint>

#include "vec/common/hash_table/hash.h"
#include "vec/common/hash_table/phmap_fwd_decl.h"
Expand Down Expand Up @@ -110,3 +111,103 @@ class PHHashSet : private boost::noncopyable {
private:
HashSetImpl _hash_set;
};

/// Set for small fixed-size integral keys (sizeof(KeyType) <= 2, e.g. int8_t, int16_t).
///
/// Implemented as a direct-address table: one byte per possible key value, indexed by
/// the key converted to its unsigned counterpart. There is no hashing and no probing;
/// membership is a single array load.
///
/// The lookup-result out-parameter of lazy_emplace is accepted for signature
/// compatibility but is never written.
template <typename KeyType>
class SmallFixedSizeHashSet {
public:
    static_assert(std::is_integral_v<KeyType>);
    static_assert(sizeof(KeyType) <= 2);

    using key_type = KeyType;
    using mapped_type = void;
    using value_type = void;
    using Value = void*;
    using LookupResult = void*;

    /// Unsigned type of the same width as KeyType; used as the table index.
    using search_key_type = std::make_unsigned_t<KeyType>;
    using SetType = bool;
    static constexpr SetType set_flag = true;
    static constexpr SetType not_set_flag = false;
    /// Number of slots == number of distinct KeyType values (256 or 65536).
    static constexpr int hash_table_size = 1 << sizeof(KeyType) * 8;
    static_assert(sizeof(SetType) == 1);

    /// Creates an empty set with every slot marked as not set.
    SmallFixedSizeHashSet() { memset(_hash_table, not_set_flag, hash_table_size); }

    /// Creates an empty set. The capacity hint is ignored because the table size is
    /// fixed, but the table must still be cleared: delegating to the default
    /// constructor guarantees no slot is read uninitialized.
    SmallFixedSizeHashSet(size_t reserve_for_num_elements) : SmallFixedSizeHashSet() {}

    /// Move construction copies the table and size; `other` is left unchanged.
    SmallFixedSizeHashSet(SmallFixedSizeHashSet&& other) noexcept : _size(other._size) {
        memcpy(_hash_table, other._hash_table, hash_table_size);
    }

    /// Move assignment copies the table and size; `rhs` is left unchanged.
    SmallFixedSizeHashSet& operator=(SmallFixedSizeHashSet&& rhs) noexcept {
        // memcpy requires non-overlapping buffers, so self-assignment must be skipped.
        if (this != &rhs) {
            _size = rhs._size;
            memcpy(_hash_table, rhs._hash_table, hash_table_size);
        }
        return *this;
    }

    /// The "hash" of a key is simply its table index.
    size_t hash(const KeyType& x) const { return static_cast<search_key_type>(x); }

    /// If the key is absent, invokes f(ctor, key_holder); the supplied `ctor`, when
    /// called with a key, marks that key's slot and increments the size. If the key is
    /// already present nothing happens. `it` is not modified.
    template <typename KeyHolder, typename Func>
    void ALWAYS_INLINE lazy_emplace(KeyHolder&& key_holder, LookupResult& it, Func&& f) {
        DCHECK(static_cast<search_key_type>(key_holder) < hash_table_size);
        if (_hash_table[static_cast<search_key_type>(key_holder)] == not_set_flag) {
            auto ctor = [&](auto& key_value) {
                _size++;
                _hash_table[static_cast<search_key_type>(key_value)] = set_flag;
            };
            f(ctor, key_holder);
        }
    }

    /// Same as above for callers that pre-compute a hash. `hash_value` is ignored and
    /// the callback is invoked as f(ctor, key, key).
    template <typename KeyHolder, typename Func>
    void ALWAYS_INLINE lazy_emplace(KeyHolder&& key, LookupResult& it, size_t hash_value,
                                    Func&& f) {
        DCHECK(static_cast<search_key_type>(key) < hash_table_size);
        if (_hash_table[static_cast<search_key_type>(key)] == not_set_flag) {
            auto ctor = [&](auto& key_value) {
                _size++;
                _hash_table[static_cast<search_key_type>(key_value)] = set_flag;
            };
            f(ctor, key, key);
        }
    }

    /// No-op: the whole table is a small contiguous array, nothing to prefetch per key.
    template <bool read>
    void ALWAYS_INLINE prefetch(const KeyType& key, size_t hash_value) {}

    /// Calls func(i) for every occupied slot, in increasing slot order. `i` is the
    /// slot index as an int, i.e. the unsigned representation of the stored key (for
    /// signed KeyType, -1 is reported as hash_table_size - 1).
    template <typename Func>
    void for_each_mapped(Func&& func) {
        for (int i = 0; i < hash_table_size; i++) {
            if (_hash_table[i] == set_flag) {
                func(i);
            }
        }
    }

    /// One byte per slot, so bytes and cells are the same number.
    size_t get_buffer_size_in_bytes() const { return hash_table_size; }

    size_t get_buffer_size_in_cells() const { return hash_table_size; }

    /// The table never grows, so adding rows can never overflow it.
    bool add_elem_size_overflow(size_t row) const { return false; }

    /// Number of distinct keys inserted so far.
    size_t size() const { return _size; }

    /// This set has no null-key slot.
    template <typename MappedType>
    void* get_null_key_data() {
        return nullptr;
    }

    bool has_null_key_data() const { return false; }

    bool empty() const { return _size == 0; }

    // NOTE(review): this is a no-op and does NOT clear the stored keys or reset the
    // size; confirm callers do not rely on the set being empty afterwards.
    void clear_and_shrink() {}

    /// No-op: capacity is fixed at hash_table_size.
    void reserve(size_t num_elem) {}

private:
    size_t _size = 0;
    // Slot i holds set_flag iff the key whose unsigned value is i has been inserted.
    // Initialized by every constructor.
    uint8_t _hash_table[hash_table_size];
};
123 changes: 123 additions & 0 deletions be/test/vec/utils/small_size_hash_set_test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include <gtest/gtest-message.h>
#include <gtest/gtest-test-part.h>
#include <gtest/gtest.h>

#include <cstdint>
#include <ostream>
#include <random>
#include <sstream>
#include <vector>

#include "vec/common/hash_table/ph_hash_set.h"

namespace doris::vectorized {

// Differential test: feeds the same 20 random keys of type T into a
// SmallFixedSizeHashSet and a PHHashSet and checks that both report the same size
// after first insertion, after re-insertion (which must not invoke the constructor
// callback), and after both sets have been moved.
template <typename T>
void test_type_hash_map() {
    SmallFixedSizeHashSet<T> small_hash_set;
    PHHashSet<T, HashCRC32<T>> ph_hash_set;
    EXPECT_EQ(small_hash_set.size(), ph_hash_set.size());
    EXPECT_EQ(0, small_hash_set.size());
    EXPECT_EQ(0, ph_hash_set.size());

    void* lookup_result_get_mapped = nullptr;
    std::vector<T> input_data;
    std::random_device rd;
    std::mt19937 gen(rd());
    // std::uniform_int_distribution is only defined for short/int/long/long long and
    // their unsigned variants; instantiating it with int8_t/uint8_t is undefined
    // behaviour. Draw as int (wide enough for every 8- and 16-bit T used here) and
    // narrow afterwards.
    std::uniform_int_distribution<int> dis(std::numeric_limits<T>::min(),
                                           std::numeric_limits<T>::max());

    for (int i = 0; i < 20; ++i) {
        input_data.push_back(static_cast<T>(dis(gen)));
    }

    // First pass: insert every key (duplicates in input_data are expected).
    for (auto& data : input_data) {
        small_hash_set.lazy_emplace(data, lookup_result_get_mapped,
                                    [&](auto& ctor, auto& key_holder) { ctor(key_holder); });
        ph_hash_set.lazy_emplace(data, lookup_result_get_mapped,
                                 [&](auto& ctor, auto& key_holder) { ctor(key_holder); });
    }

    EXPECT_EQ(small_hash_set.size(), ph_hash_set.size());

    // Second pass: every key is already present, so the callback must never run.
    for (auto& data : input_data) {
        small_hash_set.lazy_emplace(data, lookup_result_get_mapped,
                                    [&](auto& ctor, auto& key_holder) {
                                        EXPECT_TRUE(false);
                                        ctor(key_holder);
                                    });
        ph_hash_set.lazy_emplace(data, lookup_result_get_mapped, [&](auto& ctor, auto& key_holder) {
            EXPECT_TRUE(false);
            ctor(key_holder);
        });
    }

    // Renders the random input so a failing run can be reproduced by hand.
    auto print_data = [&]() -> std::string {
        std::stringstream ss;
        for (int x : input_data) {
            ss << x << " ";
        }
        return ss.str();
    };
    EXPECT_EQ(small_hash_set.size(), ph_hash_set.size()) << print_data();

    auto move_small_hash_set = std::move(small_hash_set);
    auto move_ph_hash_set = std::move(ph_hash_set);

    // Third pass: the moved-to sets must still contain every key.
    for (auto& data : input_data) {
        move_small_hash_set.lazy_emplace(data, lookup_result_get_mapped,
                                         [&](auto& ctor, auto& key_holder) {
                                             EXPECT_TRUE(false);
                                             ctor(key_holder);
                                         });
        move_ph_hash_set.lazy_emplace(data, lookup_result_get_mapped,
                                      [&](auto& ctor, auto& key_holder) {
                                          EXPECT_TRUE(false);
                                          ctor(key_holder);
                                      });
    }

    EXPECT_EQ(move_small_hash_set.size(), move_ph_hash_set.size()) << print_data();
}

// Runs the randomized set comparison 100 times with int8_t keys.
TEST(SmallSizeHashSetTest, testint8) {
    constexpr int kRounds = 100;
    for (int round = 0; round != kRounds; ++round) {
        test_type_hash_map<int8_t>();
    }
}

// Runs the randomized set comparison 100 times with uint8_t keys.
TEST(SmallSizeHashSetTest, testuint8) {
    constexpr int kRounds = 100;
    for (int round = 0; round != kRounds; ++round) {
        test_type_hash_map<uint8_t>();
    }
}

// Runs the randomized set comparison 100 times with int16_t keys.
TEST(SmallSizeHashSetTest, testint16) {
    constexpr int kRounds = 100;
    for (int round = 0; round != kRounds; ++round) {
        test_type_hash_map<int16_t>();
    }
}

// Runs the randomized set comparison 100 times with uint16_t keys.
TEST(SmallSizeHashSetTest, testuint16) {
    constexpr int kRounds = 100;
    for (int round = 0; round != kRounds; ++round) {
        test_type_hash_map<uint16_t>();
    }
}
} // namespace doris::vectorized

0 comments on commit a64be25

Please sign in to comment.