From 30ebe421dddbe96fb4d3805c219abfd736a583b1 Mon Sep 17 00:00:00 2001 From: airborne12 Date: Thu, 26 Dec 2024 19:42:25 +0800 Subject: [PATCH] [opt](bloomfilter index) optimize memory usage for bloom filter index writer (#45833) ### What problem does this PR solve? Issue Number: close #xxx Related PR: #xxx Problem Summary: Optimize memory usage when adding string values to the bloom filter index. By storing a uint64 hash of each string value instead of the string itself, the writer is expected to save a lot of memory, especially for long text. --- be/src/olap/rowset/segment_v2/bloom_filter.h | 10 ++++++ .../segment_v2/bloom_filter_index_writer.cpp | 27 +++++++++------- .../bloom_filter_index_reader_writer_test.cpp | 31 ++++++++++++++++++- 3 files changed, 56 insertions(+), 12 deletions(-) diff --git a/be/src/olap/rowset/segment_v2/bloom_filter.h b/be/src/olap/rowset/segment_v2/bloom_filter.h index 4f4adf0fd12283..2ef050257e16ab 100644 --- a/be/src/olap/rowset/segment_v2/bloom_filter.h +++ b/be/src/olap/rowset/segment_v2/bloom_filter.h @@ -167,6 +167,16 @@ class BloomFilter { return hash_code; } + static Result hash(const char* buf, uint32_t size, HashStrategyPB strategy) { + if (strategy == HASH_MURMUR3_X64_64) { + uint64_t hash_code; + murmur_hash3_x64_64(buf, size, DEFAULT_SEED, &hash_code); + return hash_code; + } else { + return Status::InvalidArgument("invalid strategy:{}", strategy); + } + } + virtual void add_bytes(const char* buf, uint32_t size) { if (buf == nullptr) { *_has_null = true; diff --git a/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp b/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp index 3f9fb94df0a844..0326512c3d76ca 100644 --- a/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp +++ b/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp @@ -78,9 +78,10 @@ class BloomFilterIndexWriterImpl : public BloomFilterIndexWriter { for (int i = 0; i < count; ++i) { if (_values.find(*v) == _values.end()) { if constexpr (_is_slice_type()) { - 
CppType new_value; - RETURN_IF_CATCH_EXCEPTION(_type_info->deep_copy(&new_value, v, &_arena)); - _values.insert(new_value); + const auto* s = reinterpret_cast(v); + auto hash = + DORIS_TRY(BloomFilter::hash(s->data, s->size, _bf_options.strategy)); + _hash_values.insert(hash); } else if constexpr (_is_int128()) { int128_t new_value; memcpy(&new_value, v, sizeof(PackedInt128)); @@ -99,25 +100,28 @@ class BloomFilterIndexWriterImpl : public BloomFilterIndexWriter { Status flush() override { std::unique_ptr bf; RETURN_IF_ERROR(BloomFilter::create(BLOCK_BLOOM_FILTER, &bf)); - RETURN_IF_ERROR(bf->init(_values.size(), _bf_options.fpp, _bf_options.strategy)); - bf->set_has_null(_has_null); - for (auto& v : _values) { - if constexpr (_is_slice_type()) { - auto* s = (Slice*)&v; - bf->add_bytes(s->data, s->size); - } else { + if constexpr (_is_slice_type()) { + RETURN_IF_ERROR(bf->init(_hash_values.size(), _bf_options.fpp, _bf_options.strategy)); + for (const auto& h : _hash_values) { + bf->add_hash(h); + } + } else { + RETURN_IF_ERROR(bf->init(_values.size(), _bf_options.fpp, _bf_options.strategy)); + for (auto& v : _values) { bf->add_bytes((char*)&v, sizeof(CppType)); } } + bf->set_has_null(_has_null); _bf_buffer_size += bf->size(); _bfs.push_back(std::move(bf)); _values.clear(); + _hash_values.clear(); _has_null = false; return Status::OK(); } Status finish(io::FileWriter* file_writer, ColumnIndexMetaPB* index_meta) override { - if (_values.size() > 0) { + if (_values.size() > 0 || !_hash_values.empty()) { RETURN_IF_ERROR(flush()); } index_meta->set_type(BLOOM_FILTER_INDEX); @@ -166,6 +170,7 @@ class BloomFilterIndexWriterImpl : public BloomFilterIndexWriter { // distinct values ValueDict _values; std::vector> _bfs; + std::set _hash_values; }; } // namespace diff --git a/be/test/olap/rowset/segment_v2/bloom_filter_index_reader_writer_test.cpp b/be/test/olap/rowset/segment_v2/bloom_filter_index_reader_writer_test.cpp index e561f8ce944887..23256ee5a6e102 100644 --- 
a/be/test/olap/rowset/segment_v2/bloom_filter_index_reader_writer_test.cpp +++ b/be/test/olap/rowset/segment_v2/bloom_filter_index_reader_writer_test.cpp @@ -180,7 +180,12 @@ Status test_bloom_filter_index_reader_writer_template( } // test nullptr EXPECT_TRUE(bf->test_bytes(nullptr, 1)); - + if (is_slice_type) { + Slice* value = (Slice*)(not_exist_value); + EXPECT_FALSE(bf->test_bytes(value->data, value->size)); + } else { + EXPECT_FALSE(bf->test_bytes((char*)not_exist_value, sizeof(CppType))); + } delete reader; } return Status::OK(); @@ -803,5 +808,29 @@ TEST_F(BloomFilterIndexReaderWriterTest, test_bloom_filter_fpp_multiple) { test_bloom_filter_fpp(fpp); } } + +TEST_F(BloomFilterIndexReaderWriterTest, test_slice_memory_usage) { + size_t num = 1024 * 3; + const size_t slice_size = 256; + + std::vector data_buffer; + data_buffer.resize(num * slice_size); + + std::vector slice_vals(num); + for (size_t i = 0; i < num; ++i) { + char* ptr = data_buffer.data() + i * slice_size; + memset(ptr, 'a' + (i % 26), slice_size); + + slice_vals[i].data = ptr; + slice_vals[i].size = slice_size; + } + + std::string not_exist_str = "not_exist_val"; + Slice not_exist_value(not_exist_str); + + auto st = test_bloom_filter_index_reader_writer_template( + "bloom_filter_large_slices", slice_vals.data(), num, 1, &not_exist_value, true, false); + EXPECT_TRUE(st.ok()); +} } // namespace segment_v2 } // namespace doris