From fc549eca51d5dc16dc254e373a438c0971af14f5 Mon Sep 17 00:00:00 2001 From: airborne12 Date: Thu, 21 Nov 2024 21:34:44 +0800 Subject: [PATCH] [opt](primary key bf) enhance primary key bloomfilter by fixed slice type (#44397) Problem Summary: Currently, the primary key Bloom filter index can be created with any data type. However, when adding values, it only supports slice values. This inconsistency may lead to potential misuse or future issues. --- be/src/olap/primary_key_index.cpp | 4 +- .../segment_v2/bloom_filter_index_writer.cpp | 17 +++ .../segment_v2/bloom_filter_index_writer.h | 2 + .../bloom_filter_index_reader_writer_test.cpp | 107 +++++++++++++----- 4 files changed, 99 insertions(+), 31 deletions(-) diff --git a/be/src/olap/primary_key_index.cpp b/be/src/olap/primary_key_index.cpp index e416639cfb06cd..5f7bedb01fc8de 100644 --- a/be/src/olap/primary_key_index.cpp +++ b/be/src/olap/primary_key_index.cpp @@ -50,8 +50,8 @@ Status PrimaryKeyIndexBuilder::init() { auto opt = segment_v2::BloomFilterOptions(); opt.fpp = 0.01; - _bloom_filter_index_builder.reset( - new segment_v2::PrimaryKeyBloomFilterIndexWriterImpl(opt, type_info)); + RETURN_IF_ERROR(segment_v2::PrimaryKeyBloomFilterIndexWriterImpl::create( + opt, type_info, &_bloom_filter_index_builder)); return Status::OK(); } diff --git a/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp b/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp index 98669ccb141ae7..edc6102703f492 100644 --- a/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp +++ b/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp @@ -348,5 +348,22 @@ Status NGramBloomFilterIndexWriterImpl::create(const BloomFilterOptions& bf_opti return Status::OK(); } +Status PrimaryKeyBloomFilterIndexWriterImpl::create(const BloomFilterOptions& bf_options, + const TypeInfo* typeinfo, + std::unique_ptr* res) { + FieldType type = typeinfo->type(); + switch (type) { + case FieldType::OLAP_FIELD_TYPE_CHAR: + case FieldType::OLAP_FIELD_TYPE_VARCHAR: + case FieldType::OLAP_FIELD_TYPE_STRING: + *res = std::make_unique(bf_options, typeinfo); + break; + default: + return Status::NotSupported("unsupported type for primary key bloom filter index:{}", + std::to_string(int(type))); + } + return Status::OK(); +} + } // namespace segment_v2 } // namespace doris diff --git a/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.h b/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.h index 2cdf7171e3e276..a94982438f651a 100644 --- a/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.h +++ b/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.h @@ -85,6 +85,8 @@ class PrimaryKeyBloomFilterIndexWriterImpl : public BloomFilterIndexWriter { } }; + static Status create(const BloomFilterOptions& bf_options, const TypeInfo* typeinfo, + std::unique_ptr* res); // This method may allocate large memory for bf, will return error // when memory is exhaused to prevent oom. Status add_values(const void* values, size_t count) override; diff --git a/be/test/olap/rowset/segment_v2/bloom_filter_index_reader_writer_test.cpp b/be/test/olap/rowset/segment_v2/bloom_filter_index_reader_writer_test.cpp index 258dd9a5ff8b51..69cb343f04bf91 100644 --- a/be/test/olap/rowset/segment_v2/bloom_filter_index_reader_writer_test.cpp +++ b/be/test/olap/rowset/segment_v2/bloom_filter_index_reader_writer_test.cpp @@ -59,40 +59,46 @@ class BloomFilterIndexReaderWriterTest : public testing::Test { }; template -void write_bloom_filter_index_file(const std::string& file_name, const void* values, - size_t value_count, size_t null_count, - ColumnIndexMetaPB* index_meta) { +Status write_bloom_filter_index_file(const std::string& file_name, const void* values, + size_t value_count, size_t null_count, + ColumnIndexMetaPB* index_meta, + bool use_primary_key_bloom_filter = false) { const auto* type_info = get_scalar_type_info(); using CppType = typename CppTypeTraits::CppType; std::string fname = dname + "/" + file_name; auto fs = io::global_local_filesystem(); { io::FileWriterPtr file_writer; - Status st = fs->create_file(fname, &file_writer); - EXPECT_TRUE(st.ok()) << st.to_string(); + RETURN_IF_ERROR(fs->create_file(fname, &file_writer)); std::unique_ptr bloom_filter_index_writer; BloomFilterOptions bf_options; - static_cast( - BloomFilterIndexWriter::create(bf_options, type_info, &bloom_filter_index_writer)); + + if (use_primary_key_bloom_filter) { + RETURN_IF_ERROR(PrimaryKeyBloomFilterIndexWriterImpl::create( + bf_options, type_info, &bloom_filter_index_writer)); + } else { + RETURN_IF_ERROR(BloomFilterIndexWriter::create(bf_options, type_info, + &bloom_filter_index_writer)); + } + const CppType* vals = (const CppType*)values; for (int i = 0; i < value_count;) { size_t num = std::min(1024, (int)value_count - i); - static_cast(bloom_filter_index_writer->add_values(vals + i, num)); + RETURN_IF_ERROR(bloom_filter_index_writer->add_values(vals + i, num)); if (i == 2048) { // second page bloom_filter_index_writer->add_nulls(null_count); } - st = bloom_filter_index_writer->flush(); - EXPECT_TRUE(st.ok()); + RETURN_IF_ERROR(bloom_filter_index_writer->flush()); i += 1024; } - st = bloom_filter_index_writer->finish(file_writer.get(), index_meta); - EXPECT_TRUE(st.ok()) << "writer finish status:" << st.to_string(); + RETURN_IF_ERROR(bloom_filter_index_writer->finish(file_writer.get(), index_meta)); EXPECT_TRUE(file_writer->close().ok()); EXPECT_EQ(BLOOM_FILTER_INDEX, index_meta->type()); EXPECT_EQ(bf_options.strategy, index_meta->bloom_filter_index().hash_strategy()); } + return Status::OK(); } void get_bloom_filter_reader_iter(const std::string& file_name, const ColumnIndexMetaPB& meta, @@ -110,13 +116,14 @@ void get_bloom_filter_reader_iter(const std::string& file_name, const ColumnInde } template -void test_bloom_filter_index_reader_writer_template( +Status test_bloom_filter_index_reader_writer_template( const std::string file_name, typename TypeTraits::CppType* val, size_t num, size_t null_num, typename TypeTraits::CppType* not_exist_value, - bool is_slice_type = false) { + bool is_slice_type = false, bool use_primary_key_bloom_filter = false) { using CppType = typename TypeTraits::CppType; ColumnIndexMetaPB meta; - write_bloom_filter_index_file(file_name, val, num, null_num, &meta); + RETURN_IF_ERROR(write_bloom_filter_index_file(file_name, val, num, null_num, &meta, + use_primary_key_bloom_filter)); { BloomFilterIndexReader* reader = nullptr; std::unique_ptr iter; @@ -124,8 +131,7 @@ void test_bloom_filter_index_reader_writer_template( // page 0 std::unique_ptr bf; - auto st = iter->read_bloom_filter(0, &bf); - EXPECT_TRUE(st.ok()); + RETURN_IF_ERROR(iter->read_bloom_filter(0, &bf)); for (int i = 0; i < 1024; ++i) { if (is_slice_type) { Slice* value = (Slice*)(val + i); @@ -136,8 +142,7 @@ void test_bloom_filter_index_reader_writer_template( } // page 1 - st = iter->read_bloom_filter(1, &bf); - EXPECT_TRUE(st.ok()); + RETURN_IF_ERROR(iter->read_bloom_filter(1, &bf)); for (int i = 1024; i < 2048; ++i) { if (is_slice_type) { Slice* value = (Slice*)(val + i); @@ -148,8 +153,7 @@ void test_bloom_filter_index_reader_writer_template( } // page 2 - st = iter->read_bloom_filter(2, &bf); - EXPECT_TRUE(st.ok()); + RETURN_IF_ERROR(iter->read_bloom_filter(2, &bf)); for (int i = 2048; i < 3071; ++i) { if (is_slice_type) { Slice* value = (Slice*)(val + i); @@ -163,6 +167,7 @@ void test_bloom_filter_index_reader_writer_template( delete reader; } + return Status::OK(); } TEST_F(BloomFilterIndexReaderWriterTest, test_int) { @@ -175,8 +180,9 @@ TEST_F(BloomFilterIndexReaderWriterTest, test_int) { std::string file_name = "bloom_filter_int"; int not_exist_value = 18888; - test_bloom_filter_index_reader_writer_template( + auto st = test_bloom_filter_index_reader_writer_template( file_name, val, num, 1, ¬_exist_value); + EXPECT_TRUE(st.ok()); delete[] val; } @@ -190,8 +196,9 @@ TEST_F(BloomFilterIndexReaderWriterTest, test_bigint) { std::string file_name = "bloom_filter_bigint"; int64_t not_exist_value = 18888; - test_bloom_filter_index_reader_writer_template( + auto st = test_bloom_filter_index_reader_writer_template( file_name, val, num, 1, ¬_exist_value); + EXPECT_TRUE(st.ok()); delete[] val; } @@ -205,8 +212,9 @@ TEST_F(BloomFilterIndexReaderWriterTest, test_largeint) { std::string file_name = "bloom_filter_largeint"; int128_t not_exist_value = 18888; - test_bloom_filter_index_reader_writer_template( + auto st = test_bloom_filter_index_reader_writer_template( file_name, val, num, 1, ¬_exist_value); + EXPECT_TRUE(st.ok()); delete[] val; } @@ -224,8 +232,9 @@ TEST_F(BloomFilterIndexReaderWriterTest, test_varchar_type) { } std::string file_name = "bloom_filter_varchar"; Slice not_exist_value("value_not_exist"); - test_bloom_filter_index_reader_writer_template( + auto st = test_bloom_filter_index_reader_writer_template( file_name, slices, num, 1, ¬_exist_value, true); + EXPECT_TRUE(st.ok()); delete[] val; delete[] slices; } @@ -244,8 +253,9 @@ TEST_F(BloomFilterIndexReaderWriterTest, test_char) { } std::string file_name = "bloom_filter_char"; Slice not_exist_value("char_value_not_exist"); - test_bloom_filter_index_reader_writer_template( + auto st = test_bloom_filter_index_reader_writer_template( file_name, slices, num, 1, ¬_exist_value, true); + EXPECT_TRUE(st.ok()); delete[] val; delete[] slices; } @@ -260,8 +270,9 @@ TEST_F(BloomFilterIndexReaderWriterTest, test_date) { std::string file_name = "bloom_filter_date"; uint24_t not_exist_value = 18888; - test_bloom_filter_index_reader_writer_template( + auto st = test_bloom_filter_index_reader_writer_template( file_name, val, num, 1, ¬_exist_value); + EXPECT_TRUE(st.ok()); delete[] val; } @@ -275,8 +286,9 @@ TEST_F(BloomFilterIndexReaderWriterTest, test_datetime) { std::string file_name = "bloom_filter_datetime"; int64_t not_exist_value = 18888; - test_bloom_filter_index_reader_writer_template( + auto st = test_bloom_filter_index_reader_writer_template( file_name, val, num, 1, ¬_exist_value); + EXPECT_TRUE(st.ok()); delete[] val; } @@ -290,8 +302,45 @@ TEST_F(BloomFilterIndexReaderWriterTest, test_decimal) { std::string file_name = "bloom_filter_decimal"; decimal12_t not_exist_value = {666, 666}; - test_bloom_filter_index_reader_writer_template( + auto st = test_bloom_filter_index_reader_writer_template( file_name, val, num, 1, ¬_exist_value); + EXPECT_TRUE(st.ok()); + delete[] val; +} + +TEST_F(BloomFilterIndexReaderWriterTest, test_primary_key_bloom_filter_index) { + size_t num = 1024 * 3 - 1; + std::vector val_strings(num); + for (size_t i = 0; i < num; ++i) { + val_strings[i] = "primary_key_" + std::to_string(i); + } + std::vector slices(num); + for (size_t i = 0; i < num; ++i) { + slices[i] = Slice(val_strings[i]); + } + + std::string file_name = "primary_key_bloom_filter_index"; + Slice not_exist_value("primary_key_not_exist"); + + auto st = test_bloom_filter_index_reader_writer_template( + file_name, slices.data(), num, 0, ¬_exist_value, true, true); + EXPECT_TRUE(st.ok()); +} + +TEST_F(BloomFilterIndexReaderWriterTest, test_primary_key_bloom_filter_index_int) { + size_t num = 1024 * 3 - 1; + int* val = new int[num]; + for (int i = 0; i < num; ++i) { + // there will be 3 bloom filter pages + val[i] = 10000 + i + 1; + } + + std::string file_name = "primary_key_bloom_filter_index_int"; + int not_exist_value = 18888; + auto st = test_bloom_filter_index_reader_writer_template( + file_name, val, num, 1, ¬_exist_value, false, true); + EXPECT_FALSE(st.ok()); + EXPECT_EQ(st.code(), TStatusCode::NOT_IMPLEMENTED_ERROR); delete[] val; }