diff --git a/be/src/olap/primary_key_index.cpp b/be/src/olap/primary_key_index.cpp index e416639cfb06cd..5f7bedb01fc8de 100644 --- a/be/src/olap/primary_key_index.cpp +++ b/be/src/olap/primary_key_index.cpp @@ -50,8 +50,8 @@ Status PrimaryKeyIndexBuilder::init() { auto opt = segment_v2::BloomFilterOptions(); opt.fpp = 0.01; - _bloom_filter_index_builder.reset( - new segment_v2::PrimaryKeyBloomFilterIndexWriterImpl(opt, type_info)); + RETURN_IF_ERROR(segment_v2::PrimaryKeyBloomFilterIndexWriterImpl::create( + opt, type_info, &_bloom_filter_index_builder)); return Status::OK(); } diff --git a/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp b/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp index 98669ccb141ae7..edc6102703f492 100644 --- a/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp +++ b/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp @@ -348,5 +348,22 @@ Status NGramBloomFilterIndexWriterImpl::create(const BloomFilterOptions& bf_opti return Status::OK(); } +Status PrimaryKeyBloomFilterIndexWriterImpl::create(const BloomFilterOptions& bf_options, + const TypeInfo* typeinfo, + std::unique_ptr* res) { + FieldType type = typeinfo->type(); + switch (type) { + case FieldType::OLAP_FIELD_TYPE_CHAR: + case FieldType::OLAP_FIELD_TYPE_VARCHAR: + case FieldType::OLAP_FIELD_TYPE_STRING: + *res = std::make_unique(bf_options, typeinfo); + break; + default: + return Status::NotSupported("unsupported type for primary key bloom filter index:{}", + std::to_string(int(type))); + } + return Status::OK(); +} + } // namespace segment_v2 } // namespace doris diff --git a/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.h b/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.h index 2cdf7171e3e276..a94982438f651a 100644 --- a/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.h +++ b/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.h @@ -85,6 +85,8 @@ class PrimaryKeyBloomFilterIndexWriterImpl : public BloomFilterIndexWriter { } }; + static Status create(const BloomFilterOptions& bf_options, const TypeInfo* typeinfo, + std::unique_ptr* res); // This method may allocate large memory for bf, will return error // when memory is exhaused to prevent oom. Status add_values(const void* values, size_t count) override; diff --git a/be/test/olap/rowset/segment_v2/bloom_filter_index_reader_writer_test.cpp b/be/test/olap/rowset/segment_v2/bloom_filter_index_reader_writer_test.cpp index 258dd9a5ff8b51..69cb343f04bf91 100644 --- a/be/test/olap/rowset/segment_v2/bloom_filter_index_reader_writer_test.cpp +++ b/be/test/olap/rowset/segment_v2/bloom_filter_index_reader_writer_test.cpp @@ -59,40 +59,46 @@ class BloomFilterIndexReaderWriterTest : public testing::Test { }; template -void write_bloom_filter_index_file(const std::string& file_name, const void* values, - size_t value_count, size_t null_count, - ColumnIndexMetaPB* index_meta) { +Status write_bloom_filter_index_file(const std::string& file_name, const void* values, + size_t value_count, size_t null_count, + ColumnIndexMetaPB* index_meta, + bool use_primary_key_bloom_filter = false) { const auto* type_info = get_scalar_type_info(); using CppType = typename CppTypeTraits::CppType; std::string fname = dname + "/" + file_name; auto fs = io::global_local_filesystem(); { io::FileWriterPtr file_writer; - Status st = fs->create_file(fname, &file_writer); - EXPECT_TRUE(st.ok()) << st.to_string(); + RETURN_IF_ERROR(fs->create_file(fname, &file_writer)); std::unique_ptr bloom_filter_index_writer; BloomFilterOptions bf_options; - static_cast( - BloomFilterIndexWriter::create(bf_options, type_info, &bloom_filter_index_writer)); + + if (use_primary_key_bloom_filter) { + RETURN_IF_ERROR(PrimaryKeyBloomFilterIndexWriterImpl::create( + bf_options, type_info, &bloom_filter_index_writer)); + } else { + RETURN_IF_ERROR(BloomFilterIndexWriter::create(bf_options, type_info, + &bloom_filter_index_writer)); + } + const CppType* vals = (const CppType*)values; for (int i = 0; i < value_count;) { size_t num = std::min(1024, (int)value_count - i); - static_cast(bloom_filter_index_writer->add_values(vals + i, num)); + RETURN_IF_ERROR(bloom_filter_index_writer->add_values(vals + i, num)); if (i == 2048) { // second page bloom_filter_index_writer->add_nulls(null_count); } - st = bloom_filter_index_writer->flush(); - EXPECT_TRUE(st.ok()); + RETURN_IF_ERROR(bloom_filter_index_writer->flush()); i += 1024; } - st = bloom_filter_index_writer->finish(file_writer.get(), index_meta); - EXPECT_TRUE(st.ok()) << "writer finish status:" << st.to_string(); + RETURN_IF_ERROR(bloom_filter_index_writer->finish(file_writer.get(), index_meta)); EXPECT_TRUE(file_writer->close().ok()); EXPECT_EQ(BLOOM_FILTER_INDEX, index_meta->type()); EXPECT_EQ(bf_options.strategy, index_meta->bloom_filter_index().hash_strategy()); } + return Status::OK(); } void get_bloom_filter_reader_iter(const std::string& file_name, const ColumnIndexMetaPB& meta, @@ -110,13 +116,14 @@ void get_bloom_filter_reader_iter(const std::string& file_name, const ColumnInde } template -void test_bloom_filter_index_reader_writer_template( +Status test_bloom_filter_index_reader_writer_template( const std::string file_name, typename TypeTraits::CppType* val, size_t num, size_t null_num, typename TypeTraits::CppType* not_exist_value, - bool is_slice_type = false) { + bool is_slice_type = false, bool use_primary_key_bloom_filter = false) { using CppType = typename TypeTraits::CppType; ColumnIndexMetaPB meta; - write_bloom_filter_index_file(file_name, val, num, null_num, &meta); + RETURN_IF_ERROR(write_bloom_filter_index_file(file_name, val, num, null_num, &meta, + use_primary_key_bloom_filter)); { BloomFilterIndexReader* reader = nullptr; std::unique_ptr iter; @@ -124,8 +131,7 @@ void test_bloom_filter_index_reader_writer_template( // page 0 std::unique_ptr bf; - auto st = iter->read_bloom_filter(0, &bf); - EXPECT_TRUE(st.ok()); + RETURN_IF_ERROR(iter->read_bloom_filter(0, &bf)); for (int i = 0; i < 1024; ++i) { if (is_slice_type) { Slice* value = (Slice*)(val + i); @@ -136,8 +142,7 @@ void test_bloom_filter_index_reader_writer_template( } // page 1 - st = iter->read_bloom_filter(1, &bf); - EXPECT_TRUE(st.ok()); + RETURN_IF_ERROR(iter->read_bloom_filter(1, &bf)); for (int i = 1024; i < 2048; ++i) { if (is_slice_type) { Slice* value = (Slice*)(val + i); @@ -148,8 +153,7 @@ void test_bloom_filter_index_reader_writer_template( } // page 2 - st = iter->read_bloom_filter(2, &bf); - EXPECT_TRUE(st.ok()); + RETURN_IF_ERROR(iter->read_bloom_filter(2, &bf)); for (int i = 2048; i < 3071; ++i) { if (is_slice_type) { Slice* value = (Slice*)(val + i); @@ -163,6 +167,7 @@ void test_bloom_filter_index_reader_writer_template( delete reader; } + return Status::OK(); } TEST_F(BloomFilterIndexReaderWriterTest, test_int) { @@ -175,8 +180,9 @@ TEST_F(BloomFilterIndexReaderWriterTest, test_int) { std::string file_name = "bloom_filter_int"; int not_exist_value = 18888; - test_bloom_filter_index_reader_writer_template( + auto st = test_bloom_filter_index_reader_writer_template( file_name, val, num, 1, ¬_exist_value); + EXPECT_TRUE(st.ok()); delete[] val; } @@ -190,8 +196,9 @@ TEST_F(BloomFilterIndexReaderWriterTest, test_bigint) { std::string file_name = "bloom_filter_bigint"; int64_t not_exist_value = 18888; - test_bloom_filter_index_reader_writer_template( + auto st = test_bloom_filter_index_reader_writer_template( file_name, val, num, 1, ¬_exist_value); + EXPECT_TRUE(st.ok()); delete[] val; } @@ -205,8 +212,9 @@ TEST_F(BloomFilterIndexReaderWriterTest, test_largeint) { std::string file_name = "bloom_filter_largeint"; int128_t not_exist_value = 18888; - test_bloom_filter_index_reader_writer_template( + auto st = test_bloom_filter_index_reader_writer_template( file_name, val, num, 1, ¬_exist_value); + EXPECT_TRUE(st.ok()); delete[] val; } @@ -224,8 +232,9 @@ TEST_F(BloomFilterIndexReaderWriterTest, test_varchar_type) { } std::string file_name = "bloom_filter_varchar"; Slice not_exist_value("value_not_exist"); - test_bloom_filter_index_reader_writer_template( + auto st = test_bloom_filter_index_reader_writer_template( file_name, slices, num, 1, ¬_exist_value, true); + EXPECT_TRUE(st.ok()); delete[] val; delete[] slices; } @@ -244,8 +253,9 @@ TEST_F(BloomFilterIndexReaderWriterTest, test_char) { } std::string file_name = "bloom_filter_char"; Slice not_exist_value("char_value_not_exist"); - test_bloom_filter_index_reader_writer_template( + auto st = test_bloom_filter_index_reader_writer_template( file_name, slices, num, 1, ¬_exist_value, true); + EXPECT_TRUE(st.ok()); delete[] val; delete[] slices; } @@ -260,8 +270,9 @@ TEST_F(BloomFilterIndexReaderWriterTest, test_date) { std::string file_name = "bloom_filter_date"; uint24_t not_exist_value = 18888; - test_bloom_filter_index_reader_writer_template( + auto st = test_bloom_filter_index_reader_writer_template( file_name, val, num, 1, ¬_exist_value); + EXPECT_TRUE(st.ok()); delete[] val; } @@ -275,8 +286,9 @@ TEST_F(BloomFilterIndexReaderWriterTest, test_datetime) { std::string file_name = "bloom_filter_datetime"; int64_t not_exist_value = 18888; - test_bloom_filter_index_reader_writer_template( + auto st = test_bloom_filter_index_reader_writer_template( file_name, val, num, 1, ¬_exist_value); + EXPECT_TRUE(st.ok()); delete[] val; } @@ -290,8 +302,45 @@ TEST_F(BloomFilterIndexReaderWriterTest, test_decimal) { std::string file_name = "bloom_filter_decimal"; decimal12_t not_exist_value = {666, 666}; - test_bloom_filter_index_reader_writer_template( + auto st = test_bloom_filter_index_reader_writer_template( file_name, val, num, 1, ¬_exist_value); + EXPECT_TRUE(st.ok()); + delete[] val; +} + +TEST_F(BloomFilterIndexReaderWriterTest, test_primary_key_bloom_filter_index) { + size_t num = 1024 * 3 - 1; + std::vector val_strings(num); + for (size_t i = 0; i < num; ++i) { + val_strings[i] = "primary_key_" + std::to_string(i); + } + std::vector slices(num); + for (size_t i = 0; i < num; ++i) { + slices[i] = Slice(val_strings[i]); + } + + std::string file_name = "primary_key_bloom_filter_index"; + Slice not_exist_value("primary_key_not_exist"); + + auto st = test_bloom_filter_index_reader_writer_template( + file_name, slices.data(), num, 0, ¬_exist_value, true, true); + EXPECT_TRUE(st.ok()); +} + +TEST_F(BloomFilterIndexReaderWriterTest, test_primary_key_bloom_filter_index_int) { + size_t num = 1024 * 3 - 1; + int* val = new int[num]; + for (int i = 0; i < num; ++i) { + // there will be 3 bloom filter pages + val[i] = 10000 + i + 1; + } + + std::string file_name = "primary_key_bloom_filter_index_int"; + int not_exist_value = 18888; + auto st = test_bloom_filter_index_reader_writer_template( + file_name, val, num, 1, ¬_exist_value, false, true); + EXPECT_FALSE(st.ok()); + EXPECT_EQ(st.code(), TStatusCode::NOT_IMPLEMENTED_ERROR); delete[] val; }