From 1ebcfc02e114dcbc9dfe4a0547c3e9544ca0f17a Mon Sep 17 00:00:00 2001 From: jacktengg Date: Fri, 8 Nov 2024 18:34:33 +0800 Subject: [PATCH] [fix](string64) fix coredump caused by ColumnArray>::insert_indices_from --- be/src/vec/columns/column_array.cpp | 3 +- be/src/vec/columns/column_string.cpp | 56 ++++++++++++++----------- be/test/vec/core/column_array_test.cpp | 47 +++++++++++++++++++++ be/test/vec/core/column_string_test.cpp | 38 +++++++++++++++++ 4 files changed, 118 insertions(+), 26 deletions(-) diff --git a/be/src/vec/columns/column_array.cpp b/be/src/vec/columns/column_array.cpp index ca1f49a67f067f6..e66e016381e83b4 100644 --- a/be/src/vec/columns/column_array.cpp +++ b/be/src/vec/columns/column_array.cpp @@ -389,7 +389,8 @@ void ColumnArray::insert_from(const IColumn& src_, size_t n) { if (!get_data().is_nullable() && src.get_data().is_nullable()) { // Note: we can't process the case of 'Array(Nullable(nest))' - DCHECK(false); + throw Exception(ErrorCode::INTERNAL_ERROR, "insert '{}' into '{}'", src.get_name(), + get_name()); } else if (get_data().is_nullable() && !src.get_data().is_nullable()) { // Note: here we should process the case of 'Array(NotNullable(nest))' reinterpret_cast(&get_data()) diff --git a/be/src/vec/columns/column_string.cpp b/be/src/vec/columns/column_string.cpp index 40d74b18ed45093..6eb50da8bbc4de8 100644 --- a/be/src/vec/columns/column_string.cpp +++ b/be/src/vec/columns/column_string.cpp @@ -22,7 +22,6 @@ #include #include -#include #include "util/memcpy_inlined.h" #include "util/simd/bits.h" @@ -134,34 +133,41 @@ void ColumnStr::insert_range_from(const IColumn& src, size_t start, size_t le if (length == 0) { return; } + auto do_insert = [&](const auto& src_concrete) { + const auto& src_offsets = src_concrete.get_offsets(); + const auto& src_chars = src_concrete.get_chars(); + if (start + length > src_offsets.size()) { + throw doris::Exception( + doris::ErrorCode::INTERNAL_ERROR, + "Parameter out of bound in IColumnStr::insert_range_from method."); + } + size_t nested_offset = src_offsets[static_cast(start) - 1]; + size_t nested_length = src_offsets[start + length - 1] - nested_offset; - const auto& src_concrete = assert_cast&>(src); - - if (start + length > src_concrete.offsets.size()) { - throw doris::Exception( - doris::ErrorCode::INTERNAL_ERROR, - "Parameter out of bound in IColumnStr::insert_range_from method."); - } - - size_t nested_offset = src_concrete.offset_at(start); - size_t nested_length = src_concrete.offsets[start + length - 1] - nested_offset; - - size_t old_chars_size = chars.size(); - check_chars_length(old_chars_size + nested_length, offsets.size() + length); - chars.resize(old_chars_size + nested_length); - memcpy(&chars[old_chars_size], &src_concrete.chars[nested_offset], nested_length); + size_t old_chars_size = chars.size(); + check_chars_length(old_chars_size + nested_length, offsets.size() + length); + chars.resize(old_chars_size + nested_length); + memcpy(&chars[old_chars_size], &src_chars[nested_offset], nested_length); - if (start == 0 && offsets.empty()) { - offsets.assign(src_concrete.offsets.begin(), src_concrete.offsets.begin() + length); - } else { - size_t old_size = offsets.size(); - size_t prev_max_offset = offsets.back(); /// -1th index is Ok, see PaddedPODArray - offsets.resize(old_size + length); + using OffsetsType = std::decay_t; + if (std::is_same_v && start == 0 && offsets.empty()) { + offsets.assign(src_offsets.begin(), src_offsets.begin() + length); + } else { + size_t old_size = offsets.size(); + size_t prev_max_offset = offsets.back(); /// -1th index is Ok, see PaddedPODArray + offsets.resize(old_size + length); - for (size_t i = 0; i < length; ++i) { - offsets[old_size + i] = - src_concrete.offsets[start + i] - nested_offset + prev_max_offset; + for (size_t i = 0; i < length; ++i) { + offsets[old_size + i] = src_offsets[start + i] - nested_offset + prev_max_offset; + } } + }; + // insert_range_from maybe called by ColumnArray::insert_indices_from(which is used by hash join operator), + // so we need to support both ColumnStr and ColumnStr + if (src.is_column_string64()) { + do_insert(assert_cast&>(src)); + } else { + do_insert(assert_cast&>(src)); } } diff --git a/be/test/vec/core/column_array_test.cpp b/be/test/vec/core/column_array_test.cpp index c371dda25e85d17..0c8a486b5bf06c6 100644 --- a/be/test/vec/core/column_array_test.cpp +++ b/be/test/vec/core/column_array_test.cpp @@ -108,6 +108,53 @@ TEST(ColumnArrayTest, StringArrayTest) { } } +TEST(ColumnArrayTest, String64ArrayTest) { + auto off_column = ColumnVector::create(); + auto str64_column = ColumnString64::create(); + // init column array with [["abc","d"],["ef"],[], [""]]; + std::vector offs = {0, 2, 3, 3, 4}; + std::vector vals = {"abc", "d", "ef", ""}; + for (size_t i = 1; i < offs.size(); ++i) { + off_column->insert_data((const char*)(&offs[i]), 0); + } + for (auto& v : vals) { + str64_column->insert_data(v.data(), v.size()); + } + + ColumnArray str64_array_column(std::move(str64_column), std::move(off_column)); + EXPECT_EQ(str64_array_column.size(), offs.size() - 1); + for (size_t i = 0; i < str64_array_column.size(); ++i) { + auto v = get(str64_array_column[i]); + EXPECT_EQ(v.size(), offs[i + 1] - offs[i]); + for (size_t j = 0; j < v.size(); ++j) { + EXPECT_EQ(vals[offs[i] + j], get(v[j])); + } + } + // test insert ColumnArray> into ColumnArray> + auto str32_column = ColumnString::create(); + ColumnArray str32_array_column(std::move(str32_column)); + std::vector indices; + indices.push_back(0); + indices.push_back(1); + indices.push_back(3); + str32_array_column.insert_indices_from(str64_array_column, indices.data(), + indices.data() + indices.size()); + EXPECT_EQ(str32_array_column.size(), 3); + + auto v = get(str32_array_column[0]); + EXPECT_EQ(v.size(), 2); + EXPECT_EQ(get(v[0]), vals[0]); + EXPECT_EQ(get(v[1]), vals[1]); + + v = get(str32_array_column[1]); + EXPECT_EQ(v.size(), 1); + EXPECT_EQ(get(v[0]), vals[2]); + + v = get(str32_array_column[2]); + EXPECT_EQ(v.size(), 1); + EXPECT_EQ(get(v[0]), vals[3]); +} + TEST(ColumnArrayTest, IntArrayPermuteTest) { auto off_column = ColumnVector::create(); auto data_column = ColumnVector::create(); diff --git a/be/test/vec/core/column_string_test.cpp b/be/test/vec/core/column_string_test.cpp index a1967a30ce72247..715b7d0e7f92a0c 100644 --- a/be/test/vec/core/column_string_test.cpp +++ b/be/test/vec/core/column_string_test.cpp @@ -19,6 +19,7 @@ #include +#include "vec/columns/column.h" #include "vec/core/block.h" #include "vec/data_types/data_type_string.h" #include "vec/functions/function_string.h" @@ -64,4 +65,41 @@ TEST(ColumnStringTest, TestConcat) { auto actual_res_col_str = assert_cast(actual_res_col.get()); actual_res_col_str->sanity_check(); } + +TEST(ColumnStringTest, TestStringInsert) { + { + auto str32_column = ColumnString::create(); + std::vector vals_tmp = {"x", "yy", "zzz", ""}; + auto str32_column_tmp = ColumnString::create(); + for (auto& v : vals_tmp) { + str32_column_tmp->insert_data(v.data(), v.size()); + } + str32_column->insert_range_from(*str32_column_tmp, 0, vals_tmp.size()); + str32_column->insert_range_from(*str32_column_tmp, 0, vals_tmp.size()); + auto row_count = str32_column->size(); + EXPECT_EQ(row_count, vals_tmp.size() * 2); + for (size_t i = 0; i < row_count; ++i) { + auto row_data = str32_column->get_data_at(i); + EXPECT_EQ(row_data.to_string(), vals_tmp[i % vals_tmp.size()]); + } + } + + { + // test insert ColumnString64 to ColumnString + auto str32_column = ColumnString::create(); + std::vector vals_tmp = {"x", "yy", "zzz", ""}; + auto str64_column_tmp = ColumnString64::create(); + for (auto& v : vals_tmp) { + str64_column_tmp->insert_data(v.data(), v.size()); + } + str32_column->insert_range_from(*str64_column_tmp, 0, vals_tmp.size()); + str32_column->insert_range_from(*str64_column_tmp, 0, vals_tmp.size()); + auto row_count = str32_column->size(); + EXPECT_EQ(row_count, vals_tmp.size() * 2); + for (size_t i = 0; i < row_count; ++i) { + auto row_data = str32_column->get_data_at(i); + EXPECT_EQ(row_data.to_string(), vals_tmp[i % vals_tmp.size()]); + } + } +} } // namespace doris::vectorized \ No newline at end of file