From d4712aed1a9c74494b6cf1319e4f60c590105232 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 13 Nov 2024 19:31:11 +0800 Subject: [PATCH] branch-2.1: [fix](string64) fix coredump caused by ColumnArray>::insert_indices_from (#43862) Cherry-picked from #43624 Co-authored-by: TengJianPing --- be/src/vec/columns/column_array.cpp | 3 +- be/src/vec/columns/column_string.cpp | 65 ++++++---- be/test/vec/core/column_array_test.cpp | 47 +++++++ be/test/vec/core/column_map_test.cpp | 155 ++++++++++++++++++++++++ be/test/vec/core/column_string_test.cpp | 38 ++++++ be/test/vec/core/column_struct_test.cpp | 79 ++++++++++++ 6 files changed, 361 insertions(+), 26 deletions(-) create mode 100644 be/test/vec/core/column_map_test.cpp create mode 100644 be/test/vec/core/column_struct_test.cpp diff --git a/be/src/vec/columns/column_array.cpp b/be/src/vec/columns/column_array.cpp index 110c7f492b1bd8..c735e4413ff183 100644 --- a/be/src/vec/columns/column_array.cpp +++ b/be/src/vec/columns/column_array.cpp @@ -431,7 +431,8 @@ void ColumnArray::insert_from(const IColumn& src_, size_t n) { if (!get_data().is_nullable() && src.get_data().is_nullable()) { // Note: we can't process the case of 'Array(Nullable(nest))' - DCHECK(false); + throw Exception(ErrorCode::INTERNAL_ERROR, "insert '{}' into '{}'", src.get_name(), + get_name()); } else if (get_data().is_nullable() && !src.get_data().is_nullable()) { // Note: here we should process the case of 'Array(NotNullable(nest))' reinterpret_cast(&get_data()) diff --git a/be/src/vec/columns/column_string.cpp b/be/src/vec/columns/column_string.cpp index 3799bab34f6ebd..bf3e2b5c753a58 100644 --- a/be/src/vec/columns/column_string.cpp +++ b/be/src/vec/columns/column_string.cpp @@ -22,7 +22,6 @@ #include #include -#include #include "util/simd/bits.h" #include "vec/columns/columns_common.h" @@ -93,6 +92,13 @@ MutableColumnPtr ColumnStr::get_shrinked_column() { return shrinked_column; } +// This method is only called by MutableBlock::merge_ignore_overflow +// by hash join operator to collect build data to avoid +// the total string length of a ColumnStr column exceeds the 4G limit. +// +// After finishing collecting build data, a ColumnStr column +// will be converted to ColumnStr if the total string length +// exceeds the 4G limit by calling Block::replace_if_overflow. template void ColumnStr::insert_range_from_ignore_overflow(const doris::vectorized::IColumn& src, size_t start, size_t length) { @@ -122,6 +128,8 @@ void ColumnStr::insert_range_from_ignore_overflow(const doris::vectorized::IC offsets.resize(old_size + length); for (size_t i = 0; i < length; ++i) { + // unsinged integer overflow is well defined in C++, + // so we don't need to check the overflow here. offsets[old_size + i] = src_concrete.offsets[start + i] - nested_offset + prev_max_offset; } @@ -133,34 +141,41 @@ void ColumnStr::insert_range_from(const IColumn& src, size_t start, size_t le if (length == 0) { return; } + auto do_insert = [&](const auto& src_concrete) { + const auto& src_offsets = src_concrete.get_offsets(); + const auto& src_chars = src_concrete.get_chars(); + if (start + length > src_offsets.size()) { + throw doris::Exception( + doris::ErrorCode::INTERNAL_ERROR, + "Parameter out of bound in IColumnStr::insert_range_from method."); + } + size_t nested_offset = src_offsets[static_cast(start) - 1]; + size_t nested_length = src_offsets[start + length - 1] - nested_offset; - const auto& src_concrete = assert_cast&>(src); - - if (start + length > src_concrete.offsets.size()) { - throw doris::Exception( - doris::ErrorCode::INTERNAL_ERROR, - "Parameter out of bound in IColumnStr::insert_range_from method."); - } - - size_t nested_offset = src_concrete.offset_at(start); - size_t nested_length = src_concrete.offsets[start + length - 1] - nested_offset; - - size_t old_chars_size = chars.size(); - check_chars_length(old_chars_size + nested_length, offsets.size() + length); - chars.resize(old_chars_size + nested_length); - memcpy(&chars[old_chars_size], &src_concrete.chars[nested_offset], nested_length); + size_t old_chars_size = chars.size(); + check_chars_length(old_chars_size + nested_length, offsets.size() + length); + chars.resize(old_chars_size + nested_length); + memcpy(&chars[old_chars_size], &src_chars[nested_offset], nested_length); - if (start == 0 && offsets.empty()) { - offsets.assign(src_concrete.offsets.begin(), src_concrete.offsets.begin() + length); - } else { - size_t old_size = offsets.size(); - size_t prev_max_offset = offsets.back(); /// -1th index is Ok, see PaddedPODArray - offsets.resize(old_size + length); + using OffsetsType = std::decay_t; + if (std::is_same_v && start == 0 && offsets.empty()) { + offsets.assign(src_offsets.begin(), src_offsets.begin() + length); + } else { + size_t old_size = offsets.size(); + size_t prev_max_offset = offsets.back(); /// -1th index is Ok, see PaddedPODArray + offsets.resize(old_size + length); - for (size_t i = 0; i < length; ++i) { - offsets[old_size + i] = - src_concrete.offsets[start + i] - nested_offset + prev_max_offset; + for (size_t i = 0; i < length; ++i) { + offsets[old_size + i] = src_offsets[start + i] - nested_offset + prev_max_offset; + } } + }; + // insert_range_from maybe called by ColumnArray::insert_indices_from(which is used by hash join operator), + // so we need to support both ColumnStr and ColumnStr + if (src.is_column_string64()) { + do_insert(assert_cast&>(src)); + } else { + do_insert(assert_cast&>(src)); } } diff --git a/be/test/vec/core/column_array_test.cpp b/be/test/vec/core/column_array_test.cpp index c371dda25e85d1..0c8a486b5bf06c 100644 --- a/be/test/vec/core/column_array_test.cpp +++ b/be/test/vec/core/column_array_test.cpp @@ -108,6 +108,53 @@ TEST(ColumnArrayTest, StringArrayTest) { } } +TEST(ColumnArrayTest, String64ArrayTest) { + auto off_column = ColumnVector::create(); + auto str64_column = ColumnString64::create(); + // init column array with [["abc","d"],["ef"],[], [""]]; + std::vector offs = {0, 2, 3, 3, 4}; + std::vector vals = {"abc", "d", "ef", ""}; + for (size_t i = 1; i < offs.size(); ++i) { + off_column->insert_data((const char*)(&offs[i]), 0); + } + for (auto& v : vals) { + str64_column->insert_data(v.data(), v.size()); + } + + ColumnArray str64_array_column(std::move(str64_column), std::move(off_column)); + EXPECT_EQ(str64_array_column.size(), offs.size() - 1); + for (size_t i = 0; i < str64_array_column.size(); ++i) { + auto v = get(str64_array_column[i]); + EXPECT_EQ(v.size(), offs[i + 1] - offs[i]); + for (size_t j = 0; j < v.size(); ++j) { + EXPECT_EQ(vals[offs[i] + j], get(v[j])); + } + } + // test insert ColumnArray> into ColumnArray> + auto str32_column = ColumnString::create(); + ColumnArray str32_array_column(std::move(str32_column)); + std::vector indices; + indices.push_back(0); + indices.push_back(1); + indices.push_back(3); + str32_array_column.insert_indices_from(str64_array_column, indices.data(), + indices.data() + indices.size()); + EXPECT_EQ(str32_array_column.size(), 3); + + auto v = get(str32_array_column[0]); + EXPECT_EQ(v.size(), 2); + EXPECT_EQ(get(v[0]), vals[0]); + EXPECT_EQ(get(v[1]), vals[1]); + + v = get(str32_array_column[1]); + EXPECT_EQ(v.size(), 1); + EXPECT_EQ(get(v[0]), vals[2]); + + v = get(str32_array_column[2]); + EXPECT_EQ(v.size(), 1); + EXPECT_EQ(get(v[0]), vals[3]); +} + TEST(ColumnArrayTest, IntArrayPermuteTest) { auto off_column = ColumnVector::create(); auto data_column = ColumnVector::create(); diff --git a/be/test/vec/core/column_map_test.cpp b/be/test/vec/core/column_map_test.cpp new file mode 100644 index 00000000000000..d59247dae33f03 --- /dev/null +++ b/be/test/vec/core/column_map_test.cpp @@ -0,0 +1,155 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "vec/columns/column_map.h" + +#include +#include + +#include "gtest/gtest_pred_impl.h" +#include "vec/columns/column.h" +#include "vec/columns/column_string.h" +#include "vec/columns/column_vector.h" +#include "vec/core/field.h" + +namespace doris::vectorized { +TEST(ColumnMapTest, StringKeyTest) { + auto col_map_str64 = ColumnMap(ColumnString64::create(), ColumnInt64::create(), + ColumnArray::ColumnOffsets::create()); + Array k1 = {"a", "b", "c"}; + Array v1 = {1, 2, 3}; + { + Map map; + map.push_back(k1); + map.push_back(v1); + col_map_str64.insert(map); + } + Array k2 = {"aa", "bb", "cc"}; + Array v2 = {11, 22, 33}; + { + Map map; + map.push_back(k2); + map.push_back(v2); + col_map_str64.insert(map); + } + Array k3 = {"aaa", "bbb", "ccc"}; + Array v3 = {111, 222, 333}; + { + Map map; + map.push_back(k3); + map.push_back(v3); + col_map_str64.insert(map); + } + + // test insert ColumnMap, Column> into ColumnMap, Column> + auto col_map_str32 = ColumnMap(ColumnString::create(), ColumnInt64::create(), + ColumnArray::ColumnOffsets::create()); + std::vector indices; + indices.push_back(0); + indices.push_back(2); + col_map_str32.insert_indices_from(col_map_str64, indices.data(), + indices.data() + indices.size()); + EXPECT_EQ(col_map_str32.size(), 2); + + auto map = get(col_map_str32[0]); + auto k = get(map[0]); + auto v = get(map[1]); + EXPECT_EQ(k.size(), 3); + for (size_t i = 0; i < k.size(); ++i) { + EXPECT_EQ(k[i], k1[i]); + } + EXPECT_EQ(v.size(), 3); + for (size_t i = 0; i < v.size(); ++i) { + EXPECT_EQ(v[i], v1[i]); + } + + map = get(col_map_str32[1]); + k = get(map[0]); + v = get(map[1]); + EXPECT_EQ(k.size(), 3); + for (size_t i = 0; i < k.size(); ++i) { + EXPECT_EQ(k[i], k3[i]); + } + EXPECT_EQ(v.size(), 3); + for (size_t i = 0; i < v.size(); ++i) { + EXPECT_EQ(v[i], v3[i]); + } +}; + +TEST(ColumnMapTest, StringValueTest) { + auto col_map_str64 = ColumnMap(ColumnInt64::create(), ColumnString64::create(), + ColumnArray::ColumnOffsets::create()); + Array k1 = {1, 2, 3}; + Array v1 = {"a", "b", "c"}; + { + Map map; + map.push_back(k1); + map.push_back(v1); + col_map_str64.insert(map); + } + Array k2 = {11, 22, 33}; + Array v2 = {"aa", "bb", "cc"}; + { + Map map; + map.push_back(k2); + map.push_back(v2); + col_map_str64.insert(map); + } + Array k3 = {111, 222, 333}; + Array v3 = {"aaa", "bbb", "ccc"}; + { + Map map; + map.push_back(k3); + map.push_back(v3); + col_map_str64.insert(map); + } + + // test insert ColumnMap, Column> into ColumnMap, Column> + auto col_map_str32 = ColumnMap(ColumnInt64::create(), ColumnString::create(), + ColumnArray::ColumnOffsets::create()); + std::vector indices; + indices.push_back(0); + indices.push_back(2); + col_map_str32.insert_indices_from(col_map_str64, indices.data(), + indices.data() + indices.size()); + EXPECT_EQ(col_map_str32.size(), 2); + + auto map = get(col_map_str32[0]); + auto k = get(map[0]); + auto v = get(map[1]); + EXPECT_EQ(k.size(), 3); + for (size_t i = 0; i < k.size(); ++i) { + EXPECT_EQ(k[i], k1[i]); + } + EXPECT_EQ(v.size(), 3); + for (size_t i = 0; i < v.size(); ++i) { + EXPECT_EQ(v[i], v1[i]); + } + + map = get(col_map_str32[1]); + k = get(map[0]); + v = get(map[1]); + EXPECT_EQ(k.size(), 3); + for (size_t i = 0; i < k.size(); ++i) { + EXPECT_EQ(k[i], k3[i]); + } + EXPECT_EQ(v.size(), 3); + for (size_t i = 0; i < v.size(); ++i) { + EXPECT_EQ(v[i], v3[i]); + } +}; +} // namespace doris::vectorized \ No newline at end of file diff --git a/be/test/vec/core/column_string_test.cpp b/be/test/vec/core/column_string_test.cpp index 81f41bd11c465c..aea6d87b323c6b 100644 --- a/be/test/vec/core/column_string_test.cpp +++ b/be/test/vec/core/column_string_test.cpp @@ -19,6 +19,7 @@ #include +#include "vec/columns/column.h" #include "vec/core/block.h" #include "vec/data_types/data_type_string.h" #include "vec/functions/function_string.h" @@ -56,4 +57,41 @@ TEST(ColumnStringTest, TestConcat) { auto actual_res_col_str = assert_cast(actual_res_col.get()); actual_res_col_str->sanity_check(); } + +TEST(ColumnStringTest, TestStringInsert) { + { + auto str32_column = ColumnString::create(); + std::vector vals_tmp = {"x", "yy", "zzz", ""}; + auto str32_column_tmp = ColumnString::create(); + for (auto& v : vals_tmp) { + str32_column_tmp->insert_data(v.data(), v.size()); + } + str32_column->insert_range_from(*str32_column_tmp, 0, vals_tmp.size()); + str32_column->insert_range_from(*str32_column_tmp, 0, vals_tmp.size()); + auto row_count = str32_column->size(); + EXPECT_EQ(row_count, vals_tmp.size() * 2); + for (size_t i = 0; i < row_count; ++i) { + auto row_data = str32_column->get_data_at(i); + EXPECT_EQ(row_data.to_string(), vals_tmp[i % vals_tmp.size()]); + } + } + + { + // test insert ColumnString64 to ColumnString + auto str32_column = ColumnString::create(); + std::vector vals_tmp = {"x", "yy", "zzz", ""}; + auto str64_column_tmp = ColumnString64::create(); + for (auto& v : vals_tmp) { + str64_column_tmp->insert_data(v.data(), v.size()); + } + str32_column->insert_range_from(*str64_column_tmp, 0, vals_tmp.size()); + str32_column->insert_range_from(*str64_column_tmp, 0, vals_tmp.size()); + auto row_count = str32_column->size(); + EXPECT_EQ(row_count, vals_tmp.size() * 2); + for (size_t i = 0; i < row_count; ++i) { + auto row_data = str32_column->get_data_at(i); + EXPECT_EQ(row_data.to_string(), vals_tmp[i % vals_tmp.size()]); + } + } +} } // namespace doris::vectorized \ No newline at end of file diff --git a/be/test/vec/core/column_struct_test.cpp b/be/test/vec/core/column_struct_test.cpp new file mode 100644 index 00000000000000..1720b01638cf1b --- /dev/null +++ b/be/test/vec/core/column_struct_test.cpp @@ -0,0 +1,79 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "vec/columns/column_struct.h" + +#include +#include + +#include + +#include "gtest/gtest_pred_impl.h" +#include "vec/columns/column.h" +#include "vec/columns/column_string.h" +#include "vec/columns/columns_number.h" +#include "vec/core/field.h" + +namespace doris::vectorized { +TEST(ColumnStructTest, StringTest) { + auto str64_column = ColumnString64::create(); + auto i32_column = ColumnInt32::create(); + + std::vector str_vals = {"aaa", "bbb", "ccc", "ddd"}; + std::vector int_vals = {111, 222, 333, 444}; + for (size_t i = 0; i < str_vals.size(); ++i) { + str64_column->insert_data(str_vals[i].data(), str_vals[i].size()); + i32_column->insert_data((const char*)(&int_vals[i]), 0); + } + + std::vector vector_columns; + vector_columns.emplace_back(str64_column->get_ptr()); + vector_columns.emplace_back(i32_column->get_ptr()); + auto str64_struct_column = ColumnStruct::create(vector_columns); + + auto str32_column = ColumnString::create(); + auto i32_column2 = ColumnInt32::create(); + std::vector vector_columns2; + vector_columns2.emplace_back(str32_column->get_ptr()); + vector_columns2.emplace_back(i32_column2->get_ptr()); + auto str32_struct_column = ColumnStruct::create(vector_columns2); + + std::vector indices; + indices.push_back(0); + indices.push_back(2); + indices.push_back(3); + std::move(*str32_struct_column) + .mutate() + ->insert_indices_from(*str64_struct_column, indices.data(), + indices.data() + indices.size()); + EXPECT_EQ(str32_struct_column->size(), indices.size()); + auto t = get(str32_struct_column->operator[](0)); + EXPECT_EQ(t.size(), 2); + EXPECT_EQ(t[0], "aaa"); + EXPECT_EQ(t[1], 111); + + t = get(str32_struct_column->operator[](1)); + EXPECT_EQ(t.size(), 2); + EXPECT_EQ(t[0], "ccc"); + EXPECT_EQ(t[1], 333); + + t = get(str32_struct_column->operator[](2)); + EXPECT_EQ(t.size(), 2); + EXPECT_EQ(t[0], "ddd"); + EXPECT_EQ(t[1], 444); +}; +} // namespace doris::vectorized \ No newline at end of file