Skip to content

Commit

Permalink
branch-2.1: [fix](string64) fix coredump caused by ColumnArray<Column…
Browse files Browse the repository at this point in the history
…Str<uint64_t>>::insert_indices_from (#43862)

Cherry-picked from #43624

Co-authored-by: TengJianPing <[email protected]>
  • Loading branch information
github-actions[bot] and jacktengg authored Nov 13, 2024
1 parent 1101fba commit d4712ae
Show file tree
Hide file tree
Showing 6 changed files with 361 additions and 26 deletions.
3 changes: 2 additions & 1 deletion be/src/vec/columns/column_array.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -431,7 +431,8 @@ void ColumnArray::insert_from(const IColumn& src_, size_t n) {

if (!get_data().is_nullable() && src.get_data().is_nullable()) {
// Note: we can't process the case of 'Array(Nullable(nest))'
DCHECK(false);
throw Exception(ErrorCode::INTERNAL_ERROR, "insert '{}' into '{}'", src.get_name(),
get_name());
} else if (get_data().is_nullable() && !src.get_data().is_nullable()) {
// Note: here we should process the case of 'Array(NotNullable(nest))'
reinterpret_cast<ColumnNullable*>(&get_data())
Expand Down
65 changes: 40 additions & 25 deletions be/src/vec/columns/column_string.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@

#include <algorithm>
#include <boost/iterator/iterator_facade.hpp>
#include <ostream>

#include "util/simd/bits.h"
#include "vec/columns/columns_common.h"
Expand Down Expand Up @@ -93,6 +92,13 @@ MutableColumnPtr ColumnStr<T>::get_shrinked_column() {
return shrinked_column;
}

// This method is only called by MutableBlock::merge_ignore_overflow
// by hash join operator to collect build data to avoid
// the total string length of a ColumnStr<uint32_t> column exceeds the 4G limit.
//
// After finishing collecting build data, a ColumnStr<uint32_t> column
// will be converted to ColumnStr<uint64_t> if the total string length
// exceeds the 4G limit by calling Block::replace_if_overflow.
template <typename T>
void ColumnStr<T>::insert_range_from_ignore_overflow(const doris::vectorized::IColumn& src,
size_t start, size_t length) {
Expand Down Expand Up @@ -122,6 +128,8 @@ void ColumnStr<T>::insert_range_from_ignore_overflow(const doris::vectorized::IC
offsets.resize(old_size + length);

for (size_t i = 0; i < length; ++i) {
// unsinged integer overflow is well defined in C++,
// so we don't need to check the overflow here.
offsets[old_size + i] =
src_concrete.offsets[start + i] - nested_offset + prev_max_offset;
}
Expand All @@ -133,34 +141,41 @@ void ColumnStr<T>::insert_range_from(const IColumn& src, size_t start, size_t le
if (length == 0) {
return;
}
auto do_insert = [&](const auto& src_concrete) {
const auto& src_offsets = src_concrete.get_offsets();
const auto& src_chars = src_concrete.get_chars();
if (start + length > src_offsets.size()) {
throw doris::Exception(
doris::ErrorCode::INTERNAL_ERROR,
"Parameter out of bound in IColumnStr<T>::insert_range_from method.");
}
size_t nested_offset = src_offsets[static_cast<ssize_t>(start) - 1];
size_t nested_length = src_offsets[start + length - 1] - nested_offset;

const auto& src_concrete = assert_cast<const ColumnStr<T>&>(src);

if (start + length > src_concrete.offsets.size()) {
throw doris::Exception(
doris::ErrorCode::INTERNAL_ERROR,
"Parameter out of bound in IColumnStr<T>::insert_range_from method.");
}

size_t nested_offset = src_concrete.offset_at(start);
size_t nested_length = src_concrete.offsets[start + length - 1] - nested_offset;

size_t old_chars_size = chars.size();
check_chars_length(old_chars_size + nested_length, offsets.size() + length);
chars.resize(old_chars_size + nested_length);
memcpy(&chars[old_chars_size], &src_concrete.chars[nested_offset], nested_length);
size_t old_chars_size = chars.size();
check_chars_length(old_chars_size + nested_length, offsets.size() + length);
chars.resize(old_chars_size + nested_length);
memcpy(&chars[old_chars_size], &src_chars[nested_offset], nested_length);

if (start == 0 && offsets.empty()) {
offsets.assign(src_concrete.offsets.begin(), src_concrete.offsets.begin() + length);
} else {
size_t old_size = offsets.size();
size_t prev_max_offset = offsets.back(); /// -1th index is Ok, see PaddedPODArray
offsets.resize(old_size + length);
using OffsetsType = std::decay_t<decltype(src_offsets)>;
if (std::is_same_v<T, typename OffsetsType::value_type> && start == 0 && offsets.empty()) {
offsets.assign(src_offsets.begin(), src_offsets.begin() + length);
} else {
size_t old_size = offsets.size();
size_t prev_max_offset = offsets.back(); /// -1th index is Ok, see PaddedPODArray
offsets.resize(old_size + length);

for (size_t i = 0; i < length; ++i) {
offsets[old_size + i] =
src_concrete.offsets[start + i] - nested_offset + prev_max_offset;
for (size_t i = 0; i < length; ++i) {
offsets[old_size + i] = src_offsets[start + i] - nested_offset + prev_max_offset;
}
}
};
// insert_range_from maybe called by ColumnArray::insert_indices_from(which is used by hash join operator),
// so we need to support both ColumnStr<uint32_t> and ColumnStr<uint64_t>
if (src.is_column_string64()) {
do_insert(assert_cast<const ColumnStr<uint64_t>&>(src));
} else {
do_insert(assert_cast<const ColumnStr<uint32_t>&>(src));
}
}

Expand Down
47 changes: 47 additions & 0 deletions be/test/vec/core/column_array_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,53 @@ TEST(ColumnArrayTest, StringArrayTest) {
}
}

TEST(ColumnArrayTest, String64ArrayTest) {
auto off_column = ColumnVector<ColumnArray::Offset64>::create();
auto str64_column = ColumnString64::create();
// init column array with [["abc","d"],["ef"],[], [""]];
std::vector<ColumnArray::Offset64> offs = {0, 2, 3, 3, 4};
std::vector<std::string> vals = {"abc", "d", "ef", ""};
for (size_t i = 1; i < offs.size(); ++i) {
off_column->insert_data((const char*)(&offs[i]), 0);
}
for (auto& v : vals) {
str64_column->insert_data(v.data(), v.size());
}

ColumnArray str64_array_column(std::move(str64_column), std::move(off_column));
EXPECT_EQ(str64_array_column.size(), offs.size() - 1);
for (size_t i = 0; i < str64_array_column.size(); ++i) {
auto v = get<Array>(str64_array_column[i]);
EXPECT_EQ(v.size(), offs[i + 1] - offs[i]);
for (size_t j = 0; j < v.size(); ++j) {
EXPECT_EQ(vals[offs[i] + j], get<std::string>(v[j]));
}
}
// test insert ColumnArray<ColumnStr<uint64_t>> into ColumnArray<ColumnStr<uint32_t>>
auto str32_column = ColumnString::create();
ColumnArray str32_array_column(std::move(str32_column));
std::vector<uint32_t> indices;
indices.push_back(0);
indices.push_back(1);
indices.push_back(3);
str32_array_column.insert_indices_from(str64_array_column, indices.data(),
indices.data() + indices.size());
EXPECT_EQ(str32_array_column.size(), 3);

auto v = get<Array>(str32_array_column[0]);
EXPECT_EQ(v.size(), 2);
EXPECT_EQ(get<std::string>(v[0]), vals[0]);
EXPECT_EQ(get<std::string>(v[1]), vals[1]);

v = get<Array>(str32_array_column[1]);
EXPECT_EQ(v.size(), 1);
EXPECT_EQ(get<std::string>(v[0]), vals[2]);

v = get<Array>(str32_array_column[2]);
EXPECT_EQ(v.size(), 1);
EXPECT_EQ(get<std::string>(v[0]), vals[3]);
}

TEST(ColumnArrayTest, IntArrayPermuteTest) {
auto off_column = ColumnVector<ColumnArray::Offset64>::create();
auto data_column = ColumnVector<int32_t>::create();
Expand Down
155 changes: 155 additions & 0 deletions be/test/vec/core/column_map_test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include "vec/columns/column_map.h"

#include <gtest/gtest-message.h>
#include <gtest/gtest-test-part.h>

#include "gtest/gtest_pred_impl.h"
#include "vec/columns/column.h"
#include "vec/columns/column_string.h"
#include "vec/columns/column_vector.h"
#include "vec/core/field.h"

namespace doris::vectorized {
TEST(ColumnMapTest, StringKeyTest) {
auto col_map_str64 = ColumnMap(ColumnString64::create(), ColumnInt64::create(),
ColumnArray::ColumnOffsets::create());
Array k1 = {"a", "b", "c"};
Array v1 = {1, 2, 3};
{
Map map;
map.push_back(k1);
map.push_back(v1);
col_map_str64.insert(map);
}
Array k2 = {"aa", "bb", "cc"};
Array v2 = {11, 22, 33};
{
Map map;
map.push_back(k2);
map.push_back(v2);
col_map_str64.insert(map);
}
Array k3 = {"aaa", "bbb", "ccc"};
Array v3 = {111, 222, 333};
{
Map map;
map.push_back(k3);
map.push_back(v3);
col_map_str64.insert(map);
}

// test insert ColumnMap<ColumnStr<uint64_t>, Column> into ColumnMap<ColumnStr<uint32_t>, Column>
auto col_map_str32 = ColumnMap(ColumnString::create(), ColumnInt64::create(),
ColumnArray::ColumnOffsets::create());
std::vector<uint32_t> indices;
indices.push_back(0);
indices.push_back(2);
col_map_str32.insert_indices_from(col_map_str64, indices.data(),
indices.data() + indices.size());
EXPECT_EQ(col_map_str32.size(), 2);

auto map = get<Map>(col_map_str32[0]);
auto k = get<Array>(map[0]);
auto v = get<Array>(map[1]);
EXPECT_EQ(k.size(), 3);
for (size_t i = 0; i < k.size(); ++i) {
EXPECT_EQ(k[i], k1[i]);
}
EXPECT_EQ(v.size(), 3);
for (size_t i = 0; i < v.size(); ++i) {
EXPECT_EQ(v[i], v1[i]);
}

map = get<Map>(col_map_str32[1]);
k = get<Array>(map[0]);
v = get<Array>(map[1]);
EXPECT_EQ(k.size(), 3);
for (size_t i = 0; i < k.size(); ++i) {
EXPECT_EQ(k[i], k3[i]);
}
EXPECT_EQ(v.size(), 3);
for (size_t i = 0; i < v.size(); ++i) {
EXPECT_EQ(v[i], v3[i]);
}
};

TEST(ColumnMapTest, StringValueTest) {
auto col_map_str64 = ColumnMap(ColumnInt64::create(), ColumnString64::create(),
ColumnArray::ColumnOffsets::create());
Array k1 = {1, 2, 3};
Array v1 = {"a", "b", "c"};
{
Map map;
map.push_back(k1);
map.push_back(v1);
col_map_str64.insert(map);
}
Array k2 = {11, 22, 33};
Array v2 = {"aa", "bb", "cc"};
{
Map map;
map.push_back(k2);
map.push_back(v2);
col_map_str64.insert(map);
}
Array k3 = {111, 222, 333};
Array v3 = {"aaa", "bbb", "ccc"};
{
Map map;
map.push_back(k3);
map.push_back(v3);
col_map_str64.insert(map);
}

// test insert ColumnMap<ColumnStr<uint64_t>, Column> into ColumnMap<ColumnStr<uint32_t>, Column>
auto col_map_str32 = ColumnMap(ColumnInt64::create(), ColumnString::create(),
ColumnArray::ColumnOffsets::create());
std::vector<uint32_t> indices;
indices.push_back(0);
indices.push_back(2);
col_map_str32.insert_indices_from(col_map_str64, indices.data(),
indices.data() + indices.size());
EXPECT_EQ(col_map_str32.size(), 2);

auto map = get<Map>(col_map_str32[0]);
auto k = get<Array>(map[0]);
auto v = get<Array>(map[1]);
EXPECT_EQ(k.size(), 3);
for (size_t i = 0; i < k.size(); ++i) {
EXPECT_EQ(k[i], k1[i]);
}
EXPECT_EQ(v.size(), 3);
for (size_t i = 0; i < v.size(); ++i) {
EXPECT_EQ(v[i], v1[i]);
}

map = get<Map>(col_map_str32[1]);
k = get<Array>(map[0]);
v = get<Array>(map[1]);
EXPECT_EQ(k.size(), 3);
for (size_t i = 0; i < k.size(); ++i) {
EXPECT_EQ(k[i], k3[i]);
}
EXPECT_EQ(v.size(), 3);
for (size_t i = 0; i < v.size(); ++i) {
EXPECT_EQ(v[i], v3[i]);
}
};
} // namespace doris::vectorized
38 changes: 38 additions & 0 deletions be/test/vec/core/column_string_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

#include <gtest/gtest.h>

#include "vec/columns/column.h"
#include "vec/core/block.h"
#include "vec/data_types/data_type_string.h"
#include "vec/functions/function_string.h"
Expand Down Expand Up @@ -56,4 +57,41 @@ TEST(ColumnStringTest, TestConcat) {
auto actual_res_col_str = assert_cast<const ColumnString*>(actual_res_col.get());
actual_res_col_str->sanity_check();
}

TEST(ColumnStringTest, TestStringInsert) {
{
auto str32_column = ColumnString::create();
std::vector<std::string> vals_tmp = {"x", "yy", "zzz", ""};
auto str32_column_tmp = ColumnString::create();
for (auto& v : vals_tmp) {
str32_column_tmp->insert_data(v.data(), v.size());
}
str32_column->insert_range_from(*str32_column_tmp, 0, vals_tmp.size());
str32_column->insert_range_from(*str32_column_tmp, 0, vals_tmp.size());
auto row_count = str32_column->size();
EXPECT_EQ(row_count, vals_tmp.size() * 2);
for (size_t i = 0; i < row_count; ++i) {
auto row_data = str32_column->get_data_at(i);
EXPECT_EQ(row_data.to_string(), vals_tmp[i % vals_tmp.size()]);
}
}

{
// test insert ColumnString64 to ColumnString
auto str32_column = ColumnString::create();
std::vector<std::string> vals_tmp = {"x", "yy", "zzz", ""};
auto str64_column_tmp = ColumnString64::create();
for (auto& v : vals_tmp) {
str64_column_tmp->insert_data(v.data(), v.size());
}
str32_column->insert_range_from(*str64_column_tmp, 0, vals_tmp.size());
str32_column->insert_range_from(*str64_column_tmp, 0, vals_tmp.size());
auto row_count = str32_column->size();
EXPECT_EQ(row_count, vals_tmp.size() * 2);
for (size_t i = 0; i < row_count; ++i) {
auto row_data = str32_column->get_data_at(i);
EXPECT_EQ(row_data.to_string(), vals_tmp[i % vals_tmp.size()]);
}
}
}
} // namespace doris::vectorized
Loading

0 comments on commit d4712ae

Please sign in to comment.