Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[fix](string64) fix coredump caused by ColumnArray<ColumnStr<uint64_t>>::insert_indices_from #43624

Merged
merged 3 commits into from
Nov 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion be/src/vec/columns/column_array.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -389,7 +389,8 @@ void ColumnArray::insert_from(const IColumn& src_, size_t n) {

if (!get_data().is_nullable() && src.get_data().is_nullable()) {
// Note: we can't process the case of 'Array(Nullable(nest))'
DCHECK(false);
throw Exception(ErrorCode::INTERNAL_ERROR, "insert '{}' into '{}'", src.get_name(),
get_name());
} else if (get_data().is_nullable() && !src.get_data().is_nullable()) {
// Note: here we should process the case of 'Array(NotNullable(nest))'
reinterpret_cast<ColumnNullable*>(&get_data())
Expand Down
65 changes: 40 additions & 25 deletions be/src/vec/columns/column_string.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@

#include <algorithm>
#include <boost/iterator/iterator_facade.hpp>
#include <ostream>

#include "util/memcpy_inlined.h"
#include "util/simd/bits.h"
Expand Down Expand Up @@ -94,6 +93,13 @@ MutableColumnPtr ColumnStr<T>::get_shrinked_column() {
return shrinked_column;
}

// This method is only called by MutableBlock::merge_ignore_overflow
// by hash join operator to collect build data to avoid
// the total string length of a ColumnStr<uint32_t> column exceeds the 4G limit.
//
// After finishing collecting build data, a ColumnStr<uint32_t> column
// will be converted to ColumnStr<uint64_t> if the total string length
// exceeds the 4G limit by calling Block::replace_if_overflow.
template <typename T>
void ColumnStr<T>::insert_range_from_ignore_overflow(const doris::vectorized::IColumn& src,
size_t start, size_t length) {
Expand Down Expand Up @@ -123,6 +129,8 @@ void ColumnStr<T>::insert_range_from_ignore_overflow(const doris::vectorized::IC
offsets.resize(old_size + length);

for (size_t i = 0; i < length; ++i) {
// unsinged integer overflow is well defined in C++,
// so we don't need to check the overflow here.
offsets[old_size + i] =
src_concrete.offsets[start + i] - nested_offset + prev_max_offset;
}
Expand All @@ -134,34 +142,41 @@ void ColumnStr<T>::insert_range_from(const IColumn& src, size_t start, size_t le
if (length == 0) {
return;
}
auto do_insert = [&](const auto& src_concrete) {
yiguolei marked this conversation as resolved.
Show resolved Hide resolved
const auto& src_offsets = src_concrete.get_offsets();
const auto& src_chars = src_concrete.get_chars();
yiguolei marked this conversation as resolved.
Show resolved Hide resolved
if (start + length > src_offsets.size()) {
throw doris::Exception(
doris::ErrorCode::INTERNAL_ERROR,
"Parameter out of bound in IColumnStr<T>::insert_range_from method.");
}
size_t nested_offset = src_offsets[static_cast<ssize_t>(start) - 1];
size_t nested_length = src_offsets[start + length - 1] - nested_offset;

const auto& src_concrete = assert_cast<const ColumnStr<T>&>(src);

if (start + length > src_concrete.offsets.size()) {
throw doris::Exception(
doris::ErrorCode::INTERNAL_ERROR,
"Parameter out of bound in IColumnStr<T>::insert_range_from method.");
}

size_t nested_offset = src_concrete.offset_at(start);
size_t nested_length = src_concrete.offsets[start + length - 1] - nested_offset;

size_t old_chars_size = chars.size();
check_chars_length(old_chars_size + nested_length, offsets.size() + length);
chars.resize(old_chars_size + nested_length);
memcpy(&chars[old_chars_size], &src_concrete.chars[nested_offset], nested_length);
size_t old_chars_size = chars.size();
check_chars_length(old_chars_size + nested_length, offsets.size() + length);
chars.resize(old_chars_size + nested_length);
memcpy(&chars[old_chars_size], &src_chars[nested_offset], nested_length);

if (start == 0 && offsets.empty()) {
offsets.assign(src_concrete.offsets.begin(), src_concrete.offsets.begin() + length);
} else {
size_t old_size = offsets.size();
size_t prev_max_offset = offsets.back(); /// -1th index is Ok, see PaddedPODArray
offsets.resize(old_size + length);
using OffsetsType = std::decay_t<decltype(src_offsets)>;
if (std::is_same_v<T, typename OffsetsType::value_type> && start == 0 && offsets.empty()) {
offsets.assign(src_offsets.begin(), src_offsets.begin() + length);
yiguolei marked this conversation as resolved.
Show resolved Hide resolved
} else {
size_t old_size = offsets.size();
size_t prev_max_offset = offsets.back(); /// -1th index is Ok, see PaddedPODArray
offsets.resize(old_size + length);

for (size_t i = 0; i < length; ++i) {
offsets[old_size + i] =
src_concrete.offsets[start + i] - nested_offset + prev_max_offset;
for (size_t i = 0; i < length; ++i) {
offsets[old_size + i] = src_offsets[start + i] - nested_offset + prev_max_offset;
}
}
};
// insert_range_from maybe called by ColumnArray::insert_indices_from(which is used by hash join operator),
// so we need to support both ColumnStr<uint32_t> and ColumnStr<uint64_t>
if (src.is_column_string64()) {
do_insert(assert_cast<const ColumnStr<uint64_t>&>(src));
yiguolei marked this conversation as resolved.
Show resolved Hide resolved
} else {
do_insert(assert_cast<const ColumnStr<uint32_t>&>(src));
}
}

Expand Down
47 changes: 47 additions & 0 deletions be/test/vec/core/column_array_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,53 @@ TEST(ColumnArrayTest, StringArrayTest) {
}
}

TEST(ColumnArrayTest, String64ArrayTest) {
auto off_column = ColumnVector<ColumnArray::Offset64>::create();
auto str64_column = ColumnString64::create();
// init column array with [["abc","d"],["ef"],[], [""]];
std::vector<ColumnArray::Offset64> offs = {0, 2, 3, 3, 4};
std::vector<std::string> vals = {"abc", "d", "ef", ""};
for (size_t i = 1; i < offs.size(); ++i) {
off_column->insert_data((const char*)(&offs[i]), 0);
}
for (auto& v : vals) {
str64_column->insert_data(v.data(), v.size());
}

ColumnArray str64_array_column(std::move(str64_column), std::move(off_column));
EXPECT_EQ(str64_array_column.size(), offs.size() - 1);
for (size_t i = 0; i < str64_array_column.size(); ++i) {
auto v = get<Array>(str64_array_column[i]);
EXPECT_EQ(v.size(), offs[i + 1] - offs[i]);
for (size_t j = 0; j < v.size(); ++j) {
EXPECT_EQ(vals[offs[i] + j], get<std::string>(v[j]));
}
}
// test insert ColumnArray<ColumnStr<uint64_t>> into ColumnArray<ColumnStr<uint32_t>>
auto str32_column = ColumnString::create();
ColumnArray str32_array_column(std::move(str32_column));
std::vector<uint32_t> indices;
indices.push_back(0);
indices.push_back(1);
indices.push_back(3);
str32_array_column.insert_indices_from(str64_array_column, indices.data(),
indices.data() + indices.size());
EXPECT_EQ(str32_array_column.size(), 3);

auto v = get<Array>(str32_array_column[0]);
EXPECT_EQ(v.size(), 2);
EXPECT_EQ(get<std::string>(v[0]), vals[0]);
EXPECT_EQ(get<std::string>(v[1]), vals[1]);

v = get<Array>(str32_array_column[1]);
EXPECT_EQ(v.size(), 1);
EXPECT_EQ(get<std::string>(v[0]), vals[2]);

v = get<Array>(str32_array_column[2]);
EXPECT_EQ(v.size(), 1);
EXPECT_EQ(get<std::string>(v[0]), vals[3]);
}

TEST(ColumnArrayTest, IntArrayPermuteTest) {
auto off_column = ColumnVector<ColumnArray::Offset64>::create();
auto data_column = ColumnVector<int32_t>::create();
Expand Down
155 changes: 155 additions & 0 deletions be/test/vec/core/column_map_test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include "vec/columns/column_map.h"

#include <gtest/gtest-message.h>
#include <gtest/gtest-test-part.h>

#include "gtest/gtest_pred_impl.h"
#include "vec/columns/column.h"
#include "vec/columns/column_string.h"
#include "vec/columns/column_vector.h"
#include "vec/core/field.h"

namespace doris::vectorized {
TEST(ColumnMapTest, StringKeyTest) {
auto col_map_str64 = ColumnMap(ColumnString64::create(), ColumnInt64::create(),
ColumnArray::ColumnOffsets::create());
Array k1 = {"a", "b", "c"};
Array v1 = {1, 2, 3};
{
Map map;
map.push_back(k1);
map.push_back(v1);
col_map_str64.insert(map);
}
Array k2 = {"aa", "bb", "cc"};
Array v2 = {11, 22, 33};
{
Map map;
map.push_back(k2);
map.push_back(v2);
col_map_str64.insert(map);
}
Array k3 = {"aaa", "bbb", "ccc"};
Array v3 = {111, 222, 333};
{
Map map;
map.push_back(k3);
map.push_back(v3);
col_map_str64.insert(map);
}

// test insert ColumnMap<ColumnStr<uint64_t>, Column> into ColumnMap<ColumnStr<uint32_t>, Column>
auto col_map_str32 = ColumnMap(ColumnString::create(), ColumnInt64::create(),
ColumnArray::ColumnOffsets::create());
std::vector<uint32_t> indices;
indices.push_back(0);
indices.push_back(2);
col_map_str32.insert_indices_from(col_map_str64, indices.data(),
indices.data() + indices.size());
EXPECT_EQ(col_map_str32.size(), 2);

auto map = get<Map>(col_map_str32[0]);
auto k = get<Array>(map[0]);
auto v = get<Array>(map[1]);
EXPECT_EQ(k.size(), 3);
for (size_t i = 0; i < k.size(); ++i) {
EXPECT_EQ(k[i], k1[i]);
}
EXPECT_EQ(v.size(), 3);
for (size_t i = 0; i < v.size(); ++i) {
EXPECT_EQ(v[i], v1[i]);
}

map = get<Map>(col_map_str32[1]);
k = get<Array>(map[0]);
v = get<Array>(map[1]);
EXPECT_EQ(k.size(), 3);
for (size_t i = 0; i < k.size(); ++i) {
EXPECT_EQ(k[i], k3[i]);
}
EXPECT_EQ(v.size(), 3);
for (size_t i = 0; i < v.size(); ++i) {
EXPECT_EQ(v[i], v3[i]);
}
};

TEST(ColumnMapTest, StringValueTest) {
auto col_map_str64 = ColumnMap(ColumnInt64::create(), ColumnString64::create(),
ColumnArray::ColumnOffsets::create());
Array k1 = {1, 2, 3};
Array v1 = {"a", "b", "c"};
{
Map map;
map.push_back(k1);
map.push_back(v1);
col_map_str64.insert(map);
}
Array k2 = {11, 22, 33};
Array v2 = {"aa", "bb", "cc"};
{
Map map;
map.push_back(k2);
map.push_back(v2);
col_map_str64.insert(map);
}
Array k3 = {111, 222, 333};
Array v3 = {"aaa", "bbb", "ccc"};
{
Map map;
map.push_back(k3);
map.push_back(v3);
col_map_str64.insert(map);
}

// test insert ColumnMap<ColumnStr<uint64_t>, Column> into ColumnMap<ColumnStr<uint32_t>, Column>
auto col_map_str32 = ColumnMap(ColumnInt64::create(), ColumnString::create(),
ColumnArray::ColumnOffsets::create());
std::vector<uint32_t> indices;
indices.push_back(0);
indices.push_back(2);
col_map_str32.insert_indices_from(col_map_str64, indices.data(),
indices.data() + indices.size());
EXPECT_EQ(col_map_str32.size(), 2);

auto map = get<Map>(col_map_str32[0]);
auto k = get<Array>(map[0]);
auto v = get<Array>(map[1]);
EXPECT_EQ(k.size(), 3);
for (size_t i = 0; i < k.size(); ++i) {
EXPECT_EQ(k[i], k1[i]);
}
EXPECT_EQ(v.size(), 3);
for (size_t i = 0; i < v.size(); ++i) {
EXPECT_EQ(v[i], v1[i]);
}

map = get<Map>(col_map_str32[1]);
k = get<Array>(map[0]);
v = get<Array>(map[1]);
EXPECT_EQ(k.size(), 3);
for (size_t i = 0; i < k.size(); ++i) {
EXPECT_EQ(k[i], k3[i]);
}
EXPECT_EQ(v.size(), 3);
for (size_t i = 0; i < v.size(); ++i) {
EXPECT_EQ(v[i], v3[i]);
}
};
} // namespace doris::vectorized
38 changes: 38 additions & 0 deletions be/test/vec/core/column_string_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

#include <gtest/gtest.h>

#include "vec/columns/column.h"
#include "vec/core/block.h"
#include "vec/data_types/data_type_string.h"
#include "vec/functions/function_string.h"
Expand Down Expand Up @@ -64,4 +65,41 @@ TEST(ColumnStringTest, TestConcat) {
auto actual_res_col_str = assert_cast<const ColumnString*>(actual_res_col.get());
actual_res_col_str->sanity_check();
}

TEST(ColumnStringTest, TestStringInsert) {
{
auto str32_column = ColumnString::create();
std::vector<std::string> vals_tmp = {"x", "yy", "zzz", ""};
auto str32_column_tmp = ColumnString::create();
for (auto& v : vals_tmp) {
str32_column_tmp->insert_data(v.data(), v.size());
}
str32_column->insert_range_from(*str32_column_tmp, 0, vals_tmp.size());
str32_column->insert_range_from(*str32_column_tmp, 0, vals_tmp.size());
auto row_count = str32_column->size();
EXPECT_EQ(row_count, vals_tmp.size() * 2);
for (size_t i = 0; i < row_count; ++i) {
auto row_data = str32_column->get_data_at(i);
EXPECT_EQ(row_data.to_string(), vals_tmp[i % vals_tmp.size()]);
}
}

{
// test insert ColumnString64 to ColumnString
auto str32_column = ColumnString::create();
std::vector<std::string> vals_tmp = {"x", "yy", "zzz", ""};
auto str64_column_tmp = ColumnString64::create();
for (auto& v : vals_tmp) {
str64_column_tmp->insert_data(v.data(), v.size());
}
str32_column->insert_range_from(*str64_column_tmp, 0, vals_tmp.size());
str32_column->insert_range_from(*str64_column_tmp, 0, vals_tmp.size());
auto row_count = str32_column->size();
EXPECT_EQ(row_count, vals_tmp.size() * 2);
for (size_t i = 0; i < row_count; ++i) {
auto row_data = str32_column->get_data_at(i);
EXPECT_EQ(row_data.to_string(), vals_tmp[i % vals_tmp.size()]);
}
}
}
} // namespace doris::vectorized
Loading
Loading