Skip to content

Commit

Permalink
[fix](string64) fix coredump caused by ColumnArray<ColumnStr<uint64_t…
Browse files Browse the repository at this point in the history
…>>::insert_indices_from
  • Loading branch information
jacktengg committed Nov 11, 2024
1 parent af565ba commit 3241ad5
Show file tree
Hide file tree
Showing 4 changed files with 118 additions and 26 deletions.
3 changes: 2 additions & 1 deletion be/src/vec/columns/column_array.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -389,7 +389,8 @@ void ColumnArray::insert_from(const IColumn& src_, size_t n) {

if (!get_data().is_nullable() && src.get_data().is_nullable()) {
// Note: we can't process the case of 'Array(Nullable(nest))'
DCHECK(false);
throw Exception(ErrorCode::INTERNAL_ERROR, "insert '{}' into '{}'", src.get_name(),
get_name());
} else if (get_data().is_nullable() && !src.get_data().is_nullable()) {
// Note: here we should process the case of 'Array(NotNullable(nest))'
reinterpret_cast<ColumnNullable*>(&get_data())
Expand Down
56 changes: 31 additions & 25 deletions be/src/vec/columns/column_string.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@

#include <algorithm>
#include <boost/iterator/iterator_facade.hpp>
#include <ostream>

#include "util/memcpy_inlined.h"
#include "util/simd/bits.h"
Expand Down Expand Up @@ -134,34 +133,41 @@ void ColumnStr<T>::insert_range_from(const IColumn& src, size_t start, size_t le
if (length == 0) {
return;
}
auto do_insert = [&](const auto& src_concrete) {
const auto& src_offsets = src_concrete.get_offsets();
const auto& src_chars = src_concrete.get_chars();
if (start + length > src_offsets.size()) {
throw doris::Exception(
doris::ErrorCode::INTERNAL_ERROR,
"Parameter out of bound in IColumnStr<T>::insert_range_from method.");
}
size_t nested_offset = src_offsets[static_cast<ssize_t>(start) - 1];
size_t nested_length = src_offsets[start + length - 1] - nested_offset;

const auto& src_concrete = assert_cast<const ColumnStr<T>&>(src);

if (start + length > src_concrete.offsets.size()) {
throw doris::Exception(
doris::ErrorCode::INTERNAL_ERROR,
"Parameter out of bound in IColumnStr<T>::insert_range_from method.");
}

size_t nested_offset = src_concrete.offset_at(start);
size_t nested_length = src_concrete.offsets[start + length - 1] - nested_offset;

size_t old_chars_size = chars.size();
check_chars_length(old_chars_size + nested_length, offsets.size() + length);
chars.resize(old_chars_size + nested_length);
memcpy(&chars[old_chars_size], &src_concrete.chars[nested_offset], nested_length);
size_t old_chars_size = chars.size();
check_chars_length(old_chars_size + nested_length, offsets.size() + length);
chars.resize(old_chars_size + nested_length);
memcpy(&chars[old_chars_size], &src_chars[nested_offset], nested_length);

if (start == 0 && offsets.empty()) {
offsets.assign(src_concrete.offsets.begin(), src_concrete.offsets.begin() + length);
} else {
size_t old_size = offsets.size();
size_t prev_max_offset = offsets.back(); /// -1th index is Ok, see PaddedPODArray
offsets.resize(old_size + length);
using OffsetsType = std::decay_t<decltype(src_offsets)>;
if (std::is_same_v<T, typename OffsetsType::value_type> && start == 0 && offsets.empty()) {
offsets.assign(src_offsets.begin(), src_offsets.begin() + length);
} else {
size_t old_size = offsets.size();
size_t prev_max_offset = offsets.back(); /// -1th index is Ok, see PaddedPODArray
offsets.resize(old_size + length);

for (size_t i = 0; i < length; ++i) {
offsets[old_size + i] =
src_concrete.offsets[start + i] - nested_offset + prev_max_offset;
for (size_t i = 0; i < length; ++i) {
offsets[old_size + i] = src_offsets[start + i] - nested_offset + prev_max_offset;
}
}
};
// insert_range_from maybe called by ColumnArray::insert_indices_from(which is used by hash join operator),
// so we need to support both ColumnStr<uint32_t> and ColumnStr<uint64_t>
if (src.is_column_string64()) {
do_insert(assert_cast<const ColumnStr<uint64_t>&>(src));
} else {
do_insert(assert_cast<const ColumnStr<uint32_t>&>(src));
}
}

Expand Down
47 changes: 47 additions & 0 deletions be/test/vec/core/column_array_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,53 @@ TEST(ColumnArrayTest, StringArrayTest) {
}
}

TEST(ColumnArrayTest, String64ArrayTest) {
auto off_column = ColumnVector<ColumnArray::Offset64>::create();
auto str64_column = ColumnString64::create();
// init column array with [["abc","d"],["ef"],[], [""]];
std::vector<ColumnArray::Offset64> offs = {0, 2, 3, 3, 4};
std::vector<std::string> vals = {"abc", "d", "ef", ""};
for (size_t i = 1; i < offs.size(); ++i) {
off_column->insert_data((const char*)(&offs[i]), 0);
}
for (auto& v : vals) {
str64_column->insert_data(v.data(), v.size());
}

ColumnArray str64_array_column(std::move(str64_column), std::move(off_column));
EXPECT_EQ(str64_array_column.size(), offs.size() - 1);
for (size_t i = 0; i < str64_array_column.size(); ++i) {
auto v = get<Array>(str64_array_column[i]);
EXPECT_EQ(v.size(), offs[i + 1] - offs[i]);
for (size_t j = 0; j < v.size(); ++j) {
EXPECT_EQ(vals[offs[i] + j], get<std::string>(v[j]));
}
}
// test insert ColumnArray<ColumnStr<uint64_t>> into ColumnArray<ColumnStr<uint32_t>>
auto str32_column = ColumnString::create();
ColumnArray str32_array_column(std::move(str32_column));
std::vector<uint32_t> indices;
indices.push_back(0);
indices.push_back(1);
indices.push_back(3);
str32_array_column.insert_indices_from(str64_array_column, indices.data(),
indices.data() + indices.size());
EXPECT_EQ(str32_array_column.size(), 3);

auto v = get<Array>(str32_array_column[0]);
EXPECT_EQ(v.size(), 2);
EXPECT_EQ(get<std::string>(v[0]), vals[0]);
EXPECT_EQ(get<std::string>(v[1]), vals[1]);

v = get<Array>(str32_array_column[1]);
EXPECT_EQ(v.size(), 1);
EXPECT_EQ(get<std::string>(v[0]), vals[2]);

v = get<Array>(str32_array_column[2]);
EXPECT_EQ(v.size(), 1);
EXPECT_EQ(get<std::string>(v[0]), vals[3]);
}

TEST(ColumnArrayTest, IntArrayPermuteTest) {
auto off_column = ColumnVector<ColumnArray::Offset64>::create();
auto data_column = ColumnVector<int32_t>::create();
Expand Down
38 changes: 38 additions & 0 deletions be/test/vec/core/column_string_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

#include <gtest/gtest.h>

#include "vec/columns/column.h"
#include "vec/core/block.h"
#include "vec/data_types/data_type_string.h"
#include "vec/functions/function_string.h"
Expand Down Expand Up @@ -64,4 +65,41 @@ TEST(ColumnStringTest, TestConcat) {
auto actual_res_col_str = assert_cast<const ColumnString*>(actual_res_col.get());
actual_res_col_str->sanity_check();
}

TEST(ColumnStringTest, TestStringInsert) {
{
auto str32_column = ColumnString::create();
std::vector<std::string> vals_tmp = {"x", "yy", "zzz", ""};
auto str32_column_tmp = ColumnString::create();
for (auto& v : vals_tmp) {
str32_column_tmp->insert_data(v.data(), v.size());
}
str32_column->insert_range_from(*str32_column_tmp, 0, vals_tmp.size());
str32_column->insert_range_from(*str32_column_tmp, 0, vals_tmp.size());
auto row_count = str32_column->size();
EXPECT_EQ(row_count, vals_tmp.size() * 2);
for (size_t i = 0; i < row_count; ++i) {
auto row_data = str32_column->get_data_at(i);
EXPECT_EQ(row_data.to_string(), vals_tmp[i % vals_tmp.size()]);
}
}

{
// test insert ColumnString64 to ColumnString
auto str32_column = ColumnString::create();
std::vector<std::string> vals_tmp = {"x", "yy", "zzz", ""};
auto str64_column_tmp = ColumnString64::create();
for (auto& v : vals_tmp) {
str64_column_tmp->insert_data(v.data(), v.size());
}
str32_column->insert_range_from(*str64_column_tmp, 0, vals_tmp.size());
str32_column->insert_range_from(*str64_column_tmp, 0, vals_tmp.size());
auto row_count = str32_column->size();
EXPECT_EQ(row_count, vals_tmp.size() * 2);
for (size_t i = 0; i < row_count; ++i) {
auto row_data = str32_column->get_data_at(i);
EXPECT_EQ(row_data.to_string(), vals_tmp[i % vals_tmp.size()]);
}
}
}
} // namespace doris::vectorized

0 comments on commit 3241ad5

Please sign in to comment.