diff --git a/be/src/util/simd/vstring_function.h b/be/src/util/simd/vstring_function.h index 99313132382e5c..bfa75b728d5620 100644 --- a/be/src/util/simd/vstring_function.h +++ b/be/src/util/simd/vstring_function.h @@ -309,8 +309,11 @@ class VStringFunctions { // is to say, counting bytes which do not match 10xx_xxxx pattern. // All 0xxx_xxxx, 110x_xxxx, 1110_xxxx and 1111_0xxx are greater than 1011_1111 when use int8_t arithmetic, // so just count bytes greater than 1011_1111 in a byte string as the result of utf8_length. - static inline size_t get_char_len(const char* src, size_t len) { - size_t char_len = 0; + // get_char_len is used to return the UTF-8 length of a string. + // The return value will never exceed len. + template + static inline T get_char_len(const char* src, T len) { + T char_len = 0; const char* p = src; const char* end = p + len; #if defined(__SSE2__) || defined(__aarch64__) diff --git a/be/src/vec/functions/function_rpc.cpp b/be/src/vec/functions/function_rpc.cpp index 6d3a64f78409c6..d88266cc2bf647 100644 --- a/be/src/vec/functions/function_rpc.cpp +++ b/be/src/vec/functions/function_rpc.cpp @@ -33,7 +33,7 @@ #include "vec/data_types/serde/data_type_serde.h" namespace doris::vectorized { - +#include "common/compile_check_begin.h" RPCFnImpl::RPCFnImpl(const TFunction& fn) : _fn(fn) { _function_name = _fn.scalar_fn.symbol; _server_addr = _fn.hdfs_location; diff --git a/be/src/vec/functions/function_split_by_regexp.cpp b/be/src/vec/functions/function_split_by_regexp.cpp index a4c46b59ac1f75..89791d398ece50 100644 --- a/be/src/vec/functions/function_split_by_regexp.cpp +++ b/be/src/vec/functions/function_split_by_regexp.cpp @@ -30,6 +30,7 @@ #include "vec/functions/simple_function_factory.h" namespace doris::vectorized { +#include "common/compile_check_begin.h" struct Match { std::string::size_type offset; @@ -194,7 +195,7 @@ struct ExecuteImpl { unpack_if_const(block.get_by_position(arguments[1]).column); const auto& [three_column, 
three_is_const] = unpack_if_const(block.get_by_position(arguments[2]).column); - auto limit_value = assert_cast(*three_column).get_int(0); + auto limit_value = assert_cast(*three_column).get_element(0); const auto& src_column = assert_cast(*first_column); const auto& pattern_column = assert_cast(*second_column); @@ -238,7 +239,7 @@ struct ExecuteImpl { const StringRef& pattern_ref, ColumnString& dest_column_string, ColumnArray::Offsets64& dest_offsets, - NullMapType* dest_nested_null_map, Int64 limit_value, + NullMapType* dest_nested_null_map, Int32 limit_value, size_t input_rows_count, RE2::Options* opts) { const char* token_begin = nullptr; const char* token_end = nullptr; @@ -270,7 +271,7 @@ struct ExecuteImpl { const ColumnString& pattern_column, ColumnString& dest_column_string, ColumnArray::Offsets64& dest_offsets, - NullMapType* dest_nested_null_map, Int64 limit_value, + NullMapType* dest_nested_null_map, Int32 limit_value, size_t input_rows_count, RE2::Options* opts) { const char* token_begin = nullptr; const char* token_end = nullptr; @@ -307,7 +308,7 @@ struct ExecuteImpl { const ColumnString& pattern_column, ColumnString& dest_column_string, ColumnArray::Offsets64& dest_offsets, - NullMapType* dest_nested_null_map, Int64 limit_value, + NullMapType* dest_nested_null_map, Int32 limit_value, size_t input_rows_count, RE2::Options* opts) { const char* token_begin = nullptr; const char* token_end = nullptr; diff --git a/be/src/vec/functions/function_string.cpp b/be/src/vec/functions/function_string.cpp index c434d344daa40a..f34d9f91b4d91d 100644 --- a/be/src/vec/functions/function_string.cpp +++ b/be/src/vec/functions/function_string.cpp @@ -25,6 +25,7 @@ #include #include +#include "common/cast_set.h" #include "common/status.h" #include "runtime/string_search.hpp" #include "util/url_coding.h" @@ -38,6 +39,7 @@ #include "vec/functions/simple_function_factory.h" namespace doris::vectorized { +#include "common/compile_check_begin.h" struct NameStringASCII { 
static constexpr auto name = "ascii"; }; @@ -68,7 +70,7 @@ struct NameQuoteImpl { static Status vector(const ColumnString::Chars& data, const ColumnString::Offsets& offsets, ColumnString::Chars& res_data, ColumnString::Offsets& res_offsets) { size_t offset_size = offsets.size(); - size_t pos = 0; + ColumnString::Offset pos = 0; res_offsets.resize(offset_size); res_data.resize(data.size() + offset_size * 2); for (int i = 0; i < offset_size; i++) { @@ -285,9 +287,9 @@ struct StringInStrImpl { /// We check that the entry does not pass through the boundaries of strings. if (pos + rdata.size <= begin + loffsets[i]) { - int loc = pos - begin - loffsets[i - 1]; + int loc = (int)(pos - begin) - loffsets[i - 1]; int l_str_size = loffsets[i] - loffsets[i - 1]; - size_t len = std::min(l_str_size, loc); + auto len = std::min(l_str_size, loc); loc = simd::VStringFunctions::get_char_len((char*)(begin + loffsets[i - 1]), len); res[i] = loc + 1; } @@ -332,7 +334,7 @@ struct StringInStrImpl { // Hive returns positions starting from 1. int loc = search.search(&strl); if (loc > 0) { - size_t len = std::min((size_t)loc, strl.size); + int len = std::min(loc, (int)strl.size); loc = simd::VStringFunctions::get_char_len(strl.data, len); } @@ -489,7 +491,16 @@ struct InitcapImpl { if (!::isalnum(res_data[i])) { need_capitalize = true; } else if (need_capitalize) { - res_data[i] = ::toupper(res_data[i]); + /* + https://en.cppreference.com/w/cpp/string/byte/toupper + Like all other functions from <cctype>, the behavior of std::toupper is undefined if the argument's value is neither representable as unsigned char nor equal to EOF. 
+ To use these functions safely with plain chars (or signed chars), the argument should first be converted to unsigned char: + char my_toupper(char ch) + { + return static_cast<char>(std::toupper(static_cast<unsigned char>(ch))); + } + */ + res_data[i] = static_cast(::toupper(res_data[i])); need_capitalize = false; } } @@ -540,7 +551,8 @@ struct TrimUtil { } res_data.insert_assume_reserved(str_begin, str_end); - res_offsets[i] = res_data.size(); + // The length of the result of the trim function will never exceed the length of the input. + res_offsets[i] = (ColumnString::Offset)res_data.size(); } return Status::OK(); } @@ -606,7 +618,8 @@ struct TrimInUtil { } res_data.insert_assume_reserved(left_trim_pos, right_trim_pos); - res_offsets[i] = res_data.size(); + // The length of the result of the trim function will never exceed the length of the input. + res_offsets[i] = (ColumnString::Offset)res_data.size(); } return Status::OK(); @@ -669,7 +682,8 @@ struct TrimInUtil { } res_data.insert_assume_reserved(left_trim_pos, right_trim_pos); - res_offsets[i] = res_data.size(); + // The length of the result of the trim function will never exceed the length of the input. + res_offsets[i] = (ColumnString::Offset)res_data.size(); } return Status::OK(); } @@ -820,7 +834,7 @@ struct UnHexImpl { return false; } - static int hex_decode(const char* src_str, size_t src_len, char* dst_str) { + static int hex_decode(const char* src_str, ColumnString::Offset src_len, char* dst_str) { // if str length is odd or 0, return empty string like mysql dose. 
if ((src_len & 1) != 0 or src_len == 0) { return 0; @@ -848,7 +862,7 @@ struct UnHexImpl { for (int i = 0; i < rows_count; ++i) { const auto* source = reinterpret_cast(&data[offsets[i - 1]]); - size_t srclen = offsets[i] - offsets[i - 1]; + ColumnString::Offset srclen = offsets[i] - offsets[i - 1]; if (srclen == 0) { StringOP::push_empty_string(i, dst_data, dst_offsets); @@ -898,7 +912,7 @@ struct StringSpace { for (size_t i = 0; i < input_size; ++i) { if (data[i] > 0) [[likely]] { res_data.resize_fill(res_data.size() + data[i], ' '); - res_offsets[i] = res_data.size(); + cast_set(res_offsets[i], res_data.size()); } else { StringOP::push_empty_string(i, res_data, res_offsets); } @@ -962,7 +976,7 @@ struct FromBase64Impl { } const auto* source = reinterpret_cast(&data[offsets[i - 1]]); - size_t srclen = offsets[i] - offsets[i - 1]; + ColumnString::Offset srclen = offsets[i] - offsets[i - 1]; if (srclen == 0) { StringOP::push_empty_string(i, dst_data, dst_offsets); diff --git a/be/src/vec/functions/functions_comparison.h b/be/src/vec/functions/functions_comparison.h index cb56f176c71eb1..2da767fa7ef909 100644 --- a/be/src/vec/functions/functions_comparison.h +++ b/be/src/vec/functions/functions_comparison.h @@ -42,6 +42,7 @@ //#include "olap/rowset/segment_v2/inverted_index_reader.h" namespace doris::vectorized { +#include "common/compile_check_begin.h" /** Comparison functions: ==, !=, <, >, <=, >=. * The comparison functions always return 0 or 1 (UInt8). 
@@ -441,7 +442,7 @@ class FunctionComparison : public IFunction { if (c0_const_string) { c0_const_chars = &c0_const_string->get_chars(); - c0_const_size = c0_const_string->get_data_at(0).size; + c0_const_size = c0_const_string->get_offsets()[0]; } else { return Status::NotSupported("Illegal columns {}, of argument of function {}", c0->get_name(), name); @@ -454,7 +455,7 @@ class FunctionComparison : public IFunction { if (c1_const_string) { c1_const_chars = &c1_const_string->get_chars(); - c1_const_size = c1_const_string->get_data_at(0).size; + c1_const_size = c1_const_string->get_offsets()[0]; } else { return Status::NotSupported("Illegal columns {}, of argument of function {}", c1->get_name(), name); @@ -714,4 +715,5 @@ class FunctionComparison : public IFunction { } }; +#include "common/compile_check_end.h" } // namespace doris::vectorized \ No newline at end of file diff --git a/be/src/vec/functions/functions_multi_string_position.cpp b/be/src/vec/functions/functions_multi_string_position.cpp index 77dccff5eb763e..fdef27bc2be873 100644 --- a/be/src/vec/functions/functions_multi_string_position.cpp +++ b/be/src/vec/functions/functions_multi_string_position.cpp @@ -55,6 +55,7 @@ #include "vec/functions/simple_function_factory.h" namespace doris { +#include "common/compile_check_begin.h" class FunctionContext; } // namespace doris @@ -106,10 +107,11 @@ class FunctionMultiStringPosition : public IFunction { const ColumnConst* col_needles_const = check_and_get_column_const(needles_ptr.get()); - if (!col_needles_const && !col_needles_vector) + if (!col_needles_const && !col_needles_vector) { return Status::InvalidArgument( "function '{}' encountered unsupported needles column, found {}", name, needles_column->get_name()); + } if (col_haystack_const && col_needles_vector) { return Status::InvalidArgument( @@ -219,9 +221,9 @@ struct FunctionMultiSearchAllPositionsImpl { const auto* haystack_end = haystack - prev_haystack_offset + haystack_offsets[haystack_index]; - 
auto ans_now = searcher.search(haystack, haystack_end); + const auto* ans_now = searcher.search(haystack, haystack_end); vec_res[res_index] = - ans_now >= haystack_end ? 0 : std::distance(haystack, ans_now) + 1; + ans_now >= haystack_end ? 0 : (Int32)std::distance(haystack, ans_now) + 1; prev_haystack_offset = haystack_offsets[haystack_index]; } } @@ -296,7 +298,7 @@ struct FunctionMultiSearchAllPositionsImpl { auto ans_now = searcher.search(haystack, haystack_end); vec_res[ans_row_begin + ans_slot_in_row] = - ans_now >= haystack_end ? 0 : std::distance(haystack, ans_now) + 1; + ans_now >= haystack_end ? 0 : (Int32)std::distance(haystack, ans_now) + 1; } prev_haystack_offset = haystack_offsets[haystack_index]; @@ -315,4 +317,5 @@ void register_function_multi_string_position(SimpleFunctionFactory& factory) { factory.register_function(); } +#include "common/compile_check_end.h" } // namespace doris::vectorized