From b75f5b8111c0ce9779da6cdb6327bf8569716511 Mon Sep 17 00:00:00 2001 From: chenqi Date: Wed, 24 Jul 2024 16:40:14 +0800 Subject: [PATCH] [Fix](parquet-reader) some fix for #5 (Fix and optimize parquet min-max filtering). --- be/src/vec/exec/format/parquet/parquet_common.h | 10 +++++----- be/src/vec/exec/format/parquet/parquet_pred_cmp.h | 12 +++++++----- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/be/src/vec/exec/format/parquet/parquet_common.h b/be/src/vec/exec/format/parquet/parquet_common.h index 69b16d011ee9ae..48d30a7825dcb5 100644 --- a/be/src/vec/exec/format/parquet/parquet_common.h +++ b/be/src/vec/exec/format/parquet/parquet_common.h @@ -437,16 +437,16 @@ class CorruptStatistics { private: static void warn_parse_error_once(const std::string& createdBy, const std::string_view& msg) { //if (!already_logged.exchange(true)) { - LOG(WARNING) << "Ignoring statistics because created_by could not be parsed (see " - "PARQUET-251)." - " CreatedBy: " - << createdBy << ", msg: " << msg; + LOG(WARNING) << "Ignoring statistics because created_by could not be parsed (see " + "PARQUET-251)." 
+ " CreatedBy: " + << createdBy << ", msg: " << msg; //} } static void warn_once(const std::string_view& msg) { //if (!already_logged.exchange(true)) { - LOG(WARNING) << msg; + LOG(WARNING) << msg; //} } diff --git a/be/src/vec/exec/format/parquet/parquet_pred_cmp.h b/be/src/vec/exec/format/parquet/parquet_pred_cmp.h index 0b66d4ce45d401..fff980a16bdcec 100644 --- a/be/src/vec/exec/format/parquet/parquet_pred_cmp.h +++ b/be/src/vec/exec/format/parquet/parquet_pred_cmp.h @@ -131,6 +131,8 @@ class ParquetPredicate { CppType min_value; CppType max_value; + std::unique_ptr<std::string> encoded_min_copy; + std::unique_ptr<std::string> encoded_max_copy; tparquet::Type::type physical_type = col_schema->physical_type; switch (col_val_range.type()) { #define DISPATCH(REINTERPRET_TYPE, PARQUET_TYPE) \ @@ -186,13 +188,13 @@ class ParquetPredicate { case TYPE_STRING: if constexpr (std::is_same_v<CppType, StringRef>) { if (!use_min_max_value) { - std::string min_copy(encoded_min); - std::string max_copy(encoded_max); - if (!_try_read_old_utf8_stats(min_copy, max_copy)) { + encoded_min_copy = std::make_unique<std::string>(encoded_min); + encoded_max_copy = std::make_unique<std::string>(encoded_max); + if (!_try_read_old_utf8_stats(*encoded_min_copy, *encoded_max_copy)) { return false; } - min_value = StringRef(min_copy); - max_value = StringRef(max_copy); + min_value = StringRef(*encoded_min_copy); + max_value = StringRef(*encoded_max_copy); } else { min_value = StringRef(encoded_min); max_value = StringRef(encoded_max);