From 57986f26860e46d46967ed145d2e84b2a6e26538 Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Tue, 15 Oct 2024 10:33:52 +0000 Subject: [PATCH 1/6] Empty commit From 39e53db6ddf3dd04c8b40d8a21c5465d82a706bc Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Tue, 21 May 2024 01:11:52 -0700 Subject: [PATCH 2/6] Merge pull request #66 from copperybean/release-13.0.0 Check validation of bit offset when reading bit packed values (cherry picked from commit 5cfccd8ea65f33d4517e7409815d761c7650b45d) --- cpp/src/arrow/util/bit_stream_utils.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cpp/src/arrow/util/bit_stream_utils.h b/cpp/src/arrow/util/bit_stream_utils.h index 811694e43b76c..e5a10e48ecc3e 100644 --- a/cpp/src/arrow/util/bit_stream_utils.h +++ b/cpp/src/arrow/util/bit_stream_utils.h @@ -272,6 +272,11 @@ inline void GetValue_(int num_bits, T* v, int max_bytes, const uint8_t* buffer, #pragma warning(push) #pragma warning(disable : 4800) #endif + if (ARROW_PREDICT_FALSE(*bit_offset >= 64)) { + auto msg = std::string("invalid bit offset: ") + std::to_string(*bit_offset); + msg += ", may be malformed num_bits: " + std::to_string(num_bits); + throw std::runtime_error(msg); + } *v = static_cast(bit_util::TrailingBits(*buffered_values, *bit_offset + num_bits) >> *bit_offset); #ifdef _MSC_VER From 45ea0d6b66b4cc12b870d449454196773f318206 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Fri, 3 Nov 2023 14:15:38 +0100 Subject: [PATCH 3/6] Merge pull request #47 from ClickHouse/fix-uninit-value-msan Fix possible use-of-uninitialized-value (cherry picked from commit ba5c67934e8274d649befcffab56731632dc5253) --- cpp/src/parquet/encoding.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 05221568c8fa0..5b567989adf6b 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -2721,7 +2721,7 @@ class DeltaBitPackDecoder 
: public DecoderImpl, virtual public TypedDecoder delta_bit_widths_; - int delta_bit_width_; + int delta_bit_width_ = 0; T last_value_; }; From fb031b0632a00db1e1ec952bfdbd0c1461003c34 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Wed, 9 Aug 2023 09:07:59 +0200 Subject: [PATCH 4/6] Merge pull request #39 from ClickHouse/count-from-record-batch Allow to get number of rows in record batch (cherry picked from commit 1d93838f69a802639ca144ea5704a98e2481810d) --- cpp/src/arrow/ipc/reader.cc | 17 +++++++++++++++++ cpp/src/arrow/ipc/reader.h | 2 ++ 2 files changed, 19 insertions(+) diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index d272c78560f82..a34517c660108 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -1369,6 +1369,23 @@ class RecordBatchFileReaderImpl : public RecordBatchFileReader { return total; } + Result RecordBatchCountRows(int i) override { + DCHECK_GE(i, 0); + DCHECK_LT(i, num_record_batches()); + ARROW_ASSIGN_OR_RAISE(auto outer_message, + ReadMessageFromBlock(GetRecordBatchBlock(i))); + auto metadata = outer_message->metadata(); + const flatbuf::Message* message = nullptr; + RETURN_NOT_OK( + internal::VerifyMessage(metadata->data(), metadata->size(), &message)); + auto batch = message->header_as_RecordBatch(); + if (batch == nullptr) { + return Status::IOError( + "Header-type of flatbuffer-encoded Message is not RecordBatch."); + } + return batch->length(); + } + Status Open(const std::shared_ptr& file, int64_t footer_offset, const IpcReadOptions& options) { owned_file_ = file; diff --git a/cpp/src/arrow/ipc/reader.h b/cpp/src/arrow/ipc/reader.h index 888f59a627771..2e876d65f567f 100644 --- a/cpp/src/arrow/ipc/reader.h +++ b/cpp/src/arrow/ipc/reader.h @@ -203,6 +203,8 @@ class ARROW_EXPORT RecordBatchFileReader /// \brief Computes the total number of rows in the file. 
virtual Result CountRows() = 0; + virtual Result RecordBatchCountRows(int i) = 0; + /// \brief Begin loading metadata for the desired batches into memory. /// /// This method will also begin loading all dictionaries messages into memory. From 21a89cfd65632971cf7d974e3ce9030c5689f85e Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Tue, 1 Mar 2022 14:11:47 +0300 Subject: [PATCH 5/6] Merge pull request #9 from taiyang-li/raw_orc_reader Add interface to get raw orc reader from adapters (cherry picked from commit ce6b7af516cff9b106e0f7b1c30628f18e7a6169) --- cpp/src/arrow/adapters/orc/adapter.cc | 9 +++++++++ cpp/src/arrow/adapters/orc/adapter.h | 13 +++++++++++++ 2 files changed, 22 insertions(+) diff --git a/cpp/src/arrow/adapters/orc/adapter.cc b/cpp/src/arrow/adapters/orc/adapter.cc index 98784450b3cce..14340ac285e28 100644 --- a/cpp/src/arrow/adapters/orc/adapter.cc +++ b/cpp/src/arrow/adapters/orc/adapter.cc @@ -222,6 +222,11 @@ class ORCFileReader::Impl { return Init(); } + virtual liborc::Reader* GetRawORCReader() { + return reader_.get(); + } + + Status Init() { int64_t nstripes = reader_->getNumberOfStripes(); stripes_.resize(static_cast(nstripes)); @@ -548,6 +553,10 @@ class ORCFileReader::Impl { return NextStripeReader(batch_size, empty_vec); } + liborc::Reader* ORCFileReader::GetRawORCReader() { + return impl_->GetRawORCReader(); + } + private: MemoryPool* pool_; std::unique_ptr reader_; diff --git a/cpp/src/arrow/adapters/orc/adapter.h b/cpp/src/arrow/adapters/orc/adapter.h index 4ffff81f355f1..41f53d3474d53 100644 --- a/cpp/src/arrow/adapters/orc/adapter.h +++ b/cpp/src/arrow/adapters/orc/adapter.h @@ -53,6 +53,19 @@ class ARROW_EXPORT ORCFileReader { public: ~ORCFileReader(); + /// \brief Creates a new ORC reader. 
+ /// + /// \param[in] file the data source + /// \param[in] pool a MemoryPool to use for buffer allocations + /// \param[out] reader the returned reader object + /// \return Status + ARROW_DEPRECATED("Deprecated in 6.0.0. Use Result-returning overload instead.") + static Status Open(const std::shared_ptr& file, MemoryPool* pool, + std::unique_ptr* reader); + + /// \brief Get ORC reader from inside. + liborc::Reader* GetRawORCReader(); + /// \brief Creates a new ORC reader /// /// \param[in] file the data source From 4c22e00038fcda585f191ae88b42a087dc956568 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Mon, 21 Mar 2022 12:47:16 +0100 Subject: [PATCH 6/6] Merge pull request #10 from taiyang-li/fix_pr_9 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fix building issue introduced by https://github.com/ClickHouse-Extras… (cherry picked from commit 20dc6ad56d7677645a68dc55bbcde2930bea82dd) --- cpp/src/arrow/adapters/orc/adapter.cc | 9 +++++++++ cpp/src/arrow/adapters/orc/adapter.h | 1 + 2 files changed, 10 insertions(+) diff --git a/cpp/src/arrow/adapters/orc/adapter.cc b/cpp/src/arrow/adapters/orc/adapter.cc index 14340ac285e28..576bd0dada596 100644 --- a/cpp/src/arrow/adapters/orc/adapter.cc +++ b/cpp/src/arrow/adapters/orc/adapter.cc @@ -568,6 +568,15 @@ ORCFileReader::ORCFileReader() { impl_.reset(new ORCFileReader::Impl()); } ORCFileReader::~ORCFileReader() {} +liborc::Reader* ORCFileReader::GetRawORCReader() { + return impl_->GetRawORCReader(); +} + +Status ORCFileReader::Open(const std::shared_ptr& file, + MemoryPool* pool, std::unique_ptr* reader) { + return Open(file, pool).Value(reader); +} + Result> ORCFileReader::Open( const std::shared_ptr& file, MemoryPool* pool) { #ifdef ARROW_ORC_NEED_TIME_ZONE_DATABASE_CHECK diff --git a/cpp/src/arrow/adapters/orc/adapter.h b/cpp/src/arrow/adapters/orc/adapter.h index 41f53d3474d53..6adb3be5ec5b3 100644 --- 
a/cpp/src/arrow/adapters/orc/adapter.h +++ b/cpp/src/arrow/adapters/orc/adapter.h @@ -30,6 +30,7 @@ #include "arrow/type_fwd.h" #include "arrow/util/macros.h" #include "arrow/util/visibility.h" +#include "arrow/adapters/orc/adapter_util.h" namespace arrow { namespace adapters {