From 51459686eaa1a295442d846d7b8c4fba3c23aa77 Mon Sep 17 00:00:00 2001
From: sundyli <543950155@qq.com>
Date: Sun, 11 Dec 2022 23:04:15 -0800
Subject: [PATCH] add bytes_estimate for binary push in parquet deserialize (#1308)

---
 src/io/parquet/read/deserialize/binary/basic.rs |  8 ++++++--
 src/io/parquet/read/deserialize/binary/utils.rs | 10 +++++++++-
 2 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/src/io/parquet/read/deserialize/binary/basic.rs b/src/io/parquet/read/deserialize/binary/basic.rs
index 14c805158f8..a56d36086df 100644
--- a/src/io/parquet/read/deserialize/binary/basic.rs
+++ b/src/io/parquet/read/deserialize/binary/basic.rs
@@ -477,9 +477,13 @@ impl<'a, O: Offset> utils::Decoder<'a> for BinaryDecoder {
 
 pub(super) fn finish>(
     data_type: &DataType,
-    values: Binary,
-    validity: MutableBitmap,
+    mut values: Binary,
+    mut validity: MutableBitmap,
 ) -> Result {
+    values.offsets.shrink_to_fit();
+    values.values.shrink_to_fit();
+    validity.shrink_to_fit();
+
     A::try_new(
         data_type.clone(),
         values.offsets.into(),
diff --git a/src/io/parquet/read/deserialize/binary/utils.rs b/src/io/parquet/read/deserialize/binary/utils.rs
index ddf7abc2a06..9c28c8785ac 100644
--- a/src/io/parquet/read/deserialize/binary/utils.rs
+++ b/src/io/parquet/read/deserialize/binary/utils.rs
@@ -39,12 +39,20 @@ impl Binary {
     pub fn with_capacity(capacity: usize) -> Self {
         Self {
             offsets: Offsets::with_capacity(capacity),
-            values: Vec::with_capacity(capacity * 24),
+            values: Vec::with_capacity(capacity.min(100) * 24),
         }
     }
 
     #[inline]
     pub fn push(&mut self, v: &[u8]) {
+        if self.offsets.len() == 100 && self.offsets.capacity() > 100 {
+            let bytes_per_row = self.values.len() / 100 + 1;
+            let bytes_estimate = bytes_per_row * self.offsets.capacity();
+            if bytes_estimate > self.values.capacity() {
+                self.values.reserve(bytes_estimate - self.values.capacity());
+            }
+        }
+
         self.values.extend(v);
         self.offsets.try_push_usize(v.len()).unwrap()
     }