Skip to content

Commit

Permalink
[enhance](parquet) support reading brotli compressed parquet file (ap…
Browse files Browse the repository at this point in the history
…ache#41875)

## Proposed changes

Implement BrotliBlockCompression to decompress brotli-compressed parquet data.
fix parquet case: group0/large_string_map.brotli.parquet
  • Loading branch information
suxiaogang223 authored Oct 21, 2024
1 parent bf949c2 commit ff6d2ea
Show file tree
Hide file tree
Showing 6 changed files with 62 additions and 12 deletions.
37 changes: 30 additions & 7 deletions be/src/util/block_compression.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,34 +28,31 @@
defined(__i386) || defined(_M_IX86)
#include <libdeflate.h>
#endif
#include <brotli/decode.h>
#include <glog/log_severity.h>
#include <glog/logging.h>
#include <limits.h>
#include <lz4/lz4.h>
#include <lz4/lz4frame.h>
#include <lz4/lz4hc.h>
#include <snappy/snappy-sinksource.h>
#include <snappy/snappy.h>
#include <stdint.h>
#include <zconf.h>
#include <zlib.h>
#include <zstd.h>
#include <zstd_errors.h>

#include <algorithm>
#include <cstdint>
#include <limits>
#include <mutex>
#include <new>
#include <ostream>

#include "common/config.h"
#include "common/factory_creator.h"
#include "exec/decompressor.h"
#include "gutil/endian.h"
#include "gutil/strings/substitute.h"
#include "orc/OrcFile.hh"
#include "runtime/thread_context.h"
#include "util/bit_util.h"
#include "util/defer_op.h"
#include "util/faststring.h"

Expand All @@ -74,8 +71,6 @@ uint64_t lzoDecompress(const char* inputAddress, const char* inputLimit, char* o

namespace doris {

using strings::Substitute;

// exception safe
Status BlockCompressionCodec::compress(const std::vector<Slice>& inputs, size_t uncompressed_size,
faststring* output) {
Expand Down Expand Up @@ -1492,6 +1487,31 @@ class LzoBlockCompression final : public BlockCompressionCodec {
}
};

// Block codec for Brotli-compressed data.
// Only decompression is implemented; compress() always fails with
// InvalidArgument and max_compressed_len() always returns 0.
class BrotliBlockCompression final : public BlockCompressionCodec {
public:
    // Returns the shared codec instance, held in a function-local static.
    static BrotliBlockCompression* instance() {
        static BrotliBlockCompression s_instance;
        return &s_instance;
    }

    // Brotli compression is not implemented: `input` and `output` are ignored
    // and InvalidArgument is returned unconditionally.
    Status compress(const Slice& input, faststring* output) override {
        return Status::InvalidArgument("not impl brotli compress.");
    }

    // Returns 0 for every `len`; compress() is not implemented.
    size_t max_compressed_len(size_t len) override { return 0; }

    // Decompresses `input` into the buffer described by `output`.
    //
    // On entry `output->size` is passed to the decoder as the capacity of
    // `output->data`. On success `output->size` is set to the number of bytes
    // the decoder reports as written and OK is returned. On any decoder
    // failure InternalError is returned and `output->size` is left unchanged.
    Status decompress(const Slice& input, Slice* output) override {
        // The size of output buffer is always equal to the uncompressed length.
        // Decode into a local so the caller's slice size is only published on
        // success rather than being overwritten by a failed decode.
        size_t decoded_size = output->size;
        const BrotliDecoderResult result = BrotliDecoderDecompress(
                input.get_size(), reinterpret_cast<const uint8_t*>(input.get_data()),
                &decoded_size, reinterpret_cast<uint8_t*>(output->data));
        if (result != BROTLI_DECODER_RESULT_SUCCESS) {
            // Format the C enum as an explicit int instead of relying on an
            // implicit enum-to-integer conversion in the formatter.
            return Status::InternalError("Brotli decompression failed, result={}",
                                         static_cast<int>(result));
        }
        output->size = decoded_size;
        return Status::OK();
    }
};

Status get_block_compression_codec(segment_v2::CompressionTypePB type,
BlockCompressionCodec** codec) {
switch (type) {
Expand Down Expand Up @@ -1582,6 +1602,9 @@ Status get_block_compression_codec(tparquet::CompressionCodec::type parquet_code
case tparquet::CompressionCodec::LZO:
*codec = LzoBlockCompression::instance();
break;
case tparquet::CompressionCodec::BROTLI:
*codec = BrotliBlockCompression::instance();
break;
default:
return Status::InternalError("unknown compression type({})", parquet_codec);
}
Expand Down
Binary file not shown.
Binary file not shown.
22 changes: 22 additions & 0 deletions regression-test/data/external_table_p0/tvf/test_hdfs_tvf.out
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,28 @@
19 Supplier#000000019 edZT3es,nBFD8lBXTGeTl 24 34-278-310-2731 6150.38 refully final foxes across the dogged theodolites sleep slyly abou
20 Supplier#000000020 iybAE,RmTymrZVYaFZva2SH,j 3 13-715-945-6730 530.82 n, ironic ideas would nag blithely about the slyly regular accounts. silent, expr

-- !parquet_brotli --
1 Supplier#000000001 N kD4on9OM Ipw3,gf0JBoQDd7tgrzrddZ 17 27-918-335-1736 5755.94 each slyly above the careful
2 Supplier#000000002 89eJ5ksX3ImxJQBvxObC, 5 15-679-861-2259 4032.68 slyly bold instructions. idle dependen
3 Supplier#000000003 q1,G3Pj6OjIuUYfUoH18BFTKP5aU9bEV3 1 11-383-516-1199 4192.40 blithely silent requests after the express dependencies are sl
4 Supplier#000000004 Bk7ah4CK8SYQTepEmvMkkgMwg 15 25-843-787-7479 4641.08 riously even requests above the exp
5 Supplier#000000005 Gcdm2rJRzl5qlTVzc 11 21-151-690-3663 -283.84 . slyly regular pinto bea
6 Supplier#000000006 tQxuVm7s7CnK 14 24-696-997-4969 1365.79 final accounts. regular dolphins use against the furiously ironic decoys.
7 Supplier#000000007 s,4TicNGB4uO6PaSqNBUq 23 33-990-965-2201 6820.35 s unwind silently furiously regular courts. final requests are deposits. requests wake quietly blit
8 Supplier#000000008 9Sq4bBH2FQEmaFOocY45sRTxo6yuoG 17 27-498-742-3860 7627.85 al pinto beans. asymptotes haggl
9 Supplier#000000009 1KhUgZegwM3ua7dsYmekYBsK 10 20-403-398-8662 5302.37 s. unusual, even requests along the furiously regular pac
10 Supplier#000000010 Saygah3gYWMp72i PY 24 34-852-489-8585 3891.91 ing waters. regular requests ar
11 Supplier#000000011 JfwTs,LZrV, M,9C 18 28-613-996-1505 3393.08 y ironic packages. slyly ironic accounts affix furiously; ironically unusual excuses across the flu
12 Supplier#000000012 aLIW q0HYd 8 18-179-925-7181 1432.69 al packages nag alongside of the bold instructions. express, daring accounts
13 Supplier#000000013 HK71HQyWoqRWOX8GI FpgAifW,2PoH 3 13-727-620-7813 9107.22 requests engage regularly instructions. furiously special requests ar
14 Supplier#000000014 EXsnO5pTNj4iZRm 15 25-656-247-5058 9189.82 l accounts boost. fluffily bold warhorses wake
15 Supplier#000000015 olXVbNBfVzRqgokr1T,Ie 8 18-453-357-6394 308.56 across the furiously regular platelets wake even deposits. quickly express she
16 Supplier#000000016 YjP5C55zHDXL7LalK27zfQnwejdpin4AMpvh 22 32-822-502-4215 2972.26 ously express ideas haggle quickly dugouts? fu
17 Supplier#000000017 c2d,ESHRSkK3WYnxpgw6aOqN0q 19 29-601-884-9219 1687.81 eep against the furiously bold ideas. fluffily bold packa
18 Supplier#000000018 PGGVE5PWAMwKDZw 16 26-729-551-1115 7040.82 accounts snooze slyly furiously bold
19 Supplier#000000019 edZT3es,nBFD8lBXTGeTl 24 34-278-310-2731 6150.38 refully final foxes across the dogged theodolites sleep slyly abou
20 Supplier#000000020 iybAE,RmTymrZVYaFZva2SH,j 3 13-715-945-6730 530.82 n, ironic ideas would nag blithely about the slyly regular accounts. silent, expr

-- !parquet_decimal256 --
1 99999999999999999999999999999999999999.99999999999999999999999999999999999999
2 -99999999999999999999999999999999999999.99999999999999999999999999999999999999
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -104,13 +104,10 @@ suite("test_hdfs_parquet_group0","external,hive,tvf,external_docker") {


uri = "${defaultFS}" + "/user/doris/tvf_data/test_hdfs_parquet/group0/large_string_map.brotli.parquet"
test {
sql """ select * from HDFS(
order_qt_test_11 """ select count(arr) from HDFS(
"uri" = "${uri}",
"hadoop.username" = "${hdfsUserName}",
"format" = "parquet") limit 10; """
exception "unknown compression type(4)"
}
"format" = "parquet"); """


uri = "${defaultFS}" + "/user/doris/tvf_data/test_hdfs_parquet/group0/non_hadoop_lz4_compressed.parquet"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,14 @@ suite("test_hdfs_tvf","external,hive,tvf,external_docker") {
"uri" = "${uri}",
"hadoop.username" = "${hdfsUserName}",
"format" = "${format}") order by s_suppkey limit 20; """

// test parquet brotli
uri = "${defaultFS}" + "/user/doris/preinstalled_data/hdfs_tvf/test_parquet.brotli.parquet"
format = "parquet"
qt_parquet_brotli """ select * from HDFS(
"uri" = "${uri}",
"hadoop.username" = "${hdfsUserName}",
"format" = "${format}") order by s_suppkey limit 20; """

// test parquet decimal256
uri = "${defaultFS}" + "/user/doris/preinstalled_data/hdfs_tvf/test_parquet_decimal256.parquet"
Expand Down

0 comments on commit ff6d2ea

Please sign in to comment.