diff --git a/.gitignore b/.gitignore deleted file mode 100644 index 07827cc..0000000 --- a/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -target/ -.idea/ \ No newline at end of file diff --git a/LICENSE.txt b/LICENSE.txt deleted file mode 100644 index 7225c83..0000000 --- a/LICENSE.txt +++ /dev/null @@ -1,217 +0,0 @@ - - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." 
- - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. 
- - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - --------------------------------------------------------------------------------- - -This product includes code from Apache Avro. - -Copyright: 2014 The Apache Software Foundation. 
-Home page: https://avro.apache.org/ -License: http://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This project includes code from Daniel Lemire's JavaFastPFOR project. The -"Lemire" bit packing source code produced by parquet-generator is derived from -the JavaFastPFOR project. - -Copyright: 2013 Daniel Lemire -Home page: http://lemire.me/en/ -Project page: https://github.com/lemire/JavaFastPFOR -License: Apache License Version 2.0 http://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product includes code from Apache Spark. - -* dev/merge_parquet_pr.py is based on Spark's dev/merge_spark_pr.py - -Copyright: 2014 The Apache Software Foundation. -Home page: https://spark.apache.org/ -License: http://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product includes code from Twitter's ElephantBird project. - -* parquet-hadoop's UnmaterializableRecordCounter.java includes code from - ElephantBird's LzoRecordReader.java - -Copyright: 2012-2014 Twitter -Home page: https://github.com/twitter/elephant-bird -License: http://www.apache.org/licenses/LICENSE-2.0 \ No newline at end of file diff --git a/pom.xml b/pom.xml deleted file mode 100644 index fcad48a..0000000 --- a/pom.xml +++ /dev/null @@ -1,33 +0,0 @@ - - - 4.0.0 - - cn.edu.tsinghua.iginx - parquet-file - 0.0.0 - - - 8 - 8 - UTF-8 - - - - - org.apache.parquet - parquet-hadoop - 1.13.1 - - - - - - local-repo-release - GitHub Release - file://${project.basedir}/maven-repo - - - - \ No newline at end of file diff --git a/src/main/java/org/apache/parquet/io/LocalInputFile.java b/src/main/java/org/apache/parquet/io/LocalInputFile.java deleted file mode 100644 index ee1ba1f..0000000 --- a/src/main/java/org/apache/parquet/io/LocalInputFile.java +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Apache Parquet MR (Incubating) - * Copyright 2014 The Apache Software Foundation - * - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.parquet.io; - -import java.io.IOException; -import java.io.RandomAccessFile; -import java.nio.ByteBuffer; -import java.nio.file.Path; - -/** - * {@code LocalInputFile} is an implementation needed by Parquet to read from local data files using - * {@link org.apache.parquet.io.SeekableInputStream} instances. 
- */ -public class LocalInputFile implements InputFile { - - private final Path path; - private long length = -1; - - public LocalInputFile(Path file) { - path = file; - } - - @Override - public long getLength() throws IOException { - if (length == -1) { - try (RandomAccessFile file = new RandomAccessFile(path.toFile(), "r")) { - length = file.length(); - } - } - return length; - } - - @Override - public SeekableInputStream newStream() throws IOException { - - return new SeekableInputStream() { - - private final RandomAccessFile randomAccessFile = new RandomAccessFile(path.toFile(), "r"); - - @Override - public int read() throws IOException { - return randomAccessFile.read(); - } - - @Override - public long getPos() throws IOException { - return randomAccessFile.getFilePointer(); - } - - @Override - public void seek(long newPos) throws IOException { - randomAccessFile.seek(newPos); - } - - @Override - public void readFully(byte[] bytes) throws IOException { - randomAccessFile.readFully(bytes); - } - - @Override - public void readFully(byte[] bytes, int start, int len) throws IOException { - randomAccessFile.readFully(bytes, start, len); - } - - @Override - public int read(ByteBuffer buf) throws IOException { - byte[] buffer = new byte[buf.remaining()]; - int code = read(buffer); - buf.put(buffer, buf.position() + buf.arrayOffset(), buf.remaining()); - return code; - } - - @Override - public void readFully(ByteBuffer buf) throws IOException { - byte[] buffer = new byte[buf.remaining()]; - readFully(buffer); - buf.put(buffer, buf.position() + buf.arrayOffset(), buf.remaining()); - } - - @Override - public void close() throws IOException { - randomAccessFile.close(); - } - }; - } -} diff --git a/src/main/java/org/apache/parquet/io/LocalOutputFile.java b/src/main/java/org/apache/parquet/io/LocalOutputFile.java deleted file mode 100644 index 0346700..0000000 --- a/src/main/java/org/apache/parquet/io/LocalOutputFile.java +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Apache Parquet MR (Incubating) - * Copyright 2014 The Apache Software Foundation - * - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.parquet.io; - -import java.io.BufferedOutputStream; -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.StandardOpenOption; - -/** - * {@code LocalOutputFile} is an implementation needed by Parquet to write to local data files using - * {@link org.apache.parquet.io.PositionOutputStream} instances. - */ -public class LocalOutputFile implements OutputFile { - - private class LocalPositionOutputStream extends PositionOutputStream { - - private final BufferedOutputStream stream; - private long pos = 0; - - public LocalPositionOutputStream(int buffer, StandardOpenOption... 
openOption) - throws IOException { - stream = new BufferedOutputStream(Files.newOutputStream(path, openOption), buffer); - } - - @Override - public long getPos() { - return pos; - } - - @Override - public void write(int data) throws IOException { - pos++; - stream.write(data); - } - - @Override - public void write(byte[] data) throws IOException { - pos += data.length; - stream.write(data); - } - - @Override - public void write(byte[] data, int off, int len) throws IOException { - pos += len; - stream.write(data, off, len); - } - - @Override - public void flush() throws IOException { - stream.flush(); - } - - @Override - public void close() throws IOException { - stream.close(); - } - } - - private final Path path; - - public LocalOutputFile(Path file) { - path = file; - } - - @Override - public PositionOutputStream create(long buffer) throws IOException { - return new LocalPositionOutputStream((int) buffer, StandardOpenOption.CREATE_NEW); - } - - @Override - public PositionOutputStream createOrOverwrite(long buffer) throws IOException { - return new LocalPositionOutputStream( - (int) buffer, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING); - } - - @Override - public boolean supportsBlockSize() { - return true; - } - - @Override - public long defaultBlockSize() { - return 512; - } - - @Override - public String getPath() { - return path.toString(); - } -} diff --git a/src/main/java/org/apache/parquet/io/api/RecordDematerializer.java b/src/main/java/org/apache/parquet/io/api/RecordDematerializer.java deleted file mode 100644 index eca2dfb..0000000 --- a/src/main/java/org/apache/parquet/io/api/RecordDematerializer.java +++ /dev/null @@ -1,31 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.parquet.io.api; - -public abstract class RecordDematerializer<T> { - - public abstract void setRecordConsumer(RecordConsumer recordConsumer); - - /** - * called once per record - * - * @param record one record to write to the previously provided record consumer - */ - public abstract void write(T record); -} diff --git a/src/main/java/org/apache/parquet/local/BloomFilterReader.java b/src/main/java/org/apache/parquet/local/BloomFilterReader.java deleted file mode 100644 index ea9cd2c..0000000 --- a/src/main/java/org/apache/parquet/local/BloomFilterReader.java +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License.
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.parquet.local; - -import org.apache.parquet.column.values.bloomfilter.BloomFilter; -import org.apache.parquet.hadoop.metadata.BlockMetaData; -import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; -import org.apache.parquet.hadoop.metadata.ColumnPath; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.util.HashMap; -import java.util.Map; - -/** Bloom filter reader that reads Bloom filter data from an open {@link ParquetFileReader}. */ -public class BloomFilterReader { - private final ParquetFileReader reader; - private final Map<ColumnPath, ColumnChunkMetaData> columns; - private final Map<ColumnPath, BloomFilter> cache = new HashMap<>(); - private Logger logger = LoggerFactory.getLogger(BloomFilterReader.class); - - public BloomFilterReader(ParquetFileReader fileReader, BlockMetaData block) { - this.reader = fileReader; - this.columns = new HashMap<>(); - for (ColumnChunkMetaData column : block.getColumns()) { - columns.put(column.getPath(), column); - } - } - - public BloomFilter readBloomFilter(ColumnChunkMetaData meta) { - if (cache.containsKey(meta.getPath())) { - return cache.get(meta.getPath()); - } - try { - if (!cache.containsKey(meta.getPath())) { - BloomFilter bloomFilter = reader.readBloomFilter(meta); - if (bloomFilter == null) { - return null; - } - - cache.put(meta.getPath(), bloomFilter); - } - return cache.get(meta.getPath()); - } catch (IOException e) { - logger.error("Failed to read Bloom filter data", e); - } - - return null; - } -} diff --git a/src/main/java/org/apache/parquet/local/CodecFactory.java b/src/main/java/org/apache/parquet/local/CodecFactory.java deleted file mode 100644 index 30ad3e2..0000000 --- a/src/main/java/org/apache/parquet/local/CodecFactory.java +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License.
- */ -package org.apache.parquet.local; - -import org.apache.parquet.compression.CompressionCodecFactory; -import org.apache.parquet.hadoop.metadata.CompressionCodecName; -import org.apache.parquet.local.codec.NoopBytesInputCompressor; -import org.apache.parquet.local.codec.NoopBytesInputDecompressor; -import org.apache.parquet.local.codec.SnappyBytesInputCompressor; -import org.apache.parquet.local.codec.SnappyBytesInputDecompressor; - -import java.util.HashMap; -import java.util.Map; - -public class CodecFactory implements CompressionCodecFactory { - - private final Map compressors = new HashMap<>(); - private final Map decompressors = new HashMap<>(); - - @Override - public BytesInputCompressor getCompressor(CompressionCodecName codecName) { - return createCompressor(codecName); - } - - @Override - public BytesInputDecompressor getDecompressor(CompressionCodecName codecName) { - return decompressors.computeIfAbsent(codecName, this::createDecompressor); - } - - protected BytesInputCompressor createCompressor(CompressionCodecName codecName) { - switch (codecName) { - case UNCOMPRESSED: - return new NoopBytesInputCompressor(); - case SNAPPY: - return new SnappyBytesInputCompressor(); - default: - throw new IllegalArgumentException("Unimplemented codec: " + codecName); - } - } - - protected BytesInputDecompressor createDecompressor(CompressionCodecName codecName) { - switch (codecName) { - case UNCOMPRESSED: - return new NoopBytesInputDecompressor(); - case SNAPPY: - return new SnappyBytesInputDecompressor(); - default: - throw new IllegalArgumentException("Unimplemented codec: " + codecName); - } - } - - @Override - public void release() { - for (BytesInputCompressor compressor : compressors.values()) { - compressor.release(); - } - compressors.clear(); - for (BytesInputDecompressor decompressor : decompressors.values()) { - decompressor.release(); - } - decompressors.clear(); - } -} diff --git a/src/main/java/org/apache/parquet/local/ColumnChunkPageReadStore.java b/src/main/java/org/apache/parquet/local/ColumnChunkPageReadStore.java deleted file mode 100644 index 8b23d75..0000000 --- a/src/main/java/org/apache/parquet/local/ColumnChunkPageReadStore.java +++ /dev/null @@ -1,314 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.parquet.local; - -import org.apache.parquet.bytes.BytesInput; -import org.apache.parquet.column.ColumnDescriptor; -import org.apache.parquet.column.page.*; -import org.apache.parquet.compression.CompressionCodecFactory.BytesInputDecompressor; -import org.apache.parquet.crypto.AesCipher; -import org.apache.parquet.crypto.ModuleCipherFactory.ModuleType; -import org.apache.parquet.format.BlockCipher; -import org.apache.parquet.internal.column.columnindex.OffsetIndex; -import org.apache.parquet.internal.filter2.columnindex.RowRanges; -import org.apache.parquet.io.ParquetDecodingException; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.util.*; - -class ColumnChunkPageReadStore implements PageReadStore, DictionaryPageReadStore { - private static final Logger LOG = LoggerFactory.getLogger(ColumnChunkPageReadStore.class); - - /** - * PageReader for a single column chunk. A column chunk contains several pages, which are yielded - * one by one in order. - * - *
<p>
This implementation is provided with a list of pages, each of which is decompressed and - * passed through. - */ - static final class ColumnChunkPageReader implements PageReader { - - private final BytesInputDecompressor decompressor; - private final long valueCount; - private final Queue compressedPages; - private final DictionaryPage compressedDictionaryPage; - // null means no page synchronization is required; firstRowIndex will not be returned by the - // pages - private final OffsetIndex offsetIndex; - private final long rowCount; - private int pageIndex = 0; - - private final BlockCipher.Decryptor blockDecryptor; - private final byte[] dataPageAAD; - private final byte[] dictionaryPageAAD; - - ColumnChunkPageReader( - BytesInputDecompressor decompressor, - List compressedPages, - DictionaryPage compressedDictionaryPage, - OffsetIndex offsetIndex, - long rowCount, - BlockCipher.Decryptor blockDecryptor, - byte[] fileAAD, - int rowGroupOrdinal, - int columnOrdinal) { - this.decompressor = decompressor; - this.compressedPages = new ArrayDeque(compressedPages); - this.compressedDictionaryPage = compressedDictionaryPage; - long count = 0; - for (DataPage p : compressedPages) { - count += p.getValueCount(); - } - this.valueCount = count; - this.offsetIndex = offsetIndex; - this.rowCount = rowCount; - - this.blockDecryptor = blockDecryptor; - - if (null != blockDecryptor) { - dataPageAAD = - AesCipher.createModuleAAD( - fileAAD, ModuleType.DataPage, rowGroupOrdinal, columnOrdinal, 0); - dictionaryPageAAD = - AesCipher.createModuleAAD( - fileAAD, ModuleType.DictionaryPage, rowGroupOrdinal, columnOrdinal, -1); - } else { - dataPageAAD = null; - dictionaryPageAAD = null; - } - } - - private int getPageOrdinal(int currentPageIndex) { - if (null == offsetIndex) { - return currentPageIndex; - } - - return offsetIndex.getPageOrdinal(currentPageIndex); - } - - @Override - public long getTotalValueCount() { - return valueCount; - } - - @Override - public DataPage readPage() { - final DataPage compressedPage = compressedPages.poll(); - if (compressedPage == null) { - return null; - } - final int currentPageIndex = pageIndex++; - - if (null != blockDecryptor) { - AesCipher.quickUpdatePageAAD(dataPageAAD, getPageOrdinal(currentPageIndex)); - } - - return compressedPage.accept( - new DataPage.Visitor() { - @Override - public DataPage visit(DataPageV1 dataPageV1) { - try { - BytesInput bytes = dataPageV1.getBytes(); - if (null != blockDecryptor) { - bytes = BytesInput.from(blockDecryptor.decrypt(bytes.toByteArray(), dataPageAAD)); - } - BytesInput decompressed = - decompressor.decompress(bytes, dataPageV1.getUncompressedSize()); - - final DataPageV1 decompressedPage; - if (offsetIndex == null) { - decompressedPage = - new DataPageV1( - decompressed, - dataPageV1.getValueCount(), - dataPageV1.getUncompressedSize(), - dataPageV1.getStatistics(), - dataPageV1.getRlEncoding(), - dataPageV1.getDlEncoding(), - dataPageV1.getValueEncoding()); - } else { - long firstRowIndex = offsetIndex.getFirstRowIndex(currentPageIndex); - decompressedPage = - new DataPageV1( - decompressed, - dataPageV1.getValueCount(), - dataPageV1.getUncompressedSize(), - firstRowIndex, - Math.toIntExact( - offsetIndex.getLastRowIndex(currentPageIndex, rowCount) - - firstRowIndex - + 1), - dataPageV1.getStatistics(), - dataPageV1.getRlEncoding(), - dataPageV1.getDlEncoding(), - dataPageV1.getValueEncoding()); - } - if (dataPageV1.getCrc().isPresent()) { - decompressedPage.setCrc(dataPageV1.getCrc().getAsInt()); - } - return 
decompressedPage; - } catch (IOException e) { - throw new ParquetDecodingException("could not decompress page", e); - } - } - - @Override - public DataPage visit(DataPageV2 dataPageV2) { - if (!dataPageV2.isCompressed() && offsetIndex == null && null == blockDecryptor) { - return dataPageV2; - } - BytesInput pageBytes = dataPageV2.getData(); - - if (null != blockDecryptor) { - try { - pageBytes = - BytesInput.from(blockDecryptor.decrypt(pageBytes.toByteArray(), dataPageAAD)); - } catch (IOException e) { - throw new ParquetDecodingException( - "could not convert page ByteInput to byte array", e); - } - } - if (dataPageV2.isCompressed()) { - int uncompressedSize = - Math.toIntExact( - dataPageV2.getUncompressedSize() - - dataPageV2.getDefinitionLevels().size() - - dataPageV2.getRepetitionLevels().size()); - try { - pageBytes = decompressor.decompress(pageBytes, uncompressedSize); - } catch (IOException e) { - throw new ParquetDecodingException("could not decompress page", e); - } - } - - if (offsetIndex == null) { - return DataPageV2.uncompressed( - dataPageV2.getRowCount(), - dataPageV2.getNullCount(), - dataPageV2.getValueCount(), - dataPageV2.getRepetitionLevels(), - dataPageV2.getDefinitionLevels(), - dataPageV2.getDataEncoding(), - pageBytes, - dataPageV2.getStatistics()); - } else { - return DataPageV2.uncompressed( - dataPageV2.getRowCount(), - dataPageV2.getNullCount(), - dataPageV2.getValueCount(), - offsetIndex.getFirstRowIndex(currentPageIndex), - dataPageV2.getRepetitionLevels(), - dataPageV2.getDefinitionLevels(), - dataPageV2.getDataEncoding(), - pageBytes, - dataPageV2.getStatistics()); - } - } - }); - } - - @Override - public DictionaryPage readDictionaryPage() { - if (compressedDictionaryPage == null) { - return null; - } - try { - BytesInput bytes = compressedDictionaryPage.getBytes(); - if (null != blockDecryptor) { - bytes = BytesInput.from(blockDecryptor.decrypt(bytes.toByteArray(), dictionaryPageAAD)); - } - DictionaryPage decompressedPage = - new DictionaryPage( - decompressor.decompress(bytes, compressedDictionaryPage.getUncompressedSize()), - compressedDictionaryPage.getDictionarySize(), - compressedDictionaryPage.getEncoding()); - if (compressedDictionaryPage.getCrc().isPresent()) { - decompressedPage.setCrc(compressedDictionaryPage.getCrc().getAsInt()); - } - return decompressedPage; - } catch (IOException e) { - throw new ParquetDecodingException("Could not decompress dictionary page", e); - } - } - } - - private final Map readers = - new HashMap(); - private final long rowCount; - private final long rowIndexOffset; - private final RowRanges rowRanges; - - public ColumnChunkPageReadStore(long rowCount) { - this(rowCount, -1); - } - - ColumnChunkPageReadStore(RowRanges rowRanges) { - this(rowRanges, -1); - } - - ColumnChunkPageReadStore(long rowCount, long rowIndexOffset) { - this.rowCount = rowCount; - this.rowIndexOffset = rowIndexOffset; - rowRanges = null; - } - - ColumnChunkPageReadStore(RowRanges rowRanges, long rowIndexOffset) { - this.rowRanges = rowRanges; - this.rowIndexOffset = rowIndexOffset; - rowCount = rowRanges.rowCount(); - } - - @Override - public long getRowCount() { - return rowCount; - } - - @Override - public Optional getRowIndexOffset() { - return rowIndexOffset < 0 ? 
Optional.empty() : Optional.of(rowIndexOffset); - } - - @Override - public PageReader getPageReader(ColumnDescriptor path) { - final PageReader pageReader = readers.get(path); - if (pageReader == null) { - throw new IllegalArgumentException( - path + " is not in the store: " + readers.keySet() + " " + rowCount); - } - return pageReader; - } - - @Override - public DictionaryPage readDictionaryPage(ColumnDescriptor descriptor) { - return readers.get(descriptor).readDictionaryPage(); - } - - @Override - public Optional getRowIndexes() { - return rowRanges == null ? Optional.empty() : Optional.of(rowRanges.iterator()); - } - - void addColumn(ColumnDescriptor path, ColumnChunkPageReader reader) { - if (readers.put(path, reader) != null) { - throw new RuntimeException(path + " was added twice"); - } - } -} diff --git a/src/main/java/org/apache/parquet/local/ColumnChunkPageWriteStore.java b/src/main/java/org/apache/parquet/local/ColumnChunkPageWriteStore.java deleted file mode 100644 index ce9a4e6..0000000 --- a/src/main/java/org/apache/parquet/local/ColumnChunkPageWriteStore.java +++ /dev/null @@ -1,501 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.parquet.local; - -import org.apache.parquet.bytes.ByteBufferAllocator; -import org.apache.parquet.bytes.BytesInput; -import org.apache.parquet.bytes.ConcatenatingByteArrayCollector; -import org.apache.parquet.column.ColumnDescriptor; -import org.apache.parquet.column.Encoding; -import org.apache.parquet.column.page.DictionaryPage; -import org.apache.parquet.column.page.PageWriteStore; -import org.apache.parquet.column.page.PageWriter; -import org.apache.parquet.column.statistics.Statistics; -import org.apache.parquet.column.values.bloomfilter.BloomFilter; -import org.apache.parquet.column.values.bloomfilter.BloomFilterWriteStore; -import org.apache.parquet.column.values.bloomfilter.BloomFilterWriter; -import org.apache.parquet.compression.CompressionCodecFactory; -import org.apache.parquet.crypto.AesCipher; -import org.apache.parquet.crypto.InternalColumnEncryptionSetup; -import org.apache.parquet.crypto.InternalFileEncryptor; -import org.apache.parquet.crypto.ModuleCipherFactory.ModuleType; -import org.apache.parquet.format.BlockCipher; -import org.apache.parquet.hadoop.metadata.ColumnPath; -import org.apache.parquet.internal.column.columnindex.ColumnIndexBuilder; -import org.apache.parquet.internal.column.columnindex.OffsetIndexBuilder; -import org.apache.parquet.io.ParquetEncodingException; -import org.apache.parquet.schema.MessageType; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.ByteArrayOutputStream; -import java.io.IOException; -import java.util.*; -import java.util.zip.CRC32; - -class ColumnChunkPageWriteStore implements PageWriteStore, BloomFilterWriteStore { - private static final Logger LOG = LoggerFactory.getLogger(ColumnChunkPageWriteStore.class); - - private static final ParquetMetadataConverter parquetMetadataConverter = - new ParquetMetadataConverter(); - - private static final class ColumnChunkPageWriter implements PageWriter, BloomFilterWriter { - - private final ColumnDescriptor path; - private final CompressionCodecFactory.BytesInputCompressor compressor; - - private final ByteArrayOutputStream tempOutputStream = new ByteArrayOutputStream(); - private final ConcatenatingByteArrayCollector buf; - private DictionaryPage dictionaryPage; - - private long uncompressedLength; - private long compressedLength; - private long totalValueCount; - private int pageCount; - - // repetition and definition level encodings are used only for v1 pages and don't change - private final Set rlEncodings = new HashSet<>(); - private final Set dlEncodings = new HashSet<>(); - private final List dataEncodings = new ArrayList<>(); - - private BloomFilter bloomFilter; - private ColumnIndexBuilder columnIndexBuilder; - private OffsetIndexBuilder offsetIndexBuilder; - private Statistics totalStatistics; - - private final CRC32 crc; - boolean pageWriteChecksumEnabled; - - private final BlockCipher.Encryptor headerBlockEncryptor; - private final BlockCipher.Encryptor pageBlockEncryptor; - private final int rowGroupOrdinal; - private final int columnOrdinal; - private int pageOrdinal; - private final byte[] dataPageAAD; - private final byte[] dataPageHeaderAAD; - private final byte[] fileAAD; - - private ColumnChunkPageWriter( - ColumnDescriptor path, - CompressionCodecFactory.BytesInputCompressor compressor, - ByteBufferAllocator allocator, - int columnIndexTruncateLength, - boolean pageWriteChecksumEnabled, - BlockCipher.Encryptor headerBlockEncryptor, - BlockCipher.Encryptor pageBlockEncryptor, - byte[] fileAAD, - int rowGroupOrdinal, - int 
columnOrdinal) { - this.path = path; - this.compressor = compressor; - this.buf = new ConcatenatingByteArrayCollector(); - this.columnIndexBuilder = - ColumnIndexBuilder.getBuilder(path.getPrimitiveType(), columnIndexTruncateLength); - this.offsetIndexBuilder = OffsetIndexBuilder.getBuilder(); - this.pageWriteChecksumEnabled = pageWriteChecksumEnabled; - this.crc = pageWriteChecksumEnabled ? new CRC32() : null; - - this.headerBlockEncryptor = headerBlockEncryptor; - this.pageBlockEncryptor = pageBlockEncryptor; - this.fileAAD = fileAAD; - this.rowGroupOrdinal = rowGroupOrdinal; - this.columnOrdinal = columnOrdinal; - this.pageOrdinal = -1; - if (null != headerBlockEncryptor) { - dataPageHeaderAAD = - AesCipher.createModuleAAD( - fileAAD, ModuleType.DataPageHeader, rowGroupOrdinal, columnOrdinal, 0); - } else { - dataPageHeaderAAD = null; - } - if (null != pageBlockEncryptor) { - dataPageAAD = - AesCipher.createModuleAAD( - fileAAD, ModuleType.DataPage, rowGroupOrdinal, columnOrdinal, 0); - } else { - dataPageAAD = null; - } - } - - @Override - @Deprecated - public void writePage( - BytesInput bytesInput, - int valueCount, - Statistics statistics, - Encoding rlEncoding, - Encoding dlEncoding, - Encoding valuesEncoding) - throws IOException { - // Setting the builders to the no-op ones so no column/offset indexes will be written for this - // column chunk - columnIndexBuilder = ColumnIndexBuilder.getNoOpBuilder(); - offsetIndexBuilder = OffsetIndexBuilder.getNoOpBuilder(); - - writePage(bytesInput, valueCount, -1, statistics, rlEncoding, dlEncoding, valuesEncoding); - } - - @Override - public void writePage( - BytesInput bytes, - int valueCount, - int rowCount, - Statistics statistics, - Encoding rlEncoding, - Encoding dlEncoding, - Encoding valuesEncoding) - throws IOException { - pageOrdinal++; - long uncompressedSize = bytes.size(); - if (uncompressedSize > Integer.MAX_VALUE || uncompressedSize < 0) { - throw new ParquetEncodingException( - "Cannot write page larger than Integer.MAX_VALUE or negative bytes: " - + uncompressedSize); - } - BytesInput compressedBytes = compressor.compress(bytes); - if (null != pageBlockEncryptor) { - AesCipher.quickUpdatePageAAD(dataPageAAD, pageOrdinal); - compressedBytes = - BytesInput.from(pageBlockEncryptor.encrypt(compressedBytes.toByteArray(), dataPageAAD)); - } - long compressedSize = compressedBytes.size(); - if (compressedSize > Integer.MAX_VALUE) { - throw new ParquetEncodingException( - "Cannot write compressed page larger than Integer.MAX_VALUE bytes: " + compressedSize); - } - tempOutputStream.reset(); - if (null != headerBlockEncryptor) { - AesCipher.quickUpdatePageAAD(dataPageHeaderAAD, pageOrdinal); - } - if (pageWriteChecksumEnabled) { - crc.reset(); - crc.update(compressedBytes.toByteArray()); - parquetMetadataConverter.writeDataPageV1Header( - (int) uncompressedSize, - (int) compressedSize, - valueCount, - rlEncoding, - dlEncoding, - valuesEncoding, - (int) crc.getValue(), - tempOutputStream, - headerBlockEncryptor, - dataPageHeaderAAD); - } else { - parquetMetadataConverter.writeDataPageV1Header( - (int) uncompressedSize, - (int) compressedSize, - valueCount, - rlEncoding, - dlEncoding, - valuesEncoding, - tempOutputStream, - headerBlockEncryptor, - dataPageHeaderAAD); - } - this.uncompressedLength += uncompressedSize; - this.compressedLength += compressedSize; - this.totalValueCount += valueCount; - this.pageCount += 1; - - // Copying the statistics if it is not initialized yet so we have the correct typed one - if (totalStatistics == 
null) { - totalStatistics = statistics.copy(); - } else { - totalStatistics.mergeStatistics(statistics); - } - - columnIndexBuilder.add(statistics); - offsetIndexBuilder.add(toIntWithCheck(tempOutputStream.size() + compressedSize), rowCount); - - // by concatenating before collecting instead of collecting twice, - // we only allocate one buffer to copy into instead of multiple. - buf.collect(BytesInput.concat(BytesInput.from(tempOutputStream), compressedBytes)); - rlEncodings.add(rlEncoding); - dlEncodings.add(dlEncoding); - dataEncodings.add(valuesEncoding); - } - - @Override - public void writePageV2( - int rowCount, - int nullCount, - int valueCount, - BytesInput repetitionLevels, - BytesInput definitionLevels, - Encoding dataEncoding, - BytesInput data, - Statistics statistics) - throws IOException { - pageOrdinal++; - - int rlByteLength = toIntWithCheck(repetitionLevels.size()); - int dlByteLength = toIntWithCheck(definitionLevels.size()); - int uncompressedSize = - toIntWithCheck(data.size() + repetitionLevels.size() + definitionLevels.size()); - // TODO: decide if we compress - BytesInput compressedData = compressor.compress(data); - if (null != pageBlockEncryptor) { - AesCipher.quickUpdatePageAAD(dataPageAAD, pageOrdinal); - compressedData = - BytesInput.from(pageBlockEncryptor.encrypt(compressedData.toByteArray(), dataPageAAD)); - } - int compressedSize = - toIntWithCheck(compressedData.size() + repetitionLevels.size() + definitionLevels.size()); - tempOutputStream.reset(); - if (null != headerBlockEncryptor) { - AesCipher.quickUpdatePageAAD(dataPageHeaderAAD, pageOrdinal); - } - parquetMetadataConverter.writeDataPageV2Header( - uncompressedSize, - compressedSize, - valueCount, - nullCount, - rowCount, - dataEncoding, - rlByteLength, - dlByteLength, - tempOutputStream, - headerBlockEncryptor, - dataPageHeaderAAD); - this.uncompressedLength += uncompressedSize; - this.compressedLength += compressedSize; - this.totalValueCount += valueCount; - this.pageCount += 1; - - // Copying the statistics if it is not initialized yet so we have the correct typed one - if (totalStatistics == null) { - totalStatistics = statistics.copy(); - } else { - totalStatistics.mergeStatistics(statistics); - } - - columnIndexBuilder.add(statistics); - offsetIndexBuilder.add( - toIntWithCheck((long) tempOutputStream.size() + compressedSize), rowCount); - - // by concatenating before collecting instead of collecting twice, - // we only allocate one buffer to copy into instead of multiple. 
- buf.collect( - BytesInput.concat( - BytesInput.from(tempOutputStream), - repetitionLevels, - definitionLevels, - compressedData)); - dataEncodings.add(dataEncoding); - } - - private int toIntWithCheck(long size) { - if (size > Integer.MAX_VALUE) { - throw new ParquetEncodingException( - "Cannot write page larger than " + Integer.MAX_VALUE + " bytes: " + size); - } - return (int) size; - } - - @Override - public long getMemSize() { - return buf.size(); - } - - public void writeToFileWriter(ParquetFileWriter writer) throws IOException { - if (null == headerBlockEncryptor) { - writer.writeColumnChunk( - path, - totalValueCount, - compressor.getCodecName(), - dictionaryPage, - buf, - uncompressedLength, - compressedLength, - totalStatistics, - columnIndexBuilder, - offsetIndexBuilder, - bloomFilter, - rlEncodings, - dlEncodings, - dataEncodings); - } else { - writer.writeColumnChunk( - path, - totalValueCount, - compressor.getCodecName(), - dictionaryPage, - buf, - uncompressedLength, - compressedLength, - totalStatistics, - columnIndexBuilder, - offsetIndexBuilder, - bloomFilter, - rlEncodings, - dlEncodings, - dataEncodings, - headerBlockEncryptor, - rowGroupOrdinal, - columnOrdinal, - fileAAD); - } - if (LOG.isDebugEnabled()) { - LOG.debug( - String.format( - "written %,dB for %s: %,d values, %,dB raw, %,dB comp, %d pages, encodings: %s", - buf.size(), - path, - totalValueCount, - uncompressedLength, - compressedLength, - pageCount, - new HashSet(dataEncodings)) - + (dictionaryPage != null - ? String.format( - ", dic { %,d entries, %,dB raw, %,dB comp}", - dictionaryPage.getDictionarySize(), - dictionaryPage.getUncompressedSize(), - dictionaryPage.getDictionarySize()) - : "")); - } - rlEncodings.clear(); - dlEncodings.clear(); - dataEncodings.clear(); - pageCount = 0; - pageOrdinal = -1; - } - - @Override - public long allocatedSize() { - return buf.size(); - } - - @Override - public void writeDictionaryPage(DictionaryPage dictionaryPage) throws IOException { - if (this.dictionaryPage != null) { - throw new ParquetEncodingException("Only one dictionary page is allowed"); - } - BytesInput dictionaryBytes = dictionaryPage.getBytes(); - int uncompressedSize = (int) dictionaryBytes.size(); - BytesInput compressedBytes = compressor.compress(dictionaryBytes); - if (null != pageBlockEncryptor) { - byte[] dictonaryPageAAD = - AesCipher.createModuleAAD( - fileAAD, ModuleType.DictionaryPage, rowGroupOrdinal, columnOrdinal, -1); - compressedBytes = - BytesInput.from( - pageBlockEncryptor.encrypt(compressedBytes.toByteArray(), dictonaryPageAAD)); - } - this.dictionaryPage = - new DictionaryPage( - BytesInput.copy(compressedBytes), - uncompressedSize, - dictionaryPage.getDictionarySize(), - dictionaryPage.getEncoding()); - } - - @Override - public String memUsageString(String prefix) { - return buf.memUsageString(prefix + " ColumnChunkPageWriter"); - } - - @Override - public void writeBloomFilter(BloomFilter bloomFilter) { - this.bloomFilter = bloomFilter; - } - } - - private final Map writers = - new HashMap(); - private final MessageType schema; - - public ColumnChunkPageWriteStore( - CompressionCodecFactory.BytesInputCompressor compressor, - MessageType schema, - ByteBufferAllocator allocator, - int columnIndexTruncateLength, - boolean pageWriteChecksumEnabled, - InternalFileEncryptor fileEncryptor, - int rowGroupOrdinal) { - this.schema = schema; - if (null == fileEncryptor) { - for (ColumnDescriptor path : schema.getColumns()) { - writers.put( - path, - new ColumnChunkPageWriter( - path, - 
compressor, - allocator, - columnIndexTruncateLength, - pageWriteChecksumEnabled, - null, - null, - null, - -1, - -1)); - } - return; - } - - // Encrypted file - int columnOrdinal = -1; - byte[] fileAAD = fileEncryptor.getFileAAD(); - for (ColumnDescriptor path : schema.getColumns()) { - columnOrdinal++; - BlockCipher.Encryptor headerBlockEncryptor = null; - BlockCipher.Encryptor pageBlockEncryptor = null; - ColumnPath columnPath = ColumnPath.get(path.getPath()); - - InternalColumnEncryptionSetup columnSetup = - fileEncryptor.getColumnSetup(columnPath, true, columnOrdinal); - if (columnSetup.isEncrypted()) { - headerBlockEncryptor = columnSetup.getMetaDataEncryptor(); - pageBlockEncryptor = columnSetup.getDataEncryptor(); - } - - writers.put( - path, - new ColumnChunkPageWriter( - path, - compressor, - allocator, - columnIndexTruncateLength, - pageWriteChecksumEnabled, - headerBlockEncryptor, - pageBlockEncryptor, - fileAAD, - rowGroupOrdinal, - columnOrdinal)); - } - } - - @Override - public PageWriter getPageWriter(ColumnDescriptor path) { - return writers.get(path); - } - - @Override - public BloomFilterWriter getBloomFilterWriter(ColumnDescriptor path) { - return writers.get(path); - } - - public void flushToFileWriter(ParquetFileWriter writer) throws IOException { - for (ColumnDescriptor path : schema.getColumns()) { - ColumnChunkPageWriter pageWriter = writers.get(path); - pageWriter.writeToFileWriter(writer); - } - } -} diff --git a/src/main/java/org/apache/parquet/local/ColumnIndexFilterUtils.java b/src/main/java/org/apache/parquet/local/ColumnIndexFilterUtils.java deleted file mode 100644 index b83b0e4..0000000 --- a/src/main/java/org/apache/parquet/local/ColumnIndexFilterUtils.java +++ /dev/null @@ -1,164 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.parquet.local; - -import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; -import org.apache.parquet.internal.column.columnindex.OffsetIndex; -import org.apache.parquet.internal.filter2.columnindex.RowRanges; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Formatter; -import java.util.List; - -/** Internal utility class to help at column index based filtering. 
*/ -class ColumnIndexFilterUtils { - static class OffsetRange { - private final long offset; - private long length; - - private OffsetRange(long offset, int length) { - this.offset = offset; - this.length = length; - } - - long getOffset() { - return offset; - } - - long getLength() { - return length; - } - - private boolean extend(long offset, int length) { - if (this.offset + this.length == offset) { - this.length += length; - return true; - } else { - return false; - } - } - } - - private static class FilteredOffsetIndex implements OffsetIndex { - private final OffsetIndex offsetIndex; - private final int[] indexMap; - - private FilteredOffsetIndex(OffsetIndex offsetIndex, int[] indexMap) { - this.offsetIndex = offsetIndex; - this.indexMap = indexMap; - } - - @Override - public int getPageOrdinal(int pageIndex) { - return indexMap[pageIndex]; - } - - @Override - public int getPageCount() { - return indexMap.length; - } - - @Override - public long getOffset(int pageIndex) { - return offsetIndex.getOffset(indexMap[pageIndex]); - } - - @Override - public int getCompressedPageSize(int pageIndex) { - return offsetIndex.getCompressedPageSize(indexMap[pageIndex]); - } - - @Override - public long getFirstRowIndex(int pageIndex) { - return offsetIndex.getFirstRowIndex(indexMap[pageIndex]); - } - - @Override - public long getLastRowIndex(int pageIndex, long totalRowCount) { - int nextIndex = indexMap[pageIndex] + 1; - return (nextIndex >= offsetIndex.getPageCount() - ? totalRowCount - : offsetIndex.getFirstRowIndex(nextIndex)) - - 1; - } - - @Override - public String toString() { - try (Formatter formatter = new Formatter()) { - formatter.format( - "%-12s %20s %16s %20s\n", "", "offset", "compressed size", "first row index"); - for (int i = 0, n = offsetIndex.getPageCount(); i < n; ++i) { - int index = Arrays.binarySearch(indexMap, i); - boolean isHidden = index < 0; - formatter.format( - "%spage-%-5d %20d %16d %20d\n", - isHidden ? "- " : " ", - isHidden ? i : index, - offsetIndex.getOffset(i), - offsetIndex.getCompressedPageSize(i), - offsetIndex.getFirstRowIndex(i)); - } - return formatter.toString(); - } - } - } - - /* - * Returns the filtered offset index containing only the pages which are overlapping with rowRanges. 
- */ - static OffsetIndex filterOffsetIndex( - OffsetIndex offsetIndex, RowRanges rowRanges, long totalRowCount) { - int[] result = new int[offsetIndex.getPageCount()]; - int count = 0; - for (int i = 0, n = offsetIndex.getPageCount(); i < n; ++i) { - long from = offsetIndex.getFirstRowIndex(i); - if (rowRanges.isOverlapping(from, offsetIndex.getLastRowIndex(i, totalRowCount))) { - result[count++] = i; - } - } - return new FilteredOffsetIndex(offsetIndex, Arrays.copyOfRange(result, 0, count)); - } - - static List calculateOffsetRanges( - OffsetIndex offsetIndex, ColumnChunkMetaData cm, long firstPageOffset) { - List ranges = new ArrayList<>(); - int n = offsetIndex.getPageCount(); - if (n > 0) { - OffsetRange currentRange = null; - - // Add a range for the dictionary page if required - long rowGroupOffset = cm.getStartingPos(); - if (rowGroupOffset < firstPageOffset) { - currentRange = new OffsetRange(rowGroupOffset, (int) (firstPageOffset - rowGroupOffset)); - ranges.add(currentRange); - } - - for (int i = 0; i < n; ++i) { - long offset = offsetIndex.getOffset(i); - int length = offsetIndex.getCompressedPageSize(i); - if (currentRange == null || !currentRange.extend(offset, length)) { - currentRange = new OffsetRange(offset, length); - ranges.add(currentRange); - } - } - } - return ranges; - } -} diff --git a/src/main/java/org/apache/parquet/local/ColumnIndexStoreImpl.java b/src/main/java/org/apache/parquet/local/ColumnIndexStoreImpl.java deleted file mode 100644 index 9ecaac5..0000000 --- a/src/main/java/org/apache/parquet/local/ColumnIndexStoreImpl.java +++ /dev/null @@ -1,158 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.parquet.local; - -import org.apache.parquet.hadoop.metadata.BlockMetaData; -import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; -import org.apache.parquet.hadoop.metadata.ColumnPath; -import org.apache.parquet.internal.column.columnindex.ColumnIndex; -import org.apache.parquet.internal.column.columnindex.OffsetIndex; -import org.apache.parquet.internal.filter2.columnindex.ColumnIndexStore; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.util.HashMap; -import java.util.Map; -import java.util.Set; - -import static java.util.Collections.emptySet; - -/** Internal implementation of {@link ColumnIndexStore}. 
*/ -class ColumnIndexStoreImpl implements ColumnIndexStore { - - private interface IndexStore { - ColumnIndex getColumnIndex(); - - OffsetIndex getOffsetIndex(); - } - - private class IndexStoreImpl implements IndexStore { - private final ColumnChunkMetaData meta; - private ColumnIndex columnIndex; - private boolean columnIndexRead; - private final OffsetIndex offsetIndex; - - IndexStoreImpl(ColumnChunkMetaData meta) { - this.meta = meta; - OffsetIndex oi; - try { - oi = reader.readOffsetIndex(meta); - } catch (IOException e) { - // If the I/O issue still stands it will fail the reading later; - // otherwise we fail the filtering only with a missing offset index. - LOGGER.warn("Unable to read offset index for column {}", meta.getPath(), e); - oi = null; - } - if (oi == null) { - throw new MissingOffsetIndexException(meta.getPath()); - } - offsetIndex = oi; - } - - @Override - public ColumnIndex getColumnIndex() { - if (!columnIndexRead) { - try { - columnIndex = reader.readColumnIndex(meta); - } catch (IOException e) { - // If the I/O issue still stands it will fail the reading later; - // otherwise we fail the filtering only with a missing column index. - LOGGER.warn("Unable to read column index for column {}", meta.getPath(), e); - } - columnIndexRead = true; - } - return columnIndex; - } - - @Override - public OffsetIndex getOffsetIndex() { - return offsetIndex; - } - } - - private static final Logger LOGGER = LoggerFactory.getLogger(ColumnIndexStoreImpl.class); - // Used for columns are not in this parquet file - private static final IndexStore MISSING_INDEX_STORE = - new IndexStore() { - @Override - public ColumnIndex getColumnIndex() { - return null; - } - - @Override - public OffsetIndex getOffsetIndex() { - return null; - } - }; - private static final ColumnIndexStoreImpl EMPTY = - new ColumnIndexStoreImpl(null, new BlockMetaData(), emptySet()) { - @Override - public ColumnIndex getColumnIndex(ColumnPath column) { - return null; - } - - @Override - public OffsetIndex getOffsetIndex(ColumnPath column) { - throw new MissingOffsetIndexException(column); - } - }; - - private final ParquetFileReader reader; - private final Map store; - - /* - * Creates a column index store which lazily reads column/offset indexes for the columns in paths. (paths are the set - * of columns used for the projection) - */ - static ColumnIndexStore create( - ParquetFileReader reader, BlockMetaData block, Set paths) { - try { - return new ColumnIndexStoreImpl(reader, block, paths); - } catch (MissingOffsetIndexException e) { - return EMPTY; - } - } - - private ColumnIndexStoreImpl( - ParquetFileReader reader, BlockMetaData block, Set paths) { - // TODO[GS]: Offset index for every paths will be required; pre-read the consecutive ones at - // once? - // TODO[GS]: Pre-read column index based on filter? 
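The getColumnIndex() implementation above reads the column index at most once and remembers the outcome even when the read fails, so repeated filter evaluation over the same row group never retries the I/O. A stripped-down sketch of that read-once pattern; LazyIndex and IndexLoader are illustrative names, not part of the original class:

import java.io.IOException;

// Read-once cache in the spirit of IndexStoreImpl.getColumnIndex(): the loader runs a
// single time, and a failed or empty read is remembered as null so later calls do not
// repeat the I/O.
class LazyIndex<T> {
  interface IndexLoader<V> { V load() throws IOException; }

  private final IndexLoader<T> loader;
  private T value;
  private boolean loaded;

  LazyIndex(IndexLoader<T> loader) { this.loader = loader; }

  T get() {
    if (!loaded) {
      try {
        value = loader.load();
      } catch (IOException e) {
        // Keep value == null: filtering then behaves as if no index were written.
      }
      loaded = true;
    }
    return value;
  }
}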
- this.reader = reader; - Map store = new HashMap<>(); - for (ColumnChunkMetaData column : block.getColumns()) { - ColumnPath path = column.getPath(); - if (paths.contains(path)) { - store.put(path, new IndexStoreImpl(column)); - } - } - this.store = store; - } - - @Override - public ColumnIndex getColumnIndex(ColumnPath column) { - return store.getOrDefault(column, MISSING_INDEX_STORE).getColumnIndex(); - } - - @Override - public OffsetIndex getOffsetIndex(ColumnPath column) { - return store.getOrDefault(column, MISSING_INDEX_STORE).getOffsetIndex(); - } -} diff --git a/src/main/java/org/apache/parquet/local/DictionaryPageReader.java b/src/main/java/org/apache/parquet/local/DictionaryPageReader.java deleted file mode 100644 index 9072da6..0000000 --- a/src/main/java/org/apache/parquet/local/DictionaryPageReader.java +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.parquet.local; - -import org.apache.parquet.bytes.BytesInput; -import org.apache.parquet.column.ColumnDescriptor; -import org.apache.parquet.column.page.DictionaryPage; -import org.apache.parquet.column.page.DictionaryPageReadStore; -import org.apache.parquet.hadoop.metadata.BlockMetaData; -import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; -import org.apache.parquet.io.ParquetDecodingException; - -import java.io.IOException; -import java.util.HashMap; -import java.util.Map; -import java.util.Objects; -import java.util.Optional; -import java.util.concurrent.ConcurrentHashMap; - -/** - * A {@link DictionaryPageReadStore} implementation that reads dictionaries from an open {@link - * ParquetFileReader}. - * - *

This implementation will delegate dictionary reads to a {@link ColumnChunkPageReadStore} to - * avoid extra reads after a row group has been loaded into memory. - */ -class DictionaryPageReader implements DictionaryPageReadStore { - - private final ParquetFileReader reader; - private final Map columns; - private final Map> dictionaryPageCache; - private ColumnChunkPageReadStore rowGroup = null; - - /** - * Instantiate a new DictionaryPageReader. - * - * @param reader The target ParquetFileReader - * @param block The target BlockMetaData - * @throws NullPointerException if {@code reader} or {@code block} is {@code null} - */ - DictionaryPageReader(ParquetFileReader reader, BlockMetaData block) { - this.reader = Objects.requireNonNull(reader); - this.columns = new HashMap<>(); - this.dictionaryPageCache = new ConcurrentHashMap<>(); - - for (ColumnChunkMetaData column : block.getColumns()) { - columns.put(column.getPath().toDotString(), column); - } - } - - /** - * Sets this reader's row group's page store. When a row group is set, this reader will delegate - * to that row group to return dictionary pages. This avoids seeking and re-reading dictionary - * bytes after this reader's row group is loaded into memory. - * - * @param rowGroup a ColumnChunkPageReadStore for this reader's row group - */ - void setRowGroup(ColumnChunkPageReadStore rowGroup) { - this.rowGroup = rowGroup; - } - - @Override - public DictionaryPage readDictionaryPage(ColumnDescriptor descriptor) { - if (rowGroup != null) { - // if the row group has already been read, use that dictionary - return rowGroup.readDictionaryPage(descriptor); - } - - String dotPath = String.join(".", descriptor.getPath()); - ColumnChunkMetaData column = columns.get(dotPath); - if (column == null) { - throw new ParquetDecodingException("Failed to load dictionary, unknown column: " + dotPath); - } - - return dictionaryPageCache - .computeIfAbsent( - dotPath, - key -> { - try { - final DictionaryPage dict = - column.hasDictionaryPage() ? reader.readDictionary(column) : null; - - // Copy the dictionary to ensure it can be reused if it is returned - // more than once. This can happen when a DictionaryFilter has two or - // more predicates for the same column. Cache misses as well. - return (dict != null) ? Optional.of(reusableCopy(dict)) : Optional.empty(); - } catch (IOException e) { - throw new ParquetDecodingException("Failed to read dictionary", e); - } - }) - .orElse(null); - } - - private static DictionaryPage reusableCopy(DictionaryPage dict) throws IOException { - return new DictionaryPage( - BytesInput.from(dict.getBytes().toByteArray()), - dict.getDictionarySize(), - dict.getEncoding()); - } -} diff --git a/src/main/java/org/apache/parquet/local/ParquetFileReader.java b/src/main/java/org/apache/parquet/local/ParquetFileReader.java deleted file mode 100644 index 05c99a7..0000000 --- a/src/main/java/org/apache/parquet/local/ParquetFileReader.java +++ /dev/null @@ -1,1345 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * copied from parquet-mr, updated by An Qi - */ - -package org.apache.parquet.local; - -import org.apache.parquet.bytes.ByteBufferInputStream; -import org.apache.parquet.bytes.BytesInput; -import org.apache.parquet.column.ColumnDescriptor; -import org.apache.parquet.column.page.*; -import org.apache.parquet.column.values.bloomfilter.BlockSplitBloomFilter; -import org.apache.parquet.column.values.bloomfilter.BloomFilter; -import org.apache.parquet.compression.CompressionCodecFactory.BytesInputDecompressor; -import org.apache.parquet.crypto.*; -import org.apache.parquet.crypto.ModuleCipherFactory.ModuleType; -import org.apache.parquet.filter2.compat.FilterCompat; -import org.apache.parquet.format.*; -import org.apache.parquet.hadoop.ParquetEmptyBlockException; -import org.apache.parquet.hadoop.metadata.*; -import org.apache.parquet.hadoop.metadata.FileMetaData; -import org.apache.parquet.internal.column.columnindex.ColumnIndex; -import org.apache.parquet.internal.column.columnindex.OffsetIndex; -import org.apache.parquet.internal.filter2.columnindex.ColumnIndexFilter; -import org.apache.parquet.internal.filter2.columnindex.ColumnIndexStore; -import org.apache.parquet.internal.filter2.columnindex.RowRanges; -import org.apache.parquet.internal.hadoop.metadata.IndexReference; -import org.apache.parquet.io.InputFile; -import org.apache.parquet.io.ParquetDecodingException; -import org.apache.parquet.io.SeekableInputStream; -import org.apache.parquet.local.ColumnChunkPageReadStore.ColumnChunkPageReader; -import org.apache.parquet.local.ColumnIndexFilterUtils.OffsetRange; -import org.apache.parquet.local.filter2.compat.RowGroupFilter; -import org.apache.parquet.schema.MessageType; -import org.apache.parquet.schema.PrimitiveType; -import org.apache.yetus.audience.InterfaceAudience.Private; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.Closeable; -import java.io.IOException; -import java.io.InputStream; -import java.io.SequenceInputStream; -import java.nio.ByteBuffer; -import java.util.*; -import java.util.Map.Entry; -import java.util.zip.CRC32; - -import static org.apache.parquet.bytes.BytesUtils.readIntLittleEndian; -import static org.apache.parquet.format.Util.readFileCryptoMetaData; -import static org.apache.parquet.local.ColumnIndexFilterUtils.calculateOffsetRanges; -import static org.apache.parquet.local.ColumnIndexFilterUtils.filterOffsetIndex; -import static org.apache.parquet.local.ParquetFileWriter.EFMAGIC; -import static org.apache.parquet.local.ParquetFileWriter.MAGIC; - -/** Internal implementation of the Parquet file reader as a block container */ -public class ParquetFileReader implements Closeable { - - private static final Logger LOG = LoggerFactory.getLogger(ParquetFileReader.class); - - private final ParquetMetadataConverter converter; - - private final CRC32 crc; - - public static ParquetMetadata readFooter( - InputFile file, ParquetReadOptions options, SeekableInputStream f) throws IOException { - ParquetMetadataConverter converter = new ParquetMetadataConverter(options); - return readFooter(file, options, f, 
converter); - } - - private static ParquetMetadata readFooter( - InputFile file, - ParquetReadOptions options, - SeekableInputStream f, - ParquetMetadataConverter converter) - throws IOException { - - long fileLen = file.getLength(); - String filePath = file.toString(); - LOG.debug("File length {}", fileLen); - - int FOOTER_LENGTH_SIZE = 4; - if (fileLen - < MAGIC.length - + FOOTER_LENGTH_SIZE - + MAGIC.length) { // MAGIC + data + footer + footerIndex + MAGIC - throw new RuntimeException( - filePath + " is not a Parquet file (length is too low: " + fileLen + ")"); - } - - // Read footer length and magic string - with a single seek - byte[] magic = new byte[MAGIC.length]; - long fileMetadataLengthIndex = fileLen - magic.length - FOOTER_LENGTH_SIZE; - LOG.debug("reading footer index at {}", fileMetadataLengthIndex); - f.seek(fileMetadataLengthIndex); - int fileMetadataLength = readIntLittleEndian(f); - f.readFully(magic); - - boolean encryptedFooterMode; - if (Arrays.equals(MAGIC, magic)) { - encryptedFooterMode = false; - } else if (Arrays.equals(EFMAGIC, magic)) { - encryptedFooterMode = true; - } else { - throw new RuntimeException( - filePath - + " is not a Parquet file. Expected magic number at tail, but found " - + Arrays.toString(magic)); - } - - long fileMetadataIndex = fileMetadataLengthIndex - fileMetadataLength; - LOG.debug("read footer length: {}, footer index: {}", fileMetadataLength, fileMetadataIndex); - if (fileMetadataIndex < magic.length || fileMetadataIndex >= fileMetadataLengthIndex) { - throw new RuntimeException( - "corrupted file: the footer index is not within the file: " + fileMetadataIndex); - } - f.seek(fileMetadataIndex); - - FileDecryptionProperties fileDecryptionProperties = options.getDecryptionProperties(); - InternalFileDecryptor fileDecryptor = null; - if (null != fileDecryptionProperties) { - fileDecryptor = new InternalFileDecryptor(fileDecryptionProperties); - } - - // Read all the footer bytes in one time to avoid multiple read operations, - // since it can be pretty time consuming for a single read operation in HDFS. - ByteBuffer footerBytesBuffer = ByteBuffer.allocate(fileMetadataLength); - f.readFully(footerBytesBuffer); - LOG.debug("Finished to read all footer bytes."); - footerBytesBuffer.flip(); - InputStream footerBytesStream = ByteBufferInputStream.wrap(footerBytesBuffer); - - // Regular file, or encrypted file with plaintext footer - if (!encryptedFooterMode) { - return converter.readParquetMetadata( - footerBytesStream, options.getMetadataFilter(), fileDecryptor, false, fileMetadataLength); - } - - // Encrypted file with encrypted footer - if (null == fileDecryptor) { - throw new ParquetCryptoRuntimeException( - "Trying to read file with encrypted footer. No keys available"); - } - FileCryptoMetaData fileCryptoMetaData = readFileCryptoMetaData(footerBytesStream); - fileDecryptor.setFileCryptoMetaData( - fileCryptoMetaData.getEncryption_algorithm(), true, fileCryptoMetaData.getKey_metadata()); - // footer length is required only for signed plaintext footers - return converter.readParquetMetadata( - footerBytesStream, options.getMetadataFilter(), fileDecryptor, true, 0); - } - - protected final SeekableInputStream f; - private final InputFile file; - private final ParquetReadOptions options; - private final Map paths = new HashMap<>(); - private final FileMetaData fileMetaData; // may be null - private final List blocks; - private final List blockIndexStores; - private final List blockRowRanges; - - // not final. 
in some cases, this may be lazily loaded for backward-compat. - private final ParquetMetadata footer; - - private int currentBlock = 0; - private ColumnChunkPageReadStore currentRowGroup = null; - private DictionaryPageReader nextDictionaryReader = null; - - private InternalFileDecryptor fileDecryptor = null; - - public ParquetFileReader(InputFile file, ParquetMetadata footer, ParquetReadOptions options) - throws IOException { - this.converter = new ParquetMetadataConverter(options); - this.file = file; - this.options = options; - this.f = file.newStream(); - try { - this.footer = footer; - this.fileMetaData = footer.getFileMetaData(); - this.fileDecryptor = - fileMetaData.getFileDecryptor(); // must be called before filterRowGroups! - if (null != fileDecryptor && fileDecryptor.plaintextFile()) { - this.fileDecryptor = null; // Plaintext file. No need in decryptor - } - - this.blocks = filterRowGroups(footer.getBlocks()); - this.blockIndexStores = listWithNulls(this.blocks.size()); - this.blockRowRanges = listWithNulls(this.blocks.size()); - for (ColumnDescriptor col : footer.getFileMetaData().getSchema().getColumns()) { - paths.put(ColumnPath.get(col.getPath()), col); - } - this.crc = options.usePageChecksumVerification() ? new CRC32() : null; - } catch (Exception e) { - f.close(); - throw e; - } - } - - private static List listWithNulls(int size) { - return new ArrayList<>(Collections.nCopies(size, null)); - } - - public FileMetaData getFileMetaData() { - return fileMetaData; - } - - public long getRecordCount() { - long total = 0L; - for (BlockMetaData block : blocks) { - total += block.getRowCount(); - } - return total; - } - - public long getFilteredRecordCount() { - if (!options.useColumnIndexFilter() - || !FilterCompat.isFilteringRequired(options.getRecordFilter())) { - return getRecordCount(); - } - long total = 0L; - for (int i = 0, n = blocks.size(); i < n; ++i) { - total += getRowRanges(i).rowCount(); - } - return total; - } - - public String getFile() { - return file.toString(); - } - - public List filterRowGroups(List blocks) throws IOException { - FilterCompat.Filter recordFilter = options.getRecordFilter(); - if (FilterCompat.isFilteringRequired(recordFilter)) { - // set up data filters based on configured levels - List levels = new ArrayList<>(); - - if (options.useStatsFilter()) { - levels.add(RowGroupFilter.FilterLevel.STATISTICS); - } - - if (options.useDictionaryFilter()) { - levels.add(RowGroupFilter.FilterLevel.DICTIONARY); - } - - if (options.useBloomFilter()) { - levels.add(RowGroupFilter.FilterLevel.BLOOMFILTER); - } - return RowGroupFilter.filterRowGroups(levels, recordFilter, blocks, this); - } - - return blocks; - } - - public List getRowGroups() { - return blocks; - } - - private MessageType requestedSchema = null; - - public void setRequestedSchema(MessageType projection) { - requestedSchema = projection; - paths.clear(); - for (ColumnDescriptor col : projection.getColumns()) { - paths.put(ColumnPath.get(col.getPath()), col); - } - } - - public MessageType getRequestedSchema() { - if (requestedSchema == null) { - return fileMetaData.getSchema(); - } - return requestedSchema; - } - - /** - * Reads all the columns requested from the row group at the specified block. - * - * @param blockIndex the index of the requested block - * @return the PageReadStore which can provide PageReaders for each column. 
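Put together, the reader is driven as a simple loop: read the footer once, construct the reader with it, optionally set a projected schema, then pull row groups until readNextRowGroup() returns null. A hedged usage sketch; it assumes it can live in the same org.apache.parquet.local package (so ParquetReadOptions resolves without a guessed import) and leaves obtaining the InputFile and options to the caller:

package org.apache.parquet.local; // assumption: placed alongside the reader

import org.apache.parquet.column.page.PageReadStore;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.io.InputFile;
import org.apache.parquet.io.SeekableInputStream;

import java.io.IOException;

class RowGroupLoopSketch {
  // Count rows by walking every row group; error handling is reduced to propagation.
  static long countRows(InputFile file, ParquetReadOptions options) throws IOException {
    ParquetMetadata footer;
    try (SeekableInputStream in = file.newStream()) {
      footer = ParquetFileReader.readFooter(file, options, in);
    }
    long rows = 0;
    try (ParquetFileReader reader = new ParquetFileReader(file, footer, options)) {
      PageReadStore rowGroup;
      while ((rowGroup = reader.readNextRowGroup()) != null) {
        rows += rowGroup.getRowCount();
      }
    }
    return rows;
  }
}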
- * @throws IOException if an error occurs while reading - */ - public PageReadStore readRowGroup(int blockIndex) throws IOException { - return internalReadRowGroup(blockIndex); - } - - /** - * Reads all the columns requested from the row group at the current file position. - * - * @return the PageReadStore which can provide PageReaders for each column. - * @throws IOException if an error occurs while reading - */ - public PageReadStore readNextRowGroup() throws IOException { - ColumnChunkPageReadStore rowGroup = null; - try { - rowGroup = internalReadRowGroup(currentBlock); - } catch (ParquetEmptyBlockException e) { - LOG.warn("Read empty block at index {} from {}", currentBlock, getFile()); - advanceToNextBlock(); - return readNextRowGroup(); - } - - if (rowGroup == null) { - return null; - } - this.currentRowGroup = rowGroup; - // avoid re-reading bytes the dictionary reader is used after this call - if (nextDictionaryReader != null) { - nextDictionaryReader.setRowGroup(currentRowGroup); - } - - advanceToNextBlock(); - - return currentRowGroup; - } - - private ColumnChunkPageReadStore internalReadRowGroup(int blockIndex) throws IOException { - if (blockIndex < 0 || blockIndex >= blocks.size()) { - return null; - } - BlockMetaData block = blocks.get(blockIndex); - if (block.getRowCount() == 0) { - throw new ParquetEmptyBlockException("Illegal row group of 0 rows"); - } - org.apache.parquet.local.ColumnChunkPageReadStore rowGroup = - new ColumnChunkPageReadStore(block.getRowCount(), block.getRowIndexOffset()); - // prepare the list of consecutive parts to read them in one scan - List allParts = new ArrayList(); - ConsecutivePartList currentParts = null; - for (ColumnChunkMetaData mc : block.getColumns()) { - ColumnPath pathKey = mc.getPath(); - ColumnDescriptor columnDescriptor = paths.get(pathKey); - if (columnDescriptor != null) { - long startingPos = mc.getStartingPos(); - // first part or not consecutive => new list - if (currentParts == null || currentParts.endPos() != startingPos) { - currentParts = new ConsecutivePartList(startingPos); - allParts.add(currentParts); - } - currentParts.addChunk( - new ChunkDescriptor(columnDescriptor, mc, startingPos, mc.getTotalSize())); - } - } - // actually read all the chunks - ChunkListBuilder builder = new ChunkListBuilder(block.getRowCount()); - for (ConsecutivePartList consecutiveChunks : allParts) { - consecutiveChunks.readAll(f, builder); - } - for (Chunk chunk : builder.build()) { - readChunkPages(chunk, block, rowGroup); - } - - return rowGroup; - } - - /** - * Reads all the columns requested from the specified row group. It may skip specific pages based - * on the column indexes according to the actual filter. As the rows are not aligned among the - * pages of the different columns row synchronization might be required. See the documentation of - * the class SynchronizingColumnReader for details. 
- * - * @param blockIndex the index of the requested block - * @return the PageReadStore which can provide PageReaders for each column or null if there are no - * rows in this block - * @throws IOException if an error occurs while reading - */ - public PageReadStore readFilteredRowGroup(int blockIndex) throws IOException { - if (blockIndex < 0 || blockIndex >= blocks.size()) { - return null; - } - - // Filtering not required -> fall back to the non-filtering path - if (!options.useColumnIndexFilter() - || !FilterCompat.isFilteringRequired(options.getRecordFilter())) { - return internalReadRowGroup(blockIndex); - } - - BlockMetaData block = blocks.get(blockIndex); - if (block.getRowCount() == 0) { - throw new ParquetEmptyBlockException("Illegal row group of 0 rows"); - } - - RowRanges rowRanges = getRowRanges(blockIndex); - return readFilteredRowGroup(blockIndex, rowRanges); - } - - /** - * Reads all the columns requested from the specified row group. It may skip specific pages based - * on the {@code rowRanges} passed in. As the rows are not aligned among the pages of the - * different columns row synchronization might be required. See the documentation of the class - * SynchronizingColumnReader for details. - * - * @param blockIndex the index of the requested block - * @param rowRanges the row ranges to be read from the requested block - * @return the PageReadStore which can provide PageReaders for each column or null if there are no - * rows in this block - * @throws IOException if an error occurs while reading - * @throws IllegalArgumentException if the {@code blockIndex} is invalid or the {@code rowRanges} - * is null - */ - public ColumnChunkPageReadStore readFilteredRowGroup(int blockIndex, RowRanges rowRanges) - throws IOException { - if (blockIndex < 0 || blockIndex >= blocks.size()) { - throw new IllegalArgumentException( - String.format( - "Invalid block index %s, the valid block index range are: " + "[%s, %s]", - blockIndex, 0, blocks.size() - 1)); - } - - if (Objects.isNull(rowRanges)) { - throw new IllegalArgumentException("RowRanges must not be null"); - } - - BlockMetaData block = blocks.get(blockIndex); - if (block.getRowCount() == 0L) { - return null; - } - - long rowCount = rowRanges.rowCount(); - if (rowCount == 0) { - // There are no matching rows -> returning null - return null; - } - - if (rowCount == block.getRowCount()) { - // All rows are matching -> fall back to the non-filtering path - return internalReadRowGroup(blockIndex); - } - - return internalReadFilteredRowGroup(block, rowRanges, getColumnIndexStore(blockIndex)); - } - - /** - * Reads all the columns requested from the row group at the current file position. It may skip - * specific pages based on the column indexes according to the actual filter. As the rows are not - * aligned among the pages of the different columns row synchronization might be required. See the - * documentation of the class SynchronizingColumnReader for details. 
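All of these filtered entry points funnel through the same decision ladder: fall back to the plain read when filtering is off or every row matches, return nothing when no row matches, and only otherwise pay for page-level filtering. A condensed sketch of that ladder; the enum and parameter names are illustrative, not part of the reader's API:

class FilteredReadDecisionSketch {
  enum Outcome { SKIP, READ_ALL, READ_FILTERED }

  static Outcome decide(boolean filteringRequired, long matchingRows, long totalRows) {
    if (!filteringRequired) {
      return Outcome.READ_ALL;       // no record filter, or column-index filtering disabled
    }
    if (totalRows == 0 || matchingRows == 0) {
      return Outcome.SKIP;           // empty block, or the column indexes rule out every row
    }
    if (matchingRows == totalRows) {
      return Outcome.READ_ALL;       // everything matches: page-level filtering buys nothing
    }
    return Outcome.READ_FILTERED;    // read only the pages overlapping the matching row ranges
  }
}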
- * - * @return the PageReadStore which can provide PageReaders for each column - * @throws IOException if an error occurs while reading - */ - public PageReadStore readNextFilteredRowGroup() throws IOException { - if (currentBlock == blocks.size()) { - return null; - } - // Filtering not required -> fall back to the non-filtering path - if (!options.useColumnIndexFilter() - || !FilterCompat.isFilteringRequired(options.getRecordFilter())) { - return readNextRowGroup(); - } - BlockMetaData block = blocks.get(currentBlock); - if (block.getRowCount() == 0L) { - LOG.warn("Read empty block at index {} from {}", currentBlock, getFile()); - // Skip the empty block - advanceToNextBlock(); - return readNextFilteredRowGroup(); - } - RowRanges rowRanges = getRowRanges(currentBlock); - long rowCount = rowRanges.rowCount(); - if (rowCount == 0) { - // There are no matching rows -> skipping this row-group - advanceToNextBlock(); - return readNextFilteredRowGroup(); - } - if (rowCount == block.getRowCount()) { - // All rows are matching -> fall back to the non-filtering path - return readNextRowGroup(); - } - - this.currentRowGroup = - internalReadFilteredRowGroup(block, rowRanges, getColumnIndexStore(currentBlock)); - - // avoid re-reading bytes the dictionary reader is used after this call - if (nextDictionaryReader != null) { - nextDictionaryReader.setRowGroup(currentRowGroup); - } - - advanceToNextBlock(); - - return this.currentRowGroup; - } - - private ColumnChunkPageReadStore internalReadFilteredRowGroup( - BlockMetaData block, RowRanges rowRanges, ColumnIndexStore ciStore) throws IOException { - ColumnChunkPageReadStore rowGroup = - new ColumnChunkPageReadStore(rowRanges, block.getRowIndexOffset()); - // prepare the list of consecutive parts to read them in one scan - ChunkListBuilder builder = new ChunkListBuilder(block.getRowCount()); - List allParts = new ArrayList<>(); - ConsecutivePartList currentParts = null; - for (ColumnChunkMetaData mc : block.getColumns()) { - ColumnPath pathKey = mc.getPath(); - ColumnDescriptor columnDescriptor = paths.get(pathKey); - if (columnDescriptor != null) { - OffsetIndex offsetIndex = ciStore.getOffsetIndex(mc.getPath()); - - OffsetIndex filteredOffsetIndex = - filterOffsetIndex(offsetIndex, rowRanges, block.getRowCount()); - for (OffsetRange range : - calculateOffsetRanges(filteredOffsetIndex, mc, offsetIndex.getOffset(0))) { - long startingPos = range.getOffset(); - // first part or not consecutive => new list - if (currentParts == null || currentParts.endPos() != startingPos) { - currentParts = new ConsecutivePartList(startingPos); - allParts.add(currentParts); - } - ChunkDescriptor chunkDescriptor = - new ChunkDescriptor(columnDescriptor, mc, startingPos, range.getLength()); - currentParts.addChunk(chunkDescriptor); - builder.setOffsetIndex(chunkDescriptor, filteredOffsetIndex); - } - } - } - // actually read all the chunks - for (ConsecutivePartList consecutiveChunks : allParts) { - consecutiveChunks.readAll(f, builder); - } - for (Chunk chunk : builder.build()) { - readChunkPages(chunk, block, rowGroup); - } - - return rowGroup; - } - - private void readChunkPages(Chunk chunk, BlockMetaData block, ColumnChunkPageReadStore rowGroup) - throws IOException { - if (null == fileDecryptor || fileDecryptor.plaintextFile()) { - rowGroup.addColumn(chunk.descriptor.col, chunk.readAllPages()); - return; - } - // Encrypted file - ColumnPath columnPath = ColumnPath.get(chunk.descriptor.col.getPath()); - InternalColumnDecryptionSetup columnDecryptionSetup = 
fileDecryptor.getColumnSetup(columnPath); - if (!columnDecryptionSetup.isEncrypted()) { // plaintext column - rowGroup.addColumn(chunk.descriptor.col, chunk.readAllPages()); - } else { // encrypted column - rowGroup.addColumn( - chunk.descriptor.col, - chunk.readAllPages( - columnDecryptionSetup.getMetaDataDecryptor(), - columnDecryptionSetup.getDataDecryptor(), - fileDecryptor.getFileAAD(), - block.getOrdinal(), - columnDecryptionSetup.getOrdinal())); - } - } - - public ColumnIndexStore getColumnIndexStore(int blockIndex) { - ColumnIndexStore ciStore = blockIndexStores.get(blockIndex); - if (ciStore == null) { - ciStore = - org.apache.parquet.local.ColumnIndexStoreImpl.create( - this, blocks.get(blockIndex), paths.keySet()); - blockIndexStores.set(blockIndex, ciStore); - } - return ciStore; - } - - private RowRanges getRowRanges(int blockIndex) { - assert FilterCompat.isFilteringRequired(options.getRecordFilter()) - : "Should not be invoked if filter is null or NOOP"; - RowRanges rowRanges = blockRowRanges.get(blockIndex); - if (rowRanges == null) { - rowRanges = - ColumnIndexFilter.calculateRowRanges( - options.getRecordFilter(), - getColumnIndexStore(blockIndex), - paths.keySet(), - blocks.get(blockIndex).getRowCount()); - blockRowRanges.set(blockIndex, rowRanges); - } - return rowRanges; - } - - public boolean skipNextRowGroup() { - return advanceToNextBlock(); - } - - private boolean advanceToNextBlock() { - if (currentBlock == blocks.size()) { - return false; - } - - // update the current block and instantiate a dictionary reader for it - ++currentBlock; - this.nextDictionaryReader = null; - - return true; - } - - /** - * Returns a {@link DictionaryPageReadStore} for the row group that would be returned by calling - * {@link #readNextRowGroup()} or skipped by calling {@link #skipNextRowGroup()}. - * - * @return a DictionaryPageReadStore for the next row group - */ - public DictionaryPageReadStore getNextDictionaryReader() { - if (nextDictionaryReader == null) { - this.nextDictionaryReader = getDictionaryReader(currentBlock); - } - return nextDictionaryReader; - } - - public DictionaryPageReader getDictionaryReader(int blockIndex) { - if (blockIndex < 0 || blockIndex >= blocks.size()) { - return null; - } - return new DictionaryPageReader(this, blocks.get(blockIndex)); - } - - public DictionaryPageReader getDictionaryReader(BlockMetaData block) { - return new DictionaryPageReader(this, block); - } - - /** - * Reads and decompresses a dictionary page for the given column chunk. - * - *

Returns null if the given column chunk has no dictionary page. - * - * @param meta a column's ColumnChunkMetaData to read the dictionary from - * @return an uncompressed DictionaryPage or null - * @throws IOException if there is an error while reading the dictionary - */ - DictionaryPage readDictionary(ColumnChunkMetaData meta) throws IOException { - if (!meta.hasDictionaryPage()) { - return null; - } - - // TODO: this should use getDictionaryPageOffset() but it isn't reliable. - if (f.getPos() != meta.getStartingPos()) { - f.seek(meta.getStartingPos()); - } - - boolean encryptedColumn = false; - InternalColumnDecryptionSetup columnDecryptionSetup = null; - byte[] dictionaryPageAAD = null; - BlockCipher.Decryptor pageDecryptor = null; - if (null != fileDecryptor && !fileDecryptor.plaintextFile()) { - columnDecryptionSetup = fileDecryptor.getColumnSetup(meta.getPath()); - if (columnDecryptionSetup.isEncrypted()) { - encryptedColumn = true; - } - } - - PageHeader pageHeader; - if (!encryptedColumn) { - pageHeader = Util.readPageHeader(f); - } else { - byte[] dictionaryPageHeaderAAD = - AesCipher.createModuleAAD( - fileDecryptor.getFileAAD(), - ModuleType.DictionaryPageHeader, - meta.getRowGroupOrdinal(), - columnDecryptionSetup.getOrdinal(), - -1); - pageHeader = - Util.readPageHeader( - f, columnDecryptionSetup.getMetaDataDecryptor(), dictionaryPageHeaderAAD); - dictionaryPageAAD = - AesCipher.createModuleAAD( - fileDecryptor.getFileAAD(), - ModuleType.DictionaryPage, - meta.getRowGroupOrdinal(), - columnDecryptionSetup.getOrdinal(), - -1); - pageDecryptor = columnDecryptionSetup.getDataDecryptor(); - } - - if (!pageHeader.isSetDictionary_page_header()) { - return null; // TODO: should this complain? - } - - DictionaryPage compressedPage = - readCompressedDictionary(pageHeader, f, pageDecryptor, dictionaryPageAAD); - BytesInputDecompressor decompressor = - options.getCodecFactory().getDecompressor(meta.getCodec()); - - return new DictionaryPage( - decompressor.decompress(compressedPage.getBytes(), compressedPage.getUncompressedSize()), - compressedPage.getDictionarySize(), - compressedPage.getEncoding()); - } - - private DictionaryPage readCompressedDictionary( - PageHeader pageHeader, - SeekableInputStream fin, - BlockCipher.Decryptor pageDecryptor, - byte[] dictionaryPageAAD) - throws IOException { - DictionaryPageHeader dictHeader = pageHeader.getDictionary_page_header(); - - int uncompressedPageSize = pageHeader.getUncompressed_page_size(); - int compressedPageSize = pageHeader.getCompressed_page_size(); - - byte[] dictPageBytes = new byte[compressedPageSize]; - fin.readFully(dictPageBytes); - - BytesInput bin = BytesInput.from(dictPageBytes); - - if (null != pageDecryptor) { - bin = BytesInput.from(pageDecryptor.decrypt(bin.toByteArray(), dictionaryPageAAD)); - } - - return new DictionaryPage( - bin, - uncompressedPageSize, - dictHeader.getNum_values(), - converter.getEncoding(dictHeader.getEncoding())); - } - - public BloomFilterReader getBloomFilterDataReader(int blockIndex) { - if (blockIndex < 0 || blockIndex >= blocks.size()) { - return null; - } - return new BloomFilterReader(this, blocks.get(blockIndex)); - } - - public BloomFilterReader getBloomFilterDataReader(BlockMetaData block) { - return new BloomFilterReader(this, block); - } - - /** - * Reads Bloom filter data for the given column chunk. - * - * @param meta a column's ColumnChunkMetaData to read the dictionary from - * @return an BloomFilter object. 
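For an equality predicate, the readBloomFilter method whose contract is described here supports a cheap row-group pruning check: a negative lookup proves the value is absent, while a positive one only means it might be present. A hedged sketch of that check; it assumes the sketch sits next to the reader, a long-typed column, and the standard hash/findHash pair on BloomFilter:

package org.apache.parquet.local; // assumption: placed alongside the reader

import org.apache.parquet.column.values.bloomfilter.BloomFilter;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;

import java.io.IOException;

class BloomPruneSketch {
  // Returns false only when the row group provably cannot contain the value.
  static boolean mightContain(ParquetFileReader reader, ColumnChunkMetaData column, long value)
      throws IOException {
    BloomFilter filter = reader.readBloomFilter(column);
    if (filter == null) {
      return true;                   // no usable filter: cannot prune, must read the chunk
    }
    return filter.findHash(filter.hash(value));
  }
}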
- * @throws IOException if there is an error while reading the Bloom filter. - */ - public BloomFilter readBloomFilter(ColumnChunkMetaData meta) throws IOException { - long bloomFilterOffset = meta.getBloomFilterOffset(); - if (bloomFilterOffset < 0) { - return null; - } - - // Prepare to decrypt Bloom filter (for encrypted columns) - BlockCipher.Decryptor bloomFilterDecryptor = null; - byte[] bloomFilterHeaderAAD = null; - byte[] bloomFilterBitsetAAD = null; - if (null != fileDecryptor && !fileDecryptor.plaintextFile()) { - InternalColumnDecryptionSetup columnDecryptionSetup = - fileDecryptor.getColumnSetup(meta.getPath()); - if (columnDecryptionSetup.isEncrypted()) { - bloomFilterDecryptor = columnDecryptionSetup.getMetaDataDecryptor(); - bloomFilterHeaderAAD = - AesCipher.createModuleAAD( - fileDecryptor.getFileAAD(), - ModuleType.BloomFilterHeader, - meta.getRowGroupOrdinal(), - columnDecryptionSetup.getOrdinal(), - -1); - bloomFilterBitsetAAD = - AesCipher.createModuleAAD( - fileDecryptor.getFileAAD(), - ModuleType.BloomFilterBitset, - meta.getRowGroupOrdinal(), - columnDecryptionSetup.getOrdinal(), - -1); - } - } - - // Read Bloom filter data header. - f.seek(bloomFilterOffset); - BloomFilterHeader bloomFilterHeader; - try { - bloomFilterHeader = Util.readBloomFilterHeader(f, bloomFilterDecryptor, bloomFilterHeaderAAD); - } catch (IOException e) { - LOG.warn("read no bloom filter"); - return null; - } - - int numBytes = bloomFilterHeader.getNumBytes(); - if (numBytes <= 0 || numBytes > BlockSplitBloomFilter.UPPER_BOUND_BYTES) { - LOG.warn("the read bloom filter size is wrong, size is {}", bloomFilterHeader.getNumBytes()); - return null; - } - - if (!bloomFilterHeader.getHash().isSetXXHASH() - || !bloomFilterHeader.getAlgorithm().isSetBLOCK() - || !bloomFilterHeader.getCompression().isSetUNCOMPRESSED()) { - LOG.warn( - "the read bloom filter is not supported yet, algorithm = {}, hash = {}, compression = {}", - bloomFilterHeader.getAlgorithm(), - bloomFilterHeader.getHash(), - bloomFilterHeader.getCompression()); - return null; - } - - byte[] bitset; - if (null == bloomFilterDecryptor) { - bitset = new byte[numBytes]; - f.readFully(bitset); - } else { - bitset = bloomFilterDecryptor.decrypt(f, bloomFilterBitsetAAD); - if (bitset.length != numBytes) { - throw new ParquetCryptoRuntimeException("Wrong length of decrypted bloom filter bitset"); - } - } - return new BlockSplitBloomFilter(bitset); - } - - /** - * @param column the column chunk which the column index is to be returned for - * @return the column index for the specified column chunk or {@code null} if there is no index - * @throws IOException if any I/O error occurs during reading the file - */ - @Private - public ColumnIndex readColumnIndex(ColumnChunkMetaData column) throws IOException { - IndexReference ref = column.getColumnIndexReference(); - if (ref == null) { - return null; - } - f.seek(ref.getOffset()); - - BlockCipher.Decryptor columnIndexDecryptor = null; - byte[] columnIndexAAD = null; - if (null != fileDecryptor && !fileDecryptor.plaintextFile()) { - InternalColumnDecryptionSetup columnDecryptionSetup = - fileDecryptor.getColumnSetup(column.getPath()); - if (columnDecryptionSetup.isEncrypted()) { - columnIndexDecryptor = columnDecryptionSetup.getMetaDataDecryptor(); - columnIndexAAD = - AesCipher.createModuleAAD( - fileDecryptor.getFileAAD(), - ModuleType.ColumnIndex, - column.getRowGroupOrdinal(), - columnDecryptionSetup.getOrdinal(), - -1); - } - } - return ParquetMetadataConverter.fromParquetColumnIndex( - 
column.getPrimitiveType(), Util.readColumnIndex(f, columnIndexDecryptor, columnIndexAAD)); - } - - /** - * @param column the column chunk which the offset index is to be returned for - * @return the offset index for the specified column chunk or {@code null} if there is no index - * @throws IOException if any I/O error occurs during reading the file - */ - @Private - public OffsetIndex readOffsetIndex(ColumnChunkMetaData column) throws IOException { - IndexReference ref = column.getOffsetIndexReference(); - if (ref == null) { - return null; - } - f.seek(ref.getOffset()); - - BlockCipher.Decryptor offsetIndexDecryptor = null; - byte[] offsetIndexAAD = null; - if (null != fileDecryptor && !fileDecryptor.plaintextFile()) { - InternalColumnDecryptionSetup columnDecryptionSetup = - fileDecryptor.getColumnSetup(column.getPath()); - if (columnDecryptionSetup.isEncrypted()) { - offsetIndexDecryptor = columnDecryptionSetup.getMetaDataDecryptor(); - offsetIndexAAD = - AesCipher.createModuleAAD( - fileDecryptor.getFileAAD(), - ModuleType.OffsetIndex, - column.getRowGroupOrdinal(), - columnDecryptionSetup.getOrdinal(), - -1); - } - } - return ParquetMetadataConverter.fromParquetOffsetIndex( - Util.readOffsetIndex(f, offsetIndexDecryptor, offsetIndexAAD)); - } - - @Override - public void close() throws IOException { - try { - if (f != null) { - f.close(); - } - } finally { - options.getCodecFactory().release(); - } - } - - public ParquetMetadata getFooter() { - return footer; - } - - /* - * Builder to concatenate the buffers of the discontinuous parts for the same column. These parts are generated as a - * result of the column-index based filtering when some pages might be skipped at reading. - */ - private class ChunkListBuilder { - private class ChunkData { - final List buffers = new ArrayList<>(); - OffsetIndex offsetIndex; - } - - private final Map map = new HashMap<>(); - private ChunkDescriptor lastDescriptor; - private final long rowCount; - private SeekableInputStream f; - - public ChunkListBuilder(long rowCount) { - this.rowCount = rowCount; - } - - void add(ChunkDescriptor descriptor, List buffers, SeekableInputStream f) { - map.computeIfAbsent(descriptor, d -> new ChunkData()).buffers.addAll(buffers); - lastDescriptor = descriptor; - this.f = f; - } - - void setOffsetIndex(ChunkDescriptor descriptor, OffsetIndex offsetIndex) { - map.computeIfAbsent(descriptor, d -> new ChunkData()).offsetIndex = offsetIndex; - } - - List build() { - Set> entries = map.entrySet(); - List chunks = new ArrayList<>(entries.size()); - for (Entry entry : entries) { - ChunkDescriptor descriptor = entry.getKey(); - ChunkData data = entry.getValue(); - if (descriptor.equals(lastDescriptor)) { - // because of a bug, the last chunk might be larger than descriptor.size - chunks.add( - new WorkaroundChunk(lastDescriptor, data.buffers, f, data.offsetIndex, rowCount)); - } else { - chunks.add(new Chunk(descriptor, data.buffers, data.offsetIndex, rowCount)); - } - } - return chunks; - } - } - - /** The data for a column chunk */ - private class Chunk { - - protected final ChunkDescriptor descriptor; - protected final ByteBufferInputStream stream; - final OffsetIndex offsetIndex; - final long rowCount; - - /** - * @param descriptor descriptor for the chunk - * @param buffers ByteBuffers that contain the chunk - * @param offsetIndex the offset index for this column; might be null - */ - public Chunk( - ChunkDescriptor descriptor, - List buffers, - OffsetIndex offsetIndex, - long rowCount) { - this.descriptor = descriptor; - 
this.stream = ByteBufferInputStream.wrap(buffers); - this.offsetIndex = offsetIndex; - this.rowCount = rowCount; - } - - protected PageHeader readPageHeader() throws IOException { - return readPageHeader(null, null); - } - - protected PageHeader readPageHeader(BlockCipher.Decryptor blockDecryptor, byte[] pageHeaderAAD) - throws IOException { - return Util.readPageHeader(stream, blockDecryptor, pageHeaderAAD); - } - - /** - * Calculate checksum of input bytes, throw decoding exception if it does not match the provided - * reference crc - */ - private void verifyCrc(int referenceCrc, byte[] bytes, String exceptionMsg) { - crc.reset(); - crc.update(bytes); - if (crc.getValue() != ((long) referenceCrc & 0xffffffffL)) { - throw new ParquetDecodingException(exceptionMsg); - } - } - - /** - * Read all of the pages in a given column chunk. - * - * @return the list of pages - */ - public ColumnChunkPageReader readAllPages() throws IOException { - return readAllPages(null, null, null, -1, -1); - } - - public ColumnChunkPageReader readAllPages( - BlockCipher.Decryptor headerBlockDecryptor, - BlockCipher.Decryptor pageBlockDecryptor, - byte[] aadPrefix, - int rowGroupOrdinal, - int columnOrdinal) - throws IOException { - List pagesInChunk = new ArrayList<>(); - DictionaryPage dictionaryPage = null; - PrimitiveType type = - getFileMetaData().getSchema().getType(descriptor.col.getPath()).asPrimitiveType(); - long valuesCountReadSoFar = 0L; - int dataPageCountReadSoFar = 0; - byte[] dataPageHeaderAAD = null; - if (null != headerBlockDecryptor) { - dataPageHeaderAAD = - AesCipher.createModuleAAD( - aadPrefix, - ModuleType.DataPageHeader, - rowGroupOrdinal, - columnOrdinal, - getPageOrdinal(dataPageCountReadSoFar)); - } - while (hasMorePages(valuesCountReadSoFar, dataPageCountReadSoFar)) { - byte[] pageHeaderAAD = dataPageHeaderAAD; - if (null != headerBlockDecryptor) { - // Important: this verifies file integrity (makes sure dictionary page had not been - // removed) - if (null == dictionaryPage && descriptor.metadata.hasDictionaryPage()) { - pageHeaderAAD = - AesCipher.createModuleAAD( - aadPrefix, ModuleType.DictionaryPageHeader, rowGroupOrdinal, columnOrdinal, -1); - } else { - int pageOrdinal = getPageOrdinal(dataPageCountReadSoFar); - AesCipher.quickUpdatePageAAD(dataPageHeaderAAD, pageOrdinal); - } - } - PageHeader pageHeader = readPageHeader(headerBlockDecryptor, pageHeaderAAD); - int uncompressedPageSize = pageHeader.getUncompressed_page_size(); - int compressedPageSize = pageHeader.getCompressed_page_size(); - final BytesInput pageBytes; - switch (pageHeader.type) { - case DICTIONARY_PAGE: - // there is only one dictionary page per column chunk - if (dictionaryPage != null) { - throw new ParquetDecodingException( - "more than one dictionary page in column " + descriptor.col); - } - pageBytes = this.readAsBytesInput(compressedPageSize); - if (options.usePageChecksumVerification() && pageHeader.isSetCrc()) { - verifyCrc( - pageHeader.getCrc(), - pageBytes.toByteArray(), - "could not verify dictionary page integrity, CRC checksum verification failed"); - } - DictionaryPageHeader dicHeader = pageHeader.getDictionary_page_header(); - dictionaryPage = - new DictionaryPage( - pageBytes, - uncompressedPageSize, - dicHeader.getNum_values(), - converter.getEncoding(dicHeader.getEncoding())); - // Copy crc to new page, used for testing - if (pageHeader.isSetCrc()) { - dictionaryPage.setCrc(pageHeader.getCrc()); - } - break; - case DATA_PAGE: - DataPageHeader dataHeaderV1 = pageHeader.getData_page_header(); 
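One subtlety in the verifyCrc helper above: the page header stores the checksum as a signed 32-bit int, so it is widened with & 0xffffffffL before being compared to CRC32.getValue(). A standalone illustration, with an arbitrary byte array standing in for a compressed page:

import java.nio.charset.StandardCharsets;
import java.util.zip.CRC32;

class PageCrcSketch {
  static boolean crcMatches(int storedCrc, byte[] pageBytes) {
    CRC32 crc = new CRC32();
    crc.update(pageBytes);
    return crc.getValue() == ((long) storedCrc & 0xffffffffL);
  }

  public static void main(String[] args) {
    byte[] page = "example page bytes".getBytes(StandardCharsets.US_ASCII);
    CRC32 crc = new CRC32();
    crc.update(page);
    int stored = (int) crc.getValue();            // what a writer would put in the page header
    System.out.println(crcMatches(stored, page)); // true
  }
}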
- pageBytes = this.readAsBytesInput(compressedPageSize); - if (options.usePageChecksumVerification() && pageHeader.isSetCrc()) { - verifyCrc( - pageHeader.getCrc(), - pageBytes.toByteArray(), - "could not verify page integrity, CRC checksum verification failed"); - } - DataPageV1 dataPageV1 = - new DataPageV1( - pageBytes, - dataHeaderV1.getNum_values(), - uncompressedPageSize, - converter.fromParquetStatistics( - getFileMetaData().getCreatedBy(), dataHeaderV1.getStatistics(), type), - converter.getEncoding(dataHeaderV1.getRepetition_level_encoding()), - converter.getEncoding(dataHeaderV1.getDefinition_level_encoding()), - converter.getEncoding(dataHeaderV1.getEncoding())); - // Copy crc to new page, used for testing - if (pageHeader.isSetCrc()) { - dataPageV1.setCrc(pageHeader.getCrc()); - } - pagesInChunk.add(dataPageV1); - valuesCountReadSoFar += dataHeaderV1.getNum_values(); - ++dataPageCountReadSoFar; - break; - case DATA_PAGE_V2: - DataPageHeaderV2 dataHeaderV2 = pageHeader.getData_page_header_v2(); - int dataSize = - compressedPageSize - - dataHeaderV2.getRepetition_levels_byte_length() - - dataHeaderV2.getDefinition_levels_byte_length(); - pagesInChunk.add( - new DataPageV2( - dataHeaderV2.getNum_rows(), - dataHeaderV2.getNum_nulls(), - dataHeaderV2.getNum_values(), - this.readAsBytesInput(dataHeaderV2.getRepetition_levels_byte_length()), - this.readAsBytesInput(dataHeaderV2.getDefinition_levels_byte_length()), - converter.getEncoding(dataHeaderV2.getEncoding()), - this.readAsBytesInput(dataSize), - uncompressedPageSize, - converter.fromParquetStatistics( - getFileMetaData().getCreatedBy(), dataHeaderV2.getStatistics(), type), - dataHeaderV2.isIs_compressed())); - valuesCountReadSoFar += dataHeaderV2.getNum_values(); - ++dataPageCountReadSoFar; - break; - default: - LOG.debug( - "skipping page of type {} of size {}", pageHeader.getType(), compressedPageSize); - stream.skipFully(compressedPageSize); - break; - } - } - if (offsetIndex == null && valuesCountReadSoFar != descriptor.metadata.getValueCount()) { - // Would be nice to have a CorruptParquetFileException or something as a subclass? - throw new IOException( - "Expected " - + descriptor.metadata.getValueCount() - + " values in column chunk at " - + getFile() - + " offset " - + descriptor.metadata.getFirstDataPageOffset() - + " but got " - + valuesCountReadSoFar - + " values instead over " - + pagesInChunk.size() - + " pages ending at file offset " - + (descriptor.fileOffset + stream.position())); - } - BytesInputDecompressor decompressor = - options.getCodecFactory().getDecompressor(descriptor.metadata.getCodec()); - return new ColumnChunkPageReader( - decompressor, - pagesInChunk, - dictionaryPage, - offsetIndex, - rowCount, - pageBlockDecryptor, - aadPrefix, - rowGroupOrdinal, - columnOrdinal); - } - - private boolean hasMorePages(long valuesCountReadSoFar, int dataPageCountReadSoFar) { - return offsetIndex == null - ? 
valuesCountReadSoFar < descriptor.metadata.getValueCount() - : dataPageCountReadSoFar < offsetIndex.getPageCount(); - } - - private int getPageOrdinal(int dataPageCountReadSoFar) { - if (null == offsetIndex) { - return dataPageCountReadSoFar; - } - - return offsetIndex.getPageOrdinal(dataPageCountReadSoFar); - } - - /** - * @param size the size of the page - * @return the page - * @throws IOException if there is an error while reading from the file stream - */ - public BytesInput readAsBytesInput(int size) throws IOException { - return BytesInput.from(stream.sliceBuffers(size)); - } - } - - /** deals with a now fixed bug where compressedLength was missing a few bytes. */ - private class WorkaroundChunk extends Chunk { - - private final SeekableInputStream f; - - /** - * @param descriptor the descriptor of the chunk - * @param f the file stream positioned at the end of this chunk - */ - private WorkaroundChunk( - ChunkDescriptor descriptor, - List buffers, - SeekableInputStream f, - OffsetIndex offsetIndex, - long rowCount) { - super(descriptor, buffers, offsetIndex, rowCount); - this.f = f; - } - - protected PageHeader readPageHeader() throws IOException { - PageHeader pageHeader; - stream.mark(8192); // headers should not be larger than 8k - try { - pageHeader = Util.readPageHeader(stream); - } catch (IOException e) { - // this is to workaround a bug where the compressedLength - // of the chunk is missing the size of the header of the dictionary - // to allow reading older files (using dictionary) we need this. - // usually 13 to 19 bytes are missing - // if the last page is smaller than this, the page header itself is truncated in the buffer. - stream.reset(); // resetting the buffer to the position before we got the error - LOG.info("completing the column chunk to read the page header"); - pageHeader = - Util.readPageHeader( - new SequenceInputStream( - stream, f)); // trying again from the buffer + remainder of the stream. - } - return pageHeader; - } - - public BytesInput readAsBytesInput(int size) throws IOException { - int available = stream.available(); - if (size > available) { - // this is to workaround a bug where the compressedLength - // of the chunk is missing the size of the header of the dictionary - // to allow reading older files (using dictionary) we need this. - // usually 13 to 19 bytes are missing - int missingBytes = size - available; - LOG.info("completed the column chunk with {} bytes", missingBytes); - - List streamBuffers = stream.sliceBuffers(available); - - ByteBuffer lastBuffer = ByteBuffer.allocate(missingBytes); - f.readFully(lastBuffer); - - List buffers = new ArrayList<>(streamBuffers.size() + 1); - buffers.addAll(streamBuffers); - buffers.add(lastBuffer); - - return BytesInput.from(buffers); - } - - return super.readAsBytesInput(size); - } - } - - /** Information needed to read a column chunk or a part of it. 
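The readAsBytesInput override above papers over that old writer bug by pulling the few missing bytes straight from the file stream and appending them to the buffered slices. A simplified, in-memory sketch of the completion step; the sizes and the remainderSource stand-in are hypothetical:

import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.List;

class ChunkCompletionSketch {
  // If the buffered bytes fall short of the requested size, append the missing tail
  // from a second source (the real code reads it from the underlying file stream).
  static List<ByteBuffer> complete(ByteBuffer buffered, int requestedSize, byte[] remainderSource) {
    List<ByteBuffer> buffers = new ArrayList<>();
    buffers.add(buffered);
    int missing = requestedSize - buffered.remaining();
    if (missing > 0) {
      buffers.add(ByteBuffer.wrap(remainderSource, 0, missing));
    }
    return buffers;
  }

  public static void main(String[] args) {
    ByteBuffer buffered = ByteBuffer.wrap(new byte[10]);   // only 10 bytes were buffered
    byte[] rest = new byte[64];                            // stand-in for the rest of the file
    List<ByteBuffer> parts = complete(buffered, 15, rest); // 5 bytes are missing
    int total = parts.stream().mapToInt(ByteBuffer::remaining).sum();
    System.out.println(total);                             // 15
  }
}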
*/ - private static class ChunkDescriptor { - - private final ColumnDescriptor col; - private final ColumnChunkMetaData metadata; - private final long fileOffset; - private final long size; - - /** - * @param col column this chunk is part of - * @param metadata metadata for the column - * @param fileOffset offset in the file where this chunk starts - * @param size size of the chunk - */ - private ChunkDescriptor( - ColumnDescriptor col, ColumnChunkMetaData metadata, long fileOffset, long size) { - super(); - this.col = col; - this.metadata = metadata; - this.fileOffset = fileOffset; - this.size = size; - } - - @Override - public int hashCode() { - return col.hashCode(); - } - - @Override - public boolean equals(Object obj) { - if (this == obj) { - return true; - } else if (obj instanceof ChunkDescriptor) { - return col.equals(((ChunkDescriptor) obj).col); - } else { - return false; - } - } - } - - /** - * Describes a list of consecutive parts to be read at once. A consecutive part may contain whole - * column chunks or only parts of them (some pages). - */ - private class ConsecutivePartList { - - private final long offset; - private long length; - private final List chunks = new ArrayList<>(); - - /** @param offset where the first chunk starts */ - ConsecutivePartList(long offset) { - this.offset = offset; - } - - /** - * adds a chunk to the list. It must be consecutive to the previous chunk - * - * @param descriptor a chunk descriptor - */ - public void addChunk(ChunkDescriptor descriptor) { - chunks.add(descriptor); - length += descriptor.size; - } - - /** - * @param f file to read the chunks from - * @param builder used to build chunk list to read the pages for the different columns - * @throws IOException if there is an error while reading from the stream - */ - public void readAll(SeekableInputStream f, ChunkListBuilder builder) throws IOException { - f.seek(offset); - - int fullAllocations = Math.toIntExact(length / options.getMaxAllocationSize()); - int lastAllocationSize = Math.toIntExact(length % options.getMaxAllocationSize()); - - int numAllocations = fullAllocations + (lastAllocationSize > 0 ? 1 : 0); - List buffers = new ArrayList<>(numAllocations); - - for (int i = 0; i < fullAllocations; i += 1) { - buffers.add(options.getAllocator().allocate(options.getMaxAllocationSize())); - } - - if (lastAllocationSize > 0) { - buffers.add(options.getAllocator().allocate(lastAllocationSize)); - } - - for (ByteBuffer buffer : buffers) { - f.readFully(buffer); - buffer.flip(); - } - - ByteBufferInputStream stream = ByteBufferInputStream.wrap(buffers); - for (final ChunkDescriptor descriptor : chunks) { - builder.add(descriptor, stream.sliceBuffers(descriptor.size), f); - } - } - - /** @return the position following the last byte of these chunks */ - public long endPos() { - return offset + length; - } - } -} diff --git a/src/main/java/org/apache/parquet/local/ParquetFileWriter.java b/src/main/java/org/apache/parquet/local/ParquetFileWriter.java deleted file mode 100644 index f85885c..0000000 --- a/src/main/java/org/apache/parquet/local/ParquetFileWriter.java +++ /dev/null @@ -1,1266 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * copied from parquet-mr, updated by An Qi - */ - -package org.apache.parquet.local; - -import org.apache.parquet.Preconditions; -import org.apache.parquet.Version; -import org.apache.parquet.bytes.BytesInput; -import org.apache.parquet.bytes.BytesUtils; -import org.apache.parquet.column.ColumnDescriptor; -import org.apache.parquet.column.Encoding; -import org.apache.parquet.column.EncodingStats; -import org.apache.parquet.column.page.DictionaryPage; -import org.apache.parquet.column.statistics.Statistics; -import org.apache.parquet.column.values.bloomfilter.BloomFilter; -import org.apache.parquet.crypto.*; -import org.apache.parquet.crypto.ModuleCipherFactory.ModuleType; -import org.apache.parquet.format.BlockCipher; -import org.apache.parquet.format.Util; -import org.apache.parquet.hadoop.metadata.*; -import org.apache.parquet.internal.column.columnindex.ColumnIndex; -import org.apache.parquet.internal.column.columnindex.ColumnIndexBuilder; -import org.apache.parquet.internal.column.columnindex.OffsetIndex; -import org.apache.parquet.internal.column.columnindex.OffsetIndexBuilder; -import org.apache.parquet.internal.hadoop.metadata.IndexReference; -import org.apache.parquet.io.OutputFile; -import org.apache.parquet.io.ParquetEncodingException; -import org.apache.parquet.io.PositionOutputStream; -import org.apache.parquet.schema.MessageType; -import org.apache.parquet.schema.PrimitiveType; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.ByteArrayOutputStream; -import java.io.IOException; -import java.nio.charset.StandardCharsets; -import java.util.*; -import java.util.zip.CRC32; - -import static org.apache.parquet.format.Util.writeFileCryptoMetaData; -import static org.apache.parquet.format.Util.writeFileMetaData; -import static org.apache.parquet.format.converter.ParquetMetadataConverter.MAX_STATS_SIZE; - -/** Internal implementation of the Parquet file writer as a block container */ -public class ParquetFileWriter { - private static final Logger LOG = LoggerFactory.getLogger(ParquetFileWriter.class); - - private final ParquetMetadataConverter metadataConverter; - - public static final String PARQUET_METADATA_FILE = "_metadata"; - public static final String MAGIC_STR = "PAR1"; - public static final byte[] MAGIC = MAGIC_STR.getBytes(StandardCharsets.US_ASCII); - public static final String EF_MAGIC_STR = "PARE"; - public static final byte[] EFMAGIC = EF_MAGIC_STR.getBytes(StandardCharsets.US_ASCII); - public static final String PARQUET_COMMON_METADATA_FILE = "_common_metadata"; - public static final int CURRENT_VERSION = 1; - - // File creation modes - public static enum Mode { - CREATE, - OVERWRITE - } - - protected final PositionOutputStream out; - - private final AlignmentStrategy alignment; - private final int columnIndexTruncateLength; - - // file data - private List blocks = new ArrayList(); - - // The column/offset indexes per blocks per column chunks - private final List> columnIndexes = new ArrayList<>(); - private final List> offsetIndexes = new ArrayList<>(); - - // The Bloom filters - private final List> bloomFilters = 
new ArrayList<>(); - - // The file encryptor - private final InternalFileEncryptor fileEncryptor; - - // row group data - private BlockMetaData currentBlock; // appended to by endColumn - - // The column/offset indexes for the actual block - private List currentColumnIndexes; - private List currentOffsetIndexes; - - // The Bloom filter for the actual block - private Map currentBloomFilters; - - // row group data set at the start of a row group - private long currentRecordCount; // set in startBlock - - // column chunk data accumulated as pages are written - private EncodingStats.Builder encodingStatsBuilder; - private Set currentEncodings; - private long uncompressedLength; - private long compressedLength; - private Statistics currentStatistics; // accumulated in writePage(s) - private ColumnIndexBuilder columnIndexBuilder; - private OffsetIndexBuilder offsetIndexBuilder; - - // column chunk data set at the start of a column - private CompressionCodecName currentChunkCodec; // set in startColumn - private ColumnPath currentChunkPath; // set in startColumn - private PrimitiveType currentChunkType; // set in startColumn - private long currentChunkValueCount; // set in startColumn - private long currentChunkFirstDataPage; // set in startColumn & page writes - private long currentChunkDictionaryPageOffset; // set in writeDictionaryPage - - // set when end is called - private ParquetMetadata footer = null; - - private final CRC32 crc; - private boolean pageWriteChecksumEnabled; - - /** Captures the order in which methods should be called */ - private enum STATE { - NOT_STARTED { - STATE start() { - return STARTED; - } - }, - STARTED { - STATE startBlock() { - return BLOCK; - } - - STATE end() { - return ENDED; - } - }, - BLOCK { - STATE startColumn() { - return COLUMN; - } - - STATE endBlock() { - return STARTED; - } - }, - COLUMN { - STATE endColumn() { - return BLOCK; - }; - - STATE write() { - return this; - } - }, - ENDED; - - STATE start() throws IOException { - return error(); - } - - STATE startBlock() throws IOException { - return error(); - } - - STATE startColumn() throws IOException { - return error(); - } - - STATE write() throws IOException { - return error(); - } - - STATE endColumn() throws IOException { - return error(); - } - - STATE endBlock() throws IOException { - return error(); - } - - STATE end() throws IOException { - return error(); - } - - private final STATE error() throws IOException { - throw new IOException( - "The file being written is in an invalid state. Probably caused by an error thrown previously. Current state: " - + this.name()); - } - } - - private STATE state = STATE.NOT_STARTED; - - public ParquetFileWriter(OutputFile file, ParquetWriteOptions options) throws IOException { - this( - file, - options.isEnableOverwrite() ? 
Mode.OVERWRITE : Mode.CREATE, - options.getRowGroupSize(), - options.getMaxPaddingSize(), - options.getParquetProperties().getColumnIndexTruncateLength(), - options.getParquetProperties().getStatisticsTruncateLength(), - options.getParquetProperties().getPageWriteChecksumEnabled(), - options.getEncryptionProperties(), - null); - } - - private ParquetFileWriter( - OutputFile file, - Mode mode, - long rowGroupSize, - int maxPaddingSize, - int columnIndexTruncateLength, - int statisticsTruncateLength, - boolean pageWriteChecksumEnabled, - FileEncryptionProperties encryptionProperties, - InternalFileEncryptor encryptor) - throws IOException { - - long blockSize = rowGroupSize; - if (file.supportsBlockSize()) { - blockSize = Math.max(file.defaultBlockSize(), rowGroupSize); - this.alignment = PaddingAlignment.get(blockSize, rowGroupSize, maxPaddingSize); - } else { - this.alignment = NoAlignment.get(rowGroupSize); - } - - if (mode == Mode.OVERWRITE) { - this.out = file.createOrOverwrite(blockSize); - } else { - this.out = file.create(blockSize); - } - - this.encodingStatsBuilder = new EncodingStats.Builder(); - this.columnIndexTruncateLength = columnIndexTruncateLength; - this.pageWriteChecksumEnabled = pageWriteChecksumEnabled; - this.crc = pageWriteChecksumEnabled ? new CRC32() : null; - - this.metadataConverter = new ParquetMetadataConverter(statisticsTruncateLength); - - if (null == encryptionProperties && null == encryptor) { - this.fileEncryptor = null; - return; - } - - if (null == encryptionProperties) { - encryptionProperties = encryptor.getEncryptionProperties(); - } - - if (null == encryptor) { - this.fileEncryptor = new InternalFileEncryptor(encryptionProperties); - } else { - this.fileEncryptor = encryptor; - } - } - - /** - * start the file - * - * @throws IOException if there is an error while writing - */ - public void start() throws IOException { - state = state.start(); - LOG.debug("{}: start", out.getPos()); - byte[] magic = MAGIC; - if (null != fileEncryptor && fileEncryptor.isFooterEncrypted()) { - magic = EFMAGIC; - } - out.write(magic); - } - - public InternalFileEncryptor getEncryptor() { - return fileEncryptor; - } - - /** - * start a block - * - * @param recordCount the record count in this block - * @throws IOException if there is an error while writing - */ - public void startBlock(long recordCount) throws IOException { - state = state.startBlock(); - LOG.debug("{}: start block", out.getPos()); - // out.write(MAGIC); // TODO: add a magic delimiter - - alignment.alignForRowGroup(out); - - currentBlock = new BlockMetaData(); - currentRecordCount = recordCount; - - currentColumnIndexes = new ArrayList<>(); - currentOffsetIndexes = new ArrayList<>(); - - currentBloomFilters = new HashMap<>(); - } - - /** - * start a column inside a block - * - * @param descriptor the column descriptor - * @param valueCount the value count in this column - * @param compressionCodecName a compression codec name - * @throws IOException if there is an error while writing - */ - public void startColumn( - ColumnDescriptor descriptor, long valueCount, CompressionCodecName compressionCodecName) - throws IOException { - state = state.startColumn(); - encodingStatsBuilder.clear(); - currentEncodings = new HashSet(); - currentChunkPath = ColumnPath.get(descriptor.getPath()); - currentChunkType = descriptor.getPrimitiveType(); - currentChunkCodec = compressionCodecName; - currentChunkValueCount = valueCount; - currentChunkFirstDataPage = -1; - compressedLength = 0; - uncompressedLength = 0; - // The 
statistics will be copied from the first one added at writeDataPage(s) so we have the - // correct typed one - currentStatistics = null; - - columnIndexBuilder = ColumnIndexBuilder.getBuilder(currentChunkType, columnIndexTruncateLength); - offsetIndexBuilder = OffsetIndexBuilder.getBuilder(); - } - - /** - * writes a dictionary page page - * - * @param dictionaryPage the dictionary page - * @throws IOException if there is an error while writing - */ - public void writeDictionaryPage(DictionaryPage dictionaryPage) throws IOException { - writeDictionaryPage(dictionaryPage, null, null); - } - - public void writeDictionaryPage( - DictionaryPage dictionaryPage, BlockCipher.Encryptor headerBlockEncryptor, byte[] AAD) - throws IOException { - state = state.write(); - LOG.debug( - "{}: write dictionary page: {} values", out.getPos(), dictionaryPage.getDictionarySize()); - currentChunkDictionaryPageOffset = out.getPos(); - int uncompressedSize = dictionaryPage.getUncompressedSize(); - int compressedPageSize = (int) dictionaryPage.getBytes().size(); // TODO: fix casts - if (pageWriteChecksumEnabled) { - crc.reset(); - crc.update(dictionaryPage.getBytes().toByteArray()); - metadataConverter.writeDictionaryPageHeader( - uncompressedSize, - compressedPageSize, - dictionaryPage.getDictionarySize(), - dictionaryPage.getEncoding(), - (int) crc.getValue(), - out, - headerBlockEncryptor, - AAD); - } else { - metadataConverter.writeDictionaryPageHeader( - uncompressedSize, - compressedPageSize, - dictionaryPage.getDictionarySize(), - dictionaryPage.getEncoding(), - out, - headerBlockEncryptor, - AAD); - } - long headerSize = out.getPos() - currentChunkDictionaryPageOffset; - this.uncompressedLength += uncompressedSize + headerSize; - this.compressedLength += compressedPageSize + headerSize; - LOG.debug("{}: write dictionary page content {}", out.getPos(), compressedPageSize); - dictionaryPage - .getBytes() - .writeAllTo(out); // for encrypted column, dictionary page bytes are already encrypted - encodingStatsBuilder.addDictEncoding(dictionaryPage.getEncoding()); - currentEncodings.add(dictionaryPage.getEncoding()); - } - - /** - * writes a single page - * - * @param valueCount count of values - * @param uncompressedPageSize the size of the data once uncompressed - * @param bytes the compressed data for the page without header - * @param rlEncoding encoding of the repetition level - * @param dlEncoding encoding of the definition level - * @param valuesEncoding encoding of values - * @throws IOException if there is an error while writing - */ - @Deprecated - public void writeDataPage( - int valueCount, - int uncompressedPageSize, - BytesInput bytes, - Encoding rlEncoding, - Encoding dlEncoding, - Encoding valuesEncoding) - throws IOException { - state = state.write(); - // We are unable to build indexes without rowCount so skip them for this column - offsetIndexBuilder = OffsetIndexBuilder.getNoOpBuilder(); - columnIndexBuilder = ColumnIndexBuilder.getNoOpBuilder(); - long beforeHeader = out.getPos(); - LOG.debug("{}: write data page: {} values", beforeHeader, valueCount); - int compressedPageSize = (int) bytes.size(); - metadataConverter.writeDataPageV1Header( - uncompressedPageSize, - compressedPageSize, - valueCount, - rlEncoding, - dlEncoding, - valuesEncoding, - out); - long headerSize = out.getPos() - beforeHeader; - this.uncompressedLength += uncompressedPageSize + headerSize; - this.compressedLength += compressedPageSize + headerSize; - LOG.debug("{}: write data page content {}", out.getPos(), 
compressedPageSize); - bytes.writeAllTo(out); - encodingStatsBuilder.addDataEncoding(valuesEncoding); - currentEncodings.add(rlEncoding); - currentEncodings.add(dlEncoding); - currentEncodings.add(valuesEncoding); - if (currentChunkFirstDataPage < 0) { - currentChunkFirstDataPage = beforeHeader; - } - } - - /** - * writes a single page - * - * @param valueCount count of values - * @param uncompressedPageSize the size of the data once uncompressed - * @param bytes the compressed data for the page without header - * @param statistics statistics for the page - * @param rlEncoding encoding of the repetition level - * @param dlEncoding encoding of the definition level - * @param valuesEncoding encoding of values - * @throws IOException if there is an error while writing - * @deprecated this method does not support writing column indexes; Use {@link #writeDataPage(int, - * int, BytesInput, Statistics, long, Encoding, Encoding, Encoding)} instead - */ - @Deprecated - public void writeDataPage( - int valueCount, - int uncompressedPageSize, - BytesInput bytes, - Statistics statistics, - Encoding rlEncoding, - Encoding dlEncoding, - Encoding valuesEncoding) - throws IOException { - // We are unable to build indexes without rowCount so skip them for this column - offsetIndexBuilder = OffsetIndexBuilder.getNoOpBuilder(); - columnIndexBuilder = ColumnIndexBuilder.getNoOpBuilder(); - innerWriteDataPage( - valueCount, - uncompressedPageSize, - bytes, - statistics, - rlEncoding, - dlEncoding, - valuesEncoding, - null, - null); - } - - /** - * Writes a single page - * - * @param valueCount count of values - * @param uncompressedPageSize the size of the data once uncompressed - * @param bytes the compressed data for the page without header - * @param statistics the statistics of the page - * @param rowCount the number of rows in the page - * @param rlEncoding encoding of the repetition level - * @param dlEncoding encoding of the definition level - * @param valuesEncoding encoding of values - * @throws IOException if any I/O error occurs during writing the file - */ - public void writeDataPage( - int valueCount, - int uncompressedPageSize, - BytesInput bytes, - Statistics statistics, - long rowCount, - Encoding rlEncoding, - Encoding dlEncoding, - Encoding valuesEncoding) - throws IOException { - writeDataPage( - valueCount, - uncompressedPageSize, - bytes, - statistics, - rowCount, - rlEncoding, - dlEncoding, - valuesEncoding, - null, - null); - } - - /** - * Writes a single page - * - * @param valueCount count of values - * @param uncompressedPageSize the size of the data once uncompressed - * @param bytes the compressed data for the page without header - * @param statistics the statistics of the page - * @param rowCount the number of rows in the page - * @param rlEncoding encoding of the repetition level - * @param dlEncoding encoding of the definition level - * @param valuesEncoding encoding of values - * @param metadataBlockEncryptor encryptor for block data - * @param pageHeaderAAD pageHeader AAD - * @throws IOException if any I/O error occurs during writing the file - */ - public void writeDataPage( - int valueCount, - int uncompressedPageSize, - BytesInput bytes, - Statistics statistics, - long rowCount, - Encoding rlEncoding, - Encoding dlEncoding, - Encoding valuesEncoding, - BlockCipher.Encryptor metadataBlockEncryptor, - byte[] pageHeaderAAD) - throws IOException { - long beforeHeader = out.getPos(); - innerWriteDataPage( - valueCount, - uncompressedPageSize, - bytes, - statistics, - rlEncoding, 
- dlEncoding, - valuesEncoding, - metadataBlockEncryptor, - pageHeaderAAD); - offsetIndexBuilder.add((int) (out.getPos() - beforeHeader), rowCount); - } - - private void innerWriteDataPage( - int valueCount, - int uncompressedPageSize, - BytesInput bytes, - Statistics statistics, - Encoding rlEncoding, - Encoding dlEncoding, - Encoding valuesEncoding, - BlockCipher.Encryptor metadataBlockEncryptor, - byte[] pageHeaderAAD) - throws IOException { - writeDataPage( - valueCount, - uncompressedPageSize, - bytes, - statistics, - rlEncoding, - dlEncoding, - valuesEncoding, - metadataBlockEncryptor, - pageHeaderAAD); - } - - /** - * writes a single page - * - * @param valueCount count of values - * @param uncompressedPageSize the size of the data once uncompressed - * @param bytes the compressed data for the page without header - * @param statistics statistics for the page - * @param rlEncoding encoding of the repetition level - * @param dlEncoding encoding of the definition level - * @param valuesEncoding encoding of values - * @param metadataBlockEncryptor encryptor for block data - * @param pageHeaderAAD pageHeader AAD - * @throws IOException if there is an error while writing - */ - public void writeDataPage( - int valueCount, - int uncompressedPageSize, - BytesInput bytes, - Statistics statistics, - Encoding rlEncoding, - Encoding dlEncoding, - Encoding valuesEncoding, - BlockCipher.Encryptor metadataBlockEncryptor, - byte[] pageHeaderAAD) - throws IOException { - state = state.write(); - long beforeHeader = out.getPos(); - if (currentChunkFirstDataPage < 0) { - currentChunkFirstDataPage = beforeHeader; - } - LOG.debug("{}: write data page: {} values", beforeHeader, valueCount); - int compressedPageSize = (int) bytes.size(); - if (pageWriteChecksumEnabled) { - crc.reset(); - crc.update(bytes.toByteArray()); - metadataConverter.writeDataPageV1Header( - uncompressedPageSize, - compressedPageSize, - valueCount, - rlEncoding, - dlEncoding, - valuesEncoding, - (int) crc.getValue(), - out, - metadataBlockEncryptor, - pageHeaderAAD); - } else { - metadataConverter.writeDataPageV1Header( - uncompressedPageSize, - compressedPageSize, - valueCount, - rlEncoding, - dlEncoding, - valuesEncoding, - out, - metadataBlockEncryptor, - pageHeaderAAD); - } - long headerSize = out.getPos() - beforeHeader; - this.uncompressedLength += uncompressedPageSize + headerSize; - this.compressedLength += compressedPageSize + headerSize; - LOG.debug("{}: write data page content {}", out.getPos(), compressedPageSize); - bytes.writeAllTo(out); - - // Copying the statistics if it is not initialized yet so we have the correct typed one - if (currentStatistics == null) { - currentStatistics = statistics.copy(); - } else { - currentStatistics.mergeStatistics(statistics); - } - - columnIndexBuilder.add(statistics); - - encodingStatsBuilder.addDataEncoding(valuesEncoding); - currentEncodings.add(rlEncoding); - currentEncodings.add(dlEncoding); - currentEncodings.add(valuesEncoding); - } - - /** - * Add a Bloom filter that will be written out. This is only used in unit test. 
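- *     <p>The key is the dot-joined column path (the form produced by ColumnPath.toDotString()); a hypothetical call looks like writer.addBloomFilter("a.b.c", bloomFilter).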
- * - * @param column the column name - * @param bloomFilter the bloom filter of column values - */ - void addBloomFilter(String column, BloomFilter bloomFilter) { - currentBloomFilters.put(column, bloomFilter); - } - - /** - * Writes a single v2 data page - * - * @param rowCount count of rows - * @param nullCount count of nulls - * @param valueCount count of values - * @param repetitionLevels repetition level bytes - * @param definitionLevels definition level bytes - * @param dataEncoding encoding for data - * @param compressedData compressed data bytes - * @param uncompressedDataSize the size of uncompressed data - * @param statistics the statistics of the page - * @throws IOException if any I/O error occurs during writing the file - */ - public void writeDataPageV2( - int rowCount, - int nullCount, - int valueCount, - BytesInput repetitionLevels, - BytesInput definitionLevels, - Encoding dataEncoding, - BytesInput compressedData, - int uncompressedDataSize, - Statistics statistics) - throws IOException { - state = state.write(); - int rlByteLength = toIntWithCheck(repetitionLevels.size()); - int dlByteLength = toIntWithCheck(definitionLevels.size()); - - int compressedSize = - toIntWithCheck(compressedData.size() + repetitionLevels.size() + definitionLevels.size()); - - int uncompressedSize = - toIntWithCheck(uncompressedDataSize + repetitionLevels.size() + definitionLevels.size()); - - long beforeHeader = out.getPos(); - if (currentChunkFirstDataPage < 0) { - currentChunkFirstDataPage = beforeHeader; - } - - metadataConverter.writeDataPageV2Header( - uncompressedSize, - compressedSize, - valueCount, - nullCount, - rowCount, - dataEncoding, - rlByteLength, - dlByteLength, - out); - - long headersSize = out.getPos() - beforeHeader; - this.uncompressedLength += uncompressedSize + headersSize; - this.compressedLength += compressedSize + headersSize; - - if (currentStatistics == null) { - currentStatistics = statistics.copy(); - } else { - currentStatistics.mergeStatistics(statistics); - } - - columnIndexBuilder.add(statistics); - currentEncodings.add(dataEncoding); - encodingStatsBuilder.addDataEncoding(dataEncoding); - - BytesInput.concat(repetitionLevels, definitionLevels, compressedData).writeAllTo(out); - - offsetIndexBuilder.add((int) (out.getPos() - beforeHeader), rowCount); - } - - /** - * Writes a column chunk at once - * - * @param descriptor the descriptor of the column - * @param valueCount the value count in this column - * @param compressionCodecName the name of the compression codec used for compressing the pages - * @param dictionaryPage the dictionary page for this column chunk (might be null) - * @param bytes the encoded pages including page headers to be written as is - * @param uncompressedTotalPageSize total uncompressed size (without page headers) - * @param compressedTotalPageSize total compressed size (without page headers) - * @param totalStats accumulated statistics for the column chunk - * @param columnIndexBuilder the builder object for the column index - * @param offsetIndexBuilder the builder object for the offset index - * @param bloomFilter the bloom filter for this column - * @param rlEncodings the RL encodings used in this column chunk - * @param dlEncodings the DL encodings used in this column chunk - * @param dataEncodings the data encodings used in this column chunk - * @throws IOException if there is an error while writing - */ - void writeColumnChunk( - ColumnDescriptor descriptor, - long valueCount, - CompressionCodecName compressionCodecName, - 
DictionaryPage dictionaryPage, - BytesInput bytes, - long uncompressedTotalPageSize, - long compressedTotalPageSize, - Statistics totalStats, - ColumnIndexBuilder columnIndexBuilder, - OffsetIndexBuilder offsetIndexBuilder, - BloomFilter bloomFilter, - Set rlEncodings, - Set dlEncodings, - List dataEncodings) - throws IOException { - writeColumnChunk( - descriptor, - valueCount, - compressionCodecName, - dictionaryPage, - bytes, - uncompressedTotalPageSize, - compressedTotalPageSize, - totalStats, - columnIndexBuilder, - offsetIndexBuilder, - bloomFilter, - rlEncodings, - dlEncodings, - dataEncodings, - null, - 0, - 0, - null); - } - - void writeColumnChunk( - ColumnDescriptor descriptor, - long valueCount, - CompressionCodecName compressionCodecName, - DictionaryPage dictionaryPage, - BytesInput bytes, - long uncompressedTotalPageSize, - long compressedTotalPageSize, - Statistics totalStats, - ColumnIndexBuilder columnIndexBuilder, - OffsetIndexBuilder offsetIndexBuilder, - BloomFilter bloomFilter, - Set rlEncodings, - Set dlEncodings, - List dataEncodings, - BlockCipher.Encryptor headerBlockEncryptor, - int rowGroupOrdinal, - int columnOrdinal, - byte[] fileAAD) - throws IOException { - startColumn(descriptor, valueCount, compressionCodecName); - - state = state.write(); - if (dictionaryPage != null) { - byte[] dictonaryPageHeaderAAD = null; - if (null != headerBlockEncryptor) { - dictonaryPageHeaderAAD = - AesCipher.createModuleAAD( - fileAAD, ModuleType.DictionaryPageHeader, rowGroupOrdinal, columnOrdinal, -1); - } - writeDictionaryPage(dictionaryPage, headerBlockEncryptor, dictonaryPageHeaderAAD); - } - - if (bloomFilter != null) { - // write bloom filter if one of data pages is not dictionary encoded - boolean isWriteBloomFilter = false; - for (Encoding encoding : dataEncodings) { - // dictionary encoding: `PLAIN_DICTIONARY` is used in parquet v1, `RLE_DICTIONARY` is used - // in parquet v2 - if (encoding != Encoding.PLAIN_DICTIONARY && encoding != Encoding.RLE_DICTIONARY) { - isWriteBloomFilter = true; - break; - } - } - if (isWriteBloomFilter) { - currentBloomFilters.put(String.join(".", descriptor.getPath()), bloomFilter); - } - } - LOG.debug("{}: write data pages", out.getPos()); - long headersSize = bytes.size() - compressedTotalPageSize; - this.uncompressedLength += uncompressedTotalPageSize + headersSize; - this.compressedLength += compressedTotalPageSize + headersSize; - LOG.debug("{}: write data pages content", out.getPos()); - currentChunkFirstDataPage = out.getPos(); - bytes.writeAllTo(out); - encodingStatsBuilder.addDataEncodings(dataEncodings); - if (rlEncodings.isEmpty()) { - encodingStatsBuilder.withV2Pages(); - } - currentEncodings.addAll(rlEncodings); - currentEncodings.addAll(dlEncodings); - currentEncodings.addAll(dataEncodings); - currentStatistics = totalStats; - - this.columnIndexBuilder = columnIndexBuilder; - this.offsetIndexBuilder = offsetIndexBuilder; - - endColumn(); - } - - /** - * end a column (once all rep, def and data have been written) - * - * @throws IOException if there is an error while writing - */ - public void endColumn() throws IOException { - state = state.endColumn(); - LOG.debug("{}: end column", out.getPos()); - if (columnIndexBuilder.getMinMaxSize() > columnIndexBuilder.getPageCount() * MAX_STATS_SIZE) { - currentColumnIndexes.add(null); - } else { - currentColumnIndexes.add(columnIndexBuilder.build()); - } - currentOffsetIndexes.add(offsetIndexBuilder.build(currentChunkFirstDataPage)); - currentBlock.addColumn( - ColumnChunkMetaData.get( 
- currentChunkPath, - currentChunkType, - currentChunkCodec, - encodingStatsBuilder.build(), - currentEncodings, - currentStatistics, - currentChunkFirstDataPage, - currentChunkDictionaryPageOffset, - currentChunkValueCount, - compressedLength, - uncompressedLength)); - this.currentBlock.setTotalByteSize(currentBlock.getTotalByteSize() + uncompressedLength); - this.uncompressedLength = 0; - this.compressedLength = 0; - this.currentChunkDictionaryPageOffset = 0; - columnIndexBuilder = null; - offsetIndexBuilder = null; - } - - /** - * ends a block once all column chunks have been written - * - * @throws IOException if there is an error while writing - */ - public void endBlock() throws IOException { - if (currentRecordCount == 0) { - throw new ParquetEncodingException("End block with zero record"); - } - - state = state.endBlock(); - LOG.debug("{}: end block", out.getPos()); - currentBlock.setRowCount(currentRecordCount); - currentBlock.setOrdinal(blocks.size()); - blocks.add(currentBlock); - columnIndexes.add(currentColumnIndexes); - offsetIndexes.add(currentOffsetIndexes); - bloomFilters.add(currentBloomFilters); - currentColumnIndexes = null; - currentOffsetIndexes = null; - currentBloomFilters = null; - currentBlock = null; - } - - public void end(MessageType schema, Map extraMetaData) throws IOException { - state = state.end(); - serializeColumnIndexes(columnIndexes, blocks, out, fileEncryptor); - serializeOffsetIndexes(offsetIndexes, blocks, out, fileEncryptor); - serializeBloomFilters(bloomFilters, blocks, out, fileEncryptor); - LOG.debug("{}: end", out.getPos()); - this.footer = - new ParquetMetadata(new FileMetaData(schema, extraMetaData, Version.FULL_VERSION), blocks); - serializeFooter(footer, out, fileEncryptor, metadataConverter); - out.close(); - } - - private static void serializeColumnIndexes( - List> columnIndexes, - List blocks, - PositionOutputStream out, - InternalFileEncryptor fileEncryptor) - throws IOException { - LOG.debug("{}: column indexes", out.getPos()); - for (int bIndex = 0, bSize = blocks.size(); bIndex < bSize; ++bIndex) { - BlockMetaData block = blocks.get(bIndex); - List columns = block.getColumns(); - List blockColumnIndexes = columnIndexes.get(bIndex); - for (int cIndex = 0, cSize = columns.size(); cIndex < cSize; ++cIndex) { - ColumnChunkMetaData column = columns.get(cIndex); - org.apache.parquet.format.ColumnIndex columnIndex = - ParquetMetadataConverter.toParquetColumnIndex( - column.getPrimitiveType(), blockColumnIndexes.get(cIndex)); - if (columnIndex == null) { - continue; - } - BlockCipher.Encryptor columnIndexEncryptor = null; - byte[] columnIndexAAD = null; - if (null != fileEncryptor) { - InternalColumnEncryptionSetup columnEncryptionSetup = - fileEncryptor.getColumnSetup(column.getPath(), false, cIndex); - if (columnEncryptionSetup.isEncrypted()) { - columnIndexEncryptor = columnEncryptionSetup.getMetaDataEncryptor(); - columnIndexAAD = - AesCipher.createModuleAAD( - fileEncryptor.getFileAAD(), - ModuleType.ColumnIndex, - block.getOrdinal(), - columnEncryptionSetup.getOrdinal(), - -1); - } - } - long offset = out.getPos(); - Util.writeColumnIndex(columnIndex, out, columnIndexEncryptor, columnIndexAAD); - column.setColumnIndexReference(new IndexReference(offset, (int) (out.getPos() - offset))); - } - } - } - - private int toIntWithCheck(long size) { - if ((int) size != size) { - throw new ParquetEncodingException( - "Cannot write page larger than " + Integer.MAX_VALUE + " bytes: " + size); - } - return (int) size; - } - - private static void 
serializeOffsetIndexes( - List> offsetIndexes, - List blocks, - PositionOutputStream out, - InternalFileEncryptor fileEncryptor) - throws IOException { - LOG.debug("{}: offset indexes", out.getPos()); - for (int bIndex = 0, bSize = blocks.size(); bIndex < bSize; ++bIndex) { - BlockMetaData block = blocks.get(bIndex); - List columns = block.getColumns(); - List blockOffsetIndexes = offsetIndexes.get(bIndex); - for (int cIndex = 0, cSize = columns.size(); cIndex < cSize; ++cIndex) { - OffsetIndex offsetIndex = blockOffsetIndexes.get(cIndex); - if (offsetIndex == null) { - continue; - } - ColumnChunkMetaData column = columns.get(cIndex); - BlockCipher.Encryptor offsetIndexEncryptor = null; - byte[] offsetIndexAAD = null; - if (null != fileEncryptor) { - InternalColumnEncryptionSetup columnEncryptionSetup = - fileEncryptor.getColumnSetup(column.getPath(), false, cIndex); - if (columnEncryptionSetup.isEncrypted()) { - offsetIndexEncryptor = columnEncryptionSetup.getMetaDataEncryptor(); - offsetIndexAAD = - AesCipher.createModuleAAD( - fileEncryptor.getFileAAD(), - ModuleType.OffsetIndex, - block.getOrdinal(), - columnEncryptionSetup.getOrdinal(), - -1); - } - } - long offset = out.getPos(); - Util.writeOffsetIndex( - ParquetMetadataConverter.toParquetOffsetIndex(offsetIndex), - out, - offsetIndexEncryptor, - offsetIndexAAD); - column.setOffsetIndexReference(new IndexReference(offset, (int) (out.getPos() - offset))); - } - } - } - - private static void serializeBloomFilters( - List> bloomFilters, - List blocks, - PositionOutputStream out, - InternalFileEncryptor fileEncryptor) - throws IOException { - LOG.debug("{}: bloom filters", out.getPos()); - for (int bIndex = 0, bSize = blocks.size(); bIndex < bSize; ++bIndex) { - BlockMetaData block = blocks.get(bIndex); - List columns = block.getColumns(); - Map blockBloomFilters = bloomFilters.get(bIndex); - if (blockBloomFilters.isEmpty()) continue; - for (int cIndex = 0, cSize = columns.size(); cIndex < cSize; ++cIndex) { - ColumnChunkMetaData column = columns.get(cIndex); - BloomFilter bloomFilter = blockBloomFilters.get(column.getPath().toDotString()); - if (bloomFilter == null) { - continue; - } - - long offset = out.getPos(); - column.setBloomFilterOffset(offset); - - BlockCipher.Encryptor bloomFilterEncryptor = null; - byte[] bloomFilterHeaderAAD = null; - byte[] bloomFilterBitsetAAD = null; - if (null != fileEncryptor) { - InternalColumnEncryptionSetup columnEncryptionSetup = - fileEncryptor.getColumnSetup(column.getPath(), false, cIndex); - if (columnEncryptionSetup.isEncrypted()) { - bloomFilterEncryptor = columnEncryptionSetup.getMetaDataEncryptor(); - int columnOrdinal = columnEncryptionSetup.getOrdinal(); - bloomFilterHeaderAAD = - AesCipher.createModuleAAD( - fileEncryptor.getFileAAD(), - ModuleType.BloomFilterHeader, - block.getOrdinal(), - columnOrdinal, - -1); - bloomFilterBitsetAAD = - AesCipher.createModuleAAD( - fileEncryptor.getFileAAD(), - ModuleType.BloomFilterBitset, - block.getOrdinal(), - columnOrdinal, - -1); - } - } - - Util.writeBloomFilterHeader( - ParquetMetadataConverter.toBloomFilterHeader(bloomFilter), - out, - bloomFilterEncryptor, - bloomFilterHeaderAAD); - - ByteArrayOutputStream tempOutStream = new ByteArrayOutputStream(); - bloomFilter.writeTo(tempOutStream); - byte[] serializedBitset = tempOutStream.toByteArray(); - if (null != bloomFilterEncryptor) { - serializedBitset = bloomFilterEncryptor.encrypt(serializedBitset, bloomFilterBitsetAAD); - } - out.write(serializedBitset); - } - } - } - - private static void 
serializeFooter( - ParquetMetadata footer, - PositionOutputStream out, - InternalFileEncryptor fileEncryptor, - ParquetMetadataConverter metadataConverter) - throws IOException { - - // Unencrypted file - if (null == fileEncryptor) { - long footerIndex = out.getPos(); - org.apache.parquet.format.FileMetaData parquetMetadata = - metadataConverter.toParquetMetadata(CURRENT_VERSION, footer); - writeFileMetaData(parquetMetadata, out); - LOG.debug("{}: footer length = {}", out.getPos(), (out.getPos() - footerIndex)); - BytesUtils.writeIntLittleEndian(out, (int) (out.getPos() - footerIndex)); - out.write(MAGIC); - return; - } - - org.apache.parquet.format.FileMetaData parquetMetadata = - metadataConverter.toParquetMetadata(CURRENT_VERSION, footer, fileEncryptor); - - // Encrypted file with plaintext footer - if (!fileEncryptor.isFooterEncrypted()) { - long footerIndex = out.getPos(); - parquetMetadata.setEncryption_algorithm(fileEncryptor.getEncryptionAlgorithm()); - // create footer signature (nonce + tag of encrypted footer) - byte[] footerSigningKeyMetaData = fileEncryptor.getFooterSigningKeyMetaData(); - if (null != footerSigningKeyMetaData) { - parquetMetadata.setFooter_signing_key_metadata(footerSigningKeyMetaData); - } - ByteArrayOutputStream tempOutStream = new ByteArrayOutputStream(); - writeFileMetaData(parquetMetadata, tempOutStream); - byte[] serializedFooter = tempOutStream.toByteArray(); - byte[] footerAAD = AesCipher.createFooterAAD(fileEncryptor.getFileAAD()); - byte[] encryptedFooter = - fileEncryptor.getSignedFooterEncryptor().encrypt(serializedFooter, footerAAD); - byte[] signature = new byte[AesCipher.NONCE_LENGTH + AesCipher.GCM_TAG_LENGTH]; - System.arraycopy( - encryptedFooter, - ModuleCipherFactory.SIZE_LENGTH, - signature, - 0, - AesCipher.NONCE_LENGTH); // copy Nonce - System.arraycopy( - encryptedFooter, - encryptedFooter.length - AesCipher.GCM_TAG_LENGTH, - signature, - AesCipher.NONCE_LENGTH, - AesCipher.GCM_TAG_LENGTH); // copy GCM Tag - out.write(serializedFooter); - out.write(signature); - LOG.debug("{}: footer and signature length = {}", out.getPos(), (out.getPos() - footerIndex)); - BytesUtils.writeIntLittleEndian(out, (int) (out.getPos() - footerIndex)); - out.write(MAGIC); - return; - } - - // Encrypted file with encrypted footer - long cryptoFooterIndex = out.getPos(); - writeFileCryptoMetaData(fileEncryptor.getFileCryptoMetaData(), out); - byte[] footerAAD = AesCipher.createFooterAAD(fileEncryptor.getFileAAD()); - writeFileMetaData(parquetMetadata, out, fileEncryptor.getFooterEncryptor(), footerAAD); - int combinedMetaDataLength = (int) (out.getPos() - cryptoFooterIndex); - LOG.debug("{}: crypto metadata and footer length = {}", out.getPos(), combinedMetaDataLength); - BytesUtils.writeIntLittleEndian(out, combinedMetaDataLength); - out.write(EFMAGIC); - } - - public ParquetMetadata getFooter() { - Preconditions.checkState(state == STATE.ENDED, "Cannot return unfinished footer."); - return footer; - } - - /** - * @return the current position in the underlying file - * @throws IOException if there is an error while getting the current stream's position - */ - public long getPos() throws IOException { - return out.getPos(); - } - - public long getNextRowGroupSize() throws IOException { - return alignment.nextRowGroupSize(out); - } - - private interface AlignmentStrategy { - void alignForRowGroup(PositionOutputStream out) throws IOException; - - long nextRowGroupSize(PositionOutputStream out) throws IOException; - } - - private static class NoAlignment 
implements AlignmentStrategy { - public static NoAlignment get(long rowGroupSize) { - return new NoAlignment(rowGroupSize); - } - - private final long rowGroupSize; - - private NoAlignment(long rowGroupSize) { - this.rowGroupSize = rowGroupSize; - } - - @Override - public void alignForRowGroup(PositionOutputStream out) {} - - @Override - public long nextRowGroupSize(PositionOutputStream out) { - return rowGroupSize; - } - } - - /** - * Alignment strategy that pads when less than half the row group size is left before the next DFS - * block. - */ - private static class PaddingAlignment implements AlignmentStrategy { - private static final byte[] zeros = new byte[4096]; - - public static PaddingAlignment get(long dfsBlockSize, long rowGroupSize, int maxPaddingSize) { - return new PaddingAlignment(dfsBlockSize, rowGroupSize, maxPaddingSize); - } - - protected final long dfsBlockSize; - protected final long rowGroupSize; - protected final int maxPaddingSize; - - private PaddingAlignment(long dfsBlockSize, long rowGroupSize, int maxPaddingSize) { - this.dfsBlockSize = dfsBlockSize; - this.rowGroupSize = rowGroupSize; - this.maxPaddingSize = maxPaddingSize; - } - - @Override - public void alignForRowGroup(PositionOutputStream out) throws IOException { - long remaining = dfsBlockSize - (out.getPos() % dfsBlockSize); - - if (isPaddingNeeded(remaining)) { - LOG.debug( - "Adding {} bytes of padding (row group size={}B, block size={}B)", - remaining, - rowGroupSize, - dfsBlockSize); - for (; remaining > 0; remaining -= zeros.length) { - out.write(zeros, 0, (int) Math.min((long) zeros.length, remaining)); - } - } - } - - @Override - public long nextRowGroupSize(PositionOutputStream out) throws IOException { - if (maxPaddingSize <= 0) { - return rowGroupSize; - } - - long remaining = dfsBlockSize - (out.getPos() % dfsBlockSize); - - if (isPaddingNeeded(remaining)) { - return rowGroupSize; - } - - return Math.min(remaining, rowGroupSize); - } - - protected boolean isPaddingNeeded(long remaining) { - return (remaining <= maxPaddingSize); - } - } -} diff --git a/src/main/java/org/apache/parquet/local/ParquetMetadataConverter.java b/src/main/java/org/apache/parquet/local/ParquetMetadataConverter.java deleted file mode 100644 index 79c1837..0000000 --- a/src/main/java/org/apache/parquet/local/ParquetMetadataConverter.java +++ /dev/null @@ -1,2282 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.parquet.local; - -import org.apache.parquet.CorruptStatistics; -import org.apache.parquet.Preconditions; -import org.apache.parquet.bytes.ByteBufferInputStream; -import org.apache.parquet.column.EncodingStats; -import org.apache.parquet.column.ParquetProperties; -import org.apache.parquet.column.statistics.BinaryStatistics; -import org.apache.parquet.column.values.bloomfilter.BloomFilter; -import org.apache.parquet.crypto.*; -import org.apache.parquet.crypto.ModuleCipherFactory.ModuleType; -import org.apache.parquet.format.*; -import org.apache.parquet.format.ColumnOrder; -import org.apache.parquet.format.FileMetaData; -import org.apache.parquet.format.Type; -import org.apache.parquet.hadoop.metadata.*; -import org.apache.parquet.hadoop.metadata.FileMetaData.EncryptionType; -import org.apache.parquet.internal.column.columnindex.BinaryTruncator; -import org.apache.parquet.internal.column.columnindex.ColumnIndexBuilder; -import org.apache.parquet.internal.column.columnindex.OffsetIndexBuilder; -import org.apache.parquet.internal.hadoop.metadata.IndexReference; -import org.apache.parquet.io.InvalidFileOffsetException; -import org.apache.parquet.io.ParquetDecodingException; -import org.apache.parquet.io.api.Binary; -import org.apache.parquet.local.metadata.EncryptedColumnChunkMetaData; -import org.apache.parquet.schema.*; -import org.apache.parquet.schema.ColumnOrder.ColumnOrderName; -import org.apache.parquet.schema.LogicalTypeAnnotation.UUIDLogicalTypeAnnotation; -import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; -import org.apache.parquet.schema.Type.Repetition; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.*; -import java.util.*; -import java.util.Map.Entry; -import java.util.concurrent.ConcurrentHashMap; - -import static java.util.Optional.empty; -import static java.util.Optional.of; -import static org.apache.parquet.format.Util.*; - -public class ParquetMetadataConverter { - - private static final TypeDefinedOrder TYPE_DEFINED_ORDER = new TypeDefinedOrder(); - public static final MetadataFilter NO_FILTER = new NoFilter(); - public static final MetadataFilter SKIP_ROW_GROUPS = new SkipMetadataFilter(); - public static final long MAX_STATS_SIZE = 4096; // limit stats to 4k - - private static final Logger LOG = LoggerFactory.getLogger(ParquetMetadataConverter.class); - private static final LogicalTypeConverterVisitor LOGICAL_TYPE_ANNOTATION_VISITOR = - new LogicalTypeConverterVisitor(); - private static final ConvertedTypeConverterVisitor CONVERTED_TYPE_CONVERTER_VISITOR = - new ConvertedTypeConverterVisitor(); - private final int statisticsTruncateLength; - private final boolean useSignedStringMinMax; - - public ParquetMetadataConverter() { - this(false); - } - - public ParquetMetadataConverter(int statisticsTruncateLength) { - this(false, statisticsTruncateLength); - } - - public ParquetMetadataConverter(ParquetReadOptions options) { - this(options.useSignedStringMinMax()); - } - - private ParquetMetadataConverter(boolean useSignedStringMinMax) { - this(useSignedStringMinMax, ParquetProperties.DEFAULT_STATISTICS_TRUNCATE_LENGTH); - } - - private ParquetMetadataConverter(boolean useSignedStringMinMax, int statisticsTruncateLength) { - if (statisticsTruncateLength <= 0) { - throw new IllegalArgumentException("Truncate length should be greater than 0"); - } - this.useSignedStringMinMax = useSignedStringMinMax; - this.statisticsTruncateLength = statisticsTruncateLength; - } - - // NOTE: this cache is for memory savings, 
not cpu savings, and is used to de-duplicate - // sets of encodings. It is important that all collections inserted to this cache be - // immutable and have thread-safe read-only access. This can be achieved by wrapping - // an unsynchronized collection in Collections.unmodifiable*(), and making sure to not - // keep any references to the original collection. - private static final ConcurrentHashMap< - Set, Set> - cachedEncodingSets = - new ConcurrentHashMap< - Set, Set>(); - - public FileMetaData toParquetMetadata(int currentVersion, ParquetMetadata parquetMetadata) { - return toParquetMetadata(currentVersion, parquetMetadata, null); - } - - public FileMetaData toParquetMetadata( - int currentVersion, ParquetMetadata parquetMetadata, InternalFileEncryptor fileEncryptor) { - List blocks = parquetMetadata.getBlocks(); - List rowGroups = new ArrayList(); - long numRows = 0; - long preBlockStartPos = 0; - long preBlockCompressedSize = 0; - for (BlockMetaData block : blocks) { - numRows += block.getRowCount(); - long blockStartPos = block.getStartingPos(); - // first block - if (blockStartPos == 4) { - preBlockStartPos = 0; - preBlockCompressedSize = 0; - } - if (preBlockStartPos != 0) { - Preconditions.checkState( - blockStartPos >= preBlockStartPos + preBlockCompressedSize, - "Invalid block starting position: %s", - blockStartPos); - } - preBlockStartPos = blockStartPos; - preBlockCompressedSize = block.getCompressedSize(); - addRowGroup(parquetMetadata, rowGroups, block, fileEncryptor); - } - FileMetaData fileMetaData = - new FileMetaData( - currentVersion, - toParquetSchema(parquetMetadata.getFileMetaData().getSchema()), - numRows, - rowGroups); - - Set> keyValues = - parquetMetadata.getFileMetaData().getKeyValueMetaData().entrySet(); - for (Entry keyValue : keyValues) { - addKeyValue(fileMetaData, keyValue.getKey(), keyValue.getValue()); - } - - fileMetaData.setCreated_by(parquetMetadata.getFileMetaData().getCreatedBy()); - - fileMetaData.setColumn_orders(getColumnOrders(parquetMetadata.getFileMetaData().getSchema())); - - return fileMetaData; - } - - private List getColumnOrders(MessageType schema) { - List columnOrders = new ArrayList<>(); - // Currently, only TypeDefinedOrder is supported, so we create a column order for each columns - // with - // TypeDefinedOrder even if some types (e.g. INT96) have undefined column orders. 
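- // One ColumnOrder entry is emitted per leaf column returned by schema.getPaths(), each set to TYPE_DEFINED_ORDER.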
- for (int i = 0, n = schema.getPaths().size(); i < n; ++i) { - ColumnOrder columnOrder = new ColumnOrder(); - columnOrder.setTYPE_ORDER(TYPE_DEFINED_ORDER); - columnOrders.add(columnOrder); - } - return columnOrders; - } - - // Visible for testing - List toParquetSchema(MessageType schema) { - List result = new ArrayList(); - addToList(result, schema); - return result; - } - - private void addToList(final List result, org.apache.parquet.schema.Type field) { - field.accept( - new TypeVisitor() { - @Override - public void visit(PrimitiveType primitiveType) { - SchemaElement element = new SchemaElement(primitiveType.getName()); - element.setRepetition_type(toParquetRepetition(primitiveType.getRepetition())); - element.setType(getType(primitiveType.getPrimitiveTypeName())); - if (primitiveType.getLogicalTypeAnnotation() != null) { - element.setConverted_type( - convertToConvertedType(primitiveType.getLogicalTypeAnnotation())); - element.setLogicalType( - convertToLogicalType(primitiveType.getLogicalTypeAnnotation())); - } - if (primitiveType.getDecimalMetadata() != null) { - element.setPrecision(primitiveType.getDecimalMetadata().getPrecision()); - element.setScale(primitiveType.getDecimalMetadata().getScale()); - } - if (primitiveType.getTypeLength() > 0) { - element.setType_length(primitiveType.getTypeLength()); - } - if (primitiveType.getId() != null) { - element.setField_id(primitiveType.getId().intValue()); - } - result.add(element); - } - - @Override - public void visit(MessageType messageType) { - SchemaElement element = new SchemaElement(messageType.getName()); - if (messageType.getId() != null) { - element.setField_id(messageType.getId().intValue()); - } - visitChildren(result, messageType.asGroupType(), element); - } - - @Override - public void visit(GroupType groupType) { - SchemaElement element = new SchemaElement(groupType.getName()); - element.setRepetition_type(toParquetRepetition(groupType.getRepetition())); - if (groupType.getLogicalTypeAnnotation() != null) { - element.setConverted_type( - convertToConvertedType(groupType.getLogicalTypeAnnotation())); - element.setLogicalType(convertToLogicalType(groupType.getLogicalTypeAnnotation())); - } - if (groupType.getId() != null) { - element.setField_id(groupType.getId().intValue()); - } - visitChildren(result, groupType, element); - } - - private void visitChildren( - final List result, GroupType groupType, SchemaElement element) { - element.setNum_children(groupType.getFieldCount()); - result.add(element); - for (org.apache.parquet.schema.Type field : groupType.getFields()) { - addToList(result, field); - } - } - }); - } - - LogicalType convertToLogicalType(LogicalTypeAnnotation logicalTypeAnnotation) { - return logicalTypeAnnotation.accept(LOGICAL_TYPE_ANNOTATION_VISITOR).orElse(null); - } - - ConvertedType convertToConvertedType(LogicalTypeAnnotation logicalTypeAnnotation) { - return logicalTypeAnnotation.accept(CONVERTED_TYPE_CONVERTER_VISITOR).orElse(null); - } - - static TimeUnit convertUnit(LogicalTypeAnnotation.TimeUnit unit) { - switch (unit) { - case MICROS: - return TimeUnit.MICROS(new MicroSeconds()); - case MILLIS: - return TimeUnit.MILLIS(new MilliSeconds()); - case NANOS: - return TimeUnit.NANOS(new NanoSeconds()); - default: - throw new RuntimeException("Unknown time unit " + unit); - } - } - - private static class ConvertedTypeConverterVisitor - implements LogicalTypeAnnotation.LogicalTypeAnnotationVisitor { - @Override - public Optional visit( - LogicalTypeAnnotation.StringLogicalTypeAnnotation stringLogicalType) 
{ - return of(ConvertedType.UTF8); - } - - @Override - public Optional visit( - LogicalTypeAnnotation.MapLogicalTypeAnnotation mapLogicalType) { - return of(ConvertedType.MAP); - } - - @Override - public Optional visit( - LogicalTypeAnnotation.ListLogicalTypeAnnotation listLogicalType) { - return of(ConvertedType.LIST); - } - - @Override - public Optional visit( - LogicalTypeAnnotation.EnumLogicalTypeAnnotation enumLogicalType) { - return of(ConvertedType.ENUM); - } - - @Override - public Optional visit( - LogicalTypeAnnotation.DecimalLogicalTypeAnnotation decimalLogicalType) { - return of(ConvertedType.DECIMAL); - } - - @Override - public Optional visit( - LogicalTypeAnnotation.DateLogicalTypeAnnotation dateLogicalType) { - return of(ConvertedType.DATE); - } - - @Override - public Optional visit( - LogicalTypeAnnotation.TimeLogicalTypeAnnotation timeLogicalType) { - switch (timeLogicalType.getUnit()) { - case MILLIS: - return of(ConvertedType.TIME_MILLIS); - case MICROS: - return of(ConvertedType.TIME_MICROS); - case NANOS: - return empty(); - default: - throw new RuntimeException( - "Unknown converted type for " + timeLogicalType.toOriginalType()); - } - } - - @Override - public Optional visit( - LogicalTypeAnnotation.TimestampLogicalTypeAnnotation timestampLogicalType) { - switch (timestampLogicalType.getUnit()) { - case MICROS: - return of(ConvertedType.TIMESTAMP_MICROS); - case MILLIS: - return of(ConvertedType.TIMESTAMP_MILLIS); - case NANOS: - return empty(); - default: - throw new RuntimeException( - "Unknown converted type for " + timestampLogicalType.toOriginalType()); - } - } - - @Override - public Optional visit( - LogicalTypeAnnotation.IntLogicalTypeAnnotation intLogicalType) { - boolean signed = intLogicalType.isSigned(); - switch (intLogicalType.getBitWidth()) { - case 8: - return of(signed ? ConvertedType.INT_8 : ConvertedType.UINT_8); - case 16: - return of(signed ? ConvertedType.INT_16 : ConvertedType.UINT_16); - case 32: - return of(signed ? ConvertedType.INT_32 : ConvertedType.UINT_32); - case 64: - return of(signed ? 
ConvertedType.INT_64 : ConvertedType.UINT_64); - default: - throw new RuntimeException("Unknown original type " + intLogicalType.toOriginalType()); - } - } - - @Override - public Optional visit( - LogicalTypeAnnotation.JsonLogicalTypeAnnotation jsonLogicalType) { - return of(ConvertedType.JSON); - } - - @Override - public Optional visit( - LogicalTypeAnnotation.BsonLogicalTypeAnnotation bsonLogicalType) { - return of(ConvertedType.BSON); - } - - @Override - public Optional visit( - LogicalTypeAnnotation.IntervalLogicalTypeAnnotation intervalLogicalType) { - return of(ConvertedType.INTERVAL); - } - - @Override - public Optional visit( - LogicalTypeAnnotation.MapKeyValueTypeAnnotation mapKeyValueLogicalType) { - return of(ConvertedType.MAP_KEY_VALUE); - } - } - - private static class LogicalTypeConverterVisitor - implements LogicalTypeAnnotation.LogicalTypeAnnotationVisitor { - @Override - public Optional visit( - LogicalTypeAnnotation.StringLogicalTypeAnnotation stringLogicalType) { - return of(LogicalType.STRING(new StringType())); - } - - @Override - public Optional visit( - LogicalTypeAnnotation.MapLogicalTypeAnnotation mapLogicalType) { - return of(LogicalType.MAP(new MapType())); - } - - @Override - public Optional visit( - LogicalTypeAnnotation.ListLogicalTypeAnnotation listLogicalType) { - return of(LogicalType.LIST(new ListType())); - } - - @Override - public Optional visit( - LogicalTypeAnnotation.EnumLogicalTypeAnnotation enumLogicalType) { - return of(LogicalType.ENUM(new EnumType())); - } - - @Override - public Optional visit( - LogicalTypeAnnotation.DecimalLogicalTypeAnnotation decimalLogicalType) { - return of( - LogicalType.DECIMAL( - new DecimalType(decimalLogicalType.getScale(), decimalLogicalType.getPrecision()))); - } - - @Override - public Optional visit( - LogicalTypeAnnotation.DateLogicalTypeAnnotation dateLogicalType) { - return of(LogicalType.DATE(new DateType())); - } - - @Override - public Optional visit( - LogicalTypeAnnotation.TimeLogicalTypeAnnotation timeLogicalType) { - return of( - LogicalType.TIME( - new TimeType( - timeLogicalType.isAdjustedToUTC(), convertUnit(timeLogicalType.getUnit())))); - } - - @Override - public Optional visit( - LogicalTypeAnnotation.TimestampLogicalTypeAnnotation timestampLogicalType) { - return of( - LogicalType.TIMESTAMP( - new TimestampType( - timestampLogicalType.isAdjustedToUTC(), - convertUnit(timestampLogicalType.getUnit())))); - } - - @Override - public Optional visit( - LogicalTypeAnnotation.IntLogicalTypeAnnotation intLogicalType) { - return of( - LogicalType.INTEGER( - new IntType((byte) intLogicalType.getBitWidth(), intLogicalType.isSigned()))); - } - - @Override - public Optional visit( - LogicalTypeAnnotation.JsonLogicalTypeAnnotation jsonLogicalType) { - return of(LogicalType.JSON(new JsonType())); - } - - @Override - public Optional visit( - LogicalTypeAnnotation.BsonLogicalTypeAnnotation bsonLogicalType) { - return of(LogicalType.BSON(new BsonType())); - } - - @Override - public Optional visit(UUIDLogicalTypeAnnotation uuidLogicalType) { - return of(LogicalType.UUID(new UUIDType())); - } - - @Override - public Optional visit( - LogicalTypeAnnotation.IntervalLogicalTypeAnnotation intervalLogicalType) { - return of(LogicalType.UNKNOWN(new NullType())); - } - } - - private void addRowGroup( - ParquetMetadata parquetMetadata, - List rowGroups, - BlockMetaData block, - InternalFileEncryptor fileEncryptor) { - - // rowGroup.total_byte_size = ; - List columns = block.getColumns(); - List parquetColumns = new ArrayList(); - 
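- // For each column chunk below: build the Thrift ColumnMetaData (type, encodings, path, codec, value count, sizes, page offsets, optional statistics and encoding stats); when a file encryptor is configured, the metadata is either embedded in plaintext or serialized and encrypted into encrypted_column_metadata, with a redacted plaintext copy kept if the footer itself is not encrypted.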
int rowGroupOrdinal = rowGroups.size(); - int columnOrdinal = -1; - ByteArrayOutputStream tempOutStream = null; - for (ColumnChunkMetaData columnMetaData : columns) { - ColumnChunk columnChunk = - new ColumnChunk( - columnMetaData.getFirstDataPageOffset()); // verify this is the right offset - columnChunk.file_path = block.getPath(); // they are in the same file for now - InternalColumnEncryptionSetup columnSetup = null; - boolean writeCryptoMetadata = false; - boolean encryptMetaData = false; - ColumnPath path = columnMetaData.getPath(); - if (null != fileEncryptor) { - columnOrdinal++; - columnSetup = fileEncryptor.getColumnSetup(path, false, columnOrdinal); - writeCryptoMetadata = columnSetup.isEncrypted(); - encryptMetaData = fileEncryptor.encryptColumnMetaData(columnSetup); - } - ColumnMetaData metaData = - new ColumnMetaData( - getType(columnMetaData.getType()), - toFormatEncodings(columnMetaData.getEncodings()), - Arrays.asList(columnMetaData.getPath().toArray()), - toFormatCodec(columnMetaData.getCodec()), - columnMetaData.getValueCount(), - columnMetaData.getTotalUncompressedSize(), - columnMetaData.getTotalSize(), - columnMetaData.getFirstDataPageOffset()); - if (columnMetaData.getEncodingStats() != null - && columnMetaData.getEncodingStats().hasDictionaryPages()) { - metaData.setDictionary_page_offset(columnMetaData.getDictionaryPageOffset()); - } - long bloomFilterOffset = columnMetaData.getBloomFilterOffset(); - if (bloomFilterOffset >= 0) { - metaData.setBloom_filter_offset(bloomFilterOffset); - } - if (columnMetaData.getStatistics() != null && !columnMetaData.getStatistics().isEmpty()) { - metaData.setStatistics( - toParquetStatistics(columnMetaData.getStatistics(), this.statisticsTruncateLength)); - } - if (columnMetaData.getEncodingStats() != null) { - metaData.setEncoding_stats(convertEncodingStats(columnMetaData.getEncodingStats())); - } - - if (!encryptMetaData) { - columnChunk.setMeta_data(metaData); - } else { - // Serialize and encrypt ColumnMetadata separately - byte[] columnMetaDataAAD = - AesCipher.createModuleAAD( - fileEncryptor.getFileAAD(), - ModuleType.ColumnMetaData, - rowGroupOrdinal, - columnSetup.getOrdinal(), - -1); - if (null == tempOutStream) { - tempOutStream = new ByteArrayOutputStream(); - } else { - tempOutStream.reset(); - } - try { - writeColumnMetaData( - metaData, tempOutStream, columnSetup.getMetaDataEncryptor(), columnMetaDataAAD); - } catch (IOException e) { - throw new ParquetCryptoRuntimeException( - "Failed to serialize and encrypt ColumnMetadata for " + columnMetaData.getPath(), e); - } - columnChunk.setEncrypted_column_metadata(tempOutStream.toByteArray()); - // Keep redacted metadata version - if (!fileEncryptor.isFooterEncrypted()) { - ColumnMetaData metaDataRedacted = metaData.deepCopy(); - if (metaDataRedacted.isSetStatistics()) metaDataRedacted.unsetStatistics(); - if (metaDataRedacted.isSetEncoding_stats()) metaDataRedacted.unsetEncoding_stats(); - columnChunk.setMeta_data(metaDataRedacted); - } - } - if (writeCryptoMetadata) { - columnChunk.setCrypto_metadata(columnSetup.getColumnCryptoMetaData()); - } - - // columnChunk.meta_data.index_page_offset = ; - // columnChunk.meta_data.key_value_metadata = ; // nothing yet - - IndexReference columnIndexRef = columnMetaData.getColumnIndexReference(); - if (columnIndexRef != null) { - columnChunk.setColumn_index_offset(columnIndexRef.getOffset()); - columnChunk.setColumn_index_length(columnIndexRef.getLength()); - } - IndexReference offsetIndexRef = 
columnMetaData.getOffsetIndexReference(); - if (offsetIndexRef != null) { - columnChunk.setOffset_index_offset(offsetIndexRef.getOffset()); - columnChunk.setOffset_index_length(offsetIndexRef.getLength()); - } - - parquetColumns.add(columnChunk); - } - RowGroup rowGroup = new RowGroup(parquetColumns, block.getTotalByteSize(), block.getRowCount()); - rowGroup.setFile_offset(block.getStartingPos()); - rowGroup.setTotal_compressed_size(block.getCompressedSize()); - rowGroup.setOrdinal((short) rowGroupOrdinal); - rowGroups.add(rowGroup); - } - - private List toFormatEncodings(Set encodings) { - List converted = new ArrayList(encodings.size()); - for (org.apache.parquet.column.Encoding encoding : encodings) { - converted.add(getEncoding(encoding)); - } - return converted; - } - - // Visible for testing - Set fromFormatEncodings(List encodings) { - Set converted = - new HashSet(); - - for (Encoding encoding : encodings) { - converted.add(getEncoding(encoding)); - } - - // make converted unmodifiable, drop reference to modifiable copy - converted = Collections.unmodifiableSet(converted); - - // atomically update the cache - Set cached = - cachedEncodingSets.putIfAbsent(converted, converted); - - if (cached == null) { - // cached == null signifies that converted was *not* in the cache previously - // so we can return converted instead of throwing it away, it has now - // been cached - cached = converted; - } - - return cached; - } - - private CompressionCodecName fromFormatCodec(CompressionCodec codec) { - return CompressionCodecName.valueOf(codec.toString()); - } - - private CompressionCodec toFormatCodec(CompressionCodecName codec) { - return CompressionCodec.valueOf(codec.toString()); - } - - public org.apache.parquet.column.Encoding getEncoding(Encoding encoding) { - return org.apache.parquet.column.Encoding.valueOf(encoding.name()); - } - - public Encoding getEncoding(org.apache.parquet.column.Encoding encoding) { - return Encoding.valueOf(encoding.name()); - } - - public EncodingStats convertEncodingStats(List stats) { - if (stats == null) { - return null; - } - - EncodingStats.Builder builder = new EncodingStats.Builder(); - for (PageEncodingStats stat : stats) { - switch (stat.getPage_type()) { - case DATA_PAGE_V2: - builder.withV2Pages(); - // falls through - case DATA_PAGE: - builder.addDataEncoding(getEncoding(stat.getEncoding()), stat.getCount()); - break; - case DICTIONARY_PAGE: - builder.addDictEncoding(getEncoding(stat.getEncoding()), stat.getCount()); - break; - } - } - return builder.build(); - } - - public List convertEncodingStats(EncodingStats stats) { - if (stats == null) { - return null; - } - - List formatStats = new ArrayList(); - for (org.apache.parquet.column.Encoding encoding : stats.getDictionaryEncodings()) { - formatStats.add( - new PageEncodingStats( - PageType.DICTIONARY_PAGE, - getEncoding(encoding), - stats.getNumDictionaryPagesEncodedAs(encoding))); - } - PageType dataPageType = (stats.usesV2Pages() ? 
PageType.DATA_PAGE_V2 : PageType.DATA_PAGE); - for (org.apache.parquet.column.Encoding encoding : stats.getDataEncodings()) { - formatStats.add( - new PageEncodingStats( - dataPageType, getEncoding(encoding), stats.getNumDataPagesEncodedAs(encoding))); - } - return formatStats; - } - - public static Statistics toParquetStatistics( - org.apache.parquet.column.statistics.Statistics stats) { - return toParquetStatistics(stats, ParquetProperties.DEFAULT_STATISTICS_TRUNCATE_LENGTH); - } - - public static Statistics toParquetStatistics( - org.apache.parquet.column.statistics.Statistics stats, int truncateLength) { - Statistics formatStats = new Statistics(); - // Don't write stats larger than the max size rather than truncating. The - // rationale is that some engines may use the minimum value in the page as - // the true minimum for aggregations and there is no way to mark that a - // value has been truncated and is a lower bound and not in the page. - if (!stats.isEmpty() && withinLimit(stats, truncateLength)) { - formatStats.setNull_count(stats.getNumNulls()); - if (stats.hasNonNullValue()) { - byte[] min; - byte[] max; - - if (stats instanceof BinaryStatistics && truncateLength != Integer.MAX_VALUE) { - BinaryTruncator truncator = BinaryTruncator.getTruncator(stats.type()); - min = tuncateMin(truncator, truncateLength, stats.getMinBytes()); - max = tuncateMax(truncator, truncateLength, stats.getMaxBytes()); - } else { - min = stats.getMinBytes(); - max = stats.getMaxBytes(); - } - // Fill the former min-max statistics only if the comparison logic is - // signed so the logic of V1 and V2 stats are the same (which is - // trivially true for equal min-max values) - if (sortOrder(stats.type()) == SortOrder.SIGNED || Arrays.equals(min, max)) { - formatStats.setMin(min); - formatStats.setMax(max); - } - - if (isMinMaxStatsSupported(stats.type()) || Arrays.equals(min, max)) { - formatStats.setMin_value(min); - formatStats.setMax_value(max); - } - } - } - return formatStats; - } - - private static boolean withinLimit( - org.apache.parquet.column.statistics.Statistics stats, int truncateLength) { - if (stats.isSmallerThan(MAX_STATS_SIZE)) { - return true; - } - - if (!(stats instanceof BinaryStatistics)) { - return false; - } - - BinaryStatistics binaryStatistics = (BinaryStatistics) stats; - return binaryStatistics.isSmallerThanWithTruncation(MAX_STATS_SIZE, truncateLength); - } - - private static byte[] tuncateMin(BinaryTruncator truncator, int truncateLength, byte[] input) { - return truncator.truncateMin(Binary.fromConstantByteArray(input), truncateLength).getBytes(); - } - - private static byte[] tuncateMax(BinaryTruncator truncator, int truncateLength, byte[] input) { - return truncator.truncateMax(Binary.fromConstantByteArray(input), truncateLength).getBytes(); - } - - private static boolean isMinMaxStatsSupported(PrimitiveType type) { - return type.columnOrder().getColumnOrderName() == ColumnOrderName.TYPE_DEFINED_ORDER; - } - - /** - * @param statistics parquet format statistics - * @param type a primitive type name - * @return the statistics - * @deprecated will be removed in 2.0.0. 
- */ - @Deprecated - public static org.apache.parquet.column.statistics.Statistics fromParquetStatistics( - Statistics statistics, PrimitiveTypeName type) { - return fromParquetStatistics(null, statistics, type); - } - - /** - * @param createdBy the created-by string from the file - * @param statistics parquet format statistics - * @param type a primitive type name - * @return the statistics - * @deprecated will be removed in 2.0.0. - */ - @Deprecated - public static org.apache.parquet.column.statistics.Statistics fromParquetStatistics( - String createdBy, Statistics statistics, PrimitiveTypeName type) { - return fromParquetStatisticsInternal( - createdBy, - statistics, - new PrimitiveType(Repetition.OPTIONAL, type, "fake_type"), - defaultSortOrder(type)); - } - - // Visible for testing - static org.apache.parquet.column.statistics.Statistics fromParquetStatisticsInternal( - String createdBy, Statistics formatStats, PrimitiveType type, SortOrder typeSortOrder) { - // create stats object based on the column type - org.apache.parquet.column.statistics.Statistics.Builder statsBuilder = - org.apache.parquet.column.statistics.Statistics.getBuilderForReading(type); - - if (formatStats != null) { - // Use the new V2 min-max statistics over the former one if it is filled - if (formatStats.isSetMin_value() && formatStats.isSetMax_value()) { - byte[] min = formatStats.min_value.array(); - byte[] max = formatStats.max_value.array(); - if (isMinMaxStatsSupported(type) || Arrays.equals(min, max)) { - statsBuilder.withMin(min); - statsBuilder.withMax(max); - } - } else { - boolean isSet = formatStats.isSetMax() && formatStats.isSetMin(); - boolean maxEqualsMin = - isSet ? Arrays.equals(formatStats.getMin(), formatStats.getMax()) : false; - boolean sortOrdersMatch = SortOrder.SIGNED == typeSortOrder; - // NOTE: See docs in CorruptStatistics for explanation of why this check is needed - // The sort order is checked to avoid returning min/max stats that are not - // valid with the type's sort order. In previous releases, all stats were - // aggregated using a signed byte-wise ordering, which isn't valid for all the - // types (e.g. strings, decimals etc.). - if (!CorruptStatistics.shouldIgnoreStatistics(createdBy, type.getPrimitiveTypeName()) - && (sortOrdersMatch || maxEqualsMin)) { - if (isSet) { - statsBuilder.withMin(formatStats.min.array()); - statsBuilder.withMax(formatStats.max.array()); - } - } - } - - if (formatStats.isSetNull_count()) { - statsBuilder.withNumNulls(formatStats.null_count); - } - } - return statsBuilder.build(); - } - - public org.apache.parquet.column.statistics.Statistics fromParquetStatistics( - String createdBy, Statistics statistics, PrimitiveType type) { - SortOrder expectedOrder = overrideSortOrderToSigned(type) ? SortOrder.SIGNED : sortOrder(type); - return fromParquetStatisticsInternal(createdBy, statistics, type, expectedOrder); - } - - /** - * Sort order for page and column statistics. Types are associated with sort orders (e.g., UTF8 - * columns should use UNSIGNED) and column stats are aggregated using a sort order. As of - * parquet-format version 2.3.1, the order used to aggregate stats is always SIGNED and is not - * stored in the Parquet file. These stats are discarded for types that need unsigned. - * - *
See PARQUET-686. - */ - enum SortOrder { - SIGNED, - UNSIGNED, - UNKNOWN - } - - private static final Set STRING_TYPES = - Collections.unmodifiableSet( - new HashSet<>( - Arrays.asList( - LogicalTypeAnnotation.StringLogicalTypeAnnotation.class, - LogicalTypeAnnotation.EnumLogicalTypeAnnotation.class, - LogicalTypeAnnotation.JsonLogicalTypeAnnotation.class))); - - /** - * Returns whether to use signed order min and max with a type. It is safe to use signed min and - * max when the type is a string type and contains only ASCII characters (where the sign bit was - * 0). This checks whether the type is a string type and uses {@code useSignedStringMinMax} to - * determine if only ASCII characters were written. - * - * @param type a primitive type with a logical type annotation - * @return true if signed order min/max can be used with this type - */ - private boolean overrideSortOrderToSigned(PrimitiveType type) { - // even if the override is set, only return stats for string-ish types - // a null type annotation is considered string-ish because some writers - // failed to use the UTF8 annotation. - LogicalTypeAnnotation annotation = type.getLogicalTypeAnnotation(); - return useSignedStringMinMax - && PrimitiveTypeName.BINARY == type.getPrimitiveTypeName() - && (annotation == null || STRING_TYPES.contains(annotation.getClass())); - } - - /** - * @param primitive a primitive physical type - * @return the default sort order used when the logical type is not known - */ - private static SortOrder defaultSortOrder(PrimitiveTypeName primitive) { - switch (primitive) { - case BOOLEAN: - case INT32: - case INT64: - case FLOAT: - case DOUBLE: - return SortOrder.SIGNED; - case BINARY: - case FIXED_LEN_BYTE_ARRAY: - return SortOrder.UNSIGNED; - } - return SortOrder.UNKNOWN; - } - - /** - * @param primitive a primitive type with a logical type annotation - * @return the "correct" sort order of the type that applications assume - */ - private static SortOrder sortOrder(PrimitiveType primitive) { - LogicalTypeAnnotation annotation = primitive.getLogicalTypeAnnotation(); - if (annotation != null) { - return annotation - .accept( - new LogicalTypeAnnotation.LogicalTypeAnnotationVisitor() { - @Override - public Optional visit( - LogicalTypeAnnotation.IntLogicalTypeAnnotation intLogicalType) { - return intLogicalType.isSigned() ? 
of(SortOrder.SIGNED) : of(SortOrder.UNSIGNED); - } - - @Override - public Optional visit( - LogicalTypeAnnotation.IntervalLogicalTypeAnnotation intervalLogicalType) { - return of(SortOrder.UNKNOWN); - } - - @Override - public Optional visit( - LogicalTypeAnnotation.DateLogicalTypeAnnotation dateLogicalType) { - return of(SortOrder.SIGNED); - } - - @Override - public Optional visit( - LogicalTypeAnnotation.EnumLogicalTypeAnnotation enumLogicalType) { - return of(SortOrder.UNSIGNED); - } - - @Override - public Optional visit( - LogicalTypeAnnotation.BsonLogicalTypeAnnotation bsonLogicalType) { - return of(SortOrder.UNSIGNED); - } - - @Override - public Optional visit(UUIDLogicalTypeAnnotation uuidLogicalType) { - return of(SortOrder.UNSIGNED); - } - - @Override - public Optional visit( - LogicalTypeAnnotation.JsonLogicalTypeAnnotation jsonLogicalType) { - return of(SortOrder.UNSIGNED); - } - - @Override - public Optional visit( - LogicalTypeAnnotation.StringLogicalTypeAnnotation stringLogicalType) { - return of(SortOrder.UNSIGNED); - } - - @Override - public Optional visit( - LogicalTypeAnnotation.DecimalLogicalTypeAnnotation decimalLogicalType) { - return of(SortOrder.UNKNOWN); - } - - @Override - public Optional visit( - LogicalTypeAnnotation.MapKeyValueTypeAnnotation mapKeyValueLogicalType) { - return of(SortOrder.UNKNOWN); - } - - @Override - public Optional visit( - LogicalTypeAnnotation.MapLogicalTypeAnnotation mapLogicalType) { - return of(SortOrder.UNKNOWN); - } - - @Override - public Optional visit( - LogicalTypeAnnotation.ListLogicalTypeAnnotation listLogicalType) { - return of(SortOrder.UNKNOWN); - } - - @Override - public Optional visit( - LogicalTypeAnnotation.TimeLogicalTypeAnnotation timeLogicalType) { - return of(SortOrder.SIGNED); - } - - @Override - public Optional visit( - LogicalTypeAnnotation.TimestampLogicalTypeAnnotation timestampLogicalType) { - return of(SortOrder.SIGNED); - } - }) - .orElse(defaultSortOrder(primitive.getPrimitiveTypeName())); - } - - return defaultSortOrder(primitive.getPrimitiveTypeName()); - } - - public PrimitiveTypeName getPrimitive(Type type) { - switch (type) { - case BYTE_ARRAY: // TODO: rename BINARY and remove this switch - return PrimitiveTypeName.BINARY; - case INT64: - return PrimitiveTypeName.INT64; - case INT32: - return PrimitiveTypeName.INT32; - case BOOLEAN: - return PrimitiveTypeName.BOOLEAN; - case FLOAT: - return PrimitiveTypeName.FLOAT; - case DOUBLE: - return PrimitiveTypeName.DOUBLE; - case INT96: - return PrimitiveTypeName.INT96; - case FIXED_LEN_BYTE_ARRAY: - return PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY; - default: - throw new RuntimeException("Unknown type " + type); - } - } - - // Visible for testing - Type getType(PrimitiveTypeName type) { - switch (type) { - case INT64: - return Type.INT64; - case INT32: - return Type.INT32; - case BOOLEAN: - return Type.BOOLEAN; - case BINARY: - return Type.BYTE_ARRAY; - case FLOAT: - return Type.FLOAT; - case DOUBLE: - return Type.DOUBLE; - case INT96: - return Type.INT96; - case FIXED_LEN_BYTE_ARRAY: - return Type.FIXED_LEN_BYTE_ARRAY; - default: - throw new RuntimeException("Unknown primitive type " + type); - } - } - - // Visible for testing - LogicalTypeAnnotation getLogicalTypeAnnotation(ConvertedType type, SchemaElement schemaElement) { - switch (type) { - case UTF8: - return LogicalTypeAnnotation.stringType(); - case MAP: - return LogicalTypeAnnotation.mapType(); - case MAP_KEY_VALUE: - return LogicalTypeAnnotation.MapKeyValueTypeAnnotation.getInstance(); - case LIST: - return 
LogicalTypeAnnotation.listType(); - case ENUM: - return LogicalTypeAnnotation.enumType(); - case DECIMAL: - int scale = (schemaElement == null ? 0 : schemaElement.scale); - int precision = (schemaElement == null ? 0 : schemaElement.precision); - return LogicalTypeAnnotation.decimalType(scale, precision); - case DATE: - return LogicalTypeAnnotation.dateType(); - case TIME_MILLIS: - return LogicalTypeAnnotation.timeType(true, LogicalTypeAnnotation.TimeUnit.MILLIS); - case TIME_MICROS: - return LogicalTypeAnnotation.timeType(true, LogicalTypeAnnotation.TimeUnit.MICROS); - case TIMESTAMP_MILLIS: - return LogicalTypeAnnotation.timestampType(true, LogicalTypeAnnotation.TimeUnit.MILLIS); - case TIMESTAMP_MICROS: - return LogicalTypeAnnotation.timestampType(true, LogicalTypeAnnotation.TimeUnit.MICROS); - case INTERVAL: - return LogicalTypeAnnotation.IntervalLogicalTypeAnnotation.getInstance(); - case INT_8: - return LogicalTypeAnnotation.intType(8, true); - case INT_16: - return LogicalTypeAnnotation.intType(16, true); - case INT_32: - return LogicalTypeAnnotation.intType(32, true); - case INT_64: - return LogicalTypeAnnotation.intType(64, true); - case UINT_8: - return LogicalTypeAnnotation.intType(8, false); - case UINT_16: - return LogicalTypeAnnotation.intType(16, false); - case UINT_32: - return LogicalTypeAnnotation.intType(32, false); - case UINT_64: - return LogicalTypeAnnotation.intType(64, false); - case JSON: - return LogicalTypeAnnotation.jsonType(); - case BSON: - return LogicalTypeAnnotation.bsonType(); - default: - throw new RuntimeException( - "Can't convert converted type to logical type, unknown converted type " + type); - } - } - - LogicalTypeAnnotation getLogicalTypeAnnotation(LogicalType type) { - switch (type.getSetField()) { - case MAP: - return LogicalTypeAnnotation.mapType(); - case BSON: - return LogicalTypeAnnotation.bsonType(); - case DATE: - return LogicalTypeAnnotation.dateType(); - case ENUM: - return LogicalTypeAnnotation.enumType(); - case JSON: - return LogicalTypeAnnotation.jsonType(); - case LIST: - return LogicalTypeAnnotation.listType(); - case TIME: - TimeType time = type.getTIME(); - return LogicalTypeAnnotation.timeType(time.isAdjustedToUTC, convertTimeUnit(time.unit)); - case STRING: - return LogicalTypeAnnotation.stringType(); - case DECIMAL: - DecimalType decimal = type.getDECIMAL(); - return LogicalTypeAnnotation.decimalType(decimal.scale, decimal.precision); - case INTEGER: - IntType integer = type.getINTEGER(); - return LogicalTypeAnnotation.intType(integer.bitWidth, integer.isSigned); - case UNKNOWN: - return null; - case TIMESTAMP: - TimestampType timestamp = type.getTIMESTAMP(); - return LogicalTypeAnnotation.timestampType( - timestamp.isAdjustedToUTC, convertTimeUnit(timestamp.unit)); - case UUID: - return LogicalTypeAnnotation.uuidType(); - default: - throw new RuntimeException("Unknown logical type " + type); - } - } - - private LogicalTypeAnnotation.TimeUnit convertTimeUnit(TimeUnit unit) { - switch (unit.getSetField()) { - case MICROS: - return LogicalTypeAnnotation.TimeUnit.MICROS; - case MILLIS: - return LogicalTypeAnnotation.TimeUnit.MILLIS; - case NANOS: - return LogicalTypeAnnotation.TimeUnit.NANOS; - default: - throw new RuntimeException("Unknown time unit " + unit); - } - } - - private static void addKeyValue(FileMetaData fileMetaData, String key, String value) { - KeyValue keyValue = new KeyValue(key); - keyValue.value = value; - fileMetaData.addToKey_value_metadata(keyValue); - } - - private static interface MetadataFilterVisitor { - 
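// Visitor over the four MetadataFilter variants defined below (NO_FILTER, SKIP_ROW_GROUPS,
// a byte-range filter, and an explicit row-group-offset filter); readParquetMetadata()
// dispatches on the concrete filter through accept()/visit() rather than instanceof checks.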
T visit(NoFilter filter) throws E; - - T visit(SkipMetadataFilter filter) throws E; - - T visit(RangeMetadataFilter filter) throws E; - - T visit(OffsetMetadataFilter filter) throws E; - } - - public abstract static class MetadataFilter { - private MetadataFilter() {} - - abstract T accept(MetadataFilterVisitor visitor) throws E; - } - - /** - * [ startOffset, endOffset ) - * - * @param startOffset a start offset (inclusive) - * @param endOffset an end offset (exclusive) - * @return a range filter from the offsets - */ - public static MetadataFilter range(long startOffset, long endOffset) { - return new RangeMetadataFilter(startOffset, endOffset); - } - - public static MetadataFilter offsets(long... offsets) { - Set set = new HashSet(); - for (long offset : offsets) { - set.add(offset); - } - return new OffsetMetadataFilter(set); - } - - private static final class NoFilter extends MetadataFilter { - private NoFilter() {} - - @Override - T accept(MetadataFilterVisitor visitor) throws E { - return visitor.visit(this); - } - - @Override - public String toString() { - return "NO_FILTER"; - } - } - - private static final class SkipMetadataFilter extends MetadataFilter { - private SkipMetadataFilter() {} - - @Override - T accept(MetadataFilterVisitor visitor) throws E { - return visitor.visit(this); - } - - @Override - public String toString() { - return "SKIP_ROW_GROUPS"; - } - } - - /** [ startOffset, endOffset ) */ - // Visible for testing - static final class RangeMetadataFilter extends MetadataFilter { - final long startOffset; - final long endOffset; - - RangeMetadataFilter(long startOffset, long endOffset) { - super(); - this.startOffset = startOffset; - this.endOffset = endOffset; - } - - @Override - T accept(MetadataFilterVisitor visitor) throws E { - return visitor.visit(this); - } - - public boolean contains(long offset) { - return offset >= this.startOffset && offset < this.endOffset; - } - - @Override - public String toString() { - return "range(s:" + startOffset + ", e:" + endOffset + ")"; - } - } - - static final class OffsetMetadataFilter extends MetadataFilter { - private final Set offsets; - - public OffsetMetadataFilter(Set offsets) { - this.offsets = offsets; - } - - public boolean contains(long offset) { - return offsets.contains(offset); - } - - @Override - T accept(MetadataFilterVisitor visitor) throws E { - return visitor.visit(this); - } - } - - @Deprecated - public ParquetMetadata readParquetMetadata(InputStream from) throws IOException { - return readParquetMetadata(from, NO_FILTER); - } - - // Visible for testing - static FileMetaData filterFileMetaDataByMidpoint( - FileMetaData metaData, RangeMetadataFilter filter) { - List rowGroups = metaData.getRow_groups(); - List newRowGroups = new ArrayList(); - long preStartIndex = 0; - long preCompressedSize = 0; - boolean firstColumnWithMetadata = true; - if (rowGroups != null && rowGroups.size() > 0) { - firstColumnWithMetadata = rowGroups.get(0).getColumns().get(0).isSetMeta_data(); - } - for (RowGroup rowGroup : rowGroups) { - long totalSize = 0; - long startIndex; - ColumnChunk columnChunk = rowGroup.getColumns().get(0); - if (firstColumnWithMetadata) { - startIndex = getOffset(columnChunk); - } else { - assert rowGroup.isSetFile_offset(); - assert rowGroup.isSetTotal_compressed_size(); - - // the file_offset of first block always holds the truth, while other blocks don't : - // see PARQUET-2078 for details - startIndex = rowGroup.getFile_offset(); - if (invalidFileOffset(startIndex, preStartIndex, preCompressedSize)) { 
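// Recovery for a corrupted RowGroup.file_offset (the PARQUET-2078 case noted above):
// rather than trusting the stored offset, the first row group is assumed to start right
// after the 4-byte file magic, and any later one at the previous start plus the previous
// total_compressed_size. That estimate ignores padding a writer may have inserted between
// row groups, but it only needs to be good enough for range-based row-group filtering.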
- // first row group's offset is always 4 - if (preStartIndex == 0) { - startIndex = 4; - } else { - // use minStartIndex(imprecise in case of padding, but good enough for filtering) - startIndex = preStartIndex + preCompressedSize; - } - } - preStartIndex = startIndex; - preCompressedSize = rowGroup.getTotal_compressed_size(); - } - - if (rowGroup.isSetTotal_compressed_size()) { - totalSize = rowGroup.getTotal_compressed_size(); - } else { - for (ColumnChunk col : rowGroup.getColumns()) { - totalSize += col.getMeta_data().getTotal_compressed_size(); - } - } - - long midPoint = startIndex + totalSize / 2; - if (filter.contains(midPoint)) { - newRowGroups.add(rowGroup); - } - } - - metaData.setRow_groups(newRowGroups); - return metaData; - } - - private static boolean invalidFileOffset( - long startIndex, long preStartIndex, long preCompressedSize) { - boolean invalid = false; - assert preStartIndex <= startIndex; - // checking the first rowGroup - if (preStartIndex == 0 && startIndex != 4) { - invalid = true; - return invalid; - } - - // calculate start index for other blocks - long minStartIndex = preStartIndex + preCompressedSize; - if (startIndex < minStartIndex) { - // a bad offset detected, try first column's offset - // can not use minStartIndex in case of padding - invalid = true; - } - - return invalid; - } - - // Visible for testing - static FileMetaData filterFileMetaDataByStart( - FileMetaData metaData, OffsetMetadataFilter filter) { - List rowGroups = metaData.getRow_groups(); - List newRowGroups = new ArrayList(); - long preStartIndex = 0; - long preCompressedSize = 0; - boolean firstColumnWithMetadata = true; - if (rowGroups != null && rowGroups.size() > 0) { - firstColumnWithMetadata = rowGroups.get(0).getColumns().get(0).isSetMeta_data(); - } - for (RowGroup rowGroup : rowGroups) { - long startIndex; - ColumnChunk columnChunk = rowGroup.getColumns().get(0); - if (firstColumnWithMetadata) { - startIndex = getOffset(columnChunk); - } else { - assert rowGroup.isSetFile_offset(); - assert rowGroup.isSetTotal_compressed_size(); - - // the file_offset of first block always holds the truth, while other blocks don't : - // see PARQUET-2078 for details - startIndex = rowGroup.getFile_offset(); - if (invalidFileOffset(startIndex, preStartIndex, preCompressedSize)) { - // first row group's offset is always 4 - if (preStartIndex == 0) { - startIndex = 4; - } else { - throw new InvalidFileOffsetException( - "corrupted RowGroup.file_offset found, " - + "please use file range instead of block offset for split."); - } - } - preStartIndex = startIndex; - preCompressedSize = rowGroup.getTotal_compressed_size(); - } - - if (filter.contains(startIndex)) { - newRowGroups.add(rowGroup); - } - } - metaData.setRow_groups(newRowGroups); - return metaData; - } - - static long getOffset(RowGroup rowGroup) { - if (rowGroup.isSetFile_offset()) { - return rowGroup.getFile_offset(); - } - return getOffset(rowGroup.getColumns().get(0)); - } - - // Visible for testing - static long getOffset(ColumnChunk columnChunk) { - ColumnMetaData md = columnChunk.getMeta_data(); - long offset = md.getData_page_offset(); - if (md.isSetDictionary_page_offset() && offset > md.getDictionary_page_offset()) { - offset = md.getDictionary_page_offset(); - } - return offset; - } - - private static void verifyFooterIntegrity( - InputStream from, InternalFileDecryptor fileDecryptor, int combinedFooterLength) - throws IOException { - - byte[] nonce = new byte[AesCipher.NONCE_LENGTH]; - from.read(nonce); - byte[] gcmTag = new 
byte[AesCipher.GCM_TAG_LENGTH]; - from.read(gcmTag); - - AesGcmEncryptor footerSigner = fileDecryptor.createSignedFooterEncryptor(); - - byte[] footerAndSignature = ((ByteBufferInputStream) from).slice(0).array(); - int footerSignatureLength = AesCipher.NONCE_LENGTH + AesCipher.GCM_TAG_LENGTH; - byte[] serializedFooter = new byte[combinedFooterLength - footerSignatureLength]; - System.arraycopy(footerAndSignature, 0, serializedFooter, 0, serializedFooter.length); - - byte[] signedFooterAAD = AesCipher.createFooterAAD(fileDecryptor.getFileAAD()); - byte[] encryptedFooterBytes = - footerSigner.encrypt(false, serializedFooter, nonce, signedFooterAAD); - byte[] calculatedTag = new byte[AesCipher.GCM_TAG_LENGTH]; - System.arraycopy( - encryptedFooterBytes, - encryptedFooterBytes.length - AesCipher.GCM_TAG_LENGTH, - calculatedTag, - 0, - AesCipher.GCM_TAG_LENGTH); - if (!Arrays.equals(gcmTag, calculatedTag)) { - throw new TagVerificationException("Signature mismatch in plaintext footer"); - } - } - - public ParquetMetadata readParquetMetadata(final InputStream from, MetadataFilter filter) - throws IOException { - return readParquetMetadata(from, filter, null, false, 0); - } - - private Map generateRowGroupOffsets(FileMetaData metaData) { - Map rowGroupOrdinalToRowIdx = new HashMap<>(); - List rowGroups = metaData.getRow_groups(); - if (rowGroups != null) { - long rowIdxSum = 0; - for (int i = 0; i < rowGroups.size(); i++) { - rowGroupOrdinalToRowIdx.put(rowGroups.get(i), rowIdxSum); - rowIdxSum += rowGroups.get(i).getNum_rows(); - } - } - return rowGroupOrdinalToRowIdx; - } - - /** A container for [[FileMetaData]] and [[RowGroup]] to ROW_INDEX offset map. */ - private class FileMetaDataAndRowGroupOffsetInfo { - final FileMetaData fileMetadata; - final Map rowGroupToRowIndexOffsetMap; - - public FileMetaDataAndRowGroupOffsetInfo( - FileMetaData fileMetadata, Map rowGroupToRowIndexOffsetMap) { - this.fileMetadata = fileMetadata; - this.rowGroupToRowIndexOffsetMap = rowGroupToRowIndexOffsetMap; - } - } - - public ParquetMetadata readParquetMetadata( - final InputStream from, - MetadataFilter filter, - final InternalFileDecryptor fileDecryptor, - final boolean encryptedFooter, - final int combinedFooterLength) - throws IOException { - - final BlockCipher.Decryptor footerDecryptor = - (encryptedFooter ? fileDecryptor.fetchFooterDecryptor() : null); - final byte[] encryptedFooterAAD = - (encryptedFooter ? AesCipher.createFooterAAD(fileDecryptor.getFileAAD()) : null); - - FileMetaDataAndRowGroupOffsetInfo fileMetaDataAndRowGroupInfo = - filter.accept( - new MetadataFilterVisitor() { - @Override - public FileMetaDataAndRowGroupOffsetInfo visit(NoFilter filter) throws IOException { - FileMetaData fileMetadata = - readFileMetaData(from, footerDecryptor, encryptedFooterAAD); - return new FileMetaDataAndRowGroupOffsetInfo( - fileMetadata, generateRowGroupOffsets(fileMetadata)); - } - - @Override - public FileMetaDataAndRowGroupOffsetInfo visit(SkipMetadataFilter filter) - throws IOException { - FileMetaData fileMetadata = - readFileMetaData(from, true, footerDecryptor, encryptedFooterAAD); - return new FileMetaDataAndRowGroupOffsetInfo( - fileMetadata, generateRowGroupOffsets(fileMetadata)); - } - - @Override - public FileMetaDataAndRowGroupOffsetInfo visit(OffsetMetadataFilter filter) - throws IOException { - FileMetaData fileMetadata = - readFileMetaData(from, footerDecryptor, encryptedFooterAAD); - // We must generate the map *before* filtering because it modifies `fileMetadata`. 
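// Why the ordering matters: generateRowGroupOffsets() assigns each row group the
// file-absolute index of its first row, i.e. the running sum of num_rows over all
// preceding row groups in the *unfiltered* footer; filtering first would drop groups and
// shift those sums. A minimal standalone sketch of the accumulation (hypothetical names,
// not the library API):

import java.util.ArrayList;
import java.util.List;

final class RowIndexOffsets {
  /** rowCounts = num_rows per row group in file order; returns the first-row index of each group. */
  static List<Long> firstRowIndexes(List<Long> rowCounts) {
    List<Long> offsets = new ArrayList<>(rowCounts.size());
    long rowsSoFar = 0;
    for (long numRows : rowCounts) {
      offsets.add(rowsSoFar);  // index of this group's first row
      rowsSoFar += numRows;    // rows contributed by the groups seen so far
    }
    return offsets;
  }
}

// e.g. firstRowIndexes([100, 250, 50]) -> [0, 100, 350]; dropping the first group before
// accumulating would instead give [0, 250], which is why the map is always built from the
// unfiltered metadata and only afterwards is the row-group list filtered.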
- Map rowGroupToRowIndexOffsetMap = - generateRowGroupOffsets(fileMetadata); - FileMetaData filteredFileMetadata = filterFileMetaDataByStart(fileMetadata, filter); - return new FileMetaDataAndRowGroupOffsetInfo( - filteredFileMetadata, rowGroupToRowIndexOffsetMap); - } - - @Override - public FileMetaDataAndRowGroupOffsetInfo visit(RangeMetadataFilter filter) - throws IOException { - FileMetaData fileMetadata = - readFileMetaData(from, footerDecryptor, encryptedFooterAAD); - // We must generate the map *before* filtering because it modifies `fileMetadata`. - Map rowGroupToRowIndexOffsetMap = - generateRowGroupOffsets(fileMetadata); - FileMetaData filteredFileMetadata = - filterFileMetaDataByMidpoint(fileMetadata, filter); - return new FileMetaDataAndRowGroupOffsetInfo( - filteredFileMetadata, rowGroupToRowIndexOffsetMap); - } - }); - FileMetaData fileMetaData = fileMetaDataAndRowGroupInfo.fileMetadata; - Map rowGroupToRowIndexOffsetMap = - fileMetaDataAndRowGroupInfo.rowGroupToRowIndexOffsetMap; - LOG.debug("{}", fileMetaData); - - if (!encryptedFooter && null != fileDecryptor) { - if (!fileMetaData.isSetEncryption_algorithm()) { // Plaintext file - fileDecryptor.setPlaintextFile(); - // Done to detect files that were not encrypted by mistake - if (!fileDecryptor.plaintextFilesAllowed()) { - throw new ParquetCryptoRuntimeException("Applying decryptor on plaintext file"); - } - } else { // Encrypted file with plaintext footer - // if no fileDecryptor, can still read plaintext columns - fileDecryptor.setFileCryptoMetaData( - fileMetaData.getEncryption_algorithm(), - false, - fileMetaData.getFooter_signing_key_metadata()); - if (fileDecryptor.checkFooterIntegrity()) { - verifyFooterIntegrity(from, fileDecryptor, combinedFooterLength); - } - } - } - - ParquetMetadata parquetMetadata = - fromParquetMetadata( - fileMetaData, fileDecryptor, encryptedFooter, rowGroupToRowIndexOffsetMap); - if (LOG.isDebugEnabled()) LOG.debug(ParquetMetadata.toPrettyJSON(parquetMetadata)); - return parquetMetadata; - } - - public ColumnChunkMetaData buildColumnChunkMetaData( - ColumnMetaData metaData, ColumnPath columnPath, PrimitiveType type, String createdBy) { - return ColumnChunkMetaData.get( - columnPath, - type, - fromFormatCodec(metaData.codec), - convertEncodingStats(metaData.getEncoding_stats()), - fromFormatEncodings(metaData.encodings), - fromParquetStatistics(createdBy, metaData.statistics, type), - metaData.data_page_offset, - metaData.dictionary_page_offset, - metaData.num_values, - metaData.total_compressed_size, - metaData.total_uncompressed_size); - } - - public ParquetMetadata fromParquetMetadata(FileMetaData parquetMetadata) throws IOException { - return fromParquetMetadata(parquetMetadata, null, false); - } - - public ParquetMetadata fromParquetMetadata( - FileMetaData parquetMetadata, InternalFileDecryptor fileDecryptor, boolean encryptedFooter) - throws IOException { - return fromParquetMetadata( - parquetMetadata, fileDecryptor, encryptedFooter, new HashMap()); - } - - public ParquetMetadata fromParquetMetadata( - FileMetaData parquetMetadata, - InternalFileDecryptor fileDecryptor, - boolean encryptedFooter, - Map rowGroupToRowIndexOffsetMap) - throws IOException { - MessageType messageType = - fromParquetSchema(parquetMetadata.getSchema(), parquetMetadata.getColumn_orders()); - List blocks = new ArrayList(); - List row_groups = parquetMetadata.getRow_groups(); - - if (row_groups != null) { - for (RowGroup rowGroup : row_groups) { - BlockMetaData blockMetaData = new BlockMetaData(); - 
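// The column loop below distinguishes three cases: (1) plaintext columns, whose
// ColumnMetaData is used as-is; (2) columns encrypted with the footer key, whose
// separately stored encrypted metadata is decrypted eagerly here (using the footer key)
// when the footer itself is plaintext; and (3) columns encrypted with a column-specific
// key, for which decryption is deferred via EncryptedColumnChunkMetaData until the column
// is actually projected, avoiding unnecessary key-management calls.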
blockMetaData.setRowCount(rowGroup.getNum_rows()); - blockMetaData.setTotalByteSize(rowGroup.getTotal_byte_size()); - if (rowGroupToRowIndexOffsetMap.containsKey(rowGroup)) { - blockMetaData.setRowIndexOffset(rowGroupToRowIndexOffsetMap.get(rowGroup)); - } - // not set in legacy files - if (rowGroup.isSetOrdinal()) { - blockMetaData.setOrdinal(rowGroup.getOrdinal()); - } - List columns = rowGroup.getColumns(); - String filePath = columns.get(0).getFile_path(); - int columnOrdinal = -1; - for (ColumnChunk columnChunk : columns) { - columnOrdinal++; - if ((filePath == null && columnChunk.getFile_path() != null) - || (filePath != null && !filePath.equals(columnChunk.getFile_path()))) { - throw new ParquetDecodingException( - "all column chunks of the same row group must be in the same file for now"); - } - ColumnMetaData metaData = columnChunk.meta_data; - ColumnCryptoMetaData cryptoMetaData = columnChunk.getCrypto_metadata(); - ColumnChunkMetaData column = null; - ColumnPath columnPath = null; - boolean lazyMetadataDecryption = false; - - if (null == cryptoMetaData) { // Plaintext column - columnPath = getPath(metaData); - if (null != fileDecryptor && !fileDecryptor.plaintextFile()) { - // mark this column as plaintext in encrypted file decryptor - fileDecryptor.setColumnCryptoMetadata( - columnPath, false, false, (byte[]) null, columnOrdinal); - } - } else { // Encrypted column - boolean encryptedWithFooterKey = cryptoMetaData.isSetENCRYPTION_WITH_FOOTER_KEY(); - if (encryptedWithFooterKey) { // Column encrypted with footer key - if (null == fileDecryptor) { - throw new ParquetCryptoRuntimeException( - "Column encrypted with footer key: No keys available"); - } - if (null == metaData) { - throw new ParquetCryptoRuntimeException( - "ColumnMetaData not set in Encryption with Footer key"); - } - columnPath = getPath(metaData); - if (!encryptedFooter) { // Unencrypted footer. Decrypt full column metadata, using - // footer key - ByteArrayInputStream tempInputStream = - new ByteArrayInputStream(columnChunk.getEncrypted_column_metadata()); - byte[] columnMetaDataAAD = - AesCipher.createModuleAAD( - fileDecryptor.getFileAAD(), - ModuleType.ColumnMetaData, - rowGroup.getOrdinal(), - columnOrdinal, - -1); - try { - metaData = - readColumnMetaData( - tempInputStream, fileDecryptor.fetchFooterDecryptor(), columnMetaDataAAD); - } catch (IOException e) { - throw new ParquetCryptoRuntimeException( - columnPath + ". 
Failed to decrypt column metadata", e); - } - } - fileDecryptor.setColumnCryptoMetadata( - columnPath, true, true, (byte[]) null, columnOrdinal); - } else { // Column encrypted with column key - // setColumnCryptoMetadata triggers KMS interaction, hence delayed until this column - // is projected - lazyMetadataDecryption = true; - } - } - - String createdBy = parquetMetadata.getCreated_by(); - if (!lazyMetadataDecryption) { // full column metadata (with stats) is available - column = - buildColumnChunkMetaData( - metaData, - columnPath, - messageType.getType(columnPath.toArray()).asPrimitiveType(), - createdBy); - column.setRowGroupOrdinal(rowGroup.getOrdinal()); - if (metaData.isSetBloom_filter_offset()) { - column.setBloomFilterOffset(metaData.getBloom_filter_offset()); - } - } else { // column encrypted with column key - // Metadata will be decrypted later, if this column is accessed - EncryptionWithColumnKey columnKeyStruct = - cryptoMetaData.getENCRYPTION_WITH_COLUMN_KEY(); - List pathList = columnKeyStruct.getPath_in_schema(); - byte[] columnKeyMetadata = columnKeyStruct.getKey_metadata(); - columnPath = ColumnPath.get(pathList.toArray(new String[pathList.size()])); - byte[] encryptedMetadataBuffer = columnChunk.getEncrypted_column_metadata(); - column = - new EncryptedColumnChunkMetaData( - this, - columnPath, - messageType.getType(columnPath.toArray()).asPrimitiveType(), - encryptedMetadataBuffer, - columnKeyMetadata, - fileDecryptor, - rowGroup.getOrdinal(), - columnOrdinal, - createdBy); - } - - column.setColumnIndexReference(toColumnIndexReference(columnChunk)); - column.setOffsetIndexReference(toOffsetIndexReference(columnChunk)); - - // TODO - // index_page_offset - // key_value_metadata - blockMetaData.addColumn(column); - } - blockMetaData.setPath(filePath); - blocks.add(blockMetaData); - } - } - Map keyValueMetaData = new HashMap(); - List key_value_metadata = parquetMetadata.getKey_value_metadata(); - if (key_value_metadata != null) { - for (KeyValue keyValue : key_value_metadata) { - keyValueMetaData.put(keyValue.key, keyValue.value); - } - } - EncryptionType encryptionType; - if (encryptedFooter) { - encryptionType = EncryptionType.ENCRYPTED_FOOTER; - } else if (parquetMetadata.isSetEncryption_algorithm()) { - encryptionType = EncryptionType.PLAINTEXT_FOOTER; - } else { - encryptionType = EncryptionType.UNENCRYPTED; - } - return new ParquetMetadata( - new org.apache.parquet.hadoop.metadata.FileMetaData( - messageType, - keyValueMetaData, - parquetMetadata.getCreated_by(), - encryptionType, - fileDecryptor), - blocks); - } - - private static IndexReference toColumnIndexReference(ColumnChunk columnChunk) { - if (columnChunk.isSetColumn_index_offset() && columnChunk.isSetColumn_index_length()) { - return new IndexReference( - columnChunk.getColumn_index_offset(), columnChunk.getColumn_index_length()); - } - return null; - } - - private static IndexReference toOffsetIndexReference(ColumnChunk columnChunk) { - if (columnChunk.isSetOffset_index_offset() && columnChunk.isSetOffset_index_length()) { - return new IndexReference( - columnChunk.getOffset_index_offset(), columnChunk.getOffset_index_length()); - } - return null; - } - - private static ColumnPath getPath(ColumnMetaData metaData) { - String[] path = metaData.path_in_schema.toArray(new String[0]); - return ColumnPath.get(path); - } - - // Visible for testing - MessageType fromParquetSchema(List schema, List columnOrders) { - Iterator iterator = schema.iterator(); - SchemaElement root = iterator.next(); - 
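// For context: the Thrift footer stores the schema as a single depth-first list of
// SchemaElement entries; a group element carries num_children and its children follow it
// directly in the list, which is exactly what buildChildren() below consumes recursively.
// A minimal standalone sketch of rebuilding a tree from such a flattened list
// (hypothetical Node type, not the library API):

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

final class FlatNode {
  final String name;
  final int numChildren;                       // 0 for leaf (primitive) entries
  final List<FlatNode> children = new ArrayList<>();

  FlatNode(String name, int numChildren) {
    this.name = name;
    this.numChildren = numChildren;
  }

  /** Consumes one entry plus, recursively, exactly numChildren child subtrees from the iterator. */
  static FlatNode readTree(Iterator<FlatNode> flattened) {
    FlatNode node = flattened.next();
    for (int i = 0; i < node.numChildren; i++) {
      node.children.add(readTree(flattened));  // children are stored immediately after their parent
    }
    return node;
  }
}

// e.g. the flattened list [root(2), a(0), b(1), c(0)] rebuilds as root -> {a, b -> {c}}.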
Types.MessageTypeBuilder builder = Types.buildMessage(); - if (root.isSetField_id()) { - builder.id(root.field_id); - } - buildChildren(builder, iterator, root.getNum_children(), columnOrders, 0); - return builder.named(root.name); - } - - private void buildChildren( - Types.GroupBuilder builder, - Iterator schema, - int childrenCount, - List columnOrders, - int columnCount) { - for (int i = 0; i < childrenCount; i++) { - SchemaElement schemaElement = schema.next(); - - // Create Parquet Type. - Types.Builder childBuilder; - if (schemaElement.type != null) { - Types.PrimitiveBuilder primitiveBuilder = - builder.primitive( - getPrimitive(schemaElement.type), - fromParquetRepetition(schemaElement.repetition_type)); - if (schemaElement.isSetType_length()) { - primitiveBuilder.length(schemaElement.type_length); - } - if (schemaElement.isSetPrecision()) { - primitiveBuilder.precision(schemaElement.precision); - } - if (schemaElement.isSetScale()) { - primitiveBuilder.scale(schemaElement.scale); - } - if (columnOrders != null) { - org.apache.parquet.schema.ColumnOrder columnOrder = - fromParquetColumnOrder(columnOrders.get(columnCount)); - // As per parquet format 2.4.0 no UNDEFINED order is supported. So, set undefined column - // order for the types - // where ordering is not supported. - if (columnOrder.getColumnOrderName() == ColumnOrderName.TYPE_DEFINED_ORDER - && (schemaElement.type == Type.INT96 - || schemaElement.converted_type == ConvertedType.INTERVAL)) { - columnOrder = org.apache.parquet.schema.ColumnOrder.undefined(); - } - primitiveBuilder.columnOrder(columnOrder); - } - childBuilder = primitiveBuilder; - - } else { - childBuilder = builder.group(fromParquetRepetition(schemaElement.repetition_type)); - buildChildren( - (Types.GroupBuilder) childBuilder, - schema, - schemaElement.num_children, - columnOrders, - columnCount); - } - - if (schemaElement.isSetLogicalType()) { - childBuilder.as(getLogicalTypeAnnotation(schemaElement.logicalType)); - } - if (schemaElement.isSetConverted_type()) { - OriginalType originalType = - getLogicalTypeAnnotation(schemaElement.converted_type, schemaElement).toOriginalType(); - OriginalType newOriginalType = - (schemaElement.isSetLogicalType() - && getLogicalTypeAnnotation(schemaElement.logicalType) != null) - ? getLogicalTypeAnnotation(schemaElement.logicalType).toOriginalType() - : null; - if (!originalType.equals(newOriginalType)) { - if (newOriginalType != null) { - LOG.warn( - "Converted type and logical type metadata mismatch (convertedType: {}, logical type: {}). 
Using value in converted type.", - schemaElement.converted_type, - schemaElement.logicalType); - } - childBuilder.as(originalType); - } - } - if (schemaElement.isSetField_id()) { - childBuilder.id(schemaElement.field_id); - } - - childBuilder.named(schemaElement.name); - ++columnCount; - } - } - - // Visible for testing - FieldRepetitionType toParquetRepetition(Repetition repetition) { - return FieldRepetitionType.valueOf(repetition.name()); - } - - // Visible for testing - Repetition fromParquetRepetition(FieldRepetitionType repetition) { - return Repetition.valueOf(repetition.name()); - } - - private static org.apache.parquet.schema.ColumnOrder fromParquetColumnOrder( - ColumnOrder columnOrder) { - if (columnOrder.isSetTYPE_ORDER()) { - return org.apache.parquet.schema.ColumnOrder.typeDefined(); - } - // The column order is not yet supported by this API - return org.apache.parquet.schema.ColumnOrder.undefined(); - } - - @Deprecated - public void writeDataPageHeader( - int uncompressedSize, - int compressedSize, - int valueCount, - org.apache.parquet.column.Encoding rlEncoding, - org.apache.parquet.column.Encoding dlEncoding, - org.apache.parquet.column.Encoding valuesEncoding, - OutputStream to) - throws IOException { - writePageHeader( - newDataPageHeader( - uncompressedSize, compressedSize, valueCount, rlEncoding, dlEncoding, valuesEncoding), - to); - } - - // Statistics are no longer saved in page headers - @Deprecated - public void writeDataPageHeader( - int uncompressedSize, - int compressedSize, - int valueCount, - org.apache.parquet.column.statistics.Statistics statistics, - org.apache.parquet.column.Encoding rlEncoding, - org.apache.parquet.column.Encoding dlEncoding, - org.apache.parquet.column.Encoding valuesEncoding, - OutputStream to) - throws IOException { - writePageHeader( - newDataPageHeader( - uncompressedSize, compressedSize, valueCount, rlEncoding, dlEncoding, valuesEncoding), - to); - } - - private PageHeader newDataPageHeader( - int uncompressedSize, - int compressedSize, - int valueCount, - org.apache.parquet.column.Encoding rlEncoding, - org.apache.parquet.column.Encoding dlEncoding, - org.apache.parquet.column.Encoding valuesEncoding) { - PageHeader pageHeader = new PageHeader(PageType.DATA_PAGE, uncompressedSize, compressedSize); - pageHeader.setData_page_header( - new DataPageHeader( - valueCount, - getEncoding(valuesEncoding), - getEncoding(dlEncoding), - getEncoding(rlEncoding))); - return pageHeader; - } - - private PageHeader newDataPageHeader( - int uncompressedSize, - int compressedSize, - int valueCount, - org.apache.parquet.column.Encoding rlEncoding, - org.apache.parquet.column.Encoding dlEncoding, - org.apache.parquet.column.Encoding valuesEncoding, - int crc) { - PageHeader pageHeader = new PageHeader(PageType.DATA_PAGE, uncompressedSize, compressedSize); - pageHeader.setCrc(crc); - pageHeader.setData_page_header( - new DataPageHeader( - valueCount, - getEncoding(valuesEncoding), - getEncoding(dlEncoding), - getEncoding(rlEncoding))); - return pageHeader; - } - - // Statistics are no longer saved in page headers - @Deprecated - public void writeDataPageV2Header( - int uncompressedSize, - int compressedSize, - int valueCount, - int nullCount, - int rowCount, - org.apache.parquet.column.statistics.Statistics statistics, - org.apache.parquet.column.Encoding dataEncoding, - int rlByteLength, - int dlByteLength, - OutputStream to) - throws IOException { - writePageHeader( - newDataPageV2Header( - uncompressedSize, - compressedSize, - valueCount, - 
nullCount, - rowCount, - dataEncoding, - rlByteLength, - dlByteLength), - to); - } - - public void writeDataPageV1Header( - int uncompressedSize, - int compressedSize, - int valueCount, - org.apache.parquet.column.Encoding rlEncoding, - org.apache.parquet.column.Encoding dlEncoding, - org.apache.parquet.column.Encoding valuesEncoding, - OutputStream to) - throws IOException { - writeDataPageV1Header( - uncompressedSize, - compressedSize, - valueCount, - rlEncoding, - dlEncoding, - valuesEncoding, - to, - null, - null); - } - - public void writeDataPageV1Header( - int uncompressedSize, - int compressedSize, - int valueCount, - org.apache.parquet.column.Encoding rlEncoding, - org.apache.parquet.column.Encoding dlEncoding, - org.apache.parquet.column.Encoding valuesEncoding, - OutputStream to, - BlockCipher.Encryptor blockEncryptor, - byte[] pageHeaderAAD) - throws IOException { - writePageHeader( - newDataPageHeader( - uncompressedSize, compressedSize, valueCount, rlEncoding, dlEncoding, valuesEncoding), - to, - blockEncryptor, - pageHeaderAAD); - } - - public void writeDataPageV1Header( - int uncompressedSize, - int compressedSize, - int valueCount, - org.apache.parquet.column.Encoding rlEncoding, - org.apache.parquet.column.Encoding dlEncoding, - org.apache.parquet.column.Encoding valuesEncoding, - int crc, - OutputStream to) - throws IOException { - writeDataPageV1Header( - uncompressedSize, - compressedSize, - valueCount, - rlEncoding, - dlEncoding, - valuesEncoding, - crc, - to, - null, - null); - } - - public void writeDataPageV1Header( - int uncompressedSize, - int compressedSize, - int valueCount, - org.apache.parquet.column.Encoding rlEncoding, - org.apache.parquet.column.Encoding dlEncoding, - org.apache.parquet.column.Encoding valuesEncoding, - int crc, - OutputStream to, - BlockCipher.Encryptor blockEncryptor, - byte[] pageHeaderAAD) - throws IOException { - writePageHeader( - newDataPageHeader( - uncompressedSize, - compressedSize, - valueCount, - rlEncoding, - dlEncoding, - valuesEncoding, - crc), - to, - blockEncryptor, - pageHeaderAAD); - } - - public void writeDataPageV2Header( - int uncompressedSize, - int compressedSize, - int valueCount, - int nullCount, - int rowCount, - org.apache.parquet.column.Encoding dataEncoding, - int rlByteLength, - int dlByteLength, - OutputStream to) - throws IOException { - writeDataPageV2Header( - uncompressedSize, - compressedSize, - valueCount, - nullCount, - rowCount, - dataEncoding, - rlByteLength, - dlByteLength, - to, - null, - null); - } - - public void writeDataPageV2Header( - int uncompressedSize, - int compressedSize, - int valueCount, - int nullCount, - int rowCount, - org.apache.parquet.column.Encoding dataEncoding, - int rlByteLength, - int dlByteLength, - OutputStream to, - BlockCipher.Encryptor blockEncryptor, - byte[] pageHeaderAAD) - throws IOException { - writePageHeader( - newDataPageV2Header( - uncompressedSize, - compressedSize, - valueCount, - nullCount, - rowCount, - dataEncoding, - rlByteLength, - dlByteLength), - to, - blockEncryptor, - pageHeaderAAD); - } - - private PageHeader newDataPageV2Header( - int uncompressedSize, - int compressedSize, - int valueCount, - int nullCount, - int rowCount, - org.apache.parquet.column.Encoding dataEncoding, - int rlByteLength, - int dlByteLength) { - // TODO: pageHeader.crc = ...; - DataPageHeaderV2 dataPageHeaderV2 = - new DataPageHeaderV2( - valueCount, nullCount, rowCount, getEncoding(dataEncoding), dlByteLength, rlByteLength); - PageHeader pageHeader = new 
PageHeader(PageType.DATA_PAGE_V2, uncompressedSize, compressedSize); - pageHeader.setData_page_header_v2(dataPageHeaderV2); - return pageHeader; - } - - public void writeDictionaryPageHeader( - int uncompressedSize, - int compressedSize, - int valueCount, - org.apache.parquet.column.Encoding valuesEncoding, - OutputStream to) - throws IOException { - writeDictionaryPageHeader( - uncompressedSize, compressedSize, valueCount, valuesEncoding, to, null, null); - } - - public void writeDictionaryPageHeader( - int uncompressedSize, - int compressedSize, - int valueCount, - org.apache.parquet.column.Encoding valuesEncoding, - OutputStream to, - BlockCipher.Encryptor blockEncryptor, - byte[] pageHeaderAAD) - throws IOException { - PageHeader pageHeader = - new PageHeader(PageType.DICTIONARY_PAGE, uncompressedSize, compressedSize); - pageHeader.setDictionary_page_header( - new DictionaryPageHeader(valueCount, getEncoding(valuesEncoding))); - writePageHeader(pageHeader, to, blockEncryptor, pageHeaderAAD); - } - - public void writeDictionaryPageHeader( - int uncompressedSize, - int compressedSize, - int valueCount, - org.apache.parquet.column.Encoding valuesEncoding, - int crc, - OutputStream to) - throws IOException { - writeDictionaryPageHeader( - uncompressedSize, compressedSize, valueCount, valuesEncoding, crc, to, null, null); - } - - public void writeDictionaryPageHeader( - int uncompressedSize, - int compressedSize, - int valueCount, - org.apache.parquet.column.Encoding valuesEncoding, - int crc, - OutputStream to, - BlockCipher.Encryptor blockEncryptor, - byte[] pageHeaderAAD) - throws IOException { - PageHeader pageHeader = - new PageHeader(PageType.DICTIONARY_PAGE, uncompressedSize, compressedSize); - pageHeader.setCrc(crc); - pageHeader.setDictionary_page_header( - new DictionaryPageHeader(valueCount, getEncoding(valuesEncoding))); - writePageHeader(pageHeader, to, blockEncryptor, pageHeaderAAD); - } - - private static BoundaryOrder toParquetBoundaryOrder( - org.apache.parquet.internal.column.columnindex.BoundaryOrder boundaryOrder) { - switch (boundaryOrder) { - case ASCENDING: - return BoundaryOrder.ASCENDING; - case DESCENDING: - return BoundaryOrder.DESCENDING; - case UNORDERED: - return BoundaryOrder.UNORDERED; - default: - throw new IllegalArgumentException("Unsupported boundary order: " + boundaryOrder); - } - } - - private static org.apache.parquet.internal.column.columnindex.BoundaryOrder - fromParquetBoundaryOrder(BoundaryOrder boundaryOrder) { - switch (boundaryOrder) { - case ASCENDING: - return org.apache.parquet.internal.column.columnindex.BoundaryOrder.ASCENDING; - case DESCENDING: - return org.apache.parquet.internal.column.columnindex.BoundaryOrder.DESCENDING; - case UNORDERED: - return org.apache.parquet.internal.column.columnindex.BoundaryOrder.UNORDERED; - default: - throw new IllegalArgumentException("Unsupported boundary order: " + boundaryOrder); - } - } - - public static ColumnIndex toParquetColumnIndex( - PrimitiveType type, org.apache.parquet.internal.column.columnindex.ColumnIndex columnIndex) { - if (!isMinMaxStatsSupported(type) || columnIndex == null) { - return null; - } - ColumnIndex parquetColumnIndex = - new ColumnIndex( - columnIndex.getNullPages(), - columnIndex.getMinValues(), - columnIndex.getMaxValues(), - toParquetBoundaryOrder(columnIndex.getBoundaryOrder())); - parquetColumnIndex.setNull_counts(columnIndex.getNullCounts()); - return parquetColumnIndex; - } - - public static org.apache.parquet.internal.column.columnindex.ColumnIndex 
fromParquetColumnIndex( - PrimitiveType type, ColumnIndex parquetColumnIndex) { - if (!isMinMaxStatsSupported(type)) { - return null; - } - return ColumnIndexBuilder.build( - type, - fromParquetBoundaryOrder(parquetColumnIndex.getBoundary_order()), - parquetColumnIndex.getNull_pages(), - parquetColumnIndex.getNull_counts(), - parquetColumnIndex.getMin_values(), - parquetColumnIndex.getMax_values()); - } - - public static OffsetIndex toParquetOffsetIndex( - org.apache.parquet.internal.column.columnindex.OffsetIndex offsetIndex) { - List pageLocations = new ArrayList<>(offsetIndex.getPageCount()); - for (int i = 0, n = offsetIndex.getPageCount(); i < n; ++i) { - pageLocations.add( - new PageLocation( - offsetIndex.getOffset(i), - offsetIndex.getCompressedPageSize(i), - offsetIndex.getFirstRowIndex(i))); - } - return new OffsetIndex(pageLocations); - } - - public static org.apache.parquet.internal.column.columnindex.OffsetIndex fromParquetOffsetIndex( - OffsetIndex parquetOffsetIndex) { - OffsetIndexBuilder builder = OffsetIndexBuilder.getBuilder(); - for (PageLocation pageLocation : parquetOffsetIndex.getPage_locations()) { - builder.add( - pageLocation.getOffset(), - pageLocation.getCompressed_page_size(), - pageLocation.getFirst_row_index()); - } - return builder.build(); - } - - public static BloomFilterHeader toBloomFilterHeader(BloomFilter bloomFilter) { - - BloomFilterAlgorithm algorithm = null; - BloomFilterHash hashStrategy = null; - BloomFilterCompression compression = null; - - if (bloomFilter.getAlgorithm() == BloomFilter.Algorithm.BLOCK) { - algorithm = BloomFilterAlgorithm.BLOCK(new SplitBlockAlgorithm()); - } - - if (bloomFilter.getHashStrategy() == BloomFilter.HashStrategy.XXH64) { - hashStrategy = BloomFilterHash.XXHASH(new XxHash()); - } - - if (bloomFilter.getCompression() == BloomFilter.Compression.UNCOMPRESSED) { - compression = BloomFilterCompression.UNCOMPRESSED(new Uncompressed()); - } - - if (algorithm != null && hashStrategy != null && compression != null) { - return new BloomFilterHeader( - bloomFilter.getBitsetSize(), algorithm, hashStrategy, compression); - } else { - throw new IllegalArgumentException( - String.format( - "Failed to build thrift structure for BloomFilterHeader," - + "algorithm=%s, hash=%s, compression=%s", - bloomFilter.getAlgorithm(), - bloomFilter.getHashStrategy(), - bloomFilter.getCompression())); - } - } -} diff --git a/src/main/java/org/apache/parquet/local/ParquetReadOptions.java b/src/main/java/org/apache/parquet/local/ParquetReadOptions.java deleted file mode 100644 index 5610423..0000000 --- a/src/main/java/org/apache/parquet/local/ParquetReadOptions.java +++ /dev/null @@ -1,295 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -package org.apache.parquet.local; - -import org.apache.parquet.bytes.ByteBufferAllocator; -import org.apache.parquet.bytes.HeapByteBufferAllocator; -import org.apache.parquet.compression.CompressionCodecFactory; -import org.apache.parquet.crypto.FileDecryptionProperties; -import org.apache.parquet.filter2.compat.FilterCompat; - -public class ParquetReadOptions { - private static final boolean RECORD_FILTERING_ENABLED_DEFAULT = true; - private static final boolean STATS_FILTERING_ENABLED_DEFAULT = true; - private static final boolean DICTIONARY_FILTERING_ENABLED_DEFAULT = true; - private static final boolean COLUMN_INDEX_FILTERING_ENABLED_DEFAULT = true; - private static final int ALLOCATION_SIZE_DEFAULT = 8388608; // 8MB - private static final boolean PAGE_VERIFY_CHECKSUM_ENABLED_DEFAULT = false; - private static final boolean BLOOM_FILTER_ENABLED_DEFAULT = true; - private static final boolean USE_PREVIOUS_FILTER_DEFAULT = false; - private static final double LAZY_FETCH_RATIO_DEFAULT = 0.9; - - private final boolean useSignedStringMinMax; - private final boolean useStatsFilter; - private final boolean useDictionaryFilter; - private final boolean useRecordFilter; - private final boolean useColumnIndexFilter; - private final boolean usePageChecksumVerification; - private final boolean useBloomFilter; - private final boolean usePreivousFilter; - private final double lazyFetchRatio; - private final FilterCompat.Filter recordFilter; - private final ParquetMetadataConverter.MetadataFilter metadataFilter; - private final CompressionCodecFactory codecFactory; - private final ByteBufferAllocator allocator; - private final int maxAllocationSize; - private final FileDecryptionProperties fileDecryptionProperties; - - ParquetReadOptions( - boolean useSignedStringMinMax, - boolean useStatsFilter, - boolean useDictionaryFilter, - boolean useRecordFilter, - boolean useColumnIndexFilter, - boolean usePageChecksumVerification, - boolean useBloomFilter, - boolean usePreivousFilter, - double lazyFetchRatio, - FilterCompat.Filter recordFilter, - ParquetMetadataConverter.MetadataFilter metadataFilter, - CompressionCodecFactory codecFactory, - ByteBufferAllocator allocator, - int maxAllocationSize, - FileDecryptionProperties fileDecryptionProperties) { - this.useSignedStringMinMax = useSignedStringMinMax; - this.useStatsFilter = useStatsFilter; - this.useDictionaryFilter = useDictionaryFilter; - this.useRecordFilter = useRecordFilter; - this.useColumnIndexFilter = useColumnIndexFilter; - this.usePageChecksumVerification = usePageChecksumVerification; - this.useBloomFilter = useBloomFilter; - this.usePreivousFilter = usePreivousFilter; - this.lazyFetchRatio = lazyFetchRatio; - this.recordFilter = recordFilter; - this.metadataFilter = metadataFilter; - this.codecFactory = codecFactory; - this.allocator = allocator; - this.maxAllocationSize = maxAllocationSize; - this.fileDecryptionProperties = fileDecryptionProperties; - } - - public boolean useSignedStringMinMax() { - return useSignedStringMinMax; - } - - public boolean useStatsFilter() { - return useStatsFilter; - } - - public boolean useDictionaryFilter() { - return useDictionaryFilter; - } - - public boolean useRecordFilter() { - return useRecordFilter; - } - - public boolean useColumnIndexFilter() { - return useColumnIndexFilter; - } - - public boolean useBloomFilter() { - return useBloomFilter; - } - - public boolean usePageChecksumVerification() { - return usePageChecksumVerification; - } - - public FilterCompat.Filter getRecordFilter() { - return 
recordFilter; - } - - public ParquetMetadataConverter.MetadataFilter getMetadataFilter() { - return metadataFilter; - } - - public CompressionCodecFactory getCodecFactory() { - return codecFactory; - } - - public ByteBufferAllocator getAllocator() { - return allocator; - } - - public int getMaxAllocationSize() { - return maxAllocationSize; - } - - public FileDecryptionProperties getDecryptionProperties() { - return fileDecryptionProperties; - } - - public static Builder builder() { - return new Builder(); - } - - public static class Builder { - protected boolean useSignedStringMinMax = false; - protected boolean useStatsFilter = STATS_FILTERING_ENABLED_DEFAULT; - protected boolean useDictionaryFilter = DICTIONARY_FILTERING_ENABLED_DEFAULT; - protected boolean useRecordFilter = RECORD_FILTERING_ENABLED_DEFAULT; - protected boolean useColumnIndexFilter = COLUMN_INDEX_FILTERING_ENABLED_DEFAULT; - protected boolean usePageChecksumVerification = PAGE_VERIFY_CHECKSUM_ENABLED_DEFAULT; - protected boolean useBloomFilter = BLOOM_FILTER_ENABLED_DEFAULT; - private boolean usePreivousFilter; - private double lazyFetchRatio; - protected FilterCompat.Filter recordFilter = null; - protected ParquetMetadataConverter.MetadataFilter metadataFilter = - ParquetMetadataConverter.NO_FILTER; - // the page size parameter isn't used when only using the codec factory to get decompressors - protected CompressionCodecFactory codecFactory = null; - protected ByteBufferAllocator allocator = new HeapByteBufferAllocator(); - protected int maxAllocationSize = ALLOCATION_SIZE_DEFAULT; - protected FileDecryptionProperties fileDecryptionProperties = null; - - public Builder useSignedStringMinMax(boolean useSignedStringMinMax) { - this.useSignedStringMinMax = useSignedStringMinMax; - return this; - } - - public Builder useStatsFilter(boolean useStatsFilter) { - this.useStatsFilter = useStatsFilter; - return this; - } - - public Builder useDictionaryFilter(boolean useDictionaryFilter) { - this.useDictionaryFilter = useDictionaryFilter; - return this; - } - - public Builder useRecordFilter(boolean useRecordFilter) { - this.useRecordFilter = useRecordFilter; - return this; - } - - public Builder useColumnIndexFilter(boolean useColumnIndexFilter) { - this.useColumnIndexFilter = useColumnIndexFilter; - return this; - } - - public Builder usePageChecksumVerification(boolean usePageChecksumVerification) { - this.usePageChecksumVerification = usePageChecksumVerification; - return this; - } - - public Builder useBloomFilter(boolean useBloomFilter) { - this.useBloomFilter = useBloomFilter; - return this; - } - - public Builder usePreviousFilter(boolean usePreviousFilter) { - this.usePreivousFilter = usePreviousFilter; - return this; - } - - public Builder withLazyFetchRatio(double lazyFetchRatio) { - this.lazyFetchRatio = lazyFetchRatio; - return this; - } - - public Builder withRecordFilter(FilterCompat.Filter rowGroupFilter) { - this.recordFilter = rowGroupFilter; - return this; - } - - public Builder withRange(long start, long end) { - this.metadataFilter = ParquetMetadataConverter.range(start, end); - return this; - } - - public Builder withOffsets(long... 
rowGroupOffsets) { - this.metadataFilter = ParquetMetadataConverter.offsets(rowGroupOffsets); - return this; - } - - public Builder withMetadataFilter(ParquetMetadataConverter.MetadataFilter metadataFilter) { - this.metadataFilter = metadataFilter; - return this; - } - - public Builder withCodecFactory(CompressionCodecFactory codecFactory) { - this.codecFactory = codecFactory; - return this; - } - - public Builder withAllocator(ByteBufferAllocator allocator) { - this.allocator = allocator; - return this; - } - - public Builder withMaxAllocationInBytes(int allocationSizeInBytes) { - this.maxAllocationSize = allocationSizeInBytes; - return this; - } - - public Builder withPageChecksumVerification(boolean val) { - this.usePageChecksumVerification = val; - return this; - } - - public Builder withDecryption(FileDecryptionProperties fileDecryptionProperties) { - this.fileDecryptionProperties = fileDecryptionProperties; - return this; - } - - public Builder copy(ParquetReadOptions options) { - useSignedStringMinMax(options.useSignedStringMinMax); - useStatsFilter(options.useStatsFilter); - useDictionaryFilter(options.useDictionaryFilter); - useRecordFilter(options.useRecordFilter); - useColumnIndexFilter(options.useColumnIndexFilter); - usePageChecksumVerification(options.usePageChecksumVerification); - useBloomFilter(options.useBloomFilter); - usePreviousFilter(options.usePreivousFilter); - withLazyFetchRatio(options.lazyFetchRatio); - withRecordFilter(options.recordFilter); - withMetadataFilter(options.metadataFilter); - withCodecFactory(options.codecFactory); - withAllocator(options.allocator); - withPageChecksumVerification(options.usePageChecksumVerification); - withDecryption(options.fileDecryptionProperties); - return this; - } - - public ParquetReadOptions build() { - if (codecFactory == null) { - codecFactory = new CodecFactory(); - } - - return new ParquetReadOptions( - useSignedStringMinMax, - useStatsFilter, - useDictionaryFilter, - useRecordFilter, - useColumnIndexFilter, - usePageChecksumVerification, - useBloomFilter, - usePreivousFilter, - lazyFetchRatio, - recordFilter, - metadataFilter, - codecFactory, - allocator, - maxAllocationSize, - fileDecryptionProperties); - } - } -} diff --git a/src/main/java/org/apache/parquet/local/ParquetRecordReader.java b/src/main/java/org/apache/parquet/local/ParquetRecordReader.java deleted file mode 100644 index b0cd0d9..0000000 --- a/src/main/java/org/apache/parquet/local/ParquetRecordReader.java +++ /dev/null @@ -1,199 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.parquet.local; - -import org.apache.parquet.column.page.PageReadStore; -import org.apache.parquet.filter2.compat.FilterCompat; -import org.apache.parquet.filter2.compat.FilterCompat.Filter; -import org.apache.parquet.hadoop.metadata.FileMetaData; -import org.apache.parquet.io.ColumnIOFactory; -import org.apache.parquet.io.MessageColumnIO; -import org.apache.parquet.io.ParquetDecodingException; -import org.apache.parquet.io.api.RecordMaterializer; -import org.apache.parquet.io.api.RecordMaterializer.RecordMaterializationException; -import org.apache.parquet.schema.MessageType; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.util.Optional; -import java.util.PrimitiveIterator; -import java.util.stream.LongStream; - -import static java.lang.String.format; - -public class ParquetRecordReader { - private static final Logger LOG = LoggerFactory.getLogger(ParquetRecordReader.class); - - private ColumnIOFactory columnIOFactory = null; - private final Filter filter; - private MessageType requestedSchema; - private MessageType fileSchema; - private RecordMaterializer recordMaterializer; - private T currentValue; - private long total; - private long current = 0; - private int currentBlock = -1; - private ParquetFileReader reader; - private long currentRowIdx = -1; - private PrimitiveIterator.OfLong rowIdxInFileItr; - private org.apache.parquet.io.RecordReader recordReader; - - private long totalCountLoadedSoFar = 0; - - public ParquetRecordReader( - RecordMaterializer recordMaterializer, - ParquetFileReader reader, - ParquetReadOptions options) { - this.recordMaterializer = recordMaterializer; - this.filter = - options.getRecordFilter() == null || !options.useRecordFilter() - ? FilterCompat.NOOP - : options.getRecordFilter(); - this.reader = reader; - this.requestedSchema = reader.getRequestedSchema(); - this.total = reader.getFilteredRecordCount(); - - FileMetaData parquetFileMetadata = reader.getFooter().getFileMetaData(); - this.fileSchema = parquetFileMetadata.getSchema(); - this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy()); - } - - private void checkRead() throws IOException { - if (current == totalCountLoadedSoFar) { - PageReadStore pages = reader.readNextFilteredRowGroup(); - if (pages == null) { - throw new IOException( - "expecting more rows but reached last block. Read " + current + " out of " + total); - } - resetRowIndexIterator(pages); - - MessageColumnIO columnIO = columnIOFactory.getColumnIO(requestedSchema, fileSchema); - recordReader = columnIO.getRecordReader(pages, recordMaterializer, filter); - totalCountLoadedSoFar += pages.getRowCount(); - ++currentBlock; - } - } - - public void close() throws IOException { - if (reader != null) { - reader.close(); - } - } - - public T getCurrentValue() { - return currentValue; - } - - public boolean nextKeyValue() throws IOException { - boolean recordFound = false; - - while (!recordFound) { - // no more records left - if (current >= total) { - return false; - } - - try { - checkRead(); - current++; - - try { - currentValue = recordReader.read(); - if (rowIdxInFileItr != null && rowIdxInFileItr.hasNext()) { - currentRowIdx = rowIdxInFileItr.next(); - } else { - currentRowIdx = -1; - } - } catch (RecordMaterializationException e) { - // this might throw, but it's fatal if it does. 
- LOG.debug("skipping a corrupt record"); - continue; - } - - if (recordReader.shouldSkipCurrentRecord()) { - // this record is being filtered via the filter2 package - LOG.debug("skipping record"); - continue; - } - - if (currentValue == null) { - // only happens with FilteredRecordReader at end of block - current = totalCountLoadedSoFar; - LOG.debug("filtered record reader reached end of block"); - continue; - } - - recordFound = true; - - LOG.debug("read value: {}", currentValue); - } catch (RuntimeException e) { - throw new ParquetDecodingException( - format( - "Can not read value at %d in block %d in file %s", - current, currentBlock, reader.getFile()), - e); - } - } - return true; - } - - /** - * Returns the row index of the current row. If no row has been processed or if the row index - * information is unavailable from the underlying @{@link PageReadStore}, returns -1. - */ - public long getCurrentRowIndex() { - if (current == 0L || rowIdxInFileItr == null) { - return -1; - } - return currentRowIdx; - } - - /** Resets the row index iterator based on the current processed row group. */ - private void resetRowIndexIterator(PageReadStore pages) { - Optional rowGroupRowIdxOffset = pages.getRowIndexOffset(); - if (!rowGroupRowIdxOffset.isPresent()) { - this.rowIdxInFileItr = null; - return; - } - - currentRowIdx = -1; - final PrimitiveIterator.OfLong rowIdxInRowGroupItr; - if (pages.getRowIndexes().isPresent()) { - rowIdxInRowGroupItr = pages.getRowIndexes().get(); - } else { - rowIdxInRowGroupItr = LongStream.range(0, pages.getRowCount()).iterator(); - } - // Adjust the row group offset in the `rowIndexWithinRowGroupIterator` iterator. - this.rowIdxInFileItr = - new PrimitiveIterator.OfLong() { - public long nextLong() { - return rowGroupRowIdxOffset.get() + rowIdxInRowGroupItr.nextLong(); - } - - public boolean hasNext() { - return rowIdxInRowGroupItr.hasNext(); - } - - public Long next() { - return rowGroupRowIdxOffset.get() + rowIdxInRowGroupItr.next(); - } - }; - } -} diff --git a/src/main/java/org/apache/parquet/local/ParquetRecordWriter.java b/src/main/java/org/apache/parquet/local/ParquetRecordWriter.java deleted file mode 100644 index 194abe4..0000000 --- a/src/main/java/org/apache/parquet/local/ParquetRecordWriter.java +++ /dev/null @@ -1,187 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.parquet.local; - -import org.apache.parquet.column.ColumnWriteStore; -import org.apache.parquet.column.ParquetProperties; -import org.apache.parquet.compression.CompressionCodecFactory; -import org.apache.parquet.crypto.InternalFileEncryptor; -import org.apache.parquet.io.ColumnIOFactory; -import org.apache.parquet.io.MessageColumnIO; -import org.apache.parquet.io.api.RecordConsumer; -import org.apache.parquet.io.api.RecordDematerializer; -import org.apache.parquet.schema.MessageType; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.util.Map; -import java.util.Objects; - -import static java.lang.Math.max; -import static java.lang.Math.min; - -public class ParquetRecordWriter { - private static final Logger LOG = LoggerFactory.getLogger(ParquetRecordWriter.class); - - private final ParquetFileWriter parquetFileWriter; - private final RecordDematerializer recordDematerializer; - private final MessageType schema; - private final Map extraMetaData; - private final long rowGroupSizeThreshold; - private long nextRowGroupSize; - private final CompressionCodecFactory.BytesInputCompressor compressor; - private final boolean validating; - private final ParquetProperties props; - - private boolean closed; - - private long recordCount = 0; - private long recordCountForNextMemCheck; - private long lastRowGroupEndPos = 0; - - private ColumnWriteStore columnStore; - private ColumnChunkPageWriteStore pageStore; - private RecordConsumer recordConsumer; - - private final InternalFileEncryptor fileEncryptor; - private int rowGroupOrdinal; - - public ParquetRecordWriter( - ParquetFileWriter parquetFileWriter, - RecordDematerializer recordDematerializer, - MessageType schema, - Map extraMetaData, - ParquetWriteOptions options) - throws IOException { - parquetFileWriter.start(); - - this.parquetFileWriter = parquetFileWriter; - this.recordDematerializer = - Objects.requireNonNull(recordDematerializer, "writeSupport cannot be null"); - this.schema = schema; - this.extraMetaData = extraMetaData; - this.rowGroupSizeThreshold = options.getRowGroupSize(); - this.nextRowGroupSize = rowGroupSizeThreshold; - this.compressor = options.getCompressor(); - this.validating = options.isEnableValidation(); - this.props = options.getParquetProperties(); - this.fileEncryptor = parquetFileWriter.getEncryptor(); - this.rowGroupOrdinal = 0; - initStore(); - recordCountForNextMemCheck = props.getMinRowCountForPageSizeCheck(); - } - - private void initStore() { - ColumnChunkPageWriteStore columnChunkPageWriteStore = - new ColumnChunkPageWriteStore( - compressor, - schema, - props.getAllocator(), - props.getColumnIndexTruncateLength(), - props.getPageWriteChecksumEnabled(), - fileEncryptor, - rowGroupOrdinal); - pageStore = columnChunkPageWriteStore; - - columnStore = props.newColumnWriteStore(schema, pageStore, columnChunkPageWriteStore); - MessageColumnIO columnIO = new ColumnIOFactory(validating).getColumnIO(schema); - this.recordConsumer = columnIO.getRecordWriter(columnStore); - recordDematerializer.setRecordConsumer(recordConsumer); - } - - public void close() throws IOException, InterruptedException { - if (!closed) { - flushRowGroupToStore(); - parquetFileWriter.end(schema, extraMetaData); - closed = true; - } - } - - public void write(T value) throws IOException { - recordDematerializer.write(value); - ++recordCount; - checkBlockSizeReached(); - } - - /** @return the total size of data written to the file and buffered in memory */ - public long 
getDataSize() { - return lastRowGroupEndPos + columnStore.getBufferedSize(); - } - - private void checkBlockSizeReached() throws IOException { - if (recordCount - >= recordCountForNextMemCheck) { // checking the memory size is relatively expensive, so - // let's not do it for every record. - long memSize = columnStore.getBufferedSize(); - long recordSize = memSize / recordCount; - // flush the row group if it is within ~2 records of the limit - // it is much better to be slightly under size than to be over at all - if (memSize > (nextRowGroupSize - 2 * recordSize)) { - LOG.debug( - "mem size {} > {}: flushing {} records to disk.", - memSize, - nextRowGroupSize, - recordCount); - flushRowGroupToStore(); - initStore(); - recordCountForNextMemCheck = - min( - max(props.getMinRowCountForPageSizeCheck(), recordCount / 2), - props.getMaxRowCountForPageSizeCheck()); - this.lastRowGroupEndPos = parquetFileWriter.getPos(); - } else { - recordCountForNextMemCheck = - min( - max( - props.getMinRowCountForPageSizeCheck(), - (recordCount + (long) (nextRowGroupSize / ((float) recordSize))) - / 2), // will check halfway - recordCount - + props.getMaxRowCountForPageSizeCheck() // will not look more than max records - // ahead - ); - LOG.debug( - "Checked mem at {} will check again at: {}", recordCount, recordCountForNextMemCheck); - } - } - } - - private void flushRowGroupToStore() throws IOException { - recordConsumer.flush(); - LOG.debug( - "Flushing mem columnStore to file. allocated memory: {}", columnStore.getAllocatedSize()); - if (columnStore.getAllocatedSize() > (3 * rowGroupSizeThreshold)) { - LOG.warn("Too much memory used: {}", columnStore.memUsageString()); - } - - if (recordCount > 0) { - rowGroupOrdinal++; - parquetFileWriter.startBlock(recordCount); - columnStore.flush(); - pageStore.flushToFileWriter(parquetFileWriter); - recordCount = 0; - parquetFileWriter.endBlock(); - this.nextRowGroupSize = min(parquetFileWriter.getNextRowGroupSize(), rowGroupSizeThreshold); - } - - columnStore = null; - pageStore = null; - } -} diff --git a/src/main/java/org/apache/parquet/local/ParquetWriteOptions.java b/src/main/java/org/apache/parquet/local/ParquetWriteOptions.java deleted file mode 100644 index 42226a1..0000000 --- a/src/main/java/org/apache/parquet/local/ParquetWriteOptions.java +++ /dev/null @@ -1,250 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -package org.apache.parquet.local; - -import org.apache.parquet.column.ParquetProperties; -import org.apache.parquet.compression.CompressionCodecFactory; -import org.apache.parquet.crypto.FileEncryptionProperties; -import org.apache.parquet.hadoop.metadata.CompressionCodecName; - -public class ParquetWriteOptions { - - // max size (bytes) to write as padding and the min size of a row group - public static final int MAX_PADDING_SIZE_DEFAULT = 8 * 1024 * 1024; // 8MB - public static final int ROW_GROUP_SIZE_DEFAULT = 128 * 1024 * 1024; // 128MB - public static final boolean ENABLE_OVERWRITE_DEFAULT = true; - public static final boolean ENABLE_VALIDATION_DEFAULT = false; - - private final ParquetProperties parquetProperties; - - private final boolean enableOverwrite; - - private final boolean enableValidation; - - private final long rowGroupSize; - - private final int maxPaddingSize; - - private final CompressionCodecFactory.BytesInputCompressor compressor; - - private final FileEncryptionProperties encryptionProperties; - - public ParquetWriteOptions( - ParquetProperties parquetProperties, - boolean enableOverwrite, - boolean enableValidation, - long rowGroupSize, - int maxPaddingSize, - CompressionCodecFactory.BytesInputCompressor compressor, - FileEncryptionProperties encryptionProperties) { - this.parquetProperties = parquetProperties; - this.enableOverwrite = enableOverwrite; - this.enableValidation = enableValidation; - this.rowGroupSize = rowGroupSize; - this.maxPaddingSize = maxPaddingSize; - this.compressor = compressor; - this.encryptionProperties = encryptionProperties; - } - - public ParquetProperties getParquetProperties() { - return parquetProperties; - } - - public boolean isEnableOverwrite() { - return enableOverwrite; - } - - public boolean isEnableValidation() { - return enableValidation; - } - - public long getRowGroupSize() { - return rowGroupSize; - } - - public int getMaxPaddingSize() { - return maxPaddingSize; - } - - public CompressionCodecFactory.BytesInputCompressor getCompressor() { - return compressor; - } - - public FileEncryptionProperties getEncryptionProperties() { - return encryptionProperties; - } - - public static Builder builder() { - return new Builder(); - } - - public static class Builder { - - private ParquetProperties.Builder parquetPropertiesBuilder = ParquetProperties.builder(); - - private CompressionCodecFactory.BytesInputCompressor compressor = null; - - private FileEncryptionProperties encryptionProperties = null; - - private long rowGroupSize = ROW_GROUP_SIZE_DEFAULT; - - private int maxPaddingSize = MAX_PADDING_SIZE_DEFAULT; - - private boolean enableValidation = ENABLE_VALIDATION_DEFAULT; - - private boolean enableOverwrite = ENABLE_OVERWRITE_DEFAULT; - - public Builder withOverwrite(boolean enableOverwrite) { - this.enableOverwrite = enableOverwrite; - return this; - } - - public Builder withCompressor(CompressionCodecFactory.BytesInputCompressor compressor) { - this.compressor = compressor; - return this; - } - - public Builder withEncryption(FileEncryptionProperties encryptionProperties) { - this.encryptionProperties = encryptionProperties; - return this; - } - - public Builder withRowGroupSize(long rowGroupSize) { - this.rowGroupSize = rowGroupSize; - return this; - } - - public Builder withPageSize(int pageSize) { - parquetPropertiesBuilder.withPageSize(pageSize); - return this; - } - - public Builder withPageRowCountLimit(int rowCount) { - parquetPropertiesBuilder.withPageRowCountLimit(rowCount); - return this; - } - - public 
Builder withDictionaryPageSize(int dictionaryPageSize) { - parquetPropertiesBuilder.withDictionaryPageSize(dictionaryPageSize); - return this; - } - - public Builder withMaxPaddingSize(int maxPaddingSize) { - this.maxPaddingSize = maxPaddingSize; - return this; - } - - public Builder withDictionaryEncoding(boolean enableDictionary) { - parquetPropertiesBuilder.withDictionaryEncoding(enableDictionary); - return this; - } - - public Builder withByteStreamSplitEncoding(boolean enableByteStreamSplit) { - parquetPropertiesBuilder.withByteStreamSplitEncoding(enableByteStreamSplit); - return this; - } - - public Builder withDictionaryEncoding(String columnPath, boolean enableDictionary) { - parquetPropertiesBuilder.withDictionaryEncoding(columnPath, enableDictionary); - return this; - } - - public Builder withValidation(boolean enableValidation) { - this.enableValidation = enableValidation; - return this; - } - - public Builder withWriterVersion(ParquetProperties.WriterVersion version) { - parquetPropertiesBuilder.withWriterVersion(version); - return this; - } - - public Builder withPageWriteChecksumEnabled(boolean enablePageWriteChecksum) { - parquetPropertiesBuilder.withPageWriteChecksumEnabled(enablePageWriteChecksum); - return this; - } - - public Builder withBloomFilterNDV(String columnPath, long ndv) { - parquetPropertiesBuilder.withBloomFilterNDV(columnPath, ndv); - return this; - } - - public Builder withBloomFilterFPP(String columnPath, double fpp) { - parquetPropertiesBuilder.withBloomFilterFPP(columnPath, fpp); - return this; - } - - public Builder withBloomFilterEnabled(boolean enabled) { - parquetPropertiesBuilder.withBloomFilterEnabled(enabled); - return this; - } - - public Builder withBloomFilterEnabled(String columnPath, boolean enabled) { - parquetPropertiesBuilder.withBloomFilterEnabled(columnPath, enabled); - return this; - } - - public Builder withMinRowCountForPageSizeCheck(int min) { - parquetPropertiesBuilder.withMinRowCountForPageSizeCheck(min); - return this; - } - - public Builder withMaxRowCountForPageSizeCheck(int max) { - parquetPropertiesBuilder.withMaxRowCountForPageSizeCheck(max); - return this; - } - - public Builder withColumnIndexTruncateLength(int length) { - parquetPropertiesBuilder.withColumnIndexTruncateLength(length); - return this; - } - - public Builder withStatisticsTruncateLength(int length) { - parquetPropertiesBuilder.withStatisticsTruncateLength(length); - return this; - } - - public Builder copy(ParquetWriteOptions options) { - this.parquetPropertiesBuilder = ParquetProperties.copy(options.parquetProperties); - withCompressor(options.compressor); - withEncryption(options.encryptionProperties); - withRowGroupSize(options.rowGroupSize); - withMaxPaddingSize(options.maxPaddingSize); - withValidation(options.enableValidation); - withOverwrite(options.enableOverwrite); - return this; - } - - public ParquetWriteOptions build() { - CompressionCodecFactory.BytesInputCompressor compressor = this.compressor; - if (compressor == null) { - compressor = new CodecFactory().getCompressor(CompressionCodecName.UNCOMPRESSED); - } - return new ParquetWriteOptions( - parquetPropertiesBuilder.build(), - enableOverwrite, - enableValidation, - rowGroupSize, - maxPaddingSize, - compressor, - encryptionProperties); - } - } -} diff --git a/src/main/java/org/apache/parquet/local/codec/NoopBytesInputCompressor.java b/src/main/java/org/apache/parquet/local/codec/NoopBytesInputCompressor.java deleted file mode 100644 index 9e607d5..0000000 --- 
a/src/main/java/org/apache/parquet/local/codec/NoopBytesInputCompressor.java +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright 2023 IginX of Tsinghua University - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.parquet.local.codec; - -import org.apache.parquet.bytes.BytesInput; -import org.apache.parquet.compression.CompressionCodecFactory; -import org.apache.parquet.hadoop.metadata.CompressionCodecName; - -import java.io.IOException; - -public class NoopBytesInputCompressor implements CompressionCodecFactory.BytesInputCompressor { - @Override - public BytesInput compress(BytesInput bytes) throws IOException { - return bytes; - } - - @Override - public CompressionCodecName getCodecName() { - return CompressionCodecName.UNCOMPRESSED; - } - - @Override - public void release() {} -} diff --git a/src/main/java/org/apache/parquet/local/codec/NoopBytesInputDecompressor.java b/src/main/java/org/apache/parquet/local/codec/NoopBytesInputDecompressor.java deleted file mode 100644 index 414a61b..0000000 --- a/src/main/java/org/apache/parquet/local/codec/NoopBytesInputDecompressor.java +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright 2023 IginX of Tsinghua University - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.parquet.local.codec; - -import org.apache.parquet.bytes.BytesInput; -import org.apache.parquet.compression.CompressionCodecFactory; - -import java.io.IOException; -import java.nio.ByteBuffer; - -public class NoopBytesInputDecompressor implements CompressionCodecFactory.BytesInputDecompressor { - @Override - public void decompress( - ByteBuffer input, int compressedSize, ByteBuffer output, int uncompressedSize) - throws IOException { - if (compressedSize != uncompressedSize) { - throw new IOException( - "Non-compressed data did not have matching compressed and uncompressed sizes."); - } - output.clear(); - output.put((ByteBuffer) input.duplicate().position(0).limit(compressedSize)); - } - - @Override - public BytesInput decompress(BytesInput bytes, int uncompressedSize) { - return bytes; - } - - @Override - public void release() {} -} diff --git a/src/main/java/org/apache/parquet/local/codec/SnappyBytesInputCompressor.java b/src/main/java/org/apache/parquet/local/codec/SnappyBytesInputCompressor.java deleted file mode 100644 index c2ba778..0000000 --- a/src/main/java/org/apache/parquet/local/codec/SnappyBytesInputCompressor.java +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright 2023 IginX of Tsinghua University - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.parquet.local.codec; - -import org.apache.parquet.bytes.BytesInput; -import org.apache.parquet.compression.CompressionCodecFactory; -import org.apache.parquet.hadoop.metadata.CompressionCodecName; -import org.xerial.snappy.Snappy; - -import java.io.IOException; - -public class SnappyBytesInputCompressor implements CompressionCodecFactory.BytesInputCompressor { - - @Override - public BytesInput compress(BytesInput bytes) throws IOException { - int maxOutputSize = Snappy.maxCompressedLength((int) bytes.size()); - byte[] outgoing = new byte[maxOutputSize]; - int compressedSize = Snappy.compress(bytes.toByteArray(), 0, (int) bytes.size(), outgoing, 0); - return BytesInput.from(outgoing, 0, compressedSize); - } - - @Override - public CompressionCodecName getCodecName() { - return CompressionCodecName.SNAPPY; - } - - @Override - public void release() {} -} diff --git a/src/main/java/org/apache/parquet/local/codec/SnappyBytesInputDecompressor.java b/src/main/java/org/apache/parquet/local/codec/SnappyBytesInputDecompressor.java deleted file mode 100644 index ef12e5a..0000000 --- a/src/main/java/org/apache/parquet/local/codec/SnappyBytesInputDecompressor.java +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Copyright 2023 IginX of Tsinghua University - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.parquet.local.codec; - -import org.apache.parquet.bytes.BytesInput; -import org.apache.parquet.compression.CompressionCodecFactory; -import org.xerial.snappy.Snappy; - -import java.io.IOException; -import java.nio.ByteBuffer; - -public class SnappyBytesInputDecompressor - implements CompressionCodecFactory.BytesInputDecompressor { - - @Override - public BytesInput decompress(BytesInput bytes, int uncompressedSize) throws IOException { - byte[] ingoing = bytes.toByteArray(); - byte[] outgoing = Snappy.uncompress(ingoing); - if (outgoing.length != uncompressedSize) { - throw new IOException("Non-compressed data did not have matching uncompressed sizes."); - } - return BytesInput.from(outgoing); - } - - @Override - public void decompress( - ByteBuffer input, int compressedSize, ByteBuffer output, int uncompressedSize) - throws IOException { - output.clear(); - int size = Snappy.uncompress(input, output); - output.limit(size); - } - - @Override - public void release() {} -} diff --git a/src/main/java/org/apache/parquet/local/filter2/bloomfilterlevel/BloomFilterImpl.java b/src/main/java/org/apache/parquet/local/filter2/bloomfilterlevel/BloomFilterImpl.java deleted file mode 100644 index 6e2071c..0000000 --- a/src/main/java/org/apache/parquet/local/filter2/bloomfilterlevel/BloomFilterImpl.java +++ /dev/null @@ -1,190 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */
-
-package org.apache.parquet.local.filter2.bloomfilterlevel;
-
-import org.apache.parquet.column.values.bloomfilter.BloomFilter;
-import org.apache.parquet.filter2.predicate.FilterPredicate;
-import org.apache.parquet.filter2.predicate.Operators;
-import org.apache.parquet.filter2.predicate.UserDefinedPredicate;
-import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
-import org.apache.parquet.hadoop.metadata.ColumnPath;
-import org.apache.parquet.local.BloomFilterReader;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.util.*;
-
-public class BloomFilterImpl implements FilterPredicate.Visitor<Boolean> {
-  private static final Logger LOG = LoggerFactory.getLogger(BloomFilterImpl.class);
-  private static final boolean BLOCK_MIGHT_MATCH = false;
-  private static final boolean BLOCK_CANNOT_MATCH = true;
-
-  private final Map<ColumnPath, ColumnChunkMetaData> columns =
-      new HashMap<ColumnPath, ColumnChunkMetaData>();
-
-  public static boolean canDrop(
-      FilterPredicate pred,
-      List<ColumnChunkMetaData> columns,
-      BloomFilterReader bloomFilterReader) {
-    Objects.requireNonNull(pred, "pred");
-    Objects.requireNonNull(columns, "columns");
-    return pred.accept(new BloomFilterImpl(columns, bloomFilterReader));
-  }
-
-  private BloomFilterImpl(
-      List<ColumnChunkMetaData> columnsList, BloomFilterReader bloomFilterReader) {
-    for (ColumnChunkMetaData chunk : columnsList) {
-      columns.put(chunk.getPath(), chunk);
-    }
-
-    this.bloomFilterReader = bloomFilterReader;
-  }
-
-  private final BloomFilterReader bloomFilterReader;
-
-  private ColumnChunkMetaData getColumnChunk(ColumnPath columnPath) {
-    return columns.get(columnPath);
-  }
-
-  @Override
-  public <T extends Comparable<T>> Boolean visit(Operators.Eq<T> eq) {
-    T value = eq.getValue();
-
-    if (value == null) {
-      // the bloom filter bitset contains only non-null values so isn't helpful. this
-      // could check the column stats, but the StatisticsFilter is responsible
-      return BLOCK_MIGHT_MATCH;
-    }
-
-    Operators.Column<T> filterColumn = eq.getColumn();
-    ColumnChunkMetaData meta = getColumnChunk(filterColumn.getColumnPath());
-    if (meta == null) {
-      // the column isn't in this file so all values are null, but the value
-      // must be non-null because of the above check.
-      return BLOCK_CANNOT_MATCH;
-    }
-
-    try {
-      BloomFilter bloomFilter = bloomFilterReader.readBloomFilter(meta);
-      if (bloomFilter != null && !bloomFilter.findHash(bloomFilter.hash(value))) {
-        return BLOCK_CANNOT_MATCH;
-      }
-    } catch (RuntimeException e) {
-      LOG.warn(e.getMessage());
-      return BLOCK_MIGHT_MATCH;
-    }
-
-    return BLOCK_MIGHT_MATCH;
-  }
-
-  @Override
-  public <T extends Comparable<T>> Boolean visit(Operators.NotEq<T> notEq) {
-    return BLOCK_MIGHT_MATCH;
-  }
-
-  @Override
-  public <T extends Comparable<T>> Boolean visit(Operators.Lt<T> lt) {
-    return BLOCK_MIGHT_MATCH;
-  }
-
-  @Override
-  public <T extends Comparable<T>> Boolean visit(Operators.LtEq<T> ltEq) {
-    return BLOCK_MIGHT_MATCH;
-  }
-
-  @Override
-  public <T extends Comparable<T>> Boolean visit(Operators.Gt<T> gt) {
-    return BLOCK_MIGHT_MATCH;
-  }
-
-  @Override
-  public <T extends Comparable<T>> Boolean visit(Operators.GtEq<T> gtEq) {
-    return BLOCK_MIGHT_MATCH;
-  }
-
-  @Override
-  public <T extends Comparable<T>> Boolean visit(Operators.In<T> in) {
-    Set<T> values = in.getValues();
-
-    if (values.contains(null)) {
-      // the bloom filter bitset contains only non-null values so isn't helpful. this
-      // could check the column stats, but the StatisticsFilter is responsible
-      return BLOCK_MIGHT_MATCH;
-    }
-
-    Operators.Column<T> filterColumn = in.getColumn();
-    ColumnChunkMetaData meta = getColumnChunk(filterColumn.getColumnPath());
-    if (meta == null) {
-      // the column isn't in this file so all values are null, but the value
-      // must be non-null because of the above check.
-      return BLOCK_CANNOT_MATCH;
-    }
-
-    BloomFilter bloomFilter = bloomFilterReader.readBloomFilter(meta);
-    if (bloomFilter != null) {
-      for (T value : values) {
-        if (bloomFilter.findHash(bloomFilter.hash(value))) {
-          return BLOCK_MIGHT_MATCH;
-        }
-      }
-      return BLOCK_CANNOT_MATCH;
-    }
-    return BLOCK_MIGHT_MATCH;
-  }
-
-  @Override
-  public <T extends Comparable<T>> Boolean visit(Operators.NotIn<T> notIn) {
-    return BLOCK_MIGHT_MATCH;
-  }
-
-  @Override
-  public Boolean visit(Operators.And and) {
-    return and.getLeft().accept(this) || and.getRight().accept(this);
-  }
-
-  @Override
-  public Boolean visit(Operators.Or or) {
-    return or.getLeft().accept(this) && or.getRight().accept(this);
-  }
-
-  @Override
-  public Boolean visit(Operators.Not not) {
-    throw new IllegalArgumentException(
-        "This predicate contains a not! Did you forget to run this predicate through LogicalInverseRewriter? "
-            + not);
-  }
-
-  private <T extends Comparable<T>, U extends UserDefinedPredicate<T>> Boolean visit(
-      Operators.UserDefined<T, U> ud, boolean inverted) {
-    return BLOCK_MIGHT_MATCH;
-  }
-
-  @Override
-  public <T extends Comparable<T>, U extends UserDefinedPredicate<T>> Boolean visit(
-      Operators.UserDefined<T, U> udp) {
-    return visit(udp, false);
-  }
-
-  @Override
-  public <T extends Comparable<T>, U extends UserDefinedPredicate<T>> Boolean visit(
-      Operators.LogicalNotUserDefined<T, U> udp) {
-    return visit(udp.getUserDefined(), true);
-  }
-}
diff --git a/src/main/java/org/apache/parquet/local/filter2/compat/RowGroupFilter.java b/src/main/java/org/apache/parquet/local/filter2/compat/RowGroupFilter.java
deleted file mode 100644
index 2298097..0000000
--- a/src/main/java/org/apache/parquet/local/filter2/compat/RowGroupFilter.java
+++ /dev/null
@@ -1,118 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.parquet.local.filter2.compat;
-
-import org.apache.parquet.filter2.compat.FilterCompat;
-import org.apache.parquet.filter2.compat.FilterCompat.Filter;
-import org.apache.parquet.filter2.compat.FilterCompat.NoOpFilter;
-import org.apache.parquet.filter2.compat.FilterCompat.Visitor;
-import org.apache.parquet.filter2.dictionarylevel.DictionaryFilter;
-import org.apache.parquet.filter2.predicate.FilterPredicate;
-import org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator;
-import org.apache.parquet.filter2.statisticslevel.StatisticsFilter;
-import org.apache.parquet.hadoop.metadata.BlockMetaData;
-import org.apache.parquet.local.ParquetFileReader;
-import org.apache.parquet.local.filter2.bloomfilterlevel.BloomFilterImpl;
-import org.apache.parquet.schema.MessageType;
-
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Objects;
-
-/**
- * Given a {@link Filter}, applies it to a list of BlockMetaData (row groups). If the Filter is an
- * {@link org.apache.parquet.filter.UnboundRecordFilter} or the no-op filter, no filtering will be
- * performed.
- */
-public class RowGroupFilter implements Visitor<List<BlockMetaData>> {
-  private final List<BlockMetaData> blocks;
-  private final MessageType schema;
-  private final List<FilterLevel> levels;
-  private final ParquetFileReader reader;
-
-  public enum FilterLevel {
-    STATISTICS,
-    DICTIONARY,
-    BLOOMFILTER
-  }
-
-  public static List<BlockMetaData> filterRowGroups(
-      List<FilterLevel> levels,
-      Filter filter,
-      List<BlockMetaData> blocks,
-      ParquetFileReader reader) {
-    Objects.requireNonNull(filter, "filter cannot be null");
-    return filter.accept(new RowGroupFilter(levels, blocks, reader));
-  }
-
-  private RowGroupFilter(
-      List<FilterLevel> levels, List<BlockMetaData> blocks, ParquetFileReader reader) {
-    this.blocks = Objects.requireNonNull(blocks, "blocks cannot be null");
-    this.reader = Objects.requireNonNull(reader, "reader cannot be null");
-    this.schema = reader.getFileMetaData().getSchema();
-    this.levels = levels;
-  }
-
-  @Override
-  public List<BlockMetaData> visit(FilterCompat.FilterPredicateCompat filterPredicateCompat) {
-    FilterPredicate filterPredicate = filterPredicateCompat.getFilterPredicate();
-
-    // check that the schema of the filter matches the schema of the file
-    SchemaCompatibilityValidator.validate(filterPredicate, schema);
-
-    List<BlockMetaData> filteredBlocks = new ArrayList<BlockMetaData>();
-
-    for (BlockMetaData block : blocks) {
-      boolean drop = false;
-
-      if (levels.contains(FilterLevel.STATISTICS)) {
-        drop = StatisticsFilter.canDrop(filterPredicate, block.getColumns());
-      }
-
-      if (!drop && levels.contains(FilterLevel.DICTIONARY)) {
-        drop =
-            DictionaryFilter.canDrop(
-                filterPredicate, block.getColumns(), reader.getDictionaryReader(block));
-      }
-
-      if (!drop && levels.contains(FilterLevel.BLOOMFILTER)) {
-        drop =
-            BloomFilterImpl.canDrop(
                filterPredicate, block.getColumns(), reader.getBloomFilterDataReader(block));
-      }
-
-      if (!drop) {
-        filteredBlocks.add(block);
-      }
-    }
-
-    return filteredBlocks;
-  }
-
-  @Override
-  public List<BlockMetaData> visit(
-      FilterCompat.UnboundRecordFilterCompat unboundRecordFilterCompat) {
-    return blocks;
-  }
-
-  @Override
-  public List<BlockMetaData> visit(NoOpFilter noOpFilter) {
-    return blocks;
-  }
-}
diff --git a/src/main/java/org/apache/parquet/local/metadata/EncryptedColumnChunkMetaData.java b/src/main/java/org/apache/parquet/local/metadata/EncryptedColumnChunkMetaData.java
deleted file mode 100644
index 4084db0..0000000
--- a/src/main/java/org/apache/parquet/local/metadata/EncryptedColumnChunkMetaData.java
+++ /dev/null
@@ -1,191 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF)
under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.parquet.local.metadata; - -import org.apache.parquet.column.Encoding; -import org.apache.parquet.column.EncodingStats; -import org.apache.parquet.column.statistics.Statistics; -import org.apache.parquet.crypto.AesCipher; -import org.apache.parquet.crypto.InternalColumnDecryptionSetup; -import org.apache.parquet.crypto.InternalFileDecryptor; -import org.apache.parquet.crypto.ModuleCipherFactory.ModuleType; -import org.apache.parquet.crypto.ParquetCryptoRuntimeException; -import org.apache.parquet.format.ColumnMetaData; -import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; -import org.apache.parquet.hadoop.metadata.ColumnChunkProperties; -import org.apache.parquet.hadoop.metadata.ColumnPath; -import org.apache.parquet.hadoop.metadata.CompressionCodecName; -import org.apache.parquet.local.ParquetMetadataConverter; -import org.apache.parquet.schema.PrimitiveType; - -import java.io.ByteArrayInputStream; -import java.io.IOException; -import java.util.Set; - -import static org.apache.parquet.format.Util.readColumnMetaData; - -public class EncryptedColumnChunkMetaData extends ColumnChunkMetaData { - private final ParquetMetadataConverter parquetMetadataConverter; - private final byte[] encryptedMetadata; - private final byte[] columnKeyMetadata; - private final InternalFileDecryptor fileDecryptor; - - private final int columnOrdinal; - private final PrimitiveType primitiveType; - private final String createdBy; - private ColumnPath path; - - private boolean decrypted; - private ColumnChunkMetaData shadowColumnChunkMetaData; - - public EncryptedColumnChunkMetaData( - ParquetMetadataConverter parquetMetadataConverter, - ColumnPath path, - PrimitiveType type, - byte[] encryptedMetadata, - byte[] columnKeyMetadata, - InternalFileDecryptor fileDecryptor, - int rowGroupOrdinal, - int columnOrdinal, - String createdBy) { - super((EncodingStats) null, (ColumnChunkProperties) null); - this.parquetMetadataConverter = parquetMetadataConverter; - this.path = path; - this.encryptedMetadata = encryptedMetadata; - this.columnKeyMetadata = columnKeyMetadata; - this.fileDecryptor = fileDecryptor; - this.rowGroupOrdinal = rowGroupOrdinal; - this.columnOrdinal = columnOrdinal; - this.primitiveType = type; - this.createdBy = createdBy; - - this.decrypted = false; - } - - @Override - protected void decryptIfNeeded() { - if (decrypted) return; - - if (null == fileDecryptor) { - throw new ParquetCryptoRuntimeException(path + ". 
Null File Decryptor"); - } - - // Decrypt the ColumnMetaData - InternalColumnDecryptionSetup columnDecryptionSetup = - fileDecryptor.setColumnCryptoMetadata(path, true, false, columnKeyMetadata, columnOrdinal); - - ColumnMetaData metaData; - ByteArrayInputStream tempInputStream = new ByteArrayInputStream(encryptedMetadata); - byte[] columnMetaDataAAD = - AesCipher.createModuleAAD( - fileDecryptor.getFileAAD(), - ModuleType.ColumnMetaData, - rowGroupOrdinal, - columnOrdinal, - -1); - try { - metaData = - readColumnMetaData( - tempInputStream, columnDecryptionSetup.getMetaDataDecryptor(), columnMetaDataAAD); - } catch (IOException e) { - throw new ParquetCryptoRuntimeException(path + ". Failed to decrypt column metadata", e); - } - decrypted = true; - shadowColumnChunkMetaData = - parquetMetadataConverter.buildColumnChunkMetaData(metaData, path, primitiveType, createdBy); - if (metaData.isSetBloom_filter_offset()) { - setBloomFilterOffset(metaData.getBloom_filter_offset()); - } - } - - @Override - public ColumnPath getPath() { - return path; - } - - @Override - public long getFirstDataPageOffset() { - decryptIfNeeded(); - return shadowColumnChunkMetaData.getFirstDataPageOffset(); - } - - @Override - public long getDictionaryPageOffset() { - decryptIfNeeded(); - return shadowColumnChunkMetaData.getDictionaryPageOffset(); - } - - @Override - public long getValueCount() { - decryptIfNeeded(); - return shadowColumnChunkMetaData.getValueCount(); - } - - @Override - public long getTotalUncompressedSize() { - decryptIfNeeded(); - return shadowColumnChunkMetaData.getTotalUncompressedSize(); - } - - @Override - public long getTotalSize() { - decryptIfNeeded(); - return shadowColumnChunkMetaData.getTotalSize(); - } - - @Override - public Statistics getStatistics() { - decryptIfNeeded(); - return shadowColumnChunkMetaData.getStatistics(); - } - - @Override - public boolean isEncrypted() { - return true; - } - - public CompressionCodecName getCodec() { - decryptIfNeeded(); - return shadowColumnChunkMetaData.getCodec(); - } - - @Override - public PrimitiveType.PrimitiveTypeName getType() { - decryptIfNeeded(); - return shadowColumnChunkMetaData.getType(); - } - - @Override - public PrimitiveType getPrimitiveType() { - decryptIfNeeded(); - return shadowColumnChunkMetaData.getPrimitiveType(); - } - - @Override - public Set getEncodings() { - decryptIfNeeded(); - return shadowColumnChunkMetaData.getEncodings(); - } - - @Override - public EncodingStats getEncodingStats() { - decryptIfNeeded(); - return shadowColumnChunkMetaData.getEncodingStats(); - } -}
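
Editor's note: for context on the API removed by this diff, the sketch below shows how the option builders defined in the deleted ParquetReadOptions and ParquetWriteOptions classes might be configured, together with the SnappyBytesInputCompressor codec from the same change. This is a minimal, illustrative sketch and not part of the diff itself; it assumes the package layout shown above, the specific sizes and ranges are arbitrary, and the wiring into ParquetFileReader/ParquetFileWriter is omitted because their constructors are not shown here.

// Illustrative only: configures the option objects defined in the deleted files above.
import org.apache.parquet.column.ParquetProperties;
import org.apache.parquet.local.ParquetReadOptions;
import org.apache.parquet.local.ParquetWriteOptions;
import org.apache.parquet.local.codec.SnappyBytesInputCompressor;

public class LocalParquetOptionsExample {
  public static void main(String[] args) {
    // Write-side options: row-group/page sizing, dictionary encoding, Snappy compression.
    ParquetWriteOptions writeOptions =
        ParquetWriteOptions.builder()
            .withRowGroupSize(64L * 1024 * 1024)   // 64MB row groups (arbitrary example value)
            .withPageSize(1024 * 1024)             // 1MB pages
            .withDictionaryEncoding(true)
            .withWriterVersion(ParquetProperties.WriterVersion.PARQUET_2_0)
            .withCompressor(new SnappyBytesInputCompressor())
            .build();

    // Read-side options: keep the default stats/dictionary filtering, disable bloom filters,
    // and restrict footer parsing to a byte range of the file.
    ParquetReadOptions readOptions =
        ParquetReadOptions.builder()
            .useStatsFilter(true)
            .useDictionaryFilter(true)
            .useBloomFilter(false)
            .withRange(0, 128L * 1024 * 1024)
            .build();

    // These objects would then be handed to the (also deleted) ParquetRecordWriter and
    // ParquetRecordReader; that wiring is not reproduced here.
    System.out.println("row group size: " + writeOptions.getRowGroupSize());
    System.out.println("bloom filter enabled: " + readOptions.useBloomFilter());
  }
}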