Skip to content

Commit

Permalink
[parquet] support read parquet nested columns. (apache#3656)
Browse files Browse the repository at this point in the history
  • Loading branch information
Stephen0421 authored Jul 3, 2024
1 parent 3c3d384 commit ddd41a4
Show file tree
Hide file tree
Showing 17 changed files with 2,332 additions and 62 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
public class HeapRowVector extends AbstractHeapVector
implements WritableColumnVector, RowColumnVector {

private final WritableColumnVector[] fields;
private WritableColumnVector[] fields;

public HeapRowVector(int len, WritableColumnVector... fields) {
super(len);
Expand Down Expand Up @@ -57,4 +57,8 @@ public void reset() {
field.reset();
}
}

/**
 * Replaces the field vectors held by this row vector.
 *
 * <p>The given array is stored by reference; no copy is made here.
 *
 * @param fields the new field vectors
 */
public void setFields(WritableColumnVector[] fields) {
this.fields = fields;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.paimon.utils;

import java.util.Arrays;

/**
 * A bare-bones, growable list of primitive {@code boolean} values backed by a plain array.
 *
 * <p>Only appending, clearing and snapshotting to an array are supported.
 */
public class BooleanArrayList {

    /** Backing storage; only the first {@code size} slots hold live elements. */
    private boolean[] elements;

    /** Number of live elements in {@code elements}. */
    private int size;

    /**
     * Creates an empty list.
     *
     * @param capacity initial length of the backing array
     */
    public BooleanArrayList(int capacity) {
        this.elements = new boolean[capacity];
        this.size = 0;
    }

    /** Returns the number of elements currently stored. */
    public int size() {
        return size;
    }

    /** Returns {@code true} when the list holds no elements. */
    public boolean isEmpty() {
        return size == 0;
    }

    /**
     * Appends a value at the end of the list, enlarging the backing array if required.
     *
     * @param element the value to append
     * @return always {@code true}
     */
    public boolean add(boolean element) {
        ensureCapacity(size + 1);
        elements[size++] = element;
        return true;
    }

    /** Drops all elements by resetting the size; the backing array is kept. */
    public void clear() {
        size = 0;
    }

    /** Returns a fresh array holding exactly the stored elements, in insertion order. */
    public boolean[] toArray() {
        return Arrays.copyOf(elements, size);
    }

    /**
     * Makes sure the backing array can hold at least {@code required} elements.
     *
     * <p>When too small, the array is doubled (capped at {@code Integer.MAX_VALUE - 8}), but it
     * never ends up shorter than {@code required}.
     */
    private void ensureCapacity(int required) {
        if (required <= elements.length) {
            return;
        }
        final long doubled = Math.min(2L * elements.length, Integer.MAX_VALUE - 8);
        final int newLength = (int) Math.max(doubled, required);
        final boolean[] grown = new boolean[newLength];
        System.arraycopy(elements, 0, grown, 0, size);
        elements = grown;
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.paimon.utils;

import java.util.Arrays;
import java.util.NoSuchElementException;

/**
 * A bare-bones, growable list of primitive {@code int} values backed by a plain array.
 *
 * <p>Supports appending, removing the last element, clearing and snapshotting to an array.
 */
public class IntArrayList {

    /** Shared empty instance whose {@code add} and {@code removeLast} always throw. */
    public static final IntArrayList EMPTY =
            new IntArrayList(0) {

                @Override
                public boolean add(int number) {
                    throw new UnsupportedOperationException();
                }

                @Override
                public int removeLast() {
                    throw new UnsupportedOperationException();
                }
            };

    /** Backing storage; only the first {@code size} slots hold live elements. */
    private int[] elements;

    /** Number of live elements in {@code elements}. */
    private int size;

    /**
     * Creates an empty list.
     *
     * @param capacity initial length of the backing array
     */
    public IntArrayList(final int capacity) {
        this.elements = new int[capacity];
        this.size = 0;
    }

    /** Returns the number of elements currently stored. */
    public int size() {
        return size;
    }

    /** Returns {@code true} when the list holds no elements. */
    public boolean isEmpty() {
        return size == 0;
    }

    /**
     * Appends a value at the end of the list, enlarging the backing array if required.
     *
     * @param number the value to append
     * @return always {@code true}
     */
    public boolean add(final int number) {
        ensureCapacity(size + 1);
        elements[size++] = number;
        return true;
    }

    /**
     * Removes and returns the most recently appended element.
     *
     * @return the element that was at the end of the list
     * @throws NoSuchElementException if the list is empty
     */
    public int removeLast() {
        if (size > 0) {
            return elements[--size];
        }
        throw new NoSuchElementException();
    }

    /** Drops all elements by resetting the size; the backing array is kept. */
    public void clear() {
        size = 0;
    }

    /** Returns a fresh array holding exactly the stored elements, in insertion order. */
    public int[] toArray() {
        return Arrays.copyOf(elements, size);
    }

    /**
     * Makes sure the backing array can hold at least {@code required} elements.
     *
     * <p>When too small, the array is doubled (capped at {@code Integer.MAX_VALUE - 8}), but it
     * never ends up shorter than {@code required}.
     */
    private void ensureCapacity(final int required) {
        if (required <= elements.length) {
            return;
        }
        final long doubled = Math.min(2L * elements.length, Integer.MAX_VALUE - 8);
        final int newLength = (int) Math.max(doubled, required);
        final int[] grown = new int[newLength];
        System.arraycopy(elements, 0, grown, 0, size);
        elements = grown;
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.paimon.utils;

import java.util.Arrays;

/**
 * A bare-bones, growable list of primitive {@code long} values backed by a plain array.
 *
 * <p>Supports appending, removing by index, clearing and snapshotting to an array.
 */
public class LongArrayList {

    /** Backing storage; only the first {@code size} slots hold live elements. */
    private long[] elements;

    /** Number of live elements in {@code elements}. */
    private int size;

    /**
     * Creates an empty list.
     *
     * @param capacity initial length of the backing array
     */
    public LongArrayList(int capacity) {
        this.elements = new long[capacity];
        this.size = 0;
    }

    /** Returns the number of elements currently stored. */
    public int size() {
        return size;
    }

    /** Returns {@code true} when the list holds no elements. */
    public boolean isEmpty() {
        return size == 0;
    }

    /**
     * Appends a value at the end of the list, enlarging the backing array if required.
     *
     * @param number the value to append
     * @return always {@code true}
     */
    public boolean add(long number) {
        ensureCapacity(size + 1);
        elements[size++] = number;
        return true;
    }

    /**
     * Removes the element at the given position and shifts every later element one slot left.
     *
     * @param index position of the element to remove
     * @return the removed element
     * @throws IndexOutOfBoundsException if {@code index} is not smaller than the list size
     */
    public long removeLong(int index) {
        if (index >= size) {
            throw new IndexOutOfBoundsException(
                    "Index (" + index + ") is greater than or equal to list size (" + size + ")");
        }
        final long removed = elements[index];
        // Number of elements located after the removed slot.
        final int trailing = size - index - 1;
        if (trailing != 0) {
            System.arraycopy(elements, index + 1, elements, index, trailing);
        }
        size--;
        return removed;
    }

    /** Drops all elements by resetting the size; the backing array is kept. */
    public void clear() {
        size = 0;
    }

    /** Returns a fresh array holding exactly the stored elements, in list order. */
    public long[] toArray() {
        return Arrays.copyOf(elements, size);
    }

    /**
     * Makes sure the backing array can hold at least {@code required} elements.
     *
     * <p>When too small, the array is doubled (capped at {@code Integer.MAX_VALUE - 8}), but it
     * never ends up shorter than {@code required}.
     */
    private void ensureCapacity(int required) {
        if (required <= elements.length) {
            return;
        }
        final long doubled = Math.min(2L * elements.length, Integer.MAX_VALUE - 8);
        final int newLength = (int) Math.max(doubled, required);
        final long[] grown = new long[newLength];
        System.arraycopy(elements, 0, grown, 0, size);
        elements = grown;
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
import org.apache.paimon.format.parquet.reader.ColumnReader;
import org.apache.paimon.format.parquet.reader.ParquetDecimalVector;
import org.apache.paimon.format.parquet.reader.ParquetTimestampVector;
import org.apache.paimon.format.parquet.type.ParquetField;
import org.apache.paimon.fs.Path;
import org.apache.paimon.options.Options;
import org.apache.paimon.reader.RecordReader;
Expand All @@ -42,6 +43,8 @@
import org.apache.parquet.filter2.compat.FilterCompat;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.ParquetInputFormat;
import org.apache.parquet.io.ColumnIOFactory;
import org.apache.parquet.io.MessageColumnIO;
import org.apache.parquet.schema.GroupType;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.Type;
Expand All @@ -57,6 +60,7 @@
import java.util.List;
import java.util.Set;

import static org.apache.paimon.format.parquet.reader.ParquetSplitReaderUtil.buildFieldsList;
import static org.apache.paimon.format.parquet.reader.ParquetSplitReaderUtil.createColumnReader;
import static org.apache.paimon.format.parquet.reader.ParquetSplitReaderUtil.createWritableColumnVector;
import static org.apache.parquet.hadoop.UnmaterializableRecordCounter.BAD_RECORD_THRESHOLD_CONF_KEY;
Expand All @@ -72,6 +76,8 @@ public class ParquetReaderFactory implements FormatReaderFactory {
private static final String ALLOCATION_SIZE = "parquet.read.allocation.size";

private final Options conf;

private final RowType projectedType;
private final String[] projectedFields;
private final DataType[] projectedTypes;
private final int batchSize;
Expand All @@ -81,6 +87,7 @@ public class ParquetReaderFactory implements FormatReaderFactory {
public ParquetReaderFactory(
Options conf, RowType projectedType, int batchSize, FilterCompat.Filter filter) {
this.conf = conf;
this.projectedType = projectedType;
this.projectedFields = projectedType.getFieldNames().toArray(new String[0]);
this.projectedTypes = projectedType.getFieldTypes().toArray(new DataType[0]);
this.batchSize = batchSize;
Expand All @@ -106,7 +113,12 @@ public ParquetReader createReader(FormatReaderFactory.Context context) throws IO
Pool<ParquetReaderBatch> poolOfBatches =
createPoolOfBatches(context.filePath(), requestedSchema);

return new ParquetReader(reader, requestedSchema, reader.getRecordCount(), poolOfBatches);
MessageColumnIO columnIO = new ColumnIOFactory().getColumnIO(requestedSchema);
List<ParquetField> fields =
buildFieldsList(projectedType.getFields(), projectedType.getFieldNames(), columnIO);

return new ParquetReader(
reader, requestedSchema, reader.getRecordCount(), poolOfBatches, fields);
}

private void setReadOptions(ParquetReadOptions.Builder builder) {
Expand Down Expand Up @@ -270,11 +282,14 @@ private class ParquetReader implements RecordReader<InternalRow> {
@SuppressWarnings("rawtypes")
private ColumnReader[] columnReaders;

private final List<ParquetField> fields;

private ParquetReader(
ParquetFileReader reader,
MessageType requestedSchema,
long totalRowCount,
Pool<ParquetReaderBatch> pool) {
Pool<ParquetReaderBatch> pool,
List<ParquetField> fields) {
this.reader = reader;
this.requestedSchema = requestedSchema;
this.totalRowCount = totalRowCount;
Expand All @@ -283,6 +298,7 @@ private ParquetReader(
this.totalCountLoadedSoFar = 0;
this.currentRowPosition = 0;
this.nextRowPosition = 0;
this.fields = fields;
}

@Nullable
Expand Down Expand Up @@ -348,6 +364,7 @@ private void readNextRowGroup() throws IOException {
types.get(i),
requestedSchema.getColumns(),
rowGroup,
fields.get(i),
0);
}
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.paimon.format.parquet.position;

import javax.annotation.Nullable;

/**
 * Represents the positions of collections within a repeated type.
 *
 * <p>This is an immutable holder: the arrays passed to the constructor are stored and returned by
 * reference, without copying.
 */
public class CollectionPosition {

    /** Null flags; may be {@code null}. Presumably one entry per collection (to be confirmed). */
    @Nullable private final boolean[] isNull;

    /** Offsets of the collections; presumably the start index of each one (to be confirmed). */
    private final long[] offsets;

    /** Lengths of the collections; presumably the element count of each one (to be confirmed). */
    private final long[] length;

    /** Value count supplied by the creator of this position. */
    private final int valueCount;

    /**
     * Creates a position holder from precomputed arrays.
     *
     * @param isNull null flags, or {@code null} if none are provided
     * @param offsets collection offsets
     * @param length collection lengths
     * @param valueCount value count
     */
    public CollectionPosition(
            @Nullable boolean[] isNull, long[] offsets, long[] length, int valueCount) {
        this.isNull = isNull;
        this.offsets = offsets;
        this.length = length;
        this.valueCount = valueCount;
    }

    /**
     * Returns the null flags.
     *
     * @return the array given at construction time, which may be {@code null}
     */
    @Nullable
    public boolean[] getIsNull() {
        return isNull;
    }

    /** Returns the offsets array given at construction time. */
    public long[] getOffsets() {
        return offsets;
    }

    /** Returns the lengths array given at construction time. */
    public long[] getLength() {
        return length;
    }

    /** Returns the value count given at construction time. */
    public int getValueCount() {
        return valueCount;
    }
}
Loading

0 comments on commit ddd41a4

Please sign in to comment.