From 9873457df6cded72603eb434746fc3ccfca5bd90 Mon Sep 17 00:00:00 2001 From: "ranxianglei.rxl" Date: Wed, 21 Aug 2024 14:44:29 +0800 Subject: [PATCH 1/8] [core] fix hll class not found --- .../java/org/apache/paimon/utils/HllSketchUtil.java | 10 ++++++++++ .../compact/aggregate/FieldHllSketchAgg.java | 11 ++--------- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/paimon-common/src/main/java/org/apache/paimon/utils/HllSketchUtil.java b/paimon-common/src/main/java/org/apache/paimon/utils/HllSketchUtil.java index 34c5464f7106..609862dafc20 100644 --- a/paimon-common/src/main/java/org/apache/paimon/utils/HllSketchUtil.java +++ b/paimon-common/src/main/java/org/apache/paimon/utils/HllSketchUtil.java @@ -21,10 +21,20 @@ import org.apache.paimon.annotation.VisibleForTesting; import org.apache.datasketches.hll.HllSketch; +import org.apache.datasketches.hll.TgtHllType; +import org.apache.datasketches.hll.Union; /** A compressed bitmap for 32-bit integer. */ public class HllSketchUtil { + public static byte[] union(byte[] sketchBytes1, byte[] sketchBytes2) { + HllSketch heapify = HllSketch.heapify((byte[]) sketchBytes1); + org.apache.datasketches.hll.Union union = Union.heapify((byte[]) sketchBytes2); + union.update(heapify); + HllSketch result = union.getResult(TgtHllType.HLL_4); + return result.toCompactByteArray(); + } + @VisibleForTesting public static byte[] sketchOf(int... values) { HllSketch hllSketch = new HllSketch(); diff --git a/paimon-core/src/main/java/org/apache/paimon/mergetree/compact/aggregate/FieldHllSketchAgg.java b/paimon-core/src/main/java/org/apache/paimon/mergetree/compact/aggregate/FieldHllSketchAgg.java index 93901753645a..0ccf4af6497c 100644 --- a/paimon-core/src/main/java/org/apache/paimon/mergetree/compact/aggregate/FieldHllSketchAgg.java +++ b/paimon-core/src/main/java/org/apache/paimon/mergetree/compact/aggregate/FieldHllSketchAgg.java @@ -19,10 +19,7 @@ package org.apache.paimon.mergetree.compact.aggregate; import org.apache.paimon.types.VarBinaryType; - -import org.apache.datasketches.hll.HllSketch; -import org.apache.datasketches.hll.TgtHllType; -import org.apache.datasketches.hll.Union; +import org.apache.paimon.utils.HllSketchUtil; /** HllSketch aggregate a field of a row. */ public class FieldHllSketchAgg extends FieldAggregator { @@ -50,10 +47,6 @@ public Object agg(Object accumulator, Object inputField) { return accumulator == null ? inputField : accumulator; } - HllSketch heapify = HllSketch.heapify((byte[]) accumulator); - Union union = Union.heapify((byte[]) inputField); - union.update(heapify); - HllSketch result = union.getResult(TgtHllType.HLL_4); - return result.toCompactByteArray(); + return HllSketchUtil.union((byte[]) accumulator, (byte[]) inputField); } } From dedcaa4933759e2b41036cd237c5263ab56e450d Mon Sep 17 00:00:00 2001 From: "ranxianglei.rxl" Date: Sun, 22 Sep 2024 20:16:35 +0800 Subject: [PATCH 2/8] [format][orc] open orc switch useSelected,allowSARGToFilter to make sure pushdown works --- .../paimon/format/orc/OrcFileFormat.java | 1 - .../paimon/format/orc/OrcReaderFactory.java | 14 ++++++--- .../orc/reader/AbstractOrcColumnVector.java | 31 ++++++++++++------- .../orc/reader/OrcArrayColumnVector.java | 7 +++-- .../orc/reader/OrcBytesColumnVector.java | 7 +++-- .../orc/reader/OrcDecimalColumnVector.java | 5 +-- .../orc/reader/OrcDoubleColumnVector.java | 6 ++-- .../OrcLegacyTimestampColumnVector.java | 5 +-- .../orc/reader/OrcLongColumnVector.java | 9 ++++-- .../format/orc/reader/OrcMapColumnVector.java | 10 +++--- .../format/orc/reader/OrcRowColumnVector.java | 8 +++-- .../orc/reader/OrcTimestampColumnVector.java | 5 +-- 12 files changed, 68 insertions(+), 40 deletions(-) diff --git a/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcFileFormat.java b/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcFileFormat.java index de28487b715f..fdff90563440 100644 --- a/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcFileFormat.java +++ b/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcFileFormat.java @@ -99,7 +99,6 @@ public Optional createStatsExtractor( public FormatReaderFactory createReaderFactory( RowType projectedRowType, @Nullable List filters) { List orcPredicates = new ArrayList<>(); - if (filters != null) { for (Predicate pred : filters) { Optional orcPred = diff --git a/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcReaderFactory.java b/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcReaderFactory.java index 5093a5010773..3d583333ad0d 100644 --- a/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcReaderFactory.java +++ b/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcReaderFactory.java @@ -123,7 +123,10 @@ public OrcReaderBatch createReaderBatch( for (int i = 0; i < vectors.length; i++) { String name = tableFieldNames.get(i); DataType type = tableFieldTypes.get(i); - vectors[i] = createPaimonVector(orcBatch.cols[tableFieldNames.indexOf(name)], type); + int[] selected = orcBatch.getSelected(); + vectors[i] = + createPaimonVector( + orcBatch.cols[tableFieldNames.indexOf(name)], selected, type); } return new OrcReaderBatch(filePath, orcBatch, new VectorizedColumnBatch(vectors), recycler); } @@ -265,10 +268,11 @@ private static RecordReader createRecordReader( .schema(schema) .range(offsetAndLength.getLeft(), offsetAndLength.getRight()) .useZeroCopy(OrcConf.USE_ZEROCOPY.getBoolean(conf)) - .skipCorruptRecords(OrcConf.SKIP_CORRUPT_DATA.getBoolean(conf)) - .tolerateMissingSchema( - OrcConf.TOLERATE_MISSING_SCHEMA.getBoolean(conf)); - + .skipCorruptRecords(OrcConf.SKIP_CORRUPT_DATA.getBoolean(conf)); + if (!conjunctPredicates.isEmpty()) { + options.useSelected(true); + options.allowSARGToFilter(true); + } // configure filters if (!conjunctPredicates.isEmpty()) { SearchArgument.Builder b = SearchArgumentFactory.newBuilder(); diff --git a/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/AbstractOrcColumnVector.java b/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/AbstractOrcColumnVector.java index 21154c4967b7..5b612005f1c9 100644 --- a/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/AbstractOrcColumnVector.java +++ b/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/AbstractOrcColumnVector.java @@ -40,8 +40,15 @@ public abstract class AbstractOrcColumnVector private final ColumnVector vector; - AbstractOrcColumnVector(ColumnVector vector) { + private final int[] selected; + + AbstractOrcColumnVector(ColumnVector vector, int[] selected) { this.vector = vector; + this.selected = selected; + } + + protected int rowMapper(int r) { + return selected[r]; } @Override @@ -50,27 +57,29 @@ public boolean isNullAt(int i) { } public static org.apache.paimon.data.columnar.ColumnVector createPaimonVector( - ColumnVector vector, DataType dataType) { + ColumnVector vector, int[] selected, DataType dataType) { if (vector instanceof LongColumnVector) { if (dataType.getTypeRoot() == DataTypeRoot.TIMESTAMP_WITHOUT_TIME_ZONE) { - return new OrcLegacyTimestampColumnVector((LongColumnVector) vector); + return new OrcLegacyTimestampColumnVector((LongColumnVector) vector, selected); } else { - return new OrcLongColumnVector((LongColumnVector) vector); + return new OrcLongColumnVector((LongColumnVector) vector, selected); } } else if (vector instanceof DoubleColumnVector) { - return new OrcDoubleColumnVector((DoubleColumnVector) vector); + return new OrcDoubleColumnVector((DoubleColumnVector) vector, selected); } else if (vector instanceof BytesColumnVector) { - return new OrcBytesColumnVector((BytesColumnVector) vector); + return new OrcBytesColumnVector((BytesColumnVector) vector, selected); } else if (vector instanceof DecimalColumnVector) { - return new OrcDecimalColumnVector((DecimalColumnVector) vector); + return new OrcDecimalColumnVector((DecimalColumnVector) vector, selected); } else if (vector instanceof TimestampColumnVector) { - return new OrcTimestampColumnVector(vector); + return new OrcTimestampColumnVector(vector, selected); } else if (vector instanceof ListColumnVector) { - return new OrcArrayColumnVector((ListColumnVector) vector, (ArrayType) dataType); + return new OrcArrayColumnVector( + (ListColumnVector) vector, selected, (ArrayType) dataType); } else if (vector instanceof StructColumnVector) { - return new OrcRowColumnVector((StructColumnVector) vector, (RowType) dataType); + return new OrcRowColumnVector( + (StructColumnVector) vector, selected, (RowType) dataType); } else if (vector instanceof MapColumnVector) { - return new OrcMapColumnVector((MapColumnVector) vector, (MapType) dataType); + return new OrcMapColumnVector((MapColumnVector) vector, selected, (MapType) dataType); } else { throw new UnsupportedOperationException( "Unsupported vector: " + vector.getClass().getName()); diff --git a/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcArrayColumnVector.java b/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcArrayColumnVector.java index ed16a0b51084..ade154221835 100644 --- a/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcArrayColumnVector.java +++ b/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcArrayColumnVector.java @@ -32,14 +32,15 @@ public class OrcArrayColumnVector extends AbstractOrcColumnVector private final ListColumnVector hiveVector; private final ColumnVector paimonVector; - public OrcArrayColumnVector(ListColumnVector hiveVector, ArrayType type) { - super(hiveVector); + public OrcArrayColumnVector(ListColumnVector hiveVector, int[] selected, ArrayType type) { + super(hiveVector, selected); this.hiveVector = hiveVector; - this.paimonVector = createPaimonVector(hiveVector.child, type.getElementType()); + this.paimonVector = createPaimonVector(hiveVector.child, selected, type.getElementType()); } @Override public InternalArray getArray(int i) { + i = rowMapper(i); long offset = hiveVector.offsets[i]; long length = hiveVector.lengths[i]; return new ColumnarArray(paimonVector, (int) offset, (int) length); diff --git a/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcBytesColumnVector.java b/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcBytesColumnVector.java index d48bad886a47..a2664f3b8e45 100644 --- a/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcBytesColumnVector.java +++ b/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcBytesColumnVector.java @@ -26,17 +26,18 @@ public class OrcBytesColumnVector extends AbstractOrcColumnVector private final BytesColumnVector vector; - public OrcBytesColumnVector(BytesColumnVector vector) { - super(vector); + public OrcBytesColumnVector(BytesColumnVector vector, int[] selected) { + super(vector, selected); this.vector = vector; } @Override public Bytes getBytes(int i) { int rowId = vector.isRepeating ? 0 : i; + int selectedRowId = rowMapper(rowId); byte[][] data = vector.vector; int[] start = vector.start; int[] length = vector.length; - return new Bytes(data[rowId], start[rowId], length[rowId]); + return new Bytes(data[selectedRowId], start[selectedRowId], length[selectedRowId]); } } diff --git a/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcDecimalColumnVector.java b/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcDecimalColumnVector.java index 9ea4d763a5d8..30963bb29fee 100644 --- a/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcDecimalColumnVector.java +++ b/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcDecimalColumnVector.java @@ -32,13 +32,14 @@ public class OrcDecimalColumnVector extends AbstractOrcColumnVector private final DecimalColumnVector vector; - public OrcDecimalColumnVector(DecimalColumnVector vector) { - super(vector); + public OrcDecimalColumnVector(DecimalColumnVector vector, int[] selected) { + super(vector, selected); this.vector = vector; } @Override public Decimal getDecimal(int i, int precision, int scale) { + i = rowMapper(i); BigDecimal data = vector.vector[vector.isRepeating ? 0 : i].getHiveDecimal().bigDecimalValue(); return Decimal.fromBigDecimal(data, precision, scale); diff --git a/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcDoubleColumnVector.java b/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcDoubleColumnVector.java index 0c0b0cc51d38..0353e4aea9f4 100644 --- a/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcDoubleColumnVector.java +++ b/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcDoubleColumnVector.java @@ -30,18 +30,20 @@ public class OrcDoubleColumnVector extends AbstractOrcColumnVector private final DoubleColumnVector vector; - public OrcDoubleColumnVector(DoubleColumnVector vector) { - super(vector); + public OrcDoubleColumnVector(DoubleColumnVector vector, int[] selected) { + super(vector, selected); this.vector = vector; } @Override public double getDouble(int i) { + i = rowMapper(i); return vector.vector[vector.isRepeating ? 0 : i]; } @Override public float getFloat(int i) { + i = rowMapper(i); return (float) vector.vector[vector.isRepeating ? 0 : i]; } } diff --git a/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcLegacyTimestampColumnVector.java b/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcLegacyTimestampColumnVector.java index 18227ecf3dd2..508895632374 100644 --- a/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcLegacyTimestampColumnVector.java +++ b/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcLegacyTimestampColumnVector.java @@ -34,13 +34,14 @@ public class OrcLegacyTimestampColumnVector extends AbstractOrcColumnVector private final LongColumnVector hiveVector; - OrcLegacyTimestampColumnVector(LongColumnVector vector) { - super(vector); + OrcLegacyTimestampColumnVector(LongColumnVector vector, int[] selected) { + super(vector, selected); this.hiveVector = vector; } @Override public Timestamp getTimestamp(int i, int precision) { + i = rowMapper(i); int index = hiveVector.isRepeating ? 0 : i; java.sql.Timestamp timestamp = toTimestamp(hiveVector.vector[index]); return Timestamp.fromSQLTimestamp(timestamp); diff --git a/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcLongColumnVector.java b/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcLongColumnVector.java index e7dfe0e6134e..96f6922029a3 100644 --- a/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcLongColumnVector.java +++ b/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcLongColumnVector.java @@ -33,33 +33,38 @@ public class OrcLongColumnVector extends AbstractOrcColumnVector private final LongColumnVector vector; - public OrcLongColumnVector(LongColumnVector vector) { - super(vector); + public OrcLongColumnVector(LongColumnVector vector, int[] selected) { + super(vector, selected); this.vector = vector; } @Override public long getLong(int i) { + i = rowMapper(i); return vector.vector[vector.isRepeating ? 0 : i]; } @Override public boolean getBoolean(int i) { + i = rowMapper(i); return vector.vector[vector.isRepeating ? 0 : i] == 1; } @Override public byte getByte(int i) { + i = rowMapper(i); return (byte) vector.vector[vector.isRepeating ? 0 : i]; } @Override public int getInt(int i) { + i = rowMapper(i); return (int) vector.vector[vector.isRepeating ? 0 : i]; } @Override public short getShort(int i) { + i = rowMapper(i); return (short) vector.vector[vector.isRepeating ? 0 : i]; } } diff --git a/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcMapColumnVector.java b/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcMapColumnVector.java index 66a1af6dccf4..e45aea60b59c 100644 --- a/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcMapColumnVector.java +++ b/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcMapColumnVector.java @@ -33,15 +33,17 @@ public class OrcMapColumnVector extends AbstractOrcColumnVector private final ColumnVector keyPaimonVector; private final ColumnVector valuePaimonVector; - public OrcMapColumnVector(MapColumnVector hiveVector, MapType type) { - super(hiveVector); + public OrcMapColumnVector(MapColumnVector hiveVector, int[] selected, MapType type) { + super(hiveVector, selected); this.hiveVector = hiveVector; - this.keyPaimonVector = createPaimonVector(hiveVector.keys, type.getKeyType()); - this.valuePaimonVector = createPaimonVector(hiveVector.values, type.getValueType()); + this.keyPaimonVector = createPaimonVector(hiveVector.keys, selected, type.getKeyType()); + this.valuePaimonVector = + createPaimonVector(hiveVector.values, selected, type.getValueType()); } @Override public InternalMap getMap(int i) { + i = rowMapper(i); long offset = hiveVector.offsets[i]; long length = hiveVector.lengths[i]; return new ColumnarMap(keyPaimonVector, valuePaimonVector, (int) offset, (int) length); diff --git a/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcRowColumnVector.java b/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcRowColumnVector.java index caa22467f9c3..2572dcb565f2 100644 --- a/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcRowColumnVector.java +++ b/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcRowColumnVector.java @@ -31,18 +31,20 @@ public class OrcRowColumnVector extends AbstractOrcColumnVector private final VectorizedColumnBatch batch; - public OrcRowColumnVector(StructColumnVector hiveVector, RowType type) { - super(hiveVector); + public OrcRowColumnVector(StructColumnVector hiveVector, int[] selected, RowType type) { + super(hiveVector, selected); int len = hiveVector.fields.length; ColumnVector[] paimonVectors = new ColumnVector[len]; for (int i = 0; i < len; i++) { - paimonVectors[i] = createPaimonVector(hiveVector.fields[i], type.getTypeAt(i)); + paimonVectors[i] = + createPaimonVector(hiveVector.fields[i], selected, type.getTypeAt(i)); } this.batch = new VectorizedColumnBatch(paimonVectors); } @Override public ColumnarRow getRow(int i) { + i = rowMapper(i); return new ColumnarRow(batch, i); } diff --git a/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcTimestampColumnVector.java b/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcTimestampColumnVector.java index dd8ac08f2f57..758ef5d6e94c 100644 --- a/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcTimestampColumnVector.java +++ b/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcTimestampColumnVector.java @@ -33,13 +33,14 @@ public class OrcTimestampColumnVector extends AbstractOrcColumnVector private final TimestampColumnVector vector; - public OrcTimestampColumnVector(ColumnVector vector) { - super(vector); + public OrcTimestampColumnVector(ColumnVector vector, int[] selected) { + super(vector, selected); this.vector = (TimestampColumnVector) vector; } @Override public Timestamp getTimestamp(int i, int precision) { + i = rowMapper(i); int index = vector.isRepeating ? 0 : i; return DateTimeUtils.toInternal(vector.time[index], vector.nanos[index] % 1_000_000); } From 462edc6b5cc58f7821fd4153dfe9ac38ce4f6bcf Mon Sep 17 00:00:00 2001 From: "ranxianglei.rxl" Date: Sun, 22 Sep 2024 20:38:18 +0800 Subject: [PATCH 3/8] [format][orc] miss tolerateMissingSchema --- .../java/org/apache/paimon/format/orc/OrcReaderFactory.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcReaderFactory.java b/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcReaderFactory.java index 3d583333ad0d..21a7498704b2 100644 --- a/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcReaderFactory.java +++ b/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcReaderFactory.java @@ -268,7 +268,9 @@ private static RecordReader createRecordReader( .schema(schema) .range(offsetAndLength.getLeft(), offsetAndLength.getRight()) .useZeroCopy(OrcConf.USE_ZEROCOPY.getBoolean(conf)) - .skipCorruptRecords(OrcConf.SKIP_CORRUPT_DATA.getBoolean(conf)); + .skipCorruptRecords(OrcConf.SKIP_CORRUPT_DATA.getBoolean(conf)) + .tolerateMissingSchema( + OrcConf.TOLERATE_MISSING_SCHEMA.getBoolean(conf)); if (!conjunctPredicates.isEmpty()) { options.useSelected(true); options.allowSARGToFilter(true); From 2226fb9898cf3f29990c404d0c428954a2b53ab6 Mon Sep 17 00:00:00 2001 From: "ranxianglei.rxl" Date: Sun, 22 Sep 2024 21:08:24 +0800 Subject: [PATCH 4/8] [format][orc] fix orc selected close for no filter condition --- .../java/org/apache/paimon/format/orc/OrcReaderFactory.java | 2 +- .../paimon/format/orc/reader/AbstractOrcColumnVector.java | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcReaderFactory.java b/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcReaderFactory.java index 21a7498704b2..9071fe3b2525 100644 --- a/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcReaderFactory.java +++ b/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcReaderFactory.java @@ -123,7 +123,7 @@ public OrcReaderBatch createReaderBatch( for (int i = 0; i < vectors.length; i++) { String name = tableFieldNames.get(i); DataType type = tableFieldTypes.get(i); - int[] selected = orcBatch.getSelected(); + int[] selected = orcBatch.selectedInUse ? orcBatch.getSelected() : null; vectors[i] = createPaimonVector( orcBatch.cols[tableFieldNames.indexOf(name)], selected, type); diff --git a/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/AbstractOrcColumnVector.java b/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/AbstractOrcColumnVector.java index 5b612005f1c9..587ffc1c459d 100644 --- a/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/AbstractOrcColumnVector.java +++ b/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/AbstractOrcColumnVector.java @@ -48,7 +48,10 @@ public abstract class AbstractOrcColumnVector } protected int rowMapper(int r) { - return selected[r]; + if (this.selected != null) { + return selected[r]; + } + return r; } @Override From ee915c893229ca7c308090004ffc31ec987e7990 Mon Sep 17 00:00:00 2001 From: "ranxianglei.rxl" Date: Fri, 27 Sep 2024 11:39:48 +0800 Subject: [PATCH 5/8] [orc] keep useSelected and allowSARGToFilter close default, or deletion vectors would not work --- .../java/org/apache/paimon/format/orc/OrcReaderFactory.java | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcReaderFactory.java b/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcReaderFactory.java index 9071fe3b2525..2b9e88d89ccd 100644 --- a/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcReaderFactory.java +++ b/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcReaderFactory.java @@ -272,8 +272,10 @@ private static RecordReader createRecordReader( .tolerateMissingSchema( OrcConf.TOLERATE_MISSING_SCHEMA.getBoolean(conf)); if (!conjunctPredicates.isEmpty()) { - options.useSelected(true); - options.allowSARGToFilter(true); + // TODO fix it , if open this option,future deletion vectors would not work, + // cased by getRowNumber would be changed . + options.useSelected(OrcConf.READER_USE_SELECTED.getBoolean(conf)); + options.allowSARGToFilter(OrcConf.ALLOW_SARG_TO_FILTER.getBoolean(conf)); } // configure filters if (!conjunctPredicates.isEmpty()) { From 2920fc953805d99d07053b48551602f86a12fc2e Mon Sep 17 00:00:00 2001 From: "ranxianglei.rxl" Date: Tue, 15 Oct 2024 11:55:49 +0800 Subject: [PATCH 6/8] [format][orc] VectorizedRowBatch to OrcColumnVector for selected rows can be saw. --- .../paimon/format/orc/OrcReaderFactory.java | 3 +- .../orc/reader/AbstractOrcColumnVector.java | 33 ++++++++++--------- .../orc/reader/OrcArrayColumnVector.java | 8 +++-- .../orc/reader/OrcBytesColumnVector.java | 5 +-- .../orc/reader/OrcDecimalColumnVector.java | 5 +-- .../orc/reader/OrcDoubleColumnVector.java | 5 +-- .../OrcLegacyTimestampColumnVector.java | 5 +-- .../orc/reader/OrcLongColumnVector.java | 5 +-- .../format/orc/reader/OrcMapColumnVector.java | 10 +++--- .../format/orc/reader/OrcRowColumnVector.java | 8 +++-- .../orc/reader/OrcTimestampColumnVector.java | 5 +-- 11 files changed, 52 insertions(+), 40 deletions(-) diff --git a/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcReaderFactory.java b/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcReaderFactory.java index 2b9e88d89ccd..6b1590f03109 100644 --- a/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcReaderFactory.java +++ b/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcReaderFactory.java @@ -123,10 +123,9 @@ public OrcReaderBatch createReaderBatch( for (int i = 0; i < vectors.length; i++) { String name = tableFieldNames.get(i); DataType type = tableFieldTypes.get(i); - int[] selected = orcBatch.selectedInUse ? orcBatch.getSelected() : null; vectors[i] = createPaimonVector( - orcBatch.cols[tableFieldNames.indexOf(name)], selected, type); + orcBatch.cols[tableFieldNames.indexOf(name)], orcBatch, type); } return new OrcReaderBatch(filePath, orcBatch, new VectorizedColumnBatch(vectors), recycler); } diff --git a/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/AbstractOrcColumnVector.java b/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/AbstractOrcColumnVector.java index 587ffc1c459d..377ff37a2e68 100644 --- a/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/AbstractOrcColumnVector.java +++ b/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/AbstractOrcColumnVector.java @@ -33,6 +33,7 @@ import org.apache.hadoop.hive.ql.exec.vector.MapColumnVector; import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector; import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; /** This column vector is used to adapt hive's ColumnVector to Paimon's ColumnVector. */ public abstract class AbstractOrcColumnVector @@ -40,18 +41,18 @@ public abstract class AbstractOrcColumnVector private final ColumnVector vector; - private final int[] selected; + private final VectorizedRowBatch orcBatch; - AbstractOrcColumnVector(ColumnVector vector, int[] selected) { + AbstractOrcColumnVector(ColumnVector vector, VectorizedRowBatch orcBatch) { this.vector = vector; - this.selected = selected; + this.orcBatch = orcBatch; } protected int rowMapper(int r) { - if (this.selected != null) { - return selected[r]; + if (vector.isRepeating) { + return 0; } - return r; + return this.orcBatch.selectedInUse ? this.orcBatch.getSelected()[r] : r; } @Override @@ -60,29 +61,29 @@ public boolean isNullAt(int i) { } public static org.apache.paimon.data.columnar.ColumnVector createPaimonVector( - ColumnVector vector, int[] selected, DataType dataType) { + ColumnVector vector, VectorizedRowBatch orcBatch, DataType dataType) { if (vector instanceof LongColumnVector) { if (dataType.getTypeRoot() == DataTypeRoot.TIMESTAMP_WITHOUT_TIME_ZONE) { - return new OrcLegacyTimestampColumnVector((LongColumnVector) vector, selected); + return new OrcLegacyTimestampColumnVector((LongColumnVector) vector, orcBatch); } else { - return new OrcLongColumnVector((LongColumnVector) vector, selected); + return new OrcLongColumnVector((LongColumnVector) vector, orcBatch); } } else if (vector instanceof DoubleColumnVector) { - return new OrcDoubleColumnVector((DoubleColumnVector) vector, selected); + return new OrcDoubleColumnVector((DoubleColumnVector) vector, orcBatch); } else if (vector instanceof BytesColumnVector) { - return new OrcBytesColumnVector((BytesColumnVector) vector, selected); + return new OrcBytesColumnVector((BytesColumnVector) vector, orcBatch); } else if (vector instanceof DecimalColumnVector) { - return new OrcDecimalColumnVector((DecimalColumnVector) vector, selected); + return new OrcDecimalColumnVector((DecimalColumnVector) vector, orcBatch); } else if (vector instanceof TimestampColumnVector) { - return new OrcTimestampColumnVector(vector, selected); + return new OrcTimestampColumnVector(vector, orcBatch); } else if (vector instanceof ListColumnVector) { return new OrcArrayColumnVector( - (ListColumnVector) vector, selected, (ArrayType) dataType); + (ListColumnVector) vector, orcBatch, (ArrayType) dataType); } else if (vector instanceof StructColumnVector) { return new OrcRowColumnVector( - (StructColumnVector) vector, selected, (RowType) dataType); + (StructColumnVector) vector, orcBatch, (RowType) dataType); } else if (vector instanceof MapColumnVector) { - return new OrcMapColumnVector((MapColumnVector) vector, selected, (MapType) dataType); + return new OrcMapColumnVector((MapColumnVector) vector, orcBatch, (MapType) dataType); } else { throw new UnsupportedOperationException( "Unsupported vector: " + vector.getClass().getName()); diff --git a/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcArrayColumnVector.java b/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcArrayColumnVector.java index ade154221835..25a1935f3e4b 100644 --- a/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcArrayColumnVector.java +++ b/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcArrayColumnVector.java @@ -24,6 +24,7 @@ import org.apache.paimon.types.ArrayType; import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; /** This column vector is used to adapt hive's ListColumnVector to Paimon's ArrayColumnVector. */ public class OrcArrayColumnVector extends AbstractOrcColumnVector @@ -32,10 +33,11 @@ public class OrcArrayColumnVector extends AbstractOrcColumnVector private final ListColumnVector hiveVector; private final ColumnVector paimonVector; - public OrcArrayColumnVector(ListColumnVector hiveVector, int[] selected, ArrayType type) { - super(hiveVector, selected); + public OrcArrayColumnVector( + ListColumnVector hiveVector, VectorizedRowBatch orcBatch, ArrayType type) { + super(hiveVector, orcBatch); this.hiveVector = hiveVector; - this.paimonVector = createPaimonVector(hiveVector.child, selected, type.getElementType()); + this.paimonVector = createPaimonVector(hiveVector.child, orcBatch, type.getElementType()); } @Override diff --git a/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcBytesColumnVector.java b/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcBytesColumnVector.java index a2664f3b8e45..92b4853aaae4 100644 --- a/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcBytesColumnVector.java +++ b/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcBytesColumnVector.java @@ -19,6 +19,7 @@ package org.apache.paimon.format.orc.reader; import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; /** This column vector is used to adapt hive's BytesColumnVector to Paimon's BytesColumnVector. */ public class OrcBytesColumnVector extends AbstractOrcColumnVector @@ -26,8 +27,8 @@ public class OrcBytesColumnVector extends AbstractOrcColumnVector private final BytesColumnVector vector; - public OrcBytesColumnVector(BytesColumnVector vector, int[] selected) { - super(vector, selected); + public OrcBytesColumnVector(BytesColumnVector vector, VectorizedRowBatch orcBatch) { + super(vector, orcBatch); this.vector = vector; } diff --git a/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcDecimalColumnVector.java b/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcDecimalColumnVector.java index 30963bb29fee..c8545723caf0 100644 --- a/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcDecimalColumnVector.java +++ b/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcDecimalColumnVector.java @@ -21,6 +21,7 @@ import org.apache.paimon.data.Decimal; import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; import java.math.BigDecimal; @@ -32,8 +33,8 @@ public class OrcDecimalColumnVector extends AbstractOrcColumnVector private final DecimalColumnVector vector; - public OrcDecimalColumnVector(DecimalColumnVector vector, int[] selected) { - super(vector, selected); + public OrcDecimalColumnVector(DecimalColumnVector vector, VectorizedRowBatch orcBatch) { + super(vector, orcBatch); this.vector = vector; } diff --git a/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcDoubleColumnVector.java b/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcDoubleColumnVector.java index 0353e4aea9f4..3e19b137a222 100644 --- a/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcDoubleColumnVector.java +++ b/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcDoubleColumnVector.java @@ -19,6 +19,7 @@ package org.apache.paimon.format.orc.reader; import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; /** * This column vector is used to adapt hive's DoubleColumnVector to Paimon's float and double @@ -30,8 +31,8 @@ public class OrcDoubleColumnVector extends AbstractOrcColumnVector private final DoubleColumnVector vector; - public OrcDoubleColumnVector(DoubleColumnVector vector, int[] selected) { - super(vector, selected); + public OrcDoubleColumnVector(DoubleColumnVector vector, VectorizedRowBatch orcBatch) { + super(vector, orcBatch); this.vector = vector; } diff --git a/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcLegacyTimestampColumnVector.java b/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcLegacyTimestampColumnVector.java index 508895632374..eb3e014f2fcf 100644 --- a/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcLegacyTimestampColumnVector.java +++ b/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcLegacyTimestampColumnVector.java @@ -22,6 +22,7 @@ import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; import java.time.LocalDateTime; @@ -34,8 +35,8 @@ public class OrcLegacyTimestampColumnVector extends AbstractOrcColumnVector private final LongColumnVector hiveVector; - OrcLegacyTimestampColumnVector(LongColumnVector vector, int[] selected) { - super(vector, selected); + OrcLegacyTimestampColumnVector(LongColumnVector vector, VectorizedRowBatch orcBatch) { + super(vector, orcBatch); this.hiveVector = vector; } diff --git a/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcLongColumnVector.java b/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcLongColumnVector.java index 96f6922029a3..76c1c198d21d 100644 --- a/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcLongColumnVector.java +++ b/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcLongColumnVector.java @@ -19,6 +19,7 @@ package org.apache.paimon.format.orc.reader; import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; /** * This column vector is used to adapt hive's LongColumnVector to Paimon's boolean, byte, short, int @@ -33,8 +34,8 @@ public class OrcLongColumnVector extends AbstractOrcColumnVector private final LongColumnVector vector; - public OrcLongColumnVector(LongColumnVector vector, int[] selected) { - super(vector, selected); + public OrcLongColumnVector(LongColumnVector vector, VectorizedRowBatch orcBatch) { + super(vector, orcBatch); this.vector = vector; } diff --git a/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcMapColumnVector.java b/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcMapColumnVector.java index e45aea60b59c..c7245275fdd2 100644 --- a/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcMapColumnVector.java +++ b/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcMapColumnVector.java @@ -24,6 +24,7 @@ import org.apache.paimon.types.MapType; import org.apache.hadoop.hive.ql.exec.vector.MapColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; /** This column vector is used to adapt hive's MapColumnVector to Paimon's MapColumnVector. */ public class OrcMapColumnVector extends AbstractOrcColumnVector @@ -33,12 +34,13 @@ public class OrcMapColumnVector extends AbstractOrcColumnVector private final ColumnVector keyPaimonVector; private final ColumnVector valuePaimonVector; - public OrcMapColumnVector(MapColumnVector hiveVector, int[] selected, MapType type) { - super(hiveVector, selected); + public OrcMapColumnVector( + MapColumnVector hiveVector, VectorizedRowBatch orcBatch, MapType type) { + super(hiveVector, orcBatch); this.hiveVector = hiveVector; - this.keyPaimonVector = createPaimonVector(hiveVector.keys, selected, type.getKeyType()); + this.keyPaimonVector = createPaimonVector(hiveVector.keys, orcBatch, type.getKeyType()); this.valuePaimonVector = - createPaimonVector(hiveVector.values, selected, type.getValueType()); + createPaimonVector(hiveVector.values, orcBatch, type.getValueType()); } @Override diff --git a/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcRowColumnVector.java b/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcRowColumnVector.java index 2572dcb565f2..6c73c9fdbe0d 100644 --- a/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcRowColumnVector.java +++ b/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcRowColumnVector.java @@ -24,6 +24,7 @@ import org.apache.paimon.types.RowType; import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; /** This column vector is used to adapt hive's StructColumnVector to Flink's RowColumnVector. */ public class OrcRowColumnVector extends AbstractOrcColumnVector @@ -31,13 +32,14 @@ public class OrcRowColumnVector extends AbstractOrcColumnVector private final VectorizedColumnBatch batch; - public OrcRowColumnVector(StructColumnVector hiveVector, int[] selected, RowType type) { - super(hiveVector, selected); + public OrcRowColumnVector( + StructColumnVector hiveVector, VectorizedRowBatch orcBatch, RowType type) { + super(hiveVector, orcBatch); int len = hiveVector.fields.length; ColumnVector[] paimonVectors = new ColumnVector[len]; for (int i = 0; i < len; i++) { paimonVectors[i] = - createPaimonVector(hiveVector.fields[i], selected, type.getTypeAt(i)); + createPaimonVector(hiveVector.fields[i], orcBatch, type.getTypeAt(i)); } this.batch = new VectorizedColumnBatch(paimonVectors); } diff --git a/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcTimestampColumnVector.java b/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcTimestampColumnVector.java index 758ef5d6e94c..8840a02a8c83 100644 --- a/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcTimestampColumnVector.java +++ b/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcTimestampColumnVector.java @@ -23,6 +23,7 @@ import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; /** * This column vector is used to adapt hive's TimestampColumnVector to Paimon's @@ -33,8 +34,8 @@ public class OrcTimestampColumnVector extends AbstractOrcColumnVector private final TimestampColumnVector vector; - public OrcTimestampColumnVector(ColumnVector vector, int[] selected) { - super(vector, selected); + public OrcTimestampColumnVector(ColumnVector vector, VectorizedRowBatch orcBatch) { + super(vector, orcBatch); this.vector = (TimestampColumnVector) vector; } From 8a89649cecb2e9c886e97cf7ef4a9baf4f1a0a7a Mon Sep 17 00:00:00 2001 From: "ranxianglei.rxl" Date: Tue, 15 Oct 2024 16:09:59 +0800 Subject: [PATCH 7/8] [format][orc] remove all isRepeating --- .../format/orc/reader/AbstractOrcColumnVector.java | 2 +- .../paimon/format/orc/reader/OrcBytesColumnVector.java | 5 ++--- .../format/orc/reader/OrcDecimalColumnVector.java | 3 +-- .../format/orc/reader/OrcDoubleColumnVector.java | 4 ++-- .../orc/reader/OrcLegacyTimestampColumnVector.java | 3 +-- .../paimon/format/orc/reader/OrcLongColumnVector.java | 10 +++++----- .../format/orc/reader/OrcTimestampColumnVector.java | 3 +-- 7 files changed, 13 insertions(+), 17 deletions(-) diff --git a/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/AbstractOrcColumnVector.java b/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/AbstractOrcColumnVector.java index 377ff37a2e68..0557a72230cc 100644 --- a/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/AbstractOrcColumnVector.java +++ b/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/AbstractOrcColumnVector.java @@ -57,7 +57,7 @@ protected int rowMapper(int r) { @Override public boolean isNullAt(int i) { - return !vector.noNulls && vector.isNull[vector.isRepeating ? 0 : i]; + return !vector.noNulls && vector.isNull[rowMapper(i)]; } public static org.apache.paimon.data.columnar.ColumnVector createPaimonVector( diff --git a/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcBytesColumnVector.java b/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcBytesColumnVector.java index 92b4853aaae4..7f812bb5628b 100644 --- a/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcBytesColumnVector.java +++ b/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcBytesColumnVector.java @@ -34,11 +34,10 @@ public OrcBytesColumnVector(BytesColumnVector vector, VectorizedRowBatch orcBatc @Override public Bytes getBytes(int i) { - int rowId = vector.isRepeating ? 0 : i; - int selectedRowId = rowMapper(rowId); + int rowId = rowMapper(i); byte[][] data = vector.vector; int[] start = vector.start; int[] length = vector.length; - return new Bytes(data[selectedRowId], start[selectedRowId], length[selectedRowId]); + return new Bytes(data[rowId], start[rowId], length[rowId]); } } diff --git a/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcDecimalColumnVector.java b/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcDecimalColumnVector.java index c8545723caf0..382c19f45be1 100644 --- a/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcDecimalColumnVector.java +++ b/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcDecimalColumnVector.java @@ -41,8 +41,7 @@ public OrcDecimalColumnVector(DecimalColumnVector vector, VectorizedRowBatch orc @Override public Decimal getDecimal(int i, int precision, int scale) { i = rowMapper(i); - BigDecimal data = - vector.vector[vector.isRepeating ? 0 : i].getHiveDecimal().bigDecimalValue(); + BigDecimal data = vector.vector[i].getHiveDecimal().bigDecimalValue(); return Decimal.fromBigDecimal(data, precision, scale); } } diff --git a/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcDoubleColumnVector.java b/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcDoubleColumnVector.java index 3e19b137a222..f26dac6de9da 100644 --- a/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcDoubleColumnVector.java +++ b/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcDoubleColumnVector.java @@ -39,12 +39,12 @@ public OrcDoubleColumnVector(DoubleColumnVector vector, VectorizedRowBatch orcBa @Override public double getDouble(int i) { i = rowMapper(i); - return vector.vector[vector.isRepeating ? 0 : i]; + return vector.vector[i]; } @Override public float getFloat(int i) { i = rowMapper(i); - return (float) vector.vector[vector.isRepeating ? 0 : i]; + return (float) vector.vector[i]; } } diff --git a/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcLegacyTimestampColumnVector.java b/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcLegacyTimestampColumnVector.java index eb3e014f2fcf..5107e722edb4 100644 --- a/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcLegacyTimestampColumnVector.java +++ b/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcLegacyTimestampColumnVector.java @@ -43,8 +43,7 @@ public class OrcLegacyTimestampColumnVector extends AbstractOrcColumnVector @Override public Timestamp getTimestamp(int i, int precision) { i = rowMapper(i); - int index = hiveVector.isRepeating ? 0 : i; - java.sql.Timestamp timestamp = toTimestamp(hiveVector.vector[index]); + java.sql.Timestamp timestamp = toTimestamp(hiveVector.vector[i]); return Timestamp.fromSQLTimestamp(timestamp); } diff --git a/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcLongColumnVector.java b/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcLongColumnVector.java index 76c1c198d21d..c289b74c58b2 100644 --- a/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcLongColumnVector.java +++ b/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcLongColumnVector.java @@ -42,30 +42,30 @@ public OrcLongColumnVector(LongColumnVector vector, VectorizedRowBatch orcBatch) @Override public long getLong(int i) { i = rowMapper(i); - return vector.vector[vector.isRepeating ? 0 : i]; + return vector.vector[i]; } @Override public boolean getBoolean(int i) { i = rowMapper(i); - return vector.vector[vector.isRepeating ? 0 : i] == 1; + return vector.vector[i] == 1; } @Override public byte getByte(int i) { i = rowMapper(i); - return (byte) vector.vector[vector.isRepeating ? 0 : i]; + return (byte) vector.vector[i]; } @Override public int getInt(int i) { i = rowMapper(i); - return (int) vector.vector[vector.isRepeating ? 0 : i]; + return (int) vector.vector[i]; } @Override public short getShort(int i) { i = rowMapper(i); - return (short) vector.vector[vector.isRepeating ? 0 : i]; + return (short) vector.vector[i]; } } diff --git a/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcTimestampColumnVector.java b/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcTimestampColumnVector.java index 8840a02a8c83..a6e71d6016f2 100644 --- a/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcTimestampColumnVector.java +++ b/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcTimestampColumnVector.java @@ -42,7 +42,6 @@ public OrcTimestampColumnVector(ColumnVector vector, VectorizedRowBatch orcBatch @Override public Timestamp getTimestamp(int i, int precision) { i = rowMapper(i); - int index = vector.isRepeating ? 0 : i; - return DateTimeUtils.toInternal(vector.time[index], vector.nanos[index] % 1_000_000); + return DateTimeUtils.toInternal(vector.time[i], vector.nanos[i] % 1_000_000); } } From 1876ac2c47ae1d8b1944e88b082db061f258c0dd Mon Sep 17 00:00:00 2001 From: "ranxianglei.rxl" Date: Tue, 12 Nov 2024 12:17:08 +0800 Subject: [PATCH 8/8] [format][orc] add pushdown option only for reader . --- .../src/main/java/org/apache/orc/OrcConf.java | 15 +++++++++++++++ .../paimon/format/orc/OrcReaderFactory.java | 5 +++-- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/paimon-format/src/main/java/org/apache/orc/OrcConf.java b/paimon-format/src/main/java/org/apache/orc/OrcConf.java index a7fa1a21bc8c..ee07e45117a4 100644 --- a/paimon-format/src/main/java/org/apache/orc/OrcConf.java +++ b/paimon-format/src/main/java/org/apache/orc/OrcConf.java @@ -305,6 +305,21 @@ public enum OrcConf { + "must have the filter\n" + "reapplied to avoid using unset values in the unselected rows.\n" + "If unsure please leave this as false."), + + READER_ONLY_ALLOW_SARG_TO_FILTER( + "orc.reader.sarg.to.filter", + "orc.reader.sarg.to.filter", + false, + "A boolean flag to determine if a SArg is allowed to become a filter, only for reader."), + READER_ONLY_USE_SELECTED( + "orc.reader.filter.use.selected", + "orc.reader.filter.use.selected", + false, + "A boolean flag to determine if the selected vector is supported by\n" + + "the reading application, only for reader. If false, the output of the ORC reader " + + "must have the filter\n" + + "reapplied to avoid using unset values in the unselected rows.\n" + + "If unsure please leave this as false."), ALLOW_PLUGIN_FILTER( "orc.filter.plugin", "orc.filter.plugin", diff --git a/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcReaderFactory.java b/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcReaderFactory.java index 6b1590f03109..c0b0131e3401 100644 --- a/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcReaderFactory.java +++ b/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcReaderFactory.java @@ -273,8 +273,9 @@ private static RecordReader createRecordReader( if (!conjunctPredicates.isEmpty()) { // TODO fix it , if open this option,future deletion vectors would not work, // cased by getRowNumber would be changed . - options.useSelected(OrcConf.READER_USE_SELECTED.getBoolean(conf)); - options.allowSARGToFilter(OrcConf.ALLOW_SARG_TO_FILTER.getBoolean(conf)); + options.useSelected(OrcConf.READER_ONLY_USE_SELECTED.getBoolean(conf)); + options.allowSARGToFilter( + OrcConf.READER_ONLY_ALLOW_SARG_TO_FILTER.getBoolean(conf)); } // configure filters if (!conjunctPredicates.isEmpty()) {