From 06fbb5e1f1d9162c87dac6e707c38315beb4f6a8 Mon Sep 17 00:00:00 2001 From: yuzelin <33053040+yuzelin@users.noreply.github.com> Date: Tue, 19 Nov 2024 20:20:57 +0800 Subject: [PATCH] [orc] Add type id to orc files (#4523) --- .../paimon/format/orc/OrcFileFormat.java | 8 +- .../paimon/format/orc/OrcReaderFactory.java | 4 +- .../apache/paimon/format/orc/OrcTypeUtil.java | 147 +++++++++++++ .../format/orc/reader/OrcSplitReaderUtil.java | 99 --------- .../format/orc/writer/RowDataVectorizer.java | 3 +- .../paimon/format/orc/writer/Vectorizer.java | 4 +- .../format/orc/OrcSplitReaderUtilTest.java | 68 ------ .../paimon/format/orc/OrcTypeUtilTest.java | 206 ++++++++++++++++++ .../format/orc/OrcWriterFactoryTest.java | 3 +- 9 files changed, 364 insertions(+), 178 deletions(-) create mode 100644 paimon-format/src/main/java/org/apache/paimon/format/orc/OrcTypeUtil.java delete mode 100644 paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcSplitReaderUtil.java delete mode 100644 paimon-format/src/test/java/org/apache/paimon/format/orc/OrcSplitReaderUtilTest.java create mode 100644 paimon-format/src/test/java/org/apache/paimon/format/orc/OrcTypeUtilTest.java diff --git a/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcFileFormat.java b/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcFileFormat.java index c564b69409c5..c3521c6f1a37 100644 --- a/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcFileFormat.java +++ b/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcFileFormat.java @@ -28,7 +28,6 @@ import org.apache.paimon.format.orc.filter.OrcFilters; import org.apache.paimon.format.orc.filter.OrcPredicateFunctionVisitor; import org.apache.paimon.format.orc.filter.OrcSimpleStatsExtractor; -import org.apache.paimon.format.orc.reader.OrcSplitReaderUtil; import org.apache.paimon.format.orc.writer.RowDataVectorizer; import org.apache.paimon.format.orc.writer.Vectorizer; import org.apache.paimon.options.MemorySize; @@ -123,7 +122,7 @@ public FormatReaderFactory createReaderFactory( @Override public void validateDataFields(RowType rowType) { DataType refinedType = refineDataType(rowType); - OrcSplitReaderUtil.toOrcType(refinedType); + OrcTypeUtil.convertToOrcSchema((RowType) refinedType); } /** @@ -141,9 +140,8 @@ public FormatWriterFactory createWriterFactory(RowType type) { DataType refinedType = refineDataType(type); DataType[] orcTypes = getFieldTypes(refinedType).toArray(new DataType[0]); - TypeDescription typeDescription = OrcSplitReaderUtil.toOrcType(refinedType); - Vectorizer vectorizer = - new RowDataVectorizer(typeDescription.toString(), orcTypes); + TypeDescription typeDescription = OrcTypeUtil.convertToOrcSchema((RowType) refinedType); + Vectorizer vectorizer = new RowDataVectorizer(typeDescription, orcTypes); return new OrcWriterFactory(vectorizer, orcProperties, writerConf, writeBatchSize); } diff --git a/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcReaderFactory.java b/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcReaderFactory.java index 05f3dd7851e8..ee0f8a55c034 100644 --- a/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcReaderFactory.java +++ b/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcReaderFactory.java @@ -55,8 +55,8 @@ import java.io.IOException; import java.util.List; +import static org.apache.paimon.format.orc.OrcTypeUtil.convertToOrcSchema; import static org.apache.paimon.format.orc.reader.AbstractOrcColumnVector.createPaimonVector; -import static org.apache.paimon.format.orc.reader.OrcSplitReaderUtil.toOrcType; import static org.apache.paimon.utils.Preconditions.checkNotNull; /** An ORC reader that produces a stream of {@link ColumnarRow} records. */ @@ -81,7 +81,7 @@ public OrcReaderFactory( final int batchSize, final boolean deletionVectorsEnabled) { this.hadoopConfig = checkNotNull(hadoopConfig); - this.schema = toOrcType(readType); + this.schema = convertToOrcSchema(readType); this.tableType = readType; this.conjunctPredicates = checkNotNull(conjunctPredicates); this.batchSize = batchSize; diff --git a/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcTypeUtil.java b/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcTypeUtil.java new file mode 100644 index 000000000000..f7d3d626d44f --- /dev/null +++ b/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcTypeUtil.java @@ -0,0 +1,147 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.format.orc; + +import org.apache.paimon.annotation.VisibleForTesting; +import org.apache.paimon.table.SpecialFields; +import org.apache.paimon.types.ArrayType; +import org.apache.paimon.types.CharType; +import org.apache.paimon.types.DataField; +import org.apache.paimon.types.DataType; +import org.apache.paimon.types.DataTypes; +import org.apache.paimon.types.DecimalType; +import org.apache.paimon.types.MapType; +import org.apache.paimon.types.RowType; +import org.apache.paimon.types.VarCharType; + +import org.apache.orc.TypeDescription; + +/** Util for orc types. */ +public class OrcTypeUtil { + + public static final String PAIMON_ORC_FIELD_ID_KEY = "paimon.id"; + + public static TypeDescription convertToOrcSchema(RowType rowType) { + TypeDescription struct = TypeDescription.createStruct(); + for (DataField dataField : rowType.getFields()) { + TypeDescription child = convertToOrcType(dataField.type(), dataField.id(), 0); + struct.addField(dataField.name(), child); + } + return struct; + } + + @VisibleForTesting + static TypeDescription convertToOrcType(DataType type, int fieldId, int depth) { + type = type.copy(true); + switch (type.getTypeRoot()) { + case CHAR: + return TypeDescription.createChar() + .withMaxLength(((CharType) type).getLength()) + .setAttribute(PAIMON_ORC_FIELD_ID_KEY, String.valueOf(fieldId)); + case VARCHAR: + int len = ((VarCharType) type).getLength(); + if (len == VarCharType.MAX_LENGTH) { + return TypeDescription.createString() + .setAttribute(PAIMON_ORC_FIELD_ID_KEY, String.valueOf(fieldId)); + } else { + return TypeDescription.createVarchar() + .withMaxLength(len) + .setAttribute(PAIMON_ORC_FIELD_ID_KEY, String.valueOf(fieldId)); + } + case BOOLEAN: + return TypeDescription.createBoolean() + .setAttribute(PAIMON_ORC_FIELD_ID_KEY, String.valueOf(fieldId)); + case VARBINARY: + if (type.equals(DataTypes.BYTES())) { + return TypeDescription.createBinary() + .setAttribute(PAIMON_ORC_FIELD_ID_KEY, String.valueOf(fieldId)); + } else { + throw new UnsupportedOperationException( + "Not support other binary type: " + type); + } + case DECIMAL: + DecimalType decimalType = (DecimalType) type; + return TypeDescription.createDecimal() + .withScale(decimalType.getScale()) + .withPrecision(decimalType.getPrecision()) + .setAttribute(PAIMON_ORC_FIELD_ID_KEY, String.valueOf(fieldId)); + case TINYINT: + return TypeDescription.createByte() + .setAttribute(PAIMON_ORC_FIELD_ID_KEY, String.valueOf(fieldId)); + case SMALLINT: + return TypeDescription.createShort() + .setAttribute(PAIMON_ORC_FIELD_ID_KEY, String.valueOf(fieldId)); + case INTEGER: + case TIME_WITHOUT_TIME_ZONE: + return TypeDescription.createInt() + .setAttribute(PAIMON_ORC_FIELD_ID_KEY, String.valueOf(fieldId)); + case BIGINT: + return TypeDescription.createLong() + .setAttribute(PAIMON_ORC_FIELD_ID_KEY, String.valueOf(fieldId)); + case FLOAT: + return TypeDescription.createFloat() + .setAttribute(PAIMON_ORC_FIELD_ID_KEY, String.valueOf(fieldId)); + case DOUBLE: + return TypeDescription.createDouble() + .setAttribute(PAIMON_ORC_FIELD_ID_KEY, String.valueOf(fieldId)); + case DATE: + return TypeDescription.createDate() + .setAttribute(PAIMON_ORC_FIELD_ID_KEY, String.valueOf(fieldId)); + case TIMESTAMP_WITHOUT_TIME_ZONE: + return TypeDescription.createTimestamp() + .setAttribute(PAIMON_ORC_FIELD_ID_KEY, String.valueOf(fieldId)); + case TIMESTAMP_WITH_LOCAL_TIME_ZONE: + return TypeDescription.createTimestampInstant() + .setAttribute(PAIMON_ORC_FIELD_ID_KEY, String.valueOf(fieldId)); + case ARRAY: + ArrayType arrayType = (ArrayType) type; + + String elementFieldId = + String.valueOf(SpecialFields.getArrayElementFieldId(fieldId, depth + 1)); + TypeDescription elementOrcType = + convertToOrcType(arrayType.getElementType(), fieldId, depth + 1) + .setAttribute(PAIMON_ORC_FIELD_ID_KEY, elementFieldId); + + return TypeDescription.createList(elementOrcType) + .setAttribute(PAIMON_ORC_FIELD_ID_KEY, String.valueOf(fieldId)); + case MAP: + MapType mapType = (MapType) type; + + String mapKeyFieldId = + String.valueOf(SpecialFields.getMapKeyFieldId(fieldId, depth + 1)); + TypeDescription mapKeyOrcType = + convertToOrcType(mapType.getKeyType(), fieldId, depth + 1) + .setAttribute(PAIMON_ORC_FIELD_ID_KEY, mapKeyFieldId); + + String mapValueFieldId = + String.valueOf(SpecialFields.getMapValueFieldId(fieldId, depth + 1)); + TypeDescription mapValueOrcType = + convertToOrcType(mapType.getValueType(), fieldId, depth + 1) + .setAttribute(PAIMON_ORC_FIELD_ID_KEY, mapValueFieldId); + + return TypeDescription.createMap(mapKeyOrcType, mapValueOrcType) + .setAttribute(PAIMON_ORC_FIELD_ID_KEY, String.valueOf(fieldId)); + case ROW: + return convertToOrcSchema((RowType) type) + .setAttribute(PAIMON_ORC_FIELD_ID_KEY, String.valueOf(fieldId)); + default: + throw new UnsupportedOperationException("Unsupported type: " + type); + } + } +} diff --git a/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcSplitReaderUtil.java b/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcSplitReaderUtil.java deleted file mode 100644 index 882f1c753991..000000000000 --- a/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcSplitReaderUtil.java +++ /dev/null @@ -1,99 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.paimon.format.orc.reader; - -import org.apache.paimon.types.ArrayType; -import org.apache.paimon.types.CharType; -import org.apache.paimon.types.DataType; -import org.apache.paimon.types.DataTypes; -import org.apache.paimon.types.DecimalType; -import org.apache.paimon.types.MapType; -import org.apache.paimon.types.RowType; -import org.apache.paimon.types.VarCharType; - -import org.apache.orc.TypeDescription; - -/** Util for orc types. */ -public class OrcSplitReaderUtil { - - public static TypeDescription toOrcType(DataType type) { - type = type.copy(true); - switch (type.getTypeRoot()) { - case CHAR: - return TypeDescription.createChar().withMaxLength(((CharType) type).getLength()); - case VARCHAR: - int len = ((VarCharType) type).getLength(); - if (len == VarCharType.MAX_LENGTH) { - return TypeDescription.createString(); - } else { - return TypeDescription.createVarchar().withMaxLength(len); - } - case BOOLEAN: - return TypeDescription.createBoolean(); - case VARBINARY: - if (type.equals(DataTypes.BYTES())) { - return TypeDescription.createBinary(); - } else { - throw new UnsupportedOperationException( - "Not support other binary type: " + type); - } - case DECIMAL: - DecimalType decimalType = (DecimalType) type; - return TypeDescription.createDecimal() - .withScale(decimalType.getScale()) - .withPrecision(decimalType.getPrecision()); - case TINYINT: - return TypeDescription.createByte(); - case SMALLINT: - return TypeDescription.createShort(); - case INTEGER: - case TIME_WITHOUT_TIME_ZONE: - return TypeDescription.createInt(); - case BIGINT: - return TypeDescription.createLong(); - case FLOAT: - return TypeDescription.createFloat(); - case DOUBLE: - return TypeDescription.createDouble(); - case DATE: - return TypeDescription.createDate(); - case TIMESTAMP_WITHOUT_TIME_ZONE: - return TypeDescription.createTimestamp(); - case TIMESTAMP_WITH_LOCAL_TIME_ZONE: - return TypeDescription.createTimestampInstant(); - case ARRAY: - ArrayType arrayType = (ArrayType) type; - return TypeDescription.createList(toOrcType(arrayType.getElementType())); - case MAP: - MapType mapType = (MapType) type; - return TypeDescription.createMap( - toOrcType(mapType.getKeyType()), toOrcType(mapType.getValueType())); - case ROW: - RowType rowType = (RowType) type; - TypeDescription struct = TypeDescription.createStruct(); - for (int i = 0; i < rowType.getFieldCount(); i++) { - struct.addField( - rowType.getFieldNames().get(i), toOrcType(rowType.getTypeAt(i))); - } - return struct; - default: - throw new UnsupportedOperationException("Unsupported type: " + type); - } - } -} diff --git a/paimon-format/src/main/java/org/apache/paimon/format/orc/writer/RowDataVectorizer.java b/paimon-format/src/main/java/org/apache/paimon/format/orc/writer/RowDataVectorizer.java index 21443cdf9463..46c936a0263e 100644 --- a/paimon-format/src/main/java/org/apache/paimon/format/orc/writer/RowDataVectorizer.java +++ b/paimon-format/src/main/java/org/apache/paimon/format/orc/writer/RowDataVectorizer.java @@ -23,6 +23,7 @@ import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.orc.TypeDescription; import java.util.Arrays; import java.util.List; @@ -35,7 +36,7 @@ public class RowDataVectorizer extends Vectorizer { private final List fieldWriters; - public RowDataVectorizer(String schema, DataType[] fieldTypes) { + public RowDataVectorizer(TypeDescription schema, DataType[] fieldTypes) { super(schema); this.fieldWriters = Arrays.stream(fieldTypes) diff --git a/paimon-format/src/main/java/org/apache/paimon/format/orc/writer/Vectorizer.java b/paimon-format/src/main/java/org/apache/paimon/format/orc/writer/Vectorizer.java index 0f0e6bba74a8..2add46531a61 100644 --- a/paimon-format/src/main/java/org/apache/paimon/format/orc/writer/Vectorizer.java +++ b/paimon-format/src/main/java/org/apache/paimon/format/orc/writer/Vectorizer.java @@ -39,9 +39,9 @@ public abstract class Vectorizer implements Serializable { private final TypeDescription schema; - public Vectorizer(final String schema) { + public Vectorizer(final TypeDescription schema) { checkNotNull(schema); - this.schema = TypeDescription.fromString(schema); + this.schema = schema; } /** diff --git a/paimon-format/src/test/java/org/apache/paimon/format/orc/OrcSplitReaderUtilTest.java b/paimon-format/src/test/java/org/apache/paimon/format/orc/OrcSplitReaderUtilTest.java deleted file mode 100644 index c07838dfa34c..000000000000 --- a/paimon-format/src/test/java/org/apache/paimon/format/orc/OrcSplitReaderUtilTest.java +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.paimon.format.orc; - -import org.apache.paimon.format.orc.reader.OrcSplitReaderUtil; -import org.apache.paimon.types.DataType; -import org.apache.paimon.types.DataTypes; - -import org.junit.jupiter.api.Test; - -import static org.apache.paimon.format.orc.reader.OrcSplitReaderUtil.toOrcType; -import static org.assertj.core.api.Assertions.assertThat; - -/** Test for {@link OrcSplitReaderUtil}. */ -class OrcSplitReaderUtilTest { - - @Test - void testDataTypeToOrcType() { - test("boolean", DataTypes.BOOLEAN()); - test("char(123)", DataTypes.CHAR(123)); - test("varchar(123)", DataTypes.VARCHAR(123)); - test("string", DataTypes.STRING()); - test("binary", DataTypes.BYTES()); - test("tinyint", DataTypes.TINYINT()); - test("smallint", DataTypes.SMALLINT()); - test("int", DataTypes.INT()); - test("bigint", DataTypes.BIGINT()); - test("float", DataTypes.FLOAT()); - test("double", DataTypes.DOUBLE()); - test("date", DataTypes.DATE()); - test("timestamp", DataTypes.TIMESTAMP()); - test("array", DataTypes.ARRAY(DataTypes.FLOAT())); - test("map", DataTypes.MAP(DataTypes.FLOAT(), DataTypes.BIGINT())); - test( - "struct>", - DataTypes.ROW( - DataTypes.FIELD(0, "int0", DataTypes.INT()), - DataTypes.FIELD(1, "str1", DataTypes.STRING()), - DataTypes.FIELD(2, "double2", DataTypes.DOUBLE()), - DataTypes.FIELD( - 3, - "row3", - DataTypes.ROW( - DataTypes.FIELD(4, "int0", DataTypes.INT()), - DataTypes.FIELD(5, "int1", DataTypes.INT()))))); - test("decimal(4,2)", DataTypes.DECIMAL(4, 2)); - } - - private void test(String expected, DataType type) { - assertThat(toOrcType(type)).hasToString(expected); - } -} diff --git a/paimon-format/src/test/java/org/apache/paimon/format/orc/OrcTypeUtilTest.java b/paimon-format/src/test/java/org/apache/paimon/format/orc/OrcTypeUtilTest.java new file mode 100644 index 000000000000..5669ac33d443 --- /dev/null +++ b/paimon-format/src/test/java/org/apache/paimon/format/orc/OrcTypeUtilTest.java @@ -0,0 +1,206 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.format.orc; + +import org.apache.paimon.format.FileFormatFactory; +import org.apache.paimon.format.FormatWriter; +import org.apache.paimon.fs.FileIO; +import org.apache.paimon.fs.Path; +import org.apache.paimon.fs.PositionOutputStream; +import org.apache.paimon.fs.local.LocalFileIO; +import org.apache.paimon.options.Options; +import org.apache.paimon.types.DataType; +import org.apache.paimon.types.DataTypes; +import org.apache.paimon.types.RowType; + +import org.apache.paimon.shade.guava30.com.google.common.base.Objects; + +import org.apache.hadoop.conf.Configuration; +import org.apache.orc.Reader; +import org.apache.orc.TypeDescription; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import java.io.IOException; +import java.util.List; +import java.util.UUID; +import java.util.concurrent.atomic.AtomicInteger; + +import static org.apache.paimon.format.orc.OrcFileFormat.refineDataType; +import static org.apache.paimon.format.orc.OrcTypeUtil.PAIMON_ORC_FIELD_ID_KEY; +import static org.apache.paimon.format.orc.OrcTypeUtil.convertToOrcSchema; +import static org.apache.paimon.format.orc.OrcTypeUtil.convertToOrcType; +import static org.apache.paimon.utils.Preconditions.checkArgument; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatNoException; + +/** Test for {@link OrcTypeUtil}. */ +class OrcTypeUtilTest { + + @Test + void testDataTypeToOrcType() { + test("boolean", DataTypes.BOOLEAN()); + test("char(123)", DataTypes.CHAR(123)); + test("varchar(123)", DataTypes.VARCHAR(123)); + test("string", DataTypes.STRING()); + test("binary", DataTypes.BYTES()); + test("tinyint", DataTypes.TINYINT()); + test("smallint", DataTypes.SMALLINT()); + test("int", DataTypes.INT()); + test("bigint", DataTypes.BIGINT()); + test("float", DataTypes.FLOAT()); + test("double", DataTypes.DOUBLE()); + test("date", DataTypes.DATE()); + test("timestamp", DataTypes.TIMESTAMP()); + test("array", DataTypes.ARRAY(DataTypes.FLOAT())); + test("map", DataTypes.MAP(DataTypes.FLOAT(), DataTypes.BIGINT())); + test( + "struct>", + DataTypes.ROW( + DataTypes.FIELD(0, "int0", DataTypes.INT()), + DataTypes.FIELD(1, "str1", DataTypes.STRING()), + DataTypes.FIELD(2, "double2", DataTypes.DOUBLE()), + DataTypes.FIELD( + 3, + "row3", + DataTypes.ROW( + DataTypes.FIELD(4, "int0", DataTypes.INT()), + DataTypes.FIELD(5, "int1", DataTypes.INT()))))); + test("decimal(4,2)", DataTypes.DECIMAL(4, 2)); + } + + private void test(String expected, DataType type) { + assertThat(convertToOrcType(type, -1, -1)).hasToString(expected); + } + + @Test + void testFieldIdAttribute(@TempDir java.nio.file.Path tempPath) throws IOException { + RowType rowType = + RowType.builder() + .field("a", DataTypes.INT()) + .field( + "b", + RowType.builder(true, new AtomicInteger(10)) + .field("f0", DataTypes.STRING()) + .field("f1", DataTypes.INT()) + .build()) + .field("c", DataTypes.ARRAY(DataTypes.INT())) + .field("d", DataTypes.MAP(DataTypes.INT(), DataTypes.STRING())) + .field( + "e", + DataTypes.ARRAY( + RowType.builder(true, new AtomicInteger(20)) + .field("f0", DataTypes.STRING()) + .field("f1", DataTypes.INT()) + .build())) + .field( + "f", + RowType.builder(true, new AtomicInteger(30)) + .field("f0", DataTypes.ARRAY(DataTypes.INT())) + .build()) + .build(); + + // write schema to orc file then get + FileIO fileIO = LocalFileIO.create(); + Path tempFile = new Path(new Path(tempPath.toUri()), UUID.randomUUID().toString()); + + OrcFileFormat format = + new OrcFileFormat(new FileFormatFactory.FormatContext(new Options(), 1024, 1024)); + PositionOutputStream out = fileIO.newOutputStream(tempFile, false); + FormatWriter writer = format.createWriterFactory(rowType).create(out, "zstd"); + writer.close(); + out.close(); + + Reader orcReader = + OrcReaderFactory.createReader(new Configuration(), fileIO, tempFile, null); + TypeDescription orcSchema = orcReader.getSchema(); + + RowType refined = (RowType) refineDataType(rowType); + + assertThatNoException() + .isThrownBy(() -> checkStruct(convertToOrcSchema(refined), orcSchema)); + + assertThatNoException() + .isThrownBy( + () -> + checkStruct( + convertToOrcSchema(refined.project("c", "b", "d")), + orcSchema)); + + assertThatNoException() + .isThrownBy( + () -> + checkStruct( + convertToOrcSchema(refined.project("a", "e", "f")), + orcSchema)); + } + + private void checkStruct(TypeDescription requiredStruct, TypeDescription orcStruct) { + List requiredFields = requiredStruct.getFieldNames(); + List requiredTypes = requiredStruct.getChildren(); + List orcFields = orcStruct.getFieldNames(); + List orcTypes = orcStruct.getChildren(); + + for (int i = 0; i < requiredFields.size(); i++) { + String field = requiredFields.get(i); + int orcIndex = orcFields.indexOf(field); + checkArgument(orcIndex != -1, "Cannot find field %s in orc file meta.", field); + TypeDescription requiredType = requiredTypes.get(i); + TypeDescription orcType = orcTypes.get(orcIndex); + checkField(field, requiredType, orcType); + } + } + + private void checkField( + String fieldName, TypeDescription requiredType, TypeDescription orcType) { + checkFieldIdAttribute(fieldName, requiredType, orcType); + if (requiredType.getCategory().isPrimitive()) { + return; + } + + switch (requiredType.getCategory()) { + case LIST: + checkField( + "_elem", requiredType.getChildren().get(0), orcType.getChildren().get(0)); + return; + case MAP: + checkField("_key", requiredType.getChildren().get(0), orcType.getChildren().get(0)); + checkField( + "_value", requiredType.getChildren().get(1), orcType.getChildren().get(1)); + return; + case STRUCT: + checkStruct(requiredType, orcType); + return; + default: + throw new UnsupportedOperationException("Unsupported orc type: " + requiredType); + } + } + + private void checkFieldIdAttribute( + String fieldName, TypeDescription requiredType, TypeDescription orcType) { + String requiredId = requiredType.getAttributeValue(PAIMON_ORC_FIELD_ID_KEY); + String orcId = orcType.getAttributeValue(PAIMON_ORC_FIELD_ID_KEY); + checkArgument( + Objects.equal(requiredId, orcId), + "Field %s has different id: read type id is %s but orc type id is %s. This is unexpected.", + fieldName, + requiredId, + orcId); + } +} diff --git a/paimon-format/src/test/java/org/apache/paimon/format/orc/OrcWriterFactoryTest.java b/paimon-format/src/test/java/org/apache/paimon/format/orc/OrcWriterFactoryTest.java index 2511d7ed7a9e..52df5afb4efd 100644 --- a/paimon-format/src/test/java/org/apache/paimon/format/orc/OrcWriterFactoryTest.java +++ b/paimon-format/src/test/java/org/apache/paimon/format/orc/OrcWriterFactoryTest.java @@ -28,6 +28,7 @@ import org.apache.hadoop.fs.Path; import org.apache.orc.MemoryManager; import org.apache.orc.OrcFile; +import org.apache.orc.TypeDescription; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.io.TempDir; @@ -47,7 +48,7 @@ void testNotOverrideInMemoryManager(@TempDir java.nio.file.Path tmpDir) throws I OrcWriterFactory factory = new TestOrcWriterFactory( new RowDataVectorizer( - "struct<_col0:string,_col1:int>", + TypeDescription.fromString("struct<_col0:string,_col1:int>"), new DataType[] {DataTypes.STRING(), DataTypes.INT()}), memoryManager); factory.create(new LocalPositionOutputStream(tmpDir.resolve("file1").toFile()), "LZ4");