From 4302002a86fa51135e2de417c99f3d8f4dc6e387 Mon Sep 17 00:00:00 2001 From: Wenchao Wu <60921147+Stephen0421@users.noreply.github.com> Date: Mon, 23 Dec 2024 15:50:21 +0800 Subject: [PATCH] [core] fix parquet can not read row with last column is array. (#4755) --- .../parquet/reader/NestedPositionUtil.java | 15 +++---- .../format/parquet/ParquetReadWriteTest.java | 44 +++++++++---------- 2 files changed, 27 insertions(+), 32 deletions(-) diff --git a/paimon-format/src/main/java/org/apache/paimon/format/parquet/reader/NestedPositionUtil.java b/paimon-format/src/main/java/org/apache/paimon/format/parquet/reader/NestedPositionUtil.java index 99892c84377e..69c342b8b120 100644 --- a/paimon-format/src/main/java/org/apache/paimon/format/parquet/reader/NestedPositionUtil.java +++ b/paimon-format/src/main/java/org/apache/paimon/format/parquet/reader/NestedPositionUtil.java @@ -50,16 +50,11 @@ public static RowPosition calculateRowOffsets( int nullValuesCount = 0; BooleanArrayList nullRowFlags = new BooleanArrayList(0); for (int i = 0; i < fieldDefinitionLevels.length; i++) { - // TODO: this is not correct ? - // if (fieldRepetitionLevels[i] > rowRepetitionLevel) { - // throw new IllegalStateException( - // format( - // "In parquet's row type field repetition level should - // not larger than row's repetition level. " - // + "Row repetition level is %s, row field - // repetition level is %s.", - // rowRepetitionLevel, fieldRepetitionLevels[i])); - // } + // If a row's last field is an array, the repetition levels for the array's items will + // be larger than the parent row's repetition level, so we need to skip those values. + if (fieldRepetitionLevels[i] > rowRepetitionLevel) { + continue; + } if (fieldDefinitionLevels[i] >= rowDefinitionLevel) { // current row is defined and not empty diff --git a/paimon-format/src/test/java/org/apache/paimon/format/parquet/ParquetReadWriteTest.java b/paimon-format/src/test/java/org/apache/paimon/format/parquet/ParquetReadWriteTest.java index 7db10bab9644..878414498315 100644 --- a/paimon-format/src/test/java/org/apache/paimon/format/parquet/ParquetReadWriteTest.java +++ b/paimon-format/src/test/java/org/apache/paimon/format/parquet/ParquetReadWriteTest.java @@ -165,6 +165,7 @@ public class ParquetReadWriteTest { new VarCharType(VarCharType.MAX_LENGTH))), new ArrayType(true, RowType.builder().field("a", new IntType()).build()), RowType.of( + new IntType(), new ArrayType( true, RowType.builder() @@ -174,8 +175,7 @@ public class ParquetReadWriteTest { true, new ArrayType(true, new IntType()))) .field("c", new IntType()) - .build()), - new IntType()), + .build())), RowType.of( new ArrayType(RowType.of(new VarCharType(255))), RowType.of(new IntType()), @@ -808,6 +808,7 @@ null, new GenericMap(mp1), new GenericMap(mp2) new GenericArray( new GenericRow[] {GenericRow.of(i), GenericRow.of(i + 1)}), GenericRow.of( + i, new GenericArray( new GenericRow[] { GenericRow.of( @@ -826,8 +827,7 @@ null, new GenericMap(mp1), new GenericMap(mp2) null }), i) - }), - i), + })), null)); } return rows; @@ -881,15 +881,15 @@ private Path createNestedDataByOriginWriter(int rowNum, File tmpDir, int rowGrou row2.add(0, i + 1); f4.addGroup(0); - // add ROW<`f0` ARRAY>, `c` INT>>, `f1` INT>> + // add ROW<`f0` INT , `f1` INTARRAY>, `c` INT>>>> Group f5 = row.addGroup("f5"); - Group arrayRow = f5.addGroup(0); + f5.add(0, i); + Group arrayRow = f5.addGroup(1); Group insideRow = arrayRow.addGroup(0).addGroup(0); Group insideArray = insideRow.addGroup(0); createParquetDoubleNestedArray(insideArray, i); insideRow.add(1, i); arrayRow.addGroup(0); - f5.add(1, i); writer.write(row); } } catch (Exception e) { @@ -982,43 +982,43 @@ private void compareNestedRow(List rows, List results) origin.getArray(4).getRow(1, 1).getInt(0), result.getArray(4).getRow(1, 1).getInt(0)); + Assertions.assertEquals(origin.getRow(5, 2).getInt(0), result.getRow(5, 2).getInt(0)); Assertions.assertEquals( - origin.getRow(5, 2).getArray(0).getRow(0, 2).getArray(0).getArray(0).getInt(0), - result.getRow(5, 2).getArray(0).getRow(0, 2).getArray(0).getArray(0).getInt(0)); + origin.getRow(5, 2).getArray(1).getRow(0, 2).getArray(0).getArray(0).getInt(0), + result.getRow(5, 2).getArray(1).getRow(0, 2).getArray(0).getArray(0).getInt(0)); Assertions.assertEquals( - origin.getRow(5, 2).getArray(0).getRow(0, 2).getArray(0).getArray(0).getInt(1), - result.getRow(5, 2).getArray(0).getRow(0, 2).getArray(0).getArray(0).getInt(1)); + origin.getRow(5, 2).getArray(1).getRow(0, 2).getArray(0).getArray(0).getInt(1), + result.getRow(5, 2).getArray(1).getRow(0, 2).getArray(0).getArray(0).getInt(1)); Assertions.assertTrue( result.getRow(5, 2) - .getArray(0) + .getArray(1) .getRow(0, 2) .getArray(0) .getArray(0) .isNullAt(2)); Assertions.assertEquals( - origin.getRow(5, 2).getArray(0).getRow(0, 2).getArray(0).getArray(1).getInt(0), - result.getRow(5, 2).getArray(0).getRow(0, 2).getArray(0).getArray(1).getInt(0)); + origin.getRow(5, 2).getArray(1).getRow(0, 2).getArray(0).getArray(1).getInt(0), + result.getRow(5, 2).getArray(1).getRow(0, 2).getArray(0).getArray(1).getInt(0)); Assertions.assertEquals( - origin.getRow(5, 2).getArray(0).getRow(0, 2).getArray(0).getArray(1).getInt(1), - result.getRow(5, 2).getArray(0).getRow(0, 2).getArray(0).getArray(1).getInt(1)); + origin.getRow(5, 2).getArray(1).getRow(0, 2).getArray(0).getArray(1).getInt(1), + result.getRow(5, 2).getArray(1).getRow(0, 2).getArray(0).getArray(1).getInt(1)); Assertions.assertTrue( result.getRow(5, 2) - .getArray(0) + .getArray(1) .getRow(0, 2) .getArray(0) .getArray(1) .isNullAt(2)); Assertions.assertEquals( - 0, result.getRow(5, 2).getArray(0).getRow(0, 2).getArray(0).getArray(2).size()); + 0, result.getRow(5, 2).getArray(1).getRow(0, 2).getArray(0).getArray(2).size()); Assertions.assertTrue( - result.getRow(5, 2).getArray(0).getRow(0, 2).getArray(0).isNullAt(3)); + result.getRow(5, 2).getArray(1).getRow(0, 2).getArray(0).isNullAt(3)); Assertions.assertEquals( - origin.getRow(5, 2).getArray(0).getRow(0, 2).getInt(1), - result.getRow(5, 2).getArray(0).getRow(0, 2).getInt(1)); - Assertions.assertEquals(origin.getRow(5, 2).getInt(1), result.getRow(5, 2).getInt(1)); + origin.getRow(5, 2).getArray(1).getRow(0, 2).getInt(1), + result.getRow(5, 2).getArray(1).getRow(0, 2).getInt(1)); Assertions.assertTrue(result.isNullAt(6)); Assertions.assertTrue(result.getRow(6, 2).isNullAt(0)); Assertions.assertTrue(result.getRow(6, 2).isNullAt(1));