From 96587abf7fbbeb9c728d60cbdc7bdd7e2096dad5 Mon Sep 17 00:00:00 2001 From: Tom Tanaka <43331405+tomtongue@users.noreply.github.com> Date: Wed, 31 Jul 2024 00:18:39 +0900 Subject: [PATCH 01/55] Flink: Remove MiniClusterResource (#10817) --- .../iceberg/flink/MiniClusterResource.java | 53 --------------- .../org/apache/iceberg/flink/TestBase.java | 2 +- .../iceberg/flink/TestFlinkTableSink.java | 2 +- .../apache/iceberg/flink/TestFlinkUpsert.java | 2 +- .../iceberg/flink/TestIcebergConnector.java | 2 +- ...TestBucketPartitionerFlinkIcebergSink.java | 4 +- .../flink/sink/TestFlinkIcebergSink.java | 7 +- .../sink/TestFlinkIcebergSinkBranch.java | 4 +- .../flink/sink/TestFlinkIcebergSinkV2.java | 5 +- .../sink/TestFlinkIcebergSinkV2Branch.java | 4 +- .../flink/source/ChangeLogTableTestBase.java | 4 +- .../iceberg/flink/source/TestFlinkScan.java | 2 +- ...stIcebergSourceWithWatermarkExtractor.java | 2 +- .../iceberg/flink/source/TestSqlBase.java | 2 +- .../flink/source/TestStreamScanSql.java | 4 +- .../iceberg/flink/MiniClusterResource.java | 68 ------------------- .../org/apache/iceberg/flink/TestBase.java | 2 +- .../iceberg/flink/TestFlinkTableSink.java | 2 +- .../apache/iceberg/flink/TestFlinkUpsert.java | 2 +- .../iceberg/flink/TestIcebergConnector.java | 2 +- ...TestBucketPartitionerFlinkIcebergSink.java | 4 +- .../flink/sink/TestFlinkIcebergSink.java | 7 +- .../sink/TestFlinkIcebergSinkBranch.java | 4 +- .../flink/sink/TestFlinkIcebergSinkV2.java | 5 +- .../sink/TestFlinkIcebergSinkV2Branch.java | 4 +- .../flink/source/ChangeLogTableTestBase.java | 4 +- .../iceberg/flink/source/TestFlinkScan.java | 2 +- .../source/TestIcebergSourceContinuous.java | 2 +- ...stIcebergSourceWithWatermarkExtractor.java | 2 +- .../iceberg/flink/source/TestSqlBase.java | 2 +- .../flink/source/TestStreamScanSql.java | 4 +- .../iceberg/flink/MiniClusterResource.java | 68 ------------------- .../org/apache/iceberg/flink/TestBase.java | 2 +- .../iceberg/flink/TestFlinkTableSink.java | 2 +- .../apache/iceberg/flink/TestFlinkUpsert.java | 2 +- .../iceberg/flink/TestIcebergConnector.java | 2 +- .../operator/OperatorTestBase.java | 4 +- ...TestBucketPartitionerFlinkIcebergSink.java | 4 +- .../flink/sink/TestFlinkIcebergSink.java | 7 +- .../sink/TestFlinkIcebergSinkBranch.java | 4 +- .../flink/sink/TestFlinkIcebergSinkV2.java | 5 +- .../sink/TestFlinkIcebergSinkV2Branch.java | 4 +- .../flink/source/ChangeLogTableTestBase.java | 4 +- .../iceberg/flink/source/TestFlinkScan.java | 2 +- ...stIcebergSourceWithWatermarkExtractor.java | 2 +- ...estIcebergSpeculativeExecutionSupport.java | 2 +- .../iceberg/flink/source/TestSqlBase.java | 2 +- .../flink/source/TestStreamScanSql.java | 4 +- 48 files changed, 70 insertions(+), 265 deletions(-) delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/MiniClusterResource.java delete mode 100644 flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/MiniClusterResource.java delete mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/MiniClusterResource.java diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/MiniClusterResource.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/MiniClusterResource.java deleted file mode 100644 index 45af9241b743..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/MiniClusterResource.java +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import org.apache.flink.configuration.Configuration; -import org.apache.flink.configuration.CoreOptions; -import org.apache.flink.runtime.testutils.MiniClusterResourceConfiguration; -import org.apache.flink.test.util.MiniClusterWithClientResource; - -public class MiniClusterResource { - - private static final int DEFAULT_TM_NUM = 1; - private static final int DEFAULT_PARALLELISM = 4; - - public static final Configuration DISABLE_CLASSLOADER_CHECK_CONFIG = - new Configuration() - // disable classloader check as Avro may cache class/object in the serializers. - .set(CoreOptions.CHECK_LEAKED_CLASSLOADER, false); - - private MiniClusterResource() {} - - /** - * It will start a mini cluster with classloader.check-leaked-classloader=false, so that we won't - * break the unit tests because of the class loader leak issue. In our iceberg integration tests, - * there're some that will assert the results after finished the flink jobs, so actually we may - * access the class loader that has been closed by the flink task managers if we enable the switch - * classloader.check-leaked-classloader by default. 
- */ - public static MiniClusterWithClientResource createWithClassloaderCheckDisabled() { - return new MiniClusterWithClientResource( - new MiniClusterResourceConfiguration.Builder() - .setNumberTaskManagers(DEFAULT_TM_NUM) - .setNumberSlotsPerTaskManager(DEFAULT_PARALLELISM) - .setConfiguration(DISABLE_CLASSLOADER_CHECK_CONFIG) - .build()); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestBase.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestBase.java index 6367a064f283..a74226092f38 100644 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestBase.java +++ b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestBase.java @@ -43,7 +43,7 @@ public abstract class TestBase extends TestBaseUtils { @RegisterExtension - public static MiniClusterExtension miniClusterResource = + public static MiniClusterExtension miniClusterExtension = MiniFlinkClusterExtension.createWithClassloaderCheckDisabled(); @TempDir protected Path temporaryDirectory; diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSink.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSink.java index b7fce104f490..b73300e3f170 100644 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSink.java +++ b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSink.java @@ -91,7 +91,7 @@ protected TableEnvironment getTableEnv() { settingsBuilder.inStreamingMode(); StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment( - MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG); + MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG); env.enableCheckpointing(400); env.setMaxParallelism(2); env.setParallelism(2); diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestFlinkUpsert.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestFlinkUpsert.java index 5674c83e40b8..d52d54e159e6 100644 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestFlinkUpsert.java +++ b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestFlinkUpsert.java @@ -75,7 +75,7 @@ protected TableEnvironment getTableEnv() { settingsBuilder.inStreamingMode(); StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment( - MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG); + MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG); env.enableCheckpointing(400); env.setMaxParallelism(2); env.setParallelism(2); diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestIcebergConnector.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestIcebergConnector.java index 013b98e3b82b..b709c0058f7d 100644 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestIcebergConnector.java +++ b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestIcebergConnector.java @@ -176,7 +176,7 @@ protected TableEnvironment getTableEnv() { settingsBuilder.inStreamingMode(); StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment( - MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG); + MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG); env.enableCheckpointing(400); env.setMaxParallelism(2); env.setParallelism(2); diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionerFlinkIcebergSink.java 
b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionerFlinkIcebergSink.java index dc3eb93280df..ba0ea867ffb7 100644 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionerFlinkIcebergSink.java +++ b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionerFlinkIcebergSink.java @@ -18,7 +18,7 @@ */ package org.apache.iceberg.flink.sink; -import static org.apache.iceberg.flink.MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG; +import static org.apache.iceberg.flink.MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG; import static org.apache.iceberg.flink.TestFixtures.DATABASE; import static org.apache.iceberg.flink.TestFixtures.TABLE_IDENTIFIER; import static org.assertj.core.api.Assertions.assertThat; @@ -63,7 +63,7 @@ public class TestBucketPartitionerFlinkIcebergSink { private static final int SLOTS_PER_TASK_MANAGER = 8; @RegisterExtension - private static final MiniClusterExtension MINI_CLUSTER_RESOURCE = + private static final MiniClusterExtension MINI_CLUSTER_EXTENSION = new MiniClusterExtension( new MiniClusterResourceConfiguration.Builder() .setNumberTaskManagers(NUMBER_TASK_MANAGERS) diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSink.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSink.java index 8cad35c859c4..61ab087f2ca3 100644 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSink.java +++ b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSink.java @@ -44,7 +44,6 @@ import org.apache.iceberg.catalog.TableIdentifier; import org.apache.iceberg.flink.FlinkWriteOptions; import org.apache.iceberg.flink.HadoopCatalogExtension; -import org.apache.iceberg.flink.MiniClusterResource; import org.apache.iceberg.flink.MiniFlinkClusterExtension; import org.apache.iceberg.flink.SimpleDataUtil; import org.apache.iceberg.flink.TableLoader; @@ -62,7 +61,7 @@ public class TestFlinkIcebergSink extends TestFlinkIcebergSinkBase { @RegisterExtension - public static MiniClusterExtension miniClusterResource = + public static final MiniClusterExtension MINI_CLUSTER_EXTENSION = MiniFlinkClusterExtension.createWithClassloaderCheckDisabled(); @RegisterExtension @@ -113,7 +112,7 @@ public void before() throws IOException { env = StreamExecutionEnvironment.getExecutionEnvironment( - MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG) + MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG) .enableCheckpointing(100) .setParallelism(parallelism) .setMaxParallelism(parallelism); @@ -271,7 +270,7 @@ public void testTwoSinksInDisjointedDAG() throws Exception { env = StreamExecutionEnvironment.getExecutionEnvironment( - MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG) + MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG) .enableCheckpointing(100) .setParallelism(parallelism) .setMaxParallelism(parallelism); diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkBranch.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkBranch.java index 3edaafca0e42..441b5ed2a4ae 100644 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkBranch.java +++ b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkBranch.java @@ -36,7 +36,7 @@ import org.apache.iceberg.SnapshotRef; import org.apache.iceberg.TableProperties; import 
org.apache.iceberg.flink.HadoopCatalogExtension; -import org.apache.iceberg.flink.MiniClusterResource; +import org.apache.iceberg.flink.MiniFlinkClusterExtension; import org.apache.iceberg.flink.SimpleDataUtil; import org.apache.iceberg.flink.TableLoader; import org.apache.iceberg.flink.TestFixtures; @@ -88,7 +88,7 @@ public void before() throws IOException { env = StreamExecutionEnvironment.getExecutionEnvironment( - MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG) + MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG) .enableCheckpointing(100); tableLoader = CATALOG_EXTENSION.tableLoader(); diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2.java index 55909874ccce..577c54976b9a 100644 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2.java +++ b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2.java @@ -38,7 +38,6 @@ import org.apache.iceberg.TableProperties; import org.apache.iceberg.data.Record; import org.apache.iceberg.flink.HadoopCatalogExtension; -import org.apache.iceberg.flink.MiniClusterResource; import org.apache.iceberg.flink.MiniFlinkClusterExtension; import org.apache.iceberg.flink.SimpleDataUtil; import org.apache.iceberg.flink.TestFixtures; @@ -57,7 +56,7 @@ @Timeout(value = 60) public class TestFlinkIcebergSinkV2 extends TestFlinkIcebergSinkV2Base { @RegisterExtension - public static MiniClusterExtension miniClusterResource = + public static final MiniClusterExtension MINI_CLUSTER_EXTENSION = MiniFlinkClusterExtension.createWithClassloaderCheckDisabled(); @RegisterExtension @@ -89,7 +88,7 @@ public void setupTable() { env = StreamExecutionEnvironment.getExecutionEnvironment( - MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG) + MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG) .enableCheckpointing(100L) .setParallelism(parallelism) .setMaxParallelism(parallelism); diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2Branch.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2Branch.java index ffeab673386d..0b0c55f51c32 100644 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2Branch.java +++ b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2Branch.java @@ -31,7 +31,7 @@ import org.apache.iceberg.SnapshotRef; import org.apache.iceberg.TableProperties; import org.apache.iceberg.flink.HadoopCatalogExtension; -import org.apache.iceberg.flink.MiniClusterResource; +import org.apache.iceberg.flink.MiniFlinkClusterExtension; import org.apache.iceberg.flink.SimpleDataUtil; import org.apache.iceberg.flink.TestFixtures; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; @@ -71,7 +71,7 @@ public void before() throws IOException { env = StreamExecutionEnvironment.getExecutionEnvironment( - MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG) + MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG) .enableCheckpointing(100); tableLoader = CATALOG_EXTENSION.tableLoader(); diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/ChangeLogTableTestBase.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/ChangeLogTableTestBase.java index d3748e008b8e..5dfbbe3abe73 100644 --- 
a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/ChangeLogTableTestBase.java +++ b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/ChangeLogTableTestBase.java @@ -28,7 +28,7 @@ import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; import org.apache.flink.types.Row; import org.apache.flink.types.RowKind; -import org.apache.iceberg.flink.MiniClusterResource; +import org.apache.iceberg.flink.MiniFlinkClusterExtension; import org.apache.iceberg.flink.TestBase; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; @@ -61,7 +61,7 @@ protected TableEnvironment getTableEnv() { StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment( - MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG) + MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG) .enableCheckpointing(400) .setMaxParallelism(1) .setParallelism(1); diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScan.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScan.java index 049ddf9e3f1e..cf6b233dcec6 100644 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScan.java +++ b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScan.java @@ -64,7 +64,7 @@ @ExtendWith(ParameterizedTestExtension.class) public abstract class TestFlinkScan { @RegisterExtension - protected static MiniClusterExtension miniClusterResource = + protected static MiniClusterExtension miniClusterExtension = MiniFlinkClusterExtension.createWithClassloaderCheckDisabled(); @TempDir protected Path temporaryDirectory; diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceWithWatermarkExtractor.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceWithWatermarkExtractor.java index 5e6a2b3caec6..70889f4f76aa 100644 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceWithWatermarkExtractor.java +++ b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceWithWatermarkExtractor.java @@ -18,7 +18,7 @@ */ package org.apache.iceberg.flink.source; -import static org.apache.iceberg.flink.MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG; +import static org.apache.iceberg.flink.MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG; import java.io.Serializable; import java.nio.file.Path; diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestSqlBase.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestSqlBase.java index 8013bce3f415..f9b776397cfc 100644 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestSqlBase.java +++ b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestSqlBase.java @@ -52,7 +52,7 @@ /** Test other more advanced usage of SQL. They don't need to run for every file format. 
*/ public abstract class TestSqlBase { @RegisterExtension - public static MiniClusterExtension miniClusterResource = + public static MiniClusterExtension miniClusterExtension = MiniFlinkClusterExtension.createWithClassloaderCheckDisabled(); @RegisterExtension diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamScanSql.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamScanSql.java index d6cf6791270e..57ee7baf202c 100644 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamScanSql.java +++ b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamScanSql.java @@ -42,7 +42,7 @@ import org.apache.iceberg.data.GenericRecord; import org.apache.iceberg.data.Record; import org.apache.iceberg.flink.CatalogTestBase; -import org.apache.iceberg.flink.MiniClusterResource; +import org.apache.iceberg.flink.MiniFlinkClusterExtension; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.junit.jupiter.api.AfterEach; @@ -70,7 +70,7 @@ protected TableEnvironment getTableEnv() { StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment( - MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG); + MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG); env.enableCheckpointing(400); StreamTableEnvironment streamTableEnv = diff --git a/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/MiniClusterResource.java b/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/MiniClusterResource.java deleted file mode 100644 index 399d7aaff64c..000000000000 --- a/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/MiniClusterResource.java +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import org.apache.flink.configuration.Configuration; -import org.apache.flink.configuration.CoreOptions; -import org.apache.flink.runtime.testutils.InMemoryReporter; -import org.apache.flink.runtime.testutils.MiniClusterResourceConfiguration; -import org.apache.flink.test.util.MiniClusterWithClientResource; - -public class MiniClusterResource { - - private static final int DEFAULT_TM_NUM = 1; - private static final int DEFAULT_PARALLELISM = 4; - - public static final Configuration DISABLE_CLASSLOADER_CHECK_CONFIG = - new Configuration() - // disable classloader check as Avro may cache class/object in the serializers. 
- .set(CoreOptions.CHECK_LEAKED_CLASSLOADER, false); - - private MiniClusterResource() {} - - /** - * It will start a mini cluster with classloader.check-leaked-classloader=false, so that we won't - * break the unit tests because of the class loader leak issue. In our iceberg integration tests, - * there're some that will assert the results after finished the flink jobs, so actually we may - * access the class loader that has been closed by the flink task managers if we enable the switch - * classloader.check-leaked-classloader by default. - */ - public static MiniClusterWithClientResource createWithClassloaderCheckDisabled() { - return new MiniClusterWithClientResource( - new MiniClusterResourceConfiguration.Builder() - .setNumberTaskManagers(DEFAULT_TM_NUM) - .setNumberSlotsPerTaskManager(DEFAULT_PARALLELISM) - .setConfiguration(DISABLE_CLASSLOADER_CHECK_CONFIG) - .build()); - } - - public static MiniClusterWithClientResource createWithClassloaderCheckDisabled( - InMemoryReporter inMemoryReporter) { - Configuration configuration = - new Configuration(MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG); - inMemoryReporter.addToConfiguration(configuration); - - return new MiniClusterWithClientResource( - new MiniClusterResourceConfiguration.Builder() - .setNumberTaskManagers(MiniClusterResource.DEFAULT_TM_NUM) - .setNumberSlotsPerTaskManager(MiniClusterResource.DEFAULT_PARALLELISM) - .setConfiguration(configuration) - .build()); - } -} diff --git a/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/TestBase.java b/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/TestBase.java index 6367a064f283..a74226092f38 100644 --- a/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/TestBase.java +++ b/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/TestBase.java @@ -43,7 +43,7 @@ public abstract class TestBase extends TestBaseUtils { @RegisterExtension - public static MiniClusterExtension miniClusterResource = + public static MiniClusterExtension miniClusterExtension = MiniFlinkClusterExtension.createWithClassloaderCheckDisabled(); @TempDir protected Path temporaryDirectory; diff --git a/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSink.java b/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSink.java index b7fce104f490..b73300e3f170 100644 --- a/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSink.java +++ b/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSink.java @@ -91,7 +91,7 @@ protected TableEnvironment getTableEnv() { settingsBuilder.inStreamingMode(); StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment( - MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG); + MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG); env.enableCheckpointing(400); env.setMaxParallelism(2); env.setParallelism(2); diff --git a/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/TestFlinkUpsert.java b/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/TestFlinkUpsert.java index 5674c83e40b8..d52d54e159e6 100644 --- a/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/TestFlinkUpsert.java +++ b/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/TestFlinkUpsert.java @@ -75,7 +75,7 @@ protected TableEnvironment getTableEnv() { settingsBuilder.inStreamingMode(); StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment( - MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG); + 
MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG); env.enableCheckpointing(400); env.setMaxParallelism(2); env.setParallelism(2); diff --git a/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/TestIcebergConnector.java b/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/TestIcebergConnector.java index 013b98e3b82b..b709c0058f7d 100644 --- a/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/TestIcebergConnector.java +++ b/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/TestIcebergConnector.java @@ -176,7 +176,7 @@ protected TableEnvironment getTableEnv() { settingsBuilder.inStreamingMode(); StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment( - MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG); + MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG); env.enableCheckpointing(400); env.setMaxParallelism(2); env.setParallelism(2); diff --git a/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionerFlinkIcebergSink.java b/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionerFlinkIcebergSink.java index dc3eb93280df..ba0ea867ffb7 100644 --- a/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionerFlinkIcebergSink.java +++ b/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionerFlinkIcebergSink.java @@ -18,7 +18,7 @@ */ package org.apache.iceberg.flink.sink; -import static org.apache.iceberg.flink.MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG; +import static org.apache.iceberg.flink.MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG; import static org.apache.iceberg.flink.TestFixtures.DATABASE; import static org.apache.iceberg.flink.TestFixtures.TABLE_IDENTIFIER; import static org.assertj.core.api.Assertions.assertThat; @@ -63,7 +63,7 @@ public class TestBucketPartitionerFlinkIcebergSink { private static final int SLOTS_PER_TASK_MANAGER = 8; @RegisterExtension - private static final MiniClusterExtension MINI_CLUSTER_RESOURCE = + private static final MiniClusterExtension MINI_CLUSTER_EXTENSION = new MiniClusterExtension( new MiniClusterResourceConfiguration.Builder() .setNumberTaskManagers(NUMBER_TASK_MANAGERS) diff --git a/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSink.java b/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSink.java index 8cad35c859c4..61ab087f2ca3 100644 --- a/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSink.java +++ b/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSink.java @@ -44,7 +44,6 @@ import org.apache.iceberg.catalog.TableIdentifier; import org.apache.iceberg.flink.FlinkWriteOptions; import org.apache.iceberg.flink.HadoopCatalogExtension; -import org.apache.iceberg.flink.MiniClusterResource; import org.apache.iceberg.flink.MiniFlinkClusterExtension; import org.apache.iceberg.flink.SimpleDataUtil; import org.apache.iceberg.flink.TableLoader; @@ -62,7 +61,7 @@ public class TestFlinkIcebergSink extends TestFlinkIcebergSinkBase { @RegisterExtension - public static MiniClusterExtension miniClusterResource = + public static final MiniClusterExtension MINI_CLUSTER_EXTENSION = MiniFlinkClusterExtension.createWithClassloaderCheckDisabled(); @RegisterExtension @@ -113,7 +112,7 @@ public void before() throws IOException { env = StreamExecutionEnvironment.getExecutionEnvironment( - MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG) + 
MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG) .enableCheckpointing(100) .setParallelism(parallelism) .setMaxParallelism(parallelism); @@ -271,7 +270,7 @@ public void testTwoSinksInDisjointedDAG() throws Exception { env = StreamExecutionEnvironment.getExecutionEnvironment( - MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG) + MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG) .enableCheckpointing(100) .setParallelism(parallelism) .setMaxParallelism(parallelism); diff --git a/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkBranch.java b/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkBranch.java index 3edaafca0e42..441b5ed2a4ae 100644 --- a/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkBranch.java +++ b/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkBranch.java @@ -36,7 +36,7 @@ import org.apache.iceberg.SnapshotRef; import org.apache.iceberg.TableProperties; import org.apache.iceberg.flink.HadoopCatalogExtension; -import org.apache.iceberg.flink.MiniClusterResource; +import org.apache.iceberg.flink.MiniFlinkClusterExtension; import org.apache.iceberg.flink.SimpleDataUtil; import org.apache.iceberg.flink.TableLoader; import org.apache.iceberg.flink.TestFixtures; @@ -88,7 +88,7 @@ public void before() throws IOException { env = StreamExecutionEnvironment.getExecutionEnvironment( - MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG) + MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG) .enableCheckpointing(100); tableLoader = CATALOG_EXTENSION.tableLoader(); diff --git a/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2.java b/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2.java index 55909874ccce..577c54976b9a 100644 --- a/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2.java +++ b/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2.java @@ -38,7 +38,6 @@ import org.apache.iceberg.TableProperties; import org.apache.iceberg.data.Record; import org.apache.iceberg.flink.HadoopCatalogExtension; -import org.apache.iceberg.flink.MiniClusterResource; import org.apache.iceberg.flink.MiniFlinkClusterExtension; import org.apache.iceberg.flink.SimpleDataUtil; import org.apache.iceberg.flink.TestFixtures; @@ -57,7 +56,7 @@ @Timeout(value = 60) public class TestFlinkIcebergSinkV2 extends TestFlinkIcebergSinkV2Base { @RegisterExtension - public static MiniClusterExtension miniClusterResource = + public static final MiniClusterExtension MINI_CLUSTER_EXTENSION = MiniFlinkClusterExtension.createWithClassloaderCheckDisabled(); @RegisterExtension @@ -89,7 +88,7 @@ public void setupTable() { env = StreamExecutionEnvironment.getExecutionEnvironment( - MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG) + MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG) .enableCheckpointing(100L) .setParallelism(parallelism) .setMaxParallelism(parallelism); diff --git a/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2Branch.java b/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2Branch.java index ffeab673386d..0b0c55f51c32 100644 --- a/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2Branch.java +++ b/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2Branch.java @@ 
-31,7 +31,7 @@ import org.apache.iceberg.SnapshotRef; import org.apache.iceberg.TableProperties; import org.apache.iceberg.flink.HadoopCatalogExtension; -import org.apache.iceberg.flink.MiniClusterResource; +import org.apache.iceberg.flink.MiniFlinkClusterExtension; import org.apache.iceberg.flink.SimpleDataUtil; import org.apache.iceberg.flink.TestFixtures; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; @@ -71,7 +71,7 @@ public void before() throws IOException { env = StreamExecutionEnvironment.getExecutionEnvironment( - MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG) + MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG) .enableCheckpointing(100); tableLoader = CATALOG_EXTENSION.tableLoader(); diff --git a/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/source/ChangeLogTableTestBase.java b/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/source/ChangeLogTableTestBase.java index d3748e008b8e..5dfbbe3abe73 100644 --- a/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/source/ChangeLogTableTestBase.java +++ b/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/source/ChangeLogTableTestBase.java @@ -28,7 +28,7 @@ import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; import org.apache.flink.types.Row; import org.apache.flink.types.RowKind; -import org.apache.iceberg.flink.MiniClusterResource; +import org.apache.iceberg.flink.MiniFlinkClusterExtension; import org.apache.iceberg.flink.TestBase; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; @@ -61,7 +61,7 @@ protected TableEnvironment getTableEnv() { StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment( - MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG) + MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG) .enableCheckpointing(400) .setMaxParallelism(1) .setParallelism(1); diff --git a/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScan.java b/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScan.java index 049ddf9e3f1e..cf6b233dcec6 100644 --- a/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScan.java +++ b/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScan.java @@ -64,7 +64,7 @@ @ExtendWith(ParameterizedTestExtension.class) public abstract class TestFlinkScan { @RegisterExtension - protected static MiniClusterExtension miniClusterResource = + protected static MiniClusterExtension miniClusterExtension = MiniFlinkClusterExtension.createWithClassloaderCheckDisabled(); @TempDir protected Path temporaryDirectory; diff --git a/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceContinuous.java b/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceContinuous.java index 749cbf89338a..e0e2bf5e61e2 100644 --- a/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceContinuous.java +++ b/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceContinuous.java @@ -67,7 +67,7 @@ public class TestIcebergSourceContinuous { @TempDir protected Path temporaryFolder; @RegisterExtension - public static MiniClusterExtension miniClusterExtension = + public static final MiniClusterExtension MINI_CLUSTER_EXTENSION = MiniFlinkClusterExtension.createWithClassloaderCheckDisabled(METRIC_REPORTER); @RegisterExtension diff --git 
a/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceWithWatermarkExtractor.java b/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceWithWatermarkExtractor.java index 5e6a2b3caec6..70889f4f76aa 100644 --- a/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceWithWatermarkExtractor.java +++ b/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceWithWatermarkExtractor.java @@ -18,7 +18,7 @@ */ package org.apache.iceberg.flink.source; -import static org.apache.iceberg.flink.MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG; +import static org.apache.iceberg.flink.MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG; import java.io.Serializable; import java.nio.file.Path; diff --git a/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/source/TestSqlBase.java b/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/source/TestSqlBase.java index 8013bce3f415..f9b776397cfc 100644 --- a/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/source/TestSqlBase.java +++ b/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/source/TestSqlBase.java @@ -52,7 +52,7 @@ /** Test other more advanced usage of SQL. They don't need to run for every file format. */ public abstract class TestSqlBase { @RegisterExtension - public static MiniClusterExtension miniClusterResource = + public static MiniClusterExtension miniClusterExtension = MiniFlinkClusterExtension.createWithClassloaderCheckDisabled(); @RegisterExtension diff --git a/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamScanSql.java b/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamScanSql.java index d6cf6791270e..57ee7baf202c 100644 --- a/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamScanSql.java +++ b/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamScanSql.java @@ -42,7 +42,7 @@ import org.apache.iceberg.data.GenericRecord; import org.apache.iceberg.data.Record; import org.apache.iceberg.flink.CatalogTestBase; -import org.apache.iceberg.flink.MiniClusterResource; +import org.apache.iceberg.flink.MiniFlinkClusterExtension; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.junit.jupiter.api.AfterEach; @@ -70,7 +70,7 @@ protected TableEnvironment getTableEnv() { StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment( - MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG); + MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG); env.enableCheckpointing(400); StreamTableEnvironment streamTableEnv = diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/MiniClusterResource.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/MiniClusterResource.java deleted file mode 100644 index 399d7aaff64c..000000000000 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/MiniClusterResource.java +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import org.apache.flink.configuration.Configuration; -import org.apache.flink.configuration.CoreOptions; -import org.apache.flink.runtime.testutils.InMemoryReporter; -import org.apache.flink.runtime.testutils.MiniClusterResourceConfiguration; -import org.apache.flink.test.util.MiniClusterWithClientResource; - -public class MiniClusterResource { - - private static final int DEFAULT_TM_NUM = 1; - private static final int DEFAULT_PARALLELISM = 4; - - public static final Configuration DISABLE_CLASSLOADER_CHECK_CONFIG = - new Configuration() - // disable classloader check as Avro may cache class/object in the serializers. - .set(CoreOptions.CHECK_LEAKED_CLASSLOADER, false); - - private MiniClusterResource() {} - - /** - * It will start a mini cluster with classloader.check-leaked-classloader=false, so that we won't - * break the unit tests because of the class loader leak issue. In our iceberg integration tests, - * there're some that will assert the results after finished the flink jobs, so actually we may - * access the class loader that has been closed by the flink task managers if we enable the switch - * classloader.check-leaked-classloader by default. - */ - public static MiniClusterWithClientResource createWithClassloaderCheckDisabled() { - return new MiniClusterWithClientResource( - new MiniClusterResourceConfiguration.Builder() - .setNumberTaskManagers(DEFAULT_TM_NUM) - .setNumberSlotsPerTaskManager(DEFAULT_PARALLELISM) - .setConfiguration(DISABLE_CLASSLOADER_CHECK_CONFIG) - .build()); - } - - public static MiniClusterWithClientResource createWithClassloaderCheckDisabled( - InMemoryReporter inMemoryReporter) { - Configuration configuration = - new Configuration(MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG); - inMemoryReporter.addToConfiguration(configuration); - - return new MiniClusterWithClientResource( - new MiniClusterResourceConfiguration.Builder() - .setNumberTaskManagers(MiniClusterResource.DEFAULT_TM_NUM) - .setNumberSlotsPerTaskManager(MiniClusterResource.DEFAULT_PARALLELISM) - .setConfiguration(configuration) - .build()); - } -} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestBase.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestBase.java index 773d22e19e64..633690044692 100644 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestBase.java +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestBase.java @@ -44,7 +44,7 @@ public abstract class TestBase extends TestBaseUtils { @RegisterExtension - public static MiniClusterExtension miniClusterResource = + public static MiniClusterExtension miniClusterExtension = MiniFlinkClusterExtension.createWithClassloaderCheckDisabled(); @TempDir protected Path temporaryDirectory; diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSink.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSink.java index 3f66174049a4..a0341e6834d4 100644 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSink.java +++ 
b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSink.java @@ -91,7 +91,7 @@ protected TableEnvironment getTableEnv() { settingsBuilder.inStreamingMode(); StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment( - MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG); + MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG); env.enableCheckpointing(400); env.setMaxParallelism(2); env.setParallelism(2); diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkUpsert.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkUpsert.java index baf13017ff99..c5becb6caca1 100644 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkUpsert.java +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkUpsert.java @@ -75,7 +75,7 @@ protected TableEnvironment getTableEnv() { settingsBuilder.inStreamingMode(); StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment( - MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG); + MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG); env.enableCheckpointing(400); env.setMaxParallelism(2); env.setParallelism(2); diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestIcebergConnector.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestIcebergConnector.java index c978ec6f8bd4..fdb0e0cf19df 100644 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestIcebergConnector.java +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestIcebergConnector.java @@ -176,7 +176,7 @@ protected TableEnvironment getTableEnv() { settingsBuilder.inStreamingMode(); StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment( - MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG); + MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG); env.enableCheckpointing(400); env.setMaxParallelism(2); env.setParallelism(2); diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/OperatorTestBase.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/OperatorTestBase.java index 269ae681b02e..272e0b693fd3 100644 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/OperatorTestBase.java +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/OperatorTestBase.java @@ -18,7 +18,7 @@ */ package org.apache.iceberg.flink.maintenance.operator; -import static org.apache.iceberg.flink.MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG; +import static org.apache.iceberg.flink.MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG; import org.apache.flink.configuration.Configuration; import org.apache.flink.runtime.testutils.MiniClusterResourceConfiguration; @@ -34,7 +34,7 @@ class OperatorTestBase { static final String TABLE_NAME = "test_table"; @RegisterExtension - protected static final MiniClusterExtension MINI_CLUSTER_RESOURCE = + protected static final MiniClusterExtension MINI_CLUSTER_EXTENSION = new MiniClusterExtension( new MiniClusterResourceConfiguration.Builder() .setNumberTaskManagers(NUMBER_TASK_MANAGERS) diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionerFlinkIcebergSink.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionerFlinkIcebergSink.java index dc3eb93280df..ba0ea867ffb7 100644 --- 
a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionerFlinkIcebergSink.java +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionerFlinkIcebergSink.java @@ -18,7 +18,7 @@ */ package org.apache.iceberg.flink.sink; -import static org.apache.iceberg.flink.MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG; +import static org.apache.iceberg.flink.MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG; import static org.apache.iceberg.flink.TestFixtures.DATABASE; import static org.apache.iceberg.flink.TestFixtures.TABLE_IDENTIFIER; import static org.assertj.core.api.Assertions.assertThat; @@ -63,7 +63,7 @@ public class TestBucketPartitionerFlinkIcebergSink { private static final int SLOTS_PER_TASK_MANAGER = 8; @RegisterExtension - private static final MiniClusterExtension MINI_CLUSTER_RESOURCE = + private static final MiniClusterExtension MINI_CLUSTER_EXTENSION = new MiniClusterExtension( new MiniClusterResourceConfiguration.Builder() .setNumberTaskManagers(NUMBER_TASK_MANAGERS) diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSink.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSink.java index 8cad35c859c4..61ab087f2ca3 100644 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSink.java +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSink.java @@ -44,7 +44,6 @@ import org.apache.iceberg.catalog.TableIdentifier; import org.apache.iceberg.flink.FlinkWriteOptions; import org.apache.iceberg.flink.HadoopCatalogExtension; -import org.apache.iceberg.flink.MiniClusterResource; import org.apache.iceberg.flink.MiniFlinkClusterExtension; import org.apache.iceberg.flink.SimpleDataUtil; import org.apache.iceberg.flink.TableLoader; @@ -62,7 +61,7 @@ public class TestFlinkIcebergSink extends TestFlinkIcebergSinkBase { @RegisterExtension - public static MiniClusterExtension miniClusterResource = + public static final MiniClusterExtension MINI_CLUSTER_EXTENSION = MiniFlinkClusterExtension.createWithClassloaderCheckDisabled(); @RegisterExtension @@ -113,7 +112,7 @@ public void before() throws IOException { env = StreamExecutionEnvironment.getExecutionEnvironment( - MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG) + MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG) .enableCheckpointing(100) .setParallelism(parallelism) .setMaxParallelism(parallelism); @@ -271,7 +270,7 @@ public void testTwoSinksInDisjointedDAG() throws Exception { env = StreamExecutionEnvironment.getExecutionEnvironment( - MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG) + MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG) .enableCheckpointing(100) .setParallelism(parallelism) .setMaxParallelism(parallelism); diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkBranch.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkBranch.java index 3edaafca0e42..441b5ed2a4ae 100644 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkBranch.java +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkBranch.java @@ -36,7 +36,7 @@ import org.apache.iceberg.SnapshotRef; import org.apache.iceberg.TableProperties; import org.apache.iceberg.flink.HadoopCatalogExtension; -import org.apache.iceberg.flink.MiniClusterResource; +import 
org.apache.iceberg.flink.MiniFlinkClusterExtension; import org.apache.iceberg.flink.SimpleDataUtil; import org.apache.iceberg.flink.TableLoader; import org.apache.iceberg.flink.TestFixtures; @@ -88,7 +88,7 @@ public void before() throws IOException { env = StreamExecutionEnvironment.getExecutionEnvironment( - MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG) + MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG) .enableCheckpointing(100); tableLoader = CATALOG_EXTENSION.tableLoader(); diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2.java index 55909874ccce..577c54976b9a 100644 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2.java +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2.java @@ -38,7 +38,6 @@ import org.apache.iceberg.TableProperties; import org.apache.iceberg.data.Record; import org.apache.iceberg.flink.HadoopCatalogExtension; -import org.apache.iceberg.flink.MiniClusterResource; import org.apache.iceberg.flink.MiniFlinkClusterExtension; import org.apache.iceberg.flink.SimpleDataUtil; import org.apache.iceberg.flink.TestFixtures; @@ -57,7 +56,7 @@ @Timeout(value = 60) public class TestFlinkIcebergSinkV2 extends TestFlinkIcebergSinkV2Base { @RegisterExtension - public static MiniClusterExtension miniClusterResource = + public static final MiniClusterExtension MINI_CLUSTER_EXTENSION = MiniFlinkClusterExtension.createWithClassloaderCheckDisabled(); @RegisterExtension @@ -89,7 +88,7 @@ public void setupTable() { env = StreamExecutionEnvironment.getExecutionEnvironment( - MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG) + MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG) .enableCheckpointing(100L) .setParallelism(parallelism) .setMaxParallelism(parallelism); diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2Branch.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2Branch.java index ffeab673386d..0b0c55f51c32 100644 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2Branch.java +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2Branch.java @@ -31,7 +31,7 @@ import org.apache.iceberg.SnapshotRef; import org.apache.iceberg.TableProperties; import org.apache.iceberg.flink.HadoopCatalogExtension; -import org.apache.iceberg.flink.MiniClusterResource; +import org.apache.iceberg.flink.MiniFlinkClusterExtension; import org.apache.iceberg.flink.SimpleDataUtil; import org.apache.iceberg.flink.TestFixtures; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; @@ -71,7 +71,7 @@ public void before() throws IOException { env = StreamExecutionEnvironment.getExecutionEnvironment( - MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG) + MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG) .enableCheckpointing(100); tableLoader = CATALOG_EXTENSION.tableLoader(); diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/ChangeLogTableTestBase.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/ChangeLogTableTestBase.java index d3748e008b8e..5dfbbe3abe73 100644 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/ChangeLogTableTestBase.java +++ 
b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/ChangeLogTableTestBase.java @@ -28,7 +28,7 @@ import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; import org.apache.flink.types.Row; import org.apache.flink.types.RowKind; -import org.apache.iceberg.flink.MiniClusterResource; +import org.apache.iceberg.flink.MiniFlinkClusterExtension; import org.apache.iceberg.flink.TestBase; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; @@ -61,7 +61,7 @@ protected TableEnvironment getTableEnv() { StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment( - MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG) + MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG) .enableCheckpointing(400) .setMaxParallelism(1) .setParallelism(1); diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScan.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScan.java index 049ddf9e3f1e..cf6b233dcec6 100644 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScan.java +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScan.java @@ -64,7 +64,7 @@ @ExtendWith(ParameterizedTestExtension.class) public abstract class TestFlinkScan { @RegisterExtension - protected static MiniClusterExtension miniClusterResource = + protected static MiniClusterExtension miniClusterExtension = MiniFlinkClusterExtension.createWithClassloaderCheckDisabled(); @TempDir protected Path temporaryDirectory; diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceWithWatermarkExtractor.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceWithWatermarkExtractor.java index 5e6a2b3caec6..70889f4f76aa 100644 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceWithWatermarkExtractor.java +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceWithWatermarkExtractor.java @@ -18,7 +18,7 @@ */ package org.apache.iceberg.flink.source; -import static org.apache.iceberg.flink.MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG; +import static org.apache.iceberg.flink.MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG; import java.io.Serializable; import java.nio.file.Path; diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSpeculativeExecutionSupport.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSpeculativeExecutionSupport.java index 3285a16a1277..b21010a91bed 100644 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSpeculativeExecutionSupport.java +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSpeculativeExecutionSupport.java @@ -55,7 +55,7 @@ public class TestIcebergSpeculativeExecutionSupport extends TestBase { private static final int NUM_TASK_SLOTS = 3; @RegisterExtension - public static MiniClusterExtension miniClusterResource = + public static final MiniClusterExtension MINI_CLUSTER_EXTENSION = new MiniClusterExtension( new MiniClusterResourceConfiguration.Builder() .setNumberTaskManagers(NUM_TASK_MANAGERS) diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestSqlBase.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestSqlBase.java index 8013bce3f415..f9b776397cfc 100644 --- 
a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestSqlBase.java +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestSqlBase.java @@ -52,7 +52,7 @@ /** Test other more advanced usage of SQL. They don't need to run for every file format. */ public abstract class TestSqlBase { @RegisterExtension - public static MiniClusterExtension miniClusterResource = + public static MiniClusterExtension miniClusterExtension = MiniFlinkClusterExtension.createWithClassloaderCheckDisabled(); @RegisterExtension diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamScanSql.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamScanSql.java index fcf5c1479df5..97ed4ca1e93f 100644 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamScanSql.java +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamScanSql.java @@ -42,7 +42,7 @@ import org.apache.iceberg.data.GenericRecord; import org.apache.iceberg.data.Record; import org.apache.iceberg.flink.CatalogTestBase; -import org.apache.iceberg.flink.MiniClusterResource; +import org.apache.iceberg.flink.MiniFlinkClusterExtension; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.junit.jupiter.api.AfterEach; @@ -70,7 +70,7 @@ protected TableEnvironment getTableEnv() { StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment( - MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG); + MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG); env.enableCheckpointing(400); StreamTableEnvironment streamTableEnv = From 4d1ceac275859df0ab16a8f919a76fcd3b97d622 Mon Sep 17 00:00:00 2001 From: liu yang Date: Wed, 31 Jul 2024 05:25:56 +0800 Subject: [PATCH 02/55] Docs: Use link addresses instead of descriptions in releases.md (#10815) --- site/docs/releases.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/site/docs/releases.md b/site/docs/releases.md index 73d8bc32a071..cc29857ed802 100644 --- a/site/docs/releases.md +++ b/site/docs/releases.md @@ -83,7 +83,7 @@ The 1.6.0 release contains fixes, dependency updates, and new features (like Kaf - Add EnvironmentContext to commit summary ([\#9273](https://github.com/apache/iceberg/pull/9273)) - Add explicit JSON parser for ConfigResponse ([\#9952](https://github.com/apache/iceberg/pull/9952)) - Calling rewrite_position_delete_files fails on tables with more than 1k columns ([\#10020](https://github.com/apache/iceberg/pull/10020)) - - Expose table incremental scan for appends API in SerializableTable ([\#10682](Expose table incremental scan for appends API in SerializableTable)) + - Expose table incremental scan for appends API in SerializableTable ([\#10682](https://github.com/apache/iceberg/pull/10682)) - Fix NPE during conflict handling of NULL partitions ([\#10680](https://github.com/apache/iceberg/pull/10680)) - Fix ParallelIterable memory leak where queue continues to be populated even after iterator close ([\#9402](https://github.com/apache/iceberg/pull/9402)) - Fix logging table name in scanning metadata table ([\#10141](https://github.com/apache/iceberg/pull/10141)) @@ -100,7 +100,7 @@ The 1.6.0 release contains fixes, dependency updates, and new features (like Kaf - REST Catalog: Handles possible heap data corruption of OAuth2Util.AuthSession#headers ([\#10615](https://github.com/apache/iceberg/pull/10615)) - REST 
Catalog: Handles potential NPE in RESTSessionCatalog#newSessionCache ([\#10607](https://github.com/apache/iceberg/pull/10607)) - REST Catalog: Introduce AuthConfig ([\#10161](https://github.com/apache/iceberg/pull/10161)) - - REST Catalog: Mark 502 and 504 statuses as retryable to the REST exponential retry strategy ([\#9885](Mark 502 and 504 statuses as retryable to the REST exponential retry strategy)) + - REST Catalog: Mark 502 and 504 statuses as retryable to the REST exponential retry strategy ([\#9885](https://github.com/apache/iceberg/pull/9885)) - REST Catalog: disallow overriding "credential" in table sessions ([\#10345](https://github.com/apache/iceberg/pull/10345)) - REST Catalog: fix incorrect token refresh thread name ([\#10223](https://github.com/apache/iceberg/pull/10223)) - REST Catalog: fix spurious warning when shutting down refresh executor ([\#10087](https://github.com/apache/iceberg/pull/10087)) From 0ff90e7732574fa1a1e094bb66c7c3793e3d1ebb Mon Sep 17 00:00:00 2001 From: Devin Smith Date: Tue, 30 Jul 2024 14:26:55 -0700 Subject: [PATCH 03/55] Build: Declare avro as an api dependency of iceberg-core (#10573) iceberg-core should declare an api dependency on avro. For example, the public class `org.apache.iceberg.PartitionData` extends avro-specific types. In addition, there are public methods that deal with avro types, see `org.apache.iceberg.avro.AvroSchemaUtil` --- build.gradle | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build.gradle b/build.gradle index 09a04e3ae977..a0b1a2e018bf 100644 --- a/build.gradle +++ b/build.gradle @@ -347,7 +347,7 @@ project(':iceberg-core') { annotationProcessor libs.immutables.value compileOnly libs.immutables.value - implementation(libs.avro.avro) { + api(libs.avro.avro) { exclude group: 'org.tukaani' // xz compression is not supported } From 72b39ab91dfa04d713552e70b009c24510a1cd07 Mon Sep 17 00:00:00 2001 From: Steven Zhen Wu Date: Tue, 30 Jul 2024 14:49:44 -0700 Subject: [PATCH 04/55] Flink: backport PR #10748 for limit pushdown (#10813) --- .../iceberg/flink/source/IcebergSource.java | 3 +- .../source/reader/LimitableDataIterator.java | 56 +++++++++++++ .../flink/source/reader/RecordLimiter.java | 45 ++++++++++ .../source/reader/RowDataReaderFunction.java | 40 ++++++++- .../flink/source/TestFlinkSourceConfig.java | 7 +- .../reader/TestLimitableDataIterator.java | 84 +++++++++++++++++++ .../iceberg/flink/source/IcebergSource.java | 3 +- .../source/reader/LimitableDataIterator.java | 56 +++++++++++++ .../flink/source/reader/RecordLimiter.java | 45 ++++++++++ .../source/reader/RowDataReaderFunction.java | 40 ++++++++- .../flink/source/TestFlinkSourceConfig.java | 7 +- .../reader/TestLimitableDataIterator.java | 84 +++++++++++++++++++ 12 files changed, 456 insertions(+), 14 deletions(-) create mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/LimitableDataIterator.java create mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordLimiter.java create mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestLimitableDataIterator.java create mode 100644 flink/v1.18/flink/src/main/java/org/apache/iceberg/flink/source/reader/LimitableDataIterator.java create mode 100644 flink/v1.18/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordLimiter.java create mode 100644 flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestLimitableDataIterator.java diff --git 
a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/IcebergSource.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/IcebergSource.java index 48201ea09359..ccbd0d9997ed 100644 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/IcebergSource.java +++ b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/IcebergSource.java @@ -521,7 +521,8 @@ public IcebergSource build() { context.caseSensitive(), table.io(), table.encryption(), - context.filters()); + context.filters(), + context.limit()); this.readerFunction = (ReaderFunction) rowDataReaderFunction; } } diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/LimitableDataIterator.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/LimitableDataIterator.java new file mode 100644 index 000000000000..020e87646d05 --- /dev/null +++ b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/LimitableDataIterator.java @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.reader; + +import org.apache.iceberg.CombinedScanTask; +import org.apache.iceberg.encryption.EncryptionManager; +import org.apache.iceberg.flink.source.DataIterator; +import org.apache.iceberg.flink.source.FileScanTaskReader; +import org.apache.iceberg.io.FileIO; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; + +class LimitableDataIterator extends DataIterator { + private final RecordLimiter limiter; + + LimitableDataIterator( + FileScanTaskReader fileScanTaskReader, + CombinedScanTask task, + FileIO io, + EncryptionManager encryption, + RecordLimiter limiter) { + super(fileScanTaskReader, task, io, encryption); + Preconditions.checkArgument(limiter != null, "Invalid record limiter: null"); + this.limiter = limiter; + } + + @Override + public boolean hasNext() { + if (limiter.reachedLimit()) { + return false; + } + + return super.hasNext(); + } + + @Override + public T next() { + limiter.increment(); + return super.next(); + } +} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordLimiter.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordLimiter.java new file mode 100644 index 000000000000..f260a53089ff --- /dev/null +++ b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordLimiter.java @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.reader; + +import java.util.concurrent.atomic.AtomicLong; +import org.apache.flink.annotation.Internal; + +@Internal +class RecordLimiter { + private final long limit; + private final AtomicLong counter; + + static RecordLimiter create(long limit) { + return new RecordLimiter(limit); + } + + private RecordLimiter(long limit) { + this.limit = limit; + this.counter = new AtomicLong(0); + } + + public boolean reachedLimit() { + return limit > 0 && counter.get() >= limit; + } + + public void increment() { + counter.incrementAndGet(); + } +} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataReaderFunction.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataReaderFunction.java index 5d0a00954e7a..c9208a0e1834 100644 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataReaderFunction.java +++ b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataReaderFunction.java @@ -39,6 +39,9 @@ public class RowDataReaderFunction extends DataIteratorReaderFunction { private final FileIO io; private final EncryptionManager encryption; private final List filters; + private final long limit; + + private transient RecordLimiter recordLimiter = null; public RowDataReaderFunction( ReadableConfig config, @@ -49,6 +52,28 @@ public RowDataReaderFunction( FileIO io, EncryptionManager encryption, List filters) { + this( + config, + tableSchema, + projectedSchema, + nameMapping, + caseSensitive, + io, + encryption, + filters, + -1L); + } + + public RowDataReaderFunction( + ReadableConfig config, + Schema tableSchema, + Schema projectedSchema, + String nameMapping, + boolean caseSensitive, + FileIO io, + EncryptionManager encryption, + List filters, + long limit) { super( new ArrayPoolDataIteratorBatcher<>( config, @@ -61,19 +86,30 @@ public RowDataReaderFunction( this.io = io; this.encryption = encryption; this.filters = filters; + this.limit = limit; } @Override public DataIterator createDataIterator(IcebergSourceSplit split) { - return new DataIterator<>( + return new LimitableDataIterator<>( new RowDataFileScanTaskReader(tableSchema, readSchema, nameMapping, caseSensitive, filters), split.task(), io, - encryption); + encryption, + lazyLimiter()); } private static Schema readSchema(Schema tableSchema, Schema projectedSchema) { Preconditions.checkNotNull(tableSchema, "Table schema can't be null"); return projectedSchema == null ? 
tableSchema : projectedSchema; } + + /** Lazily create RecordLimiter to avoid the need to make it serializable */ + private RecordLimiter lazyLimiter() { + if (recordLimiter == null) { + this.recordLimiter = RecordLimiter.create(limit); + } + + return recordLimiter; + } } diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSourceConfig.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSourceConfig.java index 8131bd7ab0d3..14131d9e96d5 100644 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSourceConfig.java +++ b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSourceConfig.java @@ -20,7 +20,6 @@ import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatThrownBy; -import static org.assertj.core.api.Assumptions.assumeThat; import java.util.List; import org.apache.flink.types.Row; @@ -49,11 +48,11 @@ public void testFlinkHintConfig() { @TestTemplate public void testReadOptionHierarchy() { - // TODO: FLIP-27 source doesn't implement limit pushdown yet - assumeThat(useFlip27Source).isFalse(); - getTableEnv().getConfig().set(FlinkReadOptions.LIMIT_OPTION, 1L); List result = sql("SELECT * FROM %s", TABLE); + // Note that this query doesn't have the limit clause in the SQL. + // This assertions works because limit is pushed down to the reader and + // reader parallelism is 1. assertThat(result).hasSize(1); result = sql("SELECT * FROM %s /*+ OPTIONS('limit'='3')*/", TABLE); diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestLimitableDataIterator.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestLimitableDataIterator.java new file mode 100644 index 000000000000..36749d3ec2dc --- /dev/null +++ b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestLimitableDataIterator.java @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source.reader; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.nio.file.Path; +import java.util.Collections; +import java.util.List; +import org.apache.flink.table.data.RowData; +import org.apache.iceberg.CombinedScanTask; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.data.GenericAppenderFactory; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.encryption.EncryptionManager; +import org.apache.iceberg.encryption.PlaintextEncryptionManager; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.flink.source.RowDataFileScanTaskReader; +import org.apache.iceberg.hadoop.HadoopFileIO; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.io.TempDir; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; + +public class TestLimitableDataIterator { + @TempDir private static Path temporaryFolder; + + private final RowDataFileScanTaskReader reader = + new RowDataFileScanTaskReader( + TestFixtures.SCHEMA, TestFixtures.SCHEMA, null, true, Collections.emptyList()); + private final HadoopFileIO fileIO = new HadoopFileIO(new org.apache.hadoop.conf.Configuration()); + private final EncryptionManager encryptionManager = PlaintextEncryptionManager.instance(); + + private static CombinedScanTask combinedScanTask; + private static int totalRecords; + + @BeforeAll + public static void beforeClass() throws Exception { + GenericAppenderFactory appenderFactory = new GenericAppenderFactory(TestFixtures.SCHEMA); + List> recordBatchList = + ReaderUtil.createRecordBatchList(TestFixtures.SCHEMA, 3, 2); + combinedScanTask = + ReaderUtil.createCombinedScanTask( + recordBatchList, temporaryFolder, FileFormat.PARQUET, appenderFactory); + totalRecords = 3 * 2; + } + + @ParameterizedTest + @ValueSource(longs = {-1L, 0L, 1L, 6L, 7L}) + public void testUnlimited(long limit) { + LimitableDataIterator dataIterator = + new LimitableDataIterator<>( + reader, combinedScanTask, fileIO, encryptionManager, RecordLimiter.create(limit)); + + List result = Lists.newArrayList(); + while (dataIterator.hasNext()) { + result.add(dataIterator.next()); + } + + if (limit <= 0 || limit > totalRecords) { + // read all records + assertThat(result).hasSize(totalRecords); + } else { + assertThat(result).hasSize((int) limit); + } + } +} diff --git a/flink/v1.18/flink/src/main/java/org/apache/iceberg/flink/source/IcebergSource.java b/flink/v1.18/flink/src/main/java/org/apache/iceberg/flink/source/IcebergSource.java index 48201ea09359..ccbd0d9997ed 100644 --- a/flink/v1.18/flink/src/main/java/org/apache/iceberg/flink/source/IcebergSource.java +++ b/flink/v1.18/flink/src/main/java/org/apache/iceberg/flink/source/IcebergSource.java @@ -521,7 +521,8 @@ public IcebergSource build() { context.caseSensitive(), table.io(), table.encryption(), - context.filters()); + context.filters(), + context.limit()); this.readerFunction = (ReaderFunction) rowDataReaderFunction; } } diff --git a/flink/v1.18/flink/src/main/java/org/apache/iceberg/flink/source/reader/LimitableDataIterator.java b/flink/v1.18/flink/src/main/java/org/apache/iceberg/flink/source/reader/LimitableDataIterator.java new file mode 100644 index 000000000000..020e87646d05 --- /dev/null +++ b/flink/v1.18/flink/src/main/java/org/apache/iceberg/flink/source/reader/LimitableDataIterator.java @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software 
Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.reader; + +import org.apache.iceberg.CombinedScanTask; +import org.apache.iceberg.encryption.EncryptionManager; +import org.apache.iceberg.flink.source.DataIterator; +import org.apache.iceberg.flink.source.FileScanTaskReader; +import org.apache.iceberg.io.FileIO; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; + +class LimitableDataIterator extends DataIterator { + private final RecordLimiter limiter; + + LimitableDataIterator( + FileScanTaskReader fileScanTaskReader, + CombinedScanTask task, + FileIO io, + EncryptionManager encryption, + RecordLimiter limiter) { + super(fileScanTaskReader, task, io, encryption); + Preconditions.checkArgument(limiter != null, "Invalid record limiter: null"); + this.limiter = limiter; + } + + @Override + public boolean hasNext() { + if (limiter.reachedLimit()) { + return false; + } + + return super.hasNext(); + } + + @Override + public T next() { + limiter.increment(); + return super.next(); + } +} diff --git a/flink/v1.18/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordLimiter.java b/flink/v1.18/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordLimiter.java new file mode 100644 index 000000000000..f260a53089ff --- /dev/null +++ b/flink/v1.18/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordLimiter.java @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source.reader; + +import java.util.concurrent.atomic.AtomicLong; +import org.apache.flink.annotation.Internal; + +@Internal +class RecordLimiter { + private final long limit; + private final AtomicLong counter; + + static RecordLimiter create(long limit) { + return new RecordLimiter(limit); + } + + private RecordLimiter(long limit) { + this.limit = limit; + this.counter = new AtomicLong(0); + } + + public boolean reachedLimit() { + return limit > 0 && counter.get() >= limit; + } + + public void increment() { + counter.incrementAndGet(); + } +} diff --git a/flink/v1.18/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataReaderFunction.java b/flink/v1.18/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataReaderFunction.java index 5d0a00954e7a..c9208a0e1834 100644 --- a/flink/v1.18/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataReaderFunction.java +++ b/flink/v1.18/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataReaderFunction.java @@ -39,6 +39,9 @@ public class RowDataReaderFunction extends DataIteratorReaderFunction { private final FileIO io; private final EncryptionManager encryption; private final List filters; + private final long limit; + + private transient RecordLimiter recordLimiter = null; public RowDataReaderFunction( ReadableConfig config, @@ -49,6 +52,28 @@ public RowDataReaderFunction( FileIO io, EncryptionManager encryption, List filters) { + this( + config, + tableSchema, + projectedSchema, + nameMapping, + caseSensitive, + io, + encryption, + filters, + -1L); + } + + public RowDataReaderFunction( + ReadableConfig config, + Schema tableSchema, + Schema projectedSchema, + String nameMapping, + boolean caseSensitive, + FileIO io, + EncryptionManager encryption, + List filters, + long limit) { super( new ArrayPoolDataIteratorBatcher<>( config, @@ -61,19 +86,30 @@ public RowDataReaderFunction( this.io = io; this.encryption = encryption; this.filters = filters; + this.limit = limit; } @Override public DataIterator createDataIterator(IcebergSourceSplit split) { - return new DataIterator<>( + return new LimitableDataIterator<>( new RowDataFileScanTaskReader(tableSchema, readSchema, nameMapping, caseSensitive, filters), split.task(), io, - encryption); + encryption, + lazyLimiter()); } private static Schema readSchema(Schema tableSchema, Schema projectedSchema) { Preconditions.checkNotNull(tableSchema, "Table schema can't be null"); return projectedSchema == null ? 
tableSchema : projectedSchema; } + + /** Lazily create RecordLimiter to avoid the need to make it serializable */ + private RecordLimiter lazyLimiter() { + if (recordLimiter == null) { + this.recordLimiter = RecordLimiter.create(limit); + } + + return recordLimiter; + } } diff --git a/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSourceConfig.java b/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSourceConfig.java index 8131bd7ab0d3..14131d9e96d5 100644 --- a/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSourceConfig.java +++ b/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSourceConfig.java @@ -20,7 +20,6 @@ import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatThrownBy; -import static org.assertj.core.api.Assumptions.assumeThat; import java.util.List; import org.apache.flink.types.Row; @@ -49,11 +48,11 @@ public void testFlinkHintConfig() { @TestTemplate public void testReadOptionHierarchy() { - // TODO: FLIP-27 source doesn't implement limit pushdown yet - assumeThat(useFlip27Source).isFalse(); - getTableEnv().getConfig().set(FlinkReadOptions.LIMIT_OPTION, 1L); List result = sql("SELECT * FROM %s", TABLE); + // Note that this query doesn't have the limit clause in the SQL. + // This assertions works because limit is pushed down to the reader and + // reader parallelism is 1. assertThat(result).hasSize(1); result = sql("SELECT * FROM %s /*+ OPTIONS('limit'='3')*/", TABLE); diff --git a/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestLimitableDataIterator.java b/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestLimitableDataIterator.java new file mode 100644 index 000000000000..36749d3ec2dc --- /dev/null +++ b/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestLimitableDataIterator.java @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source.reader; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.nio.file.Path; +import java.util.Collections; +import java.util.List; +import org.apache.flink.table.data.RowData; +import org.apache.iceberg.CombinedScanTask; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.data.GenericAppenderFactory; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.encryption.EncryptionManager; +import org.apache.iceberg.encryption.PlaintextEncryptionManager; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.flink.source.RowDataFileScanTaskReader; +import org.apache.iceberg.hadoop.HadoopFileIO; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.io.TempDir; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; + +public class TestLimitableDataIterator { + @TempDir private static Path temporaryFolder; + + private final RowDataFileScanTaskReader reader = + new RowDataFileScanTaskReader( + TestFixtures.SCHEMA, TestFixtures.SCHEMA, null, true, Collections.emptyList()); + private final HadoopFileIO fileIO = new HadoopFileIO(new org.apache.hadoop.conf.Configuration()); + private final EncryptionManager encryptionManager = PlaintextEncryptionManager.instance(); + + private static CombinedScanTask combinedScanTask; + private static int totalRecords; + + @BeforeAll + public static void beforeClass() throws Exception { + GenericAppenderFactory appenderFactory = new GenericAppenderFactory(TestFixtures.SCHEMA); + List> recordBatchList = + ReaderUtil.createRecordBatchList(TestFixtures.SCHEMA, 3, 2); + combinedScanTask = + ReaderUtil.createCombinedScanTask( + recordBatchList, temporaryFolder, FileFormat.PARQUET, appenderFactory); + totalRecords = 3 * 2; + } + + @ParameterizedTest + @ValueSource(longs = {-1L, 0L, 1L, 6L, 7L}) + public void testUnlimited(long limit) { + LimitableDataIterator dataIterator = + new LimitableDataIterator<>( + reader, combinedScanTask, fileIO, encryptionManager, RecordLimiter.create(limit)); + + List result = Lists.newArrayList(); + while (dataIterator.hasNext()) { + result.add(dataIterator.next()); + } + + if (limit <= 0 || limit > totalRecords) { + // read all records + assertThat(result).hasSize(totalRecords); + } else { + assertThat(result).hasSize((int) limit); + } + } +} From 76dba8fe83c4496318cc34436e610cf50f43054d Mon Sep 17 00:00:00 2001 From: gaborkaszab Date: Wed, 31 Jul 2024 12:59:13 +0200 Subject: [PATCH 05/55] Docs: Fix header for entries metadata table (#10826) --- docs/docs/spark-queries.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/docs/spark-queries.md b/docs/docs/spark-queries.md index b606d849a692..494ca698533a 100644 --- a/docs/docs/spark-queries.md +++ b/docs/docs/spark-queries.md @@ -288,6 +288,7 @@ order by made_current_at; | 2019-02-09 16:24:30.13 | delete | 29641004024753 | false | application_1520379288616_151109 | | 2019-02-09 16:32:47.336 | append | 57897183625154 | true | application_1520379288616_155055 | | 2019-02-08 03:47:55.948 | overwrite | 51792995261850 | true | application_1520379288616_152431 | + ### Entries To show all the table's current manifest entries for both data and delete files. 
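For reference, the "Entries" docs section whose header is fixed above goes on to query the `entries` metadata table; a minimal sketch of such a query is shown below (the `prod.db.table` identifier is a placeholder, and the exact example in spark-queries.md may differ):

```sql
-- Each row is a manifest entry for a data or delete file tracked by the current snapshot
SELECT status, snapshot_id, sequence_number, data_file
FROM prod.db.table.entries;
```
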
From 506fee492620bc8e13d7da4f104462fb97ceef82 Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Wed, 31 Jul 2024 08:41:20 -0700 Subject: [PATCH 06/55] Spark 3.5: Support Reporting Column Stats (#10659) Co-authored-by: Karuppayya Rajendran --- .../apache/iceberg/spark/SparkReadConf.java | 8 + .../iceberg/spark/SparkSQLProperties.java | 4 + .../spark/source/SparkChangelogScan.java | 2 +- .../spark/source/SparkColumnStatistics.java | 88 +++++++++ .../iceberg/spark/source/SparkScan.java | 54 +++++- .../apache/iceberg/spark/source/Stats.java | 12 +- .../iceberg/spark/source/TestSparkScan.java | 183 ++++++++++++++++++ 7 files changed, 346 insertions(+), 5 deletions(-) create mode 100644 spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkColumnStatistics.java diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/SparkReadConf.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/SparkReadConf.java index bd29fb0d6d42..67e9d78ada4d 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/SparkReadConf.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/SparkReadConf.java @@ -347,4 +347,12 @@ private boolean executorCacheLocalityEnabledInternal() { .defaultValue(SparkSQLProperties.EXECUTOR_CACHE_LOCALITY_ENABLED_DEFAULT) .parse(); } + + public boolean reportColumnStats() { + return confParser + .booleanConf() + .sessionConf(SparkSQLProperties.REPORT_COLUMN_STATS) + .defaultValue(SparkSQLProperties.REPORT_COLUMN_STATS_DEFAULT) + .parse(); + } } diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/SparkSQLProperties.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/SparkSQLProperties.java index ea8f6fe0718b..77ae796ffb76 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/SparkSQLProperties.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/SparkSQLProperties.java @@ -90,4 +90,8 @@ private SparkSQLProperties() {} public static final String EXECUTOR_CACHE_LOCALITY_ENABLED = "spark.sql.iceberg.executor-cache.locality.enabled"; public static final boolean EXECUTOR_CACHE_LOCALITY_ENABLED_DEFAULT = false; + + // Controls whether to report available column statistics to Spark for query optimization. 
+ public static final String REPORT_COLUMN_STATS = "spark.sql.iceberg.report-column-stats"; + public static final boolean REPORT_COLUMN_STATS_DEFAULT = true; } diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkChangelogScan.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkChangelogScan.java index 7cde3e1fbe11..71b53d70262f 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkChangelogScan.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkChangelogScan.java @@ -88,7 +88,7 @@ class SparkChangelogScan implements Scan, SupportsReportStatistics { public Statistics estimateStatistics() { long rowsCount = taskGroups().stream().mapToLong(ScanTaskGroup::estimatedRowsCount).sum(); long sizeInBytes = SparkSchemaUtil.estimateSize(readSchema(), rowsCount); - return new Stats(sizeInBytes, rowsCount); + return new Stats(sizeInBytes, rowsCount, Collections.emptyMap()); } @Override diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkColumnStatistics.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkColumnStatistics.java new file mode 100644 index 000000000000..faaff3631d7c --- /dev/null +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkColumnStatistics.java @@ -0,0 +1,88 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.spark.source; + +import java.util.Optional; +import java.util.OptionalLong; +import org.apache.spark.sql.connector.read.colstats.ColumnStatistics; +import org.apache.spark.sql.connector.read.colstats.Histogram; + +class SparkColumnStatistics implements ColumnStatistics { + + private final OptionalLong distinctCount; + private final Optional min; + private final Optional max; + private final OptionalLong nullCount; + private final OptionalLong avgLen; + private final OptionalLong maxLen; + private final Optional histogram; + + SparkColumnStatistics( + Long distinctCount, + Object min, + Object max, + Long nullCount, + Long avgLen, + Long maxLen, + Histogram histogram) { + this.distinctCount = + (distinctCount == null) ? OptionalLong.empty() : OptionalLong.of(distinctCount); + this.min = Optional.ofNullable(min); + this.max = Optional.ofNullable(max); + this.nullCount = (nullCount == null) ? OptionalLong.empty() : OptionalLong.of(nullCount); + this.avgLen = (avgLen == null) ? OptionalLong.empty() : OptionalLong.of(avgLen); + this.maxLen = (maxLen == null) ? 
OptionalLong.empty() : OptionalLong.of(maxLen); + this.histogram = Optional.ofNullable(histogram); + } + + @Override + public OptionalLong distinctCount() { + return distinctCount; + } + + @Override + public Optional min() { + return min; + } + + @Override + public Optional max() { + return max; + } + + @Override + public OptionalLong nullCount() { + return nullCount; + } + + @Override + public OptionalLong avgLen() { + return avgLen; + } + + @Override + public OptionalLong maxLen() { + return maxLen; + } + + @Override + public Optional histogram() { + return histogram; + } +} diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkScan.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkScan.java index 6efe8a080bde..8b88cf49c692 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkScan.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkScan.java @@ -23,15 +23,19 @@ import java.util.Map; import java.util.function.Supplier; import java.util.stream.Collectors; +import org.apache.iceberg.BlobMetadata; import org.apache.iceberg.ScanTask; import org.apache.iceberg.ScanTaskGroup; import org.apache.iceberg.Schema; import org.apache.iceberg.Snapshot; import org.apache.iceberg.SnapshotSummary; +import org.apache.iceberg.StatisticsFile; import org.apache.iceberg.Table; import org.apache.iceberg.expressions.Expression; import org.apache.iceberg.metrics.ScanReport; +import org.apache.iceberg.relocated.com.google.common.base.Strings; import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.spark.Spark3Util; import org.apache.iceberg.spark.SparkReadConf; import org.apache.iceberg.spark.SparkSchemaUtil; @@ -75,22 +79,28 @@ import org.apache.iceberg.util.TableScanUtil; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.connector.expressions.FieldReference; +import org.apache.spark.sql.connector.expressions.NamedReference; import org.apache.spark.sql.connector.metric.CustomMetric; import org.apache.spark.sql.connector.metric.CustomTaskMetric; import org.apache.spark.sql.connector.read.Batch; import org.apache.spark.sql.connector.read.Scan; import org.apache.spark.sql.connector.read.Statistics; import org.apache.spark.sql.connector.read.SupportsReportStatistics; +import org.apache.spark.sql.connector.read.colstats.ColumnStatistics; import org.apache.spark.sql.connector.read.streaming.MicroBatchStream; +import org.apache.spark.sql.internal.SQLConf; import org.apache.spark.sql.types.StructType; import org.slf4j.Logger; import org.slf4j.LoggerFactory; abstract class SparkScan implements Scan, SupportsReportStatistics { private static final Logger LOG = LoggerFactory.getLogger(SparkScan.class); + private static final String NDV_KEY = "ndv"; private final JavaSparkContext sparkContext; private final Table table; + private final SparkSession spark; private final SparkReadConf readConf; private final boolean caseSensitive; private final Schema expectedSchema; @@ -111,6 +121,7 @@ abstract class SparkScan implements Scan, SupportsReportStatistics { Schema snapshotSchema = SnapshotUtil.schemaFor(table, readConf.branch()); SparkSchemaUtil.validateMetadataColumnReferences(snapshotSchema, expectedSchema); + this.spark = spark; this.sparkContext = JavaSparkContext.fromSparkContext(spark.sparkContext()); this.table = table; this.readConf = 
readConf; @@ -175,7 +186,43 @@ public Statistics estimateStatistics() { protected Statistics estimateStatistics(Snapshot snapshot) { // its a fresh table, no data if (snapshot == null) { - return new Stats(0L, 0L); + return new Stats(0L, 0L, Collections.emptyMap()); + } + + boolean cboEnabled = + Boolean.parseBoolean(spark.conf().get(SQLConf.CBO_ENABLED().key(), "false")); + Map colStatsMap = Collections.emptyMap(); + if (readConf.reportColumnStats() && cboEnabled) { + colStatsMap = Maps.newHashMap(); + List files = table.statisticsFiles(); + if (!files.isEmpty()) { + List metadataList = (files.get(0)).blobMetadata(); + + for (BlobMetadata blobMetadata : metadataList) { + int id = blobMetadata.fields().get(0); + String colName = table.schema().findColumnName(id); + NamedReference ref = FieldReference.column(colName); + + Long ndv = null; + if (blobMetadata + .type() + .equals(org.apache.iceberg.puffin.StandardBlobTypes.APACHE_DATASKETCHES_THETA_V1)) { + String ndvStr = blobMetadata.properties().get(NDV_KEY); + if (!Strings.isNullOrEmpty(ndvStr)) { + ndv = Long.parseLong(ndvStr); + } else { + LOG.debug("ndv is not set in BlobMetadata for column {}", colName); + } + } else { + LOG.debug("DataSketch blob is not available for column {}", colName); + } + + ColumnStatistics colStats = + new SparkColumnStatistics(ndv, null, null, null, null, null, null); + + colStatsMap.put(ref, colStats); + } + } } // estimate stats using snapshot summary only for partitioned tables @@ -186,12 +233,13 @@ protected Statistics estimateStatistics(Snapshot snapshot) { snapshot.snapshotId(), table.name()); long totalRecords = totalRecords(snapshot); - return new Stats(SparkSchemaUtil.estimateSize(readSchema(), totalRecords), totalRecords); + return new Stats( + SparkSchemaUtil.estimateSize(readSchema(), totalRecords), totalRecords, colStatsMap); } long rowsCount = taskGroups().stream().mapToLong(ScanTaskGroup::estimatedRowsCount).sum(); long sizeInBytes = SparkSchemaUtil.estimateSize(readSchema(), rowsCount); - return new Stats(sizeInBytes, rowsCount); + return new Stats(sizeInBytes, rowsCount, colStatsMap); } private long totalRecords(Snapshot snapshot) { diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/Stats.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/Stats.java index ddf6ca834d9b..ccf523cb4b05 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/Stats.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/Stats.java @@ -18,16 +18,21 @@ */ package org.apache.iceberg.spark.source; +import java.util.Map; import java.util.OptionalLong; +import org.apache.spark.sql.connector.expressions.NamedReference; import org.apache.spark.sql.connector.read.Statistics; +import org.apache.spark.sql.connector.read.colstats.ColumnStatistics; class Stats implements Statistics { private final OptionalLong sizeInBytes; private final OptionalLong numRows; + private final Map colstats; - Stats(long sizeInBytes, long numRows) { + Stats(long sizeInBytes, long numRows, Map colstats) { this.sizeInBytes = OptionalLong.of(sizeInBytes); this.numRows = OptionalLong.of(numRows); + this.colstats = colstats; } @Override @@ -39,4 +44,9 @@ public OptionalLong sizeInBytes() { public OptionalLong numRows() { return numRows; } + + @Override + public Map columnStats() { + return colstats; + } } diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkScan.java 
b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkScan.java index d539b0123951..7d5475ff919e 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkScan.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkScan.java @@ -18,6 +18,7 @@ */ package org.apache.iceberg.spark.source; +import static org.apache.iceberg.puffin.StandardBlobTypes.APACHE_DATASKETCHES_THETA_V1; import static org.apache.iceberg.spark.SystemFunctionPushDownHelper.createPartitionedTable; import static org.apache.iceberg.spark.SystemFunctionPushDownHelper.createUnpartitionedTable; import static org.apache.iceberg.spark.SystemFunctionPushDownHelper.timestampStrToDayOrdinal; @@ -28,14 +29,22 @@ import static org.apache.spark.sql.functions.expr; import static org.assertj.core.api.Assertions.assertThat; +import java.util.List; +import java.util.Map; +import org.apache.iceberg.GenericBlobMetadata; +import org.apache.iceberg.GenericStatisticsFile; import org.apache.iceberg.Parameter; import org.apache.iceberg.ParameterizedTestExtension; import org.apache.iceberg.Parameters; import org.apache.iceberg.Table; import org.apache.iceberg.TableProperties; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.spark.Spark3Util; import org.apache.iceberg.spark.SparkCatalogConfig; +import org.apache.iceberg.spark.SparkSQLProperties; import org.apache.iceberg.spark.TestBaseWithCatalog; import org.apache.iceberg.spark.functions.BucketFunction; import org.apache.iceberg.spark.functions.DaysFunction; @@ -44,6 +53,7 @@ import org.apache.iceberg.spark.functions.TruncateFunction; import org.apache.iceberg.spark.functions.YearsFunction; import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; import org.apache.spark.sql.Row; import org.apache.spark.sql.catalyst.analysis.NoSuchTableException; import org.apache.spark.sql.connector.catalog.functions.BoundFunction; @@ -60,6 +70,8 @@ import org.apache.spark.sql.connector.read.ScanBuilder; import org.apache.spark.sql.connector.read.Statistics; import org.apache.spark.sql.connector.read.SupportsPushDownV2Filters; +import org.apache.spark.sql.connector.read.colstats.ColumnStatistics; +import org.apache.spark.sql.internal.SQLConf; import org.apache.spark.sql.types.DataTypes; import org.apache.spark.sql.util.CaseInsensitiveStringMap; import org.junit.jupiter.api.AfterEach; @@ -130,6 +142,157 @@ public void testEstimatedRowCount() throws NoSuchTableException { assertThat(stats.numRows().getAsLong()).isEqualTo(10000L); } + @TestTemplate + public void testTableWithoutColStats() throws NoSuchTableException { + sql("CREATE TABLE %s (id int, data string) USING iceberg", tableName); + + List records = + Lists.newArrayList( + new SimpleRecord(1, "a"), + new SimpleRecord(2, "b"), + new SimpleRecord(3, "a"), + new SimpleRecord(4, "b")); + spark + .createDataset(records, Encoders.bean(SimpleRecord.class)) + .coalesce(1) + .writeTo(tableName) + .append(); + + Table table = validationCatalog.loadTable(tableIdent); + + SparkScanBuilder scanBuilder = + new SparkScanBuilder(spark, table, CaseInsensitiveStringMap.empty()); + SparkScan scan = (SparkScan) scanBuilder.build(); + + Map reportColStatsDisabled = + ImmutableMap.of( + SQLConf.CBO_ENABLED().key(), "true", 
SparkSQLProperties.REPORT_COLUMN_STATS, "false"); + + Map reportColStatsEnabled = + ImmutableMap.of(SQLConf.CBO_ENABLED().key(), "true"); + + checkColStatisticsNotReported(scan, 4L); + withSQLConf(reportColStatsDisabled, () -> checkColStatisticsNotReported(scan, 4L)); + // The expected col NDVs are nulls + withSQLConf( + reportColStatsEnabled, () -> checkColStatisticsReported(scan, 4L, Maps.newHashMap())); + } + + @TestTemplate + public void testTableWithOneColStats() throws NoSuchTableException { + sql("CREATE TABLE %s (id int, data string) USING iceberg", tableName); + + List records = + Lists.newArrayList( + new SimpleRecord(1, "a"), + new SimpleRecord(2, "b"), + new SimpleRecord(3, "a"), + new SimpleRecord(4, "b")); + spark + .createDataset(records, Encoders.bean(SimpleRecord.class)) + .coalesce(1) + .writeTo(tableName) + .append(); + + Table table = validationCatalog.loadTable(tableIdent); + long snapshotId = table.currentSnapshot().snapshotId(); + + SparkScanBuilder scanBuilder = + new SparkScanBuilder(spark, table, CaseInsensitiveStringMap.empty()); + SparkScan scan = (SparkScan) scanBuilder.build(); + + Map reportColStatsDisabled = + ImmutableMap.of( + SQLConf.CBO_ENABLED().key(), "true", SparkSQLProperties.REPORT_COLUMN_STATS, "false"); + + Map reportColStatsEnabled = + ImmutableMap.of(SQLConf.CBO_ENABLED().key(), "true"); + + GenericStatisticsFile statisticsFile = + new GenericStatisticsFile( + snapshotId, + "/test/statistics/file.puffin", + 100, + 42, + ImmutableList.of( + new GenericBlobMetadata( + APACHE_DATASKETCHES_THETA_V1, + snapshotId, + 1, + ImmutableList.of(1), + ImmutableMap.of("ndv", "4")))); + + table.updateStatistics().setStatistics(snapshotId, statisticsFile).commit(); + + checkColStatisticsNotReported(scan, 4L); + withSQLConf(reportColStatsDisabled, () -> checkColStatisticsNotReported(scan, 4L)); + + Map expectedOneNDV = Maps.newHashMap(); + expectedOneNDV.put("id", 4L); + withSQLConf(reportColStatsEnabled, () -> checkColStatisticsReported(scan, 4L, expectedOneNDV)); + } + + @TestTemplate + public void testTableWithTwoColStats() throws NoSuchTableException { + sql("CREATE TABLE %s (id int, data string) USING iceberg", tableName); + + List records = + Lists.newArrayList( + new SimpleRecord(1, "a"), + new SimpleRecord(2, "b"), + new SimpleRecord(3, "a"), + new SimpleRecord(4, "b")); + spark + .createDataset(records, Encoders.bean(SimpleRecord.class)) + .coalesce(1) + .writeTo(tableName) + .append(); + + Table table = validationCatalog.loadTable(tableIdent); + long snapshotId = table.currentSnapshot().snapshotId(); + + SparkScanBuilder scanBuilder = + new SparkScanBuilder(spark, table, CaseInsensitiveStringMap.empty()); + SparkScan scan = (SparkScan) scanBuilder.build(); + + Map reportColStatsDisabled = + ImmutableMap.of( + SQLConf.CBO_ENABLED().key(), "true", SparkSQLProperties.REPORT_COLUMN_STATS, "false"); + + Map reportColStatsEnabled = + ImmutableMap.of(SQLConf.CBO_ENABLED().key(), "true"); + + GenericStatisticsFile statisticsFile = + new GenericStatisticsFile( + snapshotId, + "/test/statistics/file.puffin", + 100, + 42, + ImmutableList.of( + new GenericBlobMetadata( + APACHE_DATASKETCHES_THETA_V1, + snapshotId, + 1, + ImmutableList.of(1), + ImmutableMap.of("ndv", "4")), + new GenericBlobMetadata( + APACHE_DATASKETCHES_THETA_V1, + snapshotId, + 1, + ImmutableList.of(2), + ImmutableMap.of("ndv", "2")))); + + table.updateStatistics().setStatistics(snapshotId, statisticsFile).commit(); + + checkColStatisticsNotReported(scan, 4L); + 
withSQLConf(reportColStatsDisabled, () -> checkColStatisticsNotReported(scan, 4L)); + + Map expectedTwoNDVs = Maps.newHashMap(); + expectedTwoNDVs.put("id", 4L); + expectedTwoNDVs.put("data", 2L); + withSQLConf(reportColStatsEnabled, () -> checkColStatisticsReported(scan, 4L, expectedTwoNDVs)); + } + @TestTemplate public void testUnpartitionedYears() throws Exception { createUnpartitionedTable(spark, tableName); @@ -734,6 +897,26 @@ private Expression[] expressions(Expression... expressions) { return expressions; } + private void checkColStatisticsNotReported(SparkScan scan, long expectedRowCount) { + Statistics stats = scan.estimateStatistics(); + assertThat(stats.numRows().getAsLong()).isEqualTo(expectedRowCount); + + Map columnStats = stats.columnStats(); + assertThat(columnStats.isEmpty()); + } + + private void checkColStatisticsReported( + SparkScan scan, long expectedRowCount, Map expectedNDVs) { + Statistics stats = scan.estimateStatistics(); + assertThat(stats.numRows().getAsLong()).isEqualTo(expectedRowCount); + + Map columnStats = stats.columnStats(); + for (Map.Entry entry : expectedNDVs.entrySet()) { + assertThat(columnStats.get(FieldReference.column(entry.getKey())).distinctCount().getAsLong()) + .isEqualTo(entry.getValue()); + } + } + private static LiteralValue intLit(int value) { return LiteralValue.apply(value, DataTypes.IntegerType); } From 84c91251738cb86f741952bd1b23daa45c80d2aa Mon Sep 17 00:00:00 2001 From: Venkata krishnan Sowrirajan Date: Thu, 1 Aug 2024 00:22:51 -0700 Subject: [PATCH 07/55] Flink: Backport #10548 to v1.18 and v1.17 (#10776) --- .../enumerator/AbstractIcebergEnumerator.java | 11 +- ...estIcebergSpeculativeExecutionSupport.java | 184 ++++++++++++++++++ .../enumerator/AbstractIcebergEnumerator.java | 11 +- ...estIcebergSpeculativeExecutionSupport.java | 184 ++++++++++++++++++ ...estIcebergSpeculativeExecutionSupport.java | 2 +- 5 files changed, 389 insertions(+), 3 deletions(-) create mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSpeculativeExecutionSupport.java create mode 100644 flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSpeculativeExecutionSupport.java diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/AbstractIcebergEnumerator.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/AbstractIcebergEnumerator.java index 3aca390755ed..801baf77a612 100644 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/AbstractIcebergEnumerator.java +++ b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/AbstractIcebergEnumerator.java @@ -29,6 +29,7 @@ import org.apache.flink.api.connector.source.SourceEvent; import org.apache.flink.api.connector.source.SplitEnumerator; import org.apache.flink.api.connector.source.SplitEnumeratorContext; +import org.apache.flink.api.connector.source.SupportsHandleExecutionAttemptSourceEvent; import org.apache.iceberg.flink.source.assigner.GetSplitResult; import org.apache.iceberg.flink.source.assigner.SplitAssigner; import org.apache.iceberg.flink.source.split.IcebergSourceSplit; @@ -41,7 +42,8 @@ * resolved */ abstract class AbstractIcebergEnumerator - implements SplitEnumerator { + implements SplitEnumerator, + SupportsHandleExecutionAttemptSourceEvent { private static final Logger LOG = LoggerFactory.getLogger(AbstractIcebergEnumerator.class); private final SplitEnumeratorContext enumeratorContext; @@ -93,6 +95,13 @@ public void 
handleSourceEvent(int subtaskId, SourceEvent sourceEvent) { } } + // Flink's SourceCoordinator already keeps track of subTask to splits mapping. + // It already takes care of re-assigning splits to speculated attempts as well. + @Override + public void handleSourceEvent(int subTaskId, int attemptNumber, SourceEvent sourceEvent) { + handleSourceEvent(subTaskId, sourceEvent); + } + @Override public void addSplitsBack(List splits, int subtaskId) { LOG.info("Add {} splits back to the pool for failed subtask {}", splits.size(), subtaskId); diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSpeculativeExecutionSupport.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSpeculativeExecutionSupport.java new file mode 100644 index 000000000000..95d0b90b6ca9 --- /dev/null +++ b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSpeculativeExecutionSupport.java @@ -0,0 +1,184 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.IOException; +import java.nio.file.Files; +import java.time.Duration; +import java.util.Arrays; +import java.util.List; +import java.util.stream.Collectors; +import org.apache.flink.api.common.RuntimeExecutionMode; +import org.apache.flink.api.common.functions.RichMapFunction; +import org.apache.flink.api.common.typeinfo.Types; +import org.apache.flink.configuration.BatchExecutionOptions; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.configuration.CoreOptions; +import org.apache.flink.configuration.JobManagerOptions; +import org.apache.flink.configuration.RestOptions; +import org.apache.flink.configuration.SlowTaskDetectorOptions; +import org.apache.flink.runtime.testutils.MiniClusterResourceConfiguration; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.Table; +import org.apache.flink.table.api.TableEnvironment; +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; +import org.apache.flink.test.junit5.MiniClusterExtension; +import org.apache.flink.types.Row; +import org.apache.iceberg.flink.FlinkConfigOptions; +import org.apache.iceberg.flink.TestBase; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.RegisterExtension; + +public class TestIcebergSpeculativeExecutionSupport extends TestBase { + private static final int NUM_TASK_MANAGERS = 1; + private static final int NUM_TASK_SLOTS = 3; + + @RegisterExtension + public static MiniClusterExtension miniClusterResource = + new MiniClusterExtension( + new MiniClusterResourceConfiguration.Builder() + .setNumberTaskManagers(NUM_TASK_MANAGERS) + .setNumberSlotsPerTaskManager(NUM_TASK_SLOTS) + .setConfiguration(configure()) + .build()); + + private StreamTableEnvironment tEnv; + private static final String CATALOG_NAME = "test_catalog"; + private static final String DATABASE_NAME = "test_db"; + private static final String INPUT_TABLE_NAME = "test_table"; + private static final String OUTPUT_TABLE_NAME = "sink_table"; + + @Override + protected TableEnvironment getTableEnv() { + if (tEnv == null) { + synchronized (this) { + StreamExecutionEnvironment env = + StreamExecutionEnvironment.getExecutionEnvironment(configure()); + env.setRuntimeMode(RuntimeExecutionMode.BATCH); + tEnv = StreamTableEnvironment.create(env); + } + } + + return tEnv; + } + + @BeforeEach + public void before() throws IOException { + String warehouse = + String.format("file:%s", Files.createTempDirectory(temporaryDirectory, "junit").toString()); + sql( + "CREATE CATALOG %s WITH ('type'='iceberg', 'catalog-type'='hadoop', 'warehouse'='%s')", + CATALOG_NAME, warehouse); + sql("USE CATALOG %s", CATALOG_NAME); + sql("CREATE DATABASE %s", DATABASE_NAME); + sql("USE %s", DATABASE_NAME); + + sql("CREATE TABLE %s (i INT, j INT)", INPUT_TABLE_NAME); + sql("INSERT INTO %s VALUES (1, -1),(2, -1),(3, -1)", INPUT_TABLE_NAME); + sql("CREATE TABLE %s (i INT, j INT, subTask INT, attempt INT)", OUTPUT_TABLE_NAME); + } + + @AfterEach + public void after() { + sql("DROP TABLE IF EXISTS %s.%s", DATABASE_NAME, INPUT_TABLE_NAME); + sql("DROP TABLE IF EXISTS %s.%s", DATABASE_NAME, OUTPUT_TABLE_NAME); + sql("DROP DATABASE %s", DATABASE_NAME); + dropCatalog(CATALOG_NAME, true); + } + + @Test + 
public void testSpeculativeExecution() throws Exception { + Table table = + tEnv.sqlQuery(String.format("SELECT * FROM %s.%s", DATABASE_NAME, INPUT_TABLE_NAME)); + DataStream slowStream = + tEnv.toDataStream(table, Row.class) + .map(new TestingMap()) + .name("test_map") + .returns( + Types.ROW_NAMED( + new String[] {"i", "j", "subTask", "attempt"}, + Types.INT, + Types.INT, + Types.INT, + Types.INT)) + .setParallelism(NUM_TASK_SLOTS); + + tEnv.fromDataStream(slowStream) + .executeInsert(String.format("%s.%s", DATABASE_NAME, OUTPUT_TABLE_NAME)) + .await(); + + List output = sql(String.format("SELECT * FROM %s.%s", DATABASE_NAME, OUTPUT_TABLE_NAME)); + + // Ensure that all subTasks has attemptNum > 0 + assertThat(output.stream().map(x -> x.getField(3)).collect(Collectors.toSet())).contains(1); + + // Ensure the test_table rows are returned exactly the same after the slow map task from the + // sink_table + assertSameElements( + output.stream().map(x -> Row.of(x.getField(0), x.getField(1))).collect(Collectors.toList()), + Arrays.asList(Row.of(1, -1), Row.of(2, -1), Row.of(3, -1))); + } + + /** A testing map function that simulates the slow task. */ + private static class TestingMap extends RichMapFunction { + @Override + public Row map(Row row) throws Exception { + // Put the subtasks with the first attempt to sleep to trigger speculative + // execution + if (getRuntimeContext().getAttemptNumber() <= 0) { + Thread.sleep(Integer.MAX_VALUE); + } + + Row output = + Row.of( + row.getField(0), + row.getField(1), + getRuntimeContext().getIndexOfThisSubtask(), + getRuntimeContext().getAttemptNumber()); + + return output; + } + } + + private static Configuration configure() { + Configuration configuration = new Configuration(); + configuration.set(CoreOptions.CHECK_LEAKED_CLASSLOADER, false); + configuration.set(RestOptions.BIND_PORT, "0"); + configuration.set(JobManagerOptions.SLOT_REQUEST_TIMEOUT, 5000L); + + // Use FLIP-27 source + configuration.set(FlinkConfigOptions.TABLE_EXEC_ICEBERG_USE_FLIP27_SOURCE, true); + + // for speculative execution + configuration.set(BatchExecutionOptions.SPECULATIVE_ENABLED, true); + + configuration.set(SlowTaskDetectorOptions.EXECUTION_TIME_BASELINE_MULTIPLIER, 1.0); + configuration.set(SlowTaskDetectorOptions.EXECUTION_TIME_BASELINE_RATIO, 0.2); + configuration.set( + SlowTaskDetectorOptions.EXECUTION_TIME_BASELINE_LOWER_BOUND, Duration.ofMillis(0)); + configuration.set(BatchExecutionOptions.BLOCK_SLOW_NODE_DURATION, Duration.ofMillis(0)); + + return configuration; + } +} diff --git a/flink/v1.18/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/AbstractIcebergEnumerator.java b/flink/v1.18/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/AbstractIcebergEnumerator.java index 6c9a855bc149..280a126a46ce 100644 --- a/flink/v1.18/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/AbstractIcebergEnumerator.java +++ b/flink/v1.18/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/AbstractIcebergEnumerator.java @@ -29,6 +29,7 @@ import org.apache.flink.api.connector.source.SourceEvent; import org.apache.flink.api.connector.source.SplitEnumerator; import org.apache.flink.api.connector.source.SplitEnumeratorContext; +import org.apache.flink.api.connector.source.SupportsHandleExecutionAttemptSourceEvent; import org.apache.iceberg.flink.source.assigner.GetSplitResult; import org.apache.iceberg.flink.source.assigner.SplitAssigner; import org.apache.iceberg.flink.source.split.IcebergSourceSplit; @@ -37,7 +38,8 @@ import 
org.slf4j.LoggerFactory; abstract class AbstractIcebergEnumerator - implements SplitEnumerator { + implements SplitEnumerator, + SupportsHandleExecutionAttemptSourceEvent { private static final Logger LOG = LoggerFactory.getLogger(AbstractIcebergEnumerator.class); private final SplitEnumeratorContext enumeratorContext; @@ -95,6 +97,13 @@ public void handleSourceEvent(int subtaskId, SourceEvent sourceEvent) { } } + // Flink's SourceCoordinator already keeps track of subTask to splits mapping. + // It already takes care of re-assigning splits to speculated attempts as well. + @Override + public void handleSourceEvent(int subTaskId, int attemptNumber, SourceEvent sourceEvent) { + handleSourceEvent(subTaskId, sourceEvent); + } + @Override public void addSplitsBack(List splits, int subtaskId) { LOG.info("Add {} splits back to the pool for failed subtask {}", splits.size(), subtaskId); diff --git a/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSpeculativeExecutionSupport.java b/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSpeculativeExecutionSupport.java new file mode 100644 index 000000000000..95d0b90b6ca9 --- /dev/null +++ b/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSpeculativeExecutionSupport.java @@ -0,0 +1,184 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
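As a condensed reference for the configure() method used by these tests, the sketch below collects only the speculative-execution options (the same Flink ConfigOptions the tests set); the near-zero baselines are meant to force speculation quickly in a test and are not production guidance.

    import java.time.Duration;
    import org.apache.flink.configuration.BatchExecutionOptions;
    import org.apache.flink.configuration.Configuration;
    import org.apache.flink.configuration.SlowTaskDetectorOptions;

    final class SpeculativeExecutionTestConfig {
      private SpeculativeExecutionTestConfig() {}

      // Mirrors configure() above: enable speculative execution and make the
      // slow-task detector consider tasks slow almost immediately.
      static Configuration create() {
        Configuration conf = new Configuration();
        conf.set(BatchExecutionOptions.SPECULATIVE_ENABLED, true);
        conf.set(BatchExecutionOptions.BLOCK_SLOW_NODE_DURATION, Duration.ofMillis(0));
        conf.set(SlowTaskDetectorOptions.EXECUTION_TIME_BASELINE_MULTIPLIER, 1.0);
        conf.set(SlowTaskDetectorOptions.EXECUTION_TIME_BASELINE_RATIO, 0.2);
        conf.set(SlowTaskDetectorOptions.EXECUTION_TIME_BASELINE_LOWER_BOUND, Duration.ofMillis(0));
        return conf;
      }
    }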
+ */ +package org.apache.iceberg.flink.source; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.IOException; +import java.nio.file.Files; +import java.time.Duration; +import java.util.Arrays; +import java.util.List; +import java.util.stream.Collectors; +import org.apache.flink.api.common.RuntimeExecutionMode; +import org.apache.flink.api.common.functions.RichMapFunction; +import org.apache.flink.api.common.typeinfo.Types; +import org.apache.flink.configuration.BatchExecutionOptions; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.configuration.CoreOptions; +import org.apache.flink.configuration.JobManagerOptions; +import org.apache.flink.configuration.RestOptions; +import org.apache.flink.configuration.SlowTaskDetectorOptions; +import org.apache.flink.runtime.testutils.MiniClusterResourceConfiguration; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.Table; +import org.apache.flink.table.api.TableEnvironment; +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; +import org.apache.flink.test.junit5.MiniClusterExtension; +import org.apache.flink.types.Row; +import org.apache.iceberg.flink.FlinkConfigOptions; +import org.apache.iceberg.flink.TestBase; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.RegisterExtension; + +public class TestIcebergSpeculativeExecutionSupport extends TestBase { + private static final int NUM_TASK_MANAGERS = 1; + private static final int NUM_TASK_SLOTS = 3; + + @RegisterExtension + public static MiniClusterExtension miniClusterResource = + new MiniClusterExtension( + new MiniClusterResourceConfiguration.Builder() + .setNumberTaskManagers(NUM_TASK_MANAGERS) + .setNumberSlotsPerTaskManager(NUM_TASK_SLOTS) + .setConfiguration(configure()) + .build()); + + private StreamTableEnvironment tEnv; + private static final String CATALOG_NAME = "test_catalog"; + private static final String DATABASE_NAME = "test_db"; + private static final String INPUT_TABLE_NAME = "test_table"; + private static final String OUTPUT_TABLE_NAME = "sink_table"; + + @Override + protected TableEnvironment getTableEnv() { + if (tEnv == null) { + synchronized (this) { + StreamExecutionEnvironment env = + StreamExecutionEnvironment.getExecutionEnvironment(configure()); + env.setRuntimeMode(RuntimeExecutionMode.BATCH); + tEnv = StreamTableEnvironment.create(env); + } + } + + return tEnv; + } + + @BeforeEach + public void before() throws IOException { + String warehouse = + String.format("file:%s", Files.createTempDirectory(temporaryDirectory, "junit").toString()); + sql( + "CREATE CATALOG %s WITH ('type'='iceberg', 'catalog-type'='hadoop', 'warehouse'='%s')", + CATALOG_NAME, warehouse); + sql("USE CATALOG %s", CATALOG_NAME); + sql("CREATE DATABASE %s", DATABASE_NAME); + sql("USE %s", DATABASE_NAME); + + sql("CREATE TABLE %s (i INT, j INT)", INPUT_TABLE_NAME); + sql("INSERT INTO %s VALUES (1, -1),(2, -1),(3, -1)", INPUT_TABLE_NAME); + sql("CREATE TABLE %s (i INT, j INT, subTask INT, attempt INT)", OUTPUT_TABLE_NAME); + } + + @AfterEach + public void after() { + sql("DROP TABLE IF EXISTS %s.%s", DATABASE_NAME, INPUT_TABLE_NAME); + sql("DROP TABLE IF EXISTS %s.%s", DATABASE_NAME, OUTPUT_TABLE_NAME); + sql("DROP DATABASE %s", DATABASE_NAME); + dropCatalog(CATALOG_NAME, true); + } + + @Test + 
public void testSpeculativeExecution() throws Exception { + Table table = + tEnv.sqlQuery(String.format("SELECT * FROM %s.%s", DATABASE_NAME, INPUT_TABLE_NAME)); + DataStream slowStream = + tEnv.toDataStream(table, Row.class) + .map(new TestingMap()) + .name("test_map") + .returns( + Types.ROW_NAMED( + new String[] {"i", "j", "subTask", "attempt"}, + Types.INT, + Types.INT, + Types.INT, + Types.INT)) + .setParallelism(NUM_TASK_SLOTS); + + tEnv.fromDataStream(slowStream) + .executeInsert(String.format("%s.%s", DATABASE_NAME, OUTPUT_TABLE_NAME)) + .await(); + + List output = sql(String.format("SELECT * FROM %s.%s", DATABASE_NAME, OUTPUT_TABLE_NAME)); + + // Ensure that all subTasks has attemptNum > 0 + assertThat(output.stream().map(x -> x.getField(3)).collect(Collectors.toSet())).contains(1); + + // Ensure the test_table rows are returned exactly the same after the slow map task from the + // sink_table + assertSameElements( + output.stream().map(x -> Row.of(x.getField(0), x.getField(1))).collect(Collectors.toList()), + Arrays.asList(Row.of(1, -1), Row.of(2, -1), Row.of(3, -1))); + } + + /** A testing map function that simulates the slow task. */ + private static class TestingMap extends RichMapFunction { + @Override + public Row map(Row row) throws Exception { + // Put the subtasks with the first attempt to sleep to trigger speculative + // execution + if (getRuntimeContext().getAttemptNumber() <= 0) { + Thread.sleep(Integer.MAX_VALUE); + } + + Row output = + Row.of( + row.getField(0), + row.getField(1), + getRuntimeContext().getIndexOfThisSubtask(), + getRuntimeContext().getAttemptNumber()); + + return output; + } + } + + private static Configuration configure() { + Configuration configuration = new Configuration(); + configuration.set(CoreOptions.CHECK_LEAKED_CLASSLOADER, false); + configuration.set(RestOptions.BIND_PORT, "0"); + configuration.set(JobManagerOptions.SLOT_REQUEST_TIMEOUT, 5000L); + + // Use FLIP-27 source + configuration.set(FlinkConfigOptions.TABLE_EXEC_ICEBERG_USE_FLIP27_SOURCE, true); + + // for speculative execution + configuration.set(BatchExecutionOptions.SPECULATIVE_ENABLED, true); + + configuration.set(SlowTaskDetectorOptions.EXECUTION_TIME_BASELINE_MULTIPLIER, 1.0); + configuration.set(SlowTaskDetectorOptions.EXECUTION_TIME_BASELINE_RATIO, 0.2); + configuration.set( + SlowTaskDetectorOptions.EXECUTION_TIME_BASELINE_LOWER_BOUND, Duration.ofMillis(0)); + configuration.set(BatchExecutionOptions.BLOCK_SLOW_NODE_DURATION, Duration.ofMillis(0)); + + return configuration; + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSpeculativeExecutionSupport.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSpeculativeExecutionSupport.java index b21010a91bed..41b023b93617 100644 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSpeculativeExecutionSupport.java +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSpeculativeExecutionSupport.java @@ -144,7 +144,7 @@ public void testSpeculativeExecution() throws Exception { private static class TestingMap extends RichMapFunction { @Override public Row map(Row row) throws Exception { - // Put the even subtask indices with the first attempt to sleep to trigger speculative + // Put the subtasks with the first attempt to sleep to trigger speculative // execution if (getRuntimeContext().getTaskInfo().getAttemptNumber() <= 0) { Thread.sleep(Integer.MAX_VALUE); From 806da5cfc7dba7b8fd872cf7fc6a6b36ac8a3876 
Mon Sep 17 00:00:00 2001 From: Eduard Tudenhoefner Date: Thu, 1 Aug 2024 09:30:04 +0200 Subject: [PATCH 08/55] Infra: Improve feature request template (#10825) --- .github/ISSUE_TEMPLATE/iceberg_improvement.yml | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/.github/ISSUE_TEMPLATE/iceberg_improvement.yml b/.github/ISSUE_TEMPLATE/iceberg_improvement.yml index f3d6d6de923a..80048f99465a 100644 --- a/.github/ISSUE_TEMPLATE/iceberg_improvement.yml +++ b/.github/ISSUE_TEMPLATE/iceberg_improvement.yml @@ -50,4 +50,12 @@ body: - Hive - Other validations: - required: false \ No newline at end of file + required: false + - type: checkboxes + attributes: + label: Willingness to contribute + description: The Apache Iceberg community encourages contributions. Would you or another member of your organization be willing to contribute this improvement/feature to the Apache Iceberg codebase? + options: + - label: I can contribute this improvement/feature independently + - label: I would be willing to contribute this improvement/feature with guidance from the Iceberg community + - label: I cannot contribute this improvement/feature at this time From 99b8e88a88486f541b0ad2703fdc97ab615c5398 Mon Sep 17 00:00:00 2001 From: hsiang-c <137842490+hsiang-c@users.noreply.github.com> Date: Thu, 1 Aug 2024 23:23:05 +0800 Subject: [PATCH 09/55] Core: Replace the duplicated ALL_DATA_FILES with ALL_DELETE_FILES (#10836) --- .../java/org/apache/iceberg/TestMetadataTableFilters.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/src/test/java/org/apache/iceberg/TestMetadataTableFilters.java b/core/src/test/java/org/apache/iceberg/TestMetadataTableFilters.java index 8125e064f0d0..9e535dd77747 100644 --- a/core/src/test/java/org/apache/iceberg/TestMetadataTableFilters.java +++ b/core/src/test/java/org/apache/iceberg/TestMetadataTableFilters.java @@ -40,7 +40,7 @@ public class TestMetadataTableFilters extends TestBase { private static final Set AGG_FILE_TABLES = Sets.newHashSet( MetadataTableType.ALL_DATA_FILES, - MetadataTableType.ALL_DATA_FILES, + MetadataTableType.ALL_DELETE_FILES, MetadataTableType.ALL_FILES, MetadataTableType.ALL_ENTRIES); @@ -132,9 +132,9 @@ private int expectedScanTaskCount(int partitions) { } case DATA_FILES: case DELETE_FILES: - case ALL_DELETE_FILES: return partitions; case ALL_DATA_FILES: + case ALL_DELETE_FILES: return partitions * 2; // ScanTask for Data Manifest in DELETED and ADDED states case ALL_FILES: case ALL_ENTRIES: From eb9d3951eeefc51824b87d36ca3824f7a968e81e Mon Sep 17 00:00:00 2001 From: Russell Spitzer Date: Thu, 1 Aug 2024 10:23:49 -0500 Subject: [PATCH 10/55] Core: Adds Basic Classes for Iceberg Table Version 3 (#10760) --- .../iceberg/BaseUpdatePartitionSpec.java | 2 +- .../org/apache/iceberg/ManifestFiles.java | 4 + .../apache/iceberg/ManifestListWriter.java | 35 ++ .../org/apache/iceberg/ManifestLists.java | 3 + .../org/apache/iceberg/ManifestWriter.java | 73 +++ .../org/apache/iceberg/TableMetadata.java | 2 +- .../java/org/apache/iceberg/V3Metadata.java | 575 ++++++++++++++++++ .../iceberg/DeleteFileIndexTestBase.java | 2 +- .../iceberg/MetadataTableScanTestBase.java | 2 +- .../ScanPlanningAndReportingTestBase.java | 2 +- .../java/org/apache/iceberg/TestBase.java | 2 +- .../org/apache/iceberg/TestBatchScans.java | 2 +- .../apache/iceberg/TestCommitReporting.java | 2 +- .../apache/iceberg/TestCreateTransaction.java | 2 +- .../iceberg/TestEntriesMetadataTable.java | 2 +- .../org/apache/iceberg/TestFastAppend.java | 2 +- 
.../org/apache/iceberg/TestFindFiles.java | 2 +- .../apache/iceberg/TestFormatVersions.java | 2 +- .../iceberg/TestIncrementalDataTableScan.java | 2 +- .../apache/iceberg/TestLocalFilterFiles.java | 2 +- .../apache/iceberg/TestLocationProvider.java | 2 +- .../apache/iceberg/TestManifestCleanup.java | 2 +- .../iceberg/TestManifestReaderStats.java | 2 +- .../apache/iceberg/TestManifestWriter.java | 2 +- .../iceberg/TestMetadataTableScans.java | 46 +- ...adataTableScansWithPartitionEvolution.java | 3 +- .../java/org/apache/iceberg/TestMetrics.java | 2 +- .../org/apache/iceberg/TestMetricsModes.java | 2 +- .../apache/iceberg/TestMicroBatchBuilder.java | 2 +- .../apache/iceberg/TestPartitionSpecInfo.java | 2 +- .../iceberg/TestReplaceTransaction.java | 2 +- .../apache/iceberg/TestRewriteManifests.java | 2 +- .../org/apache/iceberg/TestScanSummary.java | 2 +- .../iceberg/TestScansAndSchemaEvolution.java | 2 +- .../iceberg/TestSchemaAndMappingUpdate.java | 2 +- .../java/org/apache/iceberg/TestSchemaID.java | 2 +- .../iceberg/TestSetPartitionStatistics.java | 2 +- .../org/apache/iceberg/TestSetStatistics.java | 2 +- .../java/org/apache/iceberg/TestSnapshot.java | 2 +- .../apache/iceberg/TestSnapshotLoading.java | 2 +- .../apache/iceberg/TestSnapshotManager.java | 2 +- .../apache/iceberg/TestSnapshotSelection.java | 2 +- .../apache/iceberg/TestSnapshotSummary.java | 2 +- .../org/apache/iceberg/TestSortOrder.java | 2 +- .../org/apache/iceberg/TestSplitPlanning.java | 2 +- .../TestTableMetadataSerialization.java | 2 +- .../iceberg/TestTableUpdatePartitionSpec.java | 2 +- .../iceberg/TestTimestampPartitions.java | 2 +- .../org/apache/iceberg/TestTransaction.java | 2 +- .../iceberg/TestUpdatePartitionSpec.java | 2 +- .../org/apache/iceberg/TestWapWorkflow.java | 2 +- .../actions/TestSizeBasedRewriter.java | 2 +- .../iceberg/io/TestOutputFileFactory.java | 2 +- .../iceberg/mapping/TestMappingUpdates.java | 2 +- .../TableMetadataUnsupportedVersion.json | 4 +- 55 files changed, 762 insertions(+), 75 deletions(-) create mode 100644 core/src/main/java/org/apache/iceberg/V3Metadata.java diff --git a/core/src/main/java/org/apache/iceberg/BaseUpdatePartitionSpec.java b/core/src/main/java/org/apache/iceberg/BaseUpdatePartitionSpec.java index b59292c397a1..2e1c9199174c 100644 --- a/core/src/main/java/org/apache/iceberg/BaseUpdatePartitionSpec.java +++ b/core/src/main/java/org/apache/iceberg/BaseUpdatePartitionSpec.java @@ -118,7 +118,7 @@ private int assignFieldId() { */ private PartitionField recycleOrCreatePartitionField( Pair> sourceTransform, String name) { - if (formatVersion == 2 && base != null) { + if (formatVersion >= 2 && base != null) { int sourceId = sourceTransform.first(); Transform transform = sourceTransform.second(); diff --git a/core/src/main/java/org/apache/iceberg/ManifestFiles.java b/core/src/main/java/org/apache/iceberg/ManifestFiles.java index 840c90bebdde..f630bb3eb743 100644 --- a/core/src/main/java/org/apache/iceberg/ManifestFiles.java +++ b/core/src/main/java/org/apache/iceberg/ManifestFiles.java @@ -182,6 +182,8 @@ public static ManifestWriter write( return new ManifestWriter.V1Writer(spec, encryptedOutputFile, snapshotId); case 2: return new ManifestWriter.V2Writer(spec, encryptedOutputFile, snapshotId); + case 3: + return new ManifestWriter.V3Writer(spec, encryptedOutputFile, snapshotId); } throw new UnsupportedOperationException( "Cannot write manifest for table version: " + formatVersion); @@ -238,6 +240,8 @@ public static ManifestWriter writeDeleteManifest( throw new 
IllegalArgumentException("Cannot write delete files in a v1 table"); case 2: return new ManifestWriter.V2DeleteWriter(spec, outputFile, snapshotId); + case 3: + return new ManifestWriter.V3DeleteWriter(spec, outputFile, snapshotId); } throw new UnsupportedOperationException( "Cannot write manifest for table version: " + formatVersion); diff --git a/core/src/main/java/org/apache/iceberg/ManifestListWriter.java b/core/src/main/java/org/apache/iceberg/ManifestListWriter.java index 3f7f20d4df6c..b17eedad18af 100644 --- a/core/src/main/java/org/apache/iceberg/ManifestListWriter.java +++ b/core/src/main/java/org/apache/iceberg/ManifestListWriter.java @@ -70,6 +70,41 @@ public long length() { return writer.length(); } + static class V3Writer extends ManifestListWriter { + private final V3Metadata.IndexedManifestFile wrapper; + + V3Writer(OutputFile snapshotFile, long snapshotId, Long parentSnapshotId, long sequenceNumber) { + super( + snapshotFile, + ImmutableMap.of( + "snapshot-id", String.valueOf(snapshotId), + "parent-snapshot-id", String.valueOf(parentSnapshotId), + "sequence-number", String.valueOf(sequenceNumber), + "format-version", "3")); + this.wrapper = new V3Metadata.IndexedManifestFile(snapshotId, sequenceNumber); + } + + @Override + protected ManifestFile prepare(ManifestFile manifest) { + return wrapper.wrap(manifest); + } + + @Override + protected FileAppender newAppender(OutputFile file, Map meta) { + try { + return Avro.write(file) + .schema(V3Metadata.MANIFEST_LIST_SCHEMA) + .named("manifest_file") + .meta(meta) + .overwrite() + .build(); + + } catch (IOException e) { + throw new RuntimeIOException(e, "Failed to create snapshot list writer for path: %s", file); + } + } + } + static class V2Writer extends ManifestListWriter { private final V2Metadata.IndexedManifestFile wrapper; diff --git a/core/src/main/java/org/apache/iceberg/ManifestLists.java b/core/src/main/java/org/apache/iceberg/ManifestLists.java index c7b3e5fee5a9..f20a481cf25a 100644 --- a/core/src/main/java/org/apache/iceberg/ManifestLists.java +++ b/core/src/main/java/org/apache/iceberg/ManifestLists.java @@ -66,6 +66,9 @@ static ManifestListWriter write( case 2: return new ManifestListWriter.V2Writer( manifestListFile, snapshotId, parentSnapshotId, sequenceNumber); + case 3: + return new ManifestListWriter.V3Writer( + manifestListFile, snapshotId, parentSnapshotId, sequenceNumber); } throw new UnsupportedOperationException( "Cannot write manifest list for table version: " + formatVersion); diff --git a/core/src/main/java/org/apache/iceberg/ManifestWriter.java b/core/src/main/java/org/apache/iceberg/ManifestWriter.java index cea907ddaca1..88587a1ebc89 100644 --- a/core/src/main/java/org/apache/iceberg/ManifestWriter.java +++ b/core/src/main/java/org/apache/iceberg/ManifestWriter.java @@ -217,6 +217,79 @@ public void close() throws IOException { writer.close(); } + static class V3Writer extends ManifestWriter { + private final V3Metadata.IndexedManifestEntry entryWrapper; + + V3Writer(PartitionSpec spec, EncryptedOutputFile file, Long snapshotId) { + super(spec, file, snapshotId); + this.entryWrapper = new V3Metadata.IndexedManifestEntry<>(snapshotId, spec.partitionType()); + } + + @Override + protected ManifestEntry prepare(ManifestEntry entry) { + return entryWrapper.wrap(entry); + } + + @Override + protected FileAppender> newAppender( + PartitionSpec spec, OutputFile file) { + Schema manifestSchema = V3Metadata.entrySchema(spec.partitionType()); + try { + return Avro.write(file) + .schema(manifestSchema) + 
.named("manifest_entry") + .meta("schema", SchemaParser.toJson(spec.schema())) + .meta("partition-spec", PartitionSpecParser.toJsonFields(spec)) + .meta("partition-spec-id", String.valueOf(spec.specId())) + .meta("format-version", "3") + .meta("content", "data") + .overwrite() + .build(); + } catch (IOException e) { + throw new RuntimeIOException(e, "Failed to create manifest writer for path: %s", file); + } + } + } + + static class V3DeleteWriter extends ManifestWriter { + private final V3Metadata.IndexedManifestEntry entryWrapper; + + V3DeleteWriter(PartitionSpec spec, EncryptedOutputFile file, Long snapshotId) { + super(spec, file, snapshotId); + this.entryWrapper = new V3Metadata.IndexedManifestEntry<>(snapshotId, spec.partitionType()); + } + + @Override + protected ManifestEntry prepare(ManifestEntry entry) { + return entryWrapper.wrap(entry); + } + + @Override + protected FileAppender> newAppender( + PartitionSpec spec, OutputFile file) { + Schema manifestSchema = V3Metadata.entrySchema(spec.partitionType()); + try { + return Avro.write(file) + .schema(manifestSchema) + .named("manifest_entry") + .meta("schema", SchemaParser.toJson(spec.schema())) + .meta("partition-spec", PartitionSpecParser.toJsonFields(spec)) + .meta("partition-spec-id", String.valueOf(spec.specId())) + .meta("format-version", "3") + .meta("content", "deletes") + .overwrite() + .build(); + } catch (IOException e) { + throw new RuntimeIOException(e, "Failed to create manifest writer for path: %s", file); + } + } + + @Override + protected ManifestContent content() { + return ManifestContent.DELETES; + } + } + static class V2Writer extends ManifestWriter { private final V2Metadata.IndexedManifestEntry entryWrapper; diff --git a/core/src/main/java/org/apache/iceberg/TableMetadata.java b/core/src/main/java/org/apache/iceberg/TableMetadata.java index 74b8ad0bbddc..bd1c8a1a0371 100644 --- a/core/src/main/java/org/apache/iceberg/TableMetadata.java +++ b/core/src/main/java/org/apache/iceberg/TableMetadata.java @@ -51,7 +51,7 @@ public class TableMetadata implements Serializable { static final long INITIAL_SEQUENCE_NUMBER = 0; static final long INVALID_SEQUENCE_NUMBER = -1; static final int DEFAULT_TABLE_FORMAT_VERSION = 2; - static final int SUPPORTED_TABLE_FORMAT_VERSION = 2; + static final int SUPPORTED_TABLE_FORMAT_VERSION = 3; static final int INITIAL_SPEC_ID = 0; static final int INITIAL_SORT_ORDER_ID = 1; static final int INITIAL_SCHEMA_ID = 0; diff --git a/core/src/main/java/org/apache/iceberg/V3Metadata.java b/core/src/main/java/org/apache/iceberg/V3Metadata.java new file mode 100644 index 000000000000..94e20ea99858 --- /dev/null +++ b/core/src/main/java/org/apache/iceberg/V3Metadata.java @@ -0,0 +1,575 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
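With SUPPORTED_TABLE_FORMAT_VERSION raised to 3 above, the existing "format-version" table property can request the new version at create time. A minimal usage sketch, assuming a build that includes this change and a Hadoop catalog over a placeholder warehouse path (the warehouse location, namespace, and table name are hypothetical):

    import java.util.Collections;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.iceberg.PartitionSpec;
    import org.apache.iceberg.Schema;
    import org.apache.iceberg.Table;
    import org.apache.iceberg.catalog.TableIdentifier;
    import org.apache.iceberg.hadoop.HadoopCatalog;
    import org.apache.iceberg.types.Types;

    public class CreateV3TableExample {
      public static void main(String[] args) {
        Schema schema =
            new Schema(
                Types.NestedField.required(1, "id", Types.LongType.get()),
                Types.NestedField.optional(2, "data", Types.StringType.get()));

        HadoopCatalog catalog = new HadoopCatalog(new Configuration(), "file:///tmp/warehouse");
        Table table =
            catalog.createTable(
                TableIdentifier.of("db", "v3_table"),
                schema,
                PartitionSpec.unpartitioned(),
                Collections.singletonMap("format-version", "3"));
        System.out.println(table.location());
      }
    }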
+ */ +package org.apache.iceberg; + +import static org.apache.iceberg.types.Types.NestedField.required; + +import java.nio.ByteBuffer; +import java.util.List; +import java.util.Map; +import java.util.Set; +import org.apache.avro.generic.IndexedRecord; +import org.apache.iceberg.avro.AvroSchemaUtil; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.types.Types; + +class V3Metadata { + private V3Metadata() {} + + static final Schema MANIFEST_LIST_SCHEMA = + new Schema( + ManifestFile.PATH, + ManifestFile.LENGTH, + ManifestFile.SPEC_ID, + ManifestFile.MANIFEST_CONTENT.asRequired(), + ManifestFile.SEQUENCE_NUMBER.asRequired(), + ManifestFile.MIN_SEQUENCE_NUMBER.asRequired(), + ManifestFile.SNAPSHOT_ID.asRequired(), + ManifestFile.ADDED_FILES_COUNT.asRequired(), + ManifestFile.EXISTING_FILES_COUNT.asRequired(), + ManifestFile.DELETED_FILES_COUNT.asRequired(), + ManifestFile.ADDED_ROWS_COUNT.asRequired(), + ManifestFile.EXISTING_ROWS_COUNT.asRequired(), + ManifestFile.DELETED_ROWS_COUNT.asRequired(), + ManifestFile.PARTITION_SUMMARIES, + ManifestFile.KEY_METADATA); + + /** + * A wrapper class to write any ManifestFile implementation to Avro using the v3 write schema. + * + *
This is used to maintain compatibility with v3 by writing manifest list files with the old + * schema, instead of writing a sequence number into metadata files in v3 tables. + */ + static class IndexedManifestFile implements ManifestFile, IndexedRecord { + private static final org.apache.avro.Schema AVRO_SCHEMA = + AvroSchemaUtil.convert(MANIFEST_LIST_SCHEMA, "manifest_file"); + + private final long commitSnapshotId; + private final long sequenceNumber; + private ManifestFile wrapped = null; + + IndexedManifestFile(long commitSnapshotId, long sequenceNumber) { + this.commitSnapshotId = commitSnapshotId; + this.sequenceNumber = sequenceNumber; + } + + public ManifestFile wrap(ManifestFile file) { + this.wrapped = file; + return this; + } + + @Override + public org.apache.avro.Schema getSchema() { + return AVRO_SCHEMA; + } + + @Override + public void put(int i, Object v) { + throw new UnsupportedOperationException("Cannot modify IndexedManifestFile wrapper via put"); + } + + @Override + public Object get(int pos) { + switch (pos) { + case 0: + return wrapped.path(); + case 1: + return wrapped.length(); + case 2: + return wrapped.partitionSpecId(); + case 3: + return wrapped.content().id(); + case 4: + if (wrapped.sequenceNumber() == ManifestWriter.UNASSIGNED_SEQ) { + // if the sequence number is being assigned here, then the manifest must be created by + // the current + // operation. to validate this, check that the snapshot id matches the current commit + Preconditions.checkState( + commitSnapshotId == wrapped.snapshotId(), + "Found unassigned sequence number for a manifest from snapshot: %s", + wrapped.snapshotId()); + return sequenceNumber; + } else { + return wrapped.sequenceNumber(); + } + case 5: + if (wrapped.minSequenceNumber() == ManifestWriter.UNASSIGNED_SEQ) { + // same sanity check as above + Preconditions.checkState( + commitSnapshotId == wrapped.snapshotId(), + "Found unassigned sequence number for a manifest from snapshot: %s", + wrapped.snapshotId()); + // if the min sequence number is not determined, then there was no assigned sequence + // number for any file + // written to the wrapped manifest. 
replace the unassigned sequence number with the one + // for this commit + return sequenceNumber; + } else { + return wrapped.minSequenceNumber(); + } + case 6: + return wrapped.snapshotId(); + case 7: + return wrapped.addedFilesCount(); + case 8: + return wrapped.existingFilesCount(); + case 9: + return wrapped.deletedFilesCount(); + case 10: + return wrapped.addedRowsCount(); + case 11: + return wrapped.existingRowsCount(); + case 12: + return wrapped.deletedRowsCount(); + case 13: + return wrapped.partitions(); + case 14: + return wrapped.keyMetadata(); + default: + throw new UnsupportedOperationException("Unknown field ordinal: " + pos); + } + } + + @Override + public String path() { + return wrapped.path(); + } + + @Override + public long length() { + return wrapped.length(); + } + + @Override + public int partitionSpecId() { + return wrapped.partitionSpecId(); + } + + @Override + public ManifestContent content() { + return wrapped.content(); + } + + @Override + public long sequenceNumber() { + return wrapped.sequenceNumber(); + } + + @Override + public long minSequenceNumber() { + return wrapped.minSequenceNumber(); + } + + @Override + public Long snapshotId() { + return wrapped.snapshotId(); + } + + @Override + public boolean hasAddedFiles() { + return wrapped.hasAddedFiles(); + } + + @Override + public Integer addedFilesCount() { + return wrapped.addedFilesCount(); + } + + @Override + public Long addedRowsCount() { + return wrapped.addedRowsCount(); + } + + @Override + public boolean hasExistingFiles() { + return wrapped.hasExistingFiles(); + } + + @Override + public Integer existingFilesCount() { + return wrapped.existingFilesCount(); + } + + @Override + public Long existingRowsCount() { + return wrapped.existingRowsCount(); + } + + @Override + public boolean hasDeletedFiles() { + return wrapped.hasDeletedFiles(); + } + + @Override + public Integer deletedFilesCount() { + return wrapped.deletedFilesCount(); + } + + @Override + public Long deletedRowsCount() { + return wrapped.deletedRowsCount(); + } + + @Override + public List partitions() { + return wrapped.partitions(); + } + + @Override + public ByteBuffer keyMetadata() { + return wrapped.keyMetadata(); + } + + @Override + public ManifestFile copy() { + return wrapped.copy(); + } + } + + static Schema entrySchema(Types.StructType partitionType) { + return wrapFileSchema(fileType(partitionType)); + } + + static Schema wrapFileSchema(Types.StructType fileSchema) { + // this is used to build projection schemas + return new Schema( + ManifestEntry.STATUS, + ManifestEntry.SNAPSHOT_ID, + ManifestEntry.SEQUENCE_NUMBER, + ManifestEntry.FILE_SEQUENCE_NUMBER, + required(ManifestEntry.DATA_FILE_ID, "data_file", fileSchema)); + } + + static Types.StructType fileType(Types.StructType partitionType) { + return Types.StructType.of( + DataFile.CONTENT.asRequired(), + DataFile.FILE_PATH, + DataFile.FILE_FORMAT, + required( + DataFile.PARTITION_ID, DataFile.PARTITION_NAME, partitionType, DataFile.PARTITION_DOC), + DataFile.RECORD_COUNT, + DataFile.FILE_SIZE, + DataFile.COLUMN_SIZES, + DataFile.VALUE_COUNTS, + DataFile.NULL_VALUE_COUNTS, + DataFile.NAN_VALUE_COUNTS, + DataFile.LOWER_BOUNDS, + DataFile.UPPER_BOUNDS, + DataFile.KEY_METADATA, + DataFile.SPLIT_OFFSETS, + DataFile.EQUALITY_IDS, + DataFile.SORT_ORDER_ID); + } + + static class IndexedManifestEntry> + implements ManifestEntry, IndexedRecord { + private final org.apache.avro.Schema avroSchema; + private final Long commitSnapshotId; + private final IndexedDataFile fileWrapper; + private 
ManifestEntry wrapped = null; + + IndexedManifestEntry(Long commitSnapshotId, Types.StructType partitionType) { + this.avroSchema = AvroSchemaUtil.convert(entrySchema(partitionType), "manifest_entry"); + this.commitSnapshotId = commitSnapshotId; + this.fileWrapper = new IndexedDataFile<>(partitionType); + } + + public IndexedManifestEntry wrap(ManifestEntry entry) { + this.wrapped = entry; + return this; + } + + @Override + public org.apache.avro.Schema getSchema() { + return avroSchema; + } + + @Override + public void put(int i, Object v) { + throw new UnsupportedOperationException("Cannot modify IndexedManifestEntry wrapper via put"); + } + + @Override + public Object get(int i) { + switch (i) { + case 0: + return wrapped.status().id(); + case 1: + return wrapped.snapshotId(); + case 2: + if (wrapped.dataSequenceNumber() == null) { + // if the entry's data sequence number is null, + // then it will inherit the sequence number of the current commit. + // to validate that this is correct, check that the snapshot id is either null (will + // also be inherited) or that it matches the id of the current commit. + Preconditions.checkState( + wrapped.snapshotId() == null || wrapped.snapshotId().equals(commitSnapshotId), + "Found unassigned sequence number for an entry from snapshot: %s", + wrapped.snapshotId()); + + // inheritance should work only for ADDED entries + Preconditions.checkState( + wrapped.status() == Status.ADDED, + "Only entries with status ADDED can have null sequence number"); + + return null; + } + return wrapped.dataSequenceNumber(); + case 3: + return wrapped.fileSequenceNumber(); + case 4: + return fileWrapper.wrap(wrapped.file()); + default: + throw new UnsupportedOperationException("Unknown field ordinal: " + i); + } + } + + @Override + public Status status() { + return wrapped.status(); + } + + @Override + public Long snapshotId() { + return wrapped.snapshotId(); + } + + @Override + public void setSnapshotId(long snapshotId) { + wrapped.setSnapshotId(snapshotId); + } + + @Override + public Long dataSequenceNumber() { + return wrapped.dataSequenceNumber(); + } + + @Override + public void setDataSequenceNumber(long dataSequenceNumber) { + wrapped.setDataSequenceNumber(dataSequenceNumber); + } + + @Override + public Long fileSequenceNumber() { + return wrapped.fileSequenceNumber(); + } + + @Override + public void setFileSequenceNumber(long fileSequenceNumber) { + wrapped.setFileSequenceNumber(fileSequenceNumber); + } + + @Override + public F file() { + return wrapped.file(); + } + + @Override + public ManifestEntry copy() { + return wrapped.copy(); + } + + @Override + public ManifestEntry copyWithoutStats() { + return wrapped.copyWithoutStats(); + } + } + + /** Wrapper used to write DataFile or DeleteFile to v3 metadata. 
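The null handling for data sequence numbers in IndexedManifestEntry above is subtle, so the hypothetical helper below restates the rule on its own (the enum and method are illustrative and not part of the patch): a null data sequence number may only be written for entries added by the committing snapshot, and readers later resolve it to the sequence number assigned to that commit.

    import org.apache.iceberg.relocated.com.google.common.base.Preconditions;

    final class SequenceNumberInheritance {
      enum Status {
        EXISTING,
        ADDED,
        DELETED
      }

      private SequenceNumberInheritance() {}

      // Same checks as IndexedManifestEntry.get(...): inheritance is only unambiguous when
      // the entry was added by the snapshot that is currently being committed.
      static void validateUnassignedSequenceNumber(
          Long entrySnapshotId, long commitSnapshotId, Status status) {
        Preconditions.checkState(
            entrySnapshotId == null || entrySnapshotId.equals(commitSnapshotId),
            "Found unassigned sequence number for an entry from snapshot: %s",
            entrySnapshotId);
        Preconditions.checkState(
            status == Status.ADDED, "Only entries with status ADDED can have null sequence number");
      }
    }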
*/ + static class IndexedDataFile implements ContentFile, IndexedRecord { + private final org.apache.avro.Schema avroSchema; + private final IndexedStructLike partitionWrapper; + private ContentFile wrapped = null; + + IndexedDataFile(Types.StructType partitionType) { + this.avroSchema = AvroSchemaUtil.convert(fileType(partitionType), "data_file"); + this.partitionWrapper = new IndexedStructLike(avroSchema.getField("partition").schema()); + } + + @SuppressWarnings("unchecked") + IndexedDataFile wrap(ContentFile file) { + this.wrapped = (ContentFile) file; + return this; + } + + @Override + public org.apache.avro.Schema getSchema() { + return avroSchema; + } + + @Override + public Object get(int pos) { + switch (pos) { + case 0: + return wrapped.content().id(); + case 1: + return wrapped.path().toString(); + case 2: + return wrapped.format() != null ? wrapped.format().toString() : null; + case 3: + return partitionWrapper.wrap(wrapped.partition()); + case 4: + return wrapped.recordCount(); + case 5: + return wrapped.fileSizeInBytes(); + case 6: + return wrapped.columnSizes(); + case 7: + return wrapped.valueCounts(); + case 8: + return wrapped.nullValueCounts(); + case 9: + return wrapped.nanValueCounts(); + case 10: + return wrapped.lowerBounds(); + case 11: + return wrapped.upperBounds(); + case 12: + return wrapped.keyMetadata(); + case 13: + return wrapped.splitOffsets(); + case 14: + return wrapped.equalityFieldIds(); + case 15: + return wrapped.sortOrderId(); + } + throw new IllegalArgumentException("Unknown field ordinal: " + pos); + } + + @Override + public void put(int i, Object v) { + throw new UnsupportedOperationException("Cannot modify IndexedDataFile wrapper via put"); + } + + @Override + public Long pos() { + return null; + } + + @Override + public int specId() { + return wrapped.specId(); + } + + @Override + public FileContent content() { + return wrapped.content(); + } + + @Override + public CharSequence path() { + return wrapped.path(); + } + + @Override + public FileFormat format() { + return wrapped.format(); + } + + @Override + public StructLike partition() { + return wrapped.partition(); + } + + @Override + public long recordCount() { + return wrapped.recordCount(); + } + + @Override + public long fileSizeInBytes() { + return wrapped.fileSizeInBytes(); + } + + @Override + public Map columnSizes() { + return wrapped.columnSizes(); + } + + @Override + public Map valueCounts() { + return wrapped.valueCounts(); + } + + @Override + public Map nullValueCounts() { + return wrapped.nullValueCounts(); + } + + @Override + public Map nanValueCounts() { + return wrapped.nanValueCounts(); + } + + @Override + public Map lowerBounds() { + return wrapped.lowerBounds(); + } + + @Override + public Map upperBounds() { + return wrapped.upperBounds(); + } + + @Override + public ByteBuffer keyMetadata() { + return wrapped.keyMetadata(); + } + + @Override + public List splitOffsets() { + return wrapped.splitOffsets(); + } + + @Override + public List equalityFieldIds() { + return wrapped.equalityFieldIds(); + } + + @Override + public Integer sortOrderId() { + return wrapped.sortOrderId(); + } + + @Override + public Long dataSequenceNumber() { + return wrapped.dataSequenceNumber(); + } + + @Override + public Long fileSequenceNumber() { + return wrapped.fileSequenceNumber(); + } + + @Override + public F copy() { + throw new UnsupportedOperationException("Cannot copy IndexedDataFile wrapper"); + } + + @Override + public F copyWithStats(Set requestedColumnIds) { + throw new 
UnsupportedOperationException("Cannot copy IndexedDataFile wrapper"); + } + + @Override + public F copyWithoutStats() { + throw new UnsupportedOperationException("Cannot copy IndexedDataFile wrapper"); + } + } +} diff --git a/core/src/test/java/org/apache/iceberg/DeleteFileIndexTestBase.java b/core/src/test/java/org/apache/iceberg/DeleteFileIndexTestBase.java index 229650566ca8..836a1ddd80f5 100644 --- a/core/src/test/java/org/apache/iceberg/DeleteFileIndexTestBase.java +++ b/core/src/test/java/org/apache/iceberg/DeleteFileIndexTestBase.java @@ -46,7 +46,7 @@ public abstract class DeleteFileIndexTestBase< @Parameters(name = "formatVersion = {0}") public static List parameters() { - return Arrays.asList(2); + return Arrays.asList(2, 3); } static final DeleteFile FILE_A_POS_1 = diff --git a/core/src/test/java/org/apache/iceberg/MetadataTableScanTestBase.java b/core/src/test/java/org/apache/iceberg/MetadataTableScanTestBase.java index a4e964b017ba..98d2d8f38af6 100644 --- a/core/src/test/java/org/apache/iceberg/MetadataTableScanTestBase.java +++ b/core/src/test/java/org/apache/iceberg/MetadataTableScanTestBase.java @@ -40,7 +40,7 @@ public abstract class MetadataTableScanTestBase extends TestBase { @Parameters(name = "formatVersion = {0}") protected static List parameters() { - return Arrays.asList(1, 2); + return Arrays.asList(1, 2, 3); } protected Set scannedPaths(TableScan scan) { diff --git a/core/src/test/java/org/apache/iceberg/ScanPlanningAndReportingTestBase.java b/core/src/test/java/org/apache/iceberg/ScanPlanningAndReportingTestBase.java index 3ba74c3b4b71..13e96869b454 100644 --- a/core/src/test/java/org/apache/iceberg/ScanPlanningAndReportingTestBase.java +++ b/core/src/test/java/org/apache/iceberg/ScanPlanningAndReportingTestBase.java @@ -47,7 +47,7 @@ public abstract class ScanPlanningAndReportingTestBase< @Parameters(name = "formatVersion = {0}") public static List parameters() { - return Arrays.asList(2); + return Arrays.asList(2, 3); } protected abstract ScanT newScan(Table table); diff --git a/core/src/test/java/org/apache/iceberg/TestBase.java b/core/src/test/java/org/apache/iceberg/TestBase.java index 2322062dad85..e03a1efd5156 100644 --- a/core/src/test/java/org/apache/iceberg/TestBase.java +++ b/core/src/test/java/org/apache/iceberg/TestBase.java @@ -174,7 +174,7 @@ public class TestBase { @Parameters(name = "formatVersion = {0}") protected static List parameters() { - return Arrays.asList(1, 2); + return Arrays.asList(1, 2, 3); } @Parameter protected int formatVersion; diff --git a/core/src/test/java/org/apache/iceberg/TestBatchScans.java b/core/src/test/java/org/apache/iceberg/TestBatchScans.java index c7210486e05e..1597f44f6338 100644 --- a/core/src/test/java/org/apache/iceberg/TestBatchScans.java +++ b/core/src/test/java/org/apache/iceberg/TestBatchScans.java @@ -34,7 +34,7 @@ public class TestBatchScans extends TestBase { @Parameters(name = "formatVersion = {0}") protected static List parameters() { - return Arrays.asList(1, 2); + return Arrays.asList(1, 2, 3); } @TestTemplate diff --git a/core/src/test/java/org/apache/iceberg/TestCommitReporting.java b/core/src/test/java/org/apache/iceberg/TestCommitReporting.java index bbba192fab7f..41b301668722 100644 --- a/core/src/test/java/org/apache/iceberg/TestCommitReporting.java +++ b/core/src/test/java/org/apache/iceberg/TestCommitReporting.java @@ -37,7 +37,7 @@ public class TestCommitReporting extends TestBase { @Parameters(name = "formatVersion = {0}") protected static List parameters() { - return Arrays.asList(2); + 
return Arrays.asList(2, 3); } @TestTemplate diff --git a/core/src/test/java/org/apache/iceberg/TestCreateTransaction.java b/core/src/test/java/org/apache/iceberg/TestCreateTransaction.java index 0c6b50b37792..766dd85a0655 100644 --- a/core/src/test/java/org/apache/iceberg/TestCreateTransaction.java +++ b/core/src/test/java/org/apache/iceberg/TestCreateTransaction.java @@ -40,7 +40,7 @@ public class TestCreateTransaction extends TestBase { @Parameters(name = "formatVersion = {0}") protected static List parameters() { - return Arrays.asList(1, 2); + return Arrays.asList(1, 2, 3); } @TestTemplate diff --git a/core/src/test/java/org/apache/iceberg/TestEntriesMetadataTable.java b/core/src/test/java/org/apache/iceberg/TestEntriesMetadataTable.java index 60e79aad91ce..9bce4e60a4f3 100644 --- a/core/src/test/java/org/apache/iceberg/TestEntriesMetadataTable.java +++ b/core/src/test/java/org/apache/iceberg/TestEntriesMetadataTable.java @@ -34,7 +34,7 @@ public class TestEntriesMetadataTable extends TestBase { @Parameters(name = "formatVersion = {0}") protected static List parameters() { - return Arrays.asList(1, 2); + return Arrays.asList(1, 2, 3); } @TestTemplate diff --git a/core/src/test/java/org/apache/iceberg/TestFastAppend.java b/core/src/test/java/org/apache/iceberg/TestFastAppend.java index 9dd479ecf0fc..b281536ab0fa 100644 --- a/core/src/test/java/org/apache/iceberg/TestFastAppend.java +++ b/core/src/test/java/org/apache/iceberg/TestFastAppend.java @@ -39,7 +39,7 @@ public class TestFastAppend extends TestBase { @Parameters(name = "formatVersion = {0}") protected static List parameters() { - return Arrays.asList(1, 2); + return Arrays.asList(1, 2, 3); } @TestTemplate diff --git a/core/src/test/java/org/apache/iceberg/TestFindFiles.java b/core/src/test/java/org/apache/iceberg/TestFindFiles.java index 68d757de5cdb..191dbd384197 100644 --- a/core/src/test/java/org/apache/iceberg/TestFindFiles.java +++ b/core/src/test/java/org/apache/iceberg/TestFindFiles.java @@ -36,7 +36,7 @@ public class TestFindFiles extends TestBase { @Parameters(name = "formatVersion = {0}") protected static List parameters() { - return Arrays.asList(1, 2); + return Arrays.asList(1, 2, 3); } @TestTemplate diff --git a/core/src/test/java/org/apache/iceberg/TestFormatVersions.java b/core/src/test/java/org/apache/iceberg/TestFormatVersions.java index 2f6a01c6e603..4a9f18581d09 100644 --- a/core/src/test/java/org/apache/iceberg/TestFormatVersions.java +++ b/core/src/test/java/org/apache/iceberg/TestFormatVersions.java @@ -71,7 +71,7 @@ public void testFormatVersionUpgradeNotSupported() { base, base.upgradeToFormatVersion(TableMetadata.SUPPORTED_TABLE_FORMAT_VERSION + 1))) .isInstanceOf(IllegalArgumentException.class) - .hasMessage("Cannot upgrade table to unsupported format version: v3 (supported: v2)"); + .hasMessage("Cannot upgrade table to unsupported format version: v4 (supported: v3)"); assertThat(ops.current().formatVersion()).isEqualTo(1); } diff --git a/core/src/test/java/org/apache/iceberg/TestIncrementalDataTableScan.java b/core/src/test/java/org/apache/iceberg/TestIncrementalDataTableScan.java index fe6b9b0c763c..ecd6a14ffefb 100644 --- a/core/src/test/java/org/apache/iceberg/TestIncrementalDataTableScan.java +++ b/core/src/test/java/org/apache/iceberg/TestIncrementalDataTableScan.java @@ -43,7 +43,7 @@ public class TestIncrementalDataTableScan extends TestBase { @Parameters(name = "formatVersion = {0}") protected static List parameters() { - return Arrays.asList(1, 2); + return Arrays.asList(1, 2, 3); } 
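The hunks above extend the project's own @Parameters providers so that every suite also runs against format version 3. As a plain JUnit 5 illustration of the same idea (this uses @ParameterizedTest rather than the repo's test extension, so it is not how these suites are actually wired):

    import static org.assertj.core.api.Assertions.assertThat;

    import java.util.Arrays;
    import java.util.List;
    import org.junit.jupiter.params.ParameterizedTest;
    import org.junit.jupiter.params.provider.ValueSource;

    class FormatVersionCoverageTest {
      // The format versions the updated parameters() methods now return.
      private static final List<Integer> SUPPORTED = Arrays.asList(1, 2, 3);

      @ParameterizedTest
      @ValueSource(ints = {1, 2, 3})
      void runsAgainstEverySupportedFormatVersion(int formatVersion) {
        assertThat(SUPPORTED).contains(formatVersion);
      }
    }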
@BeforeEach diff --git a/core/src/test/java/org/apache/iceberg/TestLocalFilterFiles.java b/core/src/test/java/org/apache/iceberg/TestLocalFilterFiles.java index 275b89df3695..8a4f5db256f0 100644 --- a/core/src/test/java/org/apache/iceberg/TestLocalFilterFiles.java +++ b/core/src/test/java/org/apache/iceberg/TestLocalFilterFiles.java @@ -28,7 +28,7 @@ public class TestLocalFilterFiles @Parameters(name = "formatVersion = {0}") public static List parameters() { - return Arrays.asList(1, 2); + return Arrays.asList(1, 2, 3); } @Override diff --git a/core/src/test/java/org/apache/iceberg/TestLocationProvider.java b/core/src/test/java/org/apache/iceberg/TestLocationProvider.java index 93dd1a8555ce..7afb69483490 100644 --- a/core/src/test/java/org/apache/iceberg/TestLocationProvider.java +++ b/core/src/test/java/org/apache/iceberg/TestLocationProvider.java @@ -34,7 +34,7 @@ public class TestLocationProvider extends TestBase { @Parameters(name = "formatVersion = {0}") protected static List parameters() { - return Arrays.asList(1, 2); + return Arrays.asList(1, 2, 3); } // publicly visible for testing to be dynamically loaded diff --git a/core/src/test/java/org/apache/iceberg/TestManifestCleanup.java b/core/src/test/java/org/apache/iceberg/TestManifestCleanup.java index b5f6d05cc6a0..37ccbb8bb845 100644 --- a/core/src/test/java/org/apache/iceberg/TestManifestCleanup.java +++ b/core/src/test/java/org/apache/iceberg/TestManifestCleanup.java @@ -30,7 +30,7 @@ public class TestManifestCleanup extends TestBase { @Parameters(name = "formatVersion = {0}") protected static List parameters() { - return Arrays.asList(1, 2); + return Arrays.asList(1, 2, 3); } @TestTemplate diff --git a/core/src/test/java/org/apache/iceberg/TestManifestReaderStats.java b/core/src/test/java/org/apache/iceberg/TestManifestReaderStats.java index 4c60a943f76c..175178e48167 100644 --- a/core/src/test/java/org/apache/iceberg/TestManifestReaderStats.java +++ b/core/src/test/java/org/apache/iceberg/TestManifestReaderStats.java @@ -39,7 +39,7 @@ public class TestManifestReaderStats extends TestBase { @Parameters(name = "formatVersion = {0}") protected static List parameters() { - return Arrays.asList(1, 2); + return Arrays.asList(1, 2, 3); } private static final Map VALUE_COUNT = ImmutableMap.of(3, 3L); diff --git a/core/src/test/java/org/apache/iceberg/TestManifestWriter.java b/core/src/test/java/org/apache/iceberg/TestManifestWriter.java index eb7910a79fc5..7dd3ea2d1ce7 100644 --- a/core/src/test/java/org/apache/iceberg/TestManifestWriter.java +++ b/core/src/test/java/org/apache/iceberg/TestManifestWriter.java @@ -38,7 +38,7 @@ public class TestManifestWriter extends TestBase { @Parameters(name = "formatVersion = {0}") protected static List parameters() { - return Arrays.asList(1, 2); + return Arrays.asList(1, 2, 3); } private static final int FILE_SIZE_CHECK_ROWS_DIVISOR = 250; diff --git a/core/src/test/java/org/apache/iceberg/TestMetadataTableScans.java b/core/src/test/java/org/apache/iceberg/TestMetadataTableScans.java index 0a14a89c01b5..755eb57de8e8 100644 --- a/core/src/test/java/org/apache/iceberg/TestMetadataTableScans.java +++ b/core/src/test/java/org/apache/iceberg/TestMetadataTableScans.java @@ -55,7 +55,7 @@ public class TestMetadataTableScans extends MetadataTableScanTestBase { private void preparePartitionedTable(boolean transactional) { preparePartitionedTableData(transactional); - if (formatVersion == 2) { + if (formatVersion >= 2) { if (transactional) { table .newRowDelta() @@ -485,7 +485,7 @@ public void 
testPartitionsTableScanNoFilter() { CloseableIterable> entries = PartitionsTable.planEntries((StaticTableScan) scanNoFilter); - if (formatVersion == 2) { + if (formatVersion >= 2) { assertThat(entries).hasSize(8); } else { assertThat(entries).hasSize(4); @@ -510,7 +510,7 @@ public void testPartitionsTableScanWithProjection() { assertThat(scanWithProjection.schema().asStruct()).isEqualTo(expected); CloseableIterable> entries = PartitionsTable.planEntries((StaticTableScan) scanWithProjection); - if (formatVersion == 2) { + if (formatVersion >= 2) { assertThat(entries).hasSize(8); } else { assertThat(entries).hasSize(4); @@ -552,7 +552,7 @@ public void testPartitionsTableScanAndFilter() { TableScan scanAndEq = partitionsTable.newScan().filter(andEquals); CloseableIterable> entries = PartitionsTable.planEntries((StaticTableScan) scanAndEq); - if (formatVersion == 2) { + if (formatVersion >= 2) { assertThat(entries).hasSize(2); } else { assertThat(entries).hasSize(1); @@ -574,7 +574,7 @@ public void testPartitionsTableScanLtFilter() { TableScan scanLtAnd = partitionsTable.newScan().filter(ltAnd); CloseableIterable> entries = PartitionsTable.planEntries((StaticTableScan) scanLtAnd); - if (formatVersion == 2) { + if (formatVersion >= 2) { assertThat(entries).hasSize(4); } else { assertThat(entries).hasSize(2); @@ -598,7 +598,7 @@ public void testPartitionsTableScanOrFilter() { CloseableIterable> entries = PartitionsTable.planEntries((StaticTableScan) scanOr); - if (formatVersion == 2) { + if (formatVersion >= 2) { assertThat(entries).hasSize(8); } else { assertThat(entries).hasSize(4); @@ -619,7 +619,7 @@ public void testPartitionsScanNotFilter() { TableScan scanNot = partitionsTable.newScan().filter(not); CloseableIterable> entries = PartitionsTable.planEntries((StaticTableScan) scanNot); - if (formatVersion == 2) { + if (formatVersion >= 2) { assertThat(entries).hasSize(4); } else { assertThat(entries).hasSize(2); @@ -639,7 +639,7 @@ public void testPartitionsTableScanInFilter() { TableScan scanSet = partitionsTable.newScan().filter(set); CloseableIterable> entries = PartitionsTable.planEntries((StaticTableScan) scanSet); - if (formatVersion == 2) { + if (formatVersion >= 2) { assertThat(entries).hasSize(4); } else { assertThat(entries).hasSize(2); @@ -659,7 +659,7 @@ public void testPartitionsTableScanNotNullFilter() { TableScan scanUnary = partitionsTable.newScan().filter(unary); CloseableIterable> entries = PartitionsTable.planEntries((StaticTableScan) scanUnary); - if (formatVersion == 2) { + if (formatVersion >= 2) { assertThat(entries).hasSize(8); } else { assertThat(entries).hasSize(4); @@ -719,8 +719,7 @@ public void testFilesTableScanWithDroppedPartition() throws IOException { @TestTemplate public void testDeleteFilesTableSelection() throws IOException { - assumeThat(formatVersion).as("Only V2 Tables Support Deletes").isGreaterThanOrEqualTo(2); - + assumeThat(formatVersion).as("Position deletes are not supported by V1 Tables").isNotEqualTo(1); table.newFastAppend().appendFile(FILE_A).commit(); table.newRowDelta().addDeletes(FILE_A_DELETES).addDeletes(FILE_A2_DELETES).commit(); @@ -960,7 +959,7 @@ public void testPartitionSpecEvolutionAdditive() { TableScan scan = metadataTable.newScan().filter(filter); CloseableIterable> entries = PartitionsTable.planEntries((StaticTableScan) scan); - if (formatVersion == 2) { + if (formatVersion >= 2) { // Four data files and delete files of old spec, one new data file of new spec assertThat(entries).hasSize(9); } else { @@ -975,7 +974,7 @@ public 
void testPartitionSpecEvolutionAdditive() { scan = metadataTable.newScan().filter(filter); entries = PartitionsTable.planEntries((StaticTableScan) scan); - if (formatVersion == 2) { + if (formatVersion >= 2) { // 1 original data file and delete file written by old spec, plus 1 new data file written by // new spec assertThat(entries).hasSize(3); @@ -1026,7 +1025,7 @@ public void testPartitionSpecEvolutionRemoval() { CloseableIterable> entries = PartitionsTable.planEntries((StaticTableScan) scan); - if (formatVersion == 2) { + if (formatVersion >= 2) { // Four data and delete files of original spec, one data file written by new spec assertThat(entries).hasSize(9); } else { @@ -1187,7 +1186,7 @@ public void testPartitionsTableScanWithPlanExecutor() { })); CloseableIterable> entries = PartitionsTable.planEntries((StaticTableScan) scan); - if (formatVersion == 2) { + if (formatVersion >= 2) { assertThat(entries).hasSize(8); } else { assertThat(entries).hasSize(4); @@ -1366,7 +1365,7 @@ public void testAllManifestsTableSnapshotNot() { @TestTemplate public void testPositionDeletesWithFilter() { - assumeThat(formatVersion).as("Position deletes supported only for v2 tables").isEqualTo(2); + assumeThat(formatVersion).as("Position deletes are not supported by V1 Tables").isNotEqualTo(1); preparePartitionedTable(); PositionDeletesTable positionDeletesTable = new PositionDeletesTable(table); @@ -1429,7 +1428,7 @@ public void testPositionDeletesBaseTableFilterEntriesLevel() { } private void testPositionDeletesBaseTableFilter(boolean transactional) { - assumeThat(formatVersion).as("Position deletes supported only for v2 tables").isEqualTo(2); + assumeThat(formatVersion).as("Position deletes are not supported by V1 Tables").isNotEqualTo(1); preparePartitionedTable(transactional); PositionDeletesTable positionDeletesTable = new PositionDeletesTable(table); @@ -1490,9 +1489,9 @@ private void testPositionDeletesBaseTableFilter(boolean transactional) { @TestTemplate public void testPositionDeletesWithBaseTableFilterNot() { - assumeThat(formatVersion).as("Position deletes supported only for v2 tables").isEqualTo(2); - - // use identity rather than bucket partition spec, + assumeThat(formatVersion) + .as("Position deletes are not supported by V1 Tables") + .isNotEqualTo(1); // use identity rather than bucket partition spec, // as bucket.project does not support projecting notEq table.updateSpec().removeField("data_bucket").addField("id").commit(); PartitionSpec spec = table.spec(); @@ -1574,7 +1573,7 @@ public void testPositionDeletesWithBaseTableFilterNot() { @TestTemplate public void testPositionDeletesResiduals() { - assumeThat(formatVersion).as("Position deletes supported only for v2 tables").isEqualTo(2); + assumeThat(formatVersion).as("Position deletes are not supported by V1 Tables").isNotEqualTo(1); preparePartitionedTable(); PositionDeletesTable positionDeletesTable = new PositionDeletesTable(table); @@ -1603,7 +1602,7 @@ public void testPositionDeletesResiduals() { @TestTemplate public void testPositionDeletesUnpartitioned() { - assumeThat(formatVersion).as("Position deletes supported only for v2 tables").isEqualTo(2); + assumeThat(formatVersion).as("Position deletes are not supported by V1 Tables").isNotEqualTo(1); table.updateSpec().removeField(Expressions.bucket("data", BUCKETS_NUMBER)).commit(); assertThat(table.spec().fields()).as("Table should now be unpartitioned").hasSize(0); @@ -1694,8 +1693,7 @@ public void testPositionDeletesUnpartitioned() { @TestTemplate public void 
testPositionDeletesManyColumns() { - assumeThat(formatVersion).as("Position deletes supported only for v2 tables").isEqualTo(2); - + assumeThat(formatVersion).as("Position deletes are not supported by V1 Tables").isNotEqualTo(1); UpdateSchema updateSchema = table.updateSchema(); for (int i = 0; i <= 2000; i++) { updateSchema.addColumn(String.valueOf(i), Types.IntegerType.get()); diff --git a/core/src/test/java/org/apache/iceberg/TestMetadataTableScansWithPartitionEvolution.java b/core/src/test/java/org/apache/iceberg/TestMetadataTableScansWithPartitionEvolution.java index a2e5386d29df..ac96642319a3 100644 --- a/core/src/test/java/org/apache/iceberg/TestMetadataTableScansWithPartitionEvolution.java +++ b/core/src/test/java/org/apache/iceberg/TestMetadataTableScansWithPartitionEvolution.java @@ -163,8 +163,7 @@ public void testPartitionsTableScanWithAddPartitionOnNestedField() { @TestTemplate public void testPositionDeletesPartitionSpecRemoval() { - assumeThat(formatVersion).as("Position deletes supported only for v2 tables").isEqualTo(2); - + assumeThat(formatVersion).as("Position deletes are not supported by V1 Tables").isNotEqualTo(1); table.updateSpec().removeField("id").commit(); DeleteFile deleteFile = newDeleteFile(table.ops().current().spec().specId(), "nested.id=1"); diff --git a/core/src/test/java/org/apache/iceberg/TestMetrics.java b/core/src/test/java/org/apache/iceberg/TestMetrics.java index b95b92979f91..2c4849135f64 100644 --- a/core/src/test/java/org/apache/iceberg/TestMetrics.java +++ b/core/src/test/java/org/apache/iceberg/TestMetrics.java @@ -69,7 +69,7 @@ public abstract class TestMetrics { @Parameters(name = "formatVersion = {0}") public static List parameters() { - return Arrays.asList(1, 2); + return Arrays.asList(1, 2, 3); } @TempDir public Path temp; diff --git a/core/src/test/java/org/apache/iceberg/TestMetricsModes.java b/core/src/test/java/org/apache/iceberg/TestMetricsModes.java index 31ae459df506..00e961097c34 100644 --- a/core/src/test/java/org/apache/iceberg/TestMetricsModes.java +++ b/core/src/test/java/org/apache/iceberg/TestMetricsModes.java @@ -47,7 +47,7 @@ public class TestMetricsModes { @Parameters(name = "formatVersion = {0}") protected static List parameters() { - return Arrays.asList(1, 2); + return Arrays.asList(1, 2, 3); } @TempDir private Path temp; diff --git a/core/src/test/java/org/apache/iceberg/TestMicroBatchBuilder.java b/core/src/test/java/org/apache/iceberg/TestMicroBatchBuilder.java index 733bb0bb38fd..8c6f18619ac0 100644 --- a/core/src/test/java/org/apache/iceberg/TestMicroBatchBuilder.java +++ b/core/src/test/java/org/apache/iceberg/TestMicroBatchBuilder.java @@ -34,7 +34,7 @@ public class TestMicroBatchBuilder extends TestBase { @Parameters(name = "formatVersion = {0}") protected static List parameters() { - return Arrays.asList(1, 2); + return Arrays.asList(1, 2, 3); } @BeforeEach diff --git a/core/src/test/java/org/apache/iceberg/TestPartitionSpecInfo.java b/core/src/test/java/org/apache/iceberg/TestPartitionSpecInfo.java index ff54929504ee..a324b8af2e70 100644 --- a/core/src/test/java/org/apache/iceberg/TestPartitionSpecInfo.java +++ b/core/src/test/java/org/apache/iceberg/TestPartitionSpecInfo.java @@ -47,7 +47,7 @@ public class TestPartitionSpecInfo { @Parameters(name = "formatVersion = {0}") protected static List parameters() { - return Arrays.asList(1, 2); + return Arrays.asList(1, 2, 3); } @Parameter private int formatVersion; diff --git a/core/src/test/java/org/apache/iceberg/TestReplaceTransaction.java 
b/core/src/test/java/org/apache/iceberg/TestReplaceTransaction.java index 3a6d2017eb82..b1b481dd5305 100644 --- a/core/src/test/java/org/apache/iceberg/TestReplaceTransaction.java +++ b/core/src/test/java/org/apache/iceberg/TestReplaceTransaction.java @@ -50,7 +50,7 @@ public class TestReplaceTransaction extends TestBase { @Parameters(name = "formatVersion = {0}") protected static List parameters() { - return Arrays.asList(1, 2); + return Arrays.asList(1, 2, 3); } @TestTemplate diff --git a/core/src/test/java/org/apache/iceberg/TestRewriteManifests.java b/core/src/test/java/org/apache/iceberg/TestRewriteManifests.java index 176f61079fc3..f1d23de32a42 100644 --- a/core/src/test/java/org/apache/iceberg/TestRewriteManifests.java +++ b/core/src/test/java/org/apache/iceberg/TestRewriteManifests.java @@ -48,7 +48,7 @@ public class TestRewriteManifests extends TestBase { @Parameters(name = "formatVersion = {0}") protected static List parameters() { - return Arrays.asList(1, 2); + return Arrays.asList(1, 2, 3); } @TestTemplate diff --git a/core/src/test/java/org/apache/iceberg/TestScanSummary.java b/core/src/test/java/org/apache/iceberg/TestScanSummary.java index b326274842a3..10c22b759988 100644 --- a/core/src/test/java/org/apache/iceberg/TestScanSummary.java +++ b/core/src/test/java/org/apache/iceberg/TestScanSummary.java @@ -39,7 +39,7 @@ public class TestScanSummary extends TestBase { @Parameters(name = "formatVersion = {0}") protected static List parameters() { - return Arrays.asList(1, 2); + return Arrays.asList(1, 2, 3); } @TestTemplate diff --git a/core/src/test/java/org/apache/iceberg/TestScansAndSchemaEvolution.java b/core/src/test/java/org/apache/iceberg/TestScansAndSchemaEvolution.java index 0d5b1bc7066b..3fc691ce942a 100644 --- a/core/src/test/java/org/apache/iceberg/TestScansAndSchemaEvolution.java +++ b/core/src/test/java/org/apache/iceberg/TestScansAndSchemaEvolution.java @@ -55,7 +55,7 @@ public class TestScansAndSchemaEvolution { @Parameters(name = "formatVersion = {0}") protected static List parameters() { - return Arrays.asList(1, 2); + return Arrays.asList(1, 2, 3); } @Parameter private int formatVersion; diff --git a/core/src/test/java/org/apache/iceberg/TestSchemaAndMappingUpdate.java b/core/src/test/java/org/apache/iceberg/TestSchemaAndMappingUpdate.java index 1bd1761ffc60..111693792f78 100644 --- a/core/src/test/java/org/apache/iceberg/TestSchemaAndMappingUpdate.java +++ b/core/src/test/java/org/apache/iceberg/TestSchemaAndMappingUpdate.java @@ -43,7 +43,7 @@ public class TestSchemaAndMappingUpdate extends TestBase { @Parameters(name = "formatVersion = {0}") protected static List parameters() { - return Arrays.asList(1, 2); + return Arrays.asList(1, 2, 3); } @TestTemplate diff --git a/core/src/test/java/org/apache/iceberg/TestSchemaID.java b/core/src/test/java/org/apache/iceberg/TestSchemaID.java index 33ac84d20801..e3c58af1cbe3 100644 --- a/core/src/test/java/org/apache/iceberg/TestSchemaID.java +++ b/core/src/test/java/org/apache/iceberg/TestSchemaID.java @@ -36,7 +36,7 @@ public class TestSchemaID extends TestBase { @Parameters(name = "formatVersion = {0}") protected static List parameters() { - return Arrays.asList(1, 2); + return Arrays.asList(1, 2, 3); } @TestTemplate diff --git a/core/src/test/java/org/apache/iceberg/TestSetPartitionStatistics.java b/core/src/test/java/org/apache/iceberg/TestSetPartitionStatistics.java index 3ebe047e90b7..9504f78f5345 100644 --- a/core/src/test/java/org/apache/iceberg/TestSetPartitionStatistics.java +++ 
b/core/src/test/java/org/apache/iceberg/TestSetPartitionStatistics.java @@ -29,7 +29,7 @@ public class TestSetPartitionStatistics extends TestBase { @Parameters(name = "formatVersion = {0}") protected static List parameters() { - return Arrays.asList(1, 2); + return Arrays.asList(1, 2, 3); } @TestTemplate diff --git a/core/src/test/java/org/apache/iceberg/TestSetStatistics.java b/core/src/test/java/org/apache/iceberg/TestSetStatistics.java index 41941e3c6630..03c644117a95 100644 --- a/core/src/test/java/org/apache/iceberg/TestSetStatistics.java +++ b/core/src/test/java/org/apache/iceberg/TestSetStatistics.java @@ -31,7 +31,7 @@ public class TestSetStatistics extends TestBase { @Parameters(name = "formatVersion = {0}") protected static List parameters() { - return Arrays.asList(1, 2); + return Arrays.asList(1, 2, 3); } @TestTemplate diff --git a/core/src/test/java/org/apache/iceberg/TestSnapshot.java b/core/src/test/java/org/apache/iceberg/TestSnapshot.java index 2ec6abd4e428..8a30036f3242 100644 --- a/core/src/test/java/org/apache/iceberg/TestSnapshot.java +++ b/core/src/test/java/org/apache/iceberg/TestSnapshot.java @@ -33,7 +33,7 @@ public class TestSnapshot extends TestBase { @Parameters(name = "formatVersion = {0}") protected static List parameters() { - return Arrays.asList(1, 2); + return Arrays.asList(1, 2, 3); } @TestTemplate diff --git a/core/src/test/java/org/apache/iceberg/TestSnapshotLoading.java b/core/src/test/java/org/apache/iceberg/TestSnapshotLoading.java index 89312201265d..96dde0f48eb7 100644 --- a/core/src/test/java/org/apache/iceberg/TestSnapshotLoading.java +++ b/core/src/test/java/org/apache/iceberg/TestSnapshotLoading.java @@ -43,7 +43,7 @@ public class TestSnapshotLoading extends TestBase { @Parameters(name = "formatVersion = {0}") protected static List parameters() { - return Arrays.asList(1, 2); + return Arrays.asList(1, 2, 3); } private Snapshot currentSnapshot; diff --git a/core/src/test/java/org/apache/iceberg/TestSnapshotManager.java b/core/src/test/java/org/apache/iceberg/TestSnapshotManager.java index 88233dd99097..61dd082701c2 100644 --- a/core/src/test/java/org/apache/iceberg/TestSnapshotManager.java +++ b/core/src/test/java/org/apache/iceberg/TestSnapshotManager.java @@ -51,7 +51,7 @@ public class TestSnapshotManager extends TestBase { @Parameters(name = "formatVersion = {0}") protected static List parameters() { - return Arrays.asList(1, 2); + return Arrays.asList(1, 2, 3); } @TestTemplate diff --git a/core/src/test/java/org/apache/iceberg/TestSnapshotSelection.java b/core/src/test/java/org/apache/iceberg/TestSnapshotSelection.java index 7ce59e9df1c9..6d339c0cbeaf 100644 --- a/core/src/test/java/org/apache/iceberg/TestSnapshotSelection.java +++ b/core/src/test/java/org/apache/iceberg/TestSnapshotSelection.java @@ -33,7 +33,7 @@ public class TestSnapshotSelection extends TestBase { @Parameters(name = "formatVersion = {0}") protected static List parameters() { - return Arrays.asList(1, 2); + return Arrays.asList(1, 2, 3); } @TestTemplate diff --git a/core/src/test/java/org/apache/iceberg/TestSnapshotSummary.java b/core/src/test/java/org/apache/iceberg/TestSnapshotSummary.java index 053a9c374178..529e0cc614f6 100644 --- a/core/src/test/java/org/apache/iceberg/TestSnapshotSummary.java +++ b/core/src/test/java/org/apache/iceberg/TestSnapshotSummary.java @@ -32,7 +32,7 @@ public class TestSnapshotSummary extends TestBase { @Parameters(name = "formatVersion = {0}") protected static List parameters() { - return Arrays.asList(1, 2); + return Arrays.asList(1, 2, 
3); } @TestTemplate diff --git a/core/src/test/java/org/apache/iceberg/TestSortOrder.java b/core/src/test/java/org/apache/iceberg/TestSortOrder.java index a3ba69a808b3..ad773192b417 100644 --- a/core/src/test/java/org/apache/iceberg/TestSortOrder.java +++ b/core/src/test/java/org/apache/iceberg/TestSortOrder.java @@ -76,7 +76,7 @@ public class TestSortOrder { @Parameters(name = "formatVersion = {0}") protected static List parameters() { - return Arrays.asList(1, 2); + return Arrays.asList(1, 2, 3); } @Parameter private int formatVersion; diff --git a/core/src/test/java/org/apache/iceberg/TestSplitPlanning.java b/core/src/test/java/org/apache/iceberg/TestSplitPlanning.java index 04bb2ae215d8..ea2142982382 100644 --- a/core/src/test/java/org/apache/iceberg/TestSplitPlanning.java +++ b/core/src/test/java/org/apache/iceberg/TestSplitPlanning.java @@ -58,7 +58,7 @@ public class TestSplitPlanning extends TestBase { @Parameters(name = "formatVersion = {0}") protected static List parameters() { - return Arrays.asList(1, 2); + return Arrays.asList(1, 2, 3); } @Override diff --git a/core/src/test/java/org/apache/iceberg/TestTableMetadataSerialization.java b/core/src/test/java/org/apache/iceberg/TestTableMetadataSerialization.java index f47968434bfe..94a3d35b35a6 100644 --- a/core/src/test/java/org/apache/iceberg/TestTableMetadataSerialization.java +++ b/core/src/test/java/org/apache/iceberg/TestTableMetadataSerialization.java @@ -35,7 +35,7 @@ public class TestTableMetadataSerialization extends TestBase { @Parameters(name = "formatVersion = {0}") protected static List parameters() { - return Arrays.asList(1, 2); + return Arrays.asList(1, 2, 3); } @TestTemplate diff --git a/core/src/test/java/org/apache/iceberg/TestTableUpdatePartitionSpec.java b/core/src/test/java/org/apache/iceberg/TestTableUpdatePartitionSpec.java index a4e587068e74..482514c40093 100644 --- a/core/src/test/java/org/apache/iceberg/TestTableUpdatePartitionSpec.java +++ b/core/src/test/java/org/apache/iceberg/TestTableUpdatePartitionSpec.java @@ -36,7 +36,7 @@ public class TestTableUpdatePartitionSpec extends TestBase { @Parameters(name = "formatVersion = {0}") protected static List parameters() { - return Arrays.asList(1, 2); + return Arrays.asList(1, 2, 3); } @BeforeEach diff --git a/core/src/test/java/org/apache/iceberg/TestTimestampPartitions.java b/core/src/test/java/org/apache/iceberg/TestTimestampPartitions.java index 08714dec01f1..656dd782cfe4 100644 --- a/core/src/test/java/org/apache/iceberg/TestTimestampPartitions.java +++ b/core/src/test/java/org/apache/iceberg/TestTimestampPartitions.java @@ -35,7 +35,7 @@ public class TestTimestampPartitions extends TestBase { @Parameters(name = "formatVersion = {0}") protected static List parameters() { - return Arrays.asList(1, 2); + return Arrays.asList(1, 2, 3); } @TestTemplate diff --git a/core/src/test/java/org/apache/iceberg/TestTransaction.java b/core/src/test/java/org/apache/iceberg/TestTransaction.java index 393494da1283..8fed7134fae1 100644 --- a/core/src/test/java/org/apache/iceberg/TestTransaction.java +++ b/core/src/test/java/org/apache/iceberg/TestTransaction.java @@ -42,7 +42,7 @@ public class TestTransaction extends TestBase { @Parameters(name = "formatVersion = {0}") protected static List parameters() { - return Arrays.asList(1, 2); + return Arrays.asList(1, 2, 3); } @TestTemplate diff --git a/core/src/test/java/org/apache/iceberg/TestUpdatePartitionSpec.java b/core/src/test/java/org/apache/iceberg/TestUpdatePartitionSpec.java index 33b003cfd8c5..97f695315e16 100644 
--- a/core/src/test/java/org/apache/iceberg/TestUpdatePartitionSpec.java +++ b/core/src/test/java/org/apache/iceberg/TestUpdatePartitionSpec.java @@ -54,7 +54,7 @@ public class TestUpdatePartitionSpec extends TestBase { @Parameters(name = "formatVersion = {0}") protected static List parameters() { - return Arrays.asList(1, 2); + return Arrays.asList(1, 2, 3); } @TestTemplate diff --git a/core/src/test/java/org/apache/iceberg/TestWapWorkflow.java b/core/src/test/java/org/apache/iceberg/TestWapWorkflow.java index 58b83f43f399..b1e8e57850e8 100644 --- a/core/src/test/java/org/apache/iceberg/TestWapWorkflow.java +++ b/core/src/test/java/org/apache/iceberg/TestWapWorkflow.java @@ -35,7 +35,7 @@ public class TestWapWorkflow extends TestBase { @Parameters(name = "formatVersion = {0}") protected static List parameters() { - return Arrays.asList(1, 2); + return Arrays.asList(1, 2, 3); } @BeforeEach diff --git a/core/src/test/java/org/apache/iceberg/actions/TestSizeBasedRewriter.java b/core/src/test/java/org/apache/iceberg/actions/TestSizeBasedRewriter.java index a9a23d587ac9..77d16d3bc821 100644 --- a/core/src/test/java/org/apache/iceberg/actions/TestSizeBasedRewriter.java +++ b/core/src/test/java/org/apache/iceberg/actions/TestSizeBasedRewriter.java @@ -41,7 +41,7 @@ public class TestSizeBasedRewriter extends TestBase { @Parameters(name = "formatVersion = {0}") protected static List parameters() { - return Arrays.asList(1, 2); + return Arrays.asList(1, 2, 3); } @TestTemplate diff --git a/core/src/test/java/org/apache/iceberg/io/TestOutputFileFactory.java b/core/src/test/java/org/apache/iceberg/io/TestOutputFileFactory.java index ceffeb3749da..bb98925f504c 100644 --- a/core/src/test/java/org/apache/iceberg/io/TestOutputFileFactory.java +++ b/core/src/test/java/org/apache/iceberg/io/TestOutputFileFactory.java @@ -40,7 +40,7 @@ public class TestOutputFileFactory extends TestBase { @Parameters(name = "formatVersion = {0}") protected static List parameters() { - return Arrays.asList(1, 2); + return Arrays.asList(1, 2, 3); } private static final int PARTITION_ID = 1; diff --git a/core/src/test/java/org/apache/iceberg/mapping/TestMappingUpdates.java b/core/src/test/java/org/apache/iceberg/mapping/TestMappingUpdates.java index b9ae9dc273f2..2161b40a60b5 100644 --- a/core/src/test/java/org/apache/iceberg/mapping/TestMappingUpdates.java +++ b/core/src/test/java/org/apache/iceberg/mapping/TestMappingUpdates.java @@ -36,7 +36,7 @@ public class TestMappingUpdates extends TestBase { @Parameters(name = "formatVersion = {0}") protected static List parameters() { - return Arrays.asList(1, 2); + return Arrays.asList(1, 2, 3); } @TestTemplate diff --git a/core/src/test/resources/TableMetadataUnsupportedVersion.json b/core/src/test/resources/TableMetadataUnsupportedVersion.json index 0633a71d24d5..c40a0c9cd5ae 100644 --- a/core/src/test/resources/TableMetadataUnsupportedVersion.json +++ b/core/src/test/resources/TableMetadataUnsupportedVersion.json @@ -1,5 +1,5 @@ { - "format-version": 3, + "format-version": 4, "table-uuid": "d20125c8-7284-442c-9aea-15fee620737c", "location": "s3://bucket/test/location", "last-updated-ms": 1602638573874, @@ -33,4 +33,4 @@ "properties": {}, "current-snapshot-id": -1, "snapshots": [] -} \ No newline at end of file +} From 39373d09c276586ddcec971fe35951975bdac66f Mon Sep 17 00:00:00 2001 From: Grant Nicholas <43971820+grantatspothero@users.noreply.github.com> Date: Thu, 1 Aug 2024 14:31:18 -0500 Subject: [PATCH 11/55] Core: Allow SnapshotProducer to skip uncommitted manifest cleanup 
after commit (#10523) --- .../java/org/apache/iceberg/FastAppend.java | 10 ++++ .../org/apache/iceberg/SnapshotProducer.java | 43 ++++++++-------- .../org/apache/iceberg/TestFastAppend.java | 50 +++++++++++++++++++ 3 files changed, 82 insertions(+), 21 deletions(-) diff --git a/core/src/main/java/org/apache/iceberg/FastAppend.java b/core/src/main/java/org/apache/iceberg/FastAppend.java index 1439289130d7..4976a8081c44 100644 --- a/core/src/main/java/org/apache/iceberg/FastAppend.java +++ b/core/src/main/java/org/apache/iceberg/FastAppend.java @@ -198,6 +198,16 @@ protected void cleanUncommitted(Set committed) { } } + /** + * Cleanup after committing is disabled for FastAppend unless there are rewrittenAppendManifests + * because: 1.) Appended manifests are never rewritten 2.) Manifests which are written out as part + * of appendFile are already cleaned up between commit attempts in writeNewManifests + */ + @Override + protected boolean cleanupAfterCommit() { + return !rewrittenAppendManifests.isEmpty(); + } + private List writeNewManifests() throws IOException { if (hasNewFiles && newManifests != null) { newManifests.forEach(file -> deleteFile(file.path())); diff --git a/core/src/main/java/org/apache/iceberg/SnapshotProducer.java b/core/src/main/java/org/apache/iceberg/SnapshotProducer.java index 9f4bcbc6bba9..0a040fe34471 100644 --- a/core/src/main/java/org/apache/iceberg/SnapshotProducer.java +++ b/core/src/main/java/org/apache/iceberg/SnapshotProducer.java @@ -41,7 +41,7 @@ import java.util.UUID; import java.util.concurrent.ExecutorService; import java.util.concurrent.atomic.AtomicInteger; -import java.util.concurrent.atomic.AtomicLong; +import java.util.concurrent.atomic.AtomicReference; import java.util.function.Consumer; import org.apache.iceberg.encryption.EncryptedOutputFile; import org.apache.iceberg.encryption.EncryptingFileIO; @@ -368,8 +368,8 @@ protected TableMetadata refresh() { @Override @SuppressWarnings("checkstyle:CyclomaticComplexity") public void commit() { - // this is always set to the latest commit attempt's snapshot id. - AtomicLong newSnapshotId = new AtomicLong(-1L); + // this is always set to the latest commit attempt's snapshot + AtomicReference stagedSnapshot = new AtomicReference<>(); try (Timed ignore = commitMetrics().totalDuration().start()) { try { Tasks.foreach(ops) @@ -384,7 +384,7 @@ public void commit() { .run( taskOps -> { Snapshot newSnapshot = apply(); - newSnapshotId.set(newSnapshot.snapshotId()); + stagedSnapshot.set(newSnapshot); TableMetadata.Builder update = TableMetadata.buildFrom(base); if (base.snapshot(newSnapshot.snapshotId()) != null) { // this is a rollback operation @@ -422,26 +422,23 @@ public void commit() { throw e; } + // at this point, the commit must have succeeded so the stagedSnapshot is committed + Snapshot committedSnapshot = stagedSnapshot.get(); try { - LOG.info("Committed snapshot {} ({})", newSnapshotId.get(), getClass().getSimpleName()); - - // at this point, the commit must have succeeded. after a refresh, the snapshot is loaded by - // id in case another commit was added between this commit and the refresh. 
- Snapshot saved = ops.refresh().snapshot(newSnapshotId.get()); - if (saved != null) { - cleanUncommitted(Sets.newHashSet(saved.allManifests(ops.io()))); - // also clean up unused manifest lists created by multiple attempts - for (String manifestList : manifestLists) { - if (!saved.manifestListLocation().equals(manifestList)) { - deleteFile(manifestList); - } + LOG.info( + "Committed snapshot {} ({})", + committedSnapshot.snapshotId(), + getClass().getSimpleName()); + + if (cleanupAfterCommit()) { + cleanUncommitted(Sets.newHashSet(committedSnapshot.allManifests(ops.io()))); + } + // also clean up unused manifest lists created by multiple attempts + for (String manifestList : manifestLists) { + if (!committedSnapshot.manifestListLocation().equals(manifestList)) { + deleteFile(manifestList); } - } else { - // saved may not be present if the latest metadata couldn't be loaded due to eventual - // consistency problems in refresh. in that case, don't clean up. - LOG.warn("Failed to load committed snapshot, skipping manifest clean-up"); } - } catch (Throwable e) { LOG.warn( "Failed to load committed table metadata or during cleanup, skipping further cleanup", @@ -565,6 +562,10 @@ protected boolean canInheritSnapshotId() { return canInheritSnapshotId; } + protected boolean cleanupAfterCommit() { + return true; + } + private static ManifestFile addMetadata(TableOperations ops, ManifestFile manifest) { try (ManifestReader reader = ManifestFiles.read(manifest, ops.io(), ops.current().specsById())) { diff --git a/core/src/test/java/org/apache/iceberg/TestFastAppend.java b/core/src/test/java/org/apache/iceberg/TestFastAppend.java index b281536ab0fa..8125c528d9c3 100644 --- a/core/src/test/java/org/apache/iceberg/TestFastAppend.java +++ b/core/src/test/java/org/apache/iceberg/TestFastAppend.java @@ -324,6 +324,56 @@ public void testRecoveryWithoutManifestList() { assertThat(metadata.currentSnapshot().allManifests(FILE_IO)).contains(newManifest); } + @TestTemplate + public void testWriteNewManifestsIdempotency() { + // inject 3 failures, the last try will succeed + TestTables.TestTableOperations ops = table.ops(); + ops.failCommits(3); + + AppendFiles append = table.newFastAppend().appendFile(FILE_B); + Snapshot pending = append.apply(); + ManifestFile newManifest = pending.allManifests(FILE_IO).get(0); + assertThat(new File(newManifest.path())).exists(); + + append.commit(); + + TableMetadata metadata = readMetadata(); + + // contains only a single manifest, does not duplicate manifests on retries + validateSnapshot(null, metadata.currentSnapshot(), FILE_B); + assertThat(new File(newManifest.path())).exists(); + assertThat(metadata.currentSnapshot().allManifests(FILE_IO)).contains(newManifest); + assertThat(listManifestFiles(tableDir)).containsExactly(new File(newManifest.path())); + } + + @TestTemplate + public void testWriteNewManifestsCleanup() { + // append file, stage changes with apply() but do not commit + AppendFiles append = table.newFastAppend().appendFile(FILE_A); + Snapshot pending = append.apply(); + ManifestFile oldManifest = pending.allManifests(FILE_IO).get(0); + assertThat(new File(oldManifest.path())).exists(); + + // append file, stage changes with apply() but do not commit + // validate writeNewManifests deleted the old staged manifest + append.appendFile(FILE_B); + Snapshot newPending = append.apply(); + List manifestFiles = newPending.allManifests(FILE_IO); + assertThat(manifestFiles).hasSize(1); + ManifestFile newManifest = manifestFiles.get(0); + 
assertThat(newManifest.path()).isNotEqualTo(oldManifest.path()); + + append.commit(); + TableMetadata metadata = readMetadata(); + + // contains only a single manifest, old staged manifest is deleted + validateSnapshot(null, metadata.currentSnapshot(), FILE_A, FILE_B); + assertThat(new File(oldManifest.path())).doesNotExist(); + assertThat(new File(newManifest.path())).exists(); + assertThat(metadata.currentSnapshot().allManifests(FILE_IO)).containsExactly(newManifest); + assertThat(listManifestFiles(tableDir)).containsExactly(new File(newManifest.path())); + } + @TestTemplate public void testAppendManifestWithSnapshotIdInheritance() throws IOException { table.updateProperties().set(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, "true").commit(); From 6e7113a5291dffad38ffacc7d264456a2366a707 Mon Sep 17 00:00:00 2001 From: Steven Zhen Wu Date: Thu, 1 Aug 2024 14:10:37 -0700 Subject: [PATCH 12/55] Flink: a few small fixes or tuning for range partitioner (#10823) --- .../shuffle/AggregatedStatisticsTracker.java | 23 +++-- .../sink/shuffle/CompletedStatistics.java | 8 ++ .../shuffle/DataStatisticsCoordinator.java | 45 ++++++---- .../flink/sink/shuffle/RangePartitioner.java | 8 +- .../sink/shuffle/SketchRangePartitioner.java | 19 +--- .../flink/sink/shuffle/SketchUtil.java | 17 ++++ .../flink/sink/shuffle/SortKeyUtil.java | 59 +++++++++++++ .../sink/shuffle/TestRangePartitioner.java | 65 ++++++++++++++ .../shuffle/TestSketchRangePartitioner.java | 88 +++++++++++++++++++ .../flink/sink/shuffle/TestSketchUtil.java | 64 +++++++++++++- .../flink/sink/shuffle/TestSortKeyUtil.java | 73 +++++++++++++++ 11 files changed, 420 insertions(+), 49 deletions(-) create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeyUtil.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestRangePartitioner.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchRangePartitioner.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeyUtil.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/AggregatedStatisticsTracker.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/AggregatedStatisticsTracker.java index 338523b7b074..5525f02c873e 100644 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/AggregatedStatisticsTracker.java +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/AggregatedStatisticsTracker.java @@ -223,7 +223,9 @@ private void merge(DataStatistics taskStatistics) { convertCoordinatorToSketch(); } - sketchStatistics.update(taskSketch); + if (taskSketch.getNumSamples() > 0) { + sketchStatistics.update(taskSketch); + } } } @@ -242,13 +244,18 @@ private CompletedStatistics completedStatistics(long checkpointId) { return CompletedStatistics.fromKeyFrequency(checkpointId, mapStatistics); } else { ReservoirItemsSketch sketch = sketchStatistics.getResult(); - LOG.info( - "Completed sketch statistics aggregation: " - + "reservoir size = {}, number of items seen = {}, number of samples = {}", - sketch.getK(), - sketch.getN(), - sketch.getNumSamples()); - return CompletedStatistics.fromKeySamples(checkpointId, sketch.getSamples()); + if (sketch != null) { + LOG.info( + "Completed sketch statistics aggregation: " + + "reservoir size = {}, number of items seen = {}, number of samples = {}", + sketch.getK(), + sketch.getN(), + 
sketch.getNumSamples()); + return CompletedStatistics.fromKeySamples(checkpointId, sketch.getSamples()); + } else { + LOG.info("Empty sketch statistics."); + return CompletedStatistics.fromKeySamples(checkpointId, new SortKey[0]); + } } } } diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/CompletedStatistics.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/CompletedStatistics.java index c0e228965ddd..e4cba174f0f2 100644 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/CompletedStatistics.java +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/CompletedStatistics.java @@ -100,4 +100,12 @@ Map keyFrequency() { SortKey[] keySamples() { return keySamples; } + + boolean isEmpty() { + if (type == StatisticsType.Sketch) { + return keySamples.length == 0; + } else { + return keyFrequency().isEmpty(); + } + } } diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsCoordinator.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsCoordinator.java index 3b21fbae315a..4bfde7204acf 100644 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsCoordinator.java +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsCoordinator.java @@ -38,11 +38,11 @@ import org.apache.flink.util.function.ThrowingRunnable; import org.apache.iceberg.Schema; import org.apache.iceberg.SortOrder; -import org.apache.iceberg.SortOrderComparators; import org.apache.iceberg.StructLike; import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; import org.apache.iceberg.relocated.com.google.common.collect.Iterables; import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.types.Comparators; import org.jetbrains.annotations.NotNull; import org.jetbrains.annotations.Nullable; import org.slf4j.Logger; @@ -91,7 +91,7 @@ class DataStatisticsCoordinator implements OperatorCoordinator { this.context = context; this.schema = schema; this.sortOrder = sortOrder; - this.comparator = SortOrderComparators.forSchema(schema, sortOrder); + this.comparator = Comparators.forType(SortKeyUtil.sortKeySchema(schema, sortOrder).asStruct()); this.downstreamParallelism = downstreamParallelism; this.statisticsType = statisticsType; this.closeFileCostWeightPercentage = closeFileCostWeightPercentage; @@ -202,17 +202,23 @@ private void handleDataStatisticRequest(int subtask, StatisticsEvent event) { aggregatedStatisticsTracker.updateAndCheckCompletion(subtask, event); if (maybeCompletedStatistics != null) { - // completedStatistics contains the complete samples, which is needed to compute - // the range bounds in globalStatistics if downstreamParallelism changed. 
- this.completedStatistics = maybeCompletedStatistics; - // globalStatistics only contains assignment calculated based on Map or Sketch statistics - this.globalStatistics = - globalStatistics( - maybeCompletedStatistics, - downstreamParallelism, - comparator, - closeFileCostWeightPercentage); - sendGlobalStatisticsToSubtasks(globalStatistics); + if (maybeCompletedStatistics.isEmpty()) { + LOG.info( + "Skip aggregated statistics for checkpoint {} as it is empty.", event.checkpointId()); + } else { + LOG.info("Completed statistics aggregation for checkpoint {}", event.checkpointId()); + // completedStatistics contains the complete samples, which is needed to compute + // the range bounds in globalStatistics if downstreamParallelism changed. + this.completedStatistics = maybeCompletedStatistics; + // globalStatistics only contains assignment calculated based on Map or Sketch statistics + this.globalStatistics = + globalStatistics( + maybeCompletedStatistics, + downstreamParallelism, + comparator, + closeFileCostWeightPercentage); + sendGlobalStatisticsToSubtasks(globalStatistics); + } } } @@ -324,9 +330,14 @@ public void checkpointCoordinator(long checkpointId, CompletableFuture r "Snapshotting data statistics coordinator {} for checkpoint {}", operatorName, checkpointId); - resultFuture.complete( - StatisticsUtil.serializeCompletedStatistics( - completedStatistics, completedStatisticsSerializer)); + if (completedStatistics == null) { + // null checkpoint result is not allowed, hence supply an empty byte array + resultFuture.complete(new byte[0]); + } else { + resultFuture.complete( + StatisticsUtil.serializeCompletedStatistics( + completedStatistics, completedStatisticsSerializer)); + } }, String.format("taking checkpoint %d", checkpointId)); } @@ -338,7 +349,7 @@ public void notifyCheckpointComplete(long checkpointId) {} public void resetToCheckpoint(long checkpointId, byte[] checkpointData) { Preconditions.checkState( !started, "The coordinator %s can only be reset if it was not yet started", operatorName); - if (checkpointData == null) { + if (checkpointData == null || checkpointData.length == 0) { LOG.info( "Data statistic coordinator {} has nothing to restore from checkpoint {}", operatorName, diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/RangePartitioner.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/RangePartitioner.java index 83a9461233d2..6608b938f5a8 100644 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/RangePartitioner.java +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/RangePartitioner.java @@ -23,12 +23,13 @@ import org.apache.flink.annotation.Internal; import org.apache.flink.api.common.functions.Partitioner; import org.apache.flink.table.data.RowData; +import org.apache.iceberg.DistributionMode; import org.apache.iceberg.Schema; import org.apache.iceberg.SortOrder; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -/** The wrapper class */ +/** This custom partitioner implements the {@link DistributionMode#RANGE} for Flink sink. */ @Internal public class RangePartitioner implements Partitioner { private static final Logger LOG = LoggerFactory.getLogger(RangePartitioner.class); @@ -94,9 +95,8 @@ static int adjustPartitionWithRescale( if (numPartitionsStatsCalculation <= numPartitions) { // no rescale or scale-up case. // new subtasks are ignored and not assigned any keys, which is sub-optimal and only - // transient. 
- // when rescale is detected, operator requests new statistics from coordinator upon - // initialization. + // transient. when rescale is detected, operator requests new statistics from + // coordinator upon initialization. return partition; } else { // scale-down case. diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchRangePartitioner.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchRangePartitioner.java index af78271ea5dc..dddb0d8722c0 100644 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchRangePartitioner.java +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchRangePartitioner.java @@ -18,17 +18,16 @@ */ package org.apache.iceberg.flink.sink.shuffle; -import java.util.Arrays; import java.util.Comparator; import org.apache.flink.api.common.functions.Partitioner; import org.apache.flink.table.data.RowData; import org.apache.iceberg.Schema; import org.apache.iceberg.SortKey; import org.apache.iceberg.SortOrder; -import org.apache.iceberg.SortOrderComparators; import org.apache.iceberg.StructLike; import org.apache.iceberg.flink.FlinkSchemaUtil; import org.apache.iceberg.flink.RowDataWrapper; +import org.apache.iceberg.types.Comparators; class SketchRangePartitioner implements Partitioner { private final SortKey sortKey; @@ -38,7 +37,7 @@ class SketchRangePartitioner implements Partitioner { SketchRangePartitioner(Schema schema, SortOrder sortOrder, SortKey[] rangeBounds) { this.sortKey = new SortKey(schema, sortOrder); - this.comparator = SortOrderComparators.forSchema(schema, sortOrder); + this.comparator = Comparators.forType(SortKeyUtil.sortKeySchema(schema, sortOrder).asStruct()); this.rangeBounds = rangeBounds; this.rowDataWrapper = new RowDataWrapper(FlinkSchemaUtil.convert(schema), schema.asStruct()); } @@ -47,18 +46,6 @@ class SketchRangePartitioner implements Partitioner { public int partition(RowData row, int numPartitions) { // reuse the sortKey and rowDataWrapper sortKey.wrap(rowDataWrapper.wrap(row)); - int partition = Arrays.binarySearch(rangeBounds, sortKey, comparator); - - // binarySearch either returns the match location or -[insertion point]-1 - if (partition < 0) { - partition = -partition - 1; - } - - if (partition > rangeBounds.length) { - partition = rangeBounds.length; - } - - return RangePartitioner.adjustPartitionWithRescale( - partition, rangeBounds.length + 1, numPartitions); + return SketchUtil.partition(sortKey, numPartitions, rangeBounds, comparator); } } diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchUtil.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchUtil.java index a58310611e8d..871ef9ef1149 100644 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchUtil.java +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchUtil.java @@ -139,4 +139,21 @@ static void convertMapToSketch( } }); } + + static int partition( + SortKey key, int numPartitions, SortKey[] rangeBounds, Comparator comparator) { + int partition = Arrays.binarySearch(rangeBounds, key, comparator); + + // binarySearch either returns the match location or -[insertion point]-1 + if (partition < 0) { + partition = -partition - 1; + } + + if (partition > rangeBounds.length) { + partition = rangeBounds.length; + } + + return RangePartitioner.adjustPartitionWithRescale( + partition, rangeBounds.length + 1, numPartitions); + } } diff 
--git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeyUtil.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeyUtil.java new file mode 100644 index 000000000000..1e5bdbbac3e4 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeyUtil.java @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.shuffle; + +import java.util.List; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SortField; +import org.apache.iceberg.SortOrder; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.types.Type; +import org.apache.iceberg.types.Types; + +class SortKeyUtil { + private SortKeyUtil() {} + + /** Compute the result schema of {@code SortKey} transformation */ + static Schema sortKeySchema(Schema schema, SortOrder sortOrder) { + List sortFields = sortOrder.fields(); + int size = sortFields.size(); + List transformedFields = Lists.newArrayListWithCapacity(size); + for (int i = 0; i < size; ++i) { + int sourceFieldId = sortFields.get(i).sourceId(); + Types.NestedField sourceField = schema.findField(sourceFieldId); + Preconditions.checkArgument( + sourceField != null, "Cannot find source field: %s", sourceFieldId); + Type transformedType = sortFields.get(i).transform().getResultType(sourceField.type()); + // There could be multiple transformations on the same source column, like in the PartitionKey + // case. To resolve the collision, field id is set to transform index and field name is set to + // sourceFieldName_transformIndex + Types.NestedField transformedField = + Types.NestedField.of( + i, + sourceField.isOptional(), + sourceField.name() + '_' + i, + transformedType, + sourceField.doc()); + transformedFields.add(transformedField); + } + + return new Schema(transformedFields); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestRangePartitioner.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestRangePartitioner.java new file mode 100644 index 000000000000..0485fdb7fa04 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestRangePartitioner.java @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.shuffle; + +import static org.apache.iceberg.flink.sink.shuffle.Fixtures.CHAR_KEYS; +import static org.apache.iceberg.flink.sink.shuffle.Fixtures.SCHEMA; +import static org.apache.iceberg.flink.sink.shuffle.Fixtures.SORT_ORDER; +import static org.assertj.core.api.Assertions.assertThat; + +import java.util.Set; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.StringData; +import org.apache.iceberg.SortKey; +import org.apache.iceberg.relocated.com.google.common.collect.Sets; +import org.junit.jupiter.api.Test; + +public class TestRangePartitioner { + private final int numPartitions = 4; + + @Test + public void testRoundRobinRecordsBeforeStatisticsAvailable() { + RangePartitioner partitioner = new RangePartitioner(SCHEMA, SORT_ORDER); + Set results = Sets.newHashSetWithExpectedSize(numPartitions); + for (int i = 0; i < numPartitions; ++i) { + results.add( + partitioner.partition( + StatisticsOrRecord.fromRecord(GenericRowData.of(StringData.fromString("a"), 1)), + numPartitions)); + } + + // round-robin. every partition should get an assignment + assertThat(results).containsExactlyInAnyOrder(0, 1, 2, 3); + } + + @Test + public void testRoundRobinStatisticsWrapper() { + RangePartitioner partitioner = new RangePartitioner(SCHEMA, SORT_ORDER); + Set results = Sets.newHashSetWithExpectedSize(numPartitions); + for (int i = 0; i < numPartitions; ++i) { + GlobalStatistics statistics = + GlobalStatistics.fromRangeBounds(1L, new SortKey[] {CHAR_KEYS.get("a")}); + results.add( + partitioner.partition(StatisticsOrRecord.fromStatistics(statistics), numPartitions)); + } + + // round-robin. every partition should get an assignment + assertThat(results).containsExactlyInAnyOrder(0, 1, 2, 3); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchRangePartitioner.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchRangePartitioner.java new file mode 100644 index 000000000000..378c6afff077 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchRangePartitioner.java @@ -0,0 +1,88 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink.shuffle; + +import static org.assertj.core.api.Assertions.assertThat; + +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.SortKey; +import org.apache.iceberg.SortOrder; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.RowDataWrapper; +import org.apache.iceberg.flink.TestFixtures; +import org.junit.jupiter.api.Test; + +public class TestSketchRangePartitioner { + // sort on the long id field + private static final SortOrder SORT_ORDER = + SortOrder.builderFor(TestFixtures.SCHEMA).asc("id").build(); + private static final SortKey SORT_KEY = new SortKey(TestFixtures.SCHEMA, SORT_ORDER); + private static final RowType ROW_TYPE = FlinkSchemaUtil.convert(TestFixtures.SCHEMA); + private static final int NUM_PARTITIONS = 16; + private static final long RANGE_STEP = 1_000; + private static final long MAX_ID = RANGE_STEP * NUM_PARTITIONS; + private static final SortKey[] RANGE_BOUNDS = createRangeBounds(); + + /** + * To understand how range bounds are used in range partitioning, here is an example for human + * ages with 4 partitions: [15, 32, 60]. The 4 ranges would be + * + *
+   * <ul>
+   *   <li>age <= 15
+   *   <li>age > 15 && age <= 32
+   *   <li>age >32 && age <= 60
+   *   <li>age > 60
+   * </ul>
+ */ + private static SortKey[] createRangeBounds() { + SortKey[] rangeBounds = new SortKey[NUM_PARTITIONS - 1]; + for (int i = 0; i < NUM_PARTITIONS - 1; ++i) { + RowData rowData = + GenericRowData.of( + StringData.fromString("data"), + RANGE_STEP * (i + 1), + StringData.fromString("2023-06-20")); + RowDataWrapper keyWrapper = new RowDataWrapper(ROW_TYPE, TestFixtures.SCHEMA.asStruct()); + keyWrapper.wrap(rowData); + SortKey sortKey = new SortKey(TestFixtures.SCHEMA, SORT_ORDER); + sortKey.wrap(keyWrapper); + rangeBounds[i] = sortKey; + } + + return rangeBounds; + } + + @Test + public void testRangePartitioningWithRangeBounds() { + SketchRangePartitioner partitioner = + new SketchRangePartitioner(TestFixtures.SCHEMA, SORT_ORDER, RANGE_BOUNDS); + GenericRowData row = + GenericRowData.of(StringData.fromString("data"), 0L, StringData.fromString("2023-06-20")); + for (long id = 0; id < MAX_ID; ++id) { + row.setField(1, id); + int partition = partitioner.partition(row, NUM_PARTITIONS); + assertThat(partition).isGreaterThanOrEqualTo(0).isLessThan(NUM_PARTITIONS); + int expectedPartition = id == 0L ? 0 : (int) ((id - 1) / RANGE_STEP); + assertThat(partition).isEqualTo(expectedPartition); + } + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchUtil.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchUtil.java index 31dae5c76aeb..16202c075ea0 100644 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchUtil.java +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchUtil.java @@ -19,10 +19,13 @@ package org.apache.iceberg.flink.sink.shuffle; import static org.apache.iceberg.flink.sink.shuffle.Fixtures.CHAR_KEYS; +import static org.apache.iceberg.flink.sink.shuffle.Fixtures.SORT_ORDER_COMPARTOR; import static org.assertj.core.api.Assertions.assertThat; import org.apache.iceberg.SortKey; import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; public class TestSketchUtil { @Test @@ -55,7 +58,7 @@ public void testRangeBoundsOneChannel() { assertThat( SketchUtil.rangeBounds( 1, - Fixtures.SORT_ORDER_COMPARTOR, + SORT_ORDER_COMPARTOR, new SortKey[] { CHAR_KEYS.get("a"), CHAR_KEYS.get("b"), @@ -72,7 +75,7 @@ public void testRangeBoundsDivisible() { assertThat( SketchUtil.rangeBounds( 3, - Fixtures.SORT_ORDER_COMPARTOR, + SORT_ORDER_COMPARTOR, new SortKey[] { CHAR_KEYS.get("a"), CHAR_KEYS.get("b"), @@ -90,7 +93,7 @@ public void testRangeBoundsNonDivisible() { assertThat( SketchUtil.rangeBounds( 4, - Fixtures.SORT_ORDER_COMPARTOR, + SORT_ORDER_COMPARTOR, new SortKey[] { CHAR_KEYS.get("a"), CHAR_KEYS.get("b"), @@ -113,7 +116,7 @@ public void testRangeBoundsSkipDuplicates() { assertThat( SketchUtil.rangeBounds( 4, - Fixtures.SORT_ORDER_COMPARTOR, + SORT_ORDER_COMPARTOR, new SortKey[] { CHAR_KEYS.get("a"), CHAR_KEYS.get("b"), @@ -130,4 +133,57 @@ public void testRangeBoundsSkipDuplicates() { // skipped duplicate c's .containsExactly(CHAR_KEYS.get("c"), CHAR_KEYS.get("g"), CHAR_KEYS.get("j")); } + + @ParameterizedTest + @ValueSource(ints = {4, 6}) + public void testPartitioningAndScaleUp(int numPartitions) { + // Range bounds are calculated based on 4 partitions + SortKey[] rangeBounds = + new SortKey[] {CHAR_KEYS.get("c"), CHAR_KEYS.get("j"), CHAR_KEYS.get("m")}; + + // <= c + assertPartition(0, CHAR_KEYS.get("a"), numPartitions, rangeBounds); + assertPartition(0, 
CHAR_KEYS.get("c"), numPartitions, rangeBounds); + // > c && <= j + assertPartition(1, CHAR_KEYS.get("d"), numPartitions, rangeBounds); + assertPartition(1, CHAR_KEYS.get("i"), numPartitions, rangeBounds); + assertPartition(1, CHAR_KEYS.get("j"), numPartitions, rangeBounds); + // > j && <= m + assertPartition(2, CHAR_KEYS.get("k"), numPartitions, rangeBounds); + assertPartition(2, CHAR_KEYS.get("l"), numPartitions, rangeBounds); + assertPartition(2, CHAR_KEYS.get("m"), numPartitions, rangeBounds); + // > m + assertPartition(3, CHAR_KEYS.get("n"), numPartitions, rangeBounds); + assertPartition(3, CHAR_KEYS.get("z"), numPartitions, rangeBounds); + } + + @Test + public void testPartitionScaleDown() { + // Range bounds are calculated based on 4 partitions + SortKey[] rangeBounds = + new SortKey[] {CHAR_KEYS.get("c"), CHAR_KEYS.get("j"), CHAR_KEYS.get("m")}; + int numPartitions = 3; + + // <= c + assertPartition(0, CHAR_KEYS.get("a"), numPartitions, rangeBounds); + assertPartition(0, CHAR_KEYS.get("c"), numPartitions, rangeBounds); + // > c && <= j + assertPartition(1, CHAR_KEYS.get("d"), numPartitions, rangeBounds); + assertPartition(1, CHAR_KEYS.get("i"), numPartitions, rangeBounds); + assertPartition(1, CHAR_KEYS.get("j"), numPartitions, rangeBounds); + // > j && <= m + assertPartition(2, CHAR_KEYS.get("k"), numPartitions, rangeBounds); + assertPartition(2, CHAR_KEYS.get("l"), numPartitions, rangeBounds); + assertPartition(2, CHAR_KEYS.get("m"), numPartitions, rangeBounds); + // > m + // reassigns out-of-range partitions via mod (% 3 in this case) + assertPartition(0, CHAR_KEYS.get("n"), numPartitions, rangeBounds); + assertPartition(0, CHAR_KEYS.get("z"), numPartitions, rangeBounds); + } + + private static void assertPartition( + int expectedPartition, SortKey key, int numPartitions, SortKey[] rangeBounds) { + assertThat(SketchUtil.partition(key, numPartitions, rangeBounds, SORT_ORDER_COMPARTOR)) + .isEqualTo(expectedPartition); + } } diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeyUtil.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeyUtil.java new file mode 100644 index 000000000000..1be7e27f2c01 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeyUtil.java @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink.shuffle; + +import static org.assertj.core.api.Assertions.assertThat; + +import org.apache.iceberg.NullOrder; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SortDirection; +import org.apache.iceberg.SortOrder; +import org.apache.iceberg.expressions.Expressions; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.Test; + +public class TestSortKeyUtil { + @Test + public void testResultSchema() { + Schema schema = + new Schema( + Types.NestedField.required(1, "id", Types.StringType.get()), + Types.NestedField.required(2, "ratio", Types.DoubleType.get()), + Types.NestedField.optional( + 3, + "user", + Types.StructType.of( + Types.NestedField.required(11, "name", Types.StringType.get()), + Types.NestedField.required(12, "ts", Types.TimestampType.withoutZone()), + Types.NestedField.optional(13, "device_id", Types.UUIDType.get()), + Types.NestedField.optional( + 14, + "location", + Types.StructType.of( + Types.NestedField.required(101, "lat", Types.FloatType.get()), + Types.NestedField.required(102, "long", Types.FloatType.get()), + Types.NestedField.required(103, "blob", Types.BinaryType.get())))))); + + SortOrder sortOrder = + SortOrder.builderFor(schema) + .asc("ratio") + .sortBy(Expressions.hour("user.ts"), SortDirection.ASC, NullOrder.NULLS_FIRST) + .sortBy( + Expressions.bucket("user.device_id", 16), SortDirection.ASC, NullOrder.NULLS_FIRST) + .sortBy( + Expressions.truncate("user.location.blob", 16), + SortDirection.ASC, + NullOrder.NULLS_FIRST) + .build(); + + assertThat(SortKeyUtil.sortKeySchema(schema, sortOrder).asStruct()) + .isEqualTo( + Types.StructType.of( + Types.NestedField.required(0, "ratio_0", Types.DoubleType.get()), + Types.NestedField.required(1, "ts_1", Types.IntegerType.get()), + Types.NestedField.optional(2, "device_id_2", Types.IntegerType.get()), + Types.NestedField.required(3, "blob_3", Types.BinaryType.get()))); + } +} From 9a67f0b85c82ae09089e150f2c663a65f145670e Mon Sep 17 00:00:00 2001 From: Piotr Findeisen Date: Fri, 2 Aug 2024 09:23:23 +0200 Subject: [PATCH 13/55] Drop support for Java 8 (#10518) --- .github/workflows/delta-conversion-ci.yml | 4 +- .github/workflows/flink-ci.yml | 2 +- .github/workflows/hive-ci.yml | 4 +- .github/workflows/java-ci.yml | 6 +-- .github/workflows/publish-snapshot.yml | 2 +- .github/workflows/spark-ci.yml | 2 +- README.md | 2 +- build.gradle | 52 ++++++++--------------- deploy.gradle | 4 +- hive-runtime/build.gradle | 2 +- jmh.gradle | 4 +- site/docs/contribute.md | 2 +- tasks.gradle | 8 ++-- 13 files changed, 38 insertions(+), 56 deletions(-) diff --git a/.github/workflows/delta-conversion-ci.yml b/.github/workflows/delta-conversion-ci.yml index ac5314e8afb8..cd16847cf95a 100644 --- a/.github/workflows/delta-conversion-ci.yml +++ b/.github/workflows/delta-conversion-ci.yml @@ -71,7 +71,7 @@ jobs: runs-on: ubuntu-22.04 strategy: matrix: - jvm: [8, 11, 17, 21] + jvm: [11, 17, 21] env: SPARK_LOCAL_IP: localhost steps: @@ -100,7 +100,7 @@ jobs: runs-on: ubuntu-22.04 strategy: matrix: - jvm: [8, 11, 17, 21] + jvm: [11, 17, 21] env: SPARK_LOCAL_IP: localhost steps: diff --git a/.github/workflows/flink-ci.yml b/.github/workflows/flink-ci.yml index d2e249c279f7..3df36e2be86a 100644 --- a/.github/workflows/flink-ci.yml +++ b/.github/workflows/flink-ci.yml @@ -73,7 +73,7 @@ jobs: runs-on: ubuntu-22.04 strategy: matrix: - jvm: [8, 11, 17, 21] + jvm: [11, 17, 21] flink: ['1.17', '1.18', '1.19'] exclude: # Flink 1.17 does not support Java 17. 
diff --git a/.github/workflows/hive-ci.yml b/.github/workflows/hive-ci.yml index ee487807cff4..6ad9f58410d6 100644 --- a/.github/workflows/hive-ci.yml +++ b/.github/workflows/hive-ci.yml @@ -69,7 +69,7 @@ jobs: runs-on: ubuntu-22.04 strategy: matrix: - jvm: [8, 11, 17, 21] + jvm: [11, 17, 21] env: SPARK_LOCAL_IP: localhost steps: @@ -98,7 +98,7 @@ jobs: runs-on: ubuntu-22.04 strategy: matrix: - jvm: [8, 11, 17, 21] + jvm: [11, 17, 21] env: SPARK_LOCAL_IP: localhost steps: diff --git a/.github/workflows/java-ci.yml b/.github/workflows/java-ci.yml index e1fd90116ef1..0d39ee8646ad 100644 --- a/.github/workflows/java-ci.yml +++ b/.github/workflows/java-ci.yml @@ -65,7 +65,7 @@ jobs: runs-on: ubuntu-22.04 strategy: matrix: - jvm: [8, 11, 17, 21] + jvm: [11, 17, 21] env: SPARK_LOCAL_IP: localhost steps: @@ -94,7 +94,7 @@ jobs: runs-on: ubuntu-22.04 strategy: matrix: - jvm: [8, 11, 17, 21] + jvm: [11, 17, 21] steps: - uses: actions/checkout@v4 - uses: actions/setup-java@v4 @@ -107,7 +107,7 @@ jobs: runs-on: ubuntu-22.04 strategy: matrix: - jvm: [8, 11, 17, 21] + jvm: [11, 17, 21] steps: - uses: actions/checkout@v4 - uses: actions/setup-java@v4 diff --git a/.github/workflows/publish-snapshot.yml b/.github/workflows/publish-snapshot.yml index 411c3a5636c9..6acee54bde3e 100644 --- a/.github/workflows/publish-snapshot.yml +++ b/.github/workflows/publish-snapshot.yml @@ -37,7 +37,7 @@ jobs: - uses: actions/setup-java@v4 with: distribution: zulu - java-version: 8 + java-version: 11 - run: | ./gradlew printVersion ./gradlew -DallModules publishApachePublicationToMavenRepository -PmavenUser=${{ secrets.NEXUS_USER }} -PmavenPassword=${{ secrets.NEXUS_PW }} diff --git a/.github/workflows/spark-ci.yml b/.github/workflows/spark-ci.yml index 7a47beeed7a4..1cc0425b73a3 100644 --- a/.github/workflows/spark-ci.yml +++ b/.github/workflows/spark-ci.yml @@ -71,7 +71,7 @@ jobs: runs-on: ubuntu-22.04 strategy: matrix: - jvm: [8, 11, 17, 21] + jvm: [11, 17, 21] spark: ['3.3', '3.4', '3.5'] scala: ['2.12', '2.13'] exclude: diff --git a/README.md b/README.md index fe0d2b94c308..7d2056077804 100644 --- a/README.md +++ b/README.md @@ -51,7 +51,7 @@ Community discussions happen primarily on the [dev mailing list][dev-list] or on ### Building -Iceberg is built using Gradle with Java 8, 11, 17, or 21. +Iceberg is built using Gradle with Java 11, 17, or 21. 
* To invoke a build and run tests: `./gradlew build` * To skip tests: `./gradlew build -x test -x integrationTest` diff --git a/build.gradle b/build.gradle index a0b1a2e018bf..450a92f24b7c 100644 --- a/build.gradle +++ b/build.gradle @@ -56,10 +56,7 @@ try { project.logger.error(e.getMessage()) } -if (JavaVersion.current() == JavaVersion.VERSION_1_8) { - project.ext.jdkVersion = '8' - project.ext.extraJvmArgs = [] -} else if (JavaVersion.current() == JavaVersion.VERSION_11) { +if (JavaVersion.current() == JavaVersion.VERSION_11) { project.ext.jdkVersion = '11' project.ext.extraJvmArgs = [] } else if (JavaVersion.current() == JavaVersion.VERSION_17 || JavaVersion.current() == JavaVersion.VERSION_21) { @@ -86,7 +83,7 @@ if (JavaVersion.current() == JavaVersion.VERSION_1_8) { "--add-opens", "java.base/sun.security.action=ALL-UNNAMED", "--add-opens", "java.base/sun.util.calendar=ALL-UNNAMED"] } else { - throw new GradleException("This build must be run with JDK 8 or 11 or 17 or 21 but was executed with JDK " + JavaVersion.current()) + throw new GradleException("This build must be run with JDK 11 or 17 or 21 but was executed with JDK " + JavaVersion.current()) } tasks.withType(AbstractArchiveTask).configureEach { @@ -887,23 +884,12 @@ project(':iceberg-pig') { } project(':iceberg-nessie') { - if (JavaVersion.current().isJava11Compatible()) { - test { - useJUnitPlatform() - } - compileTestJava { - sourceCompatibility = "11" - targetCompatibility = "11" - } - } else { - // Do not test Nessie against Java 8, because in-JVM testing requires Nessie server components, - // which require Java 11+. - test { - enabled = false - } - compileTestJava { - enabled = false - } + test { + useJUnitPlatform() + } + compileTestJava { + sourceCompatibility = "11" + targetCompatibility = "11" } dependencies { @@ -922,21 +908,19 @@ project(':iceberg-nessie') { // Only there to prevent "warning: unknown enum constant SchemaType.OBJECT" compile messages compileOnly libs.microprofile.openapi.api - if (JavaVersion.current().isJava11Compatible()) { - testImplementation libs.nessie.jaxrs.testextension - testImplementation libs.nessie.versioned.storage.inmemory.tests - testImplementation libs.nessie.versioned.storage.testextension - // Need to "pull in" el-api explicitly :( - testImplementation libs.jakarta.el.api + testImplementation libs.nessie.jaxrs.testextension + testImplementation libs.nessie.versioned.storage.inmemory.tests + testImplementation libs.nessie.versioned.storage.testextension + // Need to "pull in" el-api explicitly :( + testImplementation libs.jakarta.el.api - testImplementation libs.avro.avro + testImplementation libs.avro.avro - testImplementation project(path: ':iceberg-api', configuration: 'testArtifacts') - testImplementation project(path: ':iceberg-core', configuration: 'testArtifacts') + testImplementation project(path: ':iceberg-api', configuration: 'testArtifacts') + testImplementation project(path: ':iceberg-core', configuration: 'testArtifacts') - // Only there to prevent "warning: unknown enum constant SchemaType.OBJECT" compile messages - testCompileOnly libs.microprofile.openapi.api - } + // Only there to prevent "warning: unknown enum constant SchemaType.OBJECT" compile messages + testCompileOnly libs.microprofile.openapi.api } } diff --git a/deploy.gradle b/deploy.gradle index 3e2eda2a5a60..462448303253 100644 --- a/deploy.gradle +++ b/deploy.gradle @@ -17,8 +17,8 @@ * under the License. 
*/ -if (project.hasProperty('release') && jdkVersion != '8') { - throw new GradleException("Releases must be built with Java 8") +if (project.hasProperty('release') && jdkVersion != '11') { + throw new GradleException("Releases must be built with Java 11") } subprojects { diff --git a/hive-runtime/build.gradle b/hive-runtime/build.gradle index b2051d529512..6f891be8087e 100644 --- a/hive-runtime/build.gradle +++ b/hive-runtime/build.gradle @@ -38,7 +38,7 @@ project(':iceberg-hive-runtime') { dependencies { implementation project(':iceberg-mr') - if (jdkVersion == '8' && hiveVersions.contains("3")) { + if (hiveVersions.contains("3")) { implementation project(':iceberg-hive3') } implementation(project(':iceberg-nessie')) { diff --git a/jmh.gradle b/jmh.gradle index 80f5f8d0ea63..5e5e0151219f 100644 --- a/jmh.gradle +++ b/jmh.gradle @@ -17,8 +17,8 @@ * under the License. */ -if (jdkVersion != '8' && jdkVersion != '11' && jdkVersion != '17' && jdkVersion != '21') { - throw new GradleException("The JMH benchmarks must be run with JDK 8 or JDK 11 or JDK 17 or JDK 21") +if (jdkVersion != '11' && jdkVersion != '17' && jdkVersion != '21') { + throw new GradleException("The JMH benchmarks must be run with JDK 11 or JDK 17 or JDK 21") } def flinkVersions = (System.getProperty("flinkVersions") != null ? System.getProperty("flinkVersions") : System.getProperty("defaultFlinkVersions")).split(",") diff --git a/site/docs/contribute.md b/site/docs/contribute.md index 88a14e7153b4..60bc89f9537f 100644 --- a/site/docs/contribute.md +++ b/site/docs/contribute.md @@ -84,7 +84,7 @@ settle disagreements or to force a decision. ## Building the Project Locally -Iceberg is built using Gradle with Java 8, 11, 17, or 21. +Iceberg is built using Gradle with Java 11, 17, or 21. 
* To invoke a build and run tests: `./gradlew build` * To skip tests: `./gradlew build -x test -x integrationTest` diff --git a/tasks.gradle b/tasks.gradle index 5fc24d41ae62..5515d7b75052 100644 --- a/tasks.gradle +++ b/tasks.gradle @@ -32,11 +32,9 @@ task aggregateJavadoc(type: Javadoc) { doLast { // Fix bug with search - if (JavaVersion.current() >= JavaVersion.VERSION_11) { - // Append the fix to the file - def searchScript = new File("site/docs/javadoc/${getJavadocVersion()}" + '/search.js') - searchScript.append JAVADOC_FIX_SEARCH_STR - } + // Append the fix to the file + def searchScript = new File("site/docs/javadoc/${getJavadocVersion()}" + '/search.js') + searchScript.append JAVADOC_FIX_SEARCH_STR } } From c2db97ce9a31d0af21341deaa22f7b91da04cd32 Mon Sep 17 00:00:00 2001 From: Eduard Tudenhoefner Date: Fri, 2 Aug 2024 14:02:56 +0200 Subject: [PATCH 14/55] Build: Bump com.adobe.testing:s3mock-junit5 from 2.11.0 to 2.17.0 (#10851) --- aws/src/test/java/org/apache/iceberg/aws/s3/TestS3FileIO.java | 3 ++- .../test/java/org/apache/iceberg/aws/s3/TestS3InputStream.java | 3 ++- .../java/org/apache/iceberg/aws/s3/TestS3OutputStream.java | 3 ++- gradle/libs.versions.toml | 2 +- 4 files changed, 7 insertions(+), 4 deletions(-) diff --git a/aws/src/test/java/org/apache/iceberg/aws/s3/TestS3FileIO.java b/aws/src/test/java/org/apache/iceberg/aws/s3/TestS3FileIO.java index 135eb76772cc..38489e367434 100644 --- a/aws/src/test/java/org/apache/iceberg/aws/s3/TestS3FileIO.java +++ b/aws/src/test/java/org/apache/iceberg/aws/s3/TestS3FileIO.java @@ -81,6 +81,7 @@ import software.amazon.awssdk.regions.Region; import software.amazon.awssdk.services.s3.S3Client; import software.amazon.awssdk.services.s3.model.BucketAlreadyExistsException; +import software.amazon.awssdk.services.s3.model.BucketAlreadyOwnedByYouException; import software.amazon.awssdk.services.s3.model.CreateBucketRequest; import software.amazon.awssdk.services.s3.model.DeleteObjectsRequest; import software.amazon.awssdk.services.s3.model.DeleteObjectsResponse; @@ -447,7 +448,7 @@ private void createRandomObjects(String prefix, int count) { private void createBucket(String bucketName) { try { s3.get().createBucket(CreateBucketRequest.builder().bucket(bucketName).build()); - } catch (BucketAlreadyExistsException e) { + } catch (BucketAlreadyExistsException | BucketAlreadyOwnedByYouException e) { // do nothing } } diff --git a/aws/src/test/java/org/apache/iceberg/aws/s3/TestS3InputStream.java b/aws/src/test/java/org/apache/iceberg/aws/s3/TestS3InputStream.java index 95f4d09a7e0b..ed71e259a26c 100644 --- a/aws/src/test/java/org/apache/iceberg/aws/s3/TestS3InputStream.java +++ b/aws/src/test/java/org/apache/iceberg/aws/s3/TestS3InputStream.java @@ -35,6 +35,7 @@ import software.amazon.awssdk.core.sync.RequestBody; import software.amazon.awssdk.services.s3.S3Client; import software.amazon.awssdk.services.s3.model.BucketAlreadyExistsException; +import software.amazon.awssdk.services.s3.model.BucketAlreadyOwnedByYouException; import software.amazon.awssdk.services.s3.model.CreateBucketRequest; import software.amazon.awssdk.services.s3.model.PutObjectRequest; @@ -195,7 +196,7 @@ private void writeS3Data(S3URI uri, byte[] data) throws IOException { private void createBucket(String bucketName) { try { s3.createBucket(CreateBucketRequest.builder().bucket(bucketName).build()); - } catch (BucketAlreadyExistsException e) { + } catch (BucketAlreadyExistsException | BucketAlreadyOwnedByYouException e) { // don't do anything } } diff --git 
a/aws/src/test/java/org/apache/iceberg/aws/s3/TestS3OutputStream.java b/aws/src/test/java/org/apache/iceberg/aws/s3/TestS3OutputStream.java index 6fbe59e47f7f..88488bf4c313 100644 --- a/aws/src/test/java/org/apache/iceberg/aws/s3/TestS3OutputStream.java +++ b/aws/src/test/java/org/apache/iceberg/aws/s3/TestS3OutputStream.java @@ -62,6 +62,7 @@ import software.amazon.awssdk.services.s3.S3Client; import software.amazon.awssdk.services.s3.model.AbortMultipartUploadRequest; import software.amazon.awssdk.services.s3.model.BucketAlreadyExistsException; +import software.amazon.awssdk.services.s3.model.BucketAlreadyOwnedByYouException; import software.amazon.awssdk.services.s3.model.CompleteMultipartUploadRequest; import software.amazon.awssdk.services.s3.model.CreateBucketRequest; import software.amazon.awssdk.services.s3.model.GetObjectRequest; @@ -339,7 +340,7 @@ private S3URI randomURI() { private void createBucket(String bucketName) { try { s3.createBucket(CreateBucketRequest.builder().bucket(bucketName).build()); - } catch (BucketAlreadyExistsException e) { + } catch (BucketAlreadyExistsException | BucketAlreadyOwnedByYouException e) { // do nothing } } diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index 055475809296..5ede5abf23cc 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -74,7 +74,7 @@ orc = "1.9.4" parquet = "1.13.1" pig = "0.17.0" roaringbitmap = "1.2.1" -s3mock-junit5 = "2.11.0" +s3mock-junit5 = "2.17.0" scala-collection-compat = "2.12.0" slf4j = "1.7.36" snowflake-jdbc = "3.18.0" From 122176a37fed02f6883a75fee440fb1e7ba161ec Mon Sep 17 00:00:00 2001 From: Eduard Tudenhoefner Date: Fri, 2 Aug 2024 14:08:43 +0200 Subject: [PATCH 15/55] Core: Upgrade Jetty and Servlet API (#10850) This is the latest Jetty version that runs with JDK11 --- build.gradle | 1 + .../java/org/apache/iceberg/rest/RESTCatalogServlet.java | 6 +++--- gradle/libs.versions.toml | 4 +++- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/build.gradle b/build.gradle index 450a92f24b7c..36928c869502 100644 --- a/build.gradle +++ b/build.gradle @@ -361,6 +361,7 @@ project(':iceberg-core') { } testImplementation libs.jetty.servlet + testImplementation libs.jakarta.servlet testImplementation libs.jetty.server testImplementation libs.mockserver.netty testImplementation libs.mockserver.client.java diff --git a/core/src/test/java/org/apache/iceberg/rest/RESTCatalogServlet.java b/core/src/test/java/org/apache/iceberg/rest/RESTCatalogServlet.java index 954f5130eca2..f456bb4d354d 100644 --- a/core/src/test/java/org/apache/iceberg/rest/RESTCatalogServlet.java +++ b/core/src/test/java/org/apache/iceberg/rest/RESTCatalogServlet.java @@ -20,6 +20,9 @@ import static java.lang.String.format; +import jakarta.servlet.http.HttpServlet; +import jakarta.servlet.http.HttpServletRequest; +import jakarta.servlet.http.HttpServletResponse; import java.io.IOException; import java.io.InputStreamReader; import java.io.Reader; @@ -30,9 +33,6 @@ import java.util.function.Consumer; import java.util.function.Function; import java.util.stream.Collectors; -import javax.servlet.http.HttpServlet; -import javax.servlet.http.HttpServletRequest; -import javax.servlet.http.HttpServletResponse; import org.apache.hc.core5.http.ContentType; import org.apache.hc.core5.http.HttpHeaders; import org.apache.iceberg.exceptions.RESTException; diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index 5ede5abf23cc..de109572dec4 100644 --- a/gradle/libs.versions.toml +++ 
b/gradle/libs.versions.toml @@ -57,9 +57,10 @@ jackson213 = { strictly = "2.13.4"} jackson214 = { strictly = "2.14.2"} jackson215 = { strictly = "2.15.2"} jakarta-el-api = "3.0.3" +jakarta-servlet-api = "6.1.0" jaxb-api = "2.3.1" jaxb-runtime = "2.3.9" -jetty = "9.4.55.v20240627" +jetty = "11.0.22" junit = "5.10.1" kafka = "3.7.1" kryo-shaded = "4.0.3" @@ -196,6 +197,7 @@ flink119-test-utils = { module = "org.apache.flink:flink-test-utils", version.re flink119-test-utilsjunit = { module = "org.apache.flink:flink-test-utils-junit", version.ref = "flink119" } guava-testlib = { module = "com.google.guava:guava-testlib", version.ref = "guava" } jakarta-el-api = { module = "jakarta.el:jakarta.el-api", version.ref = "jakarta-el-api" } +jakarta-servlet = {module = "jakarta.servlet:jakarta.servlet-api", version.ref = "jakarta-servlet-api"} jetty-server = { module = "org.eclipse.jetty:jetty-server", version.ref = "jetty" } jetty-servlet = { module = "org.eclipse.jetty:jetty-servlet", version.ref = "jetty" } junit-jupiter = { module = "org.junit.jupiter:junit-jupiter", version.ref = "junit" } From 08aed72beeedeeffd80c87803e075593f91e9ba7 Mon Sep 17 00:00:00 2001 From: Robert Stupp Date: Fri, 2 Aug 2024 15:14:40 +0200 Subject: [PATCH 16/55] Build: Configure options.release = 11 / remove com.palantir.baseline-release-compatibility plugin (#10849) --- baseline.gradle | 9 +++++++-- build.gradle | 14 ++------------ 2 files changed, 9 insertions(+), 14 deletions(-) diff --git a/baseline.gradle b/baseline.gradle index 62ade9a632f4..f97b872e93e7 100644 --- a/baseline.gradle +++ b/baseline.gradle @@ -42,9 +42,14 @@ subprojects { apply plugin: 'com.palantir.baseline-error-prone' } apply plugin: 'com.palantir.baseline-class-uniqueness' - apply plugin: 'com.palantir.baseline-reproducibility' + // What 'com.palantir.baseline-reproducibility' used to do, except the check for the + // `sourceCompatibility` Java compile option, which conflicts with the `release` compile option. + tasks.withType(AbstractArchiveTask.class).configureEach(t -> { + t.setPreserveFileTimestamps(false); + t.setReproducibleFileOrder(true); + t.setDuplicatesStrategy(DuplicatesStrategy.WARN); + }); apply plugin: 'com.palantir.baseline-exact-dependencies' - apply plugin: 'com.palantir.baseline-release-compatibility' // We need to update Google Java Format to 1.17.0+ to run spotless on JDK 8, but that requires dropping support for JDK 8. if (JavaVersion.current() == JavaVersion.VERSION_21) { task spotlessApply { diff --git a/build.gradle b/build.gradle index 36928c869502..a1aef9b37fd1 100644 --- a/build.gradle +++ b/build.gradle @@ -191,21 +191,15 @@ subprojects { testArtifacts } - compileJava { - options.encoding = "UTF-8" - } - - compileTestJava { + tasks.withType(JavaCompile.class).configureEach { options.encoding = "UTF-8" + options.release = 11 } javadoc { options.encoding = 'UTF-8' } - sourceCompatibility = '1.8' - targetCompatibility = '1.8' - dependencies { implementation libs.slf4j.api @@ -888,10 +882,6 @@ project(':iceberg-nessie') { test { useJUnitPlatform() } - compileTestJava { - sourceCompatibility = "11" - targetCompatibility = "11" - } dependencies { api project(':iceberg-api') From 39295753e8e949096236a5dc8b063c77f976650b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 2 Aug 2024 15:15:06 +0200 Subject: [PATCH 17/55] Build: Bump kafka from 3.7.1 to 3.8.0 (#10797) Bumps `kafka` from 3.7.1 to 3.8.0. 
Updates `org.apache.kafka:kafka-clients` from 3.7.1 to 3.8.0 Updates `org.apache.kafka:connect-api` from 3.7.1 to 3.8.0 Updates `org.apache.kafka:connect-json` from 3.7.1 to 3.8.0 --- updated-dependencies: - dependency-name: org.apache.kafka:kafka-clients dependency-type: direct:production update-type: version-update:semver-minor - dependency-name: org.apache.kafka:connect-api dependency-type: direct:production update-type: version-update:semver-minor - dependency-name: org.apache.kafka:connect-json dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- gradle/libs.versions.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index de109572dec4..02702955a5fe 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -62,7 +62,7 @@ jaxb-api = "2.3.1" jaxb-runtime = "2.3.9" jetty = "11.0.22" junit = "5.10.1" -kafka = "3.7.1" +kafka = "3.8.0" kryo-shaded = "4.0.3" microprofile-openapi-api = "3.1.1" mockito = "4.11.0" From 674214cb6bbc8f6e739245293ba31f9a3d36114e Mon Sep 17 00:00:00 2001 From: Piotr Findeisen Date: Fri, 2 Aug 2024 15:44:46 +0200 Subject: [PATCH 18/55] Build: Update baseline gradle plugin to 5.58.0 (#10788) --- baseline.gradle | 10 ++++++++++ build.gradle | 7 +------ .../main/java/org/apache/iceberg/StaticDataTask.java | 6 +++++- .../iceberg/flink/sink/shuffle/KeyAssignment.java | 6 +++++- .../iceberg/flink/sink/shuffle/MapAssignment.java | 2 ++ .../iceberg/flink/sink/shuffle/KeyAssignment.java | 6 +++++- .../iceberg/flink/sink/shuffle/MapAssignment.java | 2 ++ .../iceberg/flink/sink/shuffle/KeyAssignment.java | 6 +++++- .../iceberg/flink/sink/shuffle/MapAssignment.java | 2 ++ 9 files changed, 37 insertions(+), 10 deletions(-) diff --git a/baseline.gradle b/baseline.gradle index f97b872e93e7..e3fe602e91c9 100644 --- a/baseline.gradle +++ b/baseline.gradle @@ -91,11 +91,17 @@ subprojects { '-Xep:CollectionUndefinedEquality:ERROR', // specific to Palantir - Uses name `log` but we use name `LOG` '-Xep:ConsistentLoggerName:OFF', + // TODO (https://github.com/apache/iceberg/issues/10853) this is a recently added check. Figure out whether we adjust the code or suppress for good + '-Xep:DangerousJavaDeserialization:WARN', '-Xep:DangerousThreadPoolExecutorUsage:OFF', // subclasses are not equal '-Xep:EqualsGetClass:OFF', // specific to Palantir '-Xep:FinalClass:OFF', + // TODO (https://github.com/apache/iceberg/issues/10854) this is a recently added check. Figure out whether we adjust the code or suppress for good + '-Xep:FormatStringAnnotation:WARN', + // TODO (https://github.com/apache/iceberg/issues/10855) this is a recently added check. 
Figure out whether we adjust the code or suppress for good + '-Xep:ImmutablesReferenceEquality:WARN', '-Xep:IntLongMath:ERROR', // prefer method references over lambdas '-Xep:LambdaMethodReference:ERROR', @@ -108,6 +114,8 @@ subprojects { '-Xep:MissingSummary:ERROR', // Enforce hashCode over hash '-Xep:ObjectsHashCodeUnnecessaryVarargs:ERROR', + // Triggers false-positives whenever relocated @VisibleForTesting is used + '-Xep:PreferCommonAnnotations:OFF', // specific to Palantir '-Xep:PreferSafeLoggableExceptions:OFF', '-Xep:PreferSafeLogger:OFF', @@ -125,6 +133,8 @@ subprojects { '-Xep:StringSplitter:ERROR', '-Xep:TypeParameterShadowing:OFF', '-Xep:TypeParameterUnusedInFormals:OFF', + // Palantir's UnnecessarilyQualified may throw during analysis + '-Xep:UnnecessarilyQualified:OFF', ) } } diff --git a/build.gradle b/build.gradle index a1aef9b37fd1..60fb5e7830d3 100644 --- a/build.gradle +++ b/build.gradle @@ -27,12 +27,7 @@ buildscript { } dependencies { classpath 'io.github.goooler.shadow:shadow-gradle-plugin:8.1.8' - classpath 'com.palantir.baseline:gradle-baseline-java:4.42.0' - // com.palantir.baseline:gradle-baseline-java:4.42.0 (the last version supporting Java 8) pulls - // in an old version of the errorprone, which doesn't work w/ Gradle 8, so bump errorpone as - // well. - classpath "net.ltgt.gradle:gradle-errorprone-plugin:3.1.0" - + classpath 'com.palantir.baseline:gradle-baseline-java:5.58.0' classpath 'com.diffplug.spotless:spotless-plugin-gradle:6.13.0' classpath 'gradle.plugin.org.inferred:gradle-processors:3.7.0' classpath 'me.champeau.jmh:jmh-gradle-plugin:0.7.2' diff --git a/core/src/main/java/org/apache/iceberg/StaticDataTask.java b/core/src/main/java/org/apache/iceberg/StaticDataTask.java index f25ebd49c9d8..1a396f0bfc7e 100644 --- a/core/src/main/java/org/apache/iceberg/StaticDataTask.java +++ b/core/src/main/java/org/apache/iceberg/StaticDataTask.java @@ -127,7 +127,11 @@ DataFile metadataFile() { return metadataFile; } - /** @return the table rows before projection */ + /** + * Returns the table rows before projection. + * + * @return the table rows before projection + */ StructLike[] tableRows() { return rows; } diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/KeyAssignment.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/KeyAssignment.java index a164d83ac3b0..781bcc646023 100644 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/KeyAssignment.java +++ b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/KeyAssignment.java @@ -98,7 +98,11 @@ long[] subtaskWeightsExcludingCloseCost() { return subtaskWeightsExcludingCloseCost; } - /** @return subtask id */ + /** + * Select a subtask for the key. + * + * @return subtask id + */ int select() { if (assignedSubtasks.size() == 1) { // only choice. no need to run random number generator. diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapAssignment.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapAssignment.java index 0abb030c2279..9d8167460a1b 100644 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapAssignment.java +++ b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapAssignment.java @@ -93,6 +93,8 @@ Map keyAssignments() { } /** + * Returns assignment summary for every subtask. + * * @return assignment summary for every subtask. Key is subtaskId. 
Value pair is (weight assigned * to the subtask, number of keys assigned to the subtask) */ diff --git a/flink/v1.18/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/KeyAssignment.java b/flink/v1.18/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/KeyAssignment.java index a164d83ac3b0..781bcc646023 100644 --- a/flink/v1.18/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/KeyAssignment.java +++ b/flink/v1.18/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/KeyAssignment.java @@ -98,7 +98,11 @@ long[] subtaskWeightsExcludingCloseCost() { return subtaskWeightsExcludingCloseCost; } - /** @return subtask id */ + /** + * Select a subtask for the key. + * + * @return subtask id + */ int select() { if (assignedSubtasks.size() == 1) { // only choice. no need to run random number generator. diff --git a/flink/v1.18/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapAssignment.java b/flink/v1.18/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapAssignment.java index 0abb030c2279..9d8167460a1b 100644 --- a/flink/v1.18/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapAssignment.java +++ b/flink/v1.18/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapAssignment.java @@ -93,6 +93,8 @@ Map keyAssignments() { } /** + * Returns assignment summary for every subtask. + * * @return assignment summary for every subtask. Key is subtaskId. Value pair is (weight assigned * to the subtask, number of keys assigned to the subtask) */ diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/KeyAssignment.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/KeyAssignment.java index a164d83ac3b0..781bcc646023 100644 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/KeyAssignment.java +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/KeyAssignment.java @@ -98,7 +98,11 @@ long[] subtaskWeightsExcludingCloseCost() { return subtaskWeightsExcludingCloseCost; } - /** @return subtask id */ + /** + * Select a subtask for the key. + * + * @return subtask id + */ int select() { if (assignedSubtasks.size() == 1) { // only choice. no need to run random number generator. diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapAssignment.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapAssignment.java index 0abb030c2279..9d8167460a1b 100644 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapAssignment.java +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapAssignment.java @@ -93,6 +93,8 @@ Map keyAssignments() { } /** + * Returns assignment summary for every subtask. + * * @return assignment summary for every subtask. Key is subtaskId. 
Value pair is (weight assigned * to the subtask, number of keys assigned to the subtask) */ From dc7ad7190989d50f3288ea02eb26d527c9f629c6 Mon Sep 17 00:00:00 2001 From: Steven Zhen Wu Date: Fri, 2 Aug 2024 08:39:10 -0700 Subject: [PATCH 19/55] Flink: refactor sink tests to reduce the number of combinations with parameterized tests (#10777) --- .../apache/iceberg/flink/CatalogTestBase.java | 22 -- .../org/apache/iceberg/flink/SqlBase.java | 110 +++++++ .../org/apache/iceberg/flink/TestBase.java | 4 +- .../iceberg/flink/TestFlinkTableSink.java | 114 -------- .../flink/TestFlinkTableSinkExtended.java | 244 ++++++++++++++++ .../iceberg/flink/TestIcebergConnector.java | 4 - .../flink/sink/TestFlinkIcebergSink.java | 270 +----------------- .../flink/sink/TestFlinkIcebergSinkBase.java | 51 +++- .../TestFlinkIcebergSinkDistributionMode.java | 180 ++++++++++++ .../sink/TestFlinkIcebergSinkExtended.java | 208 ++++++++++++++ 10 files changed, 798 insertions(+), 409 deletions(-) create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/SqlBase.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSinkExtended.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkDistributionMode.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkExtended.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/CatalogTestBase.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/CatalogTestBase.java index 91ed3c4adea3..062ff68d5d85 100644 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/CatalogTestBase.java +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/CatalogTestBase.java @@ -118,26 +118,4 @@ protected String getFullQualifiedTableName(String tableName) { static String getURI(HiveConf conf) { return conf.get(HiveConf.ConfVars.METASTOREURIS.varname); } - - static String toWithClause(Map props) { - StringBuilder builder = new StringBuilder(); - builder.append("("); - int propCount = 0; - for (Map.Entry entry : props.entrySet()) { - if (propCount > 0) { - builder.append(","); - } - builder - .append("'") - .append(entry.getKey()) - .append("'") - .append("=") - .append("'") - .append(entry.getValue()) - .append("'"); - propCount++; - } - builder.append(")"); - return builder.toString(); - } } diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/SqlBase.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/SqlBase.java new file mode 100644 index 000000000000..9411ea4f7d71 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/SqlBase.java @@ -0,0 +1,110 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink; + +import static org.apache.iceberg.flink.FlinkCatalogFactory.DEFAULT_CATALOG_NAME; +import static org.assertj.core.api.Assertions.assertThat; + +import java.util.List; +import java.util.Map; +import org.apache.flink.table.api.TableEnvironment; +import org.apache.flink.table.api.TableResult; +import org.apache.flink.types.Row; +import org.apache.flink.util.CloseableIterator; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; + +public abstract class SqlBase { + protected abstract TableEnvironment getTableEnv(); + + protected static TableResult exec(TableEnvironment env, String query, Object... args) { + return env.executeSql(String.format(query, args)); + } + + protected TableResult exec(String query, Object... args) { + return exec(getTableEnv(), query, args); + } + + protected List sql(String query, Object... args) { + TableResult tableResult = exec(query, args); + try (CloseableIterator iter = tableResult.collect()) { + return Lists.newArrayList(iter); + } catch (Exception e) { + throw new RuntimeException("Failed to collect table result", e); + } + } + + protected void assertSameElements(Iterable expected, Iterable actual) { + assertThat(actual).isNotNull().containsExactlyInAnyOrderElementsOf(expected); + } + + protected void assertSameElements(String message, Iterable expected, Iterable actual) { + assertThat(actual).isNotNull().as(message).containsExactlyInAnyOrderElementsOf(expected); + } + + /** + * We can not drop currently used catalog after FLINK-29677, so we have make sure that we do not + * use the current catalog before dropping it. This method switches to the 'default_catalog' and + * drops the one requested. + * + * @param catalogName The catalog to drop + * @param ifExists If we should use the 'IF EXISTS' when dropping the catalog + */ + protected void dropCatalog(String catalogName, boolean ifExists) { + sql("USE CATALOG %s", DEFAULT_CATALOG_NAME); + sql("DROP CATALOG %s %s", ifExists ? "IF EXISTS" : "", catalogName); + } + + /** + * We can not drop currently used database after FLINK-33226, so we have make sure that we do not + * use the current database before dropping it. This method switches to the default database in + * the default catalog, and then it and drops the one requested. + * + * @param database The database to drop + * @param ifExists If we should use the 'IF EXISTS' when dropping the database + */ + protected void dropDatabase(String database, boolean ifExists) { + String currentCatalog = getTableEnv().getCurrentCatalog(); + sql("USE CATALOG %s", DEFAULT_CATALOG_NAME); + sql("USE %s", getTableEnv().listDatabases()[0]); + sql("USE CATALOG %s", currentCatalog); + sql("DROP DATABASE %s %s", ifExists ? 
"IF EXISTS" : "", database); + } + + protected static String toWithClause(Map props) { + StringBuilder builder = new StringBuilder(); + builder.append("("); + int propCount = 0; + for (Map.Entry entry : props.entrySet()) { + if (propCount > 0) { + builder.append(","); + } + builder + .append("'") + .append(entry.getKey()) + .append("'") + .append("=") + .append("'") + .append(entry.getValue()) + .append("'"); + propCount++; + } + builder.append(")"); + return builder.toString(); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestBase.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestBase.java index 633690044692..401960c3591b 100644 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestBase.java +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestBase.java @@ -27,7 +27,6 @@ import org.apache.flink.table.api.TableEnvironment; import org.apache.flink.table.api.TableResult; import org.apache.flink.test.junit5.MiniClusterExtension; -import org.apache.flink.test.util.TestBaseUtils; import org.apache.flink.types.Row; import org.apache.flink.util.CloseableIterator; import org.apache.hadoop.hive.conf.HiveConf; @@ -41,7 +40,7 @@ import org.junit.jupiter.api.extension.RegisterExtension; import org.junit.jupiter.api.io.TempDir; -public abstract class TestBase extends TestBaseUtils { +public abstract class TestBase extends SqlBase { @RegisterExtension public static MiniClusterExtension miniClusterExtension = @@ -72,6 +71,7 @@ public static void stopMetastore() throws Exception { TestBase.catalog = null; } + @Override protected TableEnvironment getTableEnv() { if (tEnv == null) { synchronized (this) { diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSink.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSink.java index a0341e6834d4..2978a92945a2 100644 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSink.java +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSink.java @@ -18,36 +18,21 @@ */ package org.apache.iceberg.flink; -import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assumptions.assumeThat; -import java.util.Collections; import java.util.List; -import java.util.Map; -import java.util.stream.Collectors; -import java.util.stream.IntStream; -import org.apache.flink.api.dag.Transformation; import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; import org.apache.flink.table.api.EnvironmentSettings; import org.apache.flink.table.api.Expressions; import org.apache.flink.table.api.TableEnvironment; import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; -import org.apache.flink.table.api.internal.TableEnvironmentImpl; -import org.apache.flink.table.operations.ModifyOperation; -import org.apache.flink.table.planner.delegation.PlannerBase; -import org.apache.flink.types.Row; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.DistributionMode; import org.apache.iceberg.FileFormat; import org.apache.iceberg.Parameter; import org.apache.iceberg.Parameters; import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; import org.apache.iceberg.catalog.Namespace; import org.apache.iceberg.catalog.TableIdentifier; import org.apache.iceberg.flink.source.BoundedTableFactory; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import 
org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; @@ -169,39 +154,6 @@ public void testOverwriteTable() throws Exception { icebergTable, Lists.newArrayList(SimpleDataUtil.createRecord(2, "b"))); } - @TestTemplate - public void testWriteParallelism() throws Exception { - List dataSet = - IntStream.range(1, 1000) - .mapToObj(i -> ImmutableList.of(Row.of(i, "aaa"), Row.of(i, "bbb"), Row.of(i, "ccc"))) - .flatMap(List::stream) - .collect(Collectors.toList()); - String dataId = BoundedTableFactory.registerDataSet(ImmutableList.of(dataSet)); - sql( - "CREATE TABLE %s(id INT NOT NULL, data STRING NOT NULL)" - + " WITH ('connector'='BoundedSource', 'data-id'='%s')", - SOURCE_TABLE, dataId); - - PlannerBase planner = (PlannerBase) ((TableEnvironmentImpl) getTableEnv()).getPlanner(); - String insertSQL = - String.format( - "INSERT INTO %s /*+ OPTIONS('write-parallelism'='1') */ SELECT * FROM %s", - TABLE_NAME, SOURCE_TABLE); - ModifyOperation operation = (ModifyOperation) planner.getParser().parse(insertSQL).get(0); - Transformation dummySink = planner.translate(Collections.singletonList(operation)).get(0); - Transformation committer = dummySink.getInputs().get(0); - Transformation writer = committer.getInputs().get(0); - - assertThat(writer.getParallelism()).as("Should have the expected 1 parallelism.").isEqualTo(1); - writer - .getInputs() - .forEach( - input -> - assertThat(input.getParallelism()) - .as("Should have the expected parallelism.") - .isEqualTo(isStreamingJob ? 2 : 4)); - } - @TestTemplate public void testReplacePartitions() throws Exception { assumeThat(isStreamingJob) @@ -289,70 +241,4 @@ public void testInsertIntoPartition() throws Exception { sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, tableName); } } - - @TestTemplate - public void testHashDistributeMode() throws Exception { - String tableName = "test_hash_distribution_mode"; - Map tableProps = - ImmutableMap.of( - "write.format.default", - format.name(), - TableProperties.WRITE_DISTRIBUTION_MODE, - DistributionMode.HASH.modeName()); - - // Initialize a BoundedSource table to precisely emit those rows in only one checkpoint. - List dataSet = - IntStream.range(1, 1000) - .mapToObj(i -> ImmutableList.of(Row.of(i, "aaa"), Row.of(i, "bbb"), Row.of(i, "ccc"))) - .flatMap(List::stream) - .collect(Collectors.toList()); - String dataId = BoundedTableFactory.registerDataSet(ImmutableList.of(dataSet)); - sql( - "CREATE TABLE %s(id INT NOT NULL, data STRING NOT NULL)" - + " WITH ('connector'='BoundedSource', 'data-id'='%s')", - SOURCE_TABLE, dataId); - - assertThat(sql("SELECT * FROM %s", SOURCE_TABLE)) - .as("Should have the expected rows in source table.") - .containsExactlyInAnyOrderElementsOf(dataSet); - - sql( - "CREATE TABLE %s(id INT, data VARCHAR) PARTITIONED BY (data) WITH %s", - tableName, toWithClause(tableProps)); - - try { - // Insert data set. - sql("INSERT INTO %s SELECT * FROM %s", tableName, SOURCE_TABLE); - - assertThat(sql("SELECT * FROM %s", tableName)) - .as("Should have the expected rows in sink table.") - .containsExactlyInAnyOrderElementsOf(dataSet); - - // Sometimes we will have more than one checkpoint if we pass the auto checkpoint interval, - // thus producing multiple snapshots. Here we assert that each snapshot has only 1 file per - // partition. 
- Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, tableName)); - Map> snapshotToDataFiles = SimpleDataUtil.snapshotToDataFiles(table); - for (List dataFiles : snapshotToDataFiles.values()) { - if (dataFiles.isEmpty()) { - continue; - } - - assertThat( - SimpleDataUtil.matchingPartitions( - dataFiles, table.spec(), ImmutableMap.of("data", "aaa"))) - .hasSize(1); - assertThat( - SimpleDataUtil.matchingPartitions( - dataFiles, table.spec(), ImmutableMap.of("data", "bbb"))) - .hasSize(1); - assertThat( - SimpleDataUtil.matchingPartitions( - dataFiles, table.spec(), ImmutableMap.of("data", "ccc"))) - .hasSize(1); - } - } finally { - sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, tableName); - } - } } diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSinkExtended.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSinkExtended.java new file mode 100644 index 000000000000..482cfd110bde --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSinkExtended.java @@ -0,0 +1,244 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink; + +import static org.apache.iceberg.flink.FlinkCatalogFactory.ICEBERG_CATALOG_TYPE_HADOOP; +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.File; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; +import java.util.stream.IntStream; +import org.apache.flink.api.dag.Transformation; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.EnvironmentSettings; +import org.apache.flink.table.api.TableEnvironment; +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; +import org.apache.flink.table.api.internal.TableEnvironmentImpl; +import org.apache.flink.table.operations.ModifyOperation; +import org.apache.flink.table.planner.delegation.PlannerBase; +import org.apache.flink.test.junit5.MiniClusterExtension; +import org.apache.flink.types.Row; +import org.apache.hadoop.conf.Configuration; +import org.apache.iceberg.CatalogProperties; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.DistributionMode; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Parameter; +import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.catalog.Namespace; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.flink.source.BoundedTableFactory; +import org.apache.iceberg.hadoop.HadoopCatalog; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.extension.ExtendWith; +import org.junit.jupiter.api.extension.RegisterExtension; +import org.junit.jupiter.api.io.TempDir; + +/** + * This class tests the more extended features of Flink sink. Extract them separately since it is + * unnecessary to test all the parameters combinations in {@link TestFlinkTableSink}, like catalog + * types, namespaces, file format, streaming/batch. Those combinations explode exponentially. Each + * test method in {@link TestFlinkTableSink} runs 21 combinations, which are expensive and slow. + */ +@ExtendWith(ParameterizedTestExtension.class) +public class TestFlinkTableSinkExtended extends SqlBase { + protected static final String CATALOG = "testhadoop"; + protected static final String DATABASE = "db"; + protected static final String TABLE = "tbl"; + + @RegisterExtension + public static MiniClusterExtension miniClusterResource = + MiniFlinkClusterExtension.createWithClassloaderCheckDisabled(); + + private static final String SOURCE_TABLE = "default_catalog.default_database.bounded_source"; + private static final String FLINK_DATABASE = CATALOG + "." 
+ DATABASE; + private static final Namespace ICEBERG_NAMESPACE = Namespace.of(new String[] {DATABASE}); + + @TempDir protected File warehouseRoot; + + protected HadoopCatalog catalog = null; + + private TableEnvironment tEnv; + + @Parameter protected boolean isStreamingJob; + + @Parameters(name = "isStreamingJob={0}") + protected static List parameters() { + return Arrays.asList(new Boolean[] {true}, new Boolean[] {false}); + } + + protected synchronized TableEnvironment getTableEnv() { + if (tEnv == null) { + EnvironmentSettings.Builder settingsBuilder = EnvironmentSettings.newInstance(); + if (isStreamingJob) { + settingsBuilder.inStreamingMode(); + StreamExecutionEnvironment env = + StreamExecutionEnvironment.getExecutionEnvironment( + MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG); + env.enableCheckpointing(400); + env.setMaxParallelism(2); + env.setParallelism(2); + tEnv = StreamTableEnvironment.create(env, settingsBuilder.build()); + } else { + settingsBuilder.inBatchMode(); + tEnv = TableEnvironment.create(settingsBuilder.build()); + } + } + return tEnv; + } + + @BeforeEach + public void before() { + String warehouseLocation = "file:" + warehouseRoot.getPath(); + this.catalog = new HadoopCatalog(new Configuration(), warehouseLocation); + Map config = Maps.newHashMap(); + config.put("type", "iceberg"); + config.put(FlinkCatalogFactory.ICEBERG_CATALOG_TYPE, ICEBERG_CATALOG_TYPE_HADOOP); + config.put(CatalogProperties.WAREHOUSE_LOCATION, warehouseLocation); + sql("CREATE CATALOG %s WITH %s", CATALOG, toWithClause(config)); + + sql("CREATE DATABASE %s", FLINK_DATABASE); + sql("USE CATALOG %s", CATALOG); + sql("USE %s", DATABASE); + sql( + "CREATE TABLE %s (id int, data varchar) with ('write.format.default'='%s')", + TABLE, FileFormat.PARQUET.name()); + } + + @AfterEach + public void clean() throws Exception { + sql("DROP TABLE IF EXISTS %s.%s", FLINK_DATABASE, TABLE); + dropDatabase(FLINK_DATABASE, true); + BoundedTableFactory.clearDataSets(); + + dropCatalog(CATALOG, true); + catalog.close(); + } + + @TestTemplate + public void testWriteParallelism() { + List dataSet = + IntStream.range(1, 1000) + .mapToObj(i -> ImmutableList.of(Row.of(i, "aaa"), Row.of(i, "bbb"), Row.of(i, "ccc"))) + .flatMap(List::stream) + .collect(Collectors.toList()); + String dataId = BoundedTableFactory.registerDataSet(ImmutableList.of(dataSet)); + sql( + "CREATE TABLE %s(id INT NOT NULL, data STRING NOT NULL)" + + " WITH ('connector'='BoundedSource', 'data-id'='%s')", + SOURCE_TABLE, dataId); + + PlannerBase planner = (PlannerBase) ((TableEnvironmentImpl) getTableEnv()).getPlanner(); + String insertSQL = + String.format( + "INSERT INTO %s /*+ OPTIONS('write-parallelism'='1') */ SELECT * FROM %s", + TABLE, SOURCE_TABLE); + ModifyOperation operation = (ModifyOperation) planner.getParser().parse(insertSQL).get(0); + Transformation dummySink = planner.translate(Collections.singletonList(operation)).get(0); + Transformation committer = dummySink.getInputs().get(0); + Transformation writer = committer.getInputs().get(0); + + assertThat(writer.getParallelism()).as("Should have the expected 1 parallelism.").isEqualTo(1); + writer + .getInputs() + .forEach( + input -> + assertThat(input.getParallelism()) + .as("Should have the expected parallelism.") + .isEqualTo(isStreamingJob ? 2 : 4)); + } + + @TestTemplate + public void testHashDistributeMode() throws Exception { + // Initialize a BoundedSource table to precisely emit those rows in only one checkpoint. 
+ List dataSet = + IntStream.range(1, 1000) + .mapToObj(i -> ImmutableList.of(Row.of(i, "aaa"), Row.of(i, "bbb"), Row.of(i, "ccc"))) + .flatMap(List::stream) + .collect(Collectors.toList()); + String dataId = BoundedTableFactory.registerDataSet(ImmutableList.of(dataSet)); + sql( + "CREATE TABLE %s(id INT NOT NULL, data STRING NOT NULL)" + + " WITH ('connector'='BoundedSource', 'data-id'='%s')", + SOURCE_TABLE, dataId); + + assertThat(sql("SELECT * FROM %s", SOURCE_TABLE)) + .as("Should have the expected rows in source table.") + .containsExactlyInAnyOrderElementsOf(dataSet); + + Map tableProps = + ImmutableMap.of( + "write.format.default", + FileFormat.PARQUET.name(), + TableProperties.WRITE_DISTRIBUTION_MODE, + DistributionMode.HASH.modeName()); + + String tableName = "test_hash_distribution_mode"; + sql( + "CREATE TABLE %s(id INT, data VARCHAR) PARTITIONED BY (data) WITH %s", + tableName, toWithClause(tableProps)); + + try { + // Insert data set. + sql("INSERT INTO %s SELECT * FROM %s", tableName, SOURCE_TABLE); + + assertThat(sql("SELECT * FROM %s", tableName)) + .as("Should have the expected rows in sink table.") + .containsExactlyInAnyOrderElementsOf(dataSet); + + // Sometimes we will have more than one checkpoint if we pass the auto checkpoint interval, + // thus producing multiple snapshots. Here we assert that each snapshot has only 1 file per + // partition. + Table table = catalog.loadTable(TableIdentifier.of(ICEBERG_NAMESPACE, tableName)); + Map> snapshotToDataFiles = SimpleDataUtil.snapshotToDataFiles(table); + for (List dataFiles : snapshotToDataFiles.values()) { + if (dataFiles.isEmpty()) { + continue; + } + + assertThat( + SimpleDataUtil.matchingPartitions( + dataFiles, table.spec(), ImmutableMap.of("data", "aaa"))) + .hasSize(1); + assertThat( + SimpleDataUtil.matchingPartitions( + dataFiles, table.spec(), ImmutableMap.of("data", "bbb"))) + .hasSize(1); + assertThat( + SimpleDataUtil.matchingPartitions( + dataFiles, table.spec(), ImmutableMap.of("data", "ccc"))) + .hasSize(1); + } + } finally { + sql("DROP TABLE IF EXISTS %s.%s", FLINK_DATABASE, tableName); + } + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestIcebergConnector.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestIcebergConnector.java index fdb0e0cf19df..47f5485df879 100644 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestIcebergConnector.java +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestIcebergConnector.java @@ -319,10 +319,6 @@ private String databaseName() { return properties.getOrDefault("catalog-database", "default_database"); } - private String toWithClause(Map props) { - return CatalogTestBase.toWithClause(props); - } - private String createWarehouse() { try { return String.format( diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSink.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSink.java index 61ab087f2ca3..b778037c559c 100644 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSink.java +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSink.java @@ -18,20 +18,11 @@ */ package org.apache.iceberg.flink.sink; -import static org.apache.iceberg.flink.TestFixtures.DATABASE; -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; - import java.io.IOException; -import java.util.Collections; import 
java.util.List; -import java.util.Map; -import org.apache.flink.configuration.Configuration; import org.apache.flink.streaming.api.datastream.DataStream; import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.table.api.TableSchema; import org.apache.flink.table.data.RowData; -import org.apache.flink.test.junit5.MiniClusterExtension; import org.apache.flink.types.Row; import org.apache.iceberg.DistributionMode; import org.apache.iceberg.FileFormat; @@ -39,37 +30,19 @@ import org.apache.iceberg.ParameterizedTestExtension; import org.apache.iceberg.Parameters; import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Table; import org.apache.iceberg.TableProperties; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.flink.FlinkWriteOptions; -import org.apache.iceberg.flink.HadoopCatalogExtension; import org.apache.iceberg.flink.MiniFlinkClusterExtension; import org.apache.iceberg.flink.SimpleDataUtil; -import org.apache.iceberg.flink.TableLoader; import org.apache.iceberg.flink.TestFixtures; import org.apache.iceberg.flink.util.FlinkCompatibilityUtil; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.TestTemplate; import org.junit.jupiter.api.extension.ExtendWith; -import org.junit.jupiter.api.extension.RegisterExtension; @ExtendWith(ParameterizedTestExtension.class) public class TestFlinkIcebergSink extends TestFlinkIcebergSinkBase { - - @RegisterExtension - public static final MiniClusterExtension MINI_CLUSTER_EXTENSION = - MiniFlinkClusterExtension.createWithClassloaderCheckDisabled(); - - @RegisterExtension - private static final HadoopCatalogExtension CATALOG_EXTENSION = - new HadoopCatalogExtension(DATABASE, TestFixtures.TABLE); - - private TableLoader tableLoader; - @Parameter(index = 0) private FileFormat format; @@ -99,7 +72,7 @@ public static Object[][] parameters() { @BeforeEach public void before() throws IOException { - table = + this.table = CATALOG_EXTENSION .catalog() .createTable( @@ -110,14 +83,14 @@ public void before() throws IOException { : PartitionSpec.unpartitioned(), ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name())); - env = + this.env = StreamExecutionEnvironment.getExecutionEnvironment( MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG) .enableCheckpointing(100) .setParallelism(parallelism) .setMaxParallelism(parallelism); - tableLoader = CATALOG_EXTENSION.tableLoader(); + this.tableLoader = CATALOG_EXTENSION.tableLoader(); } @TestTemplate @@ -140,246 +113,13 @@ public void testWriteRowData() throws Exception { SimpleDataUtil.assertTableRows(table, convertToRowData(rows)); } - private void testWriteRow(TableSchema tableSchema, DistributionMode distributionMode) - throws Exception { - List rows = createRows(""); - DataStream dataStream = env.addSource(createBoundedSource(rows), ROW_TYPE_INFO); - - FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) - .table(table) - .tableLoader(tableLoader) - .tableSchema(tableSchema) - .writeParallelism(parallelism) - .distributionMode(distributionMode) - .append(); - - // Execute the program. 
- env.execute("Test Iceberg DataStream."); - - SimpleDataUtil.assertTableRows(table, convertToRowData(rows)); - } - - private int partitionFiles(String partition) throws IOException { - return SimpleDataUtil.partitionDataFiles(table, ImmutableMap.of("data", partition)).size(); - } - @TestTemplate public void testWriteRow() throws Exception { - testWriteRow(null, DistributionMode.NONE); + testWriteRow(parallelism, null, DistributionMode.NONE); } @TestTemplate public void testWriteRowWithTableSchema() throws Exception { - testWriteRow(SimpleDataUtil.FLINK_SCHEMA, DistributionMode.NONE); - } - - @TestTemplate - public void testJobNoneDistributeMode() throws Exception { - table - .updateProperties() - .set(TableProperties.WRITE_DISTRIBUTION_MODE, DistributionMode.HASH.modeName()) - .commit(); - - testWriteRow(null, DistributionMode.NONE); - - if (parallelism > 1) { - if (partitioned) { - int files = partitionFiles("aaa") + partitionFiles("bbb") + partitionFiles("ccc"); - assertThat(files).isGreaterThan(3); - } - } - } - - @TestTemplate - public void testJobHashDistributionMode() { - table - .updateProperties() - .set(TableProperties.WRITE_DISTRIBUTION_MODE, DistributionMode.HASH.modeName()) - .commit(); - - assertThatThrownBy(() -> testWriteRow(null, DistributionMode.RANGE)) - .isInstanceOf(IllegalArgumentException.class) - .hasMessage("Flink does not support 'range' write distribution mode now."); - } - - @TestTemplate - public void testJobNullDistributionMode() throws Exception { - table - .updateProperties() - .set(TableProperties.WRITE_DISTRIBUTION_MODE, DistributionMode.HASH.modeName()) - .commit(); - - testWriteRow(null, null); - - if (partitioned) { - assertThat(partitionFiles("aaa")).isEqualTo(1); - assertThat(partitionFiles("bbb")).isEqualTo(1); - assertThat(partitionFiles("ccc")).isEqualTo(1); - } - } - - @TestTemplate - public void testPartitionWriteMode() throws Exception { - testWriteRow(null, DistributionMode.HASH); - if (partitioned) { - assertThat(partitionFiles("aaa")).isEqualTo(1); - assertThat(partitionFiles("bbb")).isEqualTo(1); - assertThat(partitionFiles("ccc")).isEqualTo(1); - } - } - - @TestTemplate - public void testShuffleByPartitionWithSchema() throws Exception { - testWriteRow(SimpleDataUtil.FLINK_SCHEMA, DistributionMode.HASH); - if (partitioned) { - assertThat(partitionFiles("aaa")).isEqualTo(1); - assertThat(partitionFiles("bbb")).isEqualTo(1); - assertThat(partitionFiles("ccc")).isEqualTo(1); - } - } - - @TestTemplate - public void testTwoSinksInDisjointedDAG() throws Exception { - Map props = ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()); - - Table leftTable = - CATALOG_EXTENSION - .catalog() - .createTable( - TableIdentifier.of("left"), - SimpleDataUtil.SCHEMA, - partitioned - ? PartitionSpec.builderFor(SimpleDataUtil.SCHEMA).identity("data").build() - : PartitionSpec.unpartitioned(), - props); - TableLoader leftTableLoader = - TableLoader.fromCatalog(CATALOG_EXTENSION.catalogLoader(), TableIdentifier.of("left")); - - Table rightTable = - CATALOG_EXTENSION - .catalog() - .createTable( - TableIdentifier.of("right"), - SimpleDataUtil.SCHEMA, - partitioned - ? 
PartitionSpec.builderFor(SimpleDataUtil.SCHEMA).identity("data").build() - : PartitionSpec.unpartitioned(), - props); - TableLoader rightTableLoader = - TableLoader.fromCatalog(CATALOG_EXTENSION.catalogLoader(), TableIdentifier.of("right")); - - env = - StreamExecutionEnvironment.getExecutionEnvironment( - MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG) - .enableCheckpointing(100) - .setParallelism(parallelism) - .setMaxParallelism(parallelism); - env.getConfig().disableAutoGeneratedUIDs(); - - List leftRows = createRows("left-"); - DataStream leftStream = - env.fromCollection(leftRows, ROW_TYPE_INFO) - .name("leftCustomSource") - .uid("leftCustomSource"); - FlinkSink.forRow(leftStream, SimpleDataUtil.FLINK_SCHEMA) - .table(leftTable) - .tableLoader(leftTableLoader) - .tableSchema(SimpleDataUtil.FLINK_SCHEMA) - .distributionMode(DistributionMode.NONE) - .uidPrefix("leftIcebergSink") - .append(); - - List rightRows = createRows("right-"); - DataStream rightStream = - env.fromCollection(rightRows, ROW_TYPE_INFO) - .name("rightCustomSource") - .uid("rightCustomSource"); - FlinkSink.forRow(rightStream, SimpleDataUtil.FLINK_SCHEMA) - .table(rightTable) - .tableLoader(rightTableLoader) - .tableSchema(SimpleDataUtil.FLINK_SCHEMA) - .writeParallelism(parallelism) - .distributionMode(DistributionMode.HASH) - .uidPrefix("rightIcebergSink") - .setSnapshotProperty("flink.test", TestFlinkIcebergSink.class.getName()) - .setSnapshotProperties(Collections.singletonMap("direction", "rightTable")) - .append(); - - // Execute the program. - env.execute("Test Iceberg DataStream."); - - SimpleDataUtil.assertTableRows(leftTable, convertToRowData(leftRows)); - SimpleDataUtil.assertTableRows(rightTable, convertToRowData(rightRows)); - - leftTable.refresh(); - assertThat(leftTable.currentSnapshot().summary()).doesNotContainKeys("flink.test", "direction"); - rightTable.refresh(); - assertThat(rightTable.currentSnapshot().summary()) - .containsEntry("flink.test", TestFlinkIcebergSink.class.getName()) - .containsEntry("direction", "rightTable"); - } - - @TestTemplate - public void testOverrideWriteConfigWithUnknownDistributionMode() { - Map newProps = Maps.newHashMap(); - newProps.put(FlinkWriteOptions.DISTRIBUTION_MODE.key(), "UNRECOGNIZED"); - - List rows = createRows(""); - DataStream dataStream = env.addSource(createBoundedSource(rows), ROW_TYPE_INFO); - - FlinkSink.Builder builder = - FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) - .table(table) - .tableLoader(tableLoader) - .writeParallelism(parallelism) - .setAll(newProps); - - assertThatThrownBy(builder::append) - .isInstanceOf(IllegalArgumentException.class) - .hasMessage("Invalid distribution mode: UNRECOGNIZED"); - } - - @TestTemplate - public void testOverrideWriteConfigWithUnknownFileFormat() { - Map newProps = Maps.newHashMap(); - newProps.put(FlinkWriteOptions.WRITE_FORMAT.key(), "UNRECOGNIZED"); - - List rows = createRows(""); - DataStream dataStream = env.addSource(createBoundedSource(rows), ROW_TYPE_INFO); - - FlinkSink.Builder builder = - FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) - .table(table) - .tableLoader(tableLoader) - .writeParallelism(parallelism) - .setAll(newProps); - - assertThatThrownBy(builder::append) - .isInstanceOf(IllegalArgumentException.class) - .hasMessage("Invalid file format: UNRECOGNIZED"); - } - - @TestTemplate - public void testWriteRowWithTableRefreshInterval() throws Exception { - List rows = Lists.newArrayList(Row.of(1, "hello"), Row.of(2, "world"), Row.of(3, "foo")); - DataStream 
dataStream = - env.addSource(createBoundedSource(rows), ROW_TYPE_INFO) - .map(CONVERTER::toInternal, FlinkCompatibilityUtil.toTypeInfo(SimpleDataUtil.ROW_TYPE)); - - Configuration flinkConf = new Configuration(); - flinkConf.setString(FlinkWriteOptions.TABLE_REFRESH_INTERVAL.key(), "100ms"); - - FlinkSink.forRowData(dataStream) - .table(table) - .tableLoader(tableLoader) - .flinkConf(flinkConf) - .writeParallelism(parallelism) - .append(); - - // Execute the program. - env.execute("Test Iceberg DataStream"); - - // Assert the iceberg table's records. - SimpleDataUtil.assertTableRows(table, convertToRowData(rows)); + testWriteRow(parallelism, SimpleDataUtil.FLINK_SCHEMA, DistributionMode.NONE); } } diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkBase.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkBase.java index b38aa6b50ce6..9ce36cc1e8d0 100644 --- a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkBase.java +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkBase.java @@ -18,29 +18,52 @@ */ package org.apache.iceberg.flink.sink; +import static org.apache.iceberg.flink.TestFixtures.DATABASE; + +import java.io.IOException; import java.util.List; import java.util.stream.Collectors; import org.apache.flink.api.common.typeinfo.TypeInformation; import org.apache.flink.api.java.typeutils.RowTypeInfo; +import org.apache.flink.streaming.api.datastream.DataStream; import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.TableSchema; import org.apache.flink.table.data.RowData; import org.apache.flink.table.data.util.DataFormatConverters; +import org.apache.flink.test.junit5.MiniClusterExtension; import org.apache.flink.types.Row; +import org.apache.iceberg.DistributionMode; import org.apache.iceberg.Table; +import org.apache.iceberg.flink.HadoopCatalogExtension; +import org.apache.iceberg.flink.MiniFlinkClusterExtension; import org.apache.iceberg.flink.SimpleDataUtil; +import org.apache.iceberg.flink.TableLoader; +import org.apache.iceberg.flink.TestFixtures; import org.apache.iceberg.flink.source.BoundedTestSource; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.junit.jupiter.api.extension.RegisterExtension; public class TestFlinkIcebergSinkBase { - protected Table table; - protected StreamExecutionEnvironment env; + @RegisterExtension + public static MiniClusterExtension miniClusterResource = + MiniFlinkClusterExtension.createWithClassloaderCheckDisabled(); + + @RegisterExtension + protected static final HadoopCatalogExtension CATALOG_EXTENSION = + new HadoopCatalogExtension(DATABASE, TestFixtures.TABLE); + protected static final TypeInformation ROW_TYPE_INFO = new RowTypeInfo(SimpleDataUtil.FLINK_SCHEMA.getFieldTypes()); protected static final DataFormatConverters.RowConverter CONVERTER = new DataFormatConverters.RowConverter(SimpleDataUtil.FLINK_SCHEMA.getFieldDataTypes()); + protected TableLoader tableLoader; + protected Table table; + protected StreamExecutionEnvironment env; + protected BoundedTestSource createBoundedSource(List rows) { return new BoundedTestSource<>(rows.toArray(new Row[0])); } @@ -61,4 +84,28 @@ protected List createRows(String prefix) { protected List convertToRowData(List rows) { return 
rows.stream().map(CONVERTER::toInternal).collect(Collectors.toList()); } + + protected void testWriteRow( + int writerParallelism, TableSchema tableSchema, DistributionMode distributionMode) + throws Exception { + List rows = createRows(""); + DataStream dataStream = env.addSource(createBoundedSource(rows), ROW_TYPE_INFO); + + FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) + .table(table) + .tableLoader(tableLoader) + .tableSchema(tableSchema) + .writeParallelism(writerParallelism) + .distributionMode(distributionMode) + .append(); + + // Execute the program. + env.execute("Test Iceberg DataStream."); + + SimpleDataUtil.assertTableRows(table, convertToRowData(rows)); + } + + protected int partitionFiles(String partition) throws IOException { + return SimpleDataUtil.partitionDataFiles(table, ImmutableMap.of("data", partition)).size(); + } } diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkDistributionMode.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkDistributionMode.java new file mode 100644 index 000000000000..75e397d3f203 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkDistributionMode.java @@ -0,0 +1,180 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.io.IOException; +import java.util.List; +import java.util.Map; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.test.junit5.MiniClusterExtension; +import org.apache.flink.types.Row; +import org.apache.iceberg.DistributionMode; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Parameter; +import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.flink.FlinkWriteOptions; +import org.apache.iceberg.flink.HadoopCatalogExtension; +import org.apache.iceberg.flink.MiniFlinkClusterExtension; +import org.apache.iceberg.flink.SimpleDataUtil; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.extension.ExtendWith; +import org.junit.jupiter.api.extension.RegisterExtension; + +/** + * This tests the distribution mode of Flink sink. 
Extract them separately since it is unnecessary + * to test different file formats (Avro, Orc, Parquet) like in {@link TestFlinkIcebergSink}. + * Removing the file format dimension reduces the number of combinations from 12 to 4, which helps + * reduce test run time. + */ +@ExtendWith(ParameterizedTestExtension.class) +public class TestFlinkIcebergSinkDistributionMode extends TestFlinkIcebergSinkBase { + + @RegisterExtension + public static MiniClusterExtension miniClusterResource = + MiniFlinkClusterExtension.createWithClassloaderCheckDisabled(); + + @RegisterExtension + private static final HadoopCatalogExtension CATALOG_EXTENSION = + new HadoopCatalogExtension(TestFixtures.DATABASE, TestFixtures.TABLE); + + private final FileFormat format = FileFormat.PARQUET; + + @Parameter(index = 0) + private int parallelism; + + @Parameter(index = 1) + private boolean partitioned; + + @Parameters(name = "parallelism = {0}, partitioned = {1}") + public static Object[][] parameters() { + return new Object[][] { + {1, true}, + {1, false}, + {2, true}, + {2, false} + }; + } + + @BeforeEach + public void before() throws IOException { + this.table = + CATALOG_EXTENSION + .catalog() + .createTable( + TestFixtures.TABLE_IDENTIFIER, + SimpleDataUtil.SCHEMA, + partitioned + ? PartitionSpec.builderFor(SimpleDataUtil.SCHEMA).identity("data").build() + : PartitionSpec.unpartitioned(), + ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name())); + + this.env = + StreamExecutionEnvironment.getExecutionEnvironment( + MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG) + .enableCheckpointing(100) + .setParallelism(parallelism) + .setMaxParallelism(parallelism); + + this.tableLoader = CATALOG_EXTENSION.tableLoader(); + } + + @TestTemplate + public void testShuffleByPartitionWithSchema() throws Exception { + testWriteRow(parallelism, SimpleDataUtil.FLINK_SCHEMA, DistributionMode.HASH); + if (partitioned) { + assertThat(partitionFiles("aaa")).isEqualTo(1); + assertThat(partitionFiles("bbb")).isEqualTo(1); + assertThat(partitionFiles("ccc")).isEqualTo(1); + } + } + + @TestTemplate + public void testJobNoneDistributeMode() throws Exception { + table + .updateProperties() + .set(TableProperties.WRITE_DISTRIBUTION_MODE, DistributionMode.HASH.modeName()) + .commit(); + + testWriteRow(parallelism, null, DistributionMode.NONE); + + if (parallelism > 1) { + if (partitioned) { + int files = partitionFiles("aaa") + partitionFiles("bbb") + partitionFiles("ccc"); + assertThat(files).isGreaterThan(3); + } + } + } + + @TestTemplate + public void testJobNullDistributionMode() throws Exception { + table + .updateProperties() + .set(TableProperties.WRITE_DISTRIBUTION_MODE, DistributionMode.HASH.modeName()) + .commit(); + + testWriteRow(parallelism, null, null); + + if (partitioned) { + assertThat(partitionFiles("aaa")).isEqualTo(1); + assertThat(partitionFiles("bbb")).isEqualTo(1); + assertThat(partitionFiles("ccc")).isEqualTo(1); + } + } + + @TestTemplate + public void testPartitionWriteMode() throws Exception { + testWriteRow(parallelism, null, DistributionMode.HASH); + if (partitioned) { + assertThat(partitionFiles("aaa")).isEqualTo(1); + assertThat(partitionFiles("bbb")).isEqualTo(1); + assertThat(partitionFiles("ccc")).isEqualTo(1); + } + } + + @TestTemplate + public void testOverrideWriteConfigWithUnknownDistributionMode() { + Map newProps = Maps.newHashMap(); + newProps.put(FlinkWriteOptions.DISTRIBUTION_MODE.key(), "UNRECOGNIZED"); + + List rows = createRows(""); + DataStream dataStream = 
env.addSource(createBoundedSource(rows), ROW_TYPE_INFO); + + FlinkSink.Builder builder = + FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) + .table(table) + .tableLoader(tableLoader) + .writeParallelism(parallelism) + .setAll(newProps); + + assertThatThrownBy(builder::append) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Invalid distribution mode: UNRECOGNIZED"); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkExtended.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkExtended.java new file mode 100644 index 000000000000..36a59b20431c --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkExtended.java @@ -0,0 +1,208 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.io.IOException; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.data.RowData; +import org.apache.flink.types.Row; +import org.apache.iceberg.DistributionMode; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.flink.FlinkWriteOptions; +import org.apache.iceberg.flink.MiniFlinkClusterExtension; +import org.apache.iceberg.flink.SimpleDataUtil; +import org.apache.iceberg.flink.TableLoader; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.flink.util.FlinkCompatibilityUtil; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +/** + * This class tests the more extended features of Flink sink. Extract them separately since it is + * unnecessary to test all the parameters combinations in {@link TestFlinkIcebergSink}. Each test + * method in {@link TestFlinkIcebergSink} runs 12 combinations, which are expensive and slow. 
+ */ +public class TestFlinkIcebergSinkExtended extends TestFlinkIcebergSinkBase { + private final boolean partitioned = true; + private final int parallelism = 2; + private final FileFormat format = FileFormat.PARQUET; + + @BeforeEach + public void before() throws IOException { + this.table = + CATALOG_EXTENSION + .catalog() + .createTable( + TestFixtures.TABLE_IDENTIFIER, + SimpleDataUtil.SCHEMA, + partitioned + ? PartitionSpec.builderFor(SimpleDataUtil.SCHEMA).identity("data").build() + : PartitionSpec.unpartitioned(), + ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name())); + + this.env = + StreamExecutionEnvironment.getExecutionEnvironment( + MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG) + .enableCheckpointing(100) + .setParallelism(parallelism) + .setMaxParallelism(parallelism); + + this.tableLoader = CATALOG_EXTENSION.tableLoader(); + } + + @Test + public void testTwoSinksInDisjointedDAG() throws Exception { + Map props = ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()); + + Table leftTable = + CATALOG_EXTENSION + .catalog() + .createTable( + TableIdentifier.of("left"), + SimpleDataUtil.SCHEMA, + partitioned + ? PartitionSpec.builderFor(SimpleDataUtil.SCHEMA).identity("data").build() + : PartitionSpec.unpartitioned(), + props); + TableLoader leftTableLoader = + TableLoader.fromCatalog(CATALOG_EXTENSION.catalogLoader(), TableIdentifier.of("left")); + + Table rightTable = + CATALOG_EXTENSION + .catalog() + .createTable( + TableIdentifier.of("right"), + SimpleDataUtil.SCHEMA, + partitioned + ? PartitionSpec.builderFor(SimpleDataUtil.SCHEMA).identity("data").build() + : PartitionSpec.unpartitioned(), + props); + TableLoader rightTableLoader = + TableLoader.fromCatalog(CATALOG_EXTENSION.catalogLoader(), TableIdentifier.of("right")); + + env = + StreamExecutionEnvironment.getExecutionEnvironment( + MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG) + .enableCheckpointing(100) + .setParallelism(parallelism) + .setMaxParallelism(parallelism); + env.getConfig().disableAutoGeneratedUIDs(); + + List leftRows = createRows("left-"); + DataStream leftStream = + env.fromCollection(leftRows, ROW_TYPE_INFO) + .name("leftCustomSource") + .uid("leftCustomSource"); + FlinkSink.forRow(leftStream, SimpleDataUtil.FLINK_SCHEMA) + .table(leftTable) + .tableLoader(leftTableLoader) + .tableSchema(SimpleDataUtil.FLINK_SCHEMA) + .distributionMode(DistributionMode.NONE) + .uidPrefix("leftIcebergSink") + .append(); + + List rightRows = createRows("right-"); + DataStream rightStream = + env.fromCollection(rightRows, ROW_TYPE_INFO) + .name("rightCustomSource") + .uid("rightCustomSource"); + FlinkSink.forRow(rightStream, SimpleDataUtil.FLINK_SCHEMA) + .table(rightTable) + .tableLoader(rightTableLoader) + .tableSchema(SimpleDataUtil.FLINK_SCHEMA) + .writeParallelism(parallelism) + .distributionMode(DistributionMode.HASH) + .uidPrefix("rightIcebergSink") + .setSnapshotProperty("flink.test", TestFlinkIcebergSink.class.getName()) + .setSnapshotProperties(Collections.singletonMap("direction", "rightTable")) + .append(); + + // Execute the program. 
+ env.execute("Test Iceberg DataStream."); + + SimpleDataUtil.assertTableRows(leftTable, convertToRowData(leftRows)); + SimpleDataUtil.assertTableRows(rightTable, convertToRowData(rightRows)); + + leftTable.refresh(); + assertThat(leftTable.currentSnapshot().summary()).doesNotContainKeys("flink.test", "direction"); + rightTable.refresh(); + assertThat(rightTable.currentSnapshot().summary()) + .containsEntry("flink.test", TestFlinkIcebergSink.class.getName()) + .containsEntry("direction", "rightTable"); + } + + @Test + public void testOverrideWriteConfigWithUnknownFileFormat() { + Map newProps = Maps.newHashMap(); + newProps.put(FlinkWriteOptions.WRITE_FORMAT.key(), "UNRECOGNIZED"); + + List rows = createRows(""); + DataStream dataStream = env.addSource(createBoundedSource(rows), ROW_TYPE_INFO); + + FlinkSink.Builder builder = + FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) + .table(table) + .tableLoader(tableLoader) + .writeParallelism(parallelism) + .setAll(newProps); + + assertThatThrownBy(builder::append) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Invalid file format: UNRECOGNIZED"); + } + + @Test + public void testWriteRowWithTableRefreshInterval() throws Exception { + List rows = Lists.newArrayList(Row.of(1, "hello"), Row.of(2, "world"), Row.of(3, "foo")); + DataStream dataStream = + env.addSource(createBoundedSource(rows), ROW_TYPE_INFO) + .map(CONVERTER::toInternal, FlinkCompatibilityUtil.toTypeInfo(SimpleDataUtil.ROW_TYPE)); + + Configuration flinkConf = new Configuration(); + flinkConf.setString(FlinkWriteOptions.TABLE_REFRESH_INTERVAL.key(), "100ms"); + + FlinkSink.forRowData(dataStream) + .table(table) + .tableLoader(tableLoader) + .flinkConf(flinkConf) + .writeParallelism(parallelism) + .append(); + + // Execute the program. + env.execute("Test Iceberg DataStream"); + + // Assert the iceberg table's records. 
+ SimpleDataUtil.assertTableRows(table, convertToRowData(rows)); + } +} From af75440da8d6e7b509b3197251c965543017b015 Mon Sep 17 00:00:00 2001 From: Steven Zhen Wu Date: Fri, 2 Aug 2024 08:39:47 -0700 Subject: [PATCH 20/55] Flink: backport PR #10823 for range partitioner fixup (#10847) --- .../shuffle/AggregatedStatisticsTracker.java | 23 +++-- .../sink/shuffle/CompletedStatistics.java | 8 ++ .../shuffle/DataStatisticsCoordinator.java | 45 ++++++---- .../flink/sink/shuffle/RangePartitioner.java | 8 +- .../sink/shuffle/SketchRangePartitioner.java | 19 +--- .../flink/sink/shuffle/SketchUtil.java | 17 ++++ .../flink/sink/shuffle/SortKeyUtil.java | 59 +++++++++++++ .../sink/shuffle/TestRangePartitioner.java | 65 ++++++++++++++ .../shuffle/TestSketchRangePartitioner.java | 88 +++++++++++++++++++ .../flink/sink/shuffle/TestSketchUtil.java | 64 +++++++++++++- .../flink/sink/shuffle/TestSortKeyUtil.java | 73 +++++++++++++++ .../shuffle/AggregatedStatisticsTracker.java | 23 +++-- .../sink/shuffle/CompletedStatistics.java | 8 ++ .../shuffle/DataStatisticsCoordinator.java | 45 ++++++---- .../flink/sink/shuffle/RangePartitioner.java | 8 +- .../sink/shuffle/SketchRangePartitioner.java | 19 +--- .../flink/sink/shuffle/SketchUtil.java | 17 ++++ .../flink/sink/shuffle/SortKeyUtil.java | 59 +++++++++++++ .../sink/shuffle/TestRangePartitioner.java | 65 ++++++++++++++ .../shuffle/TestSketchRangePartitioner.java | 88 +++++++++++++++++++ .../flink/sink/shuffle/TestSketchUtil.java | 64 +++++++++++++- .../flink/sink/shuffle/TestSortKeyUtil.java | 73 +++++++++++++++ 22 files changed, 840 insertions(+), 98 deletions(-) create mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeyUtil.java create mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestRangePartitioner.java create mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchRangePartitioner.java create mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeyUtil.java create mode 100644 flink/v1.18/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeyUtil.java create mode 100644 flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestRangePartitioner.java create mode 100644 flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchRangePartitioner.java create mode 100644 flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeyUtil.java diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/AggregatedStatisticsTracker.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/AggregatedStatisticsTracker.java index 338523b7b074..5525f02c873e 100644 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/AggregatedStatisticsTracker.java +++ b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/AggregatedStatisticsTracker.java @@ -223,7 +223,9 @@ private void merge(DataStatistics taskStatistics) { convertCoordinatorToSketch(); } - sketchStatistics.update(taskSketch); + if (taskSketch.getNumSamples() > 0) { + sketchStatistics.update(taskSketch); + } } } @@ -242,13 +244,18 @@ private CompletedStatistics completedStatistics(long checkpointId) { return CompletedStatistics.fromKeyFrequency(checkpointId, mapStatistics); } else { ReservoirItemsSketch sketch = sketchStatistics.getResult(); - LOG.info( - "Completed sketch statistics aggregation: " - + "reservoir size = {}, 
number of items seen = {}, number of samples = {}", - sketch.getK(), - sketch.getN(), - sketch.getNumSamples()); - return CompletedStatistics.fromKeySamples(checkpointId, sketch.getSamples()); + if (sketch != null) { + LOG.info( + "Completed sketch statistics aggregation: " + + "reservoir size = {}, number of items seen = {}, number of samples = {}", + sketch.getK(), + sketch.getN(), + sketch.getNumSamples()); + return CompletedStatistics.fromKeySamples(checkpointId, sketch.getSamples()); + } else { + LOG.info("Empty sketch statistics."); + return CompletedStatistics.fromKeySamples(checkpointId, new SortKey[0]); + } } } } diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/CompletedStatistics.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/CompletedStatistics.java index c0e228965ddd..e4cba174f0f2 100644 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/CompletedStatistics.java +++ b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/CompletedStatistics.java @@ -100,4 +100,12 @@ Map keyFrequency() { SortKey[] keySamples() { return keySamples; } + + boolean isEmpty() { + if (type == StatisticsType.Sketch) { + return keySamples.length == 0; + } else { + return keyFrequency().isEmpty(); + } + } } diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsCoordinator.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsCoordinator.java index 3b21fbae315a..4bfde7204acf 100644 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsCoordinator.java +++ b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsCoordinator.java @@ -38,11 +38,11 @@ import org.apache.flink.util.function.ThrowingRunnable; import org.apache.iceberg.Schema; import org.apache.iceberg.SortOrder; -import org.apache.iceberg.SortOrderComparators; import org.apache.iceberg.StructLike; import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; import org.apache.iceberg.relocated.com.google.common.collect.Iterables; import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.types.Comparators; import org.jetbrains.annotations.NotNull; import org.jetbrains.annotations.Nullable; import org.slf4j.Logger; @@ -91,7 +91,7 @@ class DataStatisticsCoordinator implements OperatorCoordinator { this.context = context; this.schema = schema; this.sortOrder = sortOrder; - this.comparator = SortOrderComparators.forSchema(schema, sortOrder); + this.comparator = Comparators.forType(SortKeyUtil.sortKeySchema(schema, sortOrder).asStruct()); this.downstreamParallelism = downstreamParallelism; this.statisticsType = statisticsType; this.closeFileCostWeightPercentage = closeFileCostWeightPercentage; @@ -202,17 +202,23 @@ private void handleDataStatisticRequest(int subtask, StatisticsEvent event) { aggregatedStatisticsTracker.updateAndCheckCompletion(subtask, event); if (maybeCompletedStatistics != null) { - // completedStatistics contains the complete samples, which is needed to compute - // the range bounds in globalStatistics if downstreamParallelism changed. 
- this.completedStatistics = maybeCompletedStatistics; - // globalStatistics only contains assignment calculated based on Map or Sketch statistics - this.globalStatistics = - globalStatistics( - maybeCompletedStatistics, - downstreamParallelism, - comparator, - closeFileCostWeightPercentage); - sendGlobalStatisticsToSubtasks(globalStatistics); + if (maybeCompletedStatistics.isEmpty()) { + LOG.info( + "Skip aggregated statistics for checkpoint {} as it is empty.", event.checkpointId()); + } else { + LOG.info("Completed statistics aggregation for checkpoint {}", event.checkpointId()); + // completedStatistics contains the complete samples, which is needed to compute + // the range bounds in globalStatistics if downstreamParallelism changed. + this.completedStatistics = maybeCompletedStatistics; + // globalStatistics only contains assignment calculated based on Map or Sketch statistics + this.globalStatistics = + globalStatistics( + maybeCompletedStatistics, + downstreamParallelism, + comparator, + closeFileCostWeightPercentage); + sendGlobalStatisticsToSubtasks(globalStatistics); + } } } @@ -324,9 +330,14 @@ public void checkpointCoordinator(long checkpointId, CompletableFuture r "Snapshotting data statistics coordinator {} for checkpoint {}", operatorName, checkpointId); - resultFuture.complete( - StatisticsUtil.serializeCompletedStatistics( - completedStatistics, completedStatisticsSerializer)); + if (completedStatistics == null) { + // null checkpoint result is not allowed, hence supply an empty byte array + resultFuture.complete(new byte[0]); + } else { + resultFuture.complete( + StatisticsUtil.serializeCompletedStatistics( + completedStatistics, completedStatisticsSerializer)); + } }, String.format("taking checkpoint %d", checkpointId)); } @@ -338,7 +349,7 @@ public void notifyCheckpointComplete(long checkpointId) {} public void resetToCheckpoint(long checkpointId, byte[] checkpointData) { Preconditions.checkState( !started, "The coordinator %s can only be reset if it was not yet started", operatorName); - if (checkpointData == null) { + if (checkpointData == null || checkpointData.length == 0) { LOG.info( "Data statistic coordinator {} has nothing to restore from checkpoint {}", operatorName, diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/RangePartitioner.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/RangePartitioner.java index 83a9461233d2..6608b938f5a8 100644 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/RangePartitioner.java +++ b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/RangePartitioner.java @@ -23,12 +23,13 @@ import org.apache.flink.annotation.Internal; import org.apache.flink.api.common.functions.Partitioner; import org.apache.flink.table.data.RowData; +import org.apache.iceberg.DistributionMode; import org.apache.iceberg.Schema; import org.apache.iceberg.SortOrder; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -/** The wrapper class */ +/** This custom partitioner implements the {@link DistributionMode#RANGE} for Flink sink. */ @Internal public class RangePartitioner implements Partitioner { private static final Logger LOG = LoggerFactory.getLogger(RangePartitioner.class); @@ -94,9 +95,8 @@ static int adjustPartitionWithRescale( if (numPartitionsStatsCalculation <= numPartitions) { // no rescale or scale-up case. // new subtasks are ignored and not assigned any keys, which is sub-optimal and only - // transient. 
- // when rescale is detected, operator requests new statistics from coordinator upon - // initialization. + // transient. when rescale is detected, operator requests new statistics from + // coordinator upon initialization. return partition; } else { // scale-down case. diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchRangePartitioner.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchRangePartitioner.java index af78271ea5dc..dddb0d8722c0 100644 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchRangePartitioner.java +++ b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchRangePartitioner.java @@ -18,17 +18,16 @@ */ package org.apache.iceberg.flink.sink.shuffle; -import java.util.Arrays; import java.util.Comparator; import org.apache.flink.api.common.functions.Partitioner; import org.apache.flink.table.data.RowData; import org.apache.iceberg.Schema; import org.apache.iceberg.SortKey; import org.apache.iceberg.SortOrder; -import org.apache.iceberg.SortOrderComparators; import org.apache.iceberg.StructLike; import org.apache.iceberg.flink.FlinkSchemaUtil; import org.apache.iceberg.flink.RowDataWrapper; +import org.apache.iceberg.types.Comparators; class SketchRangePartitioner implements Partitioner { private final SortKey sortKey; @@ -38,7 +37,7 @@ class SketchRangePartitioner implements Partitioner { SketchRangePartitioner(Schema schema, SortOrder sortOrder, SortKey[] rangeBounds) { this.sortKey = new SortKey(schema, sortOrder); - this.comparator = SortOrderComparators.forSchema(schema, sortOrder); + this.comparator = Comparators.forType(SortKeyUtil.sortKeySchema(schema, sortOrder).asStruct()); this.rangeBounds = rangeBounds; this.rowDataWrapper = new RowDataWrapper(FlinkSchemaUtil.convert(schema), schema.asStruct()); } @@ -47,18 +46,6 @@ class SketchRangePartitioner implements Partitioner { public int partition(RowData row, int numPartitions) { // reuse the sortKey and rowDataWrapper sortKey.wrap(rowDataWrapper.wrap(row)); - int partition = Arrays.binarySearch(rangeBounds, sortKey, comparator); - - // binarySearch either returns the match location or -[insertion point]-1 - if (partition < 0) { - partition = -partition - 1; - } - - if (partition > rangeBounds.length) { - partition = rangeBounds.length; - } - - return RangePartitioner.adjustPartitionWithRescale( - partition, rangeBounds.length + 1, numPartitions); + return SketchUtil.partition(sortKey, numPartitions, rangeBounds, comparator); } } diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchUtil.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchUtil.java index a58310611e8d..871ef9ef1149 100644 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchUtil.java +++ b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchUtil.java @@ -139,4 +139,21 @@ static void convertMapToSketch( } }); } + + static int partition( + SortKey key, int numPartitions, SortKey[] rangeBounds, Comparator comparator) { + int partition = Arrays.binarySearch(rangeBounds, key, comparator); + + // binarySearch either returns the match location or -[insertion point]-1 + if (partition < 0) { + partition = -partition - 1; + } + + if (partition > rangeBounds.length) { + partition = rangeBounds.length; + } + + return RangePartitioner.adjustPartitionWithRescale( + partition, rangeBounds.length + 1, numPartitions); + } } diff 
--git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeyUtil.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeyUtil.java new file mode 100644 index 000000000000..1e5bdbbac3e4 --- /dev/null +++ b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeyUtil.java @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.shuffle; + +import java.util.List; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SortField; +import org.apache.iceberg.SortOrder; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.types.Type; +import org.apache.iceberg.types.Types; + +class SortKeyUtil { + private SortKeyUtil() {} + + /** Compute the result schema of {@code SortKey} transformation */ + static Schema sortKeySchema(Schema schema, SortOrder sortOrder) { + List sortFields = sortOrder.fields(); + int size = sortFields.size(); + List transformedFields = Lists.newArrayListWithCapacity(size); + for (int i = 0; i < size; ++i) { + int sourceFieldId = sortFields.get(i).sourceId(); + Types.NestedField sourceField = schema.findField(sourceFieldId); + Preconditions.checkArgument( + sourceField != null, "Cannot find source field: %s", sourceFieldId); + Type transformedType = sortFields.get(i).transform().getResultType(sourceField.type()); + // There could be multiple transformations on the same source column, like in the PartitionKey + // case. To resolve the collision, field id is set to transform index and field name is set to + // sourceFieldName_transformIndex + Types.NestedField transformedField = + Types.NestedField.of( + i, + sourceField.isOptional(), + sourceField.name() + '_' + i, + transformedType, + sourceField.doc()); + transformedFields.add(transformedField); + } + + return new Schema(transformedFields); + } +} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestRangePartitioner.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestRangePartitioner.java new file mode 100644 index 000000000000..0485fdb7fa04 --- /dev/null +++ b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestRangePartitioner.java @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.shuffle; + +import static org.apache.iceberg.flink.sink.shuffle.Fixtures.CHAR_KEYS; +import static org.apache.iceberg.flink.sink.shuffle.Fixtures.SCHEMA; +import static org.apache.iceberg.flink.sink.shuffle.Fixtures.SORT_ORDER; +import static org.assertj.core.api.Assertions.assertThat; + +import java.util.Set; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.StringData; +import org.apache.iceberg.SortKey; +import org.apache.iceberg.relocated.com.google.common.collect.Sets; +import org.junit.jupiter.api.Test; + +public class TestRangePartitioner { + private final int numPartitions = 4; + + @Test + public void testRoundRobinRecordsBeforeStatisticsAvailable() { + RangePartitioner partitioner = new RangePartitioner(SCHEMA, SORT_ORDER); + Set results = Sets.newHashSetWithExpectedSize(numPartitions); + for (int i = 0; i < numPartitions; ++i) { + results.add( + partitioner.partition( + StatisticsOrRecord.fromRecord(GenericRowData.of(StringData.fromString("a"), 1)), + numPartitions)); + } + + // round-robin. every partition should get an assignment + assertThat(results).containsExactlyInAnyOrder(0, 1, 2, 3); + } + + @Test + public void testRoundRobinStatisticsWrapper() { + RangePartitioner partitioner = new RangePartitioner(SCHEMA, SORT_ORDER); + Set results = Sets.newHashSetWithExpectedSize(numPartitions); + for (int i = 0; i < numPartitions; ++i) { + GlobalStatistics statistics = + GlobalStatistics.fromRangeBounds(1L, new SortKey[] {CHAR_KEYS.get("a")}); + results.add( + partitioner.partition(StatisticsOrRecord.fromStatistics(statistics), numPartitions)); + } + + // round-robin. every partition should get an assignment + assertThat(results).containsExactlyInAnyOrder(0, 1, 2, 3); + } +} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchRangePartitioner.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchRangePartitioner.java new file mode 100644 index 000000000000..378c6afff077 --- /dev/null +++ b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchRangePartitioner.java @@ -0,0 +1,88 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink.shuffle; + +import static org.assertj.core.api.Assertions.assertThat; + +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.SortKey; +import org.apache.iceberg.SortOrder; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.RowDataWrapper; +import org.apache.iceberg.flink.TestFixtures; +import org.junit.jupiter.api.Test; + +public class TestSketchRangePartitioner { + // sort on the long id field + private static final SortOrder SORT_ORDER = + SortOrder.builderFor(TestFixtures.SCHEMA).asc("id").build(); + private static final SortKey SORT_KEY = new SortKey(TestFixtures.SCHEMA, SORT_ORDER); + private static final RowType ROW_TYPE = FlinkSchemaUtil.convert(TestFixtures.SCHEMA); + private static final int NUM_PARTITIONS = 16; + private static final long RANGE_STEP = 1_000; + private static final long MAX_ID = RANGE_STEP * NUM_PARTITIONS; + private static final SortKey[] RANGE_BOUNDS = createRangeBounds(); + + /** + * To understand how range bounds are used in range partitioning, here is an example for human + * ages with 4 partitions: [15, 32, 60]. The 4 ranges would be + * + *
    + * <ul>
    + *   <li>age <= 15
    + *   <li>age > 15 && age <= 32
    + *   <li>age > 32 && age <= 60
    + *   <li>age > 60
    + * </ul>
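    + *
    + * <p>Worked through (this sentence is an editorial illustration of the example above, not part
    + * of the original patch): an age of 10 falls in the first range and maps to partition 0, 25 to
    + * partition 1, 45 to partition 2, and 70 to partition 3.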
+ */ + private static SortKey[] createRangeBounds() { + SortKey[] rangeBounds = new SortKey[NUM_PARTITIONS - 1]; + for (int i = 0; i < NUM_PARTITIONS - 1; ++i) { + RowData rowData = + GenericRowData.of( + StringData.fromString("data"), + RANGE_STEP * (i + 1), + StringData.fromString("2023-06-20")); + RowDataWrapper keyWrapper = new RowDataWrapper(ROW_TYPE, TestFixtures.SCHEMA.asStruct()); + keyWrapper.wrap(rowData); + SortKey sortKey = new SortKey(TestFixtures.SCHEMA, SORT_ORDER); + sortKey.wrap(keyWrapper); + rangeBounds[i] = sortKey; + } + + return rangeBounds; + } + + @Test + public void testRangePartitioningWithRangeBounds() { + SketchRangePartitioner partitioner = + new SketchRangePartitioner(TestFixtures.SCHEMA, SORT_ORDER, RANGE_BOUNDS); + GenericRowData row = + GenericRowData.of(StringData.fromString("data"), 0L, StringData.fromString("2023-06-20")); + for (long id = 0; id < MAX_ID; ++id) { + row.setField(1, id); + int partition = partitioner.partition(row, NUM_PARTITIONS); + assertThat(partition).isGreaterThanOrEqualTo(0).isLessThan(NUM_PARTITIONS); + int expectedPartition = id == 0L ? 0 : (int) ((id - 1) / RANGE_STEP); + assertThat(partition).isEqualTo(expectedPartition); + } + } +} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchUtil.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchUtil.java index 31dae5c76aeb..16202c075ea0 100644 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchUtil.java +++ b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchUtil.java @@ -19,10 +19,13 @@ package org.apache.iceberg.flink.sink.shuffle; import static org.apache.iceberg.flink.sink.shuffle.Fixtures.CHAR_KEYS; +import static org.apache.iceberg.flink.sink.shuffle.Fixtures.SORT_ORDER_COMPARTOR; import static org.assertj.core.api.Assertions.assertThat; import org.apache.iceberg.SortKey; import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; public class TestSketchUtil { @Test @@ -55,7 +58,7 @@ public void testRangeBoundsOneChannel() { assertThat( SketchUtil.rangeBounds( 1, - Fixtures.SORT_ORDER_COMPARTOR, + SORT_ORDER_COMPARTOR, new SortKey[] { CHAR_KEYS.get("a"), CHAR_KEYS.get("b"), @@ -72,7 +75,7 @@ public void testRangeBoundsDivisible() { assertThat( SketchUtil.rangeBounds( 3, - Fixtures.SORT_ORDER_COMPARTOR, + SORT_ORDER_COMPARTOR, new SortKey[] { CHAR_KEYS.get("a"), CHAR_KEYS.get("b"), @@ -90,7 +93,7 @@ public void testRangeBoundsNonDivisible() { assertThat( SketchUtil.rangeBounds( 4, - Fixtures.SORT_ORDER_COMPARTOR, + SORT_ORDER_COMPARTOR, new SortKey[] { CHAR_KEYS.get("a"), CHAR_KEYS.get("b"), @@ -113,7 +116,7 @@ public void testRangeBoundsSkipDuplicates() { assertThat( SketchUtil.rangeBounds( 4, - Fixtures.SORT_ORDER_COMPARTOR, + SORT_ORDER_COMPARTOR, new SortKey[] { CHAR_KEYS.get("a"), CHAR_KEYS.get("b"), @@ -130,4 +133,57 @@ public void testRangeBoundsSkipDuplicates() { // skipped duplicate c's .containsExactly(CHAR_KEYS.get("c"), CHAR_KEYS.get("g"), CHAR_KEYS.get("j")); } + + @ParameterizedTest + @ValueSource(ints = {4, 6}) + public void testPartitioningAndScaleUp(int numPartitions) { + // Range bounds are calculated based on 4 partitions + SortKey[] rangeBounds = + new SortKey[] {CHAR_KEYS.get("c"), CHAR_KEYS.get("j"), CHAR_KEYS.get("m")}; + + // <= c + assertPartition(0, CHAR_KEYS.get("a"), numPartitions, rangeBounds); + assertPartition(0, 
CHAR_KEYS.get("c"), numPartitions, rangeBounds); + // > c && <= j + assertPartition(1, CHAR_KEYS.get("d"), numPartitions, rangeBounds); + assertPartition(1, CHAR_KEYS.get("i"), numPartitions, rangeBounds); + assertPartition(1, CHAR_KEYS.get("j"), numPartitions, rangeBounds); + // > j && <= m + assertPartition(2, CHAR_KEYS.get("k"), numPartitions, rangeBounds); + assertPartition(2, CHAR_KEYS.get("l"), numPartitions, rangeBounds); + assertPartition(2, CHAR_KEYS.get("m"), numPartitions, rangeBounds); + // > m + assertPartition(3, CHAR_KEYS.get("n"), numPartitions, rangeBounds); + assertPartition(3, CHAR_KEYS.get("z"), numPartitions, rangeBounds); + } + + @Test + public void testPartitionScaleDown() { + // Range bounds are calculated based on 4 partitions + SortKey[] rangeBounds = + new SortKey[] {CHAR_KEYS.get("c"), CHAR_KEYS.get("j"), CHAR_KEYS.get("m")}; + int numPartitions = 3; + + // <= c + assertPartition(0, CHAR_KEYS.get("a"), numPartitions, rangeBounds); + assertPartition(0, CHAR_KEYS.get("c"), numPartitions, rangeBounds); + // > c && <= j + assertPartition(1, CHAR_KEYS.get("d"), numPartitions, rangeBounds); + assertPartition(1, CHAR_KEYS.get("i"), numPartitions, rangeBounds); + assertPartition(1, CHAR_KEYS.get("j"), numPartitions, rangeBounds); + // > j && <= m + assertPartition(2, CHAR_KEYS.get("k"), numPartitions, rangeBounds); + assertPartition(2, CHAR_KEYS.get("l"), numPartitions, rangeBounds); + assertPartition(2, CHAR_KEYS.get("m"), numPartitions, rangeBounds); + // > m + // reassigns out-of-range partitions via mod (% 3 in this case) + assertPartition(0, CHAR_KEYS.get("n"), numPartitions, rangeBounds); + assertPartition(0, CHAR_KEYS.get("z"), numPartitions, rangeBounds); + } + + private static void assertPartition( + int expectedPartition, SortKey key, int numPartitions, SortKey[] rangeBounds) { + assertThat(SketchUtil.partition(key, numPartitions, rangeBounds, SORT_ORDER_COMPARTOR)) + .isEqualTo(expectedPartition); + } } diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeyUtil.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeyUtil.java new file mode 100644 index 000000000000..1be7e27f2c01 --- /dev/null +++ b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeyUtil.java @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink.shuffle; + +import static org.assertj.core.api.Assertions.assertThat; + +import org.apache.iceberg.NullOrder; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SortDirection; +import org.apache.iceberg.SortOrder; +import org.apache.iceberg.expressions.Expressions; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.Test; + +public class TestSortKeyUtil { + @Test + public void testResultSchema() { + Schema schema = + new Schema( + Types.NestedField.required(1, "id", Types.StringType.get()), + Types.NestedField.required(2, "ratio", Types.DoubleType.get()), + Types.NestedField.optional( + 3, + "user", + Types.StructType.of( + Types.NestedField.required(11, "name", Types.StringType.get()), + Types.NestedField.required(12, "ts", Types.TimestampType.withoutZone()), + Types.NestedField.optional(13, "device_id", Types.UUIDType.get()), + Types.NestedField.optional( + 14, + "location", + Types.StructType.of( + Types.NestedField.required(101, "lat", Types.FloatType.get()), + Types.NestedField.required(102, "long", Types.FloatType.get()), + Types.NestedField.required(103, "blob", Types.BinaryType.get())))))); + + SortOrder sortOrder = + SortOrder.builderFor(schema) + .asc("ratio") + .sortBy(Expressions.hour("user.ts"), SortDirection.ASC, NullOrder.NULLS_FIRST) + .sortBy( + Expressions.bucket("user.device_id", 16), SortDirection.ASC, NullOrder.NULLS_FIRST) + .sortBy( + Expressions.truncate("user.location.blob", 16), + SortDirection.ASC, + NullOrder.NULLS_FIRST) + .build(); + + assertThat(SortKeyUtil.sortKeySchema(schema, sortOrder).asStruct()) + .isEqualTo( + Types.StructType.of( + Types.NestedField.required(0, "ratio_0", Types.DoubleType.get()), + Types.NestedField.required(1, "ts_1", Types.IntegerType.get()), + Types.NestedField.optional(2, "device_id_2", Types.IntegerType.get()), + Types.NestedField.required(3, "blob_3", Types.BinaryType.get()))); + } +} diff --git a/flink/v1.18/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/AggregatedStatisticsTracker.java b/flink/v1.18/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/AggregatedStatisticsTracker.java index 338523b7b074..5525f02c873e 100644 --- a/flink/v1.18/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/AggregatedStatisticsTracker.java +++ b/flink/v1.18/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/AggregatedStatisticsTracker.java @@ -223,7 +223,9 @@ private void merge(DataStatistics taskStatistics) { convertCoordinatorToSketch(); } - sketchStatistics.update(taskSketch); + if (taskSketch.getNumSamples() > 0) { + sketchStatistics.update(taskSketch); + } } } @@ -242,13 +244,18 @@ private CompletedStatistics completedStatistics(long checkpointId) { return CompletedStatistics.fromKeyFrequency(checkpointId, mapStatistics); } else { ReservoirItemsSketch sketch = sketchStatistics.getResult(); - LOG.info( - "Completed sketch statistics aggregation: " - + "reservoir size = {}, number of items seen = {}, number of samples = {}", - sketch.getK(), - sketch.getN(), - sketch.getNumSamples()); - return CompletedStatistics.fromKeySamples(checkpointId, sketch.getSamples()); + if (sketch != null) { + LOG.info( + "Completed sketch statistics aggregation: " + + "reservoir size = {}, number of items seen = {}, number of samples = {}", + sketch.getK(), + sketch.getN(), + sketch.getNumSamples()); + return CompletedStatistics.fromKeySamples(checkpointId, sketch.getSamples()); + } else { + LOG.info("Empty sketch statistics."); + return 
CompletedStatistics.fromKeySamples(checkpointId, new SortKey[0]); + } } } } diff --git a/flink/v1.18/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/CompletedStatistics.java b/flink/v1.18/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/CompletedStatistics.java index c0e228965ddd..e4cba174f0f2 100644 --- a/flink/v1.18/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/CompletedStatistics.java +++ b/flink/v1.18/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/CompletedStatistics.java @@ -100,4 +100,12 @@ Map keyFrequency() { SortKey[] keySamples() { return keySamples; } + + boolean isEmpty() { + if (type == StatisticsType.Sketch) { + return keySamples.length == 0; + } else { + return keyFrequency().isEmpty(); + } + } } diff --git a/flink/v1.18/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsCoordinator.java b/flink/v1.18/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsCoordinator.java index 3b21fbae315a..4bfde7204acf 100644 --- a/flink/v1.18/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsCoordinator.java +++ b/flink/v1.18/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsCoordinator.java @@ -38,11 +38,11 @@ import org.apache.flink.util.function.ThrowingRunnable; import org.apache.iceberg.Schema; import org.apache.iceberg.SortOrder; -import org.apache.iceberg.SortOrderComparators; import org.apache.iceberg.StructLike; import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; import org.apache.iceberg.relocated.com.google.common.collect.Iterables; import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.types.Comparators; import org.jetbrains.annotations.NotNull; import org.jetbrains.annotations.Nullable; import org.slf4j.Logger; @@ -91,7 +91,7 @@ class DataStatisticsCoordinator implements OperatorCoordinator { this.context = context; this.schema = schema; this.sortOrder = sortOrder; - this.comparator = SortOrderComparators.forSchema(schema, sortOrder); + this.comparator = Comparators.forType(SortKeyUtil.sortKeySchema(schema, sortOrder).asStruct()); this.downstreamParallelism = downstreamParallelism; this.statisticsType = statisticsType; this.closeFileCostWeightPercentage = closeFileCostWeightPercentage; @@ -202,17 +202,23 @@ private void handleDataStatisticRequest(int subtask, StatisticsEvent event) { aggregatedStatisticsTracker.updateAndCheckCompletion(subtask, event); if (maybeCompletedStatistics != null) { - // completedStatistics contains the complete samples, which is needed to compute - // the range bounds in globalStatistics if downstreamParallelism changed. - this.completedStatistics = maybeCompletedStatistics; - // globalStatistics only contains assignment calculated based on Map or Sketch statistics - this.globalStatistics = - globalStatistics( - maybeCompletedStatistics, - downstreamParallelism, - comparator, - closeFileCostWeightPercentage); - sendGlobalStatisticsToSubtasks(globalStatistics); + if (maybeCompletedStatistics.isEmpty()) { + LOG.info( + "Skip aggregated statistics for checkpoint {} as it is empty.", event.checkpointId()); + } else { + LOG.info("Completed statistics aggregation for checkpoint {}", event.checkpointId()); + // completedStatistics contains the complete samples, which is needed to compute + // the range bounds in globalStatistics if downstreamParallelism changed. 
+ this.completedStatistics = maybeCompletedStatistics; + // globalStatistics only contains assignment calculated based on Map or Sketch statistics + this.globalStatistics = + globalStatistics( + maybeCompletedStatistics, + downstreamParallelism, + comparator, + closeFileCostWeightPercentage); + sendGlobalStatisticsToSubtasks(globalStatistics); + } } } @@ -324,9 +330,14 @@ public void checkpointCoordinator(long checkpointId, CompletableFuture r "Snapshotting data statistics coordinator {} for checkpoint {}", operatorName, checkpointId); - resultFuture.complete( - StatisticsUtil.serializeCompletedStatistics( - completedStatistics, completedStatisticsSerializer)); + if (completedStatistics == null) { + // null checkpoint result is not allowed, hence supply an empty byte array + resultFuture.complete(new byte[0]); + } else { + resultFuture.complete( + StatisticsUtil.serializeCompletedStatistics( + completedStatistics, completedStatisticsSerializer)); + } }, String.format("taking checkpoint %d", checkpointId)); } @@ -338,7 +349,7 @@ public void notifyCheckpointComplete(long checkpointId) {} public void resetToCheckpoint(long checkpointId, byte[] checkpointData) { Preconditions.checkState( !started, "The coordinator %s can only be reset if it was not yet started", operatorName); - if (checkpointData == null) { + if (checkpointData == null || checkpointData.length == 0) { LOG.info( "Data statistic coordinator {} has nothing to restore from checkpoint {}", operatorName, diff --git a/flink/v1.18/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/RangePartitioner.java b/flink/v1.18/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/RangePartitioner.java index 83a9461233d2..6608b938f5a8 100644 --- a/flink/v1.18/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/RangePartitioner.java +++ b/flink/v1.18/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/RangePartitioner.java @@ -23,12 +23,13 @@ import org.apache.flink.annotation.Internal; import org.apache.flink.api.common.functions.Partitioner; import org.apache.flink.table.data.RowData; +import org.apache.iceberg.DistributionMode; import org.apache.iceberg.Schema; import org.apache.iceberg.SortOrder; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -/** The wrapper class */ +/** This custom partitioner implements the {@link DistributionMode#RANGE} for Flink sink. */ @Internal public class RangePartitioner implements Partitioner { private static final Logger LOG = LoggerFactory.getLogger(RangePartitioner.class); @@ -94,9 +95,8 @@ static int adjustPartitionWithRescale( if (numPartitionsStatsCalculation <= numPartitions) { // no rescale or scale-up case. // new subtasks are ignored and not assigned any keys, which is sub-optimal and only - // transient. - // when rescale is detected, operator requests new statistics from coordinator upon - // initialization. + // transient. when rescale is detected, operator requests new statistics from + // coordinator upon initialization. return partition; } else { // scale-down case. 
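To make the rescale comments above concrete, here is a small standalone sketch, not taken from the patch: the class name, the use of plain Strings in place of SortKey, and the hard-coded bounds are invented for illustration. It mirrors the lookup that SketchRangePartitioner delegates to SketchUtil.partition: a binary search over the range bounds picks the range index, and when the job runs with fewer partitions than the bounds were computed for, the index is folded back with a modulo, matching the scale-up and scale-down cases exercised in TestSketchUtil.

import java.util.Arrays;

public class RangeLookupSketch {
  // Simplified stand-in for SketchUtil.partition; the bounds below were computed for 4 partitions,
  // so numPartitionsStatsCalculation = rangeBounds.length + 1 = 4.
  static int partition(String key, int numPartitions, String[] rangeBounds) {
    int pos = Arrays.binarySearch(rangeBounds, key);
    // binarySearch returns the match index, or -(insertion point) - 1 when the key is absent
    if (pos < 0) {
      pos = -pos - 1;
    }

    if (numPartitions >= rangeBounds.length + 1) {
      // no rescale or scale-up: keep the index derived from the statistics
      return pos;
    } else {
      // scale-down: fold out-of-range indexes back into the smaller partition count
      return pos % numPartitions;
    }
  }

  public static void main(String[] args) {
    String[] bounds = {"c", "j", "m"}; // calculated for 4 partitions
    System.out.println(partition("a", 4, bounds)); // 0: <= c
    System.out.println(partition("k", 4, bounds)); // 2: > j && <= m
    System.out.println(partition("z", 6, bounds)); // 3: scale-up keeps the original index
    System.out.println(partition("z", 3, bounds)); // 0: scale-down, 3 % 3
  }
}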
diff --git a/flink/v1.18/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchRangePartitioner.java b/flink/v1.18/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchRangePartitioner.java index af78271ea5dc..dddb0d8722c0 100644 --- a/flink/v1.18/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchRangePartitioner.java +++ b/flink/v1.18/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchRangePartitioner.java @@ -18,17 +18,16 @@ */ package org.apache.iceberg.flink.sink.shuffle; -import java.util.Arrays; import java.util.Comparator; import org.apache.flink.api.common.functions.Partitioner; import org.apache.flink.table.data.RowData; import org.apache.iceberg.Schema; import org.apache.iceberg.SortKey; import org.apache.iceberg.SortOrder; -import org.apache.iceberg.SortOrderComparators; import org.apache.iceberg.StructLike; import org.apache.iceberg.flink.FlinkSchemaUtil; import org.apache.iceberg.flink.RowDataWrapper; +import org.apache.iceberg.types.Comparators; class SketchRangePartitioner implements Partitioner { private final SortKey sortKey; @@ -38,7 +37,7 @@ class SketchRangePartitioner implements Partitioner { SketchRangePartitioner(Schema schema, SortOrder sortOrder, SortKey[] rangeBounds) { this.sortKey = new SortKey(schema, sortOrder); - this.comparator = SortOrderComparators.forSchema(schema, sortOrder); + this.comparator = Comparators.forType(SortKeyUtil.sortKeySchema(schema, sortOrder).asStruct()); this.rangeBounds = rangeBounds; this.rowDataWrapper = new RowDataWrapper(FlinkSchemaUtil.convert(schema), schema.asStruct()); } @@ -47,18 +46,6 @@ class SketchRangePartitioner implements Partitioner { public int partition(RowData row, int numPartitions) { // reuse the sortKey and rowDataWrapper sortKey.wrap(rowDataWrapper.wrap(row)); - int partition = Arrays.binarySearch(rangeBounds, sortKey, comparator); - - // binarySearch either returns the match location or -[insertion point]-1 - if (partition < 0) { - partition = -partition - 1; - } - - if (partition > rangeBounds.length) { - partition = rangeBounds.length; - } - - return RangePartitioner.adjustPartitionWithRescale( - partition, rangeBounds.length + 1, numPartitions); + return SketchUtil.partition(sortKey, numPartitions, rangeBounds, comparator); } } diff --git a/flink/v1.18/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchUtil.java b/flink/v1.18/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchUtil.java index a58310611e8d..871ef9ef1149 100644 --- a/flink/v1.18/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchUtil.java +++ b/flink/v1.18/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchUtil.java @@ -139,4 +139,21 @@ static void convertMapToSketch( } }); } + + static int partition( + SortKey key, int numPartitions, SortKey[] rangeBounds, Comparator comparator) { + int partition = Arrays.binarySearch(rangeBounds, key, comparator); + + // binarySearch either returns the match location or -[insertion point]-1 + if (partition < 0) { + partition = -partition - 1; + } + + if (partition > rangeBounds.length) { + partition = rangeBounds.length; + } + + return RangePartitioner.adjustPartitionWithRescale( + partition, rangeBounds.length + 1, numPartitions); + } } diff --git a/flink/v1.18/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeyUtil.java b/flink/v1.18/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeyUtil.java new file mode 100644 index 000000000000..1e5bdbbac3e4 --- /dev/null +++ 
b/flink/v1.18/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeyUtil.java @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.shuffle; + +import java.util.List; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SortField; +import org.apache.iceberg.SortOrder; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.types.Type; +import org.apache.iceberg.types.Types; + +class SortKeyUtil { + private SortKeyUtil() {} + + /** Compute the result schema of {@code SortKey} transformation */ + static Schema sortKeySchema(Schema schema, SortOrder sortOrder) { + List sortFields = sortOrder.fields(); + int size = sortFields.size(); + List transformedFields = Lists.newArrayListWithCapacity(size); + for (int i = 0; i < size; ++i) { + int sourceFieldId = sortFields.get(i).sourceId(); + Types.NestedField sourceField = schema.findField(sourceFieldId); + Preconditions.checkArgument( + sourceField != null, "Cannot find source field: %s", sourceFieldId); + Type transformedType = sortFields.get(i).transform().getResultType(sourceField.type()); + // There could be multiple transformations on the same source column, like in the PartitionKey + // case. To resolve the collision, field id is set to transform index and field name is set to + // sourceFieldName_transformIndex + Types.NestedField transformedField = + Types.NestedField.of( + i, + sourceField.isOptional(), + sourceField.name() + '_' + i, + transformedType, + sourceField.doc()); + transformedFields.add(transformedField); + } + + return new Schema(transformedFields); + } +} diff --git a/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestRangePartitioner.java b/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestRangePartitioner.java new file mode 100644 index 000000000000..0485fdb7fa04 --- /dev/null +++ b/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestRangePartitioner.java @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.shuffle; + +import static org.apache.iceberg.flink.sink.shuffle.Fixtures.CHAR_KEYS; +import static org.apache.iceberg.flink.sink.shuffle.Fixtures.SCHEMA; +import static org.apache.iceberg.flink.sink.shuffle.Fixtures.SORT_ORDER; +import static org.assertj.core.api.Assertions.assertThat; + +import java.util.Set; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.StringData; +import org.apache.iceberg.SortKey; +import org.apache.iceberg.relocated.com.google.common.collect.Sets; +import org.junit.jupiter.api.Test; + +public class TestRangePartitioner { + private final int numPartitions = 4; + + @Test + public void testRoundRobinRecordsBeforeStatisticsAvailable() { + RangePartitioner partitioner = new RangePartitioner(SCHEMA, SORT_ORDER); + Set results = Sets.newHashSetWithExpectedSize(numPartitions); + for (int i = 0; i < numPartitions; ++i) { + results.add( + partitioner.partition( + StatisticsOrRecord.fromRecord(GenericRowData.of(StringData.fromString("a"), 1)), + numPartitions)); + } + + // round-robin. every partition should get an assignment + assertThat(results).containsExactlyInAnyOrder(0, 1, 2, 3); + } + + @Test + public void testRoundRobinStatisticsWrapper() { + RangePartitioner partitioner = new RangePartitioner(SCHEMA, SORT_ORDER); + Set results = Sets.newHashSetWithExpectedSize(numPartitions); + for (int i = 0; i < numPartitions; ++i) { + GlobalStatistics statistics = + GlobalStatistics.fromRangeBounds(1L, new SortKey[] {CHAR_KEYS.get("a")}); + results.add( + partitioner.partition(StatisticsOrRecord.fromStatistics(statistics), numPartitions)); + } + + // round-robin. every partition should get an assignment + assertThat(results).containsExactlyInAnyOrder(0, 1, 2, 3); + } +} diff --git a/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchRangePartitioner.java b/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchRangePartitioner.java new file mode 100644 index 000000000000..378c6afff077 --- /dev/null +++ b/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchRangePartitioner.java @@ -0,0 +1,88 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink.shuffle; + +import static org.assertj.core.api.Assertions.assertThat; + +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.SortKey; +import org.apache.iceberg.SortOrder; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.RowDataWrapper; +import org.apache.iceberg.flink.TestFixtures; +import org.junit.jupiter.api.Test; + +public class TestSketchRangePartitioner { + // sort on the long id field + private static final SortOrder SORT_ORDER = + SortOrder.builderFor(TestFixtures.SCHEMA).asc("id").build(); + private static final SortKey SORT_KEY = new SortKey(TestFixtures.SCHEMA, SORT_ORDER); + private static final RowType ROW_TYPE = FlinkSchemaUtil.convert(TestFixtures.SCHEMA); + private static final int NUM_PARTITIONS = 16; + private static final long RANGE_STEP = 1_000; + private static final long MAX_ID = RANGE_STEP * NUM_PARTITIONS; + private static final SortKey[] RANGE_BOUNDS = createRangeBounds(); + + /** + * To understand how range bounds are used in range partitioning, here is an example for human + * ages with 4 partitions: [15, 32, 60]. The 4 ranges would be + * + *
<ul>
+ *   <li>age <= 15
+ *   <li>age > 15 && age <= 32
+ *   <li>age > 32 && age <= 60
+ *   <li>age > 60
+ * </ul>
+ */ + private static SortKey[] createRangeBounds() { + SortKey[] rangeBounds = new SortKey[NUM_PARTITIONS - 1]; + for (int i = 0; i < NUM_PARTITIONS - 1; ++i) { + RowData rowData = + GenericRowData.of( + StringData.fromString("data"), + RANGE_STEP * (i + 1), + StringData.fromString("2023-06-20")); + RowDataWrapper keyWrapper = new RowDataWrapper(ROW_TYPE, TestFixtures.SCHEMA.asStruct()); + keyWrapper.wrap(rowData); + SortKey sortKey = new SortKey(TestFixtures.SCHEMA, SORT_ORDER); + sortKey.wrap(keyWrapper); + rangeBounds[i] = sortKey; + } + + return rangeBounds; + } + + @Test + public void testRangePartitioningWithRangeBounds() { + SketchRangePartitioner partitioner = + new SketchRangePartitioner(TestFixtures.SCHEMA, SORT_ORDER, RANGE_BOUNDS); + GenericRowData row = + GenericRowData.of(StringData.fromString("data"), 0L, StringData.fromString("2023-06-20")); + for (long id = 0; id < MAX_ID; ++id) { + row.setField(1, id); + int partition = partitioner.partition(row, NUM_PARTITIONS); + assertThat(partition).isGreaterThanOrEqualTo(0).isLessThan(NUM_PARTITIONS); + int expectedPartition = id == 0L ? 0 : (int) ((id - 1) / RANGE_STEP); + assertThat(partition).isEqualTo(expectedPartition); + } + } +} diff --git a/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchUtil.java b/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchUtil.java index 31dae5c76aeb..16202c075ea0 100644 --- a/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchUtil.java +++ b/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchUtil.java @@ -19,10 +19,13 @@ package org.apache.iceberg.flink.sink.shuffle; import static org.apache.iceberg.flink.sink.shuffle.Fixtures.CHAR_KEYS; +import static org.apache.iceberg.flink.sink.shuffle.Fixtures.SORT_ORDER_COMPARTOR; import static org.assertj.core.api.Assertions.assertThat; import org.apache.iceberg.SortKey; import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; public class TestSketchUtil { @Test @@ -55,7 +58,7 @@ public void testRangeBoundsOneChannel() { assertThat( SketchUtil.rangeBounds( 1, - Fixtures.SORT_ORDER_COMPARTOR, + SORT_ORDER_COMPARTOR, new SortKey[] { CHAR_KEYS.get("a"), CHAR_KEYS.get("b"), @@ -72,7 +75,7 @@ public void testRangeBoundsDivisible() { assertThat( SketchUtil.rangeBounds( 3, - Fixtures.SORT_ORDER_COMPARTOR, + SORT_ORDER_COMPARTOR, new SortKey[] { CHAR_KEYS.get("a"), CHAR_KEYS.get("b"), @@ -90,7 +93,7 @@ public void testRangeBoundsNonDivisible() { assertThat( SketchUtil.rangeBounds( 4, - Fixtures.SORT_ORDER_COMPARTOR, + SORT_ORDER_COMPARTOR, new SortKey[] { CHAR_KEYS.get("a"), CHAR_KEYS.get("b"), @@ -113,7 +116,7 @@ public void testRangeBoundsSkipDuplicates() { assertThat( SketchUtil.rangeBounds( 4, - Fixtures.SORT_ORDER_COMPARTOR, + SORT_ORDER_COMPARTOR, new SortKey[] { CHAR_KEYS.get("a"), CHAR_KEYS.get("b"), @@ -130,4 +133,57 @@ public void testRangeBoundsSkipDuplicates() { // skipped duplicate c's .containsExactly(CHAR_KEYS.get("c"), CHAR_KEYS.get("g"), CHAR_KEYS.get("j")); } + + @ParameterizedTest + @ValueSource(ints = {4, 6}) + public void testPartitioningAndScaleUp(int numPartitions) { + // Range bounds are calculated based on 4 partitions + SortKey[] rangeBounds = + new SortKey[] {CHAR_KEYS.get("c"), CHAR_KEYS.get("j"), CHAR_KEYS.get("m")}; + + // <= c + assertPartition(0, CHAR_KEYS.get("a"), numPartitions, rangeBounds); + assertPartition(0, 
CHAR_KEYS.get("c"), numPartitions, rangeBounds); + // > c && <= j + assertPartition(1, CHAR_KEYS.get("d"), numPartitions, rangeBounds); + assertPartition(1, CHAR_KEYS.get("i"), numPartitions, rangeBounds); + assertPartition(1, CHAR_KEYS.get("j"), numPartitions, rangeBounds); + // > j && <= m + assertPartition(2, CHAR_KEYS.get("k"), numPartitions, rangeBounds); + assertPartition(2, CHAR_KEYS.get("l"), numPartitions, rangeBounds); + assertPartition(2, CHAR_KEYS.get("m"), numPartitions, rangeBounds); + // > m + assertPartition(3, CHAR_KEYS.get("n"), numPartitions, rangeBounds); + assertPartition(3, CHAR_KEYS.get("z"), numPartitions, rangeBounds); + } + + @Test + public void testPartitionScaleDown() { + // Range bounds are calculated based on 4 partitions + SortKey[] rangeBounds = + new SortKey[] {CHAR_KEYS.get("c"), CHAR_KEYS.get("j"), CHAR_KEYS.get("m")}; + int numPartitions = 3; + + // <= c + assertPartition(0, CHAR_KEYS.get("a"), numPartitions, rangeBounds); + assertPartition(0, CHAR_KEYS.get("c"), numPartitions, rangeBounds); + // > c && <= j + assertPartition(1, CHAR_KEYS.get("d"), numPartitions, rangeBounds); + assertPartition(1, CHAR_KEYS.get("i"), numPartitions, rangeBounds); + assertPartition(1, CHAR_KEYS.get("j"), numPartitions, rangeBounds); + // > j && <= m + assertPartition(2, CHAR_KEYS.get("k"), numPartitions, rangeBounds); + assertPartition(2, CHAR_KEYS.get("l"), numPartitions, rangeBounds); + assertPartition(2, CHAR_KEYS.get("m"), numPartitions, rangeBounds); + // > m + // reassigns out-of-range partitions via mod (% 3 in this case) + assertPartition(0, CHAR_KEYS.get("n"), numPartitions, rangeBounds); + assertPartition(0, CHAR_KEYS.get("z"), numPartitions, rangeBounds); + } + + private static void assertPartition( + int expectedPartition, SortKey key, int numPartitions, SortKey[] rangeBounds) { + assertThat(SketchUtil.partition(key, numPartitions, rangeBounds, SORT_ORDER_COMPARTOR)) + .isEqualTo(expectedPartition); + } } diff --git a/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeyUtil.java b/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeyUtil.java new file mode 100644 index 000000000000..1be7e27f2c01 --- /dev/null +++ b/flink/v1.18/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeyUtil.java @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink.shuffle; + +import static org.assertj.core.api.Assertions.assertThat; + +import org.apache.iceberg.NullOrder; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SortDirection; +import org.apache.iceberg.SortOrder; +import org.apache.iceberg.expressions.Expressions; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.Test; + +public class TestSortKeyUtil { + @Test + public void testResultSchema() { + Schema schema = + new Schema( + Types.NestedField.required(1, "id", Types.StringType.get()), + Types.NestedField.required(2, "ratio", Types.DoubleType.get()), + Types.NestedField.optional( + 3, + "user", + Types.StructType.of( + Types.NestedField.required(11, "name", Types.StringType.get()), + Types.NestedField.required(12, "ts", Types.TimestampType.withoutZone()), + Types.NestedField.optional(13, "device_id", Types.UUIDType.get()), + Types.NestedField.optional( + 14, + "location", + Types.StructType.of( + Types.NestedField.required(101, "lat", Types.FloatType.get()), + Types.NestedField.required(102, "long", Types.FloatType.get()), + Types.NestedField.required(103, "blob", Types.BinaryType.get())))))); + + SortOrder sortOrder = + SortOrder.builderFor(schema) + .asc("ratio") + .sortBy(Expressions.hour("user.ts"), SortDirection.ASC, NullOrder.NULLS_FIRST) + .sortBy( + Expressions.bucket("user.device_id", 16), SortDirection.ASC, NullOrder.NULLS_FIRST) + .sortBy( + Expressions.truncate("user.location.blob", 16), + SortDirection.ASC, + NullOrder.NULLS_FIRST) + .build(); + + assertThat(SortKeyUtil.sortKeySchema(schema, sortOrder).asStruct()) + .isEqualTo( + Types.StructType.of( + Types.NestedField.required(0, "ratio_0", Types.DoubleType.get()), + Types.NestedField.required(1, "ts_1", Types.IntegerType.get()), + Types.NestedField.optional(2, "device_id_2", Types.IntegerType.get()), + Types.NestedField.required(3, "blob_3", Types.BinaryType.get()))); + } +} From b17d1c9abdb8fbd668ac02194cadd6003c3e37f7 Mon Sep 17 00:00:00 2001 From: Piotr Findeisen Date: Fri, 2 Aug 2024 20:44:48 +0200 Subject: [PATCH 21/55] Core: Remove reflection from TestParallelIterable (#10857) This is a unit test, so can leverage package-private access. 
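To make the intent concrete, a self-contained sketch of the pattern this change adopts follows (plain Java, not Iceberg code; the BoundedBuffer class and its names are invented for illustration): instead of reading a private field reflectively, the class under test exposes a package-private, test-only accessor.

// Illustrative sketch only (not Iceberg code): the reflection-free pattern this commit adopts.
// A collaborator keeps its queue private but exposes a package-private, test-only accessor.
import java.util.ArrayDeque;
import java.util.Queue;

class BoundedBuffer<T> {
  private final Queue<T> queue = new ArrayDeque<>();

  void add(T element) {
    queue.add(element);
  }

  // Package-private on purpose: reachable from tests in the same package, hidden from the public
  // API. Iceberg marks such members with @VisibleForTesting (relocated Guava), as in the diff below.
  int queueSize() {
    return queue.size();
  }
}

class BoundedBufferTestSketch {
  public static void main(String[] args) {
    BoundedBuffer<String> buffer = new BoundedBuffer<>();
    buffer.add("a");

    // No Field#setAccessible(true) and no unchecked ConcurrentLinkedQueue casts; the compiler
    // verifies the access, which is exactly what the rewritten test relies on.
    if (buffer.queueSize() != 1) {
      throw new AssertionError("expected exactly one buffered element");
    }
  }
}

The diff below applies the same idea to ParallelIterable: ParallelIterator becomes a package-private static class with a @VisibleForTesting queueSize() accessor, and TestParallelIterable drops java.lang.reflect.Field entirely.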
--- .../apache/iceberg/util/ParallelIterable.java | 9 +++- .../iceberg/util/TestParallelIterable.java | 41 ++++++++----------- 2 files changed, 25 insertions(+), 25 deletions(-) diff --git a/core/src/main/java/org/apache/iceberg/util/ParallelIterable.java b/core/src/main/java/org/apache/iceberg/util/ParallelIterable.java index 6486bd7fd483..16fa6f3d8537 100644 --- a/core/src/main/java/org/apache/iceberg/util/ParallelIterable.java +++ b/core/src/main/java/org/apache/iceberg/util/ParallelIterable.java @@ -35,6 +35,7 @@ import org.apache.iceberg.io.CloseableGroup; import org.apache.iceberg.io.CloseableIterable; import org.apache.iceberg.io.CloseableIterator; +import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.relocated.com.google.common.collect.Iterables; import org.apache.iceberg.relocated.com.google.common.io.Closer; @@ -77,7 +78,8 @@ public CloseableIterator iterator() { return iter; } - private static class ParallelIterator implements CloseableIterator { + @VisibleForTesting + static class ParallelIterator implements CloseableIterator { private final Iterator> tasks; private final Deque> yieldedTasks = new ArrayDeque<>(); private final ExecutorService workerPool; @@ -229,6 +231,11 @@ public synchronized T next() { } return queue.poll(); } + + @VisibleForTesting + int queueSize() { + return queue.size(); + } } private static class Task implements Supplier>>, Closeable { diff --git a/core/src/test/java/org/apache/iceberg/util/TestParallelIterable.java b/core/src/test/java/org/apache/iceberg/util/TestParallelIterable.java index 4910732f6e35..c259bbd0a7e2 100644 --- a/core/src/test/java/org/apache/iceberg/util/TestParallelIterable.java +++ b/core/src/test/java/org/apache/iceberg/util/TestParallelIterable.java @@ -21,12 +21,9 @@ import static org.assertj.core.api.Assertions.assertThat; import java.io.IOException; -import java.lang.reflect.Field; import java.util.Collections; import java.util.Iterator; import java.util.List; -import java.util.Queue; -import java.util.concurrent.ConcurrentLinkedQueue; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.TimeUnit; @@ -40,6 +37,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Iterables; import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.relocated.com.google.common.collect.Multiset; +import org.apache.iceberg.util.ParallelIterable.ParallelIterator; import org.awaitility.Awaitility; import org.junit.jupiter.api.Test; @@ -64,20 +62,17 @@ public CloseableIterator iterator() { }); ParallelIterable parallelIterable = new ParallelIterable<>(transform, executor); - CloseableIterator iterator = parallelIterable.iterator(); - Field queueField = iterator.getClass().getDeclaredField("queue"); - queueField.setAccessible(true); - ConcurrentLinkedQueue queue = (ConcurrentLinkedQueue) queueField.get(iterator); + ParallelIterator iterator = (ParallelIterator) parallelIterable.iterator(); assertThat(iterator.hasNext()).isTrue(); assertThat(iterator.next()).isNotNull(); Awaitility.await("Queue is populated") .atMost(5, TimeUnit.SECONDS) - .untilAsserted(() -> queueHasElements(iterator, queue)); + .untilAsserted(() -> queueHasElements(iterator)); iterator.close(); Awaitility.await("Queue is cleared") .atMost(5, TimeUnit.SECONDS) - .untilAsserted(() -> assertThat(queue).isEmpty()); + .untilAsserted(() -> 
assertThat(iterator.queueSize()).isEqualTo(0)); } @Test @@ -124,20 +119,21 @@ public CloseableIterator iterator() { }); ParallelIterable parallelIterable = new ParallelIterable<>(transform, executor); - CloseableIterator iterator = parallelIterable.iterator(); - Field queueField = iterator.getClass().getDeclaredField("queue"); - queueField.setAccessible(true); - ConcurrentLinkedQueue queue = (ConcurrentLinkedQueue) queueField.get(iterator); + ParallelIterator iterator = (ParallelIterator) parallelIterable.iterator(); assertThat(iterator.hasNext()).isTrue(); assertThat(iterator.next()).isNotNull(); Awaitility.await("Queue is populated") .atMost(5, TimeUnit.SECONDS) - .untilAsserted(() -> queueHasElements(iterator, queue)); + .untilAsserted(() -> queueHasElements(iterator)); iterator.close(); Awaitility.await("Queue is cleared") .atMost(5, TimeUnit.SECONDS) - .untilAsserted(() -> assertThat(queue).as("Queue is not empty after cleaning").isEmpty()); + .untilAsserted( + () -> + assertThat(iterator.queueSize()) + .as("Queue is not empty after cleaning") + .isEqualTo(0)); } @Test @@ -159,17 +155,14 @@ public void limitQueueSize() throws IOException, IllegalAccessException, NoSuchF ExecutorService executor = Executors.newCachedThreadPool(); ParallelIterable parallelIterable = new ParallelIterable<>(iterables, executor, maxQueueSize); - CloseableIterator iterator = parallelIterable.iterator(); - Field queueField = iterator.getClass().getDeclaredField("queue"); - queueField.setAccessible(true); - ConcurrentLinkedQueue queue = (ConcurrentLinkedQueue) queueField.get(iterator); + ParallelIterator iterator = (ParallelIterator) parallelIterable.iterator(); Multiset actualValues = HashMultiset.create(); while (iterator.hasNext()) { - assertThat(queue) - .as("iterator internal queue") - .hasSizeLessThanOrEqualTo(maxQueueSize + iterables.size()); + assertThat(iterator.queueSize()) + .as("iterator internal queue size") + .isLessThanOrEqualTo(maxQueueSize + iterables.size()); actualValues.add(iterator.next()); } @@ -181,9 +174,9 @@ public void limitQueueSize() throws IOException, IllegalAccessException, NoSuchF executor.shutdownNow(); } - private void queueHasElements(CloseableIterator iterator, Queue queue) { + private void queueHasElements(ParallelIterator iterator) { assertThat(iterator.hasNext()).isTrue(); assertThat(iterator.next()).isNotNull(); - assertThat(queue).isNotEmpty(); + assertThat(iterator.queueSize()).as("queue size").isGreaterThan(0); } } From 479f468c5f389bd7a30938114f8e79445c48f179 Mon Sep 17 00:00:00 2001 From: Ryan Blue Date: Sun, 4 Aug 2024 14:32:52 -0700 Subject: [PATCH 22/55] Spec: Deprecate the file system table scheme (#10833) --- format/spec.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/format/spec.md b/format/spec.md index 5a90f6fd978d..daef7538e730 100644 --- a/format/spec.md +++ b/format/spec.md @@ -779,7 +779,9 @@ When two commits happen at the same time and are based on the same version, only #### File System Tables -An atomic swap can be implemented using atomic rename in file systems that support it, like HDFS or most local file systems [1]. +_Note: This file system based scheme to commit a metadata file is **deprecated** and will be removed in version 4 of this spec. The scheme is **unsafe** in object stores and local file systems._ + +An atomic swap can be implemented using atomic rename in file systems that support it, like HDFS [1]. 
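To illustrate the mechanism this paragraph describes (and why it is being deprecated), here is a hedged sketch of a rename-based commit against the Hadoop FileSystem API; the v<N>.metadata.json layout and the helper itself are assumptions for illustration, not spec text or Iceberg's actual implementation.

// Sketch of a rename-based metadata commit (assumptions: layout v<N>.metadata.json, HDFS-like
// rename semantics where the call is atomic and returns false if the destination exists).
import java.io.IOException;
import java.io.OutputStream;
import java.util.UUID;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

class FileSystemCommitSketch {
  /** Returns true if version V+1 was committed, false if another writer won the race. */
  static boolean commit(FileSystem fs, Path metadataDir, int currentVersion, byte[] metadataJson)
      throws IOException {
    Path temp = new Path(metadataDir, "temp-" + UUID.randomUUID() + ".metadata.json");
    try (OutputStream out = fs.create(temp, false /* never overwrite */)) {
      out.write(metadataJson);
    }

    Path next = new Path(metadataDir, "v" + (currentVersion + 1) + ".metadata.json");
    // On HDFS this rename is atomic and refuses to clobber an existing file, which is what makes
    // the swap safe there; most object stores cannot provide this, hence the deprecation.
    boolean committed = fs.rename(temp, next);
    if (!committed) {
      fs.delete(temp, false); // clean up the losing attempt and let the caller retry on new state
    }

    return committed;
  }
}

The safety argument rests entirely on rename being atomic and refusing to overwrite an existing destination; where a store cannot guarantee that, two writers can each believe they committed, which is the corruption risk the deprecation note above calls out.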
Each version of table metadata is stored in a metadata folder under the table’s base location using a file naming scheme that includes a version number, `V`: `v.metadata.json`. To commit a new metadata version, `V+1`, the writer performs the following steps: @@ -1393,4 +1395,4 @@ This section covers topics not required by the specification but recommendations Iceberg supports two types of histories for tables. A history of previous "current snapshots" stored in ["snapshot-log" table metadata](#table-metadata-fields) and [parent-child lineage stored in "snapshots"](#table-metadata-fields). These two histories might indicate different snapshot IDs for a specific timestamp. The discrepancies can be caused by a variety of table operations (e.g. updating the `current-snapshot-id` can be used to set the snapshot of a table to any arbitrary snapshot, which might have a lineage derived from a table branch or no lineage at all). -When processing point in time queries implementations should use "snapshot-log" metadata to lookup the table state at the given point in time. This ensures time-travel queries reflect the state of the table at the provided timestamp. For example a SQL query like `SELECT * FROM prod.db.table TIMESTAMP AS OF '1986-10-26 01:21:00Z';` would find the snapshot of the Iceberg table just prior to '1986-10-26 01:21:00 UTC' in the snapshot logs and use the metadata from that snapshot to perform the scan of the table. If no snapshot exists prior to the timestamp given or "snapshot-log" is not populated (it is an optional field), then systems should raise an informative error message about the missing metadata. \ No newline at end of file +When processing point in time queries implementations should use "snapshot-log" metadata to lookup the table state at the given point in time. This ensures time-travel queries reflect the state of the table at the provided timestamp. For example a SQL query like `SELECT * FROM prod.db.table TIMESTAMP AS OF '1986-10-26 01:21:00Z';` would find the snapshot of the Iceberg table just prior to '1986-10-26 01:21:00 UTC' in the snapshot logs and use the metadata from that snapshot to perform the scan of the table. If no snapshot exists prior to the timestamp given or "snapshot-log" is not populated (it is an optional field), then systems should raise an informative error message about the missing metadata. From d9aacd24cc9d730d6416a93d31dd5cde8cbd260a Mon Sep 17 00:00:00 2001 From: Shani Elharrar Date: Mon, 5 Aug 2024 05:45:46 +0300 Subject: [PATCH 23/55] Core, API: UpdatePartitionSpec: Added ability to create a new Partition Spec but not set it as the Default --- .../apache/iceberg/UpdatePartitionSpec.java | 11 +++++++++++ .../iceberg/BaseUpdatePartitionSpec.java | 16 +++++++++++++++- .../org/apache/iceberg/TableMetadata.java | 4 ++++ .../iceberg/TestTableUpdatePartitionSpec.java | 19 +++++++++++++++++++ 4 files changed, 49 insertions(+), 1 deletion(-) diff --git a/api/src/main/java/org/apache/iceberg/UpdatePartitionSpec.java b/api/src/main/java/org/apache/iceberg/UpdatePartitionSpec.java index f48d590af1ce..eeb596d42d5c 100644 --- a/api/src/main/java/org/apache/iceberg/UpdatePartitionSpec.java +++ b/api/src/main/java/org/apache/iceberg/UpdatePartitionSpec.java @@ -122,4 +122,15 @@ public interface UpdatePartitionSpec extends PendingUpdate { * change conflicts with other additions, removals, or renames. 
*/ UpdatePartitionSpec renameField(String name, String newName); + + /** + * Sets that the new partition spec will be NOT set as the default partition spec for the table, + * the default behavior is to do so. + * + * @return this for method chaining + */ + default UpdatePartitionSpec addNonDefaultSpec() { + throw new UnsupportedOperationException( + this.getClass().getName() + " doesn't implement addNonDefaultSpec()"); + }; } diff --git a/core/src/main/java/org/apache/iceberg/BaseUpdatePartitionSpec.java b/core/src/main/java/org/apache/iceberg/BaseUpdatePartitionSpec.java index 2e1c9199174c..c69f6f3844f9 100644 --- a/core/src/main/java/org/apache/iceberg/BaseUpdatePartitionSpec.java +++ b/core/src/main/java/org/apache/iceberg/BaseUpdatePartitionSpec.java @@ -59,11 +59,13 @@ class BaseUpdatePartitionSpec implements UpdatePartitionSpec { private final Map renames = Maps.newHashMap(); private boolean caseSensitive; + private boolean setAsDefault; private int lastAssignedPartitionId; BaseUpdatePartitionSpec(TableOperations ops) { this.ops = ops; this.caseSensitive = true; + this.setAsDefault = true; this.base = ops.current(); this.formatVersion = base.formatVersion(); this.spec = base.spec(); @@ -95,6 +97,7 @@ class BaseUpdatePartitionSpec implements UpdatePartitionSpec { this.base = null; this.formatVersion = formatVersion; this.caseSensitive = true; + this.setAsDefault = true; this.spec = spec; this.schema = spec.schema(); this.nameToField = indexSpecByName(spec); @@ -146,6 +149,12 @@ public UpdatePartitionSpec caseSensitive(boolean isCaseSensitive) { return this; } + @Override + public UpdatePartitionSpec addNonDefaultSpec() { + this.setAsDefault = false; + return this; + } + @Override public BaseUpdatePartitionSpec addField(String sourceName) { return addField(Expressions.ref(sourceName)); @@ -327,7 +336,12 @@ public PartitionSpec apply() { @Override public void commit() { - TableMetadata update = base.updatePartitionSpec(apply()); + TableMetadata update; + if (setAsDefault) { + update = base.updatePartitionSpec(apply()); + } else { + update = base.addPartitionSpec(apply()); + } ops.commit(base, update); } diff --git a/core/src/main/java/org/apache/iceberg/TableMetadata.java b/core/src/main/java/org/apache/iceberg/TableMetadata.java index bd1c8a1a0371..923db6bbd68f 100644 --- a/core/src/main/java/org/apache/iceberg/TableMetadata.java +++ b/core/src/main/java/org/apache/iceberg/TableMetadata.java @@ -564,6 +564,10 @@ public TableMetadata updatePartitionSpec(PartitionSpec newPartitionSpec) { return new Builder(this).setDefaultPartitionSpec(newPartitionSpec).build(); } + public TableMetadata addPartitionSpec(PartitionSpec newPartitionSpec) { + return new Builder(this).addPartitionSpec(newPartitionSpec).build(); + } + public TableMetadata replaceSortOrder(SortOrder newOrder) { return new Builder(this).setDefaultSortOrder(newOrder).build(); } diff --git a/core/src/test/java/org/apache/iceberg/TestTableUpdatePartitionSpec.java b/core/src/test/java/org/apache/iceberg/TestTableUpdatePartitionSpec.java index 482514c40093..f327ef752947 100644 --- a/core/src/test/java/org/apache/iceberg/TestTableUpdatePartitionSpec.java +++ b/core/src/test/java/org/apache/iceberg/TestTableUpdatePartitionSpec.java @@ -287,4 +287,23 @@ public void testAddAfterLastFieldRemoved() { assertThat(table.spec().lastAssignedFieldId()).isEqualTo(1001); assertThat(table.ops().current().lastAssignedPartitionId()).isEqualTo(1001); } + + @TestTemplate + public void testCommitUpdatedSpecWithoutSettingNewDefault() { + PartitionSpec 
originalSpec = table.spec(); + table.updateSpec().addField("id").addNonDefaultSpec().commit(); + + assertThat(table.spec()) + .as("Should not set the default spec for the table") + .isSameAs(originalSpec); + + assertThat(table.specs().get(1)) + .as("The new spec created for the table") + .isEqualTo( + PartitionSpec.builderFor(table.schema()) + .withSpecId(1) + .bucket("data", 16) + .identity("id") + .build()); + } } From 4cfa38fbb2d8614eb22014b665b74f3b2c17b8ba Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 5 Aug 2024 01:14:32 -0500 Subject: [PATCH 24/55] Build: Bump com.palantir.baseline:gradle-baseline-java (#10864) Bumps [com.palantir.baseline:gradle-baseline-java](https://github.com/palantir/gradle-baseline) from 5.58.0 to 5.61.0. - [Release notes](https://github.com/palantir/gradle-baseline/releases) - [Changelog](https://github.com/palantir/gradle-baseline/blob/develop/.changelog.yml) - [Commits](https://github.com/palantir/gradle-baseline/compare/5.58.0...5.61.0) --- updated-dependencies: - dependency-name: com.palantir.baseline:gradle-baseline-java dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- build.gradle | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build.gradle b/build.gradle index 60fb5e7830d3..74029f3db2eb 100644 --- a/build.gradle +++ b/build.gradle @@ -27,7 +27,7 @@ buildscript { } dependencies { classpath 'io.github.goooler.shadow:shadow-gradle-plugin:8.1.8' - classpath 'com.palantir.baseline:gradle-baseline-java:5.58.0' + classpath 'com.palantir.baseline:gradle-baseline-java:5.61.0' classpath 'com.diffplug.spotless:spotless-plugin-gradle:6.13.0' classpath 'gradle.plugin.org.inferred:gradle-processors:3.7.0' classpath 'me.champeau.jmh:jmh-gradle-plugin:0.7.2' From 98ecc9a9ef7e7bd136d308828355c107de86f4b2 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 5 Aug 2024 08:57:45 +0200 Subject: [PATCH 25/55] Build: Bump nessie from 0.94.2 to 0.94.4 (#10869) Bumps `nessie` from 0.94.2 to 0.94.4. Updates `org.projectnessie.nessie:nessie-client` from 0.94.2 to 0.94.4 Updates `org.projectnessie.nessie:nessie-jaxrs-testextension` from 0.94.2 to 0.94.4 Updates `org.projectnessie.nessie:nessie-versioned-storage-inmemory-tests` from 0.94.2 to 0.94.4 Updates `org.projectnessie.nessie:nessie-versioned-storage-testextension` from 0.94.2 to 0.94.4 --- updated-dependencies: - dependency-name: org.projectnessie.nessie:nessie-client dependency-type: direct:production update-type: version-update:semver-patch - dependency-name: org.projectnessie.nessie:nessie-jaxrs-testextension dependency-type: direct:production update-type: version-update:semver-patch - dependency-name: org.projectnessie.nessie:nessie-versioned-storage-inmemory-tests dependency-type: direct:production update-type: version-update:semver-patch - dependency-name: org.projectnessie.nessie:nessie-versioned-storage-testextension dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- gradle/libs.versions.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index 02702955a5fe..2889ce9cb033 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -67,7 +67,7 @@ kryo-shaded = "4.0.3" microprofile-openapi-api = "3.1.1" mockito = "4.11.0" mockserver = "5.15.0" -nessie = "0.94.2" +nessie = "0.94.4" netty-buffer = "4.1.112.Final" netty-buffer-compat = "4.1.112.Final" object-client-bundle = "3.3.2" From e8582c0f00159638f822c831ddeccd47b01d7f9f Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 5 Aug 2024 09:02:39 +0200 Subject: [PATCH 26/55] Build: Bump org.xerial:sqlite-jdbc from 3.46.0.0 to 3.46.0.1 (#10871) Bumps [org.xerial:sqlite-jdbc](https://github.com/xerial/sqlite-jdbc) from 3.46.0.0 to 3.46.0.1. - [Release notes](https://github.com/xerial/sqlite-jdbc/releases) - [Changelog](https://github.com/xerial/sqlite-jdbc/blob/master/CHANGELOG) - [Commits](https://github.com/xerial/sqlite-jdbc/compare/3.46.0.0...3.46.0.1) --- updated-dependencies: - dependency-name: org.xerial:sqlite-jdbc dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- gradle/libs.versions.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index 2889ce9cb033..16ea78901858 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -84,7 +84,7 @@ spark-hive34 = "3.4.3" spark-hive35 = "3.5.1" spring-boot = "2.7.18" spring-web = "5.3.37" -sqlite-jdbc = "3.46.0.0" +sqlite-jdbc = "3.46.0.1" testcontainers = "1.20.0" tez010 = "0.10.3" tez08 = { strictly = "0.8.4"} # see rich version usage explanation above From 1f21989305d92f1530c785c826b221bddfb4ce0b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 5 Aug 2024 09:08:07 +0200 Subject: [PATCH 27/55] Build: Bump org.apache.commons:commons-compress from 1.26.0 to 1.26.2 (#10868) Bumps org.apache.commons:commons-compress from 1.26.0 to 1.26.2. --- updated-dependencies: - dependency-name: org.apache.commons:commons-compress dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- kafka-connect/build.gradle | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kafka-connect/build.gradle b/kafka-connect/build.gradle index 60fa879d291f..a0e55f23405d 100644 --- a/kafka-connect/build.gradle +++ b/kafka-connect/build.gradle @@ -71,7 +71,7 @@ project(':iceberg-kafka-connect:iceberg-kafka-connect-runtime') { resolutionStrategy { force 'org.codehaus.jettison:jettison:1.5.4' force 'org.xerial.snappy:snappy-java:1.1.10.5' - force 'org.apache.commons:commons-compress:1.26.0' + force 'org.apache.commons:commons-compress:1.26.2' force 'org.apache.hadoop.thirdparty:hadoop-shaded-guava:1.2.0' } } From 9b70fdfd03d4a558d190495c2527d85386de7c94 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 5 Aug 2024 09:08:21 +0200 Subject: [PATCH 28/55] Build: Bump software.amazon.awssdk:bom from 2.26.25 to 2.26.29 (#10866) Bumps software.amazon.awssdk:bom from 2.26.25 to 2.26.29. --- updated-dependencies: - dependency-name: software.amazon.awssdk:bom dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- gradle/libs.versions.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index 16ea78901858..b8f20f3a2799 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -28,7 +28,7 @@ arrow = "15.0.2" avro = "1.11.3" assertj-core = "3.26.3" awaitility = "4.2.1" -awssdk-bom = "2.26.25" +awssdk-bom = "2.26.29" azuresdk-bom = "1.2.25" awssdk-s3accessgrants = "2.0.0" caffeine = "2.9.3" From 74a9adbc0b6c0f8bdb1dbee78c333d64fb52b41f Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 5 Aug 2024 09:08:35 +0200 Subject: [PATCH 29/55] Build: Bump mkdocs-material from 9.5.30 to 9.5.31 (#10863) Bumps [mkdocs-material](https://github.com/squidfunk/mkdocs-material) from 9.5.30 to 9.5.31. - [Release notes](https://github.com/squidfunk/mkdocs-material/releases) - [Changelog](https://github.com/squidfunk/mkdocs-material/blob/master/CHANGELOG) - [Commits](https://github.com/squidfunk/mkdocs-material/compare/9.5.30...9.5.31) --- updated-dependencies: - dependency-name: mkdocs-material dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- site/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/site/requirements.txt b/site/requirements.txt index 3fd9cf45e443..cb85511b7d96 100644 --- a/site/requirements.txt +++ b/site/requirements.txt @@ -17,7 +17,7 @@ mkdocs-awesome-pages-plugin==2.9.3 mkdocs-macros-plugin==1.0.5 -mkdocs-material==9.5.30 +mkdocs-material==9.5.31 mkdocs-material-extensions==1.3.1 mkdocs-monorepo-plugin @ git+https://github.com/bitsondatadev/mkdocs-monorepo-plugin@url-fix mkdocs-redirects==1.2.1 From 722a350afa8ef1d6fe3018b290fefb2a836020ec Mon Sep 17 00:00:00 2001 From: Robert Stupp Date: Mon, 5 Aug 2024 09:08:58 +0200 Subject: [PATCH 30/55] Build: Fix Scala compilation (#10860) `ScalaCompile` does not respect `options.release` and `-release:11` in the aruments is not enough. Re-adding `sourceCompatibility` + `targetCompatibility` for 11. 
--- build.gradle | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/build.gradle b/build.gradle index 74029f3db2eb..7a11943cf8be 100644 --- a/build.gradle +++ b/build.gradle @@ -237,6 +237,10 @@ subprojects { plugins.withType(ScalaPlugin.class) { tasks.withType(ScalaCompile.class) { scalaCompileOptions.keepAliveMode.set(KeepAliveMode.DAEMON) + // `options.release` doesn't seem to work for ScalaCompile :( + sourceCompatibility = "11" + targetCompatibility = "11" + scalaCompileOptions.additionalParameters.add("-release:11") } } } From 87537f995450a098a87b25d72124781b4da99a47 Mon Sep 17 00:00:00 2001 From: Piotr Findeisen Date: Mon, 5 Aug 2024 09:17:42 +0200 Subject: [PATCH 31/55] Build: Enable FormatStringAnnotation error-prone check (#10856) --- baseline.gradle | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/baseline.gradle b/baseline.gradle index e3fe602e91c9..42be6d8a24c6 100644 --- a/baseline.gradle +++ b/baseline.gradle @@ -98,8 +98,7 @@ subprojects { '-Xep:EqualsGetClass:OFF', // specific to Palantir '-Xep:FinalClass:OFF', - // TODO (https://github.com/apache/iceberg/issues/10854) this is a recently added check. Figure out whether we adjust the code or suppress for good - '-Xep:FormatStringAnnotation:WARN', + '-Xep:FormatStringAnnotation:ERROR', // TODO (https://github.com/apache/iceberg/issues/10855) this is a recently added check. Figure out whether we adjust the code or suppress for good '-Xep:ImmutablesReferenceEquality:WARN', '-Xep:IntLongMath:ERROR', From 5fc1413a5efc4419ccc081f3031325f107ccddab Mon Sep 17 00:00:00 2001 From: Eduard Tudenhoefner Date: Mon, 5 Aug 2024 14:35:07 +0200 Subject: [PATCH 32/55] Core: Use encoding/decoding methods for namespaces and deprecate Splitter/Joiner (#10858) --- .../apache/iceberg/rest/RESTSessionCatalog.java | 2 +- .../java/org/apache/iceberg/rest/RESTUtil.java | 16 +++++++++++++--- .../apache/iceberg/rest/RESTCatalogAdapter.java | 6 +----- 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/core/src/main/java/org/apache/iceberg/rest/RESTSessionCatalog.java b/core/src/main/java/org/apache/iceberg/rest/RESTSessionCatalog.java index a72d3958c140..1c607e3b0220 100644 --- a/core/src/main/java/org/apache/iceberg/rest/RESTSessionCatalog.java +++ b/core/src/main/java/org/apache/iceberg/rest/RESTSessionCatalog.java @@ -547,7 +547,7 @@ public void createNamespace( public List listNamespaces(SessionContext context, Namespace namespace) { Map queryParams = Maps.newHashMap(); if (!namespace.isEmpty()) { - queryParams.put("parent", RESTUtil.NAMESPACE_JOINER.join(namespace.levels())); + queryParams.put("parent", RESTUtil.encodeNamespace(namespace)); } ImmutableList.Builder namespaces = ImmutableList.builder(); diff --git a/core/src/main/java/org/apache/iceberg/rest/RESTUtil.java b/core/src/main/java/org/apache/iceberg/rest/RESTUtil.java index fab01162cad7..45422b8ae8b5 100644 --- a/core/src/main/java/org/apache/iceberg/rest/RESTUtil.java +++ b/core/src/main/java/org/apache/iceberg/rest/RESTUtil.java @@ -33,14 +33,24 @@ import org.apache.iceberg.relocated.com.google.common.collect.Maps; public class RESTUtil { - private static final char NAMESPACE_SEPARATOR = '\u001f'; - public static final Joiner NAMESPACE_JOINER = Joiner.on(NAMESPACE_SEPARATOR); - public static final Splitter NAMESPACE_SPLITTER = Splitter.on(NAMESPACE_SEPARATOR); private static final String NAMESPACE_ESCAPED_SEPARATOR = "%1F"; private static final Joiner NAMESPACE_ESCAPED_JOINER = Joiner.on(NAMESPACE_ESCAPED_SEPARATOR); private static final Splitter 
NAMESPACE_ESCAPED_SPLITTER = Splitter.on(NAMESPACE_ESCAPED_SEPARATOR); + /** + * @deprecated since 1.7.0, will be made private in 1.8.0; use {@link + * RESTUtil#encodeNamespace(Namespace)} instead. + */ + @Deprecated public static final Joiner NAMESPACE_JOINER = Joiner.on(NAMESPACE_ESCAPED_SEPARATOR); + + /** + * @deprecated since 1.7.0, will be made private in 1.8.0; use {@link + * RESTUtil#decodeNamespace(String)} instead. + */ + @Deprecated + public static final Splitter NAMESPACE_SPLITTER = Splitter.on(NAMESPACE_ESCAPED_SEPARATOR); + private RESTUtil() {} public static String stripTrailingSlash(String path) { diff --git a/core/src/test/java/org/apache/iceberg/rest/RESTCatalogAdapter.java b/core/src/test/java/org/apache/iceberg/rest/RESTCatalogAdapter.java index 53d57bee510f..2c928c06e52b 100644 --- a/core/src/test/java/org/apache/iceberg/rest/RESTCatalogAdapter.java +++ b/core/src/test/java/org/apache/iceberg/rest/RESTCatalogAdapter.java @@ -288,11 +288,7 @@ public T handleRequest( if (asNamespaceCatalog != null) { Namespace ns; if (vars.containsKey("parent")) { - ns = - Namespace.of( - RESTUtil.NAMESPACE_SPLITTER - .splitToStream(vars.get("parent")) - .toArray(String[]::new)); + ns = RESTUtil.decodeNamespace(vars.get("parent")); } else { ns = Namespace.empty(); } From 04c2533f1de5fdaf23b1dca8227a82d2b84b349d Mon Sep 17 00:00:00 2001 From: Eduard Tudenhoefner Date: Mon, 5 Aug 2024 20:40:26 +0200 Subject: [PATCH 33/55] Aliyun: Replace assert usage with assertThat (#10880) --- .../oss/mock/AliyunOSSMockLocalStore.java | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/aliyun/src/test/java/org/apache/iceberg/aliyun/oss/mock/AliyunOSSMockLocalStore.java b/aliyun/src/test/java/org/apache/iceberg/aliyun/oss/mock/AliyunOSSMockLocalStore.java index 499e61495fc0..f7a4b72e4b97 100644 --- a/aliyun/src/test/java/org/apache/iceberg/aliyun/oss/mock/AliyunOSSMockLocalStore.java +++ b/aliyun/src/test/java/org/apache/iceberg/aliyun/oss/mock/AliyunOSSMockLocalStore.java @@ -18,6 +18,8 @@ */ package org.apache.iceberg.aliyun.oss.mock; +import static org.assertj.core.api.Assertions.assertThat; + import com.aliyun.oss.OSSErrorCode; import com.aliyun.oss.model.Bucket; import com.fasterxml.jackson.databind.ObjectMapper; @@ -137,7 +139,9 @@ ObjectMetadata putObject( Map userMetaData) throws IOException { File bucketDir = new File(root, bucketName); - assert bucketDir.exists() || bucketDir.mkdirs(); + assertThat(bucketDir) + .satisfiesAnyOf( + bucket -> assertThat(bucket).exists(), bucket -> assertThat(bucket.mkdirs()).isTrue()); File dataFile = new File(bucketDir, fileName + DATA_FILE); File metaFile = new File(bucketDir, fileName + META_FILE); @@ -170,17 +174,21 @@ ObjectMetadata putObject( void deleteObject(String bucketName, String filename) { File bucketDir = new File(root, bucketName); - assert bucketDir.exists(); + assertThat(bucketDir).exists(); File dataFile = new File(bucketDir, filename + DATA_FILE); File metaFile = new File(bucketDir, filename + META_FILE); - assert !dataFile.exists() || dataFile.delete(); - assert !metaFile.exists() || metaFile.delete(); + assertThat(dataFile) + .satisfiesAnyOf( + file -> assertThat(file).doesNotExist(), file -> assertThat(file.delete()).isTrue()); + assertThat(metaFile) + .satisfiesAnyOf( + file -> assertThat(file).doesNotExist(), file -> assertThat(file.delete()).isTrue()); } ObjectMetadata getObjectMetadata(String bucketName, String filename) throws IOException { File bucketDir = new File(root, bucketName); - assert 
bucketDir.exists(); + assertThat(bucketDir).exists(); File dataFile = new File(bucketDir, filename + DATA_FILE); if (!dataFile.exists()) { From b531e97f66ef2bf80f3167152e268be0ce25f459 Mon Sep 17 00:00:00 2001 From: Denys Kuzmenko Date: Mon, 5 Aug 2024 22:43:34 +0200 Subject: [PATCH 34/55] Core: Extract filePath comparator into it's own class (#10664) --- .../org/apache/iceberg/types/Comparators.java | 41 +++++++++++++++++++ .../org/apache/iceberg/deletes/Deletes.java | 31 ++------------ 2 files changed, 45 insertions(+), 27 deletions(-) diff --git a/api/src/main/java/org/apache/iceberg/types/Comparators.java b/api/src/main/java/org/apache/iceberg/types/Comparators.java index d09d9f5395ce..a803afac104f 100644 --- a/api/src/main/java/org/apache/iceberg/types/Comparators.java +++ b/api/src/main/java/org/apache/iceberg/types/Comparators.java @@ -173,6 +173,10 @@ public static Comparator charSequences() { return CharSeqComparator.INSTANCE; } + public static Comparator filePath() { + return FilePathComparator.INSTANCE; + } + private static class NullsFirst implements Comparator { private static final NullsFirst INSTANCE = new NullsFirst<>(); @@ -351,4 +355,41 @@ public int compare(CharSequence s1, CharSequence s2) { return Integer.compare(s1.length(), s2.length()); } } + + private static class FilePathComparator implements Comparator { + private static final FilePathComparator INSTANCE = new FilePathComparator(); + + private FilePathComparator() {} + + @Override + public int compare(CharSequence s1, CharSequence s2) { + if (s1 == s2) { + return 0; + } + int count = s1.length(); + + int cmp = Integer.compare(count, s2.length()); + if (cmp != 0) { + return cmp; + } + + if (s1 instanceof String && s2 instanceof String) { + cmp = Integer.compare(s1.hashCode(), s2.hashCode()); + if (cmp != 0) { + return cmp; + } + } + // File paths inside a delete file normally have more identical chars at the beginning. For + // example, a typical + // path is like "s3:/bucket/db/table/data/partition/00000-0-[uuid]-00001.parquet". + // The uuid is where the difference starts. So it's faster to find the first diff backward. 
+ for (int i = count - 1; i >= 0; i--) { + cmp = Character.compare(s1.charAt(i), s2.charAt(i)); + if (cmp != 0) { + return cmp; + } + } + return 0; + } + } } diff --git a/core/src/main/java/org/apache/iceberg/deletes/Deletes.java b/core/src/main/java/org/apache/iceberg/deletes/Deletes.java index ff20ba53ff70..cef57cd16726 100644 --- a/core/src/main/java/org/apache/iceberg/deletes/Deletes.java +++ b/core/src/main/java/org/apache/iceberg/deletes/Deletes.java @@ -36,6 +36,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; import org.apache.iceberg.relocated.com.google.common.collect.Iterables; import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.types.Comparators; import org.apache.iceberg.types.Types; import org.apache.iceberg.util.CharSequenceMap; import org.apache.iceberg.util.Filter; @@ -398,33 +399,9 @@ private static class DataFileFilter extends Filter { @Override protected boolean shouldKeep(T posDelete) { - return charSeqEquals(dataLocation, (CharSequence) FILENAME_ACCESSOR.get(posDelete)); - } - - private boolean charSeqEquals(CharSequence s1, CharSequence s2) { - if (s1 == s2) { - return true; - } - - int count = s1.length(); - if (count != s2.length()) { - return false; - } - - if (s1 instanceof String && s2 instanceof String && s1.hashCode() != s2.hashCode()) { - return false; - } - - // File paths inside a delete file normally have more identical chars at the beginning. For - // example, a typical - // path is like "s3:/bucket/db/table/data/partition/00000-0-[uuid]-00001.parquet". - // The uuid is where the difference starts. So it's faster to find the first diff backward. - for (int i = count - 1; i >= 0; i--) { - if (s1.charAt(i) != s2.charAt(i)) { - return false; - } - } - return true; + return Comparators.filePath() + .compare(dataLocation, (CharSequence) FILENAME_ACCESSOR.get(posDelete)) + == 0; } } } From 3d364f6d95600be4e5320fc5931bb51b2af61de6 Mon Sep 17 00:00:00 2001 From: "k.nakagaki" <141020064+nakaken-churadata@users.noreply.github.com> Date: Tue, 6 Aug 2024 05:50:21 +0900 Subject: [PATCH 35/55] Docs: Fix SQL in branching docs (#10876) --- docs/docs/branching.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/docs/branching.md b/docs/docs/branching.md index 3379264d8a5f..f00defda665b 100644 --- a/docs/docs/branching.md +++ b/docs/docs/branching.md @@ -148,7 +148,7 @@ SELECT * FROM db.table.branch_test_branch; Modify the table's schema by dropping the `col` column and adding a new column named `new_col`: ```sql -ALTER TABLE db.table drop column float; +ALTER TABLE db.table drop column col; ALTER TABLE db.table add column new_col date; From e9364faabcc67eef6c61af2ecdf7bcf9a3fef602 Mon Sep 17 00:00:00 2001 From: Amogh Jahagirdar Date: Mon, 5 Aug 2024 14:36:20 -0700 Subject: [PATCH 36/55] API: Add SupportsRecoveryOperations mixin for FileIO (#10711) --- .../io/SupportsRecoveryOperations.java | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 api/src/main/java/org/apache/iceberg/io/SupportsRecoveryOperations.java diff --git a/api/src/main/java/org/apache/iceberg/io/SupportsRecoveryOperations.java b/api/src/main/java/org/apache/iceberg/io/SupportsRecoveryOperations.java new file mode 100644 index 000000000000..c402d2e68e7d --- /dev/null +++ b/api/src/main/java/org/apache/iceberg/io/SupportsRecoveryOperations.java @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.io; + +/** + * This interface is intended as an extension for FileIO implementations to provide additional + * best-effort recovery operations that can be useful for repairing corrupted tables where there are + * reachable files missing from disk. (e.g. a live manifest points to data file entry which no + * longer exists on disk) + */ +public interface SupportsRecoveryOperations { + + /** + * Perform a best-effort recovery of a file at a given path + * + * @param path Absolute path of file to attempt recovery for + * @return true if recovery was successful, false otherwise + */ + boolean recoverFile(String path); +} From 525d887811b2fd2140779e125243cb70742e169c Mon Sep 17 00:00:00 2001 From: emkornfield Date: Mon, 5 Aug 2024 18:06:36 -0700 Subject: [PATCH 37/55] Spec: Clarify identity partition edge cases (#10835) --- format/spec.md | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/format/spec.md b/format/spec.md index daef7538e730..c3321fa6991c 100644 --- a/format/spec.md +++ b/format/spec.md @@ -150,6 +150,10 @@ Readers should be more permissive because v1 metadata files are allowed in v2 ta Readers may be more strict for metadata JSON files because the JSON files are not reused and will always match the table version. Required v2 fields that were not present in v1 or optional in v1 may be handled as required fields. For example, a v2 table that is missing `last-sequence-number` can throw an exception. +##### Writing data files + +All columns must be written to data files even if they introduce redundancy with metadata stored in manifest files (e.g. columns with identity partition transforms). Writing all columns provides a backup in case of corruption or bugs in the metadata layer. + ### Schemas and Data Types A table's **schema** is a list of named columns. All data types are either primitives or nested types, which are maps, lists, or structs. A table schema is also a struct type. @@ -241,7 +245,14 @@ Struct evolution requires the following rules for default values: #### Column Projection -Columns in Iceberg data files are selected by field id. The table schema's column names and order may change after a data file is written, and projection must be done using field ids. If a field id is missing from a data file, its value for each row should be `null`. +Columns in Iceberg data files are selected by field id. The table schema's column names and order may change after a data file is written, and projection must be done using field ids. 
+ +Values for field ids which are not present in a data file must be resolved according to the following rules: + +* Return the value from partition metadata if an [Identity Transform](#partition-transforms) exists for the field and the partition value is present in the `partition` struct on the `data_file` object in the manifest. This allows for metadata-only migrations of Hive tables. +* Use `schema.name-mapping.default` metadata to map field ids to columns without field ids as described below and use the column if it is present. +* Return the default value if it has a defined `initial-default` (see the [Default values](#default-values) section for more details). +* Return `null` in all other cases. For example, a file may be written with schema `1: a int, 2: b string, 3: c double` and read using projection schema `3: measurement, 2: name, 4: a`. This must select file columns `c` (renamed to `measurement`), `b` (now called `name`), and a column of `null` values called `a`; in that order. From 6ee6d1327d3811dbd5795c4e87efdc41b7a58eaa Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 6 Aug 2024 11:16:19 +0200 Subject: [PATCH 38/55] Build: Bump org.testcontainers:testcontainers from 1.20.0 to 1.20.1 (#10865) Bumps [org.testcontainers:testcontainers](https://github.com/testcontainers/testcontainers-java) from 1.20.0 to 1.20.1. - [Release notes](https://github.com/testcontainers/testcontainers-java/releases) - [Changelog](https://github.com/testcontainers/testcontainers-java/blob/main/CHANGELOG.md) - [Commits](https://github.com/testcontainers/testcontainers-java/compare/1.20.0...1.20.1) --- updated-dependencies: - dependency-name: org.testcontainers:testcontainers dependency-type: direct:production update-type: version-update:semver-patch ...
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- gradle/libs.versions.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index b8f20f3a2799..12caeda95407 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -85,7 +85,7 @@ spark-hive35 = "3.5.1" spring-boot = "2.7.18" spring-web = "5.3.37" sqlite-jdbc = "3.46.0.1" -testcontainers = "1.20.0" +testcontainers = "1.20.1" tez010 = "0.10.3" tez08 = { strictly = "0.8.4"} # see rich version usage explanation above From 93f7839fa13d1deb40dc1e208d778cf07620d37f Mon Sep 17 00:00:00 2001 From: Steven Wu Date: Mon, 5 Aug 2024 08:57:16 -0700 Subject: [PATCH 39/55] Flink: move v1.19 to v.120 --- flink/{v1.19 => v1.20}/build.gradle | 0 flink/{v1.19 => v1.20}/flink-runtime/LICENSE | 0 flink/{v1.19 => v1.20}/flink-runtime/NOTICE | 0 .../java/org/apache/iceberg/flink/IcebergConnectorSmokeTest.java | 0 .../iceberg/flink/sink/shuffle/MapRangePartitionerBenchmark.java | 0 .../src/main/java/org/apache/iceberg/flink/CatalogLoader.java | 0 .../src/main/java/org/apache/iceberg/flink/FlinkCatalog.java | 0 .../main/java/org/apache/iceberg/flink/FlinkCatalogFactory.java | 0 .../src/main/java/org/apache/iceberg/flink/FlinkConfParser.java | 0 .../main/java/org/apache/iceberg/flink/FlinkConfigOptions.java | 0 .../java/org/apache/iceberg/flink/FlinkDynamicTableFactory.java | 0 .../java/org/apache/iceberg/flink/FlinkEnvironmentContext.java | 0 .../src/main/java/org/apache/iceberg/flink/FlinkFilters.java | 0 .../src/main/java/org/apache/iceberg/flink/FlinkFixupTypes.java | 0 .../src/main/java/org/apache/iceberg/flink/FlinkReadConf.java | 0 .../src/main/java/org/apache/iceberg/flink/FlinkReadOptions.java | 0 .../src/main/java/org/apache/iceberg/flink/FlinkSchemaUtil.java | 0 .../src/main/java/org/apache/iceberg/flink/FlinkSourceFilter.java | 0 .../src/main/java/org/apache/iceberg/flink/FlinkTypeToType.java | 0 .../src/main/java/org/apache/iceberg/flink/FlinkTypeVisitor.java | 0 .../src/main/java/org/apache/iceberg/flink/FlinkWriteConf.java | 0 .../src/main/java/org/apache/iceberg/flink/FlinkWriteOptions.java | 0 .../src/main/java/org/apache/iceberg/flink/IcebergTableSink.java | 0 .../src/main/java/org/apache/iceberg/flink/RowDataWrapper.java | 0 .../flink/src/main/java/org/apache/iceberg/flink/TableLoader.java | 0 .../src/main/java/org/apache/iceberg/flink/TypeToFlinkType.java | 0 .../src/main/java/org/apache/iceberg/flink/actions/Actions.java | 0 .../org/apache/iceberg/flink/actions/RewriteDataFilesAction.java | 0 .../org/apache/iceberg/flink/data/AvroWithFlinkSchemaVisitor.java | 0 .../main/java/org/apache/iceberg/flink/data/FlinkAvroReader.java | 0 .../main/java/org/apache/iceberg/flink/data/FlinkAvroWriter.java | 0 .../main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java | 0 .../main/java/org/apache/iceberg/flink/data/FlinkOrcReaders.java | 0 .../main/java/org/apache/iceberg/flink/data/FlinkOrcWriter.java | 0 .../main/java/org/apache/iceberg/flink/data/FlinkOrcWriters.java | 0 .../java/org/apache/iceberg/flink/data/FlinkParquetReaders.java | 0 .../java/org/apache/iceberg/flink/data/FlinkParquetWriters.java | 0 .../java/org/apache/iceberg/flink/data/FlinkSchemaVisitor.java | 0 .../java/org/apache/iceberg/flink/data/FlinkValueReaders.java | 0 .../java/org/apache/iceberg/flink/data/FlinkValueWriters.java | 0 .../apache/iceberg/flink/data/ParquetWithFlinkSchemaVisitor.java | 0 
.../java/org/apache/iceberg/flink/data/RowDataProjection.java | 0 .../src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java | 0 .../main/java/org/apache/iceberg/flink/data/StructRowData.java | 0 .../apache/iceberg/flink/maintenance/operator/MonitorSource.java | 0 .../flink/maintenance/operator/SingleThreadedIteratorSource.java | 0 .../apache/iceberg/flink/maintenance/operator/TableChange.java | 0 .../iceberg/flink/sink/AvroGenericRecordToRowDataMapper.java | 0 .../java/org/apache/iceberg/flink/sink/BaseDeltaTaskWriter.java | 0 .../org/apache/iceberg/flink/sink/BucketPartitionKeySelector.java | 0 .../java/org/apache/iceberg/flink/sink/BucketPartitioner.java | 0 .../java/org/apache/iceberg/flink/sink/BucketPartitionerUtil.java | 0 .../java/org/apache/iceberg/flink/sink/CachingTableSupplier.java | 0 .../main/java/org/apache/iceberg/flink/sink/CommitSummary.java | 0 .../main/java/org/apache/iceberg/flink/sink/DeltaManifests.java | 0 .../org/apache/iceberg/flink/sink/DeltaManifestsSerializer.java | 0 .../org/apache/iceberg/flink/sink/EqualityFieldKeySelector.java | 0 .../java/org/apache/iceberg/flink/sink/FlinkAppenderFactory.java | 0 .../org/apache/iceberg/flink/sink/FlinkFileWriterFactory.java | 0 .../java/org/apache/iceberg/flink/sink/FlinkManifestUtil.java | 0 .../src/main/java/org/apache/iceberg/flink/sink/FlinkSink.java | 0 .../java/org/apache/iceberg/flink/sink/IcebergFilesCommitter.java | 0 .../apache/iceberg/flink/sink/IcebergFilesCommitterMetrics.java | 0 .../java/org/apache/iceberg/flink/sink/IcebergStreamWriter.java | 0 .../org/apache/iceberg/flink/sink/IcebergStreamWriterMetrics.java | 0 .../org/apache/iceberg/flink/sink/ManifestOutputFileFactory.java | 0 .../java/org/apache/iceberg/flink/sink/PartitionKeySelector.java | 0 .../org/apache/iceberg/flink/sink/PartitionedDeltaWriter.java | 0 .../org/apache/iceberg/flink/sink/RowDataTaskWriterFactory.java | 0 .../java/org/apache/iceberg/flink/sink/TaskWriterFactory.java | 0 .../org/apache/iceberg/flink/sink/UnpartitionedDeltaWriter.java | 0 .../iceberg/flink/sink/shuffle/AggregatedStatisticsTracker.java | 0 .../apache/iceberg/flink/sink/shuffle/CompletedStatistics.java | 0 .../iceberg/flink/sink/shuffle/CompletedStatisticsSerializer.java | 0 .../org/apache/iceberg/flink/sink/shuffle/DataStatistics.java | 0 .../iceberg/flink/sink/shuffle/DataStatisticsCoordinator.java | 0 .../flink/sink/shuffle/DataStatisticsCoordinatorProvider.java | 0 .../apache/iceberg/flink/sink/shuffle/DataStatisticsOperator.java | 0 .../iceberg/flink/sink/shuffle/DataStatisticsSerializer.java | 0 .../org/apache/iceberg/flink/sink/shuffle/GlobalStatistics.java | 0 .../iceberg/flink/sink/shuffle/GlobalStatisticsSerializer.java | 0 .../java/org/apache/iceberg/flink/sink/shuffle/KeyAssignment.java | 0 .../java/org/apache/iceberg/flink/sink/shuffle/MapAssignment.java | 0 .../org/apache/iceberg/flink/sink/shuffle/MapDataStatistics.java | 0 .../apache/iceberg/flink/sink/shuffle/MapRangePartitioner.java | 0 .../org/apache/iceberg/flink/sink/shuffle/RangePartitioner.java | 0 .../iceberg/flink/sink/shuffle/RequestGlobalStatisticsEvent.java | 0 .../apache/iceberg/flink/sink/shuffle/SketchDataStatistics.java | 0 .../apache/iceberg/flink/sink/shuffle/SketchRangePartitioner.java | 0 .../java/org/apache/iceberg/flink/sink/shuffle/SketchUtil.java | 0 .../org/apache/iceberg/flink/sink/shuffle/SortKeySerializer.java | 0 .../iceberg/flink/sink/shuffle/SortKeySketchSerializer.java | 0 .../java/org/apache/iceberg/flink/sink/shuffle/SortKeyUtil.java | 0 
.../org/apache/iceberg/flink/sink/shuffle/StatisticsEvent.java | 0 .../org/apache/iceberg/flink/sink/shuffle/StatisticsOrRecord.java | 0 .../iceberg/flink/sink/shuffle/StatisticsOrRecordSerializer.java | 0 .../org/apache/iceberg/flink/sink/shuffle/StatisticsType.java | 0 .../org/apache/iceberg/flink/sink/shuffle/StatisticsUtil.java | 0 .../iceberg/flink/source/AvroGenericRecordFileScanTaskReader.java | 0 .../main/java/org/apache/iceberg/flink/source/DataIterator.java | 0 .../main/java/org/apache/iceberg/flink/source/DataTaskReader.java | 0 .../java/org/apache/iceberg/flink/source/FileScanTaskReader.java | 0 .../java/org/apache/iceberg/flink/source/FlinkInputFormat.java | 0 .../java/org/apache/iceberg/flink/source/FlinkInputSplit.java | 0 .../main/java/org/apache/iceberg/flink/source/FlinkSource.java | 0 .../java/org/apache/iceberg/flink/source/FlinkSplitPlanner.java | 0 .../main/java/org/apache/iceberg/flink/source/IcebergSource.java | 0 .../java/org/apache/iceberg/flink/source/IcebergTableSource.java | 0 .../apache/iceberg/flink/source/RowDataFileScanTaskReader.java | 0 .../java/org/apache/iceberg/flink/source/RowDataRewriter.java | 0 .../iceberg/flink/source/RowDataToAvroGenericRecordConverter.java | 0 .../main/java/org/apache/iceberg/flink/source/ScanContext.java | 0 .../src/main/java/org/apache/iceberg/flink/source/SourceUtil.java | 0 .../org/apache/iceberg/flink/source/StreamingMonitorFunction.java | 0 .../org/apache/iceberg/flink/source/StreamingReaderOperator.java | 0 .../apache/iceberg/flink/source/StreamingStartingStrategy.java | 0 .../iceberg/flink/source/assigner/DefaultSplitAssigner.java | 0 .../org/apache/iceberg/flink/source/assigner/GetSplitResult.java | 0 .../flink/source/assigner/OrderedSplitAssignerFactory.java | 0 .../iceberg/flink/source/assigner/SimpleSplitAssignerFactory.java | 0 .../org/apache/iceberg/flink/source/assigner/SplitAssigner.java | 0 .../iceberg/flink/source/assigner/SplitAssignerFactory.java | 0 .../apache/iceberg/flink/source/assigner/SplitAssignerType.java | 0 .../flink/source/enumerator/AbstractIcebergEnumerator.java | 0 .../flink/source/enumerator/ContinuousEnumerationResult.java | 0 .../flink/source/enumerator/ContinuousIcebergEnumerator.java | 0 .../iceberg/flink/source/enumerator/ContinuousSplitPlanner.java | 0 .../flink/source/enumerator/ContinuousSplitPlannerImpl.java | 0 .../iceberg/flink/source/enumerator/EnumerationHistory.java | 0 .../flink/source/enumerator/IcebergEnumeratorPosition.java | 0 .../source/enumerator/IcebergEnumeratorPositionSerializer.java | 0 .../iceberg/flink/source/enumerator/IcebergEnumeratorState.java | 0 .../flink/source/enumerator/IcebergEnumeratorStateSerializer.java | 0 .../iceberg/flink/source/enumerator/StaticIcebergEnumerator.java | 0 .../org/apache/iceberg/flink/source/reader/ArrayBatchRecords.java | 0 .../iceberg/flink/source/reader/ArrayPoolDataIteratorBatcher.java | 0 .../flink/source/reader/AvroGenericRecordReaderFunction.java | 0 .../flink/source/reader/ColumnStatsWatermarkExtractor.java | 0 .../apache/iceberg/flink/source/reader/DataIteratorBatcher.java | 0 .../iceberg/flink/source/reader/DataIteratorReaderFunction.java | 0 .../apache/iceberg/flink/source/reader/IcebergSourceReader.java | 0 .../iceberg/flink/source/reader/IcebergSourceReaderMetrics.java | 0 .../iceberg/flink/source/reader/IcebergSourceSplitReader.java | 0 .../apache/iceberg/flink/source/reader/LimitableDataIterator.java | 0 .../org/apache/iceberg/flink/source/reader/ListBatchRecords.java | 0 
.../iceberg/flink/source/reader/ListDataIteratorBatcher.java | 0 .../iceberg/flink/source/reader/MetaDataReaderFunction.java | 0 .../org/apache/iceberg/flink/source/reader/ReaderFunction.java | 0 .../org/apache/iceberg/flink/source/reader/RecordAndPosition.java | 0 .../org/apache/iceberg/flink/source/reader/RecordFactory.java | 0 .../org/apache/iceberg/flink/source/reader/RecordLimiter.java | 0 .../apache/iceberg/flink/source/reader/RowDataReaderFunction.java | 0 .../apache/iceberg/flink/source/reader/RowDataRecordFactory.java | 0 .../iceberg/flink/source/reader/SerializableRecordEmitter.java | 0 .../iceberg/flink/source/reader/SplitWatermarkExtractor.java | 0 .../flink/source/reader/WatermarkExtractorRecordEmitter.java | 0 .../org/apache/iceberg/flink/source/split/IcebergSourceSplit.java | 0 .../iceberg/flink/source/split/IcebergSourceSplitSerializer.java | 0 .../iceberg/flink/source/split/IcebergSourceSplitState.java | 0 .../iceberg/flink/source/split/IcebergSourceSplitStatus.java | 0 .../apache/iceberg/flink/source/split/SerializableComparator.java | 0 .../org/apache/iceberg/flink/source/split/SerializerHelper.java | 0 .../org/apache/iceberg/flink/source/split/SplitComparators.java | 0 .../org/apache/iceberg/flink/source/split/SplitRequestEvent.java | 0 .../main/java/org/apache/iceberg/flink/util/ElapsedTimeGauge.java | 0 .../java/org/apache/iceberg/flink/util/FlinkAlterTableUtil.java | 0 .../org/apache/iceberg/flink/util/FlinkCompatibilityUtil.java | 0 .../src/main/java/org/apache/iceberg/flink/util/FlinkPackage.java | 0 .../META-INF/services/org.apache.flink.table.factories.Factory | 0 .../services/org.apache.flink.table.factories.TableFactory | 0 .../org/apache/iceberg/flink/AvroGenericRecordConverterBase.java | 0 .../src/test/java/org/apache/iceberg/flink/CatalogTestBase.java | 0 .../src/test/java/org/apache/iceberg/flink/DataGenerator.java | 0 .../src/test/java/org/apache/iceberg/flink/DataGenerators.java | 0 .../java/org/apache/iceberg/flink/HadoopCatalogExtension.java | 0 .../test/java/org/apache/iceberg/flink/HadoopTableExtension.java | 0 .../java/org/apache/iceberg/flink/MiniFlinkClusterExtension.java | 0 .../src/test/java/org/apache/iceberg/flink/RowDataConverter.java | 0 .../src/test/java/org/apache/iceberg/flink/SimpleDataUtil.java | 0 .../flink/src/test/java/org/apache/iceberg/flink/SqlBase.java | 0 .../flink/src/test/java/org/apache/iceberg/flink/TestBase.java | 0 .../src/test/java/org/apache/iceberg/flink/TestCatalogLoader.java | 0 .../java/org/apache/iceberg/flink/TestCatalogTableLoader.java | 0 .../test/java/org/apache/iceberg/flink/TestChangeLogTable.java | 0 .../java/org/apache/iceberg/flink/TestDataFileSerialization.java | 0 .../src/test/java/org/apache/iceberg/flink/TestFixtures.java | 0 .../java/org/apache/iceberg/flink/TestFlinkAnonymousTable.java | 0 .../java/org/apache/iceberg/flink/TestFlinkCatalogDatabase.java | 0 .../java/org/apache/iceberg/flink/TestFlinkCatalogFactory.java | 0 .../test/java/org/apache/iceberg/flink/TestFlinkCatalogTable.java | 0 .../org/apache/iceberg/flink/TestFlinkCatalogTablePartitions.java | 0 .../test/java/org/apache/iceberg/flink/TestFlinkConfParser.java | 0 .../src/test/java/org/apache/iceberg/flink/TestFlinkFilters.java | 0 .../test/java/org/apache/iceberg/flink/TestFlinkHiveCatalog.java | 0 .../test/java/org/apache/iceberg/flink/TestFlinkSchemaUtil.java | 0 .../test/java/org/apache/iceberg/flink/TestFlinkTableSink.java | 0 .../java/org/apache/iceberg/flink/TestFlinkTableSinkExtended.java | 0 
.../src/test/java/org/apache/iceberg/flink/TestFlinkUpsert.java | 0 .../flink/src/test/java/org/apache/iceberg/flink/TestHelpers.java | 0 .../test/java/org/apache/iceberg/flink/TestIcebergConnector.java | 0 .../org/apache/iceberg/flink/TestManifestFileSerialization.java | 0 .../test/java/org/apache/iceberg/flink/TestRowDataWrapper.java | 0 .../src/test/java/org/apache/iceberg/flink/TestTableLoader.java | 0 .../java/org/apache/iceberg/flink/TestTableSerialization.java | 0 .../apache/iceberg/flink/actions/TestRewriteDataFilesAction.java | 0 .../test/java/org/apache/iceberg/flink/data/RandomRowData.java | 0 .../java/org/apache/iceberg/flink/data/RowDataToRowMapper.java | 0 .../org/apache/iceberg/flink/data/TestFlinkAvroReaderWriter.java | 0 .../org/apache/iceberg/flink/data/TestFlinkOrcReaderWriter.java | 0 .../org/apache/iceberg/flink/data/TestFlinkParquetReader.java | 0 .../org/apache/iceberg/flink/data/TestFlinkParquetWriter.java | 0 .../java/org/apache/iceberg/flink/data/TestRowDataProjection.java | 0 .../java/org/apache/iceberg/flink/data/TestRowProjection.java | 0 .../java/org/apache/iceberg/flink/data/TestStructRowData.java | 0 .../apache/iceberg/flink/maintenance/operator/CollectingSink.java | 0 .../iceberg/flink/maintenance/operator/FlinkSqlExtension.java | 0 .../flink/maintenance/operator/FlinkStreamingTestUtils.java | 0 .../apache/iceberg/flink/maintenance/operator/ManualSource.java | 0 .../iceberg/flink/maintenance/operator/OperatorTestBase.java | 0 .../iceberg/flink/maintenance/operator/TestMonitorSource.java | 0 .../iceberg/flink/sink/TestAvroGenericRecordToRowDataMapper.java | 0 .../apache/iceberg/flink/sink/TestBucketPartitionKeySelector.java | 0 .../java/org/apache/iceberg/flink/sink/TestBucketPartitioner.java | 0 .../iceberg/flink/sink/TestBucketPartitionerFlinkIcebergSink.java | 0 .../org/apache/iceberg/flink/sink/TestBucketPartitionerUtil.java | 0 .../org/apache/iceberg/flink/sink/TestCachingTableSupplier.java | 0 .../org/apache/iceberg/flink/sink/TestCompressionSettings.java | 0 .../java/org/apache/iceberg/flink/sink/TestDeltaTaskWriter.java | 0 .../org/apache/iceberg/flink/sink/TestFlinkAppenderFactory.java | 0 .../org/apache/iceberg/flink/sink/TestFlinkFileWriterFactory.java | 0 .../java/org/apache/iceberg/flink/sink/TestFlinkIcebergSink.java | 0 .../org/apache/iceberg/flink/sink/TestFlinkIcebergSinkBase.java | 0 .../org/apache/iceberg/flink/sink/TestFlinkIcebergSinkBranch.java | 0 .../iceberg/flink/sink/TestFlinkIcebergSinkDistributionMode.java | 0 .../apache/iceberg/flink/sink/TestFlinkIcebergSinkExtended.java | 0 .../org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2.java | 0 .../org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2Base.java | 0 .../apache/iceberg/flink/sink/TestFlinkIcebergSinkV2Branch.java | 0 .../java/org/apache/iceberg/flink/sink/TestFlinkManifest.java | 0 .../apache/iceberg/flink/sink/TestFlinkPartitioningWriters.java | 0 .../apache/iceberg/flink/sink/TestFlinkPositionDeltaWriters.java | 0 .../apache/iceberg/flink/sink/TestFlinkRollingFileWriters.java | 0 .../org/apache/iceberg/flink/sink/TestFlinkWriterMetrics.java | 0 .../org/apache/iceberg/flink/sink/TestIcebergFilesCommitter.java | 0 .../org/apache/iceberg/flink/sink/TestIcebergStreamWriter.java | 0 .../org/apache/iceberg/flink/sink/TestRowDataPartitionKey.java | 0 .../test/java/org/apache/iceberg/flink/sink/TestTaskWriters.java | 0 .../test/java/org/apache/iceberg/flink/sink/shuffle/Fixtures.java | 0 .../flink/sink/shuffle/TestAggregatedStatisticsTracker.java | 0 
.../flink/sink/shuffle/TestCompletedStatisticsSerializer.java | 0 .../iceberg/flink/sink/shuffle/TestDataStatisticsCoordinator.java | 0 .../flink/sink/shuffle/TestDataStatisticsCoordinatorProvider.java | 0 .../iceberg/flink/sink/shuffle/TestDataStatisticsOperator.java | 0 .../iceberg/flink/sink/shuffle/TestDataStatisticsSerializer.java | 0 .../flink/sink/shuffle/TestGlobalStatisticsSerializer.java | 0 .../apache/iceberg/flink/sink/shuffle/TestMapDataStatistics.java | 0 .../iceberg/flink/sink/shuffle/TestMapRangePartitioner.java | 0 .../apache/iceberg/flink/sink/shuffle/TestRangePartitioner.java | 0 .../iceberg/flink/sink/shuffle/TestSketchDataStatistics.java | 0 .../iceberg/flink/sink/shuffle/TestSketchRangePartitioner.java | 0 .../org/apache/iceberg/flink/sink/shuffle/TestSketchUtil.java | 0 .../iceberg/flink/sink/shuffle/TestSortKeySerializerBase.java | 0 .../flink/sink/shuffle/TestSortKeySerializerNestedStruct.java | 0 .../flink/sink/shuffle/TestSortKeySerializerPrimitives.java | 0 .../iceberg/flink/sink/shuffle/TestSortKeySerializerSnapshot.java | 0 .../org/apache/iceberg/flink/sink/shuffle/TestSortKeyUtil.java | 0 .../java/org/apache/iceberg/flink/source/BoundedTableFactory.java | 0 .../java/org/apache/iceberg/flink/source/BoundedTestSource.java | 0 .../org/apache/iceberg/flink/source/ChangeLogTableTestBase.java | 0 .../test/java/org/apache/iceberg/flink/source/SplitHelpers.java | 0 .../src/test/java/org/apache/iceberg/flink/source/SqlHelpers.java | 0 .../java/org/apache/iceberg/flink/source/TableSourceTestBase.java | 0 .../org/apache/iceberg/flink/source/TestBoundedTableFactory.java | 0 .../org/apache/iceberg/flink/source/TestFlinkInputFormat.java | 0 .../iceberg/flink/source/TestFlinkInputFormatReaderDeletes.java | 0 .../org/apache/iceberg/flink/source/TestFlinkMergingMetrics.java | 0 .../org/apache/iceberg/flink/source/TestFlinkMetaDataTable.java | 0 .../apache/iceberg/flink/source/TestFlinkReaderDeletesBase.java | 0 .../test/java/org/apache/iceberg/flink/source/TestFlinkScan.java | 0 .../java/org/apache/iceberg/flink/source/TestFlinkScanSql.java | 0 .../java/org/apache/iceberg/flink/source/TestFlinkSource.java | 0 .../org/apache/iceberg/flink/source/TestFlinkSourceConfig.java | 0 .../java/org/apache/iceberg/flink/source/TestFlinkSourceSql.java | 0 .../org/apache/iceberg/flink/source/TestFlinkTableSource.java | 0 .../org/apache/iceberg/flink/source/TestIcebergSourceBounded.java | 0 .../flink/source/TestIcebergSourceBoundedGenericRecord.java | 0 .../apache/iceberg/flink/source/TestIcebergSourceBoundedSql.java | 0 .../apache/iceberg/flink/source/TestIcebergSourceContinuous.java | 0 .../apache/iceberg/flink/source/TestIcebergSourceFailover.java | 0 .../source/TestIcebergSourceFailoverWithWatermarkExtractor.java | 0 .../iceberg/flink/source/TestIcebergSourceReaderDeletes.java | 0 .../org/apache/iceberg/flink/source/TestIcebergSourceSql.java | 0 .../flink/source/TestIcebergSourceWithWatermarkExtractor.java | 0 .../flink/source/TestIcebergSpeculativeExecutionSupport.java | 0 .../iceberg/flink/source/TestMetadataTableReadableMetrics.java | 0 .../org/apache/iceberg/flink/source/TestProjectMetaColumn.java | 0 .../flink/source/TestRowDataToAvroGenericRecordConverter.java | 0 .../java/org/apache/iceberg/flink/source/TestScanContext.java | 0 .../test/java/org/apache/iceberg/flink/source/TestSourceUtil.java | 0 .../test/java/org/apache/iceberg/flink/source/TestSqlBase.java | 0 .../java/org/apache/iceberg/flink/source/TestStreamScanSql.java | 0 
.../apache/iceberg/flink/source/TestStreamingMonitorFunction.java | 0 .../apache/iceberg/flink/source/TestStreamingReaderOperator.java | 0 .../iceberg/flink/source/assigner/SplitAssignerTestBase.java | 0 .../iceberg/flink/source/assigner/TestDefaultSplitAssigner.java | 0 .../source/assigner/TestFileSequenceNumberBasedSplitAssigner.java | 0 .../flink/source/assigner/TestWatermarkBasedSplitAssigner.java | 0 .../flink/source/enumerator/ManualContinuousSplitPlanner.java | 0 .../flink/source/enumerator/TestContinuousIcebergEnumerator.java | 0 .../flink/source/enumerator/TestContinuousSplitPlannerImpl.java | 0 .../enumerator/TestContinuousSplitPlannerImplStartStrategy.java | 0 .../iceberg/flink/source/enumerator/TestEnumerationHistory.java | 0 .../source/enumerator/TestIcebergEnumeratorStateSerializer.java | 0 .../iceberg/flink/source/reader/ReaderFunctionTestBase.java | 0 .../java/org/apache/iceberg/flink/source/reader/ReaderUtil.java | 0 .../apache/iceberg/flink/source/reader/TestArrayBatchRecords.java | 0 .../source/reader/TestArrayPoolDataIteratorBatcherRowData.java | 0 .../flink/source/reader/TestColumnStatsWatermarkExtractor.java | 0 .../iceberg/flink/source/reader/TestIcebergSourceReader.java | 0 .../iceberg/flink/source/reader/TestLimitableDataIterator.java | 0 .../iceberg/flink/source/reader/TestRowDataReaderFunction.java | 0 .../apache/iceberg/flink/source/reader/TestingMetricGroup.java | 0 .../flink/source/split/TestIcebergSourceSplitSerializer.java | 0 .../test/java/org/apache/iceberg/flink/util/TestFlinkPackage.java | 0 .../META-INF/services/org.apache.flink.table.factories.Factory | 0 325 files changed, 0 insertions(+), 0 deletions(-) rename flink/{v1.19 => v1.20}/build.gradle (100%) rename flink/{v1.19 => v1.20}/flink-runtime/LICENSE (100%) rename flink/{v1.19 => v1.20}/flink-runtime/NOTICE (100%) rename flink/{v1.19 => v1.20}/flink-runtime/src/integration/java/org/apache/iceberg/flink/IcebergConnectorSmokeTest.java (100%) rename flink/{v1.19 => v1.20}/flink/src/jmh/java/org/apache/iceberg/flink/sink/shuffle/MapRangePartitionerBenchmark.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/CatalogLoader.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalog.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalogFactory.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/FlinkConfParser.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/FlinkConfigOptions.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/FlinkDynamicTableFactory.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/FlinkEnvironmentContext.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/FlinkFilters.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/FlinkFixupTypes.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/FlinkReadConf.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/FlinkReadOptions.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/FlinkSchemaUtil.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/FlinkSourceFilter.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeToType.java (100%) 
rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeVisitor.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteConf.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteOptions.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/IcebergTableSink.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/RowDataWrapper.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/TableLoader.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/TypeToFlinkType.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/actions/Actions.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/actions/RewriteDataFilesAction.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/data/AvroWithFlinkSchemaVisitor.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/data/FlinkAvroReader.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/data/FlinkAvroWriter.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReaders.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriter.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriters.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetReaders.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetWriters.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/data/FlinkSchemaVisitor.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueReaders.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueWriters.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/data/ParquetWithFlinkSchemaVisitor.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/data/RowDataProjection.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/data/StructRowData.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/MonitorSource.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/SingleThreadedIteratorSource.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TableChange.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/sink/AvroGenericRecordToRowDataMapper.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/sink/BaseDeltaTaskWriter.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitionKeySelector.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitioner.java (100%) rename 
flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitionerUtil.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/sink/CachingTableSupplier.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/sink/CommitSummary.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifests.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifestsSerializer.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/sink/EqualityFieldKeySelector.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkAppenderFactory.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkFileWriterFactory.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkManifestUtil.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkSink.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitter.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitterMetrics.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriter.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriterMetrics.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/sink/ManifestOutputFileFactory.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionKeySelector.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionedDeltaWriter.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/sink/RowDataTaskWriterFactory.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/sink/TaskWriterFactory.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/sink/UnpartitionedDeltaWriter.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/AggregatedStatisticsTracker.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/CompletedStatistics.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/CompletedStatisticsSerializer.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatistics.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsCoordinator.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsCoordinatorProvider.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsOperator.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsSerializer.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/GlobalStatistics.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/GlobalStatisticsSerializer.java (100%) rename flink/{v1.19 => 
v1.20}/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/KeyAssignment.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapAssignment.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapDataStatistics.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapRangePartitioner.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/RangePartitioner.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/RequestGlobalStatisticsEvent.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchDataStatistics.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchRangePartitioner.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchUtil.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeySerializer.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeySketchSerializer.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeyUtil.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsEvent.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsOrRecord.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsOrRecordSerializer.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsType.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsUtil.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/source/AvroGenericRecordFileScanTaskReader.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/source/DataIterator.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/source/DataTaskReader.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/source/FileScanTaskReader.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputFormat.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputSplit.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSource.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSplitPlanner.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/source/IcebergSource.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/source/IcebergTableSource.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/source/RowDataFileScanTaskReader.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/source/RowDataRewriter.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/source/RowDataToAvroGenericRecordConverter.java (100%) rename flink/{v1.19 => 
v1.20}/flink/src/main/java/org/apache/iceberg/flink/source/ScanContext.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/source/SourceUtil.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/source/StreamingMonitorFunction.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/source/StreamingReaderOperator.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/source/StreamingStartingStrategy.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/source/assigner/DefaultSplitAssigner.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/source/assigner/GetSplitResult.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/source/assigner/OrderedSplitAssignerFactory.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SimpleSplitAssignerFactory.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssigner.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssignerFactory.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssignerType.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/AbstractIcebergEnumerator.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousEnumerationResult.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousIcebergEnumerator.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousSplitPlanner.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousSplitPlannerImpl.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/EnumerationHistory.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorPosition.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorPositionSerializer.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorState.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorStateSerializer.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/StaticIcebergEnumerator.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayBatchRecords.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayPoolDataIteratorBatcher.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/source/reader/AvroGenericRecordReaderFunction.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/source/reader/ColumnStatsWatermarkExtractor.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorBatcher.java (100%) rename flink/{v1.19 => 
v1.20}/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorReaderFunction.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceReader.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceReaderMetrics.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceSplitReader.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/source/reader/LimitableDataIterator.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/source/reader/ListBatchRecords.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/source/reader/ListDataIteratorBatcher.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/source/reader/MetaDataReaderFunction.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/source/reader/ReaderFunction.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordAndPosition.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordFactory.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordLimiter.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataReaderFunction.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataRecordFactory.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/source/reader/SerializableRecordEmitter.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/source/reader/SplitWatermarkExtractor.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/source/reader/WatermarkExtractorRecordEmitter.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplit.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitSerializer.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitState.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitStatus.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/source/split/SerializableComparator.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/source/split/SerializerHelper.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/source/split/SplitComparators.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/source/split/SplitRequestEvent.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/util/ElapsedTimeGauge.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/util/FlinkAlterTableUtil.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/util/FlinkCompatibilityUtil.java (100%) rename flink/{v1.19 => v1.20}/flink/src/main/java/org/apache/iceberg/flink/util/FlinkPackage.java (100%) rename flink/{v1.19 => 
v1.20}/flink/src/main/resources/META-INF/services/org.apache.flink.table.factories.Factory (100%) rename flink/{v1.19 => v1.20}/flink/src/main/resources/META-INF/services/org.apache.flink.table.factories.TableFactory (100%) rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/AvroGenericRecordConverterBase.java (100%) rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/CatalogTestBase.java (100%) rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/DataGenerator.java (100%) rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/DataGenerators.java (100%) rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/HadoopCatalogExtension.java (100%) rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/HadoopTableExtension.java (100%) rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/MiniFlinkClusterExtension.java (100%) rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/RowDataConverter.java (100%) rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/SimpleDataUtil.java (100%) rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/SqlBase.java (100%) rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/TestBase.java (100%) rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/TestCatalogLoader.java (100%) rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/TestCatalogTableLoader.java (100%) rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/TestChangeLogTable.java (100%) rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/TestDataFileSerialization.java (100%) rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/TestFixtures.java (100%) rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/TestFlinkAnonymousTable.java (100%) rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogDatabase.java (100%) rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogFactory.java (100%) rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTable.java (100%) rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTablePartitions.java (100%) rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/TestFlinkConfParser.java (100%) rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/TestFlinkFilters.java (100%) rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/TestFlinkHiveCatalog.java (100%) rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/TestFlinkSchemaUtil.java (100%) rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSink.java (100%) rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSinkExtended.java (100%) rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/TestFlinkUpsert.java (100%) rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/TestHelpers.java (100%) rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/TestIcebergConnector.java (100%) rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/TestManifestFileSerialization.java (100%) rename 
flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/TestRowDataWrapper.java (100%) rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/TestTableLoader.java (100%) rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/TestTableSerialization.java (100%) rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/actions/TestRewriteDataFilesAction.java (100%) rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/data/RandomRowData.java (100%) rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/data/RowDataToRowMapper.java (100%) rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkAvroReaderWriter.java (100%) rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkOrcReaderWriter.java (100%) rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetReader.java (100%) rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetWriter.java (100%) rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/data/TestRowDataProjection.java (100%) rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/data/TestRowProjection.java (100%) rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/data/TestStructRowData.java (100%) rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/CollectingSink.java (100%) rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/FlinkSqlExtension.java (100%) rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/FlinkStreamingTestUtils.java (100%) rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/ManualSource.java (100%) rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/OperatorTestBase.java (100%) rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestMonitorSource.java (100%) rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/sink/TestAvroGenericRecordToRowDataMapper.java (100%) rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionKeySelector.java (100%) rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitioner.java (100%) rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionerFlinkIcebergSink.java (100%) rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionerUtil.java (100%) rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/sink/TestCachingTableSupplier.java (100%) rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/sink/TestCompressionSettings.java (100%) rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/sink/TestDeltaTaskWriter.java (100%) rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkAppenderFactory.java (100%) rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkFileWriterFactory.java (100%) rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSink.java (100%) rename flink/{v1.19 => 
v1.20}/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkBase.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkBranch.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkDistributionMode.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkExtended.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2Base.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2Branch.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkManifest.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPartitioningWriters.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPositionDeltaWriters.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkRollingFileWriters.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkWriterMetrics.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergFilesCommitter.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergStreamWriter.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/sink/TestRowDataPartitionKey.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/sink/TestTaskWriters.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/Fixtures.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestAggregatedStatisticsTracker.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestCompletedStatisticsSerializer.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsCoordinator.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsCoordinatorProvider.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsOperator.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsSerializer.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestGlobalStatisticsSerializer.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestMapDataStatistics.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestMapRangePartitioner.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestRangePartitioner.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchDataStatistics.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchRangePartitioner.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchUtil.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerBase.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerNestedStruct.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerPrimitives.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerSnapshot.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeyUtil.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTableFactory.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTestSource.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/source/ChangeLogTableTestBase.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/source/SplitHelpers.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/source/SqlHelpers.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/source/TableSourceTestBase.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/source/TestBoundedTableFactory.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormat.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormatReaderDeletes.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkMergingMetrics.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkMetaDataTable.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkReaderDeletesBase.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScan.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScanSql.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSource.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSourceConfig.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSourceSql.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkTableSource.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBounded.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedGenericRecord.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedSql.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceContinuous.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceFailover.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceFailoverWithWatermarkExtractor.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceReaderDeletes.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceSql.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceWithWatermarkExtractor.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSpeculativeExecutionSupport.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/source/TestMetadataTableReadableMetrics.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/source/TestProjectMetaColumn.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/source/TestRowDataToAvroGenericRecordConverter.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/source/TestScanContext.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/source/TestSourceUtil.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/source/TestSqlBase.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamScanSql.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingMonitorFunction.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingReaderOperator.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/source/assigner/SplitAssignerTestBase.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestDefaultSplitAssigner.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestFileSequenceNumberBasedSplitAssigner.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestWatermarkBasedSplitAssigner.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/ManualContinuousSplitPlanner.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousIcebergEnumerator.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousSplitPlannerImpl.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousSplitPlannerImplStartStrategy.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestEnumerationHistory.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestIcebergEnumeratorStateSerializer.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderFunctionTestBase.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderUtil.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestArrayBatchRecords.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestArrayPoolDataIteratorBatcherRowData.java (100%)
 rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestColumnStatsWatermarkExtractor.java (100%)
rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestIcebergSourceReader.java (100%) rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestLimitableDataIterator.java (100%) rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestRowDataReaderFunction.java (100%) rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestingMetricGroup.java (100%) rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/source/split/TestIcebergSourceSplitSerializer.java (100%) rename flink/{v1.19 => v1.20}/flink/src/test/java/org/apache/iceberg/flink/util/TestFlinkPackage.java (100%) rename flink/{v1.19 => v1.20}/flink/src/test/resources/META-INF/services/org.apache.flink.table.factories.Factory (100%) diff --git a/flink/v1.19/build.gradle b/flink/v1.20/build.gradle similarity index 100% rename from flink/v1.19/build.gradle rename to flink/v1.20/build.gradle diff --git a/flink/v1.19/flink-runtime/LICENSE b/flink/v1.20/flink-runtime/LICENSE similarity index 100% rename from flink/v1.19/flink-runtime/LICENSE rename to flink/v1.20/flink-runtime/LICENSE diff --git a/flink/v1.19/flink-runtime/NOTICE b/flink/v1.20/flink-runtime/NOTICE similarity index 100% rename from flink/v1.19/flink-runtime/NOTICE rename to flink/v1.20/flink-runtime/NOTICE diff --git a/flink/v1.19/flink-runtime/src/integration/java/org/apache/iceberg/flink/IcebergConnectorSmokeTest.java b/flink/v1.20/flink-runtime/src/integration/java/org/apache/iceberg/flink/IcebergConnectorSmokeTest.java similarity index 100% rename from flink/v1.19/flink-runtime/src/integration/java/org/apache/iceberg/flink/IcebergConnectorSmokeTest.java rename to flink/v1.20/flink-runtime/src/integration/java/org/apache/iceberg/flink/IcebergConnectorSmokeTest.java diff --git a/flink/v1.19/flink/src/jmh/java/org/apache/iceberg/flink/sink/shuffle/MapRangePartitionerBenchmark.java b/flink/v1.20/flink/src/jmh/java/org/apache/iceberg/flink/sink/shuffle/MapRangePartitionerBenchmark.java similarity index 100% rename from flink/v1.19/flink/src/jmh/java/org/apache/iceberg/flink/sink/shuffle/MapRangePartitionerBenchmark.java rename to flink/v1.20/flink/src/jmh/java/org/apache/iceberg/flink/sink/shuffle/MapRangePartitionerBenchmark.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/CatalogLoader.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/CatalogLoader.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/CatalogLoader.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/CatalogLoader.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalog.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalog.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalog.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalog.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalogFactory.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalogFactory.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalogFactory.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalogFactory.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkConfParser.java 
b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/FlinkConfParser.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkConfParser.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/FlinkConfParser.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkConfigOptions.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/FlinkConfigOptions.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkConfigOptions.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/FlinkConfigOptions.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkDynamicTableFactory.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/FlinkDynamicTableFactory.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkDynamicTableFactory.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/FlinkDynamicTableFactory.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkEnvironmentContext.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/FlinkEnvironmentContext.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkEnvironmentContext.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/FlinkEnvironmentContext.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkFilters.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/FlinkFilters.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkFilters.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/FlinkFilters.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkFixupTypes.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/FlinkFixupTypes.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkFixupTypes.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/FlinkFixupTypes.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkReadConf.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/FlinkReadConf.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkReadConf.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/FlinkReadConf.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkReadOptions.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/FlinkReadOptions.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkReadOptions.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/FlinkReadOptions.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkSchemaUtil.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/FlinkSchemaUtil.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkSchemaUtil.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/FlinkSchemaUtil.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkSourceFilter.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/FlinkSourceFilter.java similarity index 100% rename from 
flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkSourceFilter.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/FlinkSourceFilter.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeToType.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeToType.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeToType.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeToType.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeVisitor.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeVisitor.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeVisitor.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeVisitor.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteConf.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteConf.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteConf.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteConf.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteOptions.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteOptions.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteOptions.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteOptions.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/IcebergTableSink.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/IcebergTableSink.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/IcebergTableSink.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/IcebergTableSink.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/RowDataWrapper.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/RowDataWrapper.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/RowDataWrapper.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/RowDataWrapper.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/TableLoader.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/TableLoader.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/TableLoader.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/TableLoader.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/TypeToFlinkType.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/TypeToFlinkType.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/TypeToFlinkType.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/TypeToFlinkType.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/actions/Actions.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/actions/Actions.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/actions/Actions.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/actions/Actions.java diff --git 
a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/actions/RewriteDataFilesAction.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/actions/RewriteDataFilesAction.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/actions/RewriteDataFilesAction.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/actions/RewriteDataFilesAction.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/AvroWithFlinkSchemaVisitor.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/AvroWithFlinkSchemaVisitor.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/AvroWithFlinkSchemaVisitor.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/AvroWithFlinkSchemaVisitor.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkAvroReader.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/FlinkAvroReader.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkAvroReader.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/FlinkAvroReader.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkAvroWriter.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/FlinkAvroWriter.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkAvroWriter.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/FlinkAvroWriter.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReaders.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReaders.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReaders.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReaders.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriter.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriter.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriter.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriter.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriters.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriters.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriters.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriters.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetReaders.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetReaders.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetReaders.java rename to 
flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetReaders.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetWriters.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetWriters.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetWriters.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetWriters.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkSchemaVisitor.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/FlinkSchemaVisitor.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkSchemaVisitor.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/FlinkSchemaVisitor.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueReaders.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueReaders.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueReaders.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueReaders.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueWriters.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueWriters.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueWriters.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueWriters.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/ParquetWithFlinkSchemaVisitor.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/ParquetWithFlinkSchemaVisitor.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/ParquetWithFlinkSchemaVisitor.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/ParquetWithFlinkSchemaVisitor.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/RowDataProjection.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/RowDataProjection.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/RowDataProjection.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/RowDataProjection.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/StructRowData.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/StructRowData.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/StructRowData.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/StructRowData.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/MonitorSource.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/MonitorSource.java similarity index 100% rename from 
flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/MonitorSource.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/MonitorSource.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/SingleThreadedIteratorSource.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/SingleThreadedIteratorSource.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/SingleThreadedIteratorSource.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/SingleThreadedIteratorSource.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TableChange.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TableChange.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TableChange.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TableChange.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/AvroGenericRecordToRowDataMapper.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/AvroGenericRecordToRowDataMapper.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/AvroGenericRecordToRowDataMapper.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/AvroGenericRecordToRowDataMapper.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/BaseDeltaTaskWriter.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/BaseDeltaTaskWriter.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/BaseDeltaTaskWriter.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/BaseDeltaTaskWriter.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitionKeySelector.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitionKeySelector.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitionKeySelector.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitionKeySelector.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitioner.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitioner.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitioner.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitioner.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitionerUtil.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitionerUtil.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitionerUtil.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitionerUtil.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/CachingTableSupplier.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/CachingTableSupplier.java similarity index 100% rename from 
flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/CachingTableSupplier.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/CachingTableSupplier.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/CommitSummary.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/CommitSummary.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/CommitSummary.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/CommitSummary.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifests.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifests.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifests.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifests.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifestsSerializer.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifestsSerializer.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifestsSerializer.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifestsSerializer.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/EqualityFieldKeySelector.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/EqualityFieldKeySelector.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/EqualityFieldKeySelector.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/EqualityFieldKeySelector.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkAppenderFactory.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkAppenderFactory.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkAppenderFactory.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkAppenderFactory.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkFileWriterFactory.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkFileWriterFactory.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkFileWriterFactory.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkFileWriterFactory.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkManifestUtil.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkManifestUtil.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkManifestUtil.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkManifestUtil.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkSink.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkSink.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkSink.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkSink.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitter.java 
b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitter.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitter.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitter.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitterMetrics.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitterMetrics.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitterMetrics.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitterMetrics.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriter.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriter.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriter.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriter.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriterMetrics.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriterMetrics.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriterMetrics.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriterMetrics.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/ManifestOutputFileFactory.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/ManifestOutputFileFactory.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/ManifestOutputFileFactory.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/ManifestOutputFileFactory.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionKeySelector.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionKeySelector.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionKeySelector.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionKeySelector.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionedDeltaWriter.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionedDeltaWriter.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionedDeltaWriter.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionedDeltaWriter.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/RowDataTaskWriterFactory.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/RowDataTaskWriterFactory.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/RowDataTaskWriterFactory.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/RowDataTaskWriterFactory.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/TaskWriterFactory.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/TaskWriterFactory.java similarity index 100% rename from 
flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/TaskWriterFactory.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/TaskWriterFactory.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/UnpartitionedDeltaWriter.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/UnpartitionedDeltaWriter.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/UnpartitionedDeltaWriter.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/UnpartitionedDeltaWriter.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/AggregatedStatisticsTracker.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/AggregatedStatisticsTracker.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/AggregatedStatisticsTracker.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/AggregatedStatisticsTracker.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/CompletedStatistics.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/CompletedStatistics.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/CompletedStatistics.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/CompletedStatistics.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/CompletedStatisticsSerializer.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/CompletedStatisticsSerializer.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/CompletedStatisticsSerializer.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/CompletedStatisticsSerializer.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatistics.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatistics.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatistics.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatistics.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsCoordinator.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsCoordinator.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsCoordinator.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsCoordinator.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsCoordinatorProvider.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsCoordinatorProvider.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsCoordinatorProvider.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsCoordinatorProvider.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsOperator.java 
b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsOperator.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsOperator.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsOperator.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsSerializer.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsSerializer.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsSerializer.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsSerializer.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/GlobalStatistics.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/GlobalStatistics.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/GlobalStatistics.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/GlobalStatistics.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/GlobalStatisticsSerializer.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/GlobalStatisticsSerializer.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/GlobalStatisticsSerializer.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/GlobalStatisticsSerializer.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/KeyAssignment.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/KeyAssignment.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/KeyAssignment.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/KeyAssignment.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapAssignment.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapAssignment.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapAssignment.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapAssignment.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapDataStatistics.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapDataStatistics.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapDataStatistics.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapDataStatistics.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapRangePartitioner.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapRangePartitioner.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapRangePartitioner.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapRangePartitioner.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/RangePartitioner.java 
b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/RangePartitioner.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/RangePartitioner.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/RangePartitioner.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/RequestGlobalStatisticsEvent.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/RequestGlobalStatisticsEvent.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/RequestGlobalStatisticsEvent.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/RequestGlobalStatisticsEvent.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchDataStatistics.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchDataStatistics.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchDataStatistics.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchDataStatistics.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchRangePartitioner.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchRangePartitioner.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchRangePartitioner.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchRangePartitioner.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchUtil.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchUtil.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchUtil.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchUtil.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeySerializer.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeySerializer.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeySerializer.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeySerializer.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeySketchSerializer.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeySketchSerializer.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeySketchSerializer.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeySketchSerializer.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeyUtil.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeyUtil.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeyUtil.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeyUtil.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsEvent.java 
b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsEvent.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsEvent.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsEvent.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsOrRecord.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsOrRecord.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsOrRecord.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsOrRecord.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsOrRecordSerializer.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsOrRecordSerializer.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsOrRecordSerializer.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsOrRecordSerializer.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsType.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsType.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsType.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsType.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsUtil.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsUtil.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsUtil.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsUtil.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/AvroGenericRecordFileScanTaskReader.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/AvroGenericRecordFileScanTaskReader.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/AvroGenericRecordFileScanTaskReader.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/AvroGenericRecordFileScanTaskReader.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/DataIterator.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/DataIterator.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/DataIterator.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/DataIterator.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/DataTaskReader.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/DataTaskReader.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/DataTaskReader.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/DataTaskReader.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/FileScanTaskReader.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/FileScanTaskReader.java similarity index 100% rename from 
flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/FileScanTaskReader.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/FileScanTaskReader.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputFormat.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputFormat.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputFormat.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputFormat.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputSplit.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputSplit.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputSplit.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputSplit.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSource.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSource.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSource.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSource.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSplitPlanner.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSplitPlanner.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSplitPlanner.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSplitPlanner.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/IcebergSource.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/IcebergSource.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/IcebergSource.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/IcebergSource.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/IcebergTableSource.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/IcebergTableSource.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/IcebergTableSource.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/IcebergTableSource.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/RowDataFileScanTaskReader.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/RowDataFileScanTaskReader.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/RowDataFileScanTaskReader.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/RowDataFileScanTaskReader.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/RowDataRewriter.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/RowDataRewriter.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/RowDataRewriter.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/RowDataRewriter.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/RowDataToAvroGenericRecordConverter.java 
b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/RowDataToAvroGenericRecordConverter.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/RowDataToAvroGenericRecordConverter.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/RowDataToAvroGenericRecordConverter.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/ScanContext.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/ScanContext.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/ScanContext.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/ScanContext.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/SourceUtil.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/SourceUtil.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/SourceUtil.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/SourceUtil.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/StreamingMonitorFunction.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/StreamingMonitorFunction.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/StreamingMonitorFunction.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/StreamingMonitorFunction.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/StreamingReaderOperator.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/StreamingReaderOperator.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/StreamingReaderOperator.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/StreamingReaderOperator.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/StreamingStartingStrategy.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/StreamingStartingStrategy.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/StreamingStartingStrategy.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/StreamingStartingStrategy.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/assigner/DefaultSplitAssigner.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/assigner/DefaultSplitAssigner.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/assigner/DefaultSplitAssigner.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/assigner/DefaultSplitAssigner.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/assigner/GetSplitResult.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/assigner/GetSplitResult.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/assigner/GetSplitResult.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/assigner/GetSplitResult.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/assigner/OrderedSplitAssignerFactory.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/assigner/OrderedSplitAssignerFactory.java similarity 
index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/assigner/OrderedSplitAssignerFactory.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/assigner/OrderedSplitAssignerFactory.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SimpleSplitAssignerFactory.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SimpleSplitAssignerFactory.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SimpleSplitAssignerFactory.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SimpleSplitAssignerFactory.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssigner.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssigner.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssigner.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssigner.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssignerFactory.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssignerFactory.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssignerFactory.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssignerFactory.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssignerType.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssignerType.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssignerType.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssignerType.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/AbstractIcebergEnumerator.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/AbstractIcebergEnumerator.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/AbstractIcebergEnumerator.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/AbstractIcebergEnumerator.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousEnumerationResult.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousEnumerationResult.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousEnumerationResult.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousEnumerationResult.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousIcebergEnumerator.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousIcebergEnumerator.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousIcebergEnumerator.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousIcebergEnumerator.java diff --git 
a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousSplitPlanner.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousSplitPlanner.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousSplitPlanner.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousSplitPlanner.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousSplitPlannerImpl.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousSplitPlannerImpl.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousSplitPlannerImpl.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousSplitPlannerImpl.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/EnumerationHistory.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/EnumerationHistory.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/EnumerationHistory.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/EnumerationHistory.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorPosition.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorPosition.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorPosition.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorPosition.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorPositionSerializer.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorPositionSerializer.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorPositionSerializer.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorPositionSerializer.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorState.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorState.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorState.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorState.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorStateSerializer.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorStateSerializer.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorStateSerializer.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorStateSerializer.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/StaticIcebergEnumerator.java 
b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/StaticIcebergEnumerator.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/StaticIcebergEnumerator.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/StaticIcebergEnumerator.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayBatchRecords.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayBatchRecords.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayBatchRecords.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayBatchRecords.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayPoolDataIteratorBatcher.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayPoolDataIteratorBatcher.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayPoolDataIteratorBatcher.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayPoolDataIteratorBatcher.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/AvroGenericRecordReaderFunction.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/reader/AvroGenericRecordReaderFunction.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/AvroGenericRecordReaderFunction.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/reader/AvroGenericRecordReaderFunction.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/ColumnStatsWatermarkExtractor.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/reader/ColumnStatsWatermarkExtractor.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/ColumnStatsWatermarkExtractor.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/reader/ColumnStatsWatermarkExtractor.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorBatcher.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorBatcher.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorBatcher.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorBatcher.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorReaderFunction.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorReaderFunction.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorReaderFunction.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorReaderFunction.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceReader.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceReader.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceReader.java rename to 
flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceReader.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceReaderMetrics.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceReaderMetrics.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceReaderMetrics.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceReaderMetrics.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceSplitReader.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceSplitReader.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceSplitReader.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceSplitReader.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/LimitableDataIterator.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/reader/LimitableDataIterator.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/LimitableDataIterator.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/reader/LimitableDataIterator.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/ListBatchRecords.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/reader/ListBatchRecords.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/ListBatchRecords.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/reader/ListBatchRecords.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/ListDataIteratorBatcher.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/reader/ListDataIteratorBatcher.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/ListDataIteratorBatcher.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/reader/ListDataIteratorBatcher.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/MetaDataReaderFunction.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/reader/MetaDataReaderFunction.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/MetaDataReaderFunction.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/reader/MetaDataReaderFunction.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/ReaderFunction.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/reader/ReaderFunction.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/ReaderFunction.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/reader/ReaderFunction.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordAndPosition.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordAndPosition.java similarity index 100% rename from 
flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordAndPosition.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordAndPosition.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordFactory.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordFactory.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordFactory.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordFactory.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordLimiter.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordLimiter.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordLimiter.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordLimiter.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataReaderFunction.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataReaderFunction.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataReaderFunction.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataReaderFunction.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataRecordFactory.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataRecordFactory.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataRecordFactory.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataRecordFactory.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/SerializableRecordEmitter.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/reader/SerializableRecordEmitter.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/SerializableRecordEmitter.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/reader/SerializableRecordEmitter.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/SplitWatermarkExtractor.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/reader/SplitWatermarkExtractor.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/SplitWatermarkExtractor.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/reader/SplitWatermarkExtractor.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/WatermarkExtractorRecordEmitter.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/reader/WatermarkExtractorRecordEmitter.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/WatermarkExtractorRecordEmitter.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/reader/WatermarkExtractorRecordEmitter.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplit.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplit.java 
similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplit.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplit.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitSerializer.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitSerializer.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitSerializer.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitSerializer.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitState.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitState.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitState.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitState.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitStatus.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitStatus.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitStatus.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitStatus.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/SerializableComparator.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/split/SerializableComparator.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/SerializableComparator.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/split/SerializableComparator.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/SerializerHelper.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/split/SerializerHelper.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/SerializerHelper.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/split/SerializerHelper.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/SplitComparators.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/split/SplitComparators.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/SplitComparators.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/split/SplitComparators.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/SplitRequestEvent.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/split/SplitRequestEvent.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/SplitRequestEvent.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/split/SplitRequestEvent.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/util/ElapsedTimeGauge.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/util/ElapsedTimeGauge.java similarity 
index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/util/ElapsedTimeGauge.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/util/ElapsedTimeGauge.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/util/FlinkAlterTableUtil.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/util/FlinkAlterTableUtil.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/util/FlinkAlterTableUtil.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/util/FlinkAlterTableUtil.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/util/FlinkCompatibilityUtil.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/util/FlinkCompatibilityUtil.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/util/FlinkCompatibilityUtil.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/util/FlinkCompatibilityUtil.java diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/util/FlinkPackage.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/util/FlinkPackage.java similarity index 100% rename from flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/util/FlinkPackage.java rename to flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/util/FlinkPackage.java diff --git a/flink/v1.19/flink/src/main/resources/META-INF/services/org.apache.flink.table.factories.Factory b/flink/v1.20/flink/src/main/resources/META-INF/services/org.apache.flink.table.factories.Factory similarity index 100% rename from flink/v1.19/flink/src/main/resources/META-INF/services/org.apache.flink.table.factories.Factory rename to flink/v1.20/flink/src/main/resources/META-INF/services/org.apache.flink.table.factories.Factory diff --git a/flink/v1.19/flink/src/main/resources/META-INF/services/org.apache.flink.table.factories.TableFactory b/flink/v1.20/flink/src/main/resources/META-INF/services/org.apache.flink.table.factories.TableFactory similarity index 100% rename from flink/v1.19/flink/src/main/resources/META-INF/services/org.apache.flink.table.factories.TableFactory rename to flink/v1.20/flink/src/main/resources/META-INF/services/org.apache.flink.table.factories.TableFactory diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/AvroGenericRecordConverterBase.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/AvroGenericRecordConverterBase.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/AvroGenericRecordConverterBase.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/AvroGenericRecordConverterBase.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/CatalogTestBase.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/CatalogTestBase.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/CatalogTestBase.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/CatalogTestBase.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/DataGenerator.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/DataGenerator.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/DataGenerator.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/DataGenerator.java diff --git 
a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/DataGenerators.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/DataGenerators.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/DataGenerators.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/DataGenerators.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/HadoopCatalogExtension.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/HadoopCatalogExtension.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/HadoopCatalogExtension.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/HadoopCatalogExtension.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/HadoopTableExtension.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/HadoopTableExtension.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/HadoopTableExtension.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/HadoopTableExtension.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/MiniFlinkClusterExtension.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/MiniFlinkClusterExtension.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/MiniFlinkClusterExtension.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/MiniFlinkClusterExtension.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/RowDataConverter.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/RowDataConverter.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/RowDataConverter.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/RowDataConverter.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/SimpleDataUtil.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/SimpleDataUtil.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/SimpleDataUtil.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/SimpleDataUtil.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/SqlBase.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/SqlBase.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/SqlBase.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/SqlBase.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestBase.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/TestBase.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestBase.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/TestBase.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestCatalogLoader.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/TestCatalogLoader.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestCatalogLoader.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/TestCatalogLoader.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestCatalogTableLoader.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/TestCatalogTableLoader.java 
similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestCatalogTableLoader.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/TestCatalogTableLoader.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestChangeLogTable.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/TestChangeLogTable.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestChangeLogTable.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/TestChangeLogTable.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestDataFileSerialization.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/TestDataFileSerialization.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestDataFileSerialization.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/TestDataFileSerialization.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFixtures.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/TestFixtures.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFixtures.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/TestFixtures.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkAnonymousTable.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/TestFlinkAnonymousTable.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkAnonymousTable.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/TestFlinkAnonymousTable.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogDatabase.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogDatabase.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogDatabase.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogDatabase.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogFactory.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogFactory.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogFactory.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogFactory.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTable.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTable.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTable.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTable.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTablePartitions.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTablePartitions.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTablePartitions.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTablePartitions.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkConfParser.java 
b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/TestFlinkConfParser.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkConfParser.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/TestFlinkConfParser.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkFilters.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/TestFlinkFilters.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkFilters.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/TestFlinkFilters.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkHiveCatalog.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/TestFlinkHiveCatalog.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkHiveCatalog.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/TestFlinkHiveCatalog.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkSchemaUtil.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/TestFlinkSchemaUtil.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkSchemaUtil.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/TestFlinkSchemaUtil.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSink.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSink.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSink.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSink.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSinkExtended.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSinkExtended.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSinkExtended.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSinkExtended.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkUpsert.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/TestFlinkUpsert.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkUpsert.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/TestFlinkUpsert.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestHelpers.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/TestHelpers.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestHelpers.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/TestHelpers.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestIcebergConnector.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/TestIcebergConnector.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestIcebergConnector.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/TestIcebergConnector.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestManifestFileSerialization.java 
b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/TestManifestFileSerialization.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestManifestFileSerialization.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/TestManifestFileSerialization.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestRowDataWrapper.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/TestRowDataWrapper.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestRowDataWrapper.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/TestRowDataWrapper.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestTableLoader.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/TestTableLoader.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestTableLoader.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/TestTableLoader.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestTableSerialization.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/TestTableSerialization.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestTableSerialization.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/TestTableSerialization.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/actions/TestRewriteDataFilesAction.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/actions/TestRewriteDataFilesAction.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/actions/TestRewriteDataFilesAction.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/actions/TestRewriteDataFilesAction.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/RandomRowData.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/data/RandomRowData.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/RandomRowData.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/data/RandomRowData.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/RowDataToRowMapper.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/data/RowDataToRowMapper.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/RowDataToRowMapper.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/data/RowDataToRowMapper.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkAvroReaderWriter.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkAvroReaderWriter.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkAvroReaderWriter.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkAvroReaderWriter.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkOrcReaderWriter.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkOrcReaderWriter.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkOrcReaderWriter.java rename to 
flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkOrcReaderWriter.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetReader.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetReader.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetReader.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetReader.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetWriter.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetWriter.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetWriter.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetWriter.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/TestRowDataProjection.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/data/TestRowDataProjection.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/TestRowDataProjection.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/data/TestRowDataProjection.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/TestRowProjection.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/data/TestRowProjection.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/TestRowProjection.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/data/TestRowProjection.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/TestStructRowData.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/data/TestStructRowData.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/TestStructRowData.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/data/TestStructRowData.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/CollectingSink.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/CollectingSink.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/CollectingSink.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/CollectingSink.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/FlinkSqlExtension.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/FlinkSqlExtension.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/FlinkSqlExtension.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/FlinkSqlExtension.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/FlinkStreamingTestUtils.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/FlinkStreamingTestUtils.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/FlinkStreamingTestUtils.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/FlinkStreamingTestUtils.java diff --git 
a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/ManualSource.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/ManualSource.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/ManualSource.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/ManualSource.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/OperatorTestBase.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/OperatorTestBase.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/OperatorTestBase.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/OperatorTestBase.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestMonitorSource.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestMonitorSource.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestMonitorSource.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestMonitorSource.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestAvroGenericRecordToRowDataMapper.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/TestAvroGenericRecordToRowDataMapper.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestAvroGenericRecordToRowDataMapper.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/TestAvroGenericRecordToRowDataMapper.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionKeySelector.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionKeySelector.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionKeySelector.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionKeySelector.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitioner.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitioner.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitioner.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitioner.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionerFlinkIcebergSink.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionerFlinkIcebergSink.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionerFlinkIcebergSink.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionerFlinkIcebergSink.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionerUtil.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionerUtil.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionerUtil.java rename to 
flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionerUtil.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestCachingTableSupplier.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/TestCachingTableSupplier.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestCachingTableSupplier.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/TestCachingTableSupplier.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestCompressionSettings.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/TestCompressionSettings.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestCompressionSettings.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/TestCompressionSettings.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestDeltaTaskWriter.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/TestDeltaTaskWriter.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestDeltaTaskWriter.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/TestDeltaTaskWriter.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkAppenderFactory.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkAppenderFactory.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkAppenderFactory.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkAppenderFactory.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkFileWriterFactory.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkFileWriterFactory.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkFileWriterFactory.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkFileWriterFactory.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSink.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSink.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSink.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSink.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkBase.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkBase.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkBase.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkBase.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkBranch.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkBranch.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkBranch.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkBranch.java diff --git 
a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkDistributionMode.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkDistributionMode.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkDistributionMode.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkDistributionMode.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkExtended.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkExtended.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkExtended.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkExtended.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2Base.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2Base.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2Base.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2Base.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2Branch.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2Branch.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2Branch.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2Branch.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkManifest.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkManifest.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkManifest.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkManifest.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPartitioningWriters.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPartitioningWriters.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPartitioningWriters.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPartitioningWriters.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPositionDeltaWriters.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPositionDeltaWriters.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPositionDeltaWriters.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPositionDeltaWriters.java diff --git 
a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkRollingFileWriters.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkRollingFileWriters.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkRollingFileWriters.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkRollingFileWriters.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkWriterMetrics.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkWriterMetrics.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkWriterMetrics.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkWriterMetrics.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergFilesCommitter.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergFilesCommitter.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergFilesCommitter.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergFilesCommitter.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergStreamWriter.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergStreamWriter.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergStreamWriter.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergStreamWriter.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestRowDataPartitionKey.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/TestRowDataPartitionKey.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestRowDataPartitionKey.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/TestRowDataPartitionKey.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestTaskWriters.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/TestTaskWriters.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestTaskWriters.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/TestTaskWriters.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/Fixtures.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/Fixtures.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/Fixtures.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/Fixtures.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestAggregatedStatisticsTracker.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestAggregatedStatisticsTracker.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestAggregatedStatisticsTracker.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestAggregatedStatisticsTracker.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestCompletedStatisticsSerializer.java 
b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestCompletedStatisticsSerializer.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestCompletedStatisticsSerializer.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestCompletedStatisticsSerializer.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsCoordinator.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsCoordinator.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsCoordinator.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsCoordinator.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsCoordinatorProvider.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsCoordinatorProvider.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsCoordinatorProvider.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsCoordinatorProvider.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsOperator.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsOperator.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsOperator.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsOperator.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsSerializer.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsSerializer.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsSerializer.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsSerializer.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestGlobalStatisticsSerializer.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestGlobalStatisticsSerializer.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestGlobalStatisticsSerializer.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestGlobalStatisticsSerializer.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestMapDataStatistics.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestMapDataStatistics.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestMapDataStatistics.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestMapDataStatistics.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestMapRangePartitioner.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestMapRangePartitioner.java similarity index 100% rename from 
flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestMapRangePartitioner.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestMapRangePartitioner.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestRangePartitioner.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestRangePartitioner.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestRangePartitioner.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestRangePartitioner.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchDataStatistics.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchDataStatistics.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchDataStatistics.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchDataStatistics.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchRangePartitioner.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchRangePartitioner.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchRangePartitioner.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchRangePartitioner.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchUtil.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchUtil.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchUtil.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchUtil.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerBase.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerBase.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerBase.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerBase.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerNestedStruct.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerNestedStruct.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerNestedStruct.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerNestedStruct.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerPrimitives.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerPrimitives.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerPrimitives.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerPrimitives.java diff --git 
a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerSnapshot.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerSnapshot.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerSnapshot.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerSnapshot.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeyUtil.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeyUtil.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeyUtil.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeyUtil.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTableFactory.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTableFactory.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTableFactory.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTableFactory.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTestSource.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTestSource.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTestSource.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTestSource.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/ChangeLogTableTestBase.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/ChangeLogTableTestBase.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/ChangeLogTableTestBase.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/ChangeLogTableTestBase.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/SplitHelpers.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/SplitHelpers.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/SplitHelpers.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/SplitHelpers.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/SqlHelpers.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/SqlHelpers.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/SqlHelpers.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/SqlHelpers.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TableSourceTestBase.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TableSourceTestBase.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TableSourceTestBase.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TableSourceTestBase.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestBoundedTableFactory.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestBoundedTableFactory.java similarity index 100% rename 
from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestBoundedTableFactory.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestBoundedTableFactory.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormat.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormat.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormat.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormat.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormatReaderDeletes.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormatReaderDeletes.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormatReaderDeletes.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormatReaderDeletes.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkMergingMetrics.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkMergingMetrics.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkMergingMetrics.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkMergingMetrics.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkMetaDataTable.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkMetaDataTable.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkMetaDataTable.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkMetaDataTable.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkReaderDeletesBase.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkReaderDeletesBase.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkReaderDeletesBase.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkReaderDeletesBase.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScan.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScan.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScan.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScan.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScanSql.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScanSql.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScanSql.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScanSql.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSource.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSource.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSource.java rename to 
flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSource.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSourceConfig.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSourceConfig.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSourceConfig.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSourceConfig.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSourceSql.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSourceSql.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSourceSql.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSourceSql.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkTableSource.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkTableSource.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkTableSource.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkTableSource.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBounded.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBounded.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBounded.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBounded.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedGenericRecord.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedGenericRecord.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedGenericRecord.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedGenericRecord.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedSql.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedSql.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedSql.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedSql.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceContinuous.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceContinuous.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceContinuous.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceContinuous.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceFailover.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceFailover.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceFailover.java rename to 
flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceFailover.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceFailoverWithWatermarkExtractor.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceFailoverWithWatermarkExtractor.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceFailoverWithWatermarkExtractor.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceFailoverWithWatermarkExtractor.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceReaderDeletes.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceReaderDeletes.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceReaderDeletes.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceReaderDeletes.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceSql.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceSql.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceSql.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceSql.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceWithWatermarkExtractor.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceWithWatermarkExtractor.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceWithWatermarkExtractor.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceWithWatermarkExtractor.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSpeculativeExecutionSupport.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSpeculativeExecutionSupport.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSpeculativeExecutionSupport.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSpeculativeExecutionSupport.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestMetadataTableReadableMetrics.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestMetadataTableReadableMetrics.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestMetadataTableReadableMetrics.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestMetadataTableReadableMetrics.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestProjectMetaColumn.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestProjectMetaColumn.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestProjectMetaColumn.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestProjectMetaColumn.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestRowDataToAvroGenericRecordConverter.java 
b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestRowDataToAvroGenericRecordConverter.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestRowDataToAvroGenericRecordConverter.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestRowDataToAvroGenericRecordConverter.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestScanContext.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestScanContext.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestScanContext.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestScanContext.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestSourceUtil.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestSourceUtil.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestSourceUtil.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestSourceUtil.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestSqlBase.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestSqlBase.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestSqlBase.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestSqlBase.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamScanSql.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamScanSql.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamScanSql.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamScanSql.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingMonitorFunction.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingMonitorFunction.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingMonitorFunction.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingMonitorFunction.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingReaderOperator.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingReaderOperator.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingReaderOperator.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingReaderOperator.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/assigner/SplitAssignerTestBase.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/assigner/SplitAssignerTestBase.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/assigner/SplitAssignerTestBase.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/assigner/SplitAssignerTestBase.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestDefaultSplitAssigner.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestDefaultSplitAssigner.java similarity index 
100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestDefaultSplitAssigner.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestDefaultSplitAssigner.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestFileSequenceNumberBasedSplitAssigner.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestFileSequenceNumberBasedSplitAssigner.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestFileSequenceNumberBasedSplitAssigner.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestFileSequenceNumberBasedSplitAssigner.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestWatermarkBasedSplitAssigner.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestWatermarkBasedSplitAssigner.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestWatermarkBasedSplitAssigner.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestWatermarkBasedSplitAssigner.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/ManualContinuousSplitPlanner.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/ManualContinuousSplitPlanner.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/ManualContinuousSplitPlanner.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/ManualContinuousSplitPlanner.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousIcebergEnumerator.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousIcebergEnumerator.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousIcebergEnumerator.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousIcebergEnumerator.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousSplitPlannerImpl.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousSplitPlannerImpl.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousSplitPlannerImpl.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousSplitPlannerImpl.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousSplitPlannerImplStartStrategy.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousSplitPlannerImplStartStrategy.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousSplitPlannerImplStartStrategy.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousSplitPlannerImplStartStrategy.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestEnumerationHistory.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestEnumerationHistory.java 
similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestEnumerationHistory.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestEnumerationHistory.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestIcebergEnumeratorStateSerializer.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestIcebergEnumeratorStateSerializer.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestIcebergEnumeratorStateSerializer.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestIcebergEnumeratorStateSerializer.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderFunctionTestBase.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderFunctionTestBase.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderFunctionTestBase.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderFunctionTestBase.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderUtil.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderUtil.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderUtil.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderUtil.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestArrayBatchRecords.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestArrayBatchRecords.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestArrayBatchRecords.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestArrayBatchRecords.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestArrayPoolDataIteratorBatcherRowData.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestArrayPoolDataIteratorBatcherRowData.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestArrayPoolDataIteratorBatcherRowData.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestArrayPoolDataIteratorBatcherRowData.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestColumnStatsWatermarkExtractor.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestColumnStatsWatermarkExtractor.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestColumnStatsWatermarkExtractor.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestColumnStatsWatermarkExtractor.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestIcebergSourceReader.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestIcebergSourceReader.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestIcebergSourceReader.java rename to 
flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestIcebergSourceReader.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestLimitableDataIterator.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestLimitableDataIterator.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestLimitableDataIterator.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestLimitableDataIterator.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestRowDataReaderFunction.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestRowDataReaderFunction.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestRowDataReaderFunction.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestRowDataReaderFunction.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestingMetricGroup.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestingMetricGroup.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestingMetricGroup.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestingMetricGroup.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/split/TestIcebergSourceSplitSerializer.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/split/TestIcebergSourceSplitSerializer.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/split/TestIcebergSourceSplitSerializer.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/split/TestIcebergSourceSplitSerializer.java diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/util/TestFlinkPackage.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/util/TestFlinkPackage.java similarity index 100% rename from flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/util/TestFlinkPackage.java rename to flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/util/TestFlinkPackage.java diff --git a/flink/v1.19/flink/src/test/resources/META-INF/services/org.apache.flink.table.factories.Factory b/flink/v1.20/flink/src/test/resources/META-INF/services/org.apache.flink.table.factories.Factory similarity index 100% rename from flink/v1.19/flink/src/test/resources/META-INF/services/org.apache.flink.table.factories.Factory rename to flink/v1.20/flink/src/test/resources/META-INF/services/org.apache.flink.table.factories.Factory From fb60ecde9df0342fc628292d1e97dd96d7f25f39 Mon Sep 17 00:00:00 2001 From: Steven Wu Date: Mon, 5 Aug 2024 09:00:42 -0700 Subject: [PATCH 40/55] Flink: add v1.19 back after coping from 1.20 --- flink/v1.19/build.gradle | 268 ++++ flink/v1.19/flink-runtime/LICENSE | 502 +++++++ flink/v1.19/flink-runtime/NOTICE | 91 ++ .../flink/IcebergConnectorSmokeTest.java | 21 + .../shuffle/MapRangePartitionerBenchmark.java | 199 +++ .../apache/iceberg/flink/CatalogLoader.java | 215 +++ .../apache/iceberg/flink/FlinkCatalog.java | 833 ++++++++++++ .../iceberg/flink/FlinkCatalogFactory.java | 213 +++ .../apache/iceberg/flink/FlinkConfParser.java | 261 ++++ .../iceberg/flink/FlinkConfigOptions.java | 107 ++ .../flink/FlinkDynamicTableFactory.java | 208 +++ 
.../flink/FlinkEnvironmentContext.java | 31 + .../apache/iceberg/flink/FlinkFilters.java | 266 ++++ .../apache/iceberg/flink/FlinkFixupTypes.java | 50 + .../apache/iceberg/flink/FlinkReadConf.java | 213 +++ .../iceberg/flink/FlinkReadOptions.java | 123 ++ .../apache/iceberg/flink/FlinkSchemaUtil.java | 232 ++++ .../iceberg/flink/FlinkSourceFilter.java | 49 + .../apache/iceberg/flink/FlinkTypeToType.java | 203 +++ .../iceberg/flink/FlinkTypeVisitor.java | 80 ++ .../apache/iceberg/flink/FlinkWriteConf.java | 205 +++ .../iceberg/flink/FlinkWriteOptions.java | 73 + .../iceberg/flink/IcebergTableSink.java | 121 ++ .../apache/iceberg/flink/RowDataWrapper.java | 142 ++ .../org/apache/iceberg/flink/TableLoader.java | 159 +++ .../apache/iceberg/flink/TypeToFlinkType.java | 134 ++ .../apache/iceberg/flink/actions/Actions.java | 52 + .../flink/actions/RewriteDataFilesAction.java | 72 + .../data/AvroWithFlinkSchemaVisitor.java | 75 ++ .../iceberg/flink/data/FlinkAvroReader.java | 169 +++ .../iceberg/flink/data/FlinkAvroWriter.java | 165 +++ .../iceberg/flink/data/FlinkOrcReader.java | 131 ++ .../iceberg/flink/data/FlinkOrcReaders.java | 283 ++++ .../iceberg/flink/data/FlinkOrcWriter.java | 163 +++ .../iceberg/flink/data/FlinkOrcWriters.java | 317 +++++ .../flink/data/FlinkParquetReaders.java | 905 +++++++++++++ .../flink/data/FlinkParquetWriters.java | 504 +++++++ .../flink/data/FlinkSchemaVisitor.java | 161 +++ .../iceberg/flink/data/FlinkValueReaders.java | 312 +++++ .../iceberg/flink/data/FlinkValueWriters.java | 253 ++++ .../data/ParquetWithFlinkSchemaVisitor.java | 222 ++++ .../iceberg/flink/data/RowDataProjection.java | 341 +++++ .../iceberg/flink/data/RowDataUtil.java | 123 ++ .../iceberg/flink/data/StructRowData.java | 300 +++++ .../maintenance/operator/MonitorSource.java | 206 +++ .../SingleThreadedIteratorSource.java | 197 +++ .../maintenance/operator/TableChange.java | 133 ++ .../AvroGenericRecordToRowDataMapper.java | 61 + .../flink/sink/BaseDeltaTaskWriter.java | 126 ++ .../sink/BucketPartitionKeySelector.java | 70 + .../iceberg/flink/sink/BucketPartitioner.java | 103 ++ .../flink/sink/BucketPartitionerUtil.java | 125 ++ .../flink/sink/CachingTableSupplier.java | 91 ++ .../iceberg/flink/sink/CommitSummary.java | 93 ++ .../iceberg/flink/sink/DeltaManifests.java | 71 + .../flink/sink/DeltaManifestsSerializer.java | 122 ++ .../flink/sink/EqualityFieldKeySelector.java | 86 ++ .../flink/sink/FlinkAppenderFactory.java | 274 ++++ .../flink/sink/FlinkFileWriterFactory.java | 293 +++++ .../iceberg/flink/sink/FlinkManifestUtil.java | 132 ++ .../apache/iceberg/flink/sink/FlinkSink.java | 654 +++++++++ .../flink/sink/IcebergFilesCommitter.java | 516 ++++++++ .../sink/IcebergFilesCommitterMetrics.java | 73 + .../flink/sink/IcebergStreamWriter.java | 120 ++ .../sink/IcebergStreamWriterMetrics.java | 89 ++ .../flink/sink/ManifestOutputFileFactory.java | 94 ++ .../flink/sink/PartitionKeySelector.java | 64 + .../flink/sink/PartitionedDeltaWriter.java | 97 ++ .../flink/sink/RowDataTaskWriterFactory.java | 244 ++++ .../iceberg/flink/sink/TaskWriterFactory.java | 45 + .../flink/sink/UnpartitionedDeltaWriter.java | 69 + .../shuffle/AggregatedStatisticsTracker.java | 262 ++++ .../sink/shuffle/CompletedStatistics.java | 111 ++ .../CompletedStatisticsSerializer.java | 176 +++ .../flink/sink/shuffle/DataStatistics.java | 48 + .../shuffle/DataStatisticsCoordinator.java | 522 ++++++++ .../DataStatisticsCoordinatorProvider.java | 70 + .../sink/shuffle/DataStatisticsOperator.java | 266 ++++ 
.../shuffle/DataStatisticsSerializer.java | 204 +++ .../flink/sink/shuffle/GlobalStatistics.java | 114 ++ .../shuffle/GlobalStatisticsSerializer.java | 199 +++ .../flink/sink/shuffle/KeyAssignment.java | 155 +++ .../flink/sink/shuffle/MapAssignment.java | 242 ++++ .../flink/sink/shuffle/MapDataStatistics.java | 88 ++ .../sink/shuffle/MapRangePartitioner.java | 95 ++ .../flink/sink/shuffle/RangePartitioner.java | 110 ++ .../shuffle/RequestGlobalStatisticsEvent.java | 38 + .../sink/shuffle/SketchDataStatistics.java | 87 ++ .../sink/shuffle/SketchRangePartitioner.java | 51 + .../flink/sink/shuffle/SketchUtil.java | 159 +++ .../flink/sink/shuffle/SortKeySerializer.java | 359 +++++ .../sink/shuffle/SortKeySketchSerializer.java | 143 ++ .../flink/sink/shuffle/SortKeyUtil.java | 59 + .../flink/sink/shuffle/StatisticsEvent.java | 76 ++ .../sink/shuffle/StatisticsOrRecord.java | 112 ++ .../shuffle/StatisticsOrRecordSerializer.java | 208 +++ .../flink/sink/shuffle/StatisticsType.java | 55 + .../flink/sink/shuffle/StatisticsUtil.java | 126 ++ .../AvroGenericRecordFileScanTaskReader.java | 42 + .../iceberg/flink/source/DataIterator.java | 156 +++ .../iceberg/flink/source/DataTaskReader.java | 47 + .../flink/source/FileScanTaskReader.java | 35 + .../flink/source/FlinkInputFormat.java | 141 ++ .../iceberg/flink/source/FlinkInputSplit.java | 48 + .../iceberg/flink/source/FlinkSource.java | 307 +++++ .../flink/source/FlinkSplitPlanner.java | 189 +++ .../iceberg/flink/source/IcebergSource.java | 549 ++++++++ .../flink/source/IcebergTableSource.java | 229 ++++ .../source/RowDataFileScanTaskReader.java | 243 ++++ .../iceberg/flink/source/RowDataRewriter.java | 172 +++ .../RowDataToAvroGenericRecordConverter.java | 70 + .../iceberg/flink/source/ScanContext.java | 597 +++++++++ .../iceberg/flink/source/SourceUtil.java | 77 ++ .../source/StreamingMonitorFunction.java | 269 ++++ .../flink/source/StreamingReaderOperator.java | 246 ++++ .../source/StreamingStartingStrategy.java | 54 + .../source/assigner/DefaultSplitAssigner.java | 119 ++ .../flink/source/assigner/GetSplitResult.java | 77 ++ .../assigner/OrderedSplitAssignerFactory.java | 46 + .../assigner/SimpleSplitAssignerFactory.java | 37 + .../flink/source/assigner/SplitAssigner.java | 124 ++ .../source/assigner/SplitAssignerFactory.java | 30 + .../source/assigner/SplitAssignerType.java | 33 + .../enumerator/AbstractIcebergEnumerator.java | 184 +++ .../ContinuousEnumerationResult.java | 57 + .../ContinuousIcebergEnumerator.java | 187 +++ .../enumerator/ContinuousSplitPlanner.java | 30 + .../ContinuousSplitPlannerImpl.java | 240 ++++ .../source/enumerator/EnumerationHistory.java | 100 ++ .../enumerator/IcebergEnumeratorPosition.java | 79 ++ .../IcebergEnumeratorPositionSerializer.java | 90 ++ .../enumerator/IcebergEnumeratorState.java | 65 + .../IcebergEnumeratorStateSerializer.java | 194 +++ .../enumerator/StaticIcebergEnumerator.java | 51 + .../source/reader/ArrayBatchRecords.java | 171 +++ .../reader/ArrayPoolDataIteratorBatcher.java | 130 ++ .../AvroGenericRecordReaderFunction.java | 102 ++ .../reader/ColumnStatsWatermarkExtractor.java | 98 ++ .../source/reader/DataIteratorBatcher.java | 36 + .../reader/DataIteratorReaderFunction.java | 43 + .../source/reader/IcebergSourceReader.java | 77 ++ .../reader/IcebergSourceReaderMetrics.java | 61 + .../reader/IcebergSourceSplitReader.java | 167 +++ .../source/reader/LimitableDataIterator.java | 56 + .../flink/source/reader/ListBatchRecords.java | 85 ++ .../reader/ListDataIteratorBatcher.java | 94 ++ 
.../source/reader/MetaDataReaderFunction.java | 65 + .../flink/source/reader/ReaderFunction.java | 31 + .../source/reader/RecordAndPosition.java | 78 ++ .../flink/source/reader/RecordFactory.java | 34 + .../flink/source/reader/RecordLimiter.java | 45 + .../source/reader/RowDataReaderFunction.java | 115 ++ .../source/reader/RowDataRecordFactory.java | 73 + .../reader/SerializableRecordEmitter.java | 40 + .../reader/SplitWatermarkExtractor.java | 28 + .../WatermarkExtractorRecordEmitter.java | 67 + .../source/split/IcebergSourceSplit.java | 220 ++++ .../split/IcebergSourceSplitSerializer.java | 62 + .../source/split/IcebergSourceSplitState.java | 37 + .../split/IcebergSourceSplitStatus.java | 25 + .../source/split/SerializableComparator.java | 24 + .../flink/source/split/SerializerHelper.java | 206 +++ .../flink/source/split/SplitComparators.java | 76 ++ .../flink/source/split/SplitRequestEvent.java | 54 + .../iceberg/flink/util/ElapsedTimeGauge.java | 47 + .../flink/util/FlinkAlterTableUtil.java | 248 ++++ .../flink/util/FlinkCompatibilityUtil.java | 47 + .../iceberg/flink/util/FlinkPackage.java | 61 + .../org.apache.flink.table.factories.Factory | 16 + ....apache.flink.table.factories.TableFactory | 16 + .../flink/AvroGenericRecordConverterBase.java | 90 ++ .../apache/iceberg/flink/CatalogTestBase.java | 121 ++ .../apache/iceberg/flink/DataGenerator.java | 42 + .../apache/iceberg/flink/DataGenerators.java | 1172 +++++++++++++++++ .../iceberg/flink/HadoopCatalogExtension.java | 105 ++ .../iceberg/flink/HadoopTableExtension.java | 59 + .../flink/MiniFlinkClusterExtension.java | 67 + .../iceberg/flink/RowDataConverter.java | 135 ++ .../apache/iceberg/flink/SimpleDataUtil.java | 439 ++++++ .../org/apache/iceberg/flink/SqlBase.java | 110 ++ .../org/apache/iceberg/flink/TestBase.java | 145 ++ .../iceberg/flink/TestCatalogLoader.java | 116 ++ .../iceberg/flink/TestCatalogTableLoader.java | 113 ++ .../iceberg/flink/TestChangeLogTable.java | 296 +++++ .../flink/TestDataFileSerialization.java | 203 +++ .../apache/iceberg/flink/TestFixtures.java | 61 + .../flink/TestFlinkAnonymousTable.java | 65 + .../flink/TestFlinkCatalogDatabase.java | 253 ++++ .../flink/TestFlinkCatalogFactory.java | 119 ++ .../iceberg/flink/TestFlinkCatalogTable.java | 669 ++++++++++ .../TestFlinkCatalogTablePartitions.java | 119 ++ .../iceberg/flink/TestFlinkConfParser.java | 61 + .../iceberg/flink/TestFlinkFilters.java | 462 +++++++ .../iceberg/flink/TestFlinkHiveCatalog.java | 101 ++ .../iceberg/flink/TestFlinkSchemaUtil.java | 416 ++++++ .../iceberg/flink/TestFlinkTableSink.java | 244 ++++ .../flink/TestFlinkTableSinkExtended.java | 244 ++++ .../apache/iceberg/flink/TestFlinkUpsert.java | 334 +++++ .../org/apache/iceberg/flink/TestHelpers.java | 632 +++++++++ .../iceberg/flink/TestIcebergConnector.java | 331 +++++ .../flink/TestManifestFileSerialization.java | 173 +++ .../iceberg/flink/TestRowDataWrapper.java | 93 ++ .../apache/iceberg/flink/TestTableLoader.java | 57 + .../iceberg/flink/TestTableSerialization.java | 110 ++ .../actions/TestRewriteDataFilesAction.java | 481 +++++++ .../iceberg/flink/data/RandomRowData.java | 38 + .../flink/data/RowDataToRowMapper.java | 50 + .../flink/data/TestFlinkAvroReaderWriter.java | 185 +++ .../flink/data/TestFlinkOrcReaderWriter.java | 107 ++ .../flink/data/TestFlinkParquetReader.java | 239 ++++ .../flink/data/TestFlinkParquetWriter.java | 94 ++ .../flink/data/TestRowDataProjection.java | 593 +++++++++ .../iceberg/flink/data/TestRowProjection.java | 596 +++++++++ 
.../iceberg/flink/data/TestStructRowData.java | 100 ++ .../maintenance/operator/CollectingSink.java | 115 ++ .../operator/FlinkSqlExtension.java | 135 ++ .../operator/FlinkStreamingTestUtils.java | 73 + .../maintenance/operator/ManualSource.java | 316 +++++ .../operator/OperatorTestBase.java | 51 + .../operator/TestMonitorSource.java | 362 +++++ .../TestAvroGenericRecordToRowDataMapper.java | 38 + .../sink/TestBucketPartitionKeySelector.java | 67 + .../flink/sink/TestBucketPartitioner.java | 108 ++ ...TestBucketPartitionerFlinkIcebergSink.java | 227 ++++ .../flink/sink/TestBucketPartitionerUtil.java | 126 ++ .../flink/sink/TestCachingTableSupplier.java | 81 ++ .../flink/sink/TestCompressionSettings.java | 257 ++++ .../flink/sink/TestDeltaTaskWriter.java | 429 ++++++ .../flink/sink/TestFlinkAppenderFactory.java | 65 + .../sink/TestFlinkFileWriterFactory.java | 66 + .../flink/sink/TestFlinkIcebergSink.java | 125 ++ .../flink/sink/TestFlinkIcebergSinkBase.java | 111 ++ .../sink/TestFlinkIcebergSinkBranch.java | 137 ++ .../TestFlinkIcebergSinkDistributionMode.java | 180 +++ .../sink/TestFlinkIcebergSinkExtended.java | 208 +++ .../flink/sink/TestFlinkIcebergSinkV2.java | 235 ++++ .../sink/TestFlinkIcebergSinkV2Base.java | 389 ++++++ .../sink/TestFlinkIcebergSinkV2Branch.java | 125 ++ .../iceberg/flink/sink/TestFlinkManifest.java | 312 +++++ .../sink/TestFlinkPartitioningWriters.java | 77 ++ .../sink/TestFlinkPositionDeltaWriters.java | 66 + .../sink/TestFlinkRollingFileWriters.java | 51 + .../flink/sink/TestFlinkWriterMetrics.java | 60 + .../flink/sink/TestIcebergFilesCommitter.java | 1148 ++++++++++++++++ .../flink/sink/TestIcebergStreamWriter.java | 390 ++++++ .../flink/sink/TestRowDataPartitionKey.java | 251 ++++ .../iceberg/flink/sink/TestTaskWriters.java | 242 ++++ .../iceberg/flink/sink/shuffle/Fixtures.java | 100 ++ .../TestAggregatedStatisticsTracker.java | 465 +++++++ .../TestCompletedStatisticsSerializer.java | 54 + .../TestDataStatisticsCoordinator.java | 246 ++++ ...TestDataStatisticsCoordinatorProvider.java | 187 +++ .../shuffle/TestDataStatisticsOperator.java | 352 +++++ .../shuffle/TestDataStatisticsSerializer.java | 53 + .../TestGlobalStatisticsSerializer.java | 59 + .../sink/shuffle/TestMapDataStatistics.java | 67 + .../sink/shuffle/TestMapRangePartitioner.java | 434 ++++++ .../sink/shuffle/TestRangePartitioner.java | 65 + .../shuffle/TestSketchDataStatistics.java | 60 + .../shuffle/TestSketchRangePartitioner.java | 88 ++ .../flink/sink/shuffle/TestSketchUtil.java | 189 +++ .../shuffle/TestSortKeySerializerBase.java | 65 + .../TestSortKeySerializerNestedStruct.java | 55 + .../TestSortKeySerializerPrimitives.java | 90 ++ .../TestSortKeySerializerSnapshot.java | 213 +++ .../flink/sink/shuffle/TestSortKeyUtil.java | 73 + .../flink/source/BoundedTableFactory.java | 170 +++ .../flink/source/BoundedTestSource.java | 108 ++ .../flink/source/ChangeLogTableTestBase.java | 95 ++ .../iceberg/flink/source/SplitHelpers.java | 200 +++ .../iceberg/flink/source/SqlHelpers.java | 60 + .../flink/source/TableSourceTestBase.java | 105 ++ .../flink/source/TestBoundedTableFactory.java | 81 ++ .../flink/source/TestFlinkInputFormat.java | 211 +++ .../TestFlinkInputFormatReaderDeletes.java | 71 + .../flink/source/TestFlinkMergingMetrics.java | 67 + .../flink/source/TestFlinkMetaDataTable.java | 813 ++++++++++++ .../source/TestFlinkReaderDeletesBase.java | 90 ++ .../iceberg/flink/source/TestFlinkScan.java | 540 ++++++++ .../flink/source/TestFlinkScanSql.java | 69 + 
.../iceberg/flink/source/TestFlinkSource.java | 90 ++ .../flink/source/TestFlinkSourceConfig.java | 61 + .../flink/source/TestFlinkSourceSql.java | 85 ++ .../flink/source/TestFlinkTableSource.java | 561 ++++++++ .../source/TestIcebergSourceBounded.java | 147 +++ ...TestIcebergSourceBoundedGenericRecord.java | 196 +++ .../source/TestIcebergSourceBoundedSql.java | 76 ++ .../source/TestIcebergSourceContinuous.java | 573 ++++++++ .../source/TestIcebergSourceFailover.java | 394 ++++++ ...gSourceFailoverWithWatermarkExtractor.java | 130 ++ .../TestIcebergSourceReaderDeletes.java | 102 ++ .../flink/source/TestIcebergSourceSql.java | 158 +++ ...stIcebergSourceWithWatermarkExtractor.java | 408 ++++++ ...estIcebergSpeculativeExecutionSupport.java | 184 +++ .../TestMetadataTableReadableMetrics.java | 299 +++++ .../flink/source/TestProjectMetaColumn.java | 188 +++ ...stRowDataToAvroGenericRecordConverter.java | 36 + .../iceberg/flink/source/TestScanContext.java | 112 ++ .../iceberg/flink/source/TestSourceUtil.java | 61 + .../iceberg/flink/source/TestSqlBase.java | 160 +++ .../flink/source/TestStreamScanSql.java | 434 ++++++ .../source/TestStreamingMonitorFunction.java | 402 ++++++ .../source/TestStreamingReaderOperator.java | 293 +++++ .../assigner/SplitAssignerTestBase.java | 132 ++ .../assigner/TestDefaultSplitAssigner.java | 43 + ...tFileSequenceNumberBasedSplitAssigner.java | 81 ++ .../TestWatermarkBasedSplitAssigner.java | 146 ++ .../ManualContinuousSplitPlanner.java | 97 ++ .../TestContinuousIcebergEnumerator.java | 352 +++++ .../TestContinuousSplitPlannerImpl.java | 692 ++++++++++ ...ntinuousSplitPlannerImplStartStrategy.java | 200 +++ .../enumerator/TestEnumerationHistory.java | 135 ++ .../TestIcebergEnumeratorStateSerializer.java | 146 ++ .../source/reader/ReaderFunctionTestBase.java | 218 +++ .../flink/source/reader/ReaderUtil.java | 128 ++ .../source/reader/TestArrayBatchRecords.java | 69 + ...stArrayPoolDataIteratorBatcherRowData.java | 360 +++++ .../TestColumnStatsWatermarkExtractor.java | 176 +++ .../reader/TestIcebergSourceReader.java | 181 +++ .../reader/TestLimitableDataIterator.java | 84 ++ .../reader/TestRowDataReaderFunction.java | 69 + .../source/reader/TestingMetricGroup.java | 102 ++ .../TestIcebergSourceSplitSerializer.java | 183 +++ .../iceberg/flink/util/TestFlinkPackage.java | 55 + .../org.apache.flink.table.factories.Factory | 16 + 325 files changed, 57252 insertions(+) create mode 100644 flink/v1.19/build.gradle create mode 100644 flink/v1.19/flink-runtime/LICENSE create mode 100644 flink/v1.19/flink-runtime/NOTICE create mode 100644 flink/v1.19/flink-runtime/src/integration/java/org/apache/iceberg/flink/IcebergConnectorSmokeTest.java create mode 100644 flink/v1.19/flink/src/jmh/java/org/apache/iceberg/flink/sink/shuffle/MapRangePartitionerBenchmark.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/CatalogLoader.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalog.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalogFactory.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkConfParser.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkConfigOptions.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkDynamicTableFactory.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkEnvironmentContext.java create mode 100644 
flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkFilters.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkFixupTypes.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkReadConf.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkReadOptions.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkSchemaUtil.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkSourceFilter.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeToType.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeVisitor.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteConf.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteOptions.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/IcebergTableSink.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/RowDataWrapper.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/TableLoader.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/TypeToFlinkType.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/actions/Actions.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/actions/RewriteDataFilesAction.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/AvroWithFlinkSchemaVisitor.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkAvroReader.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkAvroWriter.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReaders.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriter.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriters.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetReaders.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetWriters.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkSchemaVisitor.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueReaders.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueWriters.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/ParquetWithFlinkSchemaVisitor.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/RowDataProjection.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/StructRowData.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/MonitorSource.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/SingleThreadedIteratorSource.java create mode 100644 
flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TableChange.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/AvroGenericRecordToRowDataMapper.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/BaseDeltaTaskWriter.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitionKeySelector.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitioner.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitionerUtil.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/CachingTableSupplier.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/CommitSummary.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifests.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifestsSerializer.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/EqualityFieldKeySelector.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkAppenderFactory.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkFileWriterFactory.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkManifestUtil.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkSink.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitter.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitterMetrics.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriter.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriterMetrics.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/ManifestOutputFileFactory.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionKeySelector.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionedDeltaWriter.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/RowDataTaskWriterFactory.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/TaskWriterFactory.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/UnpartitionedDeltaWriter.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/AggregatedStatisticsTracker.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/CompletedStatistics.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/CompletedStatisticsSerializer.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatistics.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsCoordinator.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsCoordinatorProvider.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsOperator.java create mode 100644 
flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsSerializer.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/GlobalStatistics.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/GlobalStatisticsSerializer.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/KeyAssignment.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapAssignment.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapDataStatistics.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapRangePartitioner.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/RangePartitioner.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/RequestGlobalStatisticsEvent.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchDataStatistics.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchRangePartitioner.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchUtil.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeySerializer.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeySketchSerializer.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeyUtil.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsEvent.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsOrRecord.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsOrRecordSerializer.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsType.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsUtil.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/AvroGenericRecordFileScanTaskReader.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/DataIterator.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/DataTaskReader.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/FileScanTaskReader.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputFormat.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputSplit.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSource.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSplitPlanner.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/IcebergSource.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/IcebergTableSource.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/RowDataFileScanTaskReader.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/RowDataRewriter.java create mode 100644 
flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/RowDataToAvroGenericRecordConverter.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/ScanContext.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/SourceUtil.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/StreamingMonitorFunction.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/StreamingReaderOperator.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/StreamingStartingStrategy.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/assigner/DefaultSplitAssigner.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/assigner/GetSplitResult.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/assigner/OrderedSplitAssignerFactory.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SimpleSplitAssignerFactory.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssigner.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssignerFactory.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssignerType.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/AbstractIcebergEnumerator.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousEnumerationResult.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousIcebergEnumerator.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousSplitPlanner.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousSplitPlannerImpl.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/EnumerationHistory.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorPosition.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorPositionSerializer.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorState.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorStateSerializer.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/StaticIcebergEnumerator.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayBatchRecords.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayPoolDataIteratorBatcher.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/AvroGenericRecordReaderFunction.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/ColumnStatsWatermarkExtractor.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorBatcher.java create mode 100644 
flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorReaderFunction.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceReader.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceReaderMetrics.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceSplitReader.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/LimitableDataIterator.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/ListBatchRecords.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/ListDataIteratorBatcher.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/MetaDataReaderFunction.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/ReaderFunction.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordAndPosition.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordFactory.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordLimiter.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataReaderFunction.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataRecordFactory.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/SerializableRecordEmitter.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/SplitWatermarkExtractor.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/WatermarkExtractorRecordEmitter.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplit.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitSerializer.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitState.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitStatus.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/SerializableComparator.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/SerializerHelper.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/SplitComparators.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/SplitRequestEvent.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/util/ElapsedTimeGauge.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/util/FlinkAlterTableUtil.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/util/FlinkCompatibilityUtil.java create mode 100644 flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/util/FlinkPackage.java create mode 100644 flink/v1.19/flink/src/main/resources/META-INF/services/org.apache.flink.table.factories.Factory create mode 100644 flink/v1.19/flink/src/main/resources/META-INF/services/org.apache.flink.table.factories.TableFactory create mode 
100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/AvroGenericRecordConverterBase.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/CatalogTestBase.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/DataGenerator.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/DataGenerators.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/HadoopCatalogExtension.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/HadoopTableExtension.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/MiniFlinkClusterExtension.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/RowDataConverter.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/SimpleDataUtil.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/SqlBase.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestBase.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestCatalogLoader.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestCatalogTableLoader.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestChangeLogTable.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestDataFileSerialization.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFixtures.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkAnonymousTable.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogDatabase.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogFactory.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTable.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTablePartitions.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkConfParser.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkFilters.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkHiveCatalog.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkSchemaUtil.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSink.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSinkExtended.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkUpsert.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestHelpers.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestIcebergConnector.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestManifestFileSerialization.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestRowDataWrapper.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestTableLoader.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestTableSerialization.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/actions/TestRewriteDataFilesAction.java create mode 
100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/RandomRowData.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/RowDataToRowMapper.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkAvroReaderWriter.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkOrcReaderWriter.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetReader.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetWriter.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/TestRowDataProjection.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/TestRowProjection.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/TestStructRowData.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/CollectingSink.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/FlinkSqlExtension.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/FlinkStreamingTestUtils.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/ManualSource.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/OperatorTestBase.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestMonitorSource.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestAvroGenericRecordToRowDataMapper.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionKeySelector.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitioner.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionerFlinkIcebergSink.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionerUtil.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestCachingTableSupplier.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestCompressionSettings.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestDeltaTaskWriter.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkAppenderFactory.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkFileWriterFactory.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSink.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkBase.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkBranch.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkDistributionMode.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkExtended.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2.java create mode 100644 
flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2Base.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2Branch.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkManifest.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPartitioningWriters.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPositionDeltaWriters.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkRollingFileWriters.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkWriterMetrics.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergFilesCommitter.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergStreamWriter.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestRowDataPartitionKey.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestTaskWriters.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/Fixtures.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestAggregatedStatisticsTracker.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestCompletedStatisticsSerializer.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsCoordinator.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsCoordinatorProvider.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsOperator.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsSerializer.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestGlobalStatisticsSerializer.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestMapDataStatistics.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestMapRangePartitioner.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestRangePartitioner.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchDataStatistics.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchRangePartitioner.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchUtil.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerBase.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerNestedStruct.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerPrimitives.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerSnapshot.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeyUtil.java create mode 100644 
flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTableFactory.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTestSource.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/ChangeLogTableTestBase.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/SplitHelpers.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/SqlHelpers.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TableSourceTestBase.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestBoundedTableFactory.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormat.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormatReaderDeletes.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkMergingMetrics.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkMetaDataTable.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkReaderDeletesBase.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScan.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScanSql.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSource.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSourceConfig.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSourceSql.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkTableSource.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBounded.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedGenericRecord.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedSql.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceContinuous.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceFailover.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceFailoverWithWatermarkExtractor.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceReaderDeletes.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceSql.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceWithWatermarkExtractor.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSpeculativeExecutionSupport.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestMetadataTableReadableMetrics.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestProjectMetaColumn.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestRowDataToAvroGenericRecordConverter.java create mode 100644 
flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestScanContext.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestSourceUtil.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestSqlBase.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamScanSql.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingMonitorFunction.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingReaderOperator.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/assigner/SplitAssignerTestBase.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestDefaultSplitAssigner.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestFileSequenceNumberBasedSplitAssigner.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestWatermarkBasedSplitAssigner.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/ManualContinuousSplitPlanner.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousIcebergEnumerator.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousSplitPlannerImpl.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousSplitPlannerImplStartStrategy.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestEnumerationHistory.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestIcebergEnumeratorStateSerializer.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderFunctionTestBase.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderUtil.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestArrayBatchRecords.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestArrayPoolDataIteratorBatcherRowData.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestColumnStatsWatermarkExtractor.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestIcebergSourceReader.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestLimitableDataIterator.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestRowDataReaderFunction.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestingMetricGroup.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/split/TestIcebergSourceSplitSerializer.java create mode 100644 flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/util/TestFlinkPackage.java create mode 100644 flink/v1.19/flink/src/test/resources/META-INF/services/org.apache.flink.table.factories.Factory diff --git a/flink/v1.19/build.gradle b/flink/v1.19/build.gradle new file mode 100644 index 000000000000..392a1cb124f0 --- /dev/null +++ b/flink/v1.19/build.gradle @@ -0,0 +1,268 @@ +/* + * Licensed to 
the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +String flinkMajorVersion = '1.19' +String scalaVersion = System.getProperty("scalaVersion") != null ? System.getProperty("scalaVersion") : System.getProperty("defaultScalaVersion") + +project(":iceberg-flink:iceberg-flink-${flinkMajorVersion}") { + + dependencies { + implementation project(path: ':iceberg-bundled-guava', configuration: 'shadow') + api project(':iceberg-api') + implementation project(':iceberg-common') + implementation project(':iceberg-core') + api project(':iceberg-data') + implementation project(':iceberg-orc') + implementation project(':iceberg-parquet') + implementation project(':iceberg-hive-metastore') + + compileOnly libs.flink119.avro + // for dropwizard histogram metrics implementation + compileOnly libs.flink119.metrics.dropwizard + compileOnly libs.flink119.streaming.java + compileOnly "${libs.flink119.streaming.java.get().module}:${libs.flink119.streaming.java.get().getVersion()}:tests" + compileOnly libs.flink119.table.api.java.bridge + compileOnly "org.apache.flink:flink-table-planner_${scalaVersion}:${libs.versions.flink119.get()}" + compileOnly libs.flink119.connector.base + compileOnly libs.flink119.connector.files + + compileOnly libs.hadoop2.hdfs + compileOnly libs.hadoop2.common + compileOnly(libs.hadoop2.minicluster) { + exclude group: 'org.apache.avro', module: 'avro' + } + + implementation(libs.parquet.avro) { + exclude group: 'org.apache.avro', module: 'avro' + // already shaded by Parquet + exclude group: 'it.unimi.dsi' + exclude group: 'org.codehaus.jackson' + } + + compileOnly libs.avro.avro + + implementation("${libs.orc.core.get().module}:${libs.versions.orc.get()}:nohive") { + exclude group: 'org.apache.hadoop' + exclude group: 'commons-lang' + // These artifacts are shaded and included in the orc-core fat jar + exclude group: 'com.google.protobuf', module: 'protobuf-java' + exclude group: 'org.apache.hive', module: 'hive-storage-api' + exclude group: 'org.slf4j' + } + + implementation libs.datasketches + + testImplementation libs.flink119.connector.test.utils + testImplementation libs.flink119.core + testImplementation libs.flink119.runtime + testImplementation(libs.flink119.test.utilsjunit) { + exclude group: 'junit' + } + testImplementation(libs.flink119.test.utils) { + exclude group: "org.apache.curator", module: 'curator-test' + exclude group: 'junit' + } + + testImplementation project(path: ':iceberg-hive-metastore', configuration: 'testArtifacts') + testImplementation project(path: ':iceberg-api', configuration: 'testArtifacts') + testImplementation project(path: ':iceberg-core', configuration: 'testArtifacts') + testImplementation project(path: ':iceberg-data', configuration: 'testArtifacts') + + // By default, hive-exec is a fat/uber jar and it 
exports a guava library + // that's really old. We use the core classifier to be able to override our guava + // version. Luckily, hive-exec seems to work okay so far with this version of guava + // See: https://github.com/apache/hive/blob/master/ql/pom.xml#L911 for more context. + testImplementation("${libs.hive2.exec.get().module}:${libs.hive2.exec.get().getVersion()}:core") { + exclude group: 'org.apache.avro', module: 'avro' + exclude group: 'org.slf4j', module: 'slf4j-log4j12' + exclude group: 'org.pentaho' // missing dependency + exclude group: 'org.apache.hive', module: 'hive-llap-tez' + exclude group: 'org.apache.logging.log4j' + exclude group: 'com.google.protobuf', module: 'protobuf-java' + exclude group: 'org.apache.calcite' + exclude group: 'org.apache.calcite.avatica' + exclude group: 'com.google.code.findbugs', module: 'jsr305' + } + + testImplementation(libs.hive2.metastore) { + exclude group: 'org.apache.avro', module: 'avro' + exclude group: 'org.slf4j', module: 'slf4j-log4j12' + exclude group: 'org.pentaho' // missing dependency + exclude group: 'org.apache.hbase' + exclude group: 'org.apache.logging.log4j' + exclude group: 'co.cask.tephra' + exclude group: 'com.google.code.findbugs', module: 'jsr305' + exclude group: 'org.eclipse.jetty.aggregate', module: 'jetty-all' + exclude group: 'org.eclipse.jetty.orbit', module: 'javax.servlet' + exclude group: 'org.apache.parquet', module: 'parquet-hadoop-bundle' + exclude group: 'com.tdunning', module: 'json' + exclude group: 'javax.transaction', module: 'transaction-api' + exclude group: 'com.zaxxer', module: 'HikariCP' + exclude group: 'org.slf4j' + } + + testImplementation libs.awaitility + testImplementation libs.assertj.core + } + + test { + useJUnitPlatform() + } +} + +project(":iceberg-flink:iceberg-flink-runtime-${flinkMajorVersion}") { + apply plugin: 'io.github.goooler.shadow' + + tasks.jar.dependsOn tasks.shadowJar + + sourceSets { + integration { + java.srcDir "$projectDir/src/integration/java" + resources.srcDir "$projectDir/src/integration/resources" + } + } + + configurations { + implementation { + // included in Flink + exclude group: 'org.slf4j' + exclude group: 'org.apache.commons' + exclude group: 'commons-pool' + exclude group: 'commons-codec' + exclude group: 'org.xerial.snappy' + exclude group: 'javax.xml.bind' + exclude group: 'javax.annotation' + } + } + + dependencies { + implementation(project(":iceberg-flink:iceberg-flink-${flinkMajorVersion}")) { + exclude group: 'org.apache.flink' + } + implementation project(':iceberg-aws') + implementation project(':iceberg-azure') + implementation(project(':iceberg-aliyun')) { + exclude group: 'edu.umd.cs.findbugs', module: 'findbugs' + exclude group: 'org.apache.httpcomponents', module: 'httpclient' + exclude group: 'commons-logging', module: 'commons-logging' + } + implementation project(':iceberg-gcp') + implementation(project(':iceberg-nessie')) { + exclude group: 'com.google.code.findbugs', module: 'jsr305' + } + + // for dropwizard histogram metrics implementation + implementation libs.flink119.metrics.dropwizard + + // for integration testing with the flink-runtime-jar + // all of those dependencies are required because the integration test extends FlinkTestBase + integrationCompileOnly project(':iceberg-api') + integrationImplementation libs.junit.vintage.engine + integrationImplementation libs.assertj.core + integrationImplementation project(path: ":iceberg-flink:iceberg-flink-${flinkMajorVersion}", configuration: "testArtifacts") + 
integrationImplementation project(path: ':iceberg-api', configuration: 'testArtifacts') + integrationImplementation project(path: ':iceberg-hive-metastore', configuration: 'testArtifacts') + integrationImplementation(libs.flink119.test.utils) { + exclude group: "org.apache.curator", module: 'curator-test' + exclude group: 'junit' + } + + integrationImplementation libs.flink119.table.api.java.bridge + integrationImplementation "org.apache.flink:flink-table-planner_${scalaVersion}:${libs.versions.flink119.get()}" + + integrationImplementation libs.hadoop2.common + integrationImplementation libs.hadoop2.hdfs + integrationImplementation(libs.hadoop2.minicluster) { + exclude group: 'org.apache.avro', module: 'avro' + } + + integrationImplementation(libs.hive2.metastore) { + exclude group: 'org.apache.avro', module: 'avro' + exclude group: 'org.slf4j', module: 'slf4j-log4j12' + exclude group: 'org.pentaho' // missing dependency + exclude group: 'org.apache.hbase' + exclude group: 'org.apache.logging.log4j' + exclude group: 'co.cask.tephra' + exclude group: 'com.google.code.findbugs', module: 'jsr305' + exclude group: 'org.eclipse.jetty.aggregate', module: 'jetty-all' + exclude group: 'org.eclipse.jetty.orbit', module: 'javax.servlet' + exclude group: 'org.apache.parquet', module: 'parquet-hadoop-bundle' + exclude group: 'com.tdunning', module: 'json' + exclude group: 'javax.transaction', module: 'transaction-api' + exclude group: 'com.zaxxer', module: 'HikariCP' + exclude group: 'org.slf4j' + } + + integrationImplementation("${libs.hive2.exec.get().module}:${libs.hive2.exec.get().getVersion()}:core") { + exclude group: 'org.apache.avro', module: 'avro' + exclude group: 'org.slf4j', module: 'slf4j-log4j12' + exclude group: 'org.pentaho' // missing dependency + exclude group: 'org.apache.hive', module: 'hive-llap-tez' + exclude group: 'org.apache.logging.log4j' + exclude group: 'com.google.protobuf', module: 'protobuf-java' + exclude group: 'org.apache.calcite' + exclude group: 'org.apache.calcite.avatica' + exclude group: 'com.google.code.findbugs', module: 'jsr305' + } + } + + shadowJar { + configurations = [project.configurations.runtimeClasspath] + + zip64 true + + // include the LICENSE and NOTICE files for the shaded Jar + from(projectDir) { + include 'LICENSE' + include 'NOTICE' + } + + // Relocate dependencies to avoid conflicts + relocate 'org.apache.avro', 'org.apache.iceberg.shaded.org.apache.avro' + relocate 'org.apache.parquet', 'org.apache.iceberg.shaded.org.apache.parquet' + relocate 'com.google.errorprone', 'org.apache.iceberg.shaded.com.google.errorprone' + relocate 'com.google.flatbuffers', 'org.apache.iceberg.shaded.com.google.flatbuffers' + relocate 'com.fasterxml', 'org.apache.iceberg.shaded.com.fasterxml' + relocate 'com.github.benmanes', 'org.apache.iceberg.shaded.com.github.benmanes' + relocate 'org.checkerframework', 'org.apache.iceberg.shaded.org.checkerframework' + relocate 'shaded.parquet', 'org.apache.iceberg.shaded.org.apache.parquet.shaded' + relocate 'org.apache.orc', 'org.apache.iceberg.shaded.org.apache.orc' + relocate 'io.airlift', 'org.apache.iceberg.shaded.io.airlift' + relocate 'org.threeten.extra', 'org.apache.iceberg.shaded.org.threeten.extra' + relocate 'org.apache.hc.client5', 'org.apache.iceberg.shaded.org.apache.hc.client5' + relocate 'org.apache.hc.core5', 'org.apache.iceberg.shaded.org.apache.hc.core5' + + archiveClassifier.set(null) + } + + task integrationTest(type: Test) { + description = "Test Flink Runtime Jar against Flink ${flinkMajorVersion}" + 
group = "verification" + jvmArgs += project.property('extraJvmArgs') + testClassesDirs = sourceSets.integration.output.classesDirs + classpath = sourceSets.integration.runtimeClasspath + files(shadowJar.archiveFile.get().asFile.path) + inputs.file(shadowJar.archiveFile.get().asFile.path) + } + integrationTest.dependsOn shadowJar + check.dependsOn integrationTest + + jar { + enabled = false + } +} diff --git a/flink/v1.19/flink-runtime/LICENSE b/flink/v1.19/flink-runtime/LICENSE new file mode 100644 index 000000000000..8ab53469eb87 --- /dev/null +++ b/flink/v1.19/flink-runtime/LICENSE @@ -0,0 +1,502 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. 
The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +-------------------------------------------------------------------------------- + +This binary artifact contains Apache Avro. + +Copyright: 2014-2020 The Apache Software Foundation. +Home page: https://parquet.apache.org/ +License: http://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + +This binary artifact contains the Jackson JSON processor. + +Copyright: 2007-2020 Tatu Saloranta and other contributors +Home page: http://jackson.codehaus.org/ +License: http://www.apache.org/licenses/LICENSE-2.0.txt + +-------------------------------------------------------------------------------- + +This binary artifact contains Apache Parquet. + +Copyright: 2014-2020 The Apache Software Foundation. +Home page: https://parquet.apache.org/ +License: http://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + +This binary artifact contains Apache Thrift. + +Copyright: 2006-2010 The Apache Software Foundation. +Home page: https://thrift.apache.org/ +License: http://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + +This binary artifact contains fastutil. + +Copyright: 2002-2014 Sebastiano Vigna +Home page: http://fastutil.di.unimi.it/ +License: http://www.apache.org/licenses/LICENSE-2.0.html + +-------------------------------------------------------------------------------- + +This binary artifact contains Apache ORC. + +Copyright: 2013-2020 The Apache Software Foundation. +Home page: https://orc.apache.org/ +License: http://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + +This binary artifact contains Apache Hive's storage API via ORC. + +Copyright: 2013-2020 The Apache Software Foundation. +Home page: https://hive.apache.org/ +License: http://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + +This binary artifact contains Airlift Aircompressor. + +Copyright: 2011-2020 Aircompressor authors. 
+Home page: https://github.com/airlift/aircompressor +License: http://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + +This binary artifact contains Airlift Slice. + +Copyright: 2013-2020 Slice authors. +Home page: https://github.com/airlift/slice +License: http://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + +This binary artifact contains JetBrains annotations. + +Copyright: 2000-2020 JetBrains s.r.o. +Home page: https://github.com/JetBrains/java-annotations +License: http://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + +This binary artifact contains Google Guava. + +Copyright: 2006-2020 The Guava Authors +Home page: https://github.com/google/guava +License: http://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + +This binary artifact contains Google Error Prone Annotations. + +Copyright: Copyright 2011-2019 The Error Prone Authors +Home page: https://github.com/google/error-prone +License: http://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + +This binary artifact contains checkerframework checker-qual Annotations. + +Copyright: 2004-2020 the Checker Framework developers +Home page: https://github.com/typetools/checker-framework +License: https://github.com/typetools/checker-framework/blob/master/LICENSE.txt (MIT license) + +License text: +| The annotations are licensed under the MIT License. (The text of this +| license appears below.) More specifically, all the parts of the Checker +| Framework that you might want to include with your own program use the +| MIT License. This is the checker-qual.jar file and all the files that +| appear in it: every file in a qual/ directory, plus utility files such +| as NullnessUtil.java, RegexUtil.java, SignednessUtil.java, etc. +| In addition, the cleanroom implementations of third-party annotations, +| which the Checker Framework recognizes as aliases for its own +| annotations, are licensed under the MIT License. +| +| Permission is hereby granted, free of charge, to any person obtaining a copy +| of this software and associated documentation files (the "Software"), to deal +| in the Software without restriction, including without limitation the rights +| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +| copies of the Software, and to permit persons to whom the Software is +| furnished to do so, subject to the following conditions: +| +| The above copyright notice and this permission notice shall be included in +| all copies or substantial portions of the Software. +| +| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +| THE SOFTWARE. + +-------------------------------------------------------------------------------- + +This binary artifact contains Animal Sniffer Annotations. 
+ +Copyright: 2009-2018 codehaus.org +Home page: https://www.mojohaus.org/animal-sniffer/animal-sniffer-annotations/ +License: https://www.mojohaus.org/animal-sniffer/animal-sniffer-annotations/license.html (MIT license) + +License text: +| The MIT License +| +| Copyright (c) 2009 codehaus.org. +| +| Permission is hereby granted, free of charge, to any person obtaining a copy +| of this software and associated documentation files (the "Software"), to deal +| in the Software without restriction, including without limitation the rights +| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +| copies of the Software, and to permit persons to whom the Software is +| furnished to do so, subject to the following conditions: +| +| The above copyright notice and this permission notice shall be included in +| all copies or substantial portions of the Software. +| +| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +| THE SOFTWARE. + +-------------------------------------------------------------------------------- + +This binary artifact contains Caffeine by Ben Manes. + +Copyright: 2014-2020 Ben Manes and contributors +Home page: https://github.com/ben-manes/caffeine +License: http://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + +This binary artifact contains Apache Yetus audience annotations. + +Copyright: 2008-2020 The Apache Software Foundation. +Home page: https://yetus.apache.org/ +License: http://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + +This binary artifact contains Google protobuf. + +Copyright: 2008 Google Inc. +Home page: https://developers.google.com/protocol-buffers +License: https://github.com/protocolbuffers/protobuf/blob/master/LICENSE (BSD) + +License text: + +| Copyright 2008 Google Inc. All rights reserved. +| +| Redistribution and use in source and binary forms, with or without +| modification, are permitted provided that the following conditions are +| met: +| +| * Redistributions of source code must retain the above copyright +| notice, this list of conditions and the following disclaimer. +| * Redistributions in binary form must reproduce the above +| copyright notice, this list of conditions and the following disclaimer +| in the documentation and/or other materials provided with the +| distribution. +| * Neither the name of Google Inc. nor the names of its +| contributors may be used to endorse or promote products derived from +| this software without specific prior written permission. +| +| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +| "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +| A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +| OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +| SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +| LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +| DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +| THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +| (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +| +| Code generated by the Protocol Buffer compiler is owned by the owner +| of the input file used when generating it. This code is not +| standalone and requires a support library to be linked with it. This +| support library is itself covered by the above license. + +-------------------------------------------------------------------------------- + +This binary artifact contains ThreeTen. + +Copyright: 2007-present, Stephen Colebourne & Michael Nascimento Santos. +Home page: https://www.threeten.org/threeten-extra/ +License: https://github.com/ThreeTen/threeten-extra/blob/master/LICENSE.txt (BSD 3-clause) + +License text: + +| All rights reserved. +| +| * Redistribution and use in source and binary forms, with or without +| modification, are permitted provided that the following conditions are met: +| +| * Redistributions of source code must retain the above copyright notice, +| this list of conditions and the following disclaimer. +| +| * Redistributions in binary form must reproduce the above copyright notice, +| this list of conditions and the following disclaimer in the documentation +| and/or other materials provided with the distribution. +| +| * Neither the name of JSR-310 nor the names of its contributors +| may be used to endorse or promote products derived from this software +| without specific prior written permission. +| +| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +| "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +| CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +| EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +| PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +| PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +| LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +| NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +| SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +-------------------------------------------------------------------------------- + +This binary artifact includes Project Nessie with the following in its NOTICE +file: + +| Dremio +| Copyright 2015-2017 Dremio Corporation +| +| This product includes software developed at +| The Apache Software Foundation (http://www.apache.org/). + +-------------------------------------------------------------------------------- + +This binary includes code from Apache Commons. + +* Core ArrayUtil. + +Copyright: 2020 The Apache Software Foundation +Home page: https://commons.apache.org/ +License: https://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + +This binary artifact contains Apache HttpComponents Client. + +Copyright: 1999-2022 The Apache Software Foundation. 
+Home page: https://hc.apache.org/ +License: http://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + +This product includes code from Apache HttpComponents Client. + +* retry and error handling logic in ExponentialHttpRequestRetryStrategy.java + +Copyright: 1999-2022 The Apache Software Foundation. +Home page: https://hc.apache.org/ +License: https://www.apache.org/licenses/LICENSE-2.0 diff --git a/flink/v1.19/flink-runtime/NOTICE b/flink/v1.19/flink-runtime/NOTICE new file mode 100644 index 000000000000..dc36f84c4ac5 --- /dev/null +++ b/flink/v1.19/flink-runtime/NOTICE @@ -0,0 +1,91 @@ + +Apache Iceberg +Copyright 2017-2024 The Apache Software Foundation + +This product includes software developed at +The Apache Software Foundation (http://www.apache.org/). + +-------------------------------------------------------------------------------- + +This binary artifact includes Apache ORC with the following in its NOTICE file: + +| Apache ORC +| Copyright 2013-2019 The Apache Software Foundation +| +| This product includes software developed by The Apache Software +| Foundation (http://www.apache.org/). +| +| This product includes software developed by Hewlett-Packard: +| (c) Copyright [2014-2015] Hewlett-Packard Development Company, L.P + +-------------------------------------------------------------------------------- + +This binary artifact includes Airlift Aircompressor with the following in its +NOTICE file: + +| Snappy Copyright Notices +| ========================= +| +| * Copyright 2011 Dain Sundstrom +| * Copyright 2011, Google Inc. +| +| +| Snappy License +| =============== +| Copyright 2011, Google Inc. +| All rights reserved. +| +| Redistribution and use in source and binary forms, with or without +| modification, are permitted provided that the following conditions are +| met: +| +| * Redistributions of source code must retain the above copyright +| notice, this list of conditions and the following disclaimer. +| * Redistributions in binary form must reproduce the above +| copyright notice, this list of conditions and the following disclaimer +| in the documentation and/or other materials provided with the +| distribution. +| * Neither the name of Google Inc. nor the names of its +| contributors may be used to endorse or promote products derived from +| this software without specific prior written permission. +| +| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +| "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +| OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +| SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +| LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +| DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +| THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +| (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +-------------------------------------------------------------------------------- + +This binary artifact includes Apache Yetus with the following in its NOTICE +file: + +| Apache Yetus +| Copyright 2008-2020 The Apache Software Foundation +| +| This product includes software developed at +| The Apache Software Foundation (https://www.apache.org/). +| +| --- +| Additional licenses for the Apache Yetus Source/Website: +| --- +| +| +| See LICENSE for terms. + +-------------------------------------------------------------------------------- + +This binary artifact includes Project Nessie with the following in its NOTICE +file: + +| Dremio +| Copyright 2015-2017 Dremio Corporation +| +| This product includes software developed at +| The Apache Software Foundation (http://www.apache.org/). diff --git a/flink/v1.19/flink-runtime/src/integration/java/org/apache/iceberg/flink/IcebergConnectorSmokeTest.java b/flink/v1.19/flink-runtime/src/integration/java/org/apache/iceberg/flink/IcebergConnectorSmokeTest.java new file mode 100644 index 000000000000..0d9bbf9d3601 --- /dev/null +++ b/flink/v1.19/flink-runtime/src/integration/java/org/apache/iceberg/flink/IcebergConnectorSmokeTest.java @@ -0,0 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink; + +public class IcebergConnectorSmokeTest extends TestIcebergConnector {} diff --git a/flink/v1.19/flink/src/jmh/java/org/apache/iceberg/flink/sink/shuffle/MapRangePartitionerBenchmark.java b/flink/v1.19/flink/src/jmh/java/org/apache/iceberg/flink/sink/shuffle/MapRangePartitionerBenchmark.java new file mode 100644 index 000000000000..c3917165753d --- /dev/null +++ b/flink/v1.19/flink/src/jmh/java/org/apache/iceberg/flink/sink/shuffle/MapRangePartitionerBenchmark.java @@ -0,0 +1,199 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink.shuffle; + +import java.util.List; +import java.util.Map; +import java.util.NavigableMap; +import java.util.concurrent.ThreadLocalRandom; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SortKey; +import org.apache.iceberg.SortOrder; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.types.Types; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.TearDown; +import org.openjdk.jmh.annotations.Threads; +import org.openjdk.jmh.annotations.Warmup; +import org.openjdk.jmh.infra.Blackhole; + +@Fork(1) +@State(Scope.Benchmark) +@Warmup(iterations = 3) +@Measurement(iterations = 5) +@BenchmarkMode(Mode.SingleShotTime) +public class MapRangePartitionerBenchmark { + private static final String CHARS = + "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_-.!?"; + private static final int SAMPLE_SIZE = 100_000; + private static final Schema SCHEMA = + new Schema( + Types.NestedField.required(1, "id", Types.IntegerType.get()), + Types.NestedField.required(2, "name2", Types.StringType.get()), + Types.NestedField.required(3, "name3", Types.StringType.get()), + Types.NestedField.required(4, "name4", Types.StringType.get()), + Types.NestedField.required(5, "name5", Types.StringType.get()), + Types.NestedField.required(6, "name6", Types.StringType.get()), + Types.NestedField.required(7, "name7", Types.StringType.get()), + Types.NestedField.required(8, "name8", Types.StringType.get()), + Types.NestedField.required(9, "name9", Types.StringType.get())); + + private static final SortOrder SORT_ORDER = SortOrder.builderFor(SCHEMA).asc("id").build(); + private static final SortKey SORT_KEY = new SortKey(SCHEMA, SORT_ORDER); + + private MapRangePartitioner partitioner; + private RowData[] rows; + + @Setup + public void setupBenchmark() { + NavigableMap weights = longTailDistribution(100_000, 24, 240, 100, 2.0); + Map mapStatistics = Maps.newHashMapWithExpectedSize(weights.size()); + weights.forEach( + (id, weight) -> { + SortKey sortKey = SORT_KEY.copy(); + sortKey.set(0, id); + mapStatistics.put(sortKey, weight); + }); + + MapDataStatistics dataStatistics = new MapDataStatistics(mapStatistics); + this.partitioner = + new MapRangePartitioner( + SCHEMA, SortOrder.builderFor(SCHEMA).asc("id").build(), dataStatistics, 2); + + List keys = Lists.newArrayList(weights.keySet().iterator()); + long[] weightsCDF = new long[keys.size()]; + long totalWeight = 0; + for (int i = 0; i < keys.size(); ++i) { + totalWeight += weights.get(keys.get(i)); + weightsCDF[i] = totalWeight; + } + + // pre-calculate the samples for benchmark run + this.rows = new GenericRowData[SAMPLE_SIZE]; + for (int i = 0; i < SAMPLE_SIZE; ++i) { + long weight = ThreadLocalRandom.current().nextLong(totalWeight); + int index = binarySearchIndex(weightsCDF, weight); + rows[i] = + GenericRowData.of( + keys.get(index), + randomString("name2-"), + randomString("name3-"), 
+ randomString("name4-"), + randomString("name5-"), + randomString("name6-"), + randomString("name7-"), + randomString("name8-"), + randomString("name9-")); + } + } + + @TearDown + public void tearDownBenchmark() {} + + @Benchmark + @Threads(1) + public void testPartitionerLongTailDistribution(Blackhole blackhole) { + for (int i = 0; i < SAMPLE_SIZE; ++i) { + blackhole.consume(partitioner.partition(rows[i], 128)); + } + } + + private static String randomString(String prefix) { + int length = ThreadLocalRandom.current().nextInt(200); + byte[] buffer = new byte[length]; + + for (int i = 0; i < length; i += 1) { + buffer[i] = (byte) CHARS.charAt(ThreadLocalRandom.current().nextInt(CHARS.length())); + } + + return prefix + new String(buffer); + } + + /** find the index where weightsUDF[index] < weight && weightsUDF[index+1] >= weight */ + private static int binarySearchIndex(long[] weightsUDF, long target) { + Preconditions.checkArgument( + target < weightsUDF[weightsUDF.length - 1], + "weight is out of range: total weight = %s, search target = %s", + weightsUDF[weightsUDF.length - 1], + target); + int start = 0; + int end = weightsUDF.length - 1; + while (start < end) { + int mid = (start + end) / 2; + if (weightsUDF[mid] < target && weightsUDF[mid + 1] >= target) { + return mid; + } + + if (weightsUDF[mid] >= target) { + end = mid - 1; + } else if (weightsUDF[mid + 1] < target) { + start = mid + 1; + } + } + return start; + } + + /** Key is the id string and value is the weight in long value. */ + private static NavigableMap longTailDistribution( + long startingWeight, + int longTailStartingIndex, + int longTailLength, + long longTailBaseWeight, + double weightRandomJitterPercentage) { + + NavigableMap weights = Maps.newTreeMap(); + + // first part just decays the weight by half + long currentWeight = startingWeight; + for (int index = 0; index < longTailStartingIndex; ++index) { + double jitter = ThreadLocalRandom.current().nextDouble(weightRandomJitterPercentage / 100); + long weight = (long) (currentWeight * (1.0 + jitter)); + weight = weight > 0 ? weight : 1; + weights.put(index, weight); + if (currentWeight > longTailBaseWeight) { + currentWeight = currentWeight / 2; + } + } + + // long tail part + for (int index = longTailStartingIndex; + index < longTailStartingIndex + longTailLength; + ++index) { + long longTailWeight = + (long) + (longTailBaseWeight + * ThreadLocalRandom.current().nextDouble(weightRandomJitterPercentage)); + longTailWeight = longTailWeight > 0 ? longTailWeight : 1; + weights.put(index, longTailWeight); + } + + return weights; + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/CatalogLoader.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/CatalogLoader.java new file mode 100644 index 000000000000..18473bf4f190 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/CatalogLoader.java @@ -0,0 +1,215 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink; + +import java.io.Serializable; +import java.util.Map; +import org.apache.hadoop.conf.Configuration; +import org.apache.iceberg.CatalogProperties; +import org.apache.iceberg.CatalogUtil; +import org.apache.iceberg.catalog.Catalog; +import org.apache.iceberg.hadoop.HadoopCatalog; +import org.apache.iceberg.hadoop.SerializableConfiguration; +import org.apache.iceberg.hive.HiveCatalog; +import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.rest.RESTCatalog; + +/** Serializable loader to load an Iceberg {@link Catalog}. */ +public interface CatalogLoader extends Serializable, Cloneable { + + /** + * Create a new catalog with the provided properties. NOTICE: for flink, we may initialize the + * {@link CatalogLoader} at flink sql client side or job manager side, and then serialize this + * catalog loader to task manager, finally deserialize it and create a new catalog at task manager + * side. + * + * @return a newly created {@link Catalog} + */ + Catalog loadCatalog(); + + /** Clone a CatalogLoader. */ + @SuppressWarnings({"checkstyle:NoClone", "checkstyle:SuperClone"}) + CatalogLoader clone(); + + static CatalogLoader hadoop( + String name, Configuration hadoopConf, Map properties) { + return new HadoopCatalogLoader(name, hadoopConf, properties); + } + + static CatalogLoader hive(String name, Configuration hadoopConf, Map properties) { + return new HiveCatalogLoader(name, hadoopConf, properties); + } + + static CatalogLoader rest(String name, Configuration hadoopConf, Map properties) { + return new RESTCatalogLoader(name, hadoopConf, properties); + } + + static CatalogLoader custom( + String name, Map properties, Configuration hadoopConf, String impl) { + return new CustomCatalogLoader(name, properties, hadoopConf, impl); + } + + class HadoopCatalogLoader implements CatalogLoader { + private final String catalogName; + private final SerializableConfiguration hadoopConf; + private final String warehouseLocation; + private final Map properties; + + private HadoopCatalogLoader( + String catalogName, Configuration conf, Map properties) { + this.catalogName = catalogName; + this.hadoopConf = new SerializableConfiguration(conf); + this.warehouseLocation = properties.get(CatalogProperties.WAREHOUSE_LOCATION); + this.properties = Maps.newHashMap(properties); + } + + @Override + public Catalog loadCatalog() { + return CatalogUtil.loadCatalog( + HadoopCatalog.class.getName(), catalogName, properties, hadoopConf.get()); + } + + @Override + @SuppressWarnings({"checkstyle:NoClone", "checkstyle:SuperClone"}) + public CatalogLoader clone() { + return new HadoopCatalogLoader(catalogName, new Configuration(hadoopConf.get()), properties); + } + + @Override + public String toString() { + return MoreObjects.toStringHelper(this) + .add("catalogName", catalogName) + .add("warehouseLocation", warehouseLocation) + .toString(); + } + } + + class HiveCatalogLoader implements 
CatalogLoader { + private final String catalogName; + private final SerializableConfiguration hadoopConf; + private final String uri; + private final String warehouse; + private final int clientPoolSize; + private final Map properties; + + private HiveCatalogLoader( + String catalogName, Configuration conf, Map properties) { + this.catalogName = catalogName; + this.hadoopConf = new SerializableConfiguration(conf); + this.uri = properties.get(CatalogProperties.URI); + this.warehouse = properties.get(CatalogProperties.WAREHOUSE_LOCATION); + this.clientPoolSize = + properties.containsKey(CatalogProperties.CLIENT_POOL_SIZE) + ? Integer.parseInt(properties.get(CatalogProperties.CLIENT_POOL_SIZE)) + : CatalogProperties.CLIENT_POOL_SIZE_DEFAULT; + this.properties = Maps.newHashMap(properties); + } + + @Override + public Catalog loadCatalog() { + return CatalogUtil.loadCatalog( + HiveCatalog.class.getName(), catalogName, properties, hadoopConf.get()); + } + + @Override + @SuppressWarnings({"checkstyle:NoClone", "checkstyle:SuperClone"}) + public CatalogLoader clone() { + return new HiveCatalogLoader(catalogName, new Configuration(hadoopConf.get()), properties); + } + + @Override + public String toString() { + return MoreObjects.toStringHelper(this) + .add("catalogName", catalogName) + .add("uri", uri) + .add("warehouse", warehouse) + .add("clientPoolSize", clientPoolSize) + .toString(); + } + } + + class RESTCatalogLoader implements CatalogLoader { + private final String catalogName; + private final SerializableConfiguration hadoopConf; + private final Map properties; + + private RESTCatalogLoader( + String catalogName, Configuration conf, Map properties) { + this.catalogName = catalogName; + this.hadoopConf = new SerializableConfiguration(conf); + this.properties = Maps.newHashMap(properties); + } + + @Override + public Catalog loadCatalog() { + return CatalogUtil.loadCatalog( + RESTCatalog.class.getName(), catalogName, properties, hadoopConf.get()); + } + + @Override + @SuppressWarnings({"checkstyle:NoClone", "checkstyle:SuperClone"}) + public CatalogLoader clone() { + return new RESTCatalogLoader(catalogName, new Configuration(hadoopConf.get()), properties); + } + + @Override + public String toString() { + return MoreObjects.toStringHelper(this) + .add("catalogName", catalogName) + .add("properties", properties) + .toString(); + } + } + + class CustomCatalogLoader implements CatalogLoader { + + private final SerializableConfiguration hadoopConf; + private final Map properties; + private final String name; + private final String impl; + + private CustomCatalogLoader( + String name, Map properties, Configuration conf, String impl) { + this.hadoopConf = new SerializableConfiguration(conf); + this.properties = Maps.newHashMap(properties); // wrap into a hashmap for serialization + this.name = name; + this.impl = + Preconditions.checkNotNull( + impl, "Cannot initialize custom Catalog, impl class name is null"); + } + + @Override + public Catalog loadCatalog() { + return CatalogUtil.loadCatalog(impl, name, properties, hadoopConf.get()); + } + + @Override + @SuppressWarnings({"checkstyle:NoClone", "checkstyle:SuperClone"}) + public CatalogLoader clone() { + return new CustomCatalogLoader(name, properties, new Configuration(hadoopConf.get()), impl); + } + + @Override + public String toString() { + return MoreObjects.toStringHelper(this).add("name", name).add("impl", impl).toString(); + } + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalog.java 
b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalog.java new file mode 100644 index 000000000000..86295d78cc13 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalog.java @@ -0,0 +1,833 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink; + +import java.io.Closeable; +import java.io.IOException; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Optional; +import java.util.Set; +import java.util.stream.Collectors; +import org.apache.flink.table.api.TableSchema; +import org.apache.flink.table.catalog.AbstractCatalog; +import org.apache.flink.table.catalog.CatalogBaseTable; +import org.apache.flink.table.catalog.CatalogDatabase; +import org.apache.flink.table.catalog.CatalogDatabaseImpl; +import org.apache.flink.table.catalog.CatalogFunction; +import org.apache.flink.table.catalog.CatalogPartition; +import org.apache.flink.table.catalog.CatalogPartitionSpec; +import org.apache.flink.table.catalog.CatalogTable; +import org.apache.flink.table.catalog.CatalogTableImpl; +import org.apache.flink.table.catalog.ObjectPath; +import org.apache.flink.table.catalog.ResolvedCatalogTable; +import org.apache.flink.table.catalog.TableChange; +import org.apache.flink.table.catalog.exceptions.CatalogException; +import org.apache.flink.table.catalog.exceptions.DatabaseAlreadyExistException; +import org.apache.flink.table.catalog.exceptions.DatabaseNotEmptyException; +import org.apache.flink.table.catalog.exceptions.DatabaseNotExistException; +import org.apache.flink.table.catalog.exceptions.FunctionNotExistException; +import org.apache.flink.table.catalog.exceptions.TableAlreadyExistException; +import org.apache.flink.table.catalog.exceptions.TableNotExistException; +import org.apache.flink.table.catalog.exceptions.TableNotPartitionedException; +import org.apache.flink.table.catalog.stats.CatalogColumnStatistics; +import org.apache.flink.table.catalog.stats.CatalogTableStatistics; +import org.apache.flink.table.expressions.Expression; +import org.apache.flink.table.factories.Factory; +import org.apache.flink.util.StringUtils; +import org.apache.iceberg.CachingCatalog; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.FileScanTask; +import org.apache.iceberg.MetadataTableType; +import org.apache.iceberg.PartitionField; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.StructLike; +import org.apache.iceberg.Table; +import org.apache.iceberg.catalog.Catalog; +import org.apache.iceberg.catalog.Namespace; +import org.apache.iceberg.catalog.SupportsNamespaces; +import org.apache.iceberg.catalog.TableIdentifier; +import 
org.apache.iceberg.exceptions.AlreadyExistsException; +import org.apache.iceberg.exceptions.NamespaceNotEmptyException; +import org.apache.iceberg.exceptions.NoSuchNamespaceException; +import org.apache.iceberg.flink.util.FlinkAlterTableUtil; +import org.apache.iceberg.flink.util.FlinkCompatibilityUtil; +import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.base.Splitter; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.relocated.com.google.common.collect.Sets; + +/** + * A Flink Catalog implementation that wraps an Iceberg {@link Catalog}. + * + *

The mapping between Flink databases and Iceberg namespaces: a base namespace is supplied for a + * given catalog, so if the catalog supports a two-level namespace, the first level is supplied in + * the catalog configuration and the second level is exposed as Flink + * databases. + * + *
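
For illustration (a sketch, not content of this patch), this is how the base-namespace mapping described above is typically exercised from the Table API; the catalog name, warehouse path, and the first-level namespace "level1" are assumptions:

    import org.apache.flink.table.api.EnvironmentSettings;
    import org.apache.flink.table.api.TableEnvironment;

    public class BaseNamespaceSketch {
      public static void main(String[] args) {
        TableEnvironment tEnv = TableEnvironment.create(EnvironmentSettings.inBatchMode());
        // Hypothetical Hadoop catalog; 'base-namespace' fixes the first Iceberg namespace level,
        // so each second-level namespace under it is listed as a Flink database.
        tEnv.executeSql(
            "CREATE CATALOG iceberg_cat WITH ("
                + "'type'='iceberg',"
                + "'catalog-type'='hadoop',"
                + "'warehouse'='file:///tmp/iceberg-warehouse',"
                + "'base-namespace'='level1')");
        tEnv.executeSql("USE CATALOG iceberg_cat");
        tEnv.executeSql("SHOW DATABASES").print(); // Iceberg namespaces level1.<db> appear as <db>
      }
    }
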

The Iceberg table manages its partitions by itself. The partition of the Iceberg table is + * independent of the partition of Flink. + */ +public class FlinkCatalog extends AbstractCatalog { + private final CatalogLoader catalogLoader; + private final Catalog icebergCatalog; + private final Namespace baseNamespace; + private final SupportsNamespaces asNamespaceCatalog; + private final Closeable closeable; + private final boolean cacheEnabled; + + public FlinkCatalog( + String catalogName, + String defaultDatabase, + Namespace baseNamespace, + CatalogLoader catalogLoader, + boolean cacheEnabled, + long cacheExpirationIntervalMs) { + super(catalogName, defaultDatabase); + this.catalogLoader = catalogLoader; + this.baseNamespace = baseNamespace; + this.cacheEnabled = cacheEnabled; + + Catalog originalCatalog = catalogLoader.loadCatalog(); + icebergCatalog = + cacheEnabled + ? CachingCatalog.wrap(originalCatalog, cacheExpirationIntervalMs) + : originalCatalog; + asNamespaceCatalog = + originalCatalog instanceof SupportsNamespaces ? (SupportsNamespaces) originalCatalog : null; + closeable = originalCatalog instanceof Closeable ? (Closeable) originalCatalog : null; + + FlinkEnvironmentContext.init(); + } + + @Override + public void open() throws CatalogException {} + + @Override + public void close() throws CatalogException { + if (closeable != null) { + try { + closeable.close(); + } catch (IOException e) { + throw new CatalogException(e); + } + } + } + + public Catalog catalog() { + return icebergCatalog; + } + + /** Append a new level to the base namespace */ + private static Namespace appendLevel(Namespace baseNamespace, String newLevel) { + String[] namespace = new String[baseNamespace.levels().length + 1]; + System.arraycopy(baseNamespace.levels(), 0, namespace, 0, baseNamespace.levels().length); + namespace[baseNamespace.levels().length] = newLevel; + return Namespace.of(namespace); + } + + TableIdentifier toIdentifier(ObjectPath path) { + String objectName = path.getObjectName(); + List tableName = Splitter.on('$').splitToList(objectName); + + if (tableName.size() == 1) { + return TableIdentifier.of( + appendLevel(baseNamespace, path.getDatabaseName()), path.getObjectName()); + } else if (tableName.size() == 2 && MetadataTableType.from(tableName.get(1)) != null) { + return TableIdentifier.of( + appendLevel(appendLevel(baseNamespace, path.getDatabaseName()), tableName.get(0)), + tableName.get(1)); + } else { + throw new IllegalArgumentException("Illegal table name:" + objectName); + } + } + + @Override + public List listDatabases() throws CatalogException { + if (asNamespaceCatalog == null) { + return Collections.singletonList(getDefaultDatabase()); + } + + return asNamespaceCatalog.listNamespaces(baseNamespace).stream() + .map(n -> n.level(n.levels().length - 1)) + .collect(Collectors.toList()); + } + + @Override + public CatalogDatabase getDatabase(String databaseName) + throws DatabaseNotExistException, CatalogException { + if (asNamespaceCatalog == null) { + if (!getDefaultDatabase().equals(databaseName)) { + throw new DatabaseNotExistException(getName(), databaseName); + } else { + return new CatalogDatabaseImpl(Maps.newHashMap(), ""); + } + } else { + try { + Map metadata = + Maps.newHashMap( + asNamespaceCatalog.loadNamespaceMetadata(appendLevel(baseNamespace, databaseName))); + String comment = metadata.remove("comment"); + return new CatalogDatabaseImpl(metadata, comment); + } catch (NoSuchNamespaceException e) { + throw new DatabaseNotExistException(getName(), databaseName, e); 
+ } + } + } + + @Override + public boolean databaseExists(String databaseName) throws CatalogException { + try { + getDatabase(databaseName); + return true; + } catch (DatabaseNotExistException ignore) { + return false; + } + } + + @Override + public void createDatabase(String name, CatalogDatabase database, boolean ignoreIfExists) + throws DatabaseAlreadyExistException, CatalogException { + createDatabase( + name, mergeComment(database.getProperties(), database.getComment()), ignoreIfExists); + } + + private void createDatabase( + String databaseName, Map metadata, boolean ignoreIfExists) + throws DatabaseAlreadyExistException, CatalogException { + if (asNamespaceCatalog != null) { + try { + asNamespaceCatalog.createNamespace(appendLevel(baseNamespace, databaseName), metadata); + } catch (AlreadyExistsException e) { + if (!ignoreIfExists) { + throw new DatabaseAlreadyExistException(getName(), databaseName, e); + } + } + } else { + throw new UnsupportedOperationException( + "Namespaces are not supported by catalog: " + getName()); + } + } + + private Map mergeComment(Map metadata, String comment) { + Map ret = Maps.newHashMap(metadata); + if (metadata.containsKey("comment")) { + throw new CatalogException("Database properties should not contain key: 'comment'."); + } + + if (!StringUtils.isNullOrWhitespaceOnly(comment)) { + ret.put("comment", comment); + } + return ret; + } + + @Override + public void dropDatabase(String name, boolean ignoreIfNotExists, boolean cascade) + throws DatabaseNotExistException, DatabaseNotEmptyException, CatalogException { + if (asNamespaceCatalog != null) { + try { + boolean success = asNamespaceCatalog.dropNamespace(appendLevel(baseNamespace, name)); + if (!success && !ignoreIfNotExists) { + throw new DatabaseNotExistException(getName(), name); + } + } catch (NoSuchNamespaceException e) { + if (!ignoreIfNotExists) { + throw new DatabaseNotExistException(getName(), name, e); + } + } catch (NamespaceNotEmptyException e) { + throw new DatabaseNotEmptyException(getName(), name, e); + } + } else { + if (!ignoreIfNotExists) { + throw new DatabaseNotExistException(getName(), name); + } + } + } + + @Override + public void alterDatabase(String name, CatalogDatabase newDatabase, boolean ignoreIfNotExists) + throws DatabaseNotExistException, CatalogException { + if (asNamespaceCatalog != null) { + Namespace namespace = appendLevel(baseNamespace, name); + Map updates = Maps.newHashMap(); + Set removals = Sets.newHashSet(); + + try { + Map oldProperties = asNamespaceCatalog.loadNamespaceMetadata(namespace); + Map newProperties = + mergeComment(newDatabase.getProperties(), newDatabase.getComment()); + + for (String key : oldProperties.keySet()) { + if (!newProperties.containsKey(key)) { + removals.add(key); + } + } + + for (Map.Entry entry : newProperties.entrySet()) { + if (!entry.getValue().equals(oldProperties.get(entry.getKey()))) { + updates.put(entry.getKey(), entry.getValue()); + } + } + + if (!updates.isEmpty()) { + asNamespaceCatalog.setProperties(namespace, updates); + } + + if (!removals.isEmpty()) { + asNamespaceCatalog.removeProperties(namespace, removals); + } + + } catch (NoSuchNamespaceException e) { + if (!ignoreIfNotExists) { + throw new DatabaseNotExistException(getName(), name, e); + } + } + } else { + if (getDefaultDatabase().equals(name)) { + throw new CatalogException( + "Can not alter the default database when the iceberg catalog doesn't support namespaces."); + } + if (!ignoreIfNotExists) { + throw new DatabaseNotExistException(getName(), name); + } 
+ } + } + + @Override + public List listTables(String databaseName) + throws DatabaseNotExistException, CatalogException { + try { + return icebergCatalog.listTables(appendLevel(baseNamespace, databaseName)).stream() + .map(TableIdentifier::name) + .collect(Collectors.toList()); + } catch (NoSuchNamespaceException e) { + throw new DatabaseNotExistException(getName(), databaseName, e); + } + } + + @Override + public CatalogTable getTable(ObjectPath tablePath) + throws TableNotExistException, CatalogException { + Table table = loadIcebergTable(tablePath); + return toCatalogTable(table); + } + + private Table loadIcebergTable(ObjectPath tablePath) throws TableNotExistException { + try { + Table table = icebergCatalog.loadTable(toIdentifier(tablePath)); + if (cacheEnabled) { + table.refresh(); + } + + return table; + } catch (org.apache.iceberg.exceptions.NoSuchTableException e) { + throw new TableNotExistException(getName(), tablePath, e); + } + } + + @Override + public boolean tableExists(ObjectPath tablePath) throws CatalogException { + return icebergCatalog.tableExists(toIdentifier(tablePath)); + } + + @Override + public void dropTable(ObjectPath tablePath, boolean ignoreIfNotExists) + throws TableNotExistException, CatalogException { + try { + icebergCatalog.dropTable(toIdentifier(tablePath)); + } catch (org.apache.iceberg.exceptions.NoSuchTableException e) { + if (!ignoreIfNotExists) { + throw new TableNotExistException(getName(), tablePath, e); + } + } + } + + @Override + public void renameTable(ObjectPath tablePath, String newTableName, boolean ignoreIfNotExists) + throws TableNotExistException, TableAlreadyExistException, CatalogException { + try { + icebergCatalog.renameTable( + toIdentifier(tablePath), + toIdentifier(new ObjectPath(tablePath.getDatabaseName(), newTableName))); + } catch (org.apache.iceberg.exceptions.NoSuchTableException e) { + if (!ignoreIfNotExists) { + throw new TableNotExistException(getName(), tablePath, e); + } + } catch (AlreadyExistsException e) { + throw new TableAlreadyExistException(getName(), tablePath, e); + } + } + + @Override + public void createTable(ObjectPath tablePath, CatalogBaseTable table, boolean ignoreIfExists) + throws CatalogException, TableAlreadyExistException { + if (Objects.equals( + table.getOptions().get("connector"), FlinkDynamicTableFactory.FACTORY_IDENTIFIER)) { + throw new IllegalArgumentException( + "Cannot create the table with 'connector'='iceberg' table property in " + + "an iceberg catalog, Please create table with 'connector'='iceberg' property in a non-iceberg catalog or " + + "create table without 'connector'='iceberg' related properties in an iceberg table."); + } + Preconditions.checkArgument(table instanceof ResolvedCatalogTable, "table should be resolved"); + createIcebergTable(tablePath, (ResolvedCatalogTable) table, ignoreIfExists); + } + + void createIcebergTable(ObjectPath tablePath, ResolvedCatalogTable table, boolean ignoreIfExists) + throws CatalogException, TableAlreadyExistException { + validateFlinkTable(table); + + Schema icebergSchema = FlinkSchemaUtil.convert(table.getResolvedSchema()); + PartitionSpec spec = toPartitionSpec(((CatalogTable) table).getPartitionKeys(), icebergSchema); + ImmutableMap.Builder properties = ImmutableMap.builder(); + String location = null; + for (Map.Entry entry : table.getOptions().entrySet()) { + if ("location".equalsIgnoreCase(entry.getKey())) { + location = entry.getValue(); + } else { + properties.put(entry.getKey(), entry.getValue()); + } + } + + try { + 
icebergCatalog.createTable( + toIdentifier(tablePath), icebergSchema, spec, location, properties.build()); + } catch (AlreadyExistsException e) { + if (!ignoreIfExists) { + throw new TableAlreadyExistException(getName(), tablePath, e); + } + } + } + + private static void validateTableSchemaAndPartition(CatalogTable ct1, CatalogTable ct2) { + TableSchema ts1 = ct1.getSchema(); + TableSchema ts2 = ct2.getSchema(); + boolean equalsPrimary = false; + + if (ts1.getPrimaryKey().isPresent() && ts2.getPrimaryKey().isPresent()) { + equalsPrimary = + Objects.equals(ts1.getPrimaryKey().get().getType(), ts2.getPrimaryKey().get().getType()) + && Objects.equals( + ts1.getPrimaryKey().get().getColumns(), ts2.getPrimaryKey().get().getColumns()); + } else if (!ts1.getPrimaryKey().isPresent() && !ts2.getPrimaryKey().isPresent()) { + equalsPrimary = true; + } + + if (!(Objects.equals(ts1.getTableColumns(), ts2.getTableColumns()) + && Objects.equals(ts1.getWatermarkSpecs(), ts2.getWatermarkSpecs()) + && equalsPrimary)) { + throw new UnsupportedOperationException( + "Altering schema is not supported in the old alterTable API. " + + "To alter schema, use the other alterTable API and provide a list of TableChange's."); + } + + validateTablePartition(ct1, ct2); + } + + private static void validateTablePartition(CatalogTable ct1, CatalogTable ct2) { + if (!ct1.getPartitionKeys().equals(ct2.getPartitionKeys())) { + throw new UnsupportedOperationException("Altering partition keys is not supported yet."); + } + } + + /** + * This alterTable API only supports altering table properties. + * + *

Adding, removing, or renaming columns cannot be supported by comparing CatalogTable + * instances unless the Flink schema contains Iceberg column IDs. + * + *
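
As a hedged aside, the property-only changes this path handles are usually issued through Flink SQL; a small sketch, assuming a TableEnvironment that already uses this catalog and an existing table db.sample:

    import org.apache.flink.table.api.TableEnvironment;

    class AlterPropertiesSketch {
      // Flink turns ALTER TABLE ... SET into table property updates, which this catalog
      // commits to the underlying Iceberg table; schema changes go through the
      // TableChange-based alterTable overload instead.
      static void setWriteFormat(TableEnvironment tEnv) {
        tEnv.executeSql("ALTER TABLE db.sample SET ('write.format.default'='orc')");
      }
    }
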

To alter columns, use the other alterTable API and provide a list of TableChange's. + * + * @param tablePath path of the table or view to be modified + * @param newTable the new table definition + * @param ignoreIfNotExists flag to specify behavior when the table or view does not exist: if set + * to false, throw an exception, if set to true, do nothing. + * @throws CatalogException in case of any runtime exception + * @throws TableNotExistException if the table does not exist + */ + @Override + public void alterTable(ObjectPath tablePath, CatalogBaseTable newTable, boolean ignoreIfNotExists) + throws CatalogException, TableNotExistException { + validateFlinkTable(newTable); + + Table icebergTable; + try { + icebergTable = loadIcebergTable(tablePath); + } catch (TableNotExistException e) { + if (!ignoreIfNotExists) { + throw e; + } else { + return; + } + } + + CatalogTable table = toCatalogTable(icebergTable); + validateTableSchemaAndPartition(table, (CatalogTable) newTable); + + Map oldProperties = table.getOptions(); + Map setProperties = Maps.newHashMap(); + + String setLocation = null; + String setSnapshotId = null; + String pickSnapshotId = null; + + for (Map.Entry entry : newTable.getOptions().entrySet()) { + String key = entry.getKey(); + String value = entry.getValue(); + + if (Objects.equals(value, oldProperties.get(key))) { + continue; + } + + if ("location".equalsIgnoreCase(key)) { + setLocation = value; + } else if ("current-snapshot-id".equalsIgnoreCase(key)) { + setSnapshotId = value; + } else if ("cherry-pick-snapshot-id".equalsIgnoreCase(key)) { + pickSnapshotId = value; + } else { + setProperties.put(key, value); + } + } + + oldProperties + .keySet() + .forEach( + k -> { + if (!newTable.getOptions().containsKey(k)) { + setProperties.put(k, null); + } + }); + + FlinkAlterTableUtil.commitChanges( + icebergTable, setLocation, setSnapshotId, pickSnapshotId, setProperties); + } + + @Override + public void alterTable( + ObjectPath tablePath, + CatalogBaseTable newTable, + List tableChanges, + boolean ignoreIfNotExists) + throws TableNotExistException, CatalogException { + validateFlinkTable(newTable); + + Table icebergTable; + try { + icebergTable = loadIcebergTable(tablePath); + } catch (TableNotExistException e) { + if (!ignoreIfNotExists) { + throw e; + } else { + return; + } + } + + // Does not support altering partition yet. 
+ validateTablePartition(toCatalogTable(icebergTable), (CatalogTable) newTable); + + String setLocation = null; + String setSnapshotId = null; + String cherrypickSnapshotId = null; + + List propertyChanges = Lists.newArrayList(); + List schemaChanges = Lists.newArrayList(); + for (TableChange change : tableChanges) { + if (change instanceof TableChange.SetOption) { + TableChange.SetOption set = (TableChange.SetOption) change; + + if ("location".equalsIgnoreCase(set.getKey())) { + setLocation = set.getValue(); + } else if ("current-snapshot-id".equalsIgnoreCase(set.getKey())) { + setSnapshotId = set.getValue(); + } else if ("cherry-pick-snapshot-id".equalsIgnoreCase(set.getKey())) { + cherrypickSnapshotId = set.getValue(); + } else { + propertyChanges.add(change); + } + } else if (change instanceof TableChange.ResetOption) { + propertyChanges.add(change); + } else { + schemaChanges.add(change); + } + } + + FlinkAlterTableUtil.commitChanges( + icebergTable, + setLocation, + setSnapshotId, + cherrypickSnapshotId, + schemaChanges, + propertyChanges); + } + + private static void validateFlinkTable(CatalogBaseTable table) { + Preconditions.checkArgument( + table instanceof CatalogTable, "The Table should be a CatalogTable."); + + TableSchema schema = table.getSchema(); + schema + .getTableColumns() + .forEach( + column -> { + if (!FlinkCompatibilityUtil.isPhysicalColumn(column)) { + throw new UnsupportedOperationException( + "Creating table with computed columns is not supported yet."); + } + }); + + if (!schema.getWatermarkSpecs().isEmpty()) { + throw new UnsupportedOperationException( + "Creating table with watermark specs is not supported yet."); + } + } + + private static PartitionSpec toPartitionSpec(List partitionKeys, Schema icebergSchema) { + PartitionSpec.Builder builder = PartitionSpec.builderFor(icebergSchema); + partitionKeys.forEach(builder::identity); + return builder.build(); + } + + private static List toPartitionKeys(PartitionSpec spec, Schema icebergSchema) { + ImmutableList.Builder partitionKeysBuilder = ImmutableList.builder(); + for (PartitionField field : spec.fields()) { + if (field.transform().isIdentity()) { + partitionKeysBuilder.add(icebergSchema.findColumnName(field.sourceId())); + } else { + // Not created by Flink SQL. + // For compatibility with iceberg tables, return empty. + // TODO modify this after Flink support partition transform. + return Collections.emptyList(); + } + } + return partitionKeysBuilder.build(); + } + + static CatalogTable toCatalogTable(Table table) { + TableSchema schema = FlinkSchemaUtil.toSchema(table.schema()); + List partitionKeys = toPartitionKeys(table.spec(), table.schema()); + + // NOTE: We can not create a IcebergCatalogTable extends CatalogTable, because Flink optimizer + // may use + // CatalogTableImpl to copy a new catalog table. + // Let's re-loading table from Iceberg catalog when creating source/sink operators. + // Iceberg does not have Table comment, so pass a null (Default comment value in Flink). 
+ return new CatalogTableImpl(schema, partitionKeys, table.properties(), null); + } + + @Override + public Optional getFactory() { + return Optional.of(new FlinkDynamicTableFactory(this)); + } + + CatalogLoader getCatalogLoader() { + return catalogLoader; + } + + // ------------------------------ Unsupported methods + // --------------------------------------------- + + @Override + public List listViews(String databaseName) throws CatalogException { + return Collections.emptyList(); + } + + @Override + public CatalogPartition getPartition(ObjectPath tablePath, CatalogPartitionSpec partitionSpec) + throws CatalogException { + throw new UnsupportedOperationException(); + } + + @Override + public boolean partitionExists(ObjectPath tablePath, CatalogPartitionSpec partitionSpec) + throws CatalogException { + throw new UnsupportedOperationException(); + } + + @Override + public void createPartition( + ObjectPath tablePath, + CatalogPartitionSpec partitionSpec, + CatalogPartition partition, + boolean ignoreIfExists) + throws CatalogException { + throw new UnsupportedOperationException(); + } + + @Override + public void dropPartition( + ObjectPath tablePath, CatalogPartitionSpec partitionSpec, boolean ignoreIfNotExists) + throws CatalogException { + throw new UnsupportedOperationException(); + } + + @Override + public void alterPartition( + ObjectPath tablePath, + CatalogPartitionSpec partitionSpec, + CatalogPartition newPartition, + boolean ignoreIfNotExists) + throws CatalogException { + throw new UnsupportedOperationException(); + } + + @Override + public List listFunctions(String dbName) throws CatalogException { + return Collections.emptyList(); + } + + @Override + public CatalogFunction getFunction(ObjectPath functionPath) + throws FunctionNotExistException, CatalogException { + throw new FunctionNotExistException(getName(), functionPath); + } + + @Override + public boolean functionExists(ObjectPath functionPath) throws CatalogException { + return false; + } + + @Override + public void createFunction( + ObjectPath functionPath, CatalogFunction function, boolean ignoreIfExists) + throws CatalogException { + throw new UnsupportedOperationException(); + } + + @Override + public void alterFunction( + ObjectPath functionPath, CatalogFunction newFunction, boolean ignoreIfNotExists) + throws CatalogException { + throw new UnsupportedOperationException(); + } + + @Override + public void dropFunction(ObjectPath functionPath, boolean ignoreIfNotExists) + throws CatalogException { + throw new UnsupportedOperationException(); + } + + @Override + public void alterTableStatistics( + ObjectPath tablePath, CatalogTableStatistics tableStatistics, boolean ignoreIfNotExists) + throws CatalogException { + throw new UnsupportedOperationException(); + } + + @Override + public void alterTableColumnStatistics( + ObjectPath tablePath, CatalogColumnStatistics columnStatistics, boolean ignoreIfNotExists) + throws CatalogException { + throw new UnsupportedOperationException(); + } + + @Override + public void alterPartitionStatistics( + ObjectPath tablePath, + CatalogPartitionSpec partitionSpec, + CatalogTableStatistics partitionStatistics, + boolean ignoreIfNotExists) + throws CatalogException { + throw new UnsupportedOperationException(); + } + + @Override + public void alterPartitionColumnStatistics( + ObjectPath tablePath, + CatalogPartitionSpec partitionSpec, + CatalogColumnStatistics columnStatistics, + boolean ignoreIfNotExists) + throws CatalogException { + throw new UnsupportedOperationException(); + } + + 
@Override + public List listPartitions(ObjectPath tablePath) + throws TableNotExistException, TableNotPartitionedException, CatalogException { + Table table = loadIcebergTable(tablePath); + + if (table.spec().isUnpartitioned()) { + throw new TableNotPartitionedException(icebergCatalog.name(), tablePath); + } + + Set set = Sets.newHashSet(); + try (CloseableIterable tasks = table.newScan().planFiles()) { + for (DataFile dataFile : CloseableIterable.transform(tasks, FileScanTask::file)) { + Map map = Maps.newHashMap(); + StructLike structLike = dataFile.partition(); + PartitionSpec spec = table.specs().get(dataFile.specId()); + for (int i = 0; i < structLike.size(); i++) { + map.put(spec.fields().get(i).name(), String.valueOf(structLike.get(i, Object.class))); + } + set.add(new CatalogPartitionSpec(map)); + } + } catch (IOException e) { + throw new CatalogException( + String.format("Failed to list partitions of table %s", tablePath), e); + } + + return Lists.newArrayList(set); + } + + @Override + public List listPartitions( + ObjectPath tablePath, CatalogPartitionSpec partitionSpec) throws CatalogException { + throw new UnsupportedOperationException(); + } + + @Override + public List listPartitionsByFilter( + ObjectPath tablePath, List filters) throws CatalogException { + throw new UnsupportedOperationException(); + } + + // After partition pruning and filter push down, the statistics have become very inaccurate, so + // the statistics from + // here are of little significance. + // Flink will support something like SupportsReportStatistics in future. + + @Override + public CatalogTableStatistics getTableStatistics(ObjectPath tablePath) throws CatalogException { + return CatalogTableStatistics.UNKNOWN; + } + + @Override + public CatalogColumnStatistics getTableColumnStatistics(ObjectPath tablePath) + throws CatalogException { + return CatalogColumnStatistics.UNKNOWN; + } + + @Override + public CatalogTableStatistics getPartitionStatistics( + ObjectPath tablePath, CatalogPartitionSpec partitionSpec) throws CatalogException { + return CatalogTableStatistics.UNKNOWN; + } + + @Override + public CatalogColumnStatistics getPartitionColumnStatistics( + ObjectPath tablePath, CatalogPartitionSpec partitionSpec) throws CatalogException { + return CatalogColumnStatistics.UNKNOWN; + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalogFactory.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalogFactory.java new file mode 100644 index 000000000000..fe4008a13ce5 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalogFactory.java @@ -0,0 +1,213 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink; + +import java.net.URL; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import org.apache.flink.configuration.GlobalConfiguration; +import org.apache.flink.runtime.util.HadoopUtils; +import org.apache.flink.table.catalog.Catalog; +import org.apache.flink.table.factories.CatalogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.iceberg.CatalogProperties; +import org.apache.iceberg.catalog.Namespace; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.base.Strings; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.util.PropertyUtil; + +/** + * A Flink Catalog factory implementation that creates {@link FlinkCatalog}. + * + *

This supports the following catalog configuration options: + * + *

    + *
  • type - Flink catalog factory key, should be "iceberg" + *
  • catalog-type - iceberg catalog type, "hive", "hadoop" or "rest" + *
  • uri - the Hive Metastore URI (Hive catalog only) + *
  • clients - the Hive Client Pool Size (Hive catalog only) + *
  • warehouse - the warehouse path (Hadoop catalog only) + *
  • default-database - a database name to use as the default + *
  • base-namespace - a base namespace as the prefix for all databases (Hadoop + * catalog only) + *
  • cache-enabled - whether to enable catalog cache + *
+ * + *
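
A sketch of creating a catalog programmatically with the options listed above, using the factory's createCatalog method; the metastore URI and warehouse path below are placeholders, not defaults:

    import java.util.HashMap;
    import java.util.Map;
    import org.apache.flink.table.catalog.Catalog;
    import org.apache.iceberg.flink.FlinkCatalogFactory;

    class CatalogFactorySketch {
      static Catalog hiveBackedCatalog() {
        Map<String, String> props = new HashMap<>();
        props.put("catalog-type", "hive");
        props.put("uri", "thrift://localhost:9083");               // placeholder metastore URI
        props.put("warehouse", "hdfs://namenode:8020/warehouse");  // placeholder warehouse path
        props.put("cache-enabled", "false");
        // Uses the cluster Hadoop configuration resolved by the factory.
        return new FlinkCatalogFactory().createCatalog("iceberg_hive", props);
      }
    }
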

To use a custom catalog that is not a Hive or Hadoop catalog, extend this class and override + * {@link #createCatalogLoader(String, Map, Configuration)}. + */ +public class FlinkCatalogFactory implements CatalogFactory { + + // Can not just use "type", it conflicts with CATALOG_TYPE. + public static final String ICEBERG_CATALOG_TYPE = "catalog-type"; + public static final String ICEBERG_CATALOG_TYPE_HADOOP = "hadoop"; + public static final String ICEBERG_CATALOG_TYPE_HIVE = "hive"; + public static final String ICEBERG_CATALOG_TYPE_REST = "rest"; + + public static final String HIVE_CONF_DIR = "hive-conf-dir"; + public static final String HADOOP_CONF_DIR = "hadoop-conf-dir"; + public static final String DEFAULT_DATABASE = "default-database"; + public static final String DEFAULT_DATABASE_NAME = "default"; + public static final String DEFAULT_CATALOG_NAME = "default_catalog"; + public static final String BASE_NAMESPACE = "base-namespace"; + public static final String TYPE = "type"; + public static final String PROPERTY_VERSION = "property-version"; + + /** + * Create an Iceberg {@link org.apache.iceberg.catalog.Catalog} loader to be used by this Flink + * catalog adapter. + * + * @param name Flink's catalog name + * @param properties Flink's catalog properties + * @param hadoopConf Hadoop configuration for catalog + * @return an Iceberg catalog loader + */ + static CatalogLoader createCatalogLoader( + String name, Map properties, Configuration hadoopConf) { + String catalogImpl = properties.get(CatalogProperties.CATALOG_IMPL); + if (catalogImpl != null) { + String catalogType = properties.get(ICEBERG_CATALOG_TYPE); + Preconditions.checkArgument( + catalogType == null, + "Cannot create catalog %s, both catalog-type and catalog-impl are set: catalog-type=%s, catalog-impl=%s", + name, + catalogType, + catalogImpl); + return CatalogLoader.custom(name, properties, hadoopConf, catalogImpl); + } + + String catalogType = properties.getOrDefault(ICEBERG_CATALOG_TYPE, ICEBERG_CATALOG_TYPE_HIVE); + switch (catalogType.toLowerCase(Locale.ENGLISH)) { + case ICEBERG_CATALOG_TYPE_HIVE: + // The values of properties 'uri', 'warehouse', 'hive-conf-dir' are allowed to be null, in + // that case it will + // fallback to parse those values from hadoop configuration which is loaded from classpath. 
+ String hiveConfDir = properties.get(HIVE_CONF_DIR); + String hadoopConfDir = properties.get(HADOOP_CONF_DIR); + Configuration newHadoopConf = mergeHiveConf(hadoopConf, hiveConfDir, hadoopConfDir); + return CatalogLoader.hive(name, newHadoopConf, properties); + + case ICEBERG_CATALOG_TYPE_HADOOP: + return CatalogLoader.hadoop(name, hadoopConf, properties); + + case ICEBERG_CATALOG_TYPE_REST: + return CatalogLoader.rest(name, hadoopConf, properties); + + default: + throw new UnsupportedOperationException( + "Unknown catalog-type: " + catalogType + " (Must be 'hive', 'hadoop' or 'rest')"); + } + } + + @Override + public Map requiredContext() { + Map context = Maps.newHashMap(); + context.put(TYPE, "iceberg"); + context.put(PROPERTY_VERSION, "1"); + return context; + } + + @Override + public List supportedProperties() { + return ImmutableList.of("*"); + } + + @Override + public Catalog createCatalog(String name, Map properties) { + return createCatalog(name, properties, clusterHadoopConf()); + } + + protected Catalog createCatalog( + String name, Map properties, Configuration hadoopConf) { + CatalogLoader catalogLoader = createCatalogLoader(name, properties, hadoopConf); + String defaultDatabase = properties.getOrDefault(DEFAULT_DATABASE, DEFAULT_DATABASE_NAME); + + Namespace baseNamespace = Namespace.empty(); + if (properties.containsKey(BASE_NAMESPACE)) { + baseNamespace = Namespace.of(properties.get(BASE_NAMESPACE).split("\\.")); + } + + boolean cacheEnabled = + PropertyUtil.propertyAsBoolean( + properties, CatalogProperties.CACHE_ENABLED, CatalogProperties.CACHE_ENABLED_DEFAULT); + + long cacheExpirationIntervalMs = + PropertyUtil.propertyAsLong( + properties, + CatalogProperties.CACHE_EXPIRATION_INTERVAL_MS, + CatalogProperties.CACHE_EXPIRATION_INTERVAL_MS_OFF); + Preconditions.checkArgument( + cacheExpirationIntervalMs != 0, + "%s is not allowed to be 0.", + CatalogProperties.CACHE_EXPIRATION_INTERVAL_MS); + + return new FlinkCatalog( + name, + defaultDatabase, + baseNamespace, + catalogLoader, + cacheEnabled, + cacheExpirationIntervalMs); + } + + private static Configuration mergeHiveConf( + Configuration hadoopConf, String hiveConfDir, String hadoopConfDir) { + Configuration newConf = new Configuration(hadoopConf); + if (!Strings.isNullOrEmpty(hiveConfDir)) { + Preconditions.checkState( + Files.exists(Paths.get(hiveConfDir, "hive-site.xml")), + "There should be a hive-site.xml file under the directory %s", + hiveConfDir); + newConf.addResource(new Path(hiveConfDir, "hive-site.xml")); + } else { + // If don't provide the hive-site.xml path explicitly, it will try to load resource from + // classpath. If still + // couldn't load the configuration file, then it will throw exception in HiveCatalog. 
+ URL configFile = CatalogLoader.class.getClassLoader().getResource("hive-site.xml"); + if (configFile != null) { + newConf.addResource(configFile); + } + } + + if (!Strings.isNullOrEmpty(hadoopConfDir)) { + Preconditions.checkState( + Files.exists(Paths.get(hadoopConfDir, "hdfs-site.xml")), + "Failed to load Hadoop configuration: missing %s", + Paths.get(hadoopConfDir, "hdfs-site.xml")); + newConf.addResource(new Path(hadoopConfDir, "hdfs-site.xml")); + Preconditions.checkState( + Files.exists(Paths.get(hadoopConfDir, "core-site.xml")), + "Failed to load Hadoop configuration: missing %s", + Paths.get(hadoopConfDir, "core-site.xml")); + newConf.addResource(new Path(hadoopConfDir, "core-site.xml")); + } + + return newConf; + } + + public static Configuration clusterHadoopConf() { + return HadoopUtils.getHadoopConfiguration(GlobalConfiguration.loadConfiguration()); + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkConfParser.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkConfParser.java new file mode 100644 index 000000000000..7167859e600c --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkConfParser.java @@ -0,0 +1,261 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink; + +import java.time.Duration; +import java.util.List; +import java.util.Map; +import java.util.function.Function; +import org.apache.flink.configuration.ConfigOption; +import org.apache.flink.configuration.ReadableConfig; +import org.apache.flink.util.TimeUtils; +import org.apache.iceberg.Table; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; + +class FlinkConfParser { + + private final Map tableProperties; + private final Map options; + private final ReadableConfig readableConfig; + + FlinkConfParser(Table table, Map options, ReadableConfig readableConfig) { + this.tableProperties = table.properties(); + this.options = options; + this.readableConfig = readableConfig; + } + + public BooleanConfParser booleanConf() { + return new BooleanConfParser(); + } + + public IntConfParser intConf() { + return new IntConfParser(); + } + + public LongConfParser longConf() { + return new LongConfParser(); + } + + public > EnumConfParser enumConfParser(Class enumClass) { + return new EnumConfParser<>(enumClass); + } + + public StringConfParser stringConf() { + return new StringConfParser(); + } + + public DurationConfParser durationConf() { + return new DurationConfParser(); + } + + class BooleanConfParser extends ConfParser { + private Boolean defaultValue; + + @Override + protected BooleanConfParser self() { + return this; + } + + public BooleanConfParser defaultValue(boolean value) { + this.defaultValue = value; + return self(); + } + + public BooleanConfParser defaultValue(String value) { + this.defaultValue = Boolean.parseBoolean(value); + return self(); + } + + public boolean parse() { + Preconditions.checkArgument(defaultValue != null, "Default value cannot be null"); + return parse(Boolean::parseBoolean, defaultValue); + } + } + + class IntConfParser extends ConfParser { + private Integer defaultValue; + + @Override + protected IntConfParser self() { + return this; + } + + public IntConfParser defaultValue(int value) { + this.defaultValue = value; + return self(); + } + + public int parse() { + Preconditions.checkArgument(defaultValue != null, "Default value cannot be null"); + return parse(Integer::parseInt, defaultValue); + } + + public Integer parseOptional() { + return parse(Integer::parseInt, null); + } + } + + class LongConfParser extends ConfParser { + private Long defaultValue; + + @Override + protected LongConfParser self() { + return this; + } + + public LongConfParser defaultValue(long value) { + this.defaultValue = value; + return self(); + } + + public long parse() { + Preconditions.checkArgument(defaultValue != null, "Default value cannot be null"); + return parse(Long::parseLong, defaultValue); + } + + public Long parseOptional() { + return parse(Long::parseLong, null); + } + } + + class StringConfParser extends ConfParser { + private String defaultValue; + + @Override + protected StringConfParser self() { + return this; + } + + public StringConfParser defaultValue(String value) { + this.defaultValue = value; + return self(); + } + + public String parse() { + Preconditions.checkArgument(defaultValue != null, "Default value cannot be null"); + return parse(Function.identity(), defaultValue); + } + + public String parseOptional() { + return parse(Function.identity(), null); + } + } + + class EnumConfParser> extends ConfParser, E> { + private E defaultValue; + private final Class enumClass; + + EnumConfParser(Class enumClass) { + this.enumClass = 
enumClass; + } + + @Override + protected EnumConfParser self() { + return this; + } + + public EnumConfParser defaultValue(E value) { + this.defaultValue = value; + return self(); + } + + public E parse() { + Preconditions.checkArgument(defaultValue != null, "Default value cannot be null"); + return parse(s -> Enum.valueOf(enumClass, s), defaultValue); + } + + public E parseOptional() { + return parse(s -> Enum.valueOf(enumClass, s), null); + } + } + + class DurationConfParser extends ConfParser { + private Duration defaultValue; + + @Override + protected DurationConfParser self() { + return this; + } + + public DurationConfParser defaultValue(Duration value) { + this.defaultValue = value; + return self(); + } + + public Duration parse() { + Preconditions.checkArgument(defaultValue != null, "Default value cannot be null"); + return parse(TimeUtils::parseDuration, defaultValue); + } + + public Duration parseOptional() { + return parse(TimeUtils::parseDuration, null); + } + } + + abstract class ConfParser { + private final List optionNames = Lists.newArrayList(); + private String tablePropertyName; + private ConfigOption configOption; + + protected abstract ThisT self(); + + public ThisT option(String name) { + this.optionNames.add(name); + return self(); + } + + public ThisT flinkConfig(ConfigOption newConfigOption) { + this.configOption = newConfigOption; + return self(); + } + + public ThisT tableProperty(String name) { + this.tablePropertyName = name; + return self(); + } + + protected T parse(Function conversion, T defaultValue) { + if (!optionNames.isEmpty()) { + for (String optionName : optionNames) { + String optionValue = options.get(optionName); + if (optionValue != null) { + return conversion.apply(optionValue); + } + } + } + + if (configOption != null) { + T propertyValue = readableConfig.get(configOption); + if (propertyValue != null) { + return propertyValue; + } + } + + if (tablePropertyName != null) { + String propertyValue = tableProperties.get(tablePropertyName); + if (propertyValue != null) { + return conversion.apply(propertyValue); + } + } + + return defaultValue; + } + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkConfigOptions.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkConfigOptions.java new file mode 100644 index 000000000000..7c7afd24ed8e --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkConfigOptions.java @@ -0,0 +1,107 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink; + +import org.apache.flink.configuration.ConfigOption; +import org.apache.flink.configuration.ConfigOptions; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.configuration.description.Description; +import org.apache.flink.configuration.description.TextElement; +import org.apache.flink.table.api.TableEnvironment; +import org.apache.iceberg.flink.source.assigner.SplitAssignerType; +import org.apache.iceberg.util.ThreadPools; + +/** + * When constructing Flink Iceberg source via Java API, configs can be set in {@link Configuration} + * passed to source builder. E.g. + * + *
+ * <pre>
+ *   configuration.setBoolean(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM, true);
+ *   FlinkSource.forRowData()
+ *       .flinkConf(configuration)
+ *       ...
+ * </pre>
+ *
+ * <p>When using Flink SQL/table API, connector options can be set in Flink's {@link
+ * TableEnvironment}.
+ *
+ * <pre>
+ *   TableEnvironment tEnv = createTableEnv();
+ *   tEnv.getConfig()
+ *        .getConfiguration()
+ *        .setBoolean(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM, true);
+ * </pre>
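A minimal sketch of the same Java-API route for the other options declared in this class below; the wrapper class name and the parallelism cap of 32 are illustrative, while the option constants and Flink's Configuration.set(ConfigOption, value) API are taken as given:

    import org.apache.flink.configuration.Configuration;
    import org.apache.iceberg.flink.FlinkConfigOptions;
    import org.apache.iceberg.flink.source.assigner.SplitAssignerType;

    public class FlinkConfigOptionsSketch {
      public static void main(String[] args) {
        // Enable the FLIP-27 source, cap inferred source parallelism, and pick the simple
        // split assigner via the options declared in FlinkConfigOptions.
        Configuration conf = new Configuration();
        conf.set(FlinkConfigOptions.TABLE_EXEC_ICEBERG_USE_FLIP27_SOURCE, true);
        conf.set(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM_MAX, 32);
        conf.set(FlinkConfigOptions.TABLE_EXEC_SPLIT_ASSIGNER_TYPE, SplitAssignerType.SIMPLE);
        System.out.println(conf);
      }
    }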
+ */ +public class FlinkConfigOptions { + + private FlinkConfigOptions() {} + + public static final ConfigOption TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM = + ConfigOptions.key("table.exec.iceberg.infer-source-parallelism") + .booleanType() + .defaultValue(true) + .withDescription( + "If is false, parallelism of source are set by config.\n" + + "If is true, source parallelism is inferred according to splits number.\n"); + + public static final ConfigOption TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM_MAX = + ConfigOptions.key("table.exec.iceberg.infer-source-parallelism.max") + .intType() + .defaultValue(100) + .withDescription("Sets max infer parallelism for source operator."); + + public static final ConfigOption TABLE_EXEC_ICEBERG_EXPOSE_SPLIT_LOCALITY_INFO = + ConfigOptions.key("table.exec.iceberg.expose-split-locality-info") + .booleanType() + .noDefaultValue() + .withDescription( + "Expose split host information to use Flink's locality aware split assigner."); + + public static final ConfigOption SOURCE_READER_FETCH_BATCH_RECORD_COUNT = + ConfigOptions.key("table.exec.iceberg.fetch-batch-record-count") + .intType() + .defaultValue(2048) + .withDescription("The target number of records for Iceberg reader fetch batch."); + + public static final ConfigOption TABLE_EXEC_ICEBERG_WORKER_POOL_SIZE = + ConfigOptions.key("table.exec.iceberg.worker-pool-size") + .intType() + .defaultValue(ThreadPools.WORKER_THREAD_POOL_SIZE) + .withDescription("The size of workers pool used to plan or scan manifests."); + + public static final ConfigOption TABLE_EXEC_ICEBERG_USE_FLIP27_SOURCE = + ConfigOptions.key("table.exec.iceberg.use-flip27-source") + .booleanType() + .defaultValue(false) + .withDescription("Use the FLIP-27 based Iceberg source implementation."); + + public static final ConfigOption TABLE_EXEC_SPLIT_ASSIGNER_TYPE = + ConfigOptions.key("table.exec.iceberg.split-assigner-type") + .enumType(SplitAssignerType.class) + .defaultValue(SplitAssignerType.SIMPLE) + .withDescription( + Description.builder() + .text("Split assigner type that determine how splits are assigned to readers.") + .linebreak() + .list( + TextElement.text( + SplitAssignerType.SIMPLE + + ": simple assigner that doesn't provide any guarantee on order or locality.")) + .build()); +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkDynamicTableFactory.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkDynamicTableFactory.java new file mode 100644 index 000000000000..b7f1be4b93fb --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkDynamicTableFactory.java @@ -0,0 +1,208 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink; + +import java.util.Map; +import java.util.Set; +import org.apache.flink.configuration.ConfigOption; +import org.apache.flink.configuration.ConfigOptions; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.table.api.TableSchema; +import org.apache.flink.table.catalog.CatalogDatabaseImpl; +import org.apache.flink.table.catalog.ObjectIdentifier; +import org.apache.flink.table.catalog.ObjectPath; +import org.apache.flink.table.catalog.ResolvedCatalogTable; +import org.apache.flink.table.catalog.exceptions.DatabaseAlreadyExistException; +import org.apache.flink.table.catalog.exceptions.TableAlreadyExistException; +import org.apache.flink.table.connector.sink.DynamicTableSink; +import org.apache.flink.table.connector.source.DynamicTableSource; +import org.apache.flink.table.factories.DynamicTableSinkFactory; +import org.apache.flink.table.factories.DynamicTableSourceFactory; +import org.apache.flink.table.utils.TableSchemaUtils; +import org.apache.flink.util.Preconditions; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.exceptions.AlreadyExistsException; +import org.apache.iceberg.flink.source.IcebergTableSource; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.relocated.com.google.common.collect.Sets; + +public class FlinkDynamicTableFactory + implements DynamicTableSinkFactory, DynamicTableSourceFactory { + static final String FACTORY_IDENTIFIER = "iceberg"; + + private static final ConfigOption CATALOG_NAME = + ConfigOptions.key("catalog-name") + .stringType() + .noDefaultValue() + .withDescription("Catalog name"); + + private static final ConfigOption CATALOG_TYPE = + ConfigOptions.key(FlinkCatalogFactory.ICEBERG_CATALOG_TYPE) + .stringType() + .noDefaultValue() + .withDescription("Catalog type, the optional types are: custom, hadoop, hive."); + + private static final ConfigOption CATALOG_DATABASE = + ConfigOptions.key("catalog-database") + .stringType() + .defaultValue(FlinkCatalogFactory.DEFAULT_DATABASE_NAME) + .withDescription("Database name managed in the iceberg catalog."); + + private static final ConfigOption CATALOG_TABLE = + ConfigOptions.key("catalog-table") + .stringType() + .noDefaultValue() + .withDescription("Table name managed in the underlying iceberg catalog and database."); + + private final FlinkCatalog catalog; + + public FlinkDynamicTableFactory() { + this.catalog = null; + } + + public FlinkDynamicTableFactory(FlinkCatalog catalog) { + this.catalog = catalog; + } + + @Override + public DynamicTableSource createDynamicTableSource(Context context) { + ObjectIdentifier objectIdentifier = context.getObjectIdentifier(); + ResolvedCatalogTable resolvedCatalogTable = context.getCatalogTable(); + Map tableProps = resolvedCatalogTable.getOptions(); + TableSchema tableSchema = TableSchemaUtils.getPhysicalSchema(resolvedCatalogTable.getSchema()); + + TableLoader tableLoader; + if (catalog != null) { + tableLoader = createTableLoader(catalog, objectIdentifier.toObjectPath()); + } else { + tableLoader = + createTableLoader( + resolvedCatalogTable, + tableProps, + objectIdentifier.getDatabaseName(), + objectIdentifier.getObjectName()); + } + + return new IcebergTableSource(tableLoader, tableSchema, tableProps, context.getConfiguration()); + } + + @Override + public DynamicTableSink createDynamicTableSink(Context context) { + ObjectIdentifier objectIdentifier = context.getObjectIdentifier(); + ResolvedCatalogTable resolvedCatalogTable = 
context.getCatalogTable(); + Map writeProps = resolvedCatalogTable.getOptions(); + TableSchema tableSchema = TableSchemaUtils.getPhysicalSchema(resolvedCatalogTable.getSchema()); + + TableLoader tableLoader; + if (catalog != null) { + tableLoader = createTableLoader(catalog, objectIdentifier.toObjectPath()); + } else { + tableLoader = + createTableLoader( + resolvedCatalogTable, + writeProps, + objectIdentifier.getDatabaseName(), + objectIdentifier.getObjectName()); + } + + return new IcebergTableSink(tableLoader, tableSchema, context.getConfiguration(), writeProps); + } + + @Override + public Set> requiredOptions() { + Set> options = Sets.newHashSet(); + options.add(CATALOG_TYPE); + options.add(CATALOG_NAME); + return options; + } + + @Override + public Set> optionalOptions() { + Set> options = Sets.newHashSet(); + options.add(CATALOG_DATABASE); + options.add(CATALOG_TABLE); + return options; + } + + @Override + public String factoryIdentifier() { + return FACTORY_IDENTIFIER; + } + + private static TableLoader createTableLoader( + ResolvedCatalogTable resolvedCatalogTable, + Map tableProps, + String databaseName, + String tableName) { + Configuration flinkConf = new Configuration(); + tableProps.forEach(flinkConf::setString); + + String catalogName = flinkConf.getString(CATALOG_NAME); + Preconditions.checkNotNull( + catalogName, "Table property '%s' cannot be null", CATALOG_NAME.key()); + + String catalogDatabase = flinkConf.getString(CATALOG_DATABASE, databaseName); + Preconditions.checkNotNull(catalogDatabase, "The iceberg database name cannot be null"); + + String catalogTable = flinkConf.getString(CATALOG_TABLE, tableName); + Preconditions.checkNotNull(catalogTable, "The iceberg table name cannot be null"); + + org.apache.hadoop.conf.Configuration hadoopConf = FlinkCatalogFactory.clusterHadoopConf(); + FlinkCatalogFactory factory = new FlinkCatalogFactory(); + FlinkCatalog flinkCatalog = + (FlinkCatalog) factory.createCatalog(catalogName, tableProps, hadoopConf); + ObjectPath objectPath = new ObjectPath(catalogDatabase, catalogTable); + + // Create database if not exists in the external catalog. + if (!flinkCatalog.databaseExists(catalogDatabase)) { + try { + flinkCatalog.createDatabase( + catalogDatabase, new CatalogDatabaseImpl(Maps.newHashMap(), null), true); + } catch (DatabaseAlreadyExistException e) { + throw new AlreadyExistsException( + e, + "Database %s already exists in the iceberg catalog %s.", + catalogName, + catalogDatabase); + } + } + + // Create table if not exists in the external catalog. 
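+ // The boolean passed to createIcebergTable below is assumed to be ignoreIfExists, mirroring
+ // the createDatabase(..., true) call above; if a TableAlreadyExistException still surfaces
+ // from a concurrent writer, it is rethrown as Iceberg's AlreadyExistsException.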
+ if (!flinkCatalog.tableExists(objectPath)) { + try { + flinkCatalog.createIcebergTable(objectPath, resolvedCatalogTable, true); + } catch (TableAlreadyExistException e) { + throw new AlreadyExistsException( + e, + "Table %s already exists in the database %s and catalog %s", + catalogTable, + catalogDatabase, + catalogName); + } + } + + return TableLoader.fromCatalog( + flinkCatalog.getCatalogLoader(), TableIdentifier.of(catalogDatabase, catalogTable)); + } + + private static TableLoader createTableLoader(FlinkCatalog catalog, ObjectPath objectPath) { + Preconditions.checkNotNull(catalog, "Flink catalog cannot be null"); + return TableLoader.fromCatalog(catalog.getCatalogLoader(), catalog.toIdentifier(objectPath)); + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkEnvironmentContext.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkEnvironmentContext.java new file mode 100644 index 000000000000..f35bb577fbba --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkEnvironmentContext.java @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink; + +import org.apache.iceberg.EnvironmentContext; +import org.apache.iceberg.flink.util.FlinkPackage; + +class FlinkEnvironmentContext { + private FlinkEnvironmentContext() {} + + public static void init() { + EnvironmentContext.put(EnvironmentContext.ENGINE_NAME, "flink"); + EnvironmentContext.put(EnvironmentContext.ENGINE_VERSION, FlinkPackage.version()); + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkFilters.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkFilters.java new file mode 100644 index 000000000000..f2244d5137a1 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkFilters.java @@ -0,0 +1,266 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink; + +import java.time.Instant; +import java.time.LocalDate; +import java.time.LocalDateTime; +import java.time.LocalTime; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.function.BiFunction; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import org.apache.flink.table.expressions.CallExpression; +import org.apache.flink.table.expressions.FieldReferenceExpression; +import org.apache.flink.table.expressions.ResolvedExpression; +import org.apache.flink.table.expressions.ValueLiteralExpression; +import org.apache.flink.table.functions.BuiltInFunctionDefinitions; +import org.apache.flink.table.functions.FunctionDefinition; +import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.expressions.Expression.Operation; +import org.apache.iceberg.expressions.Expressions; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.util.DateTimeUtil; +import org.apache.iceberg.util.NaNUtil; + +public class FlinkFilters { + private FlinkFilters() {} + + private static final Pattern STARTS_WITH_PATTERN = Pattern.compile("([^%]+)%"); + + private static final Map FILTERS = + ImmutableMap.builder() + .put(BuiltInFunctionDefinitions.EQUALS, Operation.EQ) + .put(BuiltInFunctionDefinitions.NOT_EQUALS, Operation.NOT_EQ) + .put(BuiltInFunctionDefinitions.GREATER_THAN, Operation.GT) + .put(BuiltInFunctionDefinitions.GREATER_THAN_OR_EQUAL, Operation.GT_EQ) + .put(BuiltInFunctionDefinitions.LESS_THAN, Operation.LT) + .put(BuiltInFunctionDefinitions.LESS_THAN_OR_EQUAL, Operation.LT_EQ) + .put(BuiltInFunctionDefinitions.IS_NULL, Operation.IS_NULL) + .put(BuiltInFunctionDefinitions.IS_NOT_NULL, Operation.NOT_NULL) + .put(BuiltInFunctionDefinitions.AND, Operation.AND) + .put(BuiltInFunctionDefinitions.OR, Operation.OR) + .put(BuiltInFunctionDefinitions.NOT, Operation.NOT) + .put(BuiltInFunctionDefinitions.LIKE, Operation.STARTS_WITH) + .buildOrThrow(); + + /** + * Convert flink expression to iceberg expression. + * + *
the BETWEEN, NOT_BETWEEN, IN expression will be converted by flink automatically. the + * BETWEEN will be converted to (GT_EQ AND LT_EQ), the NOT_BETWEEN will be converted to (LT_EQ OR + * GT_EQ), the IN will be converted to OR, so we do not add the conversion here + * + * @param flinkExpression the flink expression + * @return the iceberg expression + */ + public static Optional convert( + org.apache.flink.table.expressions.Expression flinkExpression) { + if (!(flinkExpression instanceof CallExpression)) { + return Optional.empty(); + } + + CallExpression call = (CallExpression) flinkExpression; + Operation op = FILTERS.get(call.getFunctionDefinition()); + if (op != null) { + switch (op) { + case IS_NULL: + return onlyChildAs(call, FieldReferenceExpression.class) + .map(FieldReferenceExpression::getName) + .map(Expressions::isNull); + + case NOT_NULL: + return onlyChildAs(call, FieldReferenceExpression.class) + .map(FieldReferenceExpression::getName) + .map(Expressions::notNull); + + case LT: + return convertFieldAndLiteral(Expressions::lessThan, Expressions::greaterThan, call); + + case LT_EQ: + return convertFieldAndLiteral( + Expressions::lessThanOrEqual, Expressions::greaterThanOrEqual, call); + + case GT: + return convertFieldAndLiteral(Expressions::greaterThan, Expressions::lessThan, call); + + case GT_EQ: + return convertFieldAndLiteral( + Expressions::greaterThanOrEqual, Expressions::lessThanOrEqual, call); + + case EQ: + return convertFieldAndLiteral( + (ref, lit) -> { + if (NaNUtil.isNaN(lit)) { + return Expressions.isNaN(ref); + } else { + return Expressions.equal(ref, lit); + } + }, + call); + + case NOT_EQ: + return convertFieldAndLiteral( + (ref, lit) -> { + if (NaNUtil.isNaN(lit)) { + return Expressions.notNaN(ref); + } else { + return Expressions.notEqual(ref, lit); + } + }, + call); + + case NOT: + return onlyChildAs(call, CallExpression.class) + .flatMap(FlinkFilters::convert) + .map(Expressions::not); + + case AND: + return convertLogicExpression(Expressions::and, call); + + case OR: + return convertLogicExpression(Expressions::or, call); + + case STARTS_WITH: + return convertLike(call); + } + } + + return Optional.empty(); + } + + private static Optional onlyChildAs( + CallExpression call, Class expectedChildClass) { + List children = call.getResolvedChildren(); + if (children.size() != 1) { + return Optional.empty(); + } + + ResolvedExpression child = children.get(0); + if (!expectedChildClass.isInstance(child)) { + return Optional.empty(); + } + + return Optional.of(expectedChildClass.cast(child)); + } + + private static Optional convertLike(CallExpression call) { + List args = call.getResolvedChildren(); + if (args.size() != 2) { + return Optional.empty(); + } + + org.apache.flink.table.expressions.Expression left = args.get(0); + org.apache.flink.table.expressions.Expression right = args.get(1); + + if (left instanceof FieldReferenceExpression && right instanceof ValueLiteralExpression) { + String name = ((FieldReferenceExpression) left).getName(); + return convertLiteral((ValueLiteralExpression) right) + .flatMap( + lit -> { + if (lit instanceof String) { + String pattern = (String) lit; + Matcher matcher = STARTS_WITH_PATTERN.matcher(pattern); + // exclude special char of LIKE + // '_' is the wildcard of the SQL LIKE + if (!pattern.contains("_") && matcher.matches()) { + return Optional.of(Expressions.startsWith(name, matcher.group(1))); + } + } + + return Optional.empty(); + }); + } + + return Optional.empty(); + } + + private static Optional 
convertLogicExpression( + BiFunction function, CallExpression call) { + List args = call.getResolvedChildren(); + if (args == null || args.size() != 2) { + return Optional.empty(); + } + + Optional left = convert(args.get(0)); + Optional right = convert(args.get(1)); + if (left.isPresent() && right.isPresent()) { + return Optional.of(function.apply(left.get(), right.get())); + } + + return Optional.empty(); + } + + private static Optional convertLiteral(ValueLiteralExpression expression) { + Optional value = + expression.getValueAs( + expression.getOutputDataType().getLogicalType().getDefaultConversion()); + return value.map( + o -> { + if (o instanceof LocalDateTime) { + return DateTimeUtil.microsFromTimestamp((LocalDateTime) o); + } else if (o instanceof Instant) { + return DateTimeUtil.microsFromInstant((Instant) o); + } else if (o instanceof LocalTime) { + return DateTimeUtil.microsFromTime((LocalTime) o); + } else if (o instanceof LocalDate) { + return DateTimeUtil.daysFromDate((LocalDate) o); + } + + return o; + }); + } + + private static Optional convertFieldAndLiteral( + BiFunction expr, CallExpression call) { + return convertFieldAndLiteral(expr, expr, call); + } + + private static Optional convertFieldAndLiteral( + BiFunction convertLR, + BiFunction convertRL, + CallExpression call) { + List args = call.getResolvedChildren(); + if (args.size() != 2) { + return Optional.empty(); + } + + org.apache.flink.table.expressions.Expression left = args.get(0); + org.apache.flink.table.expressions.Expression right = args.get(1); + + if (left instanceof FieldReferenceExpression && right instanceof ValueLiteralExpression) { + String name = ((FieldReferenceExpression) left).getName(); + Optional lit = convertLiteral((ValueLiteralExpression) right); + if (lit.isPresent()) { + return Optional.of(convertLR.apply(name, lit.get())); + } + } else if (left instanceof ValueLiteralExpression + && right instanceof FieldReferenceExpression) { + Optional lit = convertLiteral((ValueLiteralExpression) left); + String name = ((FieldReferenceExpression) right).getName(); + if (lit.isPresent()) { + return Optional.of(convertRL.apply(name, lit.get())); + } + } + + return Optional.empty(); + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkFixupTypes.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkFixupTypes.java new file mode 100644 index 000000000000..767d4497ac91 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkFixupTypes.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink; + +import org.apache.iceberg.Schema; +import org.apache.iceberg.types.FixupTypes; +import org.apache.iceberg.types.Type; +import org.apache.iceberg.types.TypeUtil; +import org.apache.iceberg.types.Types; + +/** + * The uuid and fixed are converted to the same Flink type. Conversion back can produce only one, + * which may not be correct. + */ +class FlinkFixupTypes extends FixupTypes { + + private FlinkFixupTypes(Schema referenceSchema) { + super(referenceSchema); + } + + static Schema fixup(Schema schema, Schema referenceSchema) { + return new Schema( + TypeUtil.visit(schema, new FlinkFixupTypes(referenceSchema)).asStructType().fields()); + } + + @Override + protected boolean fixupPrimitive(Type.PrimitiveType type, Type source) { + if (type instanceof Types.FixedType) { + int length = ((Types.FixedType) type).length(); + return source.typeId() == Type.TypeID.UUID && length == 16; + } + return false; + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkReadConf.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkReadConf.java new file mode 100644 index 000000000000..804a956ec9b9 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkReadConf.java @@ -0,0 +1,213 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink; + +import java.time.Duration; +import java.util.Map; +import java.util.concurrent.TimeUnit; +import org.apache.flink.configuration.ReadableConfig; +import org.apache.flink.util.TimeUtils; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.flink.source.StreamingStartingStrategy; + +public class FlinkReadConf { + + private final FlinkConfParser confParser; + + public FlinkReadConf( + Table table, Map readOptions, ReadableConfig readableConfig) { + this.confParser = new FlinkConfParser(table, readOptions, readableConfig); + } + + public Long snapshotId() { + return confParser.longConf().option(FlinkReadOptions.SNAPSHOT_ID.key()).parseOptional(); + } + + public String tag() { + return confParser.stringConf().option(FlinkReadOptions.TAG.key()).parseOptional(); + } + + public String startTag() { + return confParser.stringConf().option(FlinkReadOptions.START_TAG.key()).parseOptional(); + } + + public String endTag() { + return confParser.stringConf().option(FlinkReadOptions.END_TAG.key()).parseOptional(); + } + + public String branch() { + return confParser.stringConf().option(FlinkReadOptions.BRANCH.key()).parseOptional(); + } + + public boolean caseSensitive() { + return confParser + .booleanConf() + .option(FlinkReadOptions.CASE_SENSITIVE) + .flinkConfig(FlinkReadOptions.CASE_SENSITIVE_OPTION) + .defaultValue(FlinkReadOptions.CASE_SENSITIVE_OPTION.defaultValue()) + .parse(); + } + + public Long asOfTimestamp() { + return confParser.longConf().option(FlinkReadOptions.AS_OF_TIMESTAMP.key()).parseOptional(); + } + + public StreamingStartingStrategy startingStrategy() { + return confParser + .enumConfParser(StreamingStartingStrategy.class) + .option(FlinkReadOptions.STARTING_STRATEGY) + .flinkConfig(FlinkReadOptions.STARTING_STRATEGY_OPTION) + .defaultValue(StreamingStartingStrategy.INCREMENTAL_FROM_LATEST_SNAPSHOT) + .parse(); + } + + public Long startSnapshotTimestamp() { + return confParser + .longConf() + .option(FlinkReadOptions.START_SNAPSHOT_TIMESTAMP.key()) + .parseOptional(); + } + + public Long startSnapshotId() { + return confParser.longConf().option(FlinkReadOptions.START_SNAPSHOT_ID.key()).parseOptional(); + } + + public Long endSnapshotId() { + return confParser.longConf().option(FlinkReadOptions.END_SNAPSHOT_ID.key()).parseOptional(); + } + + public long splitSize() { + return confParser + .longConf() + .option(FlinkReadOptions.SPLIT_SIZE) + .flinkConfig(FlinkReadOptions.SPLIT_SIZE_OPTION) + .tableProperty(TableProperties.SPLIT_SIZE) + .defaultValue(TableProperties.SPLIT_SIZE_DEFAULT) + .parse(); + } + + public int splitLookback() { + return confParser + .intConf() + .option(FlinkReadOptions.SPLIT_LOOKBACK) + .flinkConfig(FlinkReadOptions.SPLIT_LOOKBACK_OPTION) + .tableProperty(TableProperties.SPLIT_LOOKBACK) + .defaultValue(TableProperties.SPLIT_LOOKBACK_DEFAULT) + .parse(); + } + + public long splitFileOpenCost() { + return confParser + .longConf() + .option(FlinkReadOptions.SPLIT_FILE_OPEN_COST) + .flinkConfig(FlinkReadOptions.SPLIT_FILE_OPEN_COST_OPTION) + .tableProperty(TableProperties.SPLIT_OPEN_FILE_COST) + .defaultValue(TableProperties.SPLIT_OPEN_FILE_COST_DEFAULT) + .parse(); + } + + public boolean streaming() { + return confParser + .booleanConf() + .option(FlinkReadOptions.STREAMING) + .flinkConfig(FlinkReadOptions.STREAMING_OPTION) + .defaultValue(FlinkReadOptions.STREAMING_OPTION.defaultValue()) + .parse(); + } + + public Duration monitorInterval() { + String duration = + 
confParser + .stringConf() + .option(FlinkReadOptions.MONITOR_INTERVAL) + .flinkConfig(FlinkReadOptions.MONITOR_INTERVAL_OPTION) + .defaultValue(FlinkReadOptions.MONITOR_INTERVAL_OPTION.defaultValue()) + .parse(); + + return TimeUtils.parseDuration(duration); + } + + public boolean includeColumnStats() { + return confParser + .booleanConf() + .option(FlinkReadOptions.INCLUDE_COLUMN_STATS) + .flinkConfig(FlinkReadOptions.INCLUDE_COLUMN_STATS_OPTION) + .defaultValue(FlinkReadOptions.INCLUDE_COLUMN_STATS_OPTION.defaultValue()) + .parse(); + } + + public int maxPlanningSnapshotCount() { + return confParser + .intConf() + .option(FlinkReadOptions.MAX_PLANNING_SNAPSHOT_COUNT) + .flinkConfig(FlinkReadOptions.MAX_PLANNING_SNAPSHOT_COUNT_OPTION) + .defaultValue(FlinkReadOptions.MAX_PLANNING_SNAPSHOT_COUNT_OPTION.defaultValue()) + .parse(); + } + + public String nameMapping() { + return confParser.stringConf().option(TableProperties.DEFAULT_NAME_MAPPING).parseOptional(); + } + + public long limit() { + return confParser + .longConf() + .option(FlinkReadOptions.LIMIT) + .flinkConfig(FlinkReadOptions.LIMIT_OPTION) + .defaultValue(FlinkReadOptions.LIMIT_OPTION.defaultValue()) + .parse(); + } + + public int workerPoolSize() { + return confParser + .intConf() + .option(FlinkConfigOptions.TABLE_EXEC_ICEBERG_WORKER_POOL_SIZE.key()) + .flinkConfig(FlinkConfigOptions.TABLE_EXEC_ICEBERG_WORKER_POOL_SIZE) + .defaultValue(FlinkConfigOptions.TABLE_EXEC_ICEBERG_WORKER_POOL_SIZE.defaultValue()) + .parse(); + } + + public int maxAllowedPlanningFailures() { + return confParser + .intConf() + .option(FlinkReadOptions.MAX_ALLOWED_PLANNING_FAILURES) + .flinkConfig(FlinkReadOptions.MAX_ALLOWED_PLANNING_FAILURES_OPTION) + .defaultValue(FlinkReadOptions.MAX_ALLOWED_PLANNING_FAILURES_OPTION.defaultValue()) + .parse(); + } + + public String watermarkColumn() { + return confParser + .stringConf() + .option(FlinkReadOptions.WATERMARK_COLUMN) + .flinkConfig(FlinkReadOptions.WATERMARK_COLUMN_OPTION) + .defaultValue(FlinkReadOptions.WATERMARK_COLUMN_OPTION.defaultValue()) + .parseOptional(); + } + + public TimeUnit watermarkColumnTimeUnit() { + return confParser + .enumConfParser(TimeUnit.class) + .option(FlinkReadOptions.WATERMARK_COLUMN_TIME_UNIT) + .flinkConfig(FlinkReadOptions.WATERMARK_COLUMN_TIME_UNIT_OPTION) + .defaultValue(FlinkReadOptions.WATERMARK_COLUMN_TIME_UNIT_OPTION.defaultValue()) + .parse(); + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkReadOptions.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkReadOptions.java new file mode 100644 index 000000000000..1bbd88146c8f --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkReadOptions.java @@ -0,0 +1,123 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink; + +import java.util.concurrent.TimeUnit; +import org.apache.flink.configuration.ConfigOption; +import org.apache.flink.configuration.ConfigOptions; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.flink.source.StreamingStartingStrategy; + +/** Flink source read options */ +public class FlinkReadOptions { + private static final String PREFIX = "connector.iceberg."; + + private FlinkReadOptions() {} + + public static final ConfigOption SNAPSHOT_ID = + ConfigOptions.key("snapshot-id").longType().defaultValue(null); + + public static final ConfigOption TAG = + ConfigOptions.key("tag").stringType().defaultValue(null); + + public static final ConfigOption BRANCH = + ConfigOptions.key("branch").stringType().defaultValue(null); + + public static final ConfigOption START_TAG = + ConfigOptions.key("start-tag").stringType().defaultValue(null); + + public static final ConfigOption END_TAG = + ConfigOptions.key("end-tag").stringType().defaultValue(null); + + public static final String CASE_SENSITIVE = "case-sensitive"; + public static final ConfigOption CASE_SENSITIVE_OPTION = + ConfigOptions.key(PREFIX + CASE_SENSITIVE).booleanType().defaultValue(false); + + public static final ConfigOption AS_OF_TIMESTAMP = + ConfigOptions.key("as-of-timestamp").longType().defaultValue(null); + + public static final String STARTING_STRATEGY = "starting-strategy"; + public static final ConfigOption STARTING_STRATEGY_OPTION = + ConfigOptions.key(PREFIX + STARTING_STRATEGY) + .enumType(StreamingStartingStrategy.class) + .defaultValue(StreamingStartingStrategy.INCREMENTAL_FROM_LATEST_SNAPSHOT); + + public static final ConfigOption START_SNAPSHOT_TIMESTAMP = + ConfigOptions.key("start-snapshot-timestamp").longType().defaultValue(null); + + public static final ConfigOption START_SNAPSHOT_ID = + ConfigOptions.key("start-snapshot-id").longType().defaultValue(null); + + public static final ConfigOption END_SNAPSHOT_ID = + ConfigOptions.key("end-snapshot-id").longType().defaultValue(null); + + public static final String SPLIT_SIZE = "split-size"; + public static final ConfigOption SPLIT_SIZE_OPTION = + ConfigOptions.key(PREFIX + SPLIT_SIZE) + .longType() + .defaultValue(TableProperties.SPLIT_SIZE_DEFAULT); + + public static final String SPLIT_LOOKBACK = "split-lookback"; + public static final ConfigOption SPLIT_LOOKBACK_OPTION = + ConfigOptions.key(PREFIX + SPLIT_LOOKBACK) + .intType() + .defaultValue(TableProperties.SPLIT_LOOKBACK_DEFAULT); + + public static final String SPLIT_FILE_OPEN_COST = "split-file-open-cost"; + public static final ConfigOption SPLIT_FILE_OPEN_COST_OPTION = + ConfigOptions.key(PREFIX + SPLIT_FILE_OPEN_COST) + .longType() + .defaultValue(TableProperties.SPLIT_OPEN_FILE_COST_DEFAULT); + + public static final String STREAMING = "streaming"; + public static final ConfigOption STREAMING_OPTION = + ConfigOptions.key(PREFIX + STREAMING).booleanType().defaultValue(false); + + public static final String MONITOR_INTERVAL = "monitor-interval"; + public static final ConfigOption MONITOR_INTERVAL_OPTION = + ConfigOptions.key(PREFIX + MONITOR_INTERVAL).stringType().defaultValue("60s"); + + public static final String INCLUDE_COLUMN_STATS = "include-column-stats"; + public static final ConfigOption INCLUDE_COLUMN_STATS_OPTION = + ConfigOptions.key(PREFIX + INCLUDE_COLUMN_STATS).booleanType().defaultValue(false); + + public static final String 
MAX_PLANNING_SNAPSHOT_COUNT = "max-planning-snapshot-count"; + public static final ConfigOption MAX_PLANNING_SNAPSHOT_COUNT_OPTION = + ConfigOptions.key(PREFIX + MAX_PLANNING_SNAPSHOT_COUNT) + .intType() + .defaultValue(Integer.MAX_VALUE); + + public static final String LIMIT = "limit"; + public static final ConfigOption LIMIT_OPTION = + ConfigOptions.key(PREFIX + LIMIT).longType().defaultValue(-1L); + + public static final String MAX_ALLOWED_PLANNING_FAILURES = "max-allowed-planning-failures"; + public static final ConfigOption MAX_ALLOWED_PLANNING_FAILURES_OPTION = + ConfigOptions.key(PREFIX + MAX_ALLOWED_PLANNING_FAILURES).intType().defaultValue(3); + + public static final String WATERMARK_COLUMN = "watermark-column"; + public static final ConfigOption WATERMARK_COLUMN_OPTION = + ConfigOptions.key(PREFIX + WATERMARK_COLUMN).stringType().noDefaultValue(); + + public static final String WATERMARK_COLUMN_TIME_UNIT = "watermark-column-time-unit"; + public static final ConfigOption WATERMARK_COLUMN_TIME_UNIT_OPTION = + ConfigOptions.key(PREFIX + WATERMARK_COLUMN_TIME_UNIT) + .enumType(TimeUnit.class) + .defaultValue(TimeUnit.MICROSECONDS); +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkSchemaUtil.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkSchemaUtil.java new file mode 100644 index 000000000000..4790dc85bf28 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkSchemaUtil.java @@ -0,0 +1,232 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink; + +import java.util.List; +import java.util.Set; +import org.apache.flink.table.api.DataTypes; +import org.apache.flink.table.api.TableSchema; +import org.apache.flink.table.catalog.Column; +import org.apache.flink.table.catalog.ResolvedSchema; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.table.types.utils.TypeConversions; +import org.apache.iceberg.Schema; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Sets; +import org.apache.iceberg.types.Type; +import org.apache.iceberg.types.TypeUtil; +import org.apache.iceberg.types.Types; + +/** + * Converter between Flink types and Iceberg type. The conversion is not a 1:1 mapping that not + * allows back-and-forth conversion. So some information might get lost during the back-and-forth + * conversion. + * + *
+ * <p>This inconsistent types:
+ *
+ * <ul>
+ *   <li>map Iceberg UUID type to Flink BinaryType(16)
+ *   <li>map Flink VarCharType(_) and CharType(_) to Iceberg String type
+ *   <li>map Flink VarBinaryType(_) to Iceberg Binary type
+ *   <li>map Flink TimeType(_) to Iceberg Time type (microseconds)
+ *   <li>map Flink TimestampType(_) to Iceberg Timestamp without zone type (microseconds)
+ *   <li>map Flink LocalZonedTimestampType(_) to Iceberg Timestamp with zone type (microseconds)
+ *   <li>map Flink MultiSetType to Iceberg Map type(element, int)
+ * </ul>
+ *
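A minimal sketch of one of the lossy mappings listed above, using only the public FlinkSchemaUtil.convert overloads defined in this class (the wrapper class name is illustrative): a Flink TINYINT widens to an Iceberg integer, so it round-trips back as INT rather than TINYINT.

    import org.apache.flink.table.types.logical.LogicalType;
    import org.apache.flink.table.types.logical.TinyIntType;
    import org.apache.iceberg.flink.FlinkSchemaUtil;
    import org.apache.iceberg.types.Type;

    public class SchemaRoundTripSketch {
      public static void main(String[] args) {
        // TINYINT has no Iceberg counterpart, so FlinkTypeToType maps it to an integer;
        // converting that integer back produces INT, not the original TINYINT.
        Type icebergType = FlinkSchemaUtil.convert(new TinyIntType());
        LogicalType backToFlink = FlinkSchemaUtil.convert(icebergType);
        System.out.println(icebergType + " -> " + backToFlink);
      }
    }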
+ */ +public class FlinkSchemaUtil { + + private FlinkSchemaUtil() {} + + /** @deprecated Use {@link #convert(ResolvedSchema)} instead. */ + @Deprecated + public static Schema convert(TableSchema schema) { + LogicalType schemaType = schema.toRowDataType().getLogicalType(); + Preconditions.checkArgument( + schemaType instanceof RowType, "Schema logical type should be row type."); + + RowType root = (RowType) schemaType; + Type converted = root.accept(new FlinkTypeToType(root)); + + Schema icebergSchema = new Schema(converted.asStructType().fields()); + if (schema.getPrimaryKey().isPresent()) { + return freshIdentifierFieldIds(icebergSchema, schema.getPrimaryKey().get().getColumns()); + } else { + return icebergSchema; + } + } + + /** Convert the flink table schema to apache iceberg schema with column comment. */ + public static Schema convert(ResolvedSchema flinkSchema) { + List tableColumns = flinkSchema.getColumns(); + // copy from org.apache.flink.table.api.Schema#toRowDataType + DataTypes.Field[] fields = + tableColumns.stream() + .map( + column -> { + if (column.getComment().isPresent()) { + return DataTypes.FIELD( + column.getName(), column.getDataType(), column.getComment().get()); + } else { + return DataTypes.FIELD(column.getName(), column.getDataType()); + } + }) + .toArray(DataTypes.Field[]::new); + + LogicalType schemaType = DataTypes.ROW(fields).notNull().getLogicalType(); + Preconditions.checkArgument( + schemaType instanceof RowType, "Schema logical type should be row type."); + + RowType root = (RowType) schemaType; + Type converted = root.accept(new FlinkTypeToType(root)); + Schema icebergSchema = new Schema(converted.asStructType().fields()); + if (flinkSchema.getPrimaryKey().isPresent()) { + return freshIdentifierFieldIds(icebergSchema, flinkSchema.getPrimaryKey().get().getColumns()); + } else { + return icebergSchema; + } + } + + private static Schema freshIdentifierFieldIds(Schema icebergSchema, List primaryKeys) { + // Locate the identifier field id list. + Set identifierFieldIds = Sets.newHashSet(); + for (String primaryKey : primaryKeys) { + Types.NestedField field = icebergSchema.findField(primaryKey); + Preconditions.checkNotNull( + field, + "Cannot find field ID for the primary key column %s in schema %s", + primaryKey, + icebergSchema); + identifierFieldIds.add(field.fieldId()); + } + return new Schema( + icebergSchema.schemaId(), icebergSchema.asStruct().fields(), identifierFieldIds); + } + + /** + * Convert a Flink {@link TableSchema} to a {@link Schema} based on the given schema. + * + *
This conversion does not assign new ids; it uses ids from the base schema. + * + *
Data types, field order, and nullability will match the Flink type. This conversion may + * return a schema that is not compatible with base schema. + * + * @param baseSchema a Schema on which conversion is based + * @param flinkSchema a Flink TableSchema + * @return the equivalent Schema + * @throws IllegalArgumentException if the type cannot be converted or there are missing ids + */ + public static Schema convert(Schema baseSchema, TableSchema flinkSchema) { + // convert to a type with fresh ids + Types.StructType struct = convert(flinkSchema).asStruct(); + // reassign ids to match the base schema + Schema schema = TypeUtil.reassignIds(new Schema(struct.fields()), baseSchema); + // reassign doc to match the base schema + schema = TypeUtil.reassignDoc(schema, baseSchema); + + // fix types that can't be represented in Flink (UUID) + Schema fixedSchema = FlinkFixupTypes.fixup(schema, baseSchema); + if (flinkSchema.getPrimaryKey().isPresent()) { + return freshIdentifierFieldIds(fixedSchema, flinkSchema.getPrimaryKey().get().getColumns()); + } else { + return fixedSchema; + } + } + + /** + * Convert a {@link Schema} to a {@link RowType Flink type}. + * + * @param schema a Schema + * @return the equivalent Flink type + * @throws IllegalArgumentException if the type cannot be converted to Flink + */ + public static RowType convert(Schema schema) { + return (RowType) TypeUtil.visit(schema, new TypeToFlinkType()); + } + + /** + * Convert a {@link Type} to a {@link LogicalType Flink type}. + * + * @param type a Type + * @return the equivalent Flink type + * @throws IllegalArgumentException if the type cannot be converted to Flink + */ + public static LogicalType convert(Type type) { + return TypeUtil.visit(type, new TypeToFlinkType()); + } + + /** + * Convert a {@link LogicalType Flink type} to a {@link Type}. + * + * @param flinkType a FlinkType + * @return the equivalent Iceberg type + */ + public static Type convert(LogicalType flinkType) { + return flinkType.accept(new FlinkTypeToType()); + } + + /** + * Convert a {@link RowType} to a {@link TableSchema}. + * + * @param rowType a RowType + * @return Flink TableSchema + */ + public static TableSchema toSchema(RowType rowType) { + TableSchema.Builder builder = TableSchema.builder(); + for (RowType.RowField field : rowType.getFields()) { + builder.field(field.getName(), TypeConversions.fromLogicalToDataType(field.getType())); + } + return builder.build(); + } + + /** + * Convert a {@link Schema} to a {@link TableSchema}. + * + * @param schema iceberg schema to convert. + * @return Flink TableSchema. + */ + public static TableSchema toSchema(Schema schema) { + TableSchema.Builder builder = TableSchema.builder(); + + // Add columns. + for (RowType.RowField field : convert(schema).getFields()) { + builder.field(field.getName(), TypeConversions.fromLogicalToDataType(field.getType())); + } + + // Add primary key. 
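+ // Iceberg identifier fields are exposed as the Flink primary key; every identifier field id
+ // must resolve to a column name in this schema, otherwise the checkNotNull below fails.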
+ Set identifierFieldIds = schema.identifierFieldIds(); + if (!identifierFieldIds.isEmpty()) { + List columns = Lists.newArrayListWithExpectedSize(identifierFieldIds.size()); + for (Integer identifierFieldId : identifierFieldIds) { + String columnName = schema.findColumnName(identifierFieldId); + Preconditions.checkNotNull( + columnName, "Cannot find field with id %s in schema %s", identifierFieldId, schema); + + columns.add(columnName); + } + builder.primaryKey(columns.toArray(new String[0])); + } + + return builder.build(); + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkSourceFilter.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkSourceFilter.java new file mode 100644 index 000000000000..5fbd84909d69 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkSourceFilter.java @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink; + +import org.apache.flink.api.common.functions.FilterFunction; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.Schema; +import org.apache.iceberg.expressions.Evaluator; +import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.types.Types; + +public class FlinkSourceFilter implements FilterFunction { + + private final RowType rowType; + private final Evaluator evaluator; + private final Types.StructType struct; + private volatile RowDataWrapper wrapper; + + public FlinkSourceFilter(Schema schema, Expression expr, boolean caseSensitive) { + this.rowType = FlinkSchemaUtil.convert(schema); + this.struct = schema.asStruct(); + this.evaluator = new Evaluator(struct, expr, caseSensitive); + } + + @Override + public boolean filter(RowData value) { + if (wrapper == null) { + this.wrapper = new RowDataWrapper(rowType, struct); + } + return evaluator.eval(wrapper.wrap(value)); + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeToType.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeToType.java new file mode 100644 index 000000000000..408065f06057 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeToType.java @@ -0,0 +1,203 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink; + +import java.util.List; +import java.util.stream.Collectors; +import org.apache.flink.table.types.logical.ArrayType; +import org.apache.flink.table.types.logical.BigIntType; +import org.apache.flink.table.types.logical.BinaryType; +import org.apache.flink.table.types.logical.BooleanType; +import org.apache.flink.table.types.logical.CharType; +import org.apache.flink.table.types.logical.DateType; +import org.apache.flink.table.types.logical.DecimalType; +import org.apache.flink.table.types.logical.DoubleType; +import org.apache.flink.table.types.logical.FloatType; +import org.apache.flink.table.types.logical.IntType; +import org.apache.flink.table.types.logical.LocalZonedTimestampType; +import org.apache.flink.table.types.logical.MapType; +import org.apache.flink.table.types.logical.MultisetType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.table.types.logical.SmallIntType; +import org.apache.flink.table.types.logical.TimeType; +import org.apache.flink.table.types.logical.TimestampType; +import org.apache.flink.table.types.logical.TinyIntType; +import org.apache.flink.table.types.logical.VarBinaryType; +import org.apache.flink.table.types.logical.VarCharType; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.types.Type; +import org.apache.iceberg.types.Types; + +class FlinkTypeToType extends FlinkTypeVisitor { + + private final RowType root; + private int nextId; + + FlinkTypeToType() { + this.root = null; + } + + FlinkTypeToType(RowType root) { + this.root = root; + // the root struct's fields use the first ids + this.nextId = root.getFieldCount(); + } + + private int getNextId() { + int next = nextId; + nextId += 1; + return next; + } + + @Override + public Type visit(CharType charType) { + return Types.StringType.get(); + } + + @Override + public Type visit(VarCharType varCharType) { + return Types.StringType.get(); + } + + @Override + public Type visit(BooleanType booleanType) { + return Types.BooleanType.get(); + } + + @Override + public Type visit(BinaryType binaryType) { + return Types.FixedType.ofLength(binaryType.getLength()); + } + + @Override + public Type visit(VarBinaryType varBinaryType) { + return Types.BinaryType.get(); + } + + @Override + public Type visit(DecimalType decimalType) { + return Types.DecimalType.of(decimalType.getPrecision(), decimalType.getScale()); + } + + @Override + public Type visit(TinyIntType tinyIntType) { + return Types.IntegerType.get(); + } + + @Override + public Type visit(SmallIntType smallIntType) { + return Types.IntegerType.get(); + } + + @Override + public Type visit(IntType intType) { + return Types.IntegerType.get(); + } + + @Override + public Type visit(BigIntType bigIntType) { + return Types.LongType.get(); + } + + @Override + public Type visit(FloatType floatType) { + return Types.FloatType.get(); + } + + @Override + public Type visit(DoubleType doubleType) { + return Types.DoubleType.get(); + } + + @Override + public Type visit(DateType dateType) { + return Types.DateType.get(); + } + + @Override + 
public Type visit(TimeType timeType) { + return Types.TimeType.get(); + } + + @Override + public Type visit(TimestampType timestampType) { + return Types.TimestampType.withoutZone(); + } + + @Override + public Type visit(LocalZonedTimestampType localZonedTimestampType) { + return Types.TimestampType.withZone(); + } + + @Override + public Type visit(ArrayType arrayType) { + Type elementType = arrayType.getElementType().accept(this); + if (arrayType.getElementType().isNullable()) { + return Types.ListType.ofOptional(getNextId(), elementType); + } else { + return Types.ListType.ofRequired(getNextId(), elementType); + } + } + + @Override + public Type visit(MultisetType multisetType) { + Type elementType = multisetType.getElementType().accept(this); + return Types.MapType.ofRequired(getNextId(), getNextId(), elementType, Types.IntegerType.get()); + } + + @Override + public Type visit(MapType mapType) { + // keys in map are not allowed to be null. + Type keyType = mapType.getKeyType().accept(this); + Type valueType = mapType.getValueType().accept(this); + if (mapType.getValueType().isNullable()) { + return Types.MapType.ofOptional(getNextId(), getNextId(), keyType, valueType); + } else { + return Types.MapType.ofRequired(getNextId(), getNextId(), keyType, valueType); + } + } + + @Override + @SuppressWarnings("ReferenceEquality") + public Type visit(RowType rowType) { + List newFields = Lists.newArrayListWithExpectedSize(rowType.getFieldCount()); + boolean isRoot = root == rowType; + + List types = + rowType.getFields().stream() + .map(f -> f.getType().accept(this)) + .collect(Collectors.toList()); + + for (int i = 0; i < rowType.getFieldCount(); i++) { + int id = isRoot ? i : getNextId(); + + RowType.RowField field = rowType.getFields().get(i); + String name = field.getName(); + String comment = field.getDescription().orElse(null); + + if (field.getType().isNullable()) { + newFields.add(Types.NestedField.optional(id, name, types.get(i), comment)); + } else { + newFields.add(Types.NestedField.required(id, name, types.get(i), comment)); + } + } + + return Types.StructType.of(newFields); + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeVisitor.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeVisitor.java new file mode 100644 index 000000000000..f3de2416088c --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeVisitor.java @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink; + +import org.apache.flink.table.types.logical.DayTimeIntervalType; +import org.apache.flink.table.types.logical.DistinctType; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.LogicalTypeVisitor; +import org.apache.flink.table.types.logical.NullType; +import org.apache.flink.table.types.logical.RawType; +import org.apache.flink.table.types.logical.StructuredType; +import org.apache.flink.table.types.logical.SymbolType; +import org.apache.flink.table.types.logical.YearMonthIntervalType; +import org.apache.flink.table.types.logical.ZonedTimestampType; + +public abstract class FlinkTypeVisitor implements LogicalTypeVisitor { + + // ------------------------- Unsupported types ------------------------------ + + @Override + public T visit(ZonedTimestampType zonedTimestampType) { + throw new UnsupportedOperationException("Unsupported ZonedTimestampType."); + } + + @Override + public T visit(YearMonthIntervalType yearMonthIntervalType) { + throw new UnsupportedOperationException("Unsupported YearMonthIntervalType."); + } + + @Override + public T visit(DayTimeIntervalType dayTimeIntervalType) { + throw new UnsupportedOperationException("Unsupported DayTimeIntervalType."); + } + + @Override + public T visit(DistinctType distinctType) { + throw new UnsupportedOperationException("Unsupported DistinctType."); + } + + @Override + public T visit(StructuredType structuredType) { + throw new UnsupportedOperationException("Unsupported StructuredType."); + } + + @Override + public T visit(NullType nullType) { + throw new UnsupportedOperationException("Unsupported NullType."); + } + + @Override + public T visit(RawType rawType) { + throw new UnsupportedOperationException("Unsupported RawType."); + } + + @Override + public T visit(SymbolType symbolType) { + throw new UnsupportedOperationException("Unsupported SymbolType."); + } + + @Override + public T visit(LogicalType other) { + throw new UnsupportedOperationException("Unsupported type: " + other); + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteConf.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteConf.java new file mode 100644 index 000000000000..ca7b1120bc81 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteConf.java @@ -0,0 +1,205 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink; + +import java.time.Duration; +import java.util.Map; +import org.apache.flink.annotation.Experimental; +import org.apache.flink.configuration.ReadableConfig; +import org.apache.iceberg.DistributionMode; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableProperties; + +/** + * A class for common Iceberg configs for Flink writes. + * + *

If a config is set at multiple levels, the following order of precedence is used (top to + * bottom): + * + *

    + *
+ * <ol>
+ *   <li>Write options
+ *   <li>flink ReadableConfig
+ *   <li>Table metadata
+ * </ol>
+ * + * The most specific value is set in write options and takes precedence over all other configs. If + * no write option is provided, this class checks the flink configuration for any overrides. If no + * applicable value is found in the write options, this class uses the table metadata. + * + *

Note this class is NOT meant to be serialized. + */ +public class FlinkWriteConf { + + private final FlinkConfParser confParser; + + public FlinkWriteConf( + Table table, Map writeOptions, ReadableConfig readableConfig) { + this.confParser = new FlinkConfParser(table, writeOptions, readableConfig); + } + + public boolean overwriteMode() { + return confParser + .booleanConf() + .option(FlinkWriteOptions.OVERWRITE_MODE.key()) + .flinkConfig(FlinkWriteOptions.OVERWRITE_MODE) + .defaultValue(FlinkWriteOptions.OVERWRITE_MODE.defaultValue()) + .parse(); + } + + public boolean upsertMode() { + return confParser + .booleanConf() + .option(FlinkWriteOptions.WRITE_UPSERT_ENABLED.key()) + .flinkConfig(FlinkWriteOptions.WRITE_UPSERT_ENABLED) + .tableProperty(TableProperties.UPSERT_ENABLED) + .defaultValue(TableProperties.UPSERT_ENABLED_DEFAULT) + .parse(); + } + + public FileFormat dataFileFormat() { + String valueAsString = + confParser + .stringConf() + .option(FlinkWriteOptions.WRITE_FORMAT.key()) + .flinkConfig(FlinkWriteOptions.WRITE_FORMAT) + .tableProperty(TableProperties.DEFAULT_FILE_FORMAT) + .defaultValue(TableProperties.DEFAULT_FILE_FORMAT_DEFAULT) + .parse(); + return FileFormat.fromString(valueAsString); + } + + public long targetDataFileSize() { + return confParser + .longConf() + .option(FlinkWriteOptions.TARGET_FILE_SIZE_BYTES.key()) + .flinkConfig(FlinkWriteOptions.TARGET_FILE_SIZE_BYTES) + .tableProperty(TableProperties.WRITE_TARGET_FILE_SIZE_BYTES) + .defaultValue(TableProperties.WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT) + .parse(); + } + + public String parquetCompressionCodec() { + return confParser + .stringConf() + .option(FlinkWriteOptions.COMPRESSION_CODEC.key()) + .flinkConfig(FlinkWriteOptions.COMPRESSION_CODEC) + .tableProperty(TableProperties.PARQUET_COMPRESSION) + .defaultValue(TableProperties.PARQUET_COMPRESSION_DEFAULT) + .parse(); + } + + public String parquetCompressionLevel() { + return confParser + .stringConf() + .option(FlinkWriteOptions.COMPRESSION_LEVEL.key()) + .flinkConfig(FlinkWriteOptions.COMPRESSION_LEVEL) + .tableProperty(TableProperties.PARQUET_COMPRESSION_LEVEL) + .defaultValue(TableProperties.PARQUET_COMPRESSION_LEVEL_DEFAULT) + .parseOptional(); + } + + public String avroCompressionCodec() { + return confParser + .stringConf() + .option(FlinkWriteOptions.COMPRESSION_CODEC.key()) + .flinkConfig(FlinkWriteOptions.COMPRESSION_CODEC) + .tableProperty(TableProperties.AVRO_COMPRESSION) + .defaultValue(TableProperties.AVRO_COMPRESSION_DEFAULT) + .parse(); + } + + public String avroCompressionLevel() { + return confParser + .stringConf() + .option(FlinkWriteOptions.COMPRESSION_LEVEL.key()) + .flinkConfig(FlinkWriteOptions.COMPRESSION_LEVEL) + .tableProperty(TableProperties.AVRO_COMPRESSION_LEVEL) + .defaultValue(TableProperties.AVRO_COMPRESSION_LEVEL_DEFAULT) + .parseOptional(); + } + + public String orcCompressionCodec() { + return confParser + .stringConf() + .option(FlinkWriteOptions.COMPRESSION_CODEC.key()) + .flinkConfig(FlinkWriteOptions.COMPRESSION_CODEC) + .tableProperty(TableProperties.ORC_COMPRESSION) + .defaultValue(TableProperties.ORC_COMPRESSION_DEFAULT) + .parse(); + } + + public String orcCompressionStrategy() { + return confParser + .stringConf() + .option(FlinkWriteOptions.COMPRESSION_STRATEGY.key()) + .flinkConfig(FlinkWriteOptions.COMPRESSION_STRATEGY) + .tableProperty(TableProperties.ORC_COMPRESSION_STRATEGY) + .defaultValue(TableProperties.ORC_COMPRESSION_STRATEGY_DEFAULT) + .parse(); + } + + public DistributionMode distributionMode() { + 
String modeName = + confParser + .stringConf() + .option(FlinkWriteOptions.DISTRIBUTION_MODE.key()) + .flinkConfig(FlinkWriteOptions.DISTRIBUTION_MODE) + .tableProperty(TableProperties.WRITE_DISTRIBUTION_MODE) + .defaultValue(TableProperties.WRITE_DISTRIBUTION_MODE_NONE) + .parse(); + return DistributionMode.fromName(modeName); + } + + public int workerPoolSize() { + return confParser + .intConf() + .flinkConfig(FlinkConfigOptions.TABLE_EXEC_ICEBERG_WORKER_POOL_SIZE) + .defaultValue(FlinkConfigOptions.TABLE_EXEC_ICEBERG_WORKER_POOL_SIZE.defaultValue()) + .parse(); + } + + public String branch() { + return confParser + .stringConf() + .option(FlinkWriteOptions.BRANCH.key()) + .defaultValue(FlinkWriteOptions.BRANCH.defaultValue()) + .parse(); + } + + public Integer writeParallelism() { + return confParser.intConf().option(FlinkWriteOptions.WRITE_PARALLELISM.key()).parseOptional(); + } + + /** + * NOTE: This may be removed or changed in a future release. This value specifies the interval for + * refreshing the table instances in sink writer subtasks. If not specified then the default + * behavior is to not refresh the table. + * + * @return the interval for refreshing the table in sink writer subtasks + */ + @Experimental + public Duration tableRefreshInterval() { + return confParser + .durationConf() + .option(FlinkWriteOptions.TABLE_REFRESH_INTERVAL.key()) + .flinkConfig(FlinkWriteOptions.TABLE_REFRESH_INTERVAL) + .parseOptional(); + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteOptions.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteOptions.java new file mode 100644 index 000000000000..df73f2e09cac --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteOptions.java @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
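Taken together, the parser chains above mean a per-job write option always wins, then the Flink session configuration, then the table property, and finally the hard-coded default. A minimal sketch of that resolution for the file format, not part of the patch; the Table is assumed to be loaded elsewhere:

// Editor's sketch of the precedence described in the FlinkWriteConf javadoc.
import java.util.Map;
import org.apache.flink.configuration.Configuration;
import org.apache.iceberg.FileFormat;
import org.apache.iceberg.Table;
import org.apache.iceberg.flink.FlinkWriteConf;
import org.apache.iceberg.flink.FlinkWriteOptions;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;

public class WriteConfSketch {
  static FileFormat resolveFormat(Table table) {
    // 1. Write options (highest precedence): a per-job override.
    Map<String, String> writeOptions =
        ImmutableMap.of(FlinkWriteOptions.WRITE_FORMAT.key(), "orc");

    // 2. Flink configuration: consulted only when no write option is set.
    Configuration flinkConf = new Configuration();
    flinkConf.set(FlinkWriteOptions.WRITE_FORMAT, "avro");

    // 3. Table property write.format.default, then the built-in default, are the fallbacks.
    //    With the values above this returns FileFormat.ORC.
    return new FlinkWriteConf(table, writeOptions, flinkConf).dataFileFormat();
  }
}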
+ */ +package org.apache.iceberg.flink; + +import java.time.Duration; +import org.apache.flink.annotation.Experimental; +import org.apache.flink.configuration.ConfigOption; +import org.apache.flink.configuration.ConfigOptions; +import org.apache.iceberg.SnapshotRef; + +/** Flink sink write options */ +public class FlinkWriteOptions { + + private FlinkWriteOptions() {} + + // File format for write operations(default: Table write.format.default ) + public static final ConfigOption WRITE_FORMAT = + ConfigOptions.key("write-format").stringType().noDefaultValue(); + + // Overrides this table's write.target-file-size-bytes + public static final ConfigOption TARGET_FILE_SIZE_BYTES = + ConfigOptions.key("target-file-size-bytes").longType().noDefaultValue(); + + // Overrides this table's write..compression-codec + public static final ConfigOption COMPRESSION_CODEC = + ConfigOptions.key("compression-codec").stringType().noDefaultValue(); + + // Overrides this table's write..compression-level + public static final ConfigOption COMPRESSION_LEVEL = + ConfigOptions.key("compression-level").stringType().noDefaultValue(); + + // Overrides this table's write..compression-strategy + public static final ConfigOption COMPRESSION_STRATEGY = + ConfigOptions.key("compression-strategy").stringType().noDefaultValue(); + + // Overrides this table's write.upsert.enabled + public static final ConfigOption WRITE_UPSERT_ENABLED = + ConfigOptions.key("upsert-enabled").booleanType().noDefaultValue(); + + public static final ConfigOption OVERWRITE_MODE = + ConfigOptions.key("overwrite-enabled").booleanType().defaultValue(false); + + // Overrides the table's write.distribution-mode + public static final ConfigOption DISTRIBUTION_MODE = + ConfigOptions.key("distribution-mode").stringType().noDefaultValue(); + + // Branch to write to + public static final ConfigOption BRANCH = + ConfigOptions.key("branch").stringType().defaultValue(SnapshotRef.MAIN_BRANCH); + + public static final ConfigOption WRITE_PARALLELISM = + ConfigOptions.key("write-parallelism").intType().noDefaultValue(); + + @Experimental + public static final ConfigOption TABLE_REFRESH_INTERVAL = + ConfigOptions.key("table-refresh-interval").durationType().noDefaultValue(); +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/IcebergTableSink.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/IcebergTableSink.java new file mode 100644 index 000000000000..1b9268569d9a --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/IcebergTableSink.java @@ -0,0 +1,121 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
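These keys are meant to be supplied per statement rather than per table. One way to do that is a SQL dynamic-options hint; the catalog name, table names and the hint mechanism below are illustrative assumptions, not part of this patch:

// Editor's sketch: passing the write options above through a SQL hint.
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.TableEnvironment;

public class WriteOptionsSketch {
  public static void main(String[] args) {
    TableEnvironment tEnv = TableEnvironment.create(EnvironmentSettings.inStreamingMode());
    // Dynamic table options must be enabled for OPTIONS hints to be honored.
    tEnv.getConfig().getConfiguration().setString("table.dynamic-table-options.enabled", "true");

    // 'upsert-enabled' and 'write-parallelism' map to WRITE_UPSERT_ENABLED and
    // WRITE_PARALLELISM declared above.
    tEnv.executeSql(
        "INSERT INTO iceberg_catalog.db.tbl /*+ OPTIONS('upsert-enabled'='true', "
            + "'write-parallelism'='4') */ SELECT id, data FROM source_tbl");
  }
}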
+ */ +package org.apache.iceberg.flink; + +import java.util.List; +import java.util.Map; +import org.apache.flink.configuration.ReadableConfig; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.datastream.DataStreamSink; +import org.apache.flink.table.api.TableSchema; +import org.apache.flink.table.api.constraints.UniqueConstraint; +import org.apache.flink.table.connector.ChangelogMode; +import org.apache.flink.table.connector.ProviderContext; +import org.apache.flink.table.connector.sink.DataStreamSinkProvider; +import org.apache.flink.table.connector.sink.DynamicTableSink; +import org.apache.flink.table.connector.sink.abilities.SupportsOverwrite; +import org.apache.flink.table.connector.sink.abilities.SupportsPartitioning; +import org.apache.flink.table.data.RowData; +import org.apache.flink.types.RowKind; +import org.apache.flink.util.Preconditions; +import org.apache.iceberg.flink.sink.FlinkSink; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; + +public class IcebergTableSink implements DynamicTableSink, SupportsPartitioning, SupportsOverwrite { + private final TableLoader tableLoader; + private final TableSchema tableSchema; + private final ReadableConfig readableConfig; + private final Map writeProps; + + private boolean overwrite = false; + + private IcebergTableSink(IcebergTableSink toCopy) { + this.tableLoader = toCopy.tableLoader; + this.tableSchema = toCopy.tableSchema; + this.overwrite = toCopy.overwrite; + this.readableConfig = toCopy.readableConfig; + this.writeProps = toCopy.writeProps; + } + + public IcebergTableSink( + TableLoader tableLoader, + TableSchema tableSchema, + ReadableConfig readableConfig, + Map writeProps) { + this.tableLoader = tableLoader; + this.tableSchema = tableSchema; + this.readableConfig = readableConfig; + this.writeProps = writeProps; + } + + @Override + public SinkRuntimeProvider getSinkRuntimeProvider(Context context) { + Preconditions.checkState( + !overwrite || context.isBounded(), + "Unbounded data stream doesn't support overwrite operation."); + + List equalityColumns = + tableSchema.getPrimaryKey().map(UniqueConstraint::getColumns).orElseGet(ImmutableList::of); + + return new DataStreamSinkProvider() { + @Override + public DataStreamSink consumeDataStream( + ProviderContext providerContext, DataStream dataStream) { + return FlinkSink.forRowData(dataStream) + .tableLoader(tableLoader) + .tableSchema(tableSchema) + .equalityFieldColumns(equalityColumns) + .overwrite(overwrite) + .setAll(writeProps) + .flinkConf(readableConfig) + .append(); + } + }; + } + + @Override + public void applyStaticPartition(Map partition) { + // The flink's PartitionFanoutWriter will handle the static partition write policy + // automatically. 
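The runtime provider above is a thin wrapper over the FlinkSink builder, so the same wiring can be used directly from the DataStream API. A sketch, not part of the patch, with a placeholder table location and an externally provided stream:

// Editor's sketch of using the FlinkSink builder directly, mirroring the provider above.
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.data.RowData;
import org.apache.iceberg.flink.TableLoader;
import org.apache.iceberg.flink.sink.FlinkSink;

public class DataStreamSinkSketch {
  static void append(StreamExecutionEnvironment env, DataStream<RowData> rows) throws Exception {
    // The warehouse path is a placeholder.
    TableLoader tableLoader = TableLoader.fromHadoopTable("hdfs://nn:8020/warehouse/db/tbl");

    FlinkSink.forRowData(rows)
        .tableLoader(tableLoader)
        .overwrite(false)
        .append();

    env.execute("iceberg-append-sketch");
  }
}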
+ } + + @Override + public ChangelogMode getChangelogMode(ChangelogMode requestedMode) { + ChangelogMode.Builder builder = ChangelogMode.newBuilder(); + for (RowKind kind : requestedMode.getContainedKinds()) { + builder.addContainedKind(kind); + } + return builder.build(); + } + + @Override + public DynamicTableSink copy() { + return new IcebergTableSink(this); + } + + @Override + public String asSummaryString() { + return "Iceberg table sink"; + } + + @Override + public void applyOverwrite(boolean newOverwrite) { + this.overwrite = newOverwrite; + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/RowDataWrapper.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/RowDataWrapper.java new file mode 100644 index 000000000000..d4cec7a3e80b --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/RowDataWrapper.java @@ -0,0 +1,142 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink; + +import java.lang.reflect.Array; +import java.nio.ByteBuffer; +import java.time.LocalDateTime; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.TimestampData; +import org.apache.flink.table.types.logical.DecimalType; +import org.apache.flink.table.types.logical.LocalZonedTimestampType; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.table.types.logical.TimestampType; +import org.apache.iceberg.StructLike; +import org.apache.iceberg.types.Type; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.util.DateTimeUtil; +import org.apache.iceberg.util.UUIDUtil; + +public class RowDataWrapper implements StructLike { + + private final LogicalType[] types; + private final PositionalGetter[] getters; + private RowData rowData = null; + + public RowDataWrapper(RowType rowType, Types.StructType struct) { + int size = rowType.getFieldCount(); + + types = (LogicalType[]) Array.newInstance(LogicalType.class, size); + getters = (PositionalGetter[]) Array.newInstance(PositionalGetter.class, size); + + for (int i = 0; i < size; i++) { + types[i] = rowType.getTypeAt(i); + getters[i] = buildGetter(types[i], struct.fields().get(i).type()); + } + } + + public RowDataWrapper wrap(RowData data) { + this.rowData = data; + return this; + } + + @Override + public int size() { + return types.length; + } + + @Override + public T get(int pos, Class javaClass) { + if (rowData.isNullAt(pos)) { + return null; + } else if (getters[pos] != null) { + return javaClass.cast(getters[pos].get(rowData, pos)); + } + + Object value = RowData.createFieldGetter(types[pos], pos).getFieldOrNull(rowData); + return javaClass.cast(value); + } + + @Override + public void set(int pos, T 
value) { + throw new UnsupportedOperationException( + "Could not set a field in the RowDataWrapper because rowData is read-only"); + } + + private interface PositionalGetter { + T get(RowData data, int pos); + } + + private static PositionalGetter buildGetter(LogicalType logicalType, Type type) { + switch (logicalType.getTypeRoot()) { + case TINYINT: + return (row, pos) -> (int) row.getByte(pos); + case SMALLINT: + return (row, pos) -> (int) row.getShort(pos); + case CHAR: + case VARCHAR: + return (row, pos) -> row.getString(pos).toString(); + + case BINARY: + case VARBINARY: + if (Type.TypeID.UUID == type.typeId()) { + return (row, pos) -> UUIDUtil.convert(row.getBinary(pos)); + } else { + return (row, pos) -> ByteBuffer.wrap(row.getBinary(pos)); + } + + case DECIMAL: + DecimalType decimalType = (DecimalType) logicalType; + return (row, pos) -> + row.getDecimal(pos, decimalType.getPrecision(), decimalType.getScale()).toBigDecimal(); + + case TIME_WITHOUT_TIME_ZONE: + // Time in RowData is in milliseconds (Integer), while iceberg's time is microseconds + // (Long). + return (row, pos) -> ((long) row.getInt(pos)) * 1_000; + + case TIMESTAMP_WITHOUT_TIME_ZONE: + TimestampType timestampType = (TimestampType) logicalType; + return (row, pos) -> { + LocalDateTime localDateTime = + row.getTimestamp(pos, timestampType.getPrecision()).toLocalDateTime(); + return DateTimeUtil.microsFromTimestamp(localDateTime); + }; + + case TIMESTAMP_WITH_LOCAL_TIME_ZONE: + LocalZonedTimestampType lzTs = (LocalZonedTimestampType) logicalType; + return (row, pos) -> { + TimestampData timestampData = row.getTimestamp(pos, lzTs.getPrecision()); + return timestampData.getMillisecond() * 1000 + + timestampData.getNanoOfMillisecond() / 1000; + }; + + case ROW: + RowType rowType = (RowType) logicalType; + Types.StructType structType = (Types.StructType) type; + + RowDataWrapper nestedWrapper = new RowDataWrapper(rowType, structType); + return (row, pos) -> nestedWrapper.wrap(row.getRow(pos, rowType.getFieldCount())); + + default: + return null; + } + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/TableLoader.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/TableLoader.java new file mode 100644 index 000000000000..da509451fee7 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/TableLoader.java @@ -0,0 +1,159 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
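RowDataWrapper is the bridge that lets partitioning and equality logic treat a Flink RowData as an Iceberg StructLike, converting Flink's internal representations (StringData, millisecond time, TimestampData) to Iceberg's expected Java types on access. A small self-contained sketch, not part of the patch:

// Editor's sketch: reading a Flink row through Iceberg's StructLike view.
import org.apache.flink.table.data.GenericRowData;
import org.apache.flink.table.data.StringData;
import org.apache.flink.table.types.logical.BigIntType;
import org.apache.flink.table.types.logical.RowType;
import org.apache.flink.table.types.logical.VarCharType;
import org.apache.iceberg.Schema;
import org.apache.iceberg.flink.RowDataWrapper;
import org.apache.iceberg.types.Types;

public class RowDataWrapperSketch {
  public static void main(String[] args) {
    Schema schema =
        new Schema(
            Types.NestedField.required(1, "id", Types.LongType.get()),
            Types.NestedField.optional(2, "data", Types.StringType.get()));
    RowType rowType =
        RowType.of(new BigIntType(false), new VarCharType(VarCharType.MAX_LENGTH));

    RowDataWrapper wrapper = new RowDataWrapper(rowType, schema.asStruct());
    GenericRowData row = GenericRowData.of(42L, StringData.fromString("a"));

    // get() converts Flink internals to Iceberg's Java types on access; for example a
    // VARCHAR field comes back as a java.lang.String rather than StringData.
    Long id = wrapper.wrap(row).get(0, Long.class);
    System.out.println(id);
  }
}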
+ */ +package org.apache.iceberg.flink; + +import java.io.Closeable; +import java.io.IOException; +import java.io.Serializable; +import org.apache.hadoop.conf.Configuration; +import org.apache.iceberg.Table; +import org.apache.iceberg.catalog.Catalog; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.hadoop.HadoopTables; +import org.apache.iceberg.hadoop.SerializableConfiguration; +import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; + +/** + * Serializable loader to load an Iceberg {@link Table}. Flink needs to get {@link Table} objects in + * the cluster (for example, to get splits), not just on the client side. So we need an Iceberg + * table loader to get the {@link Table} object. + */ +public interface TableLoader extends Closeable, Serializable, Cloneable { + + void open(); + + boolean isOpen(); + + Table loadTable(); + + /** Clone a TableLoader */ + @SuppressWarnings({"checkstyle:NoClone", "checkstyle:SuperClone"}) + TableLoader clone(); + + static TableLoader fromCatalog(CatalogLoader catalogLoader, TableIdentifier identifier) { + return new CatalogTableLoader(catalogLoader, identifier); + } + + static TableLoader fromHadoopTable(String location) { + return fromHadoopTable(location, FlinkCatalogFactory.clusterHadoopConf()); + } + + static TableLoader fromHadoopTable(String location, Configuration hadoopConf) { + return new HadoopTableLoader(location, hadoopConf); + } + + class HadoopTableLoader implements TableLoader { + + private static final long serialVersionUID = 1L; + + private final String location; + private final SerializableConfiguration hadoopConf; + + private transient HadoopTables tables; + + private HadoopTableLoader(String location, Configuration conf) { + this.location = location; + this.hadoopConf = new SerializableConfiguration(conf); + } + + @Override + public void open() { + tables = new HadoopTables(hadoopConf.get()); + } + + @Override + public boolean isOpen() { + return tables != null; + } + + @Override + public Table loadTable() { + FlinkEnvironmentContext.init(); + return tables.load(location); + } + + @Override + @SuppressWarnings({"checkstyle:NoClone", "checkstyle:SuperClone"}) + public TableLoader clone() { + return new HadoopTableLoader(location, new Configuration(hadoopConf.get())); + } + + @Override + public void close() {} + + @Override + public String toString() { + return MoreObjects.toStringHelper(this).add("location", location).toString(); + } + } + + class CatalogTableLoader implements TableLoader { + + private static final long serialVersionUID = 1L; + + private final CatalogLoader catalogLoader; + private final String identifier; + + private transient Catalog catalog; + + private CatalogTableLoader(CatalogLoader catalogLoader, TableIdentifier tableIdentifier) { + this.catalogLoader = catalogLoader; + this.identifier = tableIdentifier.toString(); + } + + @Override + public void open() { + catalog = catalogLoader.loadCatalog(); + } + + @Override + public boolean isOpen() { + return catalog != null; + } + + @Override + public Table loadTable() { + FlinkEnvironmentContext.init(); + return catalog.loadTable(TableIdentifier.parse(identifier)); + } + + @Override + public void close() throws IOException { + if (catalog instanceof Closeable) { + ((Closeable) catalog).close(); + } + + catalog = null; + } + + @Override + @SuppressWarnings({"checkstyle:NoClone", "checkstyle:SuperClone"}) + public TableLoader clone() { + return new CatalogTableLoader(catalogLoader.clone(), TableIdentifier.parse(identifier)); + 
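A loader instance is serialized into the job graph and opened inside each task before loadTable() is called. A minimal sketch, not part of the patch, against a Hadoop-table location (the path is a placeholder):

// Editor's sketch: open a TableLoader, then load the table it points at.
import org.apache.iceberg.Table;
import org.apache.iceberg.flink.TableLoader;

public class TableLoaderSketch {
  public static void main(String[] args) throws Exception {
    TableLoader loader = TableLoader.fromHadoopTable("file:///tmp/warehouse/db/tbl");
    try (TableLoader opened = loader) {
      opened.open();                 // creates the HadoopTables client lazily
      Table table = opened.loadTable();
      System.out.println(table.schema());
    }
  }
}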
} + + @Override + public String toString() { + return MoreObjects.toStringHelper(this) + .add("tableIdentifier", identifier) + .add("catalogLoader", catalogLoader) + .toString(); + } + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/TypeToFlinkType.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/TypeToFlinkType.java new file mode 100644 index 000000000000..f8f1b74b1ceb --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/TypeToFlinkType.java @@ -0,0 +1,134 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink; + +import java.util.List; +import org.apache.flink.table.types.logical.ArrayType; +import org.apache.flink.table.types.logical.BigIntType; +import org.apache.flink.table.types.logical.BinaryType; +import org.apache.flink.table.types.logical.BooleanType; +import org.apache.flink.table.types.logical.DateType; +import org.apache.flink.table.types.logical.DecimalType; +import org.apache.flink.table.types.logical.DoubleType; +import org.apache.flink.table.types.logical.FloatType; +import org.apache.flink.table.types.logical.IntType; +import org.apache.flink.table.types.logical.LocalZonedTimestampType; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.MapType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.table.types.logical.TimeType; +import org.apache.flink.table.types.logical.TimestampType; +import org.apache.flink.table.types.logical.VarBinaryType; +import org.apache.flink.table.types.logical.VarCharType; +import org.apache.iceberg.Schema; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.types.Type; +import org.apache.iceberg.types.TypeUtil; +import org.apache.iceberg.types.Types; + +class TypeToFlinkType extends TypeUtil.SchemaVisitor { + TypeToFlinkType() {} + + @Override + public LogicalType schema(Schema schema, LogicalType structType) { + return structType; + } + + @Override + public LogicalType struct(Types.StructType struct, List fieldResults) { + List fields = struct.fields(); + + List flinkFields = Lists.newArrayListWithExpectedSize(fieldResults.size()); + for (int i = 0; i < fields.size(); i += 1) { + Types.NestedField field = fields.get(i); + LogicalType type = fieldResults.get(i); + RowType.RowField flinkField = + new RowType.RowField(field.name(), type.copy(field.isOptional()), field.doc()); + flinkFields.add(flinkField); + } + + return new RowType(flinkFields); + } + + @Override + public LogicalType field(Types.NestedField field, LogicalType fieldResult) { + return fieldResult; + } + + @Override + public LogicalType list(Types.ListType list, LogicalType elementResult) { + return new 
ArrayType(elementResult.copy(list.isElementOptional())); + } + + @Override + public LogicalType map(Types.MapType map, LogicalType keyResult, LogicalType valueResult) { + // keys in map are not allowed to be null. + return new MapType(keyResult.copy(false), valueResult.copy(map.isValueOptional())); + } + + @Override + public LogicalType primitive(Type.PrimitiveType primitive) { + switch (primitive.typeId()) { + case BOOLEAN: + return new BooleanType(); + case INTEGER: + return new IntType(); + case LONG: + return new BigIntType(); + case FLOAT: + return new FloatType(); + case DOUBLE: + return new DoubleType(); + case DATE: + return new DateType(); + case TIME: + // For the type: Flink only support TimeType with default precision (second) now. The + // precision of time is + // not supported in Flink, so we can think of it as a simple time type directly. + // For the data: Flink uses int that support mills to represent time data, so it supports + // mills precision. + return new TimeType(); + case TIMESTAMP: + Types.TimestampType timestamp = (Types.TimestampType) primitive; + if (timestamp.shouldAdjustToUTC()) { + // MICROS + return new LocalZonedTimestampType(6); + } else { + // MICROS + return new TimestampType(6); + } + case STRING: + return new VarCharType(VarCharType.MAX_LENGTH); + case UUID: + // UUID length is 16 + return new BinaryType(16); + case FIXED: + Types.FixedType fixedType = (Types.FixedType) primitive; + return new BinaryType(fixedType.length()); + case BINARY: + return new VarBinaryType(VarBinaryType.MAX_LENGTH); + case DECIMAL: + Types.DecimalType decimal = (Types.DecimalType) primitive; + return new DecimalType(decimal.precision(), decimal.scale()); + default: + throw new UnsupportedOperationException( + "Cannot convert unknown type to Flink: " + primitive); + } + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/actions/Actions.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/actions/Actions.java new file mode 100644 index 000000000000..b96b47c5a785 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/actions/Actions.java @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.actions; + +import org.apache.flink.configuration.Configuration; +import org.apache.flink.configuration.CoreOptions; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.iceberg.Table; + +public class Actions { + + public static final Configuration CONFIG = + new Configuration() + // disable classloader check as Avro may cache class/object in the serializers. 
+ .set(CoreOptions.CHECK_LEAKED_CLASSLOADER, false); + + private final StreamExecutionEnvironment env; + private final Table table; + + private Actions(StreamExecutionEnvironment env, Table table) { + this.env = env; + this.table = table; + } + + public static Actions forTable(StreamExecutionEnvironment env, Table table) { + return new Actions(env, table); + } + + public static Actions forTable(Table table) { + return new Actions(StreamExecutionEnvironment.getExecutionEnvironment(CONFIG), table); + } + + public RewriteDataFilesAction rewriteDataFiles() { + return new RewriteDataFilesAction(env, table); + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/actions/RewriteDataFilesAction.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/actions/RewriteDataFilesAction.java new file mode 100644 index 000000000000..670abebcb58a --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/actions/RewriteDataFilesAction.java @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
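The CONFIG constant above exists because Avro can cache classes in its serializers, which trips Flink's leaked-classloader check once a job finishes. The same setting can be applied when the caller owns the execution environment; a sketch, not part of the patch, assuming the table is loaded elsewhere:

// Editor's sketch: supplying an explicitly configured environment to Actions.
import org.apache.flink.configuration.Configuration;
import org.apache.flink.configuration.CoreOptions;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.iceberg.Table;
import org.apache.iceberg.flink.actions.Actions;

public class ActionsEnvSketch {
  static Actions actionsFor(Table table) {
    Configuration conf = new Configuration();
    // Mirror Actions.CONFIG: disable the leaked-classloader check for Avro's caches.
    conf.set(CoreOptions.CHECK_LEAKED_CLASSLOADER, false);

    StreamExecutionEnvironment env =
        StreamExecutionEnvironment.getExecutionEnvironment(conf);
    return Actions.forTable(env, table);
  }
}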
+ */ +package org.apache.iceberg.flink.actions; + +import java.util.List; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.iceberg.CombinedScanTask; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.Table; +import org.apache.iceberg.actions.BaseRewriteDataFilesAction; +import org.apache.iceberg.flink.source.RowDataRewriter; +import org.apache.iceberg.io.FileIO; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; + +public class RewriteDataFilesAction extends BaseRewriteDataFilesAction { + + private final StreamExecutionEnvironment env; + private int maxParallelism; + + public RewriteDataFilesAction(StreamExecutionEnvironment env, Table table) { + super(table); + this.env = env; + this.maxParallelism = env.getParallelism(); + } + + @Override + protected FileIO fileIO() { + return table().io(); + } + + @Override + protected List rewriteDataForTasks(List combinedScanTasks) { + int size = combinedScanTasks.size(); + int parallelism = Math.min(size, maxParallelism); + DataStream dataStream = env.fromCollection(combinedScanTasks); + RowDataRewriter rowDataRewriter = + new RowDataRewriter(table(), caseSensitive(), fileIO(), encryptionManager()); + try { + return rowDataRewriter.rewriteDataForTasks(dataStream, parallelism); + } catch (Exception e) { + throw new RuntimeException("Rewrite data file error.", e); + } + } + + @Override + protected RewriteDataFilesAction self() { + return this; + } + + public RewriteDataFilesAction maxParallelism(int parallelism) { + Preconditions.checkArgument(parallelism > 0, "Invalid max parallelism %s", parallelism); + this.maxParallelism = parallelism; + return this; + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/AvroWithFlinkSchemaVisitor.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/AvroWithFlinkSchemaVisitor.java new file mode 100644 index 000000000000..8103224a0b6c --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/AvroWithFlinkSchemaVisitor.java @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
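Putting the action to work is a short builder chain; execute() and the result type come from the BaseRewriteDataFilesAction parent in iceberg-core, which this sketch (not part of the patch) assumes is on the classpath:

// Editor's sketch: compacting data files with the action above.
import org.apache.iceberg.Table;
import org.apache.iceberg.actions.RewriteDataFilesActionResult;
import org.apache.iceberg.flink.actions.Actions;

public class RewriteSketch {
  static void compact(Table table) {
    RewriteDataFilesActionResult result =
        Actions.forTable(table)
            .rewriteDataFiles()
            .maxParallelism(4)   // cap the rewrite job's parallelism at 4 subtasks
            .execute();
    System.out.println(
        "Added " + result.addedDataFiles().size()
            + " files, deleted " + result.deletedDataFiles().size());
  }
}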
+ */ +package org.apache.iceberg.flink.data; + +import org.apache.flink.table.types.logical.ArrayType; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.LogicalTypeFamily; +import org.apache.flink.table.types.logical.MapType; +import org.apache.flink.table.types.logical.NullType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.avro.AvroWithPartnerByStructureVisitor; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.util.Pair; + +public abstract class AvroWithFlinkSchemaVisitor + extends AvroWithPartnerByStructureVisitor { + + @Override + protected boolean isStringType(LogicalType logicalType) { + return logicalType.getTypeRoot().getFamilies().contains(LogicalTypeFamily.CHARACTER_STRING); + } + + @Override + protected boolean isMapType(LogicalType logicalType) { + return logicalType instanceof MapType; + } + + @Override + protected LogicalType arrayElementType(LogicalType arrayType) { + Preconditions.checkArgument( + arrayType instanceof ArrayType, "Invalid array: %s is not an array", arrayType); + return ((ArrayType) arrayType).getElementType(); + } + + @Override + protected LogicalType mapKeyType(LogicalType mapType) { + Preconditions.checkArgument(isMapType(mapType), "Invalid map: %s is not a map", mapType); + return ((MapType) mapType).getKeyType(); + } + + @Override + protected LogicalType mapValueType(LogicalType mapType) { + Preconditions.checkArgument(isMapType(mapType), "Invalid map: %s is not a map", mapType); + return ((MapType) mapType).getValueType(); + } + + @Override + protected Pair fieldNameAndType(LogicalType structType, int pos) { + Preconditions.checkArgument( + structType instanceof RowType, "Invalid struct: %s is not a struct", structType); + RowType.RowField field = ((RowType) structType).getFields().get(pos); + return Pair.of(field.getName(), field.getType()); + } + + @Override + protected LogicalType nullType() { + return new NullType(); + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkAvroReader.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkAvroReader.java new file mode 100644 index 000000000000..86404959735a --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkAvroReader.java @@ -0,0 +1,169 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.data; + +import java.io.IOException; +import java.util.List; +import java.util.Map; +import java.util.function.Supplier; +import org.apache.avro.LogicalType; +import org.apache.avro.LogicalTypes; +import org.apache.avro.Schema; +import org.apache.avro.io.DatumReader; +import org.apache.avro.io.Decoder; +import org.apache.flink.table.data.RowData; +import org.apache.iceberg.avro.AvroSchemaWithTypeVisitor; +import org.apache.iceberg.avro.SupportsRowPosition; +import org.apache.iceberg.avro.ValueReader; +import org.apache.iceberg.avro.ValueReaders; +import org.apache.iceberg.data.avro.DecoderResolver; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.types.Type; +import org.apache.iceberg.types.Types; + +public class FlinkAvroReader implements DatumReader, SupportsRowPosition { + + private final Schema readSchema; + private final ValueReader reader; + private Schema fileSchema = null; + + public FlinkAvroReader(org.apache.iceberg.Schema expectedSchema, Schema readSchema) { + this(expectedSchema, readSchema, ImmutableMap.of()); + } + + @SuppressWarnings("unchecked") + public FlinkAvroReader( + org.apache.iceberg.Schema expectedSchema, Schema readSchema, Map constants) { + this.readSchema = readSchema; + this.reader = + (ValueReader) + AvroSchemaWithTypeVisitor.visit(expectedSchema, readSchema, new ReadBuilder(constants)); + } + + @Override + public void setSchema(Schema newFileSchema) { + this.fileSchema = Schema.applyAliases(newFileSchema, readSchema); + } + + @Override + public RowData read(RowData reuse, Decoder decoder) throws IOException { + return DecoderResolver.resolveAndRead(decoder, readSchema, fileSchema, reader, reuse); + } + + @Override + public void setRowPositionSupplier(Supplier posSupplier) { + if (reader instanceof SupportsRowPosition) { + ((SupportsRowPosition) reader).setRowPositionSupplier(posSupplier); + } + } + + private static class ReadBuilder extends AvroSchemaWithTypeVisitor> { + private final Map idToConstant; + + private ReadBuilder(Map idToConstant) { + this.idToConstant = idToConstant; + } + + @Override + public ValueReader record( + Types.StructType expected, Schema record, List names, List> fields) { + return FlinkValueReaders.struct(fields, expected.asStructType(), idToConstant); + } + + @Override + public ValueReader union(Type expected, Schema union, List> options) { + return ValueReaders.union(options); + } + + @Override + public ValueReader array( + Types.ListType expected, Schema array, ValueReader elementReader) { + return FlinkValueReaders.array(elementReader); + } + + @Override + public ValueReader map( + Types.MapType expected, Schema map, ValueReader keyReader, ValueReader valueReader) { + return FlinkValueReaders.arrayMap(keyReader, valueReader); + } + + @Override + public ValueReader map(Types.MapType expected, Schema map, ValueReader valueReader) { + return FlinkValueReaders.map(FlinkValueReaders.strings(), valueReader); + } + + @Override + public ValueReader primitive(Type.PrimitiveType expected, Schema primitive) { + LogicalType logicalType = primitive.getLogicalType(); + if (logicalType != null) { + switch (logicalType.getName()) { + case "date": + return ValueReaders.ints(); + + case "time-micros": + return FlinkValueReaders.timeMicros(); + + case "timestamp-millis": + return FlinkValueReaders.timestampMills(); + + case "timestamp-micros": + return FlinkValueReaders.timestampMicros(); + + case "decimal": + LogicalTypes.Decimal decimal = 
(LogicalTypes.Decimal) logicalType; + return FlinkValueReaders.decimal( + ValueReaders.decimalBytesReader(primitive), + decimal.getPrecision(), + decimal.getScale()); + + case "uuid": + return FlinkValueReaders.uuids(); + + default: + throw new IllegalArgumentException("Unknown logical type: " + logicalType); + } + } + + switch (primitive.getType()) { + case NULL: + return ValueReaders.nulls(); + case BOOLEAN: + return ValueReaders.booleans(); + case INT: + return ValueReaders.ints(); + case LONG: + return ValueReaders.longs(); + case FLOAT: + return ValueReaders.floats(); + case DOUBLE: + return ValueReaders.doubles(); + case STRING: + return FlinkValueReaders.strings(); + case FIXED: + return ValueReaders.fixed(primitive.getFixedSize()); + case BYTES: + return ValueReaders.bytes(); + case ENUM: + return FlinkValueReaders.enums(primitive.getEnumSymbols()); + default: + throw new IllegalArgumentException("Unsupported type: " + primitive); + } + } + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkAvroWriter.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkAvroWriter.java new file mode 100644 index 000000000000..873e65783119 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkAvroWriter.java @@ -0,0 +1,165 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
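FlinkAvroReader is normally handed to Iceberg's generic Avro read builder, which resolves the file schema against the projection and yields RowData. The builder calls below are assumed from iceberg-core and the input file is a placeholder; this sketch is not part of the patch:

// Editor's sketch: wiring FlinkAvroReader into Iceberg's Avro reader.
import org.apache.flink.table.data.RowData;
import org.apache.iceberg.Schema;
import org.apache.iceberg.avro.Avro;
import org.apache.iceberg.flink.data.FlinkAvroReader;
import org.apache.iceberg.io.CloseableIterable;
import org.apache.iceberg.io.InputFile;

public class AvroReadSketch {
  static CloseableIterable<RowData> read(InputFile file, Schema projection) {
    return Avro.read(file)
        .project(projection)
        // The reader pairs the expected Iceberg schema with the file's Avro read schema.
        .createReaderFunc(readSchema -> new FlinkAvroReader(projection, readSchema))
        .build();
  }
}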
+ */ +package org.apache.iceberg.flink.data; + +import java.io.IOException; +import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.IntStream; +import java.util.stream.Stream; +import org.apache.avro.LogicalTypes; +import org.apache.avro.Schema; +import org.apache.avro.io.Encoder; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.FieldMetrics; +import org.apache.iceberg.avro.MetricsAwareDatumWriter; +import org.apache.iceberg.avro.ValueWriter; +import org.apache.iceberg.avro.ValueWriters; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; + +public class FlinkAvroWriter implements MetricsAwareDatumWriter { + private final RowType rowType; + private ValueWriter writer = null; + + public FlinkAvroWriter(RowType rowType) { + this.rowType = rowType; + } + + @Override + @SuppressWarnings("unchecked") + public void setSchema(Schema schema) { + this.writer = + (ValueWriter) + AvroWithFlinkSchemaVisitor.visit(rowType, schema, new WriteBuilder()); + } + + @Override + public void write(RowData datum, Encoder out) throws IOException { + writer.write(datum, out); + } + + @Override + public Stream metrics() { + return writer.metrics(); + } + + private static class WriteBuilder extends AvroWithFlinkSchemaVisitor> { + @Override + public ValueWriter record( + LogicalType struct, Schema record, List names, List> fields) { + return FlinkValueWriters.row( + fields, + IntStream.range(0, names.size()) + .mapToObj(i -> fieldNameAndType(struct, i).second()) + .collect(Collectors.toList())); + } + + @Override + public ValueWriter union(LogicalType type, Schema union, List> options) { + Preconditions.checkArgument( + options.contains(ValueWriters.nulls()), + "Cannot create writer for non-option union: %s", + union); + Preconditions.checkArgument( + options.size() == 2, "Cannot create writer for non-option union: %s", union); + if (union.getTypes().get(0).getType() == Schema.Type.NULL) { + return ValueWriters.option(0, options.get(1)); + } else { + return ValueWriters.option(1, options.get(0)); + } + } + + @Override + public ValueWriter array(LogicalType sArray, Schema array, ValueWriter elementWriter) { + return FlinkValueWriters.array(elementWriter, arrayElementType(sArray)); + } + + @Override + public ValueWriter map(LogicalType sMap, Schema map, ValueWriter valueReader) { + return FlinkValueWriters.map( + FlinkValueWriters.strings(), mapKeyType(sMap), valueReader, mapValueType(sMap)); + } + + @Override + public ValueWriter map( + LogicalType sMap, Schema map, ValueWriter keyWriter, ValueWriter valueWriter) { + return FlinkValueWriters.arrayMap( + keyWriter, mapKeyType(sMap), valueWriter, mapValueType(sMap)); + } + + @Override + public ValueWriter primitive(LogicalType type, Schema primitive) { + org.apache.avro.LogicalType logicalType = primitive.getLogicalType(); + if (logicalType != null) { + switch (logicalType.getName()) { + case "date": + return ValueWriters.ints(); + + case "time-micros": + return FlinkValueWriters.timeMicros(); + + case "timestamp-micros": + return FlinkValueWriters.timestampMicros(); + + case "decimal": + LogicalTypes.Decimal decimal = (LogicalTypes.Decimal) logicalType; + return FlinkValueWriters.decimal(decimal.getPrecision(), decimal.getScale()); + + case "uuid": + return ValueWriters.uuids(); + + default: + throw new IllegalArgumentException("Unsupported logical type: " + logicalType); + } + } + + 
switch (primitive.getType()) { + case NULL: + return ValueWriters.nulls(); + case BOOLEAN: + return ValueWriters.booleans(); + case INT: + switch (type.getTypeRoot()) { + case TINYINT: + return ValueWriters.tinyints(); + case SMALLINT: + return ValueWriters.shorts(); + default: + return ValueWriters.ints(); + } + case LONG: + return ValueWriters.longs(); + case FLOAT: + return ValueWriters.floats(); + case DOUBLE: + return ValueWriters.doubles(); + case STRING: + return FlinkValueWriters.strings(); + case FIXED: + return ValueWriters.fixed(primitive.getFixedSize()); + case BYTES: + return ValueWriters.bytes(); + default: + throw new IllegalArgumentException("Unsupported type: " + primitive); + } + } + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java new file mode 100644 index 000000000000..65b9d44ad4b8 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java @@ -0,0 +1,131 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
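FlinkAvroWriter is the mirror image on the write path: it serializes RowData according to the Flink RowType that corresponds to the Avro schema picked by the file writer. A sketch of handing it to Iceberg's Avro write builder, with the builder calls treated as assumptions from iceberg-core and placeholder inputs; not part of the patch:

// Editor's sketch: building a RowData file appender backed by FlinkAvroWriter.
import java.io.IOException;
import org.apache.flink.table.data.RowData;
import org.apache.flink.table.types.logical.RowType;
import org.apache.iceberg.Schema;
import org.apache.iceberg.avro.Avro;
import org.apache.iceberg.flink.data.FlinkAvroWriter;
import org.apache.iceberg.io.FileAppender;
import org.apache.iceberg.io.OutputFile;

public class AvroWriteSketch {
  static FileAppender<RowData> appender(OutputFile out, Schema icebergSchema, RowType flinkType)
      throws IOException {
    return Avro.write(out)
        .schema(icebergSchema)
        // The writer only needs the Flink row type; the Avro schema argument is unused here.
        .createWriterFunc(avroSchema -> new FlinkAvroWriter(flinkType))
        .overwrite()
        .build();
  }
}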
+ */ +package org.apache.iceberg.flink.data; + +import java.util.List; +import java.util.Map; +import org.apache.flink.table.data.ArrayData; +import org.apache.flink.table.data.MapData; +import org.apache.flink.table.data.RowData; +import org.apache.iceberg.Schema; +import org.apache.iceberg.orc.OrcRowReader; +import org.apache.iceberg.orc.OrcSchemaWithTypeVisitor; +import org.apache.iceberg.orc.OrcValueReader; +import org.apache.iceberg.orc.OrcValueReaders; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.types.Type; +import org.apache.iceberg.types.Types; +import org.apache.orc.TypeDescription; +import org.apache.orc.storage.ql.exec.vector.StructColumnVector; +import org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch; + +public class FlinkOrcReader implements OrcRowReader { + private final OrcValueReader reader; + + public FlinkOrcReader(Schema iSchema, TypeDescription readSchema) { + this(iSchema, readSchema, ImmutableMap.of()); + } + + public FlinkOrcReader(Schema iSchema, TypeDescription readSchema, Map idToConstant) { + this.reader = + OrcSchemaWithTypeVisitor.visit(iSchema, readSchema, new ReadBuilder(idToConstant)); + } + + @Override + public RowData read(VectorizedRowBatch batch, int row) { + return (RowData) reader.read(new StructColumnVector(batch.size, batch.cols), row); + } + + @Override + public void setBatchContext(long batchOffsetInFile) { + reader.setBatchContext(batchOffsetInFile); + } + + private static class ReadBuilder extends OrcSchemaWithTypeVisitor> { + private final Map idToConstant; + + private ReadBuilder(Map idToConstant) { + this.idToConstant = idToConstant; + } + + @Override + public OrcValueReader record( + Types.StructType iStruct, + TypeDescription record, + List names, + List> fields) { + return FlinkOrcReaders.struct(fields, iStruct, idToConstant); + } + + @Override + public OrcValueReader list( + Types.ListType iList, TypeDescription array, OrcValueReader elementReader) { + return FlinkOrcReaders.array(elementReader); + } + + @Override + public OrcValueReader map( + Types.MapType iMap, + TypeDescription map, + OrcValueReader keyReader, + OrcValueReader valueReader) { + return FlinkOrcReaders.map(keyReader, valueReader); + } + + @Override + public OrcValueReader primitive(Type.PrimitiveType iPrimitive, TypeDescription primitive) { + switch (iPrimitive.typeId()) { + case BOOLEAN: + return OrcValueReaders.booleans(); + case INTEGER: + return OrcValueReaders.ints(); + case LONG: + return OrcValueReaders.longs(); + case FLOAT: + return OrcValueReaders.floats(); + case DOUBLE: + return OrcValueReaders.doubles(); + case DATE: + return FlinkOrcReaders.dates(); + case TIME: + return FlinkOrcReaders.times(); + case TIMESTAMP: + Types.TimestampType timestampType = (Types.TimestampType) iPrimitive; + if (timestampType.shouldAdjustToUTC()) { + return FlinkOrcReaders.timestampTzs(); + } else { + return FlinkOrcReaders.timestamps(); + } + case STRING: + return FlinkOrcReaders.strings(); + case UUID: + case FIXED: + case BINARY: + return OrcValueReaders.bytes(); + case DECIMAL: + Types.DecimalType decimalType = (Types.DecimalType) iPrimitive; + return FlinkOrcReaders.decimals(decimalType.precision(), decimalType.scale()); + default: + throw new IllegalArgumentException( + String.format( + "Invalid iceberg type %s corresponding to ORC type %s", iPrimitive, primitive)); + } + } + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReaders.java 
b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReaders.java new file mode 100644 index 000000000000..7a4a15c7e600 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReaders.java @@ -0,0 +1,283 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.data; + +import java.math.BigDecimal; +import java.time.Instant; +import java.time.LocalDateTime; +import java.time.ZoneOffset; +import java.util.List; +import java.util.Map; +import org.apache.flink.table.data.ArrayData; +import org.apache.flink.table.data.DecimalData; +import org.apache.flink.table.data.GenericArrayData; +import org.apache.flink.table.data.GenericMapData; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.MapData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.data.TimestampData; +import org.apache.iceberg.orc.OrcValueReader; +import org.apache.iceberg.orc.OrcValueReaders; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.types.Types; +import org.apache.orc.storage.ql.exec.vector.BytesColumnVector; +import org.apache.orc.storage.ql.exec.vector.ColumnVector; +import org.apache.orc.storage.ql.exec.vector.DecimalColumnVector; +import org.apache.orc.storage.ql.exec.vector.ListColumnVector; +import org.apache.orc.storage.ql.exec.vector.LongColumnVector; +import org.apache.orc.storage.ql.exec.vector.MapColumnVector; +import org.apache.orc.storage.ql.exec.vector.TimestampColumnVector; +import org.apache.orc.storage.serde2.io.HiveDecimalWritable; + +class FlinkOrcReaders { + private FlinkOrcReaders() {} + + static OrcValueReader strings() { + return StringReader.INSTANCE; + } + + static OrcValueReader dates() { + return DateReader.INSTANCE; + } + + static OrcValueReader decimals(int precision, int scale) { + if (precision <= 18) { + return new Decimal18Reader(precision, scale); + } else if (precision <= 38) { + return new Decimal38Reader(precision, scale); + } else { + throw new IllegalArgumentException("Invalid precision: " + precision); + } + } + + static OrcValueReader times() { + return TimeReader.INSTANCE; + } + + static OrcValueReader timestamps() { + return TimestampReader.INSTANCE; + } + + static OrcValueReader timestampTzs() { + return TimestampTzReader.INSTANCE; + } + + static OrcValueReader array(OrcValueReader elementReader) { + return new ArrayReader<>(elementReader); + } + + public static OrcValueReader map( + OrcValueReader keyReader, OrcValueReader valueReader) { + return 
new MapReader<>(keyReader, valueReader); + } + + public static OrcValueReader struct( + List> readers, Types.StructType struct, Map idToConstant) { + return new StructReader(readers, struct, idToConstant); + } + + private static class StringReader implements OrcValueReader { + private static final StringReader INSTANCE = new StringReader(); + + @Override + public StringData nonNullRead(ColumnVector vector, int row) { + BytesColumnVector bytesVector = (BytesColumnVector) vector; + return StringData.fromBytes( + bytesVector.vector[row], bytesVector.start[row], bytesVector.length[row]); + } + } + + private static class DateReader implements OrcValueReader { + private static final DateReader INSTANCE = new DateReader(); + + @Override + public Integer nonNullRead(ColumnVector vector, int row) { + return (int) ((LongColumnVector) vector).vector[row]; + } + } + + private static class Decimal18Reader implements OrcValueReader { + private final int precision; + private final int scale; + + Decimal18Reader(int precision, int scale) { + this.precision = precision; + this.scale = scale; + } + + @Override + public DecimalData nonNullRead(ColumnVector vector, int row) { + HiveDecimalWritable value = ((DecimalColumnVector) vector).vector[row]; + + // The hive ORC writer may will adjust the scale of decimal data. + Preconditions.checkArgument( + value.precision() <= precision, + "Cannot read value as decimal(%s,%s), too large: %s", + precision, + scale, + value); + + return DecimalData.fromUnscaledLong(value.serialize64(scale), precision, scale); + } + } + + private static class Decimal38Reader implements OrcValueReader { + private final int precision; + private final int scale; + + Decimal38Reader(int precision, int scale) { + this.precision = precision; + this.scale = scale; + } + + @Override + public DecimalData nonNullRead(ColumnVector vector, int row) { + BigDecimal value = + ((DecimalColumnVector) vector).vector[row].getHiveDecimal().bigDecimalValue(); + + Preconditions.checkArgument( + value.precision() <= precision, + "Cannot read value as decimal(%s,%s), too large: %s", + precision, + scale, + value); + + return DecimalData.fromBigDecimal(value, precision, scale); + } + } + + private static class TimeReader implements OrcValueReader { + private static final TimeReader INSTANCE = new TimeReader(); + + @Override + public Integer nonNullRead(ColumnVector vector, int row) { + long micros = ((LongColumnVector) vector).vector[row]; + // Flink only support time mills, just erase micros. 
+ return (int) (micros / 1000); + } + } + + private static class TimestampReader implements OrcValueReader { + private static final TimestampReader INSTANCE = new TimestampReader(); + + @Override + public TimestampData nonNullRead(ColumnVector vector, int row) { + TimestampColumnVector tcv = (TimestampColumnVector) vector; + LocalDateTime localDate = + Instant.ofEpochSecond(Math.floorDiv(tcv.time[row], 1_000), tcv.nanos[row]) + .atOffset(ZoneOffset.UTC) + .toLocalDateTime(); + return TimestampData.fromLocalDateTime(localDate); + } + } + + private static class TimestampTzReader implements OrcValueReader { + private static final TimestampTzReader INSTANCE = new TimestampTzReader(); + + @Override + public TimestampData nonNullRead(ColumnVector vector, int row) { + TimestampColumnVector tcv = (TimestampColumnVector) vector; + Instant instant = + Instant.ofEpochSecond(Math.floorDiv(tcv.time[row], 1_000), tcv.nanos[row]) + .atOffset(ZoneOffset.UTC) + .toInstant(); + return TimestampData.fromInstant(instant); + } + } + + private static class ArrayReader implements OrcValueReader { + private final OrcValueReader elementReader; + + private ArrayReader(OrcValueReader elementReader) { + this.elementReader = elementReader; + } + + @Override + public ArrayData nonNullRead(ColumnVector vector, int row) { + ListColumnVector listVector = (ListColumnVector) vector; + int offset = (int) listVector.offsets[row]; + int length = (int) listVector.lengths[row]; + List elements = Lists.newArrayListWithExpectedSize(length); + for (int c = 0; c < length; ++c) { + elements.add(elementReader.read(listVector.child, offset + c)); + } + return new GenericArrayData(elements.toArray()); + } + + @Override + public void setBatchContext(long batchOffsetInFile) { + elementReader.setBatchContext(batchOffsetInFile); + } + } + + private static class MapReader implements OrcValueReader { + private final OrcValueReader keyReader; + private final OrcValueReader valueReader; + + private MapReader(OrcValueReader keyReader, OrcValueReader valueReader) { + this.keyReader = keyReader; + this.valueReader = valueReader; + } + + @Override + public MapData nonNullRead(ColumnVector vector, int row) { + MapColumnVector mapVector = (MapColumnVector) vector; + int offset = (int) mapVector.offsets[row]; + long length = mapVector.lengths[row]; + + Map map = Maps.newHashMap(); + for (int c = 0; c < length; c++) { + K key = keyReader.read(mapVector.keys, offset + c); + V value = valueReader.read(mapVector.values, offset + c); + map.put(key, value); + } + + return new GenericMapData(map); + } + + @Override + public void setBatchContext(long batchOffsetInFile) { + keyReader.setBatchContext(batchOffsetInFile); + valueReader.setBatchContext(batchOffsetInFile); + } + } + + private static class StructReader extends OrcValueReaders.StructReader { + private final int numFields; + + StructReader( + List> readers, Types.StructType struct, Map idToConstant) { + super(readers, struct, idToConstant); + this.numFields = struct.fields().size(); + } + + @Override + protected RowData create() { + return new GenericRowData(numFields); + } + + @Override + protected void set(RowData struct, int pos, Object value) { + ((GenericRowData) struct).setField(pos, value); + } + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriter.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriter.java new file mode 100644 index 000000000000..6a31accffd22 --- /dev/null +++ 
b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriter.java @@ -0,0 +1,163 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.data; + +import java.util.Deque; +import java.util.List; +import java.util.stream.Stream; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.FieldMetrics; +import org.apache.iceberg.Schema; +import org.apache.iceberg.data.orc.GenericOrcWriters; +import org.apache.iceberg.orc.OrcRowWriter; +import org.apache.iceberg.orc.OrcValueWriter; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.types.Type; +import org.apache.iceberg.types.Types; +import org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch; + +public class FlinkOrcWriter implements OrcRowWriter { + private final FlinkOrcWriters.RowDataWriter writer; + + private FlinkOrcWriter(RowType rowType, Schema iSchema) { + this.writer = + (FlinkOrcWriters.RowDataWriter) + FlinkSchemaVisitor.visit(rowType, iSchema, new WriteBuilder()); + } + + public static OrcRowWriter buildWriter(RowType rowType, Schema iSchema) { + return new FlinkOrcWriter(rowType, iSchema); + } + + @Override + public void write(RowData row, VectorizedRowBatch output) { + Preconditions.checkArgument(row != null, "value must not be null"); + writer.writeRow(row, output); + } + + @Override + public List> writers() { + return writer.writers(); + } + + @Override + public Stream> metrics() { + return writer.metrics(); + } + + private static class WriteBuilder extends FlinkSchemaVisitor> { + private final Deque fieldIds = Lists.newLinkedList(); + + private WriteBuilder() {} + + @Override + public void beforeField(Types.NestedField field) { + fieldIds.push(field.fieldId()); + } + + @Override + public void afterField(Types.NestedField field) { + fieldIds.pop(); + } + + @Override + public OrcValueWriter record( + Types.StructType iStruct, List> results, List fieldType) { + return FlinkOrcWriters.struct(results, fieldType); + } + + @Override + public OrcValueWriter map( + Types.MapType iMap, + OrcValueWriter key, + OrcValueWriter value, + LogicalType keyType, + LogicalType valueType) { + return FlinkOrcWriters.map(key, value, keyType, valueType); + } + + @Override + public OrcValueWriter list( + Types.ListType iList, OrcValueWriter element, LogicalType elementType) { + return FlinkOrcWriters.list(element, elementType); + } + + @Override + public OrcValueWriter primitive(Type.PrimitiveType iPrimitive, LogicalType flinkPrimitive) { + switch (iPrimitive.typeId()) { + case BOOLEAN: + return 
GenericOrcWriters.booleans(); + case INTEGER: + switch (flinkPrimitive.getTypeRoot()) { + case TINYINT: + return GenericOrcWriters.bytes(); + case SMALLINT: + return GenericOrcWriters.shorts(); + } + return GenericOrcWriters.ints(); + case LONG: + return GenericOrcWriters.longs(); + case FLOAT: + Preconditions.checkArgument( + fieldIds.peek() != null, + String.format( + "[BUG] Cannot find field id for primitive field with type %s. This is likely because id " + + "information is not properly pushed during schema visiting.", + iPrimitive)); + return GenericOrcWriters.floats(fieldIds.peek()); + case DOUBLE: + Preconditions.checkArgument( + fieldIds.peek() != null, + String.format( + "[BUG] Cannot find field id for primitive field with type %s. This is likely because id " + + "information is not properly pushed during schema visiting.", + iPrimitive)); + return GenericOrcWriters.doubles(fieldIds.peek()); + case DATE: + return FlinkOrcWriters.dates(); + case TIME: + return FlinkOrcWriters.times(); + case TIMESTAMP: + Types.TimestampType timestampType = (Types.TimestampType) iPrimitive; + if (timestampType.shouldAdjustToUTC()) { + return FlinkOrcWriters.timestampTzs(); + } else { + return FlinkOrcWriters.timestamps(); + } + case STRING: + return FlinkOrcWriters.strings(); + case UUID: + case FIXED: + case BINARY: + return GenericOrcWriters.byteArrays(); + case DECIMAL: + Types.DecimalType decimalType = (Types.DecimalType) iPrimitive; + return FlinkOrcWriters.decimals(decimalType.precision(), decimalType.scale()); + default: + throw new IllegalArgumentException( + String.format( + "Invalid iceberg type %s corresponding to Flink logical type %s", + iPrimitive, flinkPrimitive)); + } + } + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriters.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriters.java new file mode 100644 index 000000000000..da2f95cf822f --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriters.java @@ -0,0 +1,317 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.data; + +import java.time.Instant; +import java.time.OffsetDateTime; +import java.time.ZoneOffset; +import java.util.List; +import java.util.stream.Stream; +import org.apache.flink.table.data.ArrayData; +import org.apache.flink.table.data.DecimalData; +import org.apache.flink.table.data.MapData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.data.TimestampData; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.iceberg.FieldMetrics; +import org.apache.iceberg.data.orc.GenericOrcWriters; +import org.apache.iceberg.orc.OrcValueWriter; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.orc.storage.common.type.HiveDecimal; +import org.apache.orc.storage.ql.exec.vector.BytesColumnVector; +import org.apache.orc.storage.ql.exec.vector.ColumnVector; +import org.apache.orc.storage.ql.exec.vector.DecimalColumnVector; +import org.apache.orc.storage.ql.exec.vector.ListColumnVector; +import org.apache.orc.storage.ql.exec.vector.LongColumnVector; +import org.apache.orc.storage.ql.exec.vector.MapColumnVector; +import org.apache.orc.storage.ql.exec.vector.TimestampColumnVector; + +class FlinkOrcWriters { + + private FlinkOrcWriters() {} + + static OrcValueWriter strings() { + return StringWriter.INSTANCE; + } + + static OrcValueWriter dates() { + return DateWriter.INSTANCE; + } + + static OrcValueWriter times() { + return TimeWriter.INSTANCE; + } + + static OrcValueWriter timestamps() { + return TimestampWriter.INSTANCE; + } + + static OrcValueWriter timestampTzs() { + return TimestampTzWriter.INSTANCE; + } + + static OrcValueWriter decimals(int precision, int scale) { + if (precision <= 18) { + return new Decimal18Writer(precision, scale); + } else if (precision <= 38) { + return new Decimal38Writer(precision, scale); + } else { + throw new IllegalArgumentException("Invalid precision: " + precision); + } + } + + static OrcValueWriter list( + OrcValueWriter elementWriter, LogicalType elementType) { + return new ListWriter<>(elementWriter, elementType); + } + + static OrcValueWriter map( + OrcValueWriter keyWriter, + OrcValueWriter valueWriter, + LogicalType keyType, + LogicalType valueType) { + return new MapWriter<>(keyWriter, valueWriter, keyType, valueType); + } + + static OrcValueWriter struct(List> writers, List types) { + return new RowDataWriter(writers, types); + } + + private static class StringWriter implements OrcValueWriter { + private static final StringWriter INSTANCE = new StringWriter(); + + @Override + public void nonNullWrite(int rowId, StringData data, ColumnVector output) { + byte[] value = data.toBytes(); + ((BytesColumnVector) output).setRef(rowId, value, 0, value.length); + } + } + + private static class DateWriter implements OrcValueWriter { + private static final DateWriter INSTANCE = new DateWriter(); + + @Override + public void nonNullWrite(int rowId, Integer data, ColumnVector output) { + ((LongColumnVector) output).vector[rowId] = data; + } + } + + private static class TimeWriter implements OrcValueWriter { + private static final TimeWriter INSTANCE = new TimeWriter(); + + @Override + public void nonNullWrite(int rowId, Integer millis, ColumnVector output) { + // The time in flink is in millisecond, while the standard time in iceberg is microsecond. + // So we need to transform it to microsecond. 
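// The widening multiply below (1000L rather than 1000) matters: with a plain int multiply,
// any time of day later than 00:35:47.483 would overflow. For example:
//   int millis = 45_296_789;           // 12:34:56.789
//   millis * 1000                      // overflows int (45_296_789_000 > Integer.MAX_VALUE)
//   millis * 1000L                     // == 45_296_789_000L, the microsecond value ORC stores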
+ ((LongColumnVector) output).vector[rowId] = millis * 1000L; + } + } + + private static class TimestampWriter implements OrcValueWriter { + private static final TimestampWriter INSTANCE = new TimestampWriter(); + + @Override + public void nonNullWrite(int rowId, TimestampData data, ColumnVector output) { + TimestampColumnVector cv = (TimestampColumnVector) output; + cv.setIsUTC(true); + // millis + OffsetDateTime offsetDateTime = data.toInstant().atOffset(ZoneOffset.UTC); + cv.time[rowId] = + offsetDateTime.toEpochSecond() * 1_000 + offsetDateTime.getNano() / 1_000_000; + // truncate nanos to only keep microsecond precision. + cv.nanos[rowId] = (offsetDateTime.getNano() / 1_000) * 1_000; + } + } + + private static class TimestampTzWriter implements OrcValueWriter { + private static final TimestampTzWriter INSTANCE = new TimestampTzWriter(); + + @SuppressWarnings("JavaInstantGetSecondsGetNano") + @Override + public void nonNullWrite(int rowId, TimestampData data, ColumnVector output) { + TimestampColumnVector cv = (TimestampColumnVector) output; + // millis + Instant instant = data.toInstant(); + cv.time[rowId] = instant.toEpochMilli(); + // truncate nanos to only keep microsecond precision. + cv.nanos[rowId] = (instant.getNano() / 1_000) * 1_000; + } + } + + private static class Decimal18Writer implements OrcValueWriter { + private final int precision; + private final int scale; + + Decimal18Writer(int precision, int scale) { + this.precision = precision; + this.scale = scale; + } + + @Override + public void nonNullWrite(int rowId, DecimalData data, ColumnVector output) { + Preconditions.checkArgument( + scale == data.scale(), + "Cannot write value as decimal(%s,%s), wrong scale: %s", + precision, + scale, + data); + Preconditions.checkArgument( + data.precision() <= precision, + "Cannot write value as decimal(%s,%s), too large: %s", + precision, + scale, + data); + + ((DecimalColumnVector) output) + .vector[rowId].setFromLongAndScale(data.toUnscaledLong(), data.scale()); + } + } + + private static class Decimal38Writer implements OrcValueWriter { + private final int precision; + private final int scale; + + Decimal38Writer(int precision, int scale) { + this.precision = precision; + this.scale = scale; + } + + @Override + public void nonNullWrite(int rowId, DecimalData data, ColumnVector output) { + Preconditions.checkArgument( + scale == data.scale(), + "Cannot write value as decimal(%s,%s), wrong scale: %s", + precision, + scale, + data); + Preconditions.checkArgument( + data.precision() <= precision, + "Cannot write value as decimal(%s,%s), too large: %s", + precision, + scale, + data); + + ((DecimalColumnVector) output) + .vector[rowId].set(HiveDecimal.create(data.toBigDecimal(), false)); + } + } + + static class ListWriter implements OrcValueWriter { + private final OrcValueWriter elementWriter; + private final ArrayData.ElementGetter elementGetter; + + ListWriter(OrcValueWriter elementWriter, LogicalType elementType) { + this.elementWriter = elementWriter; + this.elementGetter = ArrayData.createElementGetter(elementType); + } + + @Override + @SuppressWarnings("unchecked") + public void nonNullWrite(int rowId, ArrayData data, ColumnVector output) { + ListColumnVector cv = (ListColumnVector) output; + cv.lengths[rowId] = data.size(); + cv.offsets[rowId] = cv.childCount; + cv.childCount = (int) (cv.childCount + cv.lengths[rowId]); + // make sure the child is big enough. 
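// ORC list vectors are flattened: each row records an (offset, length) pair into one shared
// child vector, and childCount tracks how many child slots are in use overall. For example,
// writing row 0 = ["a", "b"] and then row 1 = ["c"] yields
//   offsets = {0, 2}, lengths = {2, 1}, childCount = 3, child = ["a", "b", "c"]
// which is why the child vector may need to grow before the elements are copied in below.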
+ growColumnVector(cv.child, cv.childCount); + + for (int e = 0; e < cv.lengths[rowId]; ++e) { + Object value = elementGetter.getElementOrNull(data, e); + elementWriter.write((int) (e + cv.offsets[rowId]), (T) value, cv.child); + } + } + + @Override + public Stream> metrics() { + return elementWriter.metrics(); + } + } + + static class MapWriter implements OrcValueWriter { + private final OrcValueWriter keyWriter; + private final OrcValueWriter valueWriter; + private final ArrayData.ElementGetter keyGetter; + private final ArrayData.ElementGetter valueGetter; + + MapWriter( + OrcValueWriter keyWriter, + OrcValueWriter valueWriter, + LogicalType keyType, + LogicalType valueType) { + this.keyWriter = keyWriter; + this.valueWriter = valueWriter; + this.keyGetter = ArrayData.createElementGetter(keyType); + this.valueGetter = ArrayData.createElementGetter(valueType); + } + + @Override + @SuppressWarnings("unchecked") + public void nonNullWrite(int rowId, MapData data, ColumnVector output) { + MapColumnVector cv = (MapColumnVector) output; + ArrayData keyArray = data.keyArray(); + ArrayData valArray = data.valueArray(); + + // record the length and start of the list elements + cv.lengths[rowId] = data.size(); + cv.offsets[rowId] = cv.childCount; + cv.childCount = (int) (cv.childCount + cv.lengths[rowId]); + // make sure the child is big enough + growColumnVector(cv.keys, cv.childCount); + growColumnVector(cv.values, cv.childCount); + // Add each element + for (int e = 0; e < cv.lengths[rowId]; ++e) { + int pos = (int) (e + cv.offsets[rowId]); + keyWriter.write(pos, (K) keyGetter.getElementOrNull(keyArray, e), cv.keys); + valueWriter.write(pos, (V) valueGetter.getElementOrNull(valArray, e), cv.values); + } + } + + @Override + public Stream> metrics() { + return Stream.concat(keyWriter.metrics(), valueWriter.metrics()); + } + } + + static class RowDataWriter extends GenericOrcWriters.StructWriter { + private final List fieldGetters; + + RowDataWriter(List> writers, List types) { + super(writers); + + this.fieldGetters = Lists.newArrayListWithExpectedSize(types.size()); + for (int i = 0; i < types.size(); i++) { + fieldGetters.add(RowData.createFieldGetter(types.get(i), i)); + } + } + + @Override + protected Object get(RowData struct, int index) { + return fieldGetters.get(index).getFieldOrNull(struct); + } + } + + private static void growColumnVector(ColumnVector cv, int requestedSize) { + if (cv.isNull.length < requestedSize) { + // Use growth factor of 3 to avoid frequent array allocations + cv.ensureSize(requestedSize * 3, true); + } + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetReaders.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetReaders.java new file mode 100644 index 000000000000..a5f2bb738960 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetReaders.java @@ -0,0 +1,905 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.data; + +import java.math.BigDecimal; +import java.math.BigInteger; +import java.nio.ByteBuffer; +import java.time.Instant; +import java.time.ZoneOffset; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import org.apache.flink.table.data.ArrayData; +import org.apache.flink.table.data.DecimalData; +import org.apache.flink.table.data.GenericArrayData; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.MapData; +import org.apache.flink.table.data.RawValueData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.data.TimestampData; +import org.apache.iceberg.MetadataColumns; +import org.apache.iceberg.Schema; +import org.apache.iceberg.parquet.ParquetSchemaUtil; +import org.apache.iceberg.parquet.ParquetValueReader; +import org.apache.iceberg.parquet.ParquetValueReaders; +import org.apache.iceberg.parquet.TypeWithSchemaVisitor; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.util.ArrayUtil; +import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.io.api.Binary; +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.LogicalTypeAnnotation; +import org.apache.parquet.schema.LogicalTypeAnnotation.DecimalLogicalTypeAnnotation; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Type; + +public class FlinkParquetReaders { + private FlinkParquetReaders() {} + + public static ParquetValueReader buildReader( + Schema expectedSchema, MessageType fileSchema) { + return buildReader(expectedSchema, fileSchema, ImmutableMap.of()); + } + + @SuppressWarnings("unchecked") + public static ParquetValueReader buildReader( + Schema expectedSchema, MessageType fileSchema, Map idToConstant) { + return (ParquetValueReader) + TypeWithSchemaVisitor.visit( + expectedSchema.asStruct(), fileSchema, new ReadBuilder(fileSchema, idToConstant)); + } + + private static class ReadBuilder extends TypeWithSchemaVisitor> { + private final MessageType type; + private final Map idToConstant; + + ReadBuilder(MessageType type, Map idToConstant) { + this.type = type; + this.idToConstant = idToConstant; + } + + @Override + public ParquetValueReader message( + Types.StructType expected, MessageType message, List> fieldReaders) { + return struct(expected, message.asGroupType(), fieldReaders); + } + + @Override + public ParquetValueReader struct( + Types.StructType expected, GroupType struct, List> fieldReaders) { + // match the expected struct's order + Map> readersById = Maps.newHashMap(); + Map typesById = Maps.newHashMap(); + Map maxDefinitionLevelsById = Maps.newHashMap(); + List fields = 
struct.getFields(); + for (int i = 0; i < fields.size(); i += 1) { + Type fieldType = fields.get(i); + if (fieldReaders.get(i) != null) { + int fieldD = type.getMaxDefinitionLevel(path(fieldType.getName())) - 1; + if (fieldType.getId() != null) { + int id = fieldType.getId().intValue(); + readersById.put(id, ParquetValueReaders.option(fieldType, fieldD, fieldReaders.get(i))); + typesById.put(id, fieldType); + if (idToConstant.containsKey(id)) { + maxDefinitionLevelsById.put(id, fieldD); + } + } + } + } + + List expectedFields = + expected != null ? expected.fields() : ImmutableList.of(); + List> reorderedFields = + Lists.newArrayListWithExpectedSize(expectedFields.size()); + List types = Lists.newArrayListWithExpectedSize(expectedFields.size()); + // Defaulting to parent max definition level + int defaultMaxDefinitionLevel = type.getMaxDefinitionLevel(currentPath()); + for (Types.NestedField field : expectedFields) { + int id = field.fieldId(); + if (idToConstant.containsKey(id)) { + // containsKey is used because the constant may be null + int fieldMaxDefinitionLevel = + maxDefinitionLevelsById.getOrDefault(id, defaultMaxDefinitionLevel); + reorderedFields.add( + ParquetValueReaders.constant(idToConstant.get(id), fieldMaxDefinitionLevel)); + types.add(null); + } else if (id == MetadataColumns.ROW_POSITION.fieldId()) { + reorderedFields.add(ParquetValueReaders.position()); + types.add(null); + } else if (id == MetadataColumns.IS_DELETED.fieldId()) { + reorderedFields.add(ParquetValueReaders.constant(false)); + types.add(null); + } else { + ParquetValueReader reader = readersById.get(id); + if (reader != null) { + reorderedFields.add(reader); + types.add(typesById.get(id)); + } else { + reorderedFields.add(ParquetValueReaders.nulls()); + types.add(null); + } + } + } + + return new RowDataReader(types, reorderedFields); + } + + @Override + public ParquetValueReader list( + Types.ListType expectedList, GroupType array, ParquetValueReader elementReader) { + if (expectedList == null) { + return null; + } + + String[] repeatedPath = currentPath(); + + int repeatedD = type.getMaxDefinitionLevel(repeatedPath) - 1; + int repeatedR = type.getMaxRepetitionLevel(repeatedPath) - 1; + + Type elementType = ParquetSchemaUtil.determineListElementType(array); + int elementD = type.getMaxDefinitionLevel(path(elementType.getName())) - 1; + + return new ArrayReader<>( + repeatedD, repeatedR, ParquetValueReaders.option(elementType, elementD, elementReader)); + } + + @Override + public ParquetValueReader map( + Types.MapType expectedMap, + GroupType map, + ParquetValueReader keyReader, + ParquetValueReader valueReader) { + if (expectedMap == null) { + return null; + } + + GroupType repeatedKeyValue = map.getFields().get(0).asGroupType(); + String[] repeatedPath = currentPath(); + + int repeatedD = type.getMaxDefinitionLevel(repeatedPath) - 1; + int repeatedR = type.getMaxRepetitionLevel(repeatedPath) - 1; + + Type keyType = repeatedKeyValue.getType(0); + int keyD = type.getMaxDefinitionLevel(path(keyType.getName())) - 1; + Type valueType = repeatedKeyValue.getType(1); + int valueD = type.getMaxDefinitionLevel(path(valueType.getName())) - 1; + + return new MapReader<>( + repeatedD, + repeatedR, + ParquetValueReaders.option(keyType, keyD, keyReader), + ParquetValueReaders.option(valueType, valueD, valueReader)); + } + + private static class LogicalTypeAnnotationParquetValueReaderVisitor + implements LogicalTypeAnnotation.LogicalTypeAnnotationVisitor> { + + private final PrimitiveType primitive; + private final 
ColumnDescriptor desc; + private final org.apache.iceberg.types.Type.PrimitiveType expected; + + LogicalTypeAnnotationParquetValueReaderVisitor( + PrimitiveType primitive, + ColumnDescriptor desc, + org.apache.iceberg.types.Type.PrimitiveType expected) { + this.primitive = primitive; + this.desc = desc; + this.expected = expected; + } + + @Override + public Optional> visit( + LogicalTypeAnnotation.StringLogicalTypeAnnotation stringLogicalType) { + return Optional.of(new StringReader(desc)); + } + + @Override + public Optional> visit( + LogicalTypeAnnotation.EnumLogicalTypeAnnotation enumLogicalType) { + return Optional.of(new StringReader(desc)); + } + + @Override + public Optional> visit( + LogicalTypeAnnotation.JsonLogicalTypeAnnotation jsonLogicalType) { + return Optional.of(new StringReader(desc)); + } + + @Override + public Optional> visit( + DecimalLogicalTypeAnnotation decimalLogicalType) { + switch (primitive.getPrimitiveTypeName()) { + case BINARY: + case FIXED_LEN_BYTE_ARRAY: + return Optional.of( + new BinaryDecimalReader( + desc, decimalLogicalType.getPrecision(), decimalLogicalType.getScale())); + case INT64: + return Optional.of( + new LongDecimalReader( + desc, decimalLogicalType.getPrecision(), decimalLogicalType.getScale())); + case INT32: + return Optional.of( + new IntegerDecimalReader( + desc, decimalLogicalType.getPrecision(), decimalLogicalType.getScale())); + } + + return LogicalTypeAnnotation.LogicalTypeAnnotationVisitor.super.visit(decimalLogicalType); + } + + @Override + public Optional> visit( + LogicalTypeAnnotation.DateLogicalTypeAnnotation dateLogicalType) { + return Optional.of(new ParquetValueReaders.UnboxedReader<>(desc)); + } + + @Override + public Optional> visit( + LogicalTypeAnnotation.TimeLogicalTypeAnnotation timeLogicalType) { + if (timeLogicalType.getUnit() == LogicalTypeAnnotation.TimeUnit.MILLIS) { + return Optional.of(new MillisTimeReader(desc)); + } else if (timeLogicalType.getUnit() == LogicalTypeAnnotation.TimeUnit.MICROS) { + return Optional.of(new LossyMicrosToMillisTimeReader(desc)); + } + + return LogicalTypeAnnotation.LogicalTypeAnnotationVisitor.super.visit(timeLogicalType); + } + + @Override + public Optional> visit( + LogicalTypeAnnotation.TimestampLogicalTypeAnnotation timestampLogicalType) { + if (timestampLogicalType.getUnit() == LogicalTypeAnnotation.TimeUnit.MILLIS) { + if (timestampLogicalType.isAdjustedToUTC()) { + return Optional.of(new MillisToTimestampTzReader(desc)); + } else { + return Optional.of(new MillisToTimestampReader(desc)); + } + } else if (timestampLogicalType.getUnit() == LogicalTypeAnnotation.TimeUnit.MICROS) { + if (timestampLogicalType.isAdjustedToUTC()) { + return Optional.of(new MicrosToTimestampTzReader(desc)); + } else { + return Optional.of(new MicrosToTimestampReader(desc)); + } + } + + return LogicalTypeAnnotation.LogicalTypeAnnotationVisitor.super.visit(timestampLogicalType); + } + + @Override + public Optional> visit( + LogicalTypeAnnotation.IntLogicalTypeAnnotation intLogicalType) { + int width = intLogicalType.getBitWidth(); + if (width <= 32) { + if (expected.typeId() == Types.LongType.get().typeId()) { + return Optional.of(new ParquetValueReaders.IntAsLongReader(desc)); + } else { + return Optional.of(new ParquetValueReaders.UnboxedReader<>(desc)); + } + } else if (width <= 64) { + return Optional.of(new ParquetValueReaders.UnboxedReader<>(desc)); + } + + return LogicalTypeAnnotation.LogicalTypeAnnotationVisitor.super.visit(intLogicalType); + } + + @Override + public Optional> visit( + 
LogicalTypeAnnotation.BsonLogicalTypeAnnotation bsonLogicalType) { + return Optional.of(new ParquetValueReaders.ByteArrayReader(desc)); + } + } + + @Override + @SuppressWarnings("CyclomaticComplexity") + public ParquetValueReader primitive( + org.apache.iceberg.types.Type.PrimitiveType expected, PrimitiveType primitive) { + if (expected == null) { + return null; + } + + ColumnDescriptor desc = type.getColumnDescription(currentPath()); + LogicalTypeAnnotation logicalTypeAnnotation = primitive.getLogicalTypeAnnotation(); + if (logicalTypeAnnotation != null) { + return logicalTypeAnnotation + .accept(new LogicalTypeAnnotationParquetValueReaderVisitor(primitive, desc, expected)) + .orElseThrow( + () -> + new UnsupportedOperationException( + "Unsupported logical type: " + primitive.getLogicalTypeAnnotation())); + } + + switch (primitive.getPrimitiveTypeName()) { + case FIXED_LEN_BYTE_ARRAY: + case BINARY: + return new ParquetValueReaders.ByteArrayReader(desc); + case INT32: + if (expected.typeId() == org.apache.iceberg.types.Type.TypeID.LONG) { + return new ParquetValueReaders.IntAsLongReader(desc); + } else { + return new ParquetValueReaders.UnboxedReader<>(desc); + } + case FLOAT: + if (expected.typeId() == org.apache.iceberg.types.Type.TypeID.DOUBLE) { + return new ParquetValueReaders.FloatAsDoubleReader(desc); + } else { + return new ParquetValueReaders.UnboxedReader<>(desc); + } + case BOOLEAN: + case INT64: + case DOUBLE: + return new ParquetValueReaders.UnboxedReader<>(desc); + default: + throw new UnsupportedOperationException("Unsupported type: " + primitive); + } + } + } + + private static class BinaryDecimalReader + extends ParquetValueReaders.PrimitiveReader { + private final int precision; + private final int scale; + + BinaryDecimalReader(ColumnDescriptor desc, int precision, int scale) { + super(desc); + this.precision = precision; + this.scale = scale; + } + + @Override + public DecimalData read(DecimalData ignored) { + Binary binary = column.nextBinary(); + BigDecimal bigDecimal = new BigDecimal(new BigInteger(binary.getBytes()), scale); + // TODO: need a unit test to write-read-validate decimal via FlinkParquetWrite/Reader + return DecimalData.fromBigDecimal(bigDecimal, precision, scale); + } + } + + private static class IntegerDecimalReader + extends ParquetValueReaders.PrimitiveReader { + private final int precision; + private final int scale; + + IntegerDecimalReader(ColumnDescriptor desc, int precision, int scale) { + super(desc); + this.precision = precision; + this.scale = scale; + } + + @Override + public DecimalData read(DecimalData ignored) { + return DecimalData.fromUnscaledLong(column.nextInteger(), precision, scale); + } + } + + private static class LongDecimalReader extends ParquetValueReaders.PrimitiveReader { + private final int precision; + private final int scale; + + LongDecimalReader(ColumnDescriptor desc, int precision, int scale) { + super(desc); + this.precision = precision; + this.scale = scale; + } + + @Override + public DecimalData read(DecimalData ignored) { + return DecimalData.fromUnscaledLong(column.nextLong(), precision, scale); + } + } + + private static class MicrosToTimestampTzReader + extends ParquetValueReaders.UnboxedReader { + MicrosToTimestampTzReader(ColumnDescriptor desc) { + super(desc); + } + + @Override + public TimestampData read(TimestampData ignored) { + long value = readLong(); + return TimestampData.fromLocalDateTime( + Instant.ofEpochSecond( + Math.floorDiv(value, 1000_000L), Math.floorMod(value, 1000_000L) * 1000L) + 
.atOffset(ZoneOffset.UTC) + .toLocalDateTime()); + } + + @Override + public long readLong() { + return column.nextLong(); + } + } + + private static class MicrosToTimestampReader + extends ParquetValueReaders.UnboxedReader { + MicrosToTimestampReader(ColumnDescriptor desc) { + super(desc); + } + + @Override + public TimestampData read(TimestampData ignored) { + long value = readLong(); + return TimestampData.fromInstant( + Instant.ofEpochSecond( + Math.floorDiv(value, 1000_000L), Math.floorMod(value, 1000_000L) * 1000L)); + } + + @Override + public long readLong() { + return column.nextLong(); + } + } + + private static class MillisToTimestampReader + extends ParquetValueReaders.UnboxedReader { + MillisToTimestampReader(ColumnDescriptor desc) { + super(desc); + } + + @Override + public TimestampData read(TimestampData ignored) { + long millis = readLong(); + return TimestampData.fromEpochMillis(millis); + } + + @Override + public long readLong() { + return column.nextLong(); + } + } + + private static class MillisToTimestampTzReader + extends ParquetValueReaders.UnboxedReader { + MillisToTimestampTzReader(ColumnDescriptor desc) { + super(desc); + } + + @Override + public TimestampData read(TimestampData ignored) { + long millis = readLong(); + return TimestampData.fromLocalDateTime( + Instant.ofEpochMilli(millis).atOffset(ZoneOffset.UTC).toLocalDateTime()); + } + + @Override + public long readLong() { + return column.nextLong(); + } + } + + private static class StringReader extends ParquetValueReaders.PrimitiveReader { + StringReader(ColumnDescriptor desc) { + super(desc); + } + + @Override + public StringData read(StringData ignored) { + Binary binary = column.nextBinary(); + ByteBuffer buffer = binary.toByteBuffer(); + if (buffer.hasArray()) { + return StringData.fromBytes( + buffer.array(), buffer.arrayOffset() + buffer.position(), buffer.remaining()); + } else { + return StringData.fromBytes(binary.getBytes()); + } + } + } + + private static class LossyMicrosToMillisTimeReader + extends ParquetValueReaders.PrimitiveReader { + LossyMicrosToMillisTimeReader(ColumnDescriptor desc) { + super(desc); + } + + @Override + public Integer read(Integer reuse) { + // Discard microseconds since Flink uses millisecond unit for TIME type. 
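// Parquet TIME(MICROS) values are int64 microseconds of the day, while Flink's TIME is an
// int of milliseconds, so the microsecond remainder is discarded (hence "Lossy"): 999
// microseconds reads back as 0 milliseconds, and 45_296_789_012 microseconds (12:34:56.789012)
// reads back as 45_296_789. Math.floorDiv rounds toward negative infinity, which only differs
// from a plain division for negative inputs.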
+ return (int) Math.floorDiv(column.nextLong(), 1000L); + } + } + + private static class MillisTimeReader extends ParquetValueReaders.PrimitiveReader { + MillisTimeReader(ColumnDescriptor desc) { + super(desc); + } + + @Override + public Integer read(Integer reuse) { + return (int) column.nextLong(); + } + } + + private static class ArrayReader + extends ParquetValueReaders.RepeatedReader { + private int readPos = 0; + private int writePos = 0; + + ArrayReader(int definitionLevel, int repetitionLevel, ParquetValueReader reader) { + super(definitionLevel, repetitionLevel, reader); + } + + @Override + protected ReusableArrayData newListData(ArrayData reuse) { + this.readPos = 0; + this.writePos = 0; + + if (reuse instanceof ReusableArrayData) { + return (ReusableArrayData) reuse; + } else { + return new ReusableArrayData(); + } + } + + @Override + @SuppressWarnings("unchecked") + protected E getElement(ReusableArrayData list) { + E value = null; + if (readPos < list.capacity()) { + value = (E) list.values[readPos]; + } + + readPos += 1; + + return value; + } + + @Override + protected void addElement(ReusableArrayData reused, E element) { + if (writePos >= reused.capacity()) { + reused.grow(); + } + + reused.values[writePos] = element; + + writePos += 1; + } + + @Override + protected ArrayData buildList(ReusableArrayData list) { + // Since ReusableArrayData is not accepted by Flink, use GenericArrayData temporarily to walk + // around it. + // Revert this to use ReusableArrayData once it is fixed in Flink. + // For your reference, https://issues.apache.org/jira/browse/FLINK-25238. + return new GenericArrayData(Arrays.copyOf(list.values, writePos)); + } + } + + private static class MapReader + extends ParquetValueReaders.RepeatedKeyValueReader { + private int readPos = 0; + private int writePos = 0; + + private final ParquetValueReaders.ReusableEntry entry = + new ParquetValueReaders.ReusableEntry<>(); + private final ParquetValueReaders.ReusableEntry nullEntry = + new ParquetValueReaders.ReusableEntry<>(); + + MapReader( + int definitionLevel, + int repetitionLevel, + ParquetValueReader keyReader, + ParquetValueReader valueReader) { + super(definitionLevel, repetitionLevel, keyReader, valueReader); + } + + @Override + protected ReusableMapData newMapData(MapData reuse) { + this.readPos = 0; + this.writePos = 0; + + if (reuse instanceof ReusableMapData) { + return (ReusableMapData) reuse; + } else { + return new ReusableMapData(); + } + } + + @Override + @SuppressWarnings("unchecked") + protected Map.Entry getPair(ReusableMapData map) { + Map.Entry kv = nullEntry; + if (readPos < map.capacity()) { + entry.set((K) map.keys.values[readPos], (V) map.values.values[readPos]); + kv = entry; + } + + readPos += 1; + + return kv; + } + + @Override + protected void addPair(ReusableMapData map, K key, V value) { + if (writePos >= map.capacity()) { + map.grow(); + } + + map.keys.values[writePos] = key; + map.values.values[writePos] = value; + + writePos += 1; + } + + @Override + protected MapData buildMap(ReusableMapData map) { + map.setNumElements(writePos); + return map; + } + } + + private static class RowDataReader + extends ParquetValueReaders.StructReader { + private final int numFields; + + RowDataReader(List types, List> readers) { + super(types, readers); + this.numFields = readers.size(); + } + + @Override + protected GenericRowData newStructData(RowData reuse) { + if (reuse instanceof GenericRowData) { + return (GenericRowData) reuse; + } else { + return new GenericRowData(numFields); + } + } + 
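// Object-reuse pattern: when the enclosing Iceberg Parquet reader is configured to reuse
// containers, it typically passes the previously returned row back in as `reuse`; if that row
// is a GenericRowData it is recycled and its fields are overwritten through the set*/setNull
// callbacks below, otherwise a fresh GenericRowData sized to the projected field count is
// allocated. Callers that need to keep a row across reads should therefore copy it first.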
+ @Override + protected Object getField(GenericRowData intermediate, int pos) { + return intermediate.getField(pos); + } + + @Override + protected RowData buildStruct(GenericRowData struct) { + return struct; + } + + @Override + protected void set(GenericRowData row, int pos, Object value) { + row.setField(pos, value); + } + + @Override + protected void setNull(GenericRowData row, int pos) { + row.setField(pos, null); + } + + @Override + protected void setBoolean(GenericRowData row, int pos, boolean value) { + row.setField(pos, value); + } + + @Override + protected void setInteger(GenericRowData row, int pos, int value) { + row.setField(pos, value); + } + + @Override + protected void setLong(GenericRowData row, int pos, long value) { + row.setField(pos, value); + } + + @Override + protected void setFloat(GenericRowData row, int pos, float value) { + row.setField(pos, value); + } + + @Override + protected void setDouble(GenericRowData row, int pos, double value) { + row.setField(pos, value); + } + } + + private static class ReusableMapData implements MapData { + private final ReusableArrayData keys; + private final ReusableArrayData values; + + private int numElements; + + private ReusableMapData() { + this.keys = new ReusableArrayData(); + this.values = new ReusableArrayData(); + } + + private void grow() { + keys.grow(); + values.grow(); + } + + private int capacity() { + return keys.capacity(); + } + + public void setNumElements(int numElements) { + this.numElements = numElements; + keys.setNumElements(numElements); + values.setNumElements(numElements); + } + + @Override + public int size() { + return numElements; + } + + @Override + public ReusableArrayData keyArray() { + return keys; + } + + @Override + public ReusableArrayData valueArray() { + return values; + } + } + + private static class ReusableArrayData implements ArrayData { + private static final Object[] EMPTY = new Object[0]; + + private Object[] values = EMPTY; + private int numElements = 0; + + private void grow() { + if (values.length == 0) { + this.values = new Object[20]; + } else { + Object[] old = values; + this.values = new Object[old.length << 1]; + // copy the old array in case it has values that can be reused + System.arraycopy(old, 0, values, 0, old.length); + } + } + + private int capacity() { + return values.length; + } + + public void setNumElements(int numElements) { + this.numElements = numElements; + } + + @Override + public int size() { + return numElements; + } + + @Override + public boolean isNullAt(int ordinal) { + return null == values[ordinal]; + } + + @Override + public boolean getBoolean(int ordinal) { + return (boolean) values[ordinal]; + } + + @Override + public byte getByte(int ordinal) { + return (byte) values[ordinal]; + } + + @Override + public short getShort(int ordinal) { + return (short) values[ordinal]; + } + + @Override + public int getInt(int ordinal) { + return (int) values[ordinal]; + } + + @Override + public long getLong(int ordinal) { + return (long) values[ordinal]; + } + + @Override + public float getFloat(int ordinal) { + return (float) values[ordinal]; + } + + @Override + public double getDouble(int ordinal) { + return (double) values[ordinal]; + } + + @Override + public StringData getString(int pos) { + return (StringData) values[pos]; + } + + @Override + public DecimalData getDecimal(int pos, int precision, int scale) { + return (DecimalData) values[pos]; + } + + @Override + public TimestampData getTimestamp(int pos, int precision) { + return (TimestampData) values[pos]; + } + + 
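// These positional getters simply cast the boxed Object stored at each slot, so the element
// type must match the array's Flink LogicalType; nulls are plain null entries, which is what
// isNullAt() checks. Capacity (values.length) and logical size (numElements) are tracked
// separately: grow() doubles the backing array (starting at 20), while setNumElements() only
// moves the logical end and leaves any stale objects beyond it in place.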
@SuppressWarnings("unchecked") + @Override + public RawValueData getRawValue(int pos) { + return (RawValueData) values[pos]; + } + + @Override + public byte[] getBinary(int ordinal) { + return (byte[]) values[ordinal]; + } + + @Override + public ArrayData getArray(int ordinal) { + return (ArrayData) values[ordinal]; + } + + @Override + public MapData getMap(int ordinal) { + return (MapData) values[ordinal]; + } + + @Override + public RowData getRow(int pos, int numFields) { + return (RowData) values[pos]; + } + + @Override + public boolean[] toBooleanArray() { + return ArrayUtil.toPrimitive((Boolean[]) values); + } + + @Override + public byte[] toByteArray() { + return ArrayUtil.toPrimitive((Byte[]) values); + } + + @Override + public short[] toShortArray() { + return ArrayUtil.toPrimitive((Short[]) values); + } + + @Override + public int[] toIntArray() { + return ArrayUtil.toPrimitive((Integer[]) values); + } + + @Override + public long[] toLongArray() { + return ArrayUtil.toPrimitive((Long[]) values); + } + + @Override + public float[] toFloatArray() { + return ArrayUtil.toPrimitive((Float[]) values); + } + + @Override + public double[] toDoubleArray() { + return ArrayUtil.toPrimitive((Double[]) values); + } + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetWriters.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetWriters.java new file mode 100644 index 000000000000..db4f1730a134 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetWriters.java @@ -0,0 +1,504 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.data; + +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.NoSuchElementException; +import org.apache.flink.table.data.ArrayData; +import org.apache.flink.table.data.DecimalData; +import org.apache.flink.table.data.MapData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.data.TimestampData; +import org.apache.flink.table.types.logical.ArrayType; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.MapType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.table.types.logical.RowType.RowField; +import org.apache.flink.table.types.logical.SmallIntType; +import org.apache.flink.table.types.logical.TinyIntType; +import org.apache.iceberg.parquet.ParquetValueReaders; +import org.apache.iceberg.parquet.ParquetValueWriter; +import org.apache.iceberg.parquet.ParquetValueWriters; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.types.TypeUtil; +import org.apache.iceberg.util.DecimalUtil; +import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.io.api.Binary; +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.LogicalTypeAnnotation.DecimalLogicalTypeAnnotation; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Type; + +public class FlinkParquetWriters { + private FlinkParquetWriters() {} + + @SuppressWarnings("unchecked") + public static ParquetValueWriter buildWriter(LogicalType schema, MessageType type) { + return (ParquetValueWriter) + ParquetWithFlinkSchemaVisitor.visit(schema, type, new WriteBuilder(type)); + } + + private static class WriteBuilder extends ParquetWithFlinkSchemaVisitor> { + private final MessageType type; + + WriteBuilder(MessageType type) { + this.type = type; + } + + @Override + public ParquetValueWriter message( + RowType sStruct, MessageType message, List> fields) { + return struct(sStruct, message.asGroupType(), fields); + } + + @Override + public ParquetValueWriter struct( + RowType sStruct, GroupType struct, List> fieldWriters) { + List fields = struct.getFields(); + List flinkFields = sStruct.getFields(); + List> writers = Lists.newArrayListWithExpectedSize(fieldWriters.size()); + List flinkTypes = Lists.newArrayList(); + for (int i = 0; i < fields.size(); i += 1) { + writers.add(newOption(struct.getType(i), fieldWriters.get(i))); + flinkTypes.add(flinkFields.get(i).getType()); + } + + return new RowDataWriter(writers, flinkTypes); + } + + @Override + public ParquetValueWriter list( + ArrayType sArray, GroupType array, ParquetValueWriter elementWriter) { + GroupType repeated = array.getFields().get(0).asGroupType(); + String[] repeatedPath = currentPath(); + + int repeatedD = type.getMaxDefinitionLevel(repeatedPath); + int repeatedR = type.getMaxRepetitionLevel(repeatedPath); + + return new ArrayDataWriter<>( + repeatedD, + repeatedR, + newOption(repeated.getType(0), elementWriter), + sArray.getElementType()); + } + + @Override + public ParquetValueWriter map( + MapType sMap, + GroupType map, + ParquetValueWriter keyWriter, + ParquetValueWriter valueWriter) { + GroupType repeatedKeyValue = map.getFields().get(0).asGroupType(); + String[] repeatedPath = currentPath(); + + int repeatedD = 
type.getMaxDefinitionLevel(repeatedPath); + int repeatedR = type.getMaxRepetitionLevel(repeatedPath); + + return new MapDataWriter<>( + repeatedD, + repeatedR, + newOption(repeatedKeyValue.getType(0), keyWriter), + newOption(repeatedKeyValue.getType(1), valueWriter), + sMap.getKeyType(), + sMap.getValueType()); + } + + private ParquetValueWriter newOption(Type fieldType, ParquetValueWriter writer) { + int maxD = type.getMaxDefinitionLevel(path(fieldType.getName())); + return ParquetValueWriters.option(fieldType, maxD, writer); + } + + @Override + public ParquetValueWriter primitive(LogicalType fType, PrimitiveType primitive) { + ColumnDescriptor desc = type.getColumnDescription(currentPath()); + + if (primitive.getOriginalType() != null) { + switch (primitive.getOriginalType()) { + case ENUM: + case JSON: + case UTF8: + return strings(desc); + case DATE: + case INT_8: + case INT_16: + case INT_32: + return ints(fType, desc); + case INT_64: + return ParquetValueWriters.longs(desc); + case TIME_MICROS: + return timeMicros(desc); + case TIMESTAMP_MICROS: + return timestamps(desc); + case DECIMAL: + DecimalLogicalTypeAnnotation decimal = + (DecimalLogicalTypeAnnotation) primitive.getLogicalTypeAnnotation(); + switch (primitive.getPrimitiveTypeName()) { + case INT32: + return decimalAsInteger(desc, decimal.getPrecision(), decimal.getScale()); + case INT64: + return decimalAsLong(desc, decimal.getPrecision(), decimal.getScale()); + case BINARY: + case FIXED_LEN_BYTE_ARRAY: + return decimalAsFixed(desc, decimal.getPrecision(), decimal.getScale()); + default: + throw new UnsupportedOperationException( + "Unsupported base type for decimal: " + primitive.getPrimitiveTypeName()); + } + case BSON: + return byteArrays(desc); + default: + throw new UnsupportedOperationException( + "Unsupported logical type: " + primitive.getOriginalType()); + } + } + + switch (primitive.getPrimitiveTypeName()) { + case FIXED_LEN_BYTE_ARRAY: + case BINARY: + return byteArrays(desc); + case BOOLEAN: + return ParquetValueWriters.booleans(desc); + case INT32: + return ints(fType, desc); + case INT64: + return ParquetValueWriters.longs(desc); + case FLOAT: + return ParquetValueWriters.floats(desc); + case DOUBLE: + return ParquetValueWriters.doubles(desc); + default: + throw new UnsupportedOperationException("Unsupported type: " + primitive); + } + } + } + + private static ParquetValueWriters.PrimitiveWriter ints( + LogicalType type, ColumnDescriptor desc) { + if (type instanceof TinyIntType) { + return ParquetValueWriters.tinyints(desc); + } else if (type instanceof SmallIntType) { + return ParquetValueWriters.shorts(desc); + } + return ParquetValueWriters.ints(desc); + } + + private static ParquetValueWriters.PrimitiveWriter strings(ColumnDescriptor desc) { + return new StringDataWriter(desc); + } + + private static ParquetValueWriters.PrimitiveWriter timeMicros(ColumnDescriptor desc) { + return new TimeMicrosWriter(desc); + } + + private static ParquetValueWriters.PrimitiveWriter decimalAsInteger( + ColumnDescriptor desc, int precision, int scale) { + Preconditions.checkArgument( + precision <= 9, + "Cannot write decimal value as integer with precision larger than 9," + + " wrong precision %s", + precision); + return new IntegerDecimalWriter(desc, precision, scale); + } + + private static ParquetValueWriters.PrimitiveWriter decimalAsLong( + ColumnDescriptor desc, int precision, int scale) { + Preconditions.checkArgument( + precision <= 18, + "Cannot write decimal value as long with precision larger than 18, " + + " wrong 
precision %s", + precision); + return new LongDecimalWriter(desc, precision, scale); + } + + private static ParquetValueWriters.PrimitiveWriter decimalAsFixed( + ColumnDescriptor desc, int precision, int scale) { + return new FixedDecimalWriter(desc, precision, scale); + } + + private static ParquetValueWriters.PrimitiveWriter timestamps( + ColumnDescriptor desc) { + return new TimestampDataWriter(desc); + } + + private static ParquetValueWriters.PrimitiveWriter byteArrays(ColumnDescriptor desc) { + return new ByteArrayWriter(desc); + } + + private static class StringDataWriter extends ParquetValueWriters.PrimitiveWriter { + private StringDataWriter(ColumnDescriptor desc) { + super(desc); + } + + @Override + public void write(int repetitionLevel, StringData value) { + column.writeBinary(repetitionLevel, Binary.fromReusedByteArray(value.toBytes())); + } + } + + private static class TimeMicrosWriter extends ParquetValueWriters.PrimitiveWriter { + private TimeMicrosWriter(ColumnDescriptor desc) { + super(desc); + } + + @Override + public void write(int repetitionLevel, Integer value) { + long micros = value.longValue() * 1000; + column.writeLong(repetitionLevel, micros); + } + } + + private static class IntegerDecimalWriter + extends ParquetValueWriters.PrimitiveWriter { + private final int precision; + private final int scale; + + private IntegerDecimalWriter(ColumnDescriptor desc, int precision, int scale) { + super(desc); + this.precision = precision; + this.scale = scale; + } + + @Override + public void write(int repetitionLevel, DecimalData decimal) { + Preconditions.checkArgument( + decimal.scale() == scale, + "Cannot write value as decimal(%s,%s), wrong scale: %s", + precision, + scale, + decimal); + Preconditions.checkArgument( + decimal.precision() <= precision, + "Cannot write value as decimal(%s,%s), too large: %s", + precision, + scale, + decimal); + + column.writeInteger(repetitionLevel, (int) decimal.toUnscaledLong()); + } + } + + private static class LongDecimalWriter extends ParquetValueWriters.PrimitiveWriter { + private final int precision; + private final int scale; + + private LongDecimalWriter(ColumnDescriptor desc, int precision, int scale) { + super(desc); + this.precision = precision; + this.scale = scale; + } + + @Override + public void write(int repetitionLevel, DecimalData decimal) { + Preconditions.checkArgument( + decimal.scale() == scale, + "Cannot write value as decimal(%s,%s), wrong scale: %s", + precision, + scale, + decimal); + Preconditions.checkArgument( + decimal.precision() <= precision, + "Cannot write value as decimal(%s,%s), too large: %s", + precision, + scale, + decimal); + + column.writeLong(repetitionLevel, decimal.toUnscaledLong()); + } + } + + private static class FixedDecimalWriter extends ParquetValueWriters.PrimitiveWriter { + private final int precision; + private final int scale; + private final ThreadLocal bytes; + + private FixedDecimalWriter(ColumnDescriptor desc, int precision, int scale) { + super(desc); + this.precision = precision; + this.scale = scale; + this.bytes = + ThreadLocal.withInitial(() -> new byte[TypeUtil.decimalRequiredBytes(precision)]); + } + + @Override + public void write(int repetitionLevel, DecimalData decimal) { + byte[] binary = + DecimalUtil.toReusedFixLengthBytes(precision, scale, decimal.toBigDecimal(), bytes.get()); + column.writeBinary(repetitionLevel, Binary.fromReusedByteArray(binary)); + } + } + + private static class TimestampDataWriter + extends ParquetValueWriters.PrimitiveWriter { + private 
TimestampDataWriter(ColumnDescriptor desc) { + super(desc); + } + + @Override + public void write(int repetitionLevel, TimestampData value) { + column.writeLong( + repetitionLevel, value.getMillisecond() * 1000 + value.getNanoOfMillisecond() / 1000); + } + } + + private static class ByteArrayWriter extends ParquetValueWriters.PrimitiveWriter { + private ByteArrayWriter(ColumnDescriptor desc) { + super(desc); + } + + @Override + public void write(int repetitionLevel, byte[] bytes) { + column.writeBinary(repetitionLevel, Binary.fromReusedByteArray(bytes)); + } + } + + private static class ArrayDataWriter extends ParquetValueWriters.RepeatedWriter { + private final LogicalType elementType; + + private ArrayDataWriter( + int definitionLevel, + int repetitionLevel, + ParquetValueWriter writer, + LogicalType elementType) { + super(definitionLevel, repetitionLevel, writer); + this.elementType = elementType; + } + + @Override + protected Iterator elements(ArrayData list) { + return new ElementIterator<>(list); + } + + private class ElementIterator implements Iterator { + private final int size; + private final ArrayData list; + private final ArrayData.ElementGetter getter; + private int index; + + private ElementIterator(ArrayData list) { + this.list = list; + size = list.size(); + getter = ArrayData.createElementGetter(elementType); + index = 0; + } + + @Override + public boolean hasNext() { + return index != size; + } + + @Override + @SuppressWarnings("unchecked") + public E next() { + if (index >= size) { + throw new NoSuchElementException(); + } + + E element = (E) getter.getElementOrNull(list, index); + index += 1; + + return element; + } + } + } + + private static class MapDataWriter + extends ParquetValueWriters.RepeatedKeyValueWriter { + private final LogicalType keyType; + private final LogicalType valueType; + + private MapDataWriter( + int definitionLevel, + int repetitionLevel, + ParquetValueWriter keyWriter, + ParquetValueWriter valueWriter, + LogicalType keyType, + LogicalType valueType) { + super(definitionLevel, repetitionLevel, keyWriter, valueWriter); + this.keyType = keyType; + this.valueType = valueType; + } + + @Override + protected Iterator> pairs(MapData map) { + return new EntryIterator<>(map); + } + + private class EntryIterator implements Iterator> { + private final int size; + private final ArrayData keys; + private final ArrayData values; + private final ParquetValueReaders.ReusableEntry entry; + private final ArrayData.ElementGetter keyGetter; + private final ArrayData.ElementGetter valueGetter; + private int index; + + private EntryIterator(MapData map) { + size = map.size(); + keys = map.keyArray(); + values = map.valueArray(); + entry = new ParquetValueReaders.ReusableEntry<>(); + keyGetter = ArrayData.createElementGetter(keyType); + valueGetter = ArrayData.createElementGetter(valueType); + index = 0; + } + + @Override + public boolean hasNext() { + return index != size; + } + + @Override + @SuppressWarnings("unchecked") + public Map.Entry next() { + if (index >= size) { + throw new NoSuchElementException(); + } + + entry.set( + (K) keyGetter.getElementOrNull(keys, index), + (V) valueGetter.getElementOrNull(values, index)); + index += 1; + + return entry; + } + } + } + + private static class RowDataWriter extends ParquetValueWriters.StructWriter { + private final RowData.FieldGetter[] fieldGetter; + + RowDataWriter(List> writers, List types) { + super(writers); + fieldGetter = new RowData.FieldGetter[types.size()]; + for (int i = 0; i < types.size(); i += 1) { + 
fieldGetter[i] = RowData.createFieldGetter(types.get(i), i); + } + } + + @Override + protected Object get(RowData struct, int index) { + return fieldGetter[index].getFieldOrNull(struct); + } + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkSchemaVisitor.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkSchemaVisitor.java new file mode 100644 index 000000000000..ba4e1a7a7aec --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkSchemaVisitor.java @@ -0,0 +1,161 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.data; + +import java.util.List; +import org.apache.flink.table.types.logical.ArrayType; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.MapType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.Schema; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.types.Type; +import org.apache.iceberg.types.Types; + +abstract class FlinkSchemaVisitor { + + static T visit(RowType flinkType, Schema schema, FlinkSchemaVisitor visitor) { + return visit(flinkType, schema.asStruct(), visitor); + } + + private static T visit(LogicalType flinkType, Type iType, FlinkSchemaVisitor visitor) { + switch (iType.typeId()) { + case STRUCT: + return visitRecord(flinkType, iType.asStructType(), visitor); + + case MAP: + MapType mapType = (MapType) flinkType; + Types.MapType iMapType = iType.asMapType(); + T key; + T value; + + Types.NestedField keyField = iMapType.field(iMapType.keyId()); + visitor.beforeMapKey(keyField); + try { + key = visit(mapType.getKeyType(), iMapType.keyType(), visitor); + } finally { + visitor.afterMapKey(keyField); + } + + Types.NestedField valueField = iMapType.field(iMapType.valueId()); + visitor.beforeMapValue(valueField); + try { + value = visit(mapType.getValueType(), iMapType.valueType(), visitor); + } finally { + visitor.afterMapValue(valueField); + } + + return visitor.map(iMapType, key, value, mapType.getKeyType(), mapType.getValueType()); + + case LIST: + ArrayType listType = (ArrayType) flinkType; + Types.ListType iListType = iType.asListType(); + T element; + + Types.NestedField elementField = iListType.field(iListType.elementId()); + visitor.beforeListElement(elementField); + try { + element = visit(listType.getElementType(), iListType.elementType(), visitor); + } finally { + visitor.afterListElement(elementField); + } + + return visitor.list(iListType, element, listType.getElementType()); + + default: + return visitor.primitive(iType.asPrimitiveType(), flinkType); + } + } + + private 
static T visitRecord( + LogicalType flinkType, Types.StructType struct, FlinkSchemaVisitor visitor) { + Preconditions.checkArgument(flinkType instanceof RowType, "%s is not a RowType.", flinkType); + RowType rowType = (RowType) flinkType; + + int fieldSize = struct.fields().size(); + List results = Lists.newArrayListWithExpectedSize(fieldSize); + List fieldTypes = Lists.newArrayListWithExpectedSize(fieldSize); + List nestedFields = struct.fields(); + + for (int i = 0; i < fieldSize; i++) { + Types.NestedField iField = nestedFields.get(i); + int fieldIndex = rowType.getFieldIndex(iField.name()); + Preconditions.checkArgument( + fieldIndex >= 0, "NestedField: %s is not found in flink RowType: %s", iField, rowType); + + LogicalType fieldFlinkType = rowType.getTypeAt(fieldIndex); + + fieldTypes.add(fieldFlinkType); + + visitor.beforeField(iField); + try { + results.add(visit(fieldFlinkType, iField.type(), visitor)); + } finally { + visitor.afterField(iField); + } + } + + return visitor.record(struct, results, fieldTypes); + } + + public T record(Types.StructType iStruct, List results, List fieldTypes) { + return null; + } + + public T list(Types.ListType iList, T element, LogicalType elementType) { + return null; + } + + public T map(Types.MapType iMap, T key, T value, LogicalType keyType, LogicalType valueType) { + return null; + } + + public T primitive(Type.PrimitiveType iPrimitive, LogicalType flinkPrimitive) { + return null; + } + + public void beforeField(Types.NestedField field) {} + + public void afterField(Types.NestedField field) {} + + public void beforeListElement(Types.NestedField elementField) { + beforeField(elementField); + } + + public void afterListElement(Types.NestedField elementField) { + afterField(elementField); + } + + public void beforeMapKey(Types.NestedField keyField) { + beforeField(keyField); + } + + public void afterMapKey(Types.NestedField keyField) { + afterField(keyField); + } + + public void beforeMapValue(Types.NestedField valueField) { + beforeField(valueField); + } + + public void afterMapValue(Types.NestedField valueField) { + afterField(valueField); + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueReaders.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueReaders.java new file mode 100644 index 000000000000..32f6c3a2ccfd --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueReaders.java @@ -0,0 +1,312 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.data; + +import java.io.IOException; +import java.math.BigDecimal; +import java.math.BigInteger; +import java.nio.charset.StandardCharsets; +import java.util.List; +import java.util.Map; +import org.apache.avro.io.Decoder; +import org.apache.avro.util.Utf8; +import org.apache.flink.table.data.ArrayData; +import org.apache.flink.table.data.DecimalData; +import org.apache.flink.table.data.GenericArrayData; +import org.apache.flink.table.data.GenericMapData; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.MapData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.data.TimestampData; +import org.apache.iceberg.avro.ValueReader; +import org.apache.iceberg.avro.ValueReaders; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.types.Types; + +public class FlinkValueReaders { + + private FlinkValueReaders() {} + + static ValueReader strings() { + return StringReader.INSTANCE; + } + + static ValueReader enums(List symbols) { + return new EnumReader(symbols); + } + + static ValueReader uuids() { + return ValueReaders.fixed(16); + } + + static ValueReader timeMicros() { + return TimeMicrosReader.INSTANCE; + } + + static ValueReader timestampMills() { + return TimestampMillsReader.INSTANCE; + } + + static ValueReader timestampMicros() { + return TimestampMicrosReader.INSTANCE; + } + + static ValueReader decimal( + ValueReader unscaledReader, int precision, int scale) { + return new DecimalReader(unscaledReader, precision, scale); + } + + static ValueReader array(ValueReader elementReader) { + return new ArrayReader(elementReader); + } + + static ValueReader arrayMap(ValueReader keyReader, ValueReader valueReader) { + return new ArrayMapReader(keyReader, valueReader); + } + + static ValueReader map(ValueReader keyReader, ValueReader valueReader) { + return new MapReader(keyReader, valueReader); + } + + static ValueReader struct( + List> readers, Types.StructType struct, Map idToConstant) { + return new StructReader(readers, struct, idToConstant); + } + + private static class StringReader implements ValueReader { + private static final StringReader INSTANCE = new StringReader(); + + private StringReader() {} + + @Override + public StringData read(Decoder decoder, Object reuse) throws IOException { + // use the decoder's readString(Utf8) method because it may be a resolving decoder + Utf8 utf8 = null; + if (reuse instanceof StringData) { + utf8 = new Utf8(((StringData) reuse).toBytes()); + } + + Utf8 string = decoder.readString(utf8); + return StringData.fromBytes(string.getBytes(), 0, string.getByteLength()); + } + } + + private static class EnumReader implements ValueReader { + private final StringData[] symbols; + + private EnumReader(List symbols) { + this.symbols = new StringData[symbols.size()]; + for (int i = 0; i < this.symbols.length; i += 1) { + this.symbols[i] = StringData.fromBytes(symbols.get(i).getBytes(StandardCharsets.UTF_8)); + } + } + + @Override + public StringData read(Decoder decoder, Object ignore) throws IOException { + int index = decoder.readEnum(); + return symbols[index]; + } + } + + private static class DecimalReader implements ValueReader { + private final ValueReader bytesReader; + private final int precision; + private final int scale; + + private DecimalReader(ValueReader bytesReader, int precision, int scale) 
{ + this.bytesReader = bytesReader; + this.precision = precision; + this.scale = scale; + } + + @Override + public DecimalData read(Decoder decoder, Object reuse) throws IOException { + byte[] bytes = bytesReader.read(decoder, null); + return DecimalData.fromBigDecimal( + new BigDecimal(new BigInteger(bytes), scale), precision, scale); + } + } + + private static class TimeMicrosReader implements ValueReader { + private static final TimeMicrosReader INSTANCE = new TimeMicrosReader(); + + @Override + public Integer read(Decoder decoder, Object reuse) throws IOException { + long micros = decoder.readLong(); + // Flink only support time mills, just erase micros. + return (int) (micros / 1000); + } + } + + private static class TimestampMillsReader implements ValueReader { + private static final TimestampMillsReader INSTANCE = new TimestampMillsReader(); + + @Override + public TimestampData read(Decoder decoder, Object reuse) throws IOException { + return TimestampData.fromEpochMillis(decoder.readLong()); + } + } + + private static class TimestampMicrosReader implements ValueReader { + private static final TimestampMicrosReader INSTANCE = new TimestampMicrosReader(); + + @Override + public TimestampData read(Decoder decoder, Object reuse) throws IOException { + long micros = decoder.readLong(); + long mills = micros / 1000; + int nanos = ((int) (micros % 1000)) * 1000; + if (nanos < 0) { + nanos += 1_000_000; + mills -= 1; + } + return TimestampData.fromEpochMillis(mills, nanos); + } + } + + private static class ArrayReader implements ValueReader { + private final ValueReader elementReader; + private final List reusedList = Lists.newArrayList(); + + private ArrayReader(ValueReader elementReader) { + this.elementReader = elementReader; + } + + @Override + public GenericArrayData read(Decoder decoder, Object reuse) throws IOException { + reusedList.clear(); + long chunkLength = decoder.readArrayStart(); + + while (chunkLength > 0) { + for (int i = 0; i < chunkLength; i += 1) { + reusedList.add(elementReader.read(decoder, null)); + } + + chunkLength = decoder.arrayNext(); + } + + // this will convert the list to an array so it is okay to reuse the list + return new GenericArrayData(reusedList.toArray()); + } + } + + private static MapData kvArrayToMap(List keyList, List valueList) { + Map map = Maps.newHashMap(); + Object[] keys = keyList.toArray(); + Object[] values = valueList.toArray(); + for (int i = 0; i < keys.length; i++) { + map.put(keys[i], values[i]); + } + + return new GenericMapData(map); + } + + private static class ArrayMapReader implements ValueReader { + private final ValueReader keyReader; + private final ValueReader valueReader; + + private final List reusedKeyList = Lists.newArrayList(); + private final List reusedValueList = Lists.newArrayList(); + + private ArrayMapReader(ValueReader keyReader, ValueReader valueReader) { + this.keyReader = keyReader; + this.valueReader = valueReader; + } + + @Override + public MapData read(Decoder decoder, Object reuse) throws IOException { + reusedKeyList.clear(); + reusedValueList.clear(); + + long chunkLength = decoder.readArrayStart(); + + while (chunkLength > 0) { + for (int i = 0; i < chunkLength; i += 1) { + reusedKeyList.add(keyReader.read(decoder, null)); + reusedValueList.add(valueReader.read(decoder, null)); + } + + chunkLength = decoder.arrayNext(); + } + + return kvArrayToMap(reusedKeyList, reusedValueList); + } + } + + private static class MapReader implements ValueReader { + private final ValueReader keyReader; + private final 
ValueReader valueReader; + + private final List reusedKeyList = Lists.newArrayList(); + private final List reusedValueList = Lists.newArrayList(); + + private MapReader(ValueReader keyReader, ValueReader valueReader) { + this.keyReader = keyReader; + this.valueReader = valueReader; + } + + @Override + public MapData read(Decoder decoder, Object reuse) throws IOException { + reusedKeyList.clear(); + reusedValueList.clear(); + + long chunkLength = decoder.readMapStart(); + + while (chunkLength > 0) { + for (int i = 0; i < chunkLength; i += 1) { + reusedKeyList.add(keyReader.read(decoder, null)); + reusedValueList.add(valueReader.read(decoder, null)); + } + + chunkLength = decoder.mapNext(); + } + + return kvArrayToMap(reusedKeyList, reusedValueList); + } + } + + private static class StructReader extends ValueReaders.StructReader { + private final int numFields; + + private StructReader( + List> readers, Types.StructType struct, Map idToConstant) { + super(readers, struct, idToConstant); + this.numFields = readers.size(); + } + + @Override + protected RowData reuseOrCreate(Object reuse) { + if (reuse instanceof GenericRowData && ((GenericRowData) reuse).getArity() == numFields) { + return (GenericRowData) reuse; + } + return new GenericRowData(numFields); + } + + @Override + protected Object get(RowData struct, int pos) { + return null; + } + + @Override + protected void set(RowData struct, int pos, Object value) { + ((GenericRowData) struct).setField(pos, value); + } + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueWriters.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueWriters.java new file mode 100644 index 000000000000..4e86ecce28b5 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueWriters.java @@ -0,0 +1,253 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.data; + +import java.io.IOException; +import java.lang.reflect.Array; +import java.util.List; +import org.apache.avro.io.Encoder; +import org.apache.avro.util.Utf8; +import org.apache.flink.table.data.ArrayData; +import org.apache.flink.table.data.DecimalData; +import org.apache.flink.table.data.MapData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.data.TimestampData; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.iceberg.avro.ValueWriter; +import org.apache.iceberg.types.TypeUtil; +import org.apache.iceberg.util.DecimalUtil; + +public class FlinkValueWriters { + + private FlinkValueWriters() {} + + static ValueWriter strings() { + return StringWriter.INSTANCE; + } + + static ValueWriter timeMicros() { + return TimeMicrosWriter.INSTANCE; + } + + static ValueWriter timestampMicros() { + return TimestampMicrosWriter.INSTANCE; + } + + static ValueWriter decimal(int precision, int scale) { + return new DecimalWriter(precision, scale); + } + + static ValueWriter array(ValueWriter elementWriter, LogicalType elementType) { + return new ArrayWriter<>(elementWriter, elementType); + } + + static ValueWriter arrayMap( + ValueWriter keyWriter, + LogicalType keyType, + ValueWriter valueWriter, + LogicalType valueType) { + return new ArrayMapWriter<>(keyWriter, keyType, valueWriter, valueType); + } + + static ValueWriter map( + ValueWriter keyWriter, + LogicalType keyType, + ValueWriter valueWriter, + LogicalType valueType) { + return new MapWriter<>(keyWriter, keyType, valueWriter, valueType); + } + + static ValueWriter row(List> writers, List types) { + return new RowWriter(writers, types); + } + + private static class StringWriter implements ValueWriter { + private static final StringWriter INSTANCE = new StringWriter(); + + private StringWriter() {} + + @Override + public void write(StringData s, Encoder encoder) throws IOException { + // toBytes is cheaper than Avro calling toString, which incurs encoding costs + encoder.writeString(new Utf8(s.toBytes())); + } + } + + private static class DecimalWriter implements ValueWriter { + private final int precision; + private final int scale; + private final ThreadLocal bytes; + + private DecimalWriter(int precision, int scale) { + this.precision = precision; + this.scale = scale; + this.bytes = + ThreadLocal.withInitial(() -> new byte[TypeUtil.decimalRequiredBytes(precision)]); + } + + @Override + public void write(DecimalData d, Encoder encoder) throws IOException { + encoder.writeFixed( + DecimalUtil.toReusedFixLengthBytes(precision, scale, d.toBigDecimal(), bytes.get())); + } + } + + private static class TimeMicrosWriter implements ValueWriter { + private static final TimeMicrosWriter INSTANCE = new TimeMicrosWriter(); + + @Override + public void write(Integer timeMills, Encoder encoder) throws IOException { + encoder.writeLong(timeMills * 1000L); + } + } + + private static class TimestampMicrosWriter implements ValueWriter { + private static final TimestampMicrosWriter INSTANCE = new TimestampMicrosWriter(); + + @Override + public void write(TimestampData timestampData, Encoder encoder) throws IOException { + long micros = + timestampData.getMillisecond() * 1000 + timestampData.getNanoOfMillisecond() / 1000; + encoder.writeLong(micros); + } + } + + private static class ArrayWriter implements ValueWriter { + private final ValueWriter elementWriter; + private final ArrayData.ElementGetter elementGetter; + + 
private ArrayWriter(ValueWriter elementWriter, LogicalType elementType) { + this.elementWriter = elementWriter; + this.elementGetter = ArrayData.createElementGetter(elementType); + } + + @Override + @SuppressWarnings("unchecked") + public void write(ArrayData array, Encoder encoder) throws IOException { + encoder.writeArrayStart(); + int numElements = array.size(); + encoder.setItemCount(numElements); + for (int i = 0; i < numElements; i += 1) { + encoder.startItem(); + elementWriter.write((T) elementGetter.getElementOrNull(array, i), encoder); + } + encoder.writeArrayEnd(); + } + } + + private static class ArrayMapWriter implements ValueWriter { + private final ValueWriter keyWriter; + private final ValueWriter valueWriter; + private final ArrayData.ElementGetter keyGetter; + private final ArrayData.ElementGetter valueGetter; + + private ArrayMapWriter( + ValueWriter keyWriter, + LogicalType keyType, + ValueWriter valueWriter, + LogicalType valueType) { + this.keyWriter = keyWriter; + this.keyGetter = ArrayData.createElementGetter(keyType); + this.valueWriter = valueWriter; + this.valueGetter = ArrayData.createElementGetter(valueType); + } + + @Override + @SuppressWarnings("unchecked") + public void write(MapData map, Encoder encoder) throws IOException { + encoder.writeArrayStart(); + int numElements = map.size(); + encoder.setItemCount(numElements); + ArrayData keyArray = map.keyArray(); + ArrayData valueArray = map.valueArray(); + for (int i = 0; i < numElements; i += 1) { + encoder.startItem(); + keyWriter.write((K) keyGetter.getElementOrNull(keyArray, i), encoder); + valueWriter.write((V) valueGetter.getElementOrNull(valueArray, i), encoder); + } + encoder.writeArrayEnd(); + } + } + + private static class MapWriter implements ValueWriter { + private final ValueWriter keyWriter; + private final ValueWriter valueWriter; + private final ArrayData.ElementGetter keyGetter; + private final ArrayData.ElementGetter valueGetter; + + private MapWriter( + ValueWriter keyWriter, + LogicalType keyType, + ValueWriter valueWriter, + LogicalType valueType) { + this.keyWriter = keyWriter; + this.keyGetter = ArrayData.createElementGetter(keyType); + this.valueWriter = valueWriter; + this.valueGetter = ArrayData.createElementGetter(valueType); + } + + @Override + @SuppressWarnings("unchecked") + public void write(MapData map, Encoder encoder) throws IOException { + encoder.writeMapStart(); + int numElements = map.size(); + encoder.setItemCount(numElements); + ArrayData keyArray = map.keyArray(); + ArrayData valueArray = map.valueArray(); + for (int i = 0; i < numElements; i += 1) { + encoder.startItem(); + keyWriter.write((K) keyGetter.getElementOrNull(keyArray, i), encoder); + valueWriter.write((V) valueGetter.getElementOrNull(valueArray, i), encoder); + } + encoder.writeMapEnd(); + } + } + + static class RowWriter implements ValueWriter { + private final ValueWriter[] writers; + private final RowData.FieldGetter[] getters; + + private RowWriter(List> writers, List types) { + this.writers = (ValueWriter[]) Array.newInstance(ValueWriter.class, writers.size()); + this.getters = new RowData.FieldGetter[writers.size()]; + for (int i = 0; i < writers.size(); i += 1) { + this.writers[i] = writers.get(i); + this.getters[i] = RowData.createFieldGetter(types.get(i), i); + } + } + + @Override + public void write(RowData row, Encoder encoder) throws IOException { + for (int i = 0; i < writers.length; i += 1) { + if (row.isNullAt(i)) { + writers[i].write(null, encoder); + } else { + write(row, i, writers[i], 
encoder); + } + } + } + + @SuppressWarnings("unchecked") + private void write(RowData row, int pos, ValueWriter writer, Encoder encoder) + throws IOException { + writer.write((T) getters[pos].getFieldOrNull(row), encoder); + } + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/ParquetWithFlinkSchemaVisitor.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/ParquetWithFlinkSchemaVisitor.java new file mode 100644 index 000000000000..33feb2e32118 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/ParquetWithFlinkSchemaVisitor.java @@ -0,0 +1,222 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.data; + +import java.util.Deque; +import java.util.List; +import org.apache.flink.table.types.logical.ArrayType; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.MapType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.table.types.logical.RowType.RowField; +import org.apache.iceberg.avro.AvroSchemaUtil; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.OriginalType; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Type; + +public class ParquetWithFlinkSchemaVisitor { + private final Deque fieldNames = Lists.newLinkedList(); + + public static T visit( + LogicalType sType, Type type, ParquetWithFlinkSchemaVisitor visitor) { + Preconditions.checkArgument(sType != null, "Invalid DataType: null"); + if (type instanceof MessageType) { + Preconditions.checkArgument( + sType instanceof RowType, "Invalid struct: %s is not a struct", sType); + RowType struct = (RowType) sType; + return visitor.message( + struct, (MessageType) type, visitFields(struct, type.asGroupType(), visitor)); + } else if (type.isPrimitive()) { + return visitor.primitive(sType, type.asPrimitiveType()); + } else { + // if not a primitive, the typeId must be a group + GroupType group = type.asGroupType(); + OriginalType annotation = group.getOriginalType(); + if (annotation != null) { + switch (annotation) { + case LIST: + Preconditions.checkArgument( + !group.isRepetition(Type.Repetition.REPEATED), + "Invalid list: top-level group is repeated: %s", + group); + Preconditions.checkArgument( + group.getFieldCount() == 1, + "Invalid list: does not contain single repeated field: %s", + group); + + GroupType repeatedElement = group.getFields().get(0).asGroupType(); + Preconditions.checkArgument( + 
repeatedElement.isRepetition(Type.Repetition.REPEATED), + "Invalid list: inner group is not repeated"); + Preconditions.checkArgument( + repeatedElement.getFieldCount() <= 1, + "Invalid list: repeated group is not a single field: %s", + group); + + Preconditions.checkArgument( + sType instanceof ArrayType, "Invalid list: %s is not an array", sType); + ArrayType array = (ArrayType) sType; + RowType.RowField element = + new RowField( + "element", array.getElementType(), "element of " + array.asSummaryString()); + + visitor.fieldNames.push(repeatedElement.getName()); + try { + T elementResult = null; + if (repeatedElement.getFieldCount() > 0) { + elementResult = visitField(element, repeatedElement.getType(0), visitor); + } + + return visitor.list(array, group, elementResult); + + } finally { + visitor.fieldNames.pop(); + } + + case MAP: + Preconditions.checkArgument( + !group.isRepetition(Type.Repetition.REPEATED), + "Invalid map: top-level group is repeated: %s", + group); + Preconditions.checkArgument( + group.getFieldCount() == 1, + "Invalid map: does not contain single repeated field: %s", + group); + + GroupType repeatedKeyValue = group.getType(0).asGroupType(); + Preconditions.checkArgument( + repeatedKeyValue.isRepetition(Type.Repetition.REPEATED), + "Invalid map: inner group is not repeated"); + Preconditions.checkArgument( + repeatedKeyValue.getFieldCount() <= 2, + "Invalid map: repeated group does not have 2 fields"); + + Preconditions.checkArgument( + sType instanceof MapType, "Invalid map: %s is not a map", sType); + MapType map = (MapType) sType; + RowField keyField = + new RowField("key", map.getKeyType(), "key of " + map.asSummaryString()); + RowField valueField = + new RowField("value", map.getValueType(), "value of " + map.asSummaryString()); + + visitor.fieldNames.push(repeatedKeyValue.getName()); + try { + T keyResult = null; + T valueResult = null; + switch (repeatedKeyValue.getFieldCount()) { + case 2: + // if there are 2 fields, both key and value are projected + keyResult = visitField(keyField, repeatedKeyValue.getType(0), visitor); + valueResult = visitField(valueField, repeatedKeyValue.getType(1), visitor); + break; + case 1: + // if there is just one, use the name to determine what it is + Type keyOrValue = repeatedKeyValue.getType(0); + if (keyOrValue.getName().equalsIgnoreCase("key")) { + keyResult = visitField(keyField, keyOrValue, visitor); + // value result remains null + } else { + valueResult = visitField(valueField, keyOrValue, visitor); + // key result remains null + } + break; + default: + // both results will remain null + } + + return visitor.map(map, group, keyResult, valueResult); + + } finally { + visitor.fieldNames.pop(); + } + + default: + } + } + Preconditions.checkArgument( + sType instanceof RowType, "Invalid struct: %s is not a struct", sType); + RowType struct = (RowType) sType; + return visitor.struct(struct, group, visitFields(struct, group, visitor)); + } + } + + private static T visitField( + RowType.RowField sField, Type field, ParquetWithFlinkSchemaVisitor visitor) { + visitor.fieldNames.push(field.getName()); + try { + return visit(sField.getType(), field, visitor); + } finally { + visitor.fieldNames.pop(); + } + } + + private static List visitFields( + RowType struct, GroupType group, ParquetWithFlinkSchemaVisitor visitor) { + List sFields = struct.getFields(); + Preconditions.checkArgument( + sFields.size() == group.getFieldCount(), "Structs do not match: %s and %s", struct, group); + List results = 
Lists.newArrayListWithExpectedSize(group.getFieldCount()); + for (int i = 0; i < sFields.size(); i += 1) { + Type field = group.getFields().get(i); + RowType.RowField sField = sFields.get(i); + Preconditions.checkArgument( + field.getName().equals(AvroSchemaUtil.makeCompatibleName(sField.getName())), + "Structs do not match: field %s != %s", + field.getName(), + sField.getName()); + results.add(visitField(sField, field, visitor)); + } + + return results; + } + + public T message(RowType sStruct, MessageType message, List fields) { + return null; + } + + public T struct(RowType sStruct, GroupType struct, List fields) { + return null; + } + + public T list(ArrayType sArray, GroupType array, T element) { + return null; + } + + public T map(MapType sMap, GroupType map, T key, T value) { + return null; + } + + public T primitive(LogicalType sPrimitive, PrimitiveType primitive) { + return null; + } + + protected String[] currentPath() { + return Lists.newArrayList(fieldNames.descendingIterator()).toArray(new String[0]); + } + + protected String[] path(String name) { + List list = Lists.newArrayList(fieldNames.descendingIterator()); + list.add(name); + return list.toArray(new String[0]); + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/RowDataProjection.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/RowDataProjection.java new file mode 100644 index 000000000000..33816c97ac29 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/RowDataProjection.java @@ -0,0 +1,341 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.data; + +import java.util.Arrays; +import java.util.Map; +import java.util.Objects; +import org.apache.flink.table.data.ArrayData; +import org.apache.flink.table.data.DecimalData; +import org.apache.flink.table.data.MapData; +import org.apache.flink.table.data.RawValueData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.data.TimestampData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.types.RowKind; +import org.apache.flink.util.StringUtils; +import org.apache.iceberg.Schema; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.types.Types; + +public class RowDataProjection implements RowData { + /** + * Creates a projecting wrapper for {@link RowData} rows. + * + *
This projection will not project the nested children types of repeated types like lists and + * maps. + * + * @param schema schema of rows wrapped by this projection + * @param projectedSchema result schema of the projected rows + * @return a wrapper to project rows + */ + public static RowDataProjection create(Schema schema, Schema projectedSchema) { + return RowDataProjection.create( + FlinkSchemaUtil.convert(schema), schema.asStruct(), projectedSchema.asStruct()); + } + + /** + * Creates a projecting wrapper for {@link RowData} rows. + * + *
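(A minimal usage sketch for the create(Schema, Schema) overload shown just above, not part of this patch; schema, projectedSchema, and row are placeholder names for an Iceberg Schema, a pruned projection of it, and a RowData produced for the full schema.)

    // Hypothetical caller: project a full row down to the projected schema's fields.
    RowDataProjection projection = RowDataProjection.create(schema, projectedSchema);
    RowData projected = projection.wrap(row);
    // The same projection instance can be re-wrapped around successive rows,
    // since wrap(...) stores the row internally and returns this.
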
This projection will not project the nested children types of repeated types like lists and + * maps. + * + * @param rowType flink row type of rows wrapped by this projection + * @param schema schema of rows wrapped by this projection + * @param projectedSchema result schema of the projected rows + * @return a wrapper to project rows + */ + public static RowDataProjection create( + RowType rowType, Types.StructType schema, Types.StructType projectedSchema) { + return new RowDataProjection(rowType, schema, projectedSchema); + } + + private final RowData.FieldGetter[] getters; + private RowData rowData; + + private RowDataProjection( + RowType rowType, Types.StructType rowStruct, Types.StructType projectType) { + Map fieldIdToPosition = Maps.newHashMap(); + for (int i = 0; i < rowStruct.fields().size(); i++) { + fieldIdToPosition.put(rowStruct.fields().get(i).fieldId(), i); + } + + this.getters = new RowData.FieldGetter[projectType.fields().size()]; + for (int i = 0; i < getters.length; i++) { + Types.NestedField projectField = projectType.fields().get(i); + Types.NestedField rowField = rowStruct.field(projectField.fieldId()); + + Preconditions.checkNotNull( + rowField, + "Cannot locate the project field <%s> in the iceberg struct <%s>", + projectField, + rowStruct); + + getters[i] = + createFieldGetter( + rowType, fieldIdToPosition.get(projectField.fieldId()), rowField, projectField); + } + } + + private static RowData.FieldGetter createFieldGetter( + RowType rowType, int position, Types.NestedField rowField, Types.NestedField projectField) { + Preconditions.checkArgument( + rowField.type().typeId() == projectField.type().typeId(), + "Different iceberg type between row field <%s> and project field <%s>", + rowField, + projectField); + + switch (projectField.type().typeId()) { + case STRUCT: + RowType nestedRowType = (RowType) rowType.getTypeAt(position); + return row -> { + // null nested struct value + if (row.isNullAt(position)) { + return null; + } + + RowData nestedRow = row.getRow(position, nestedRowType.getFieldCount()); + return RowDataProjection.create( + nestedRowType, rowField.type().asStructType(), projectField.type().asStructType()) + .wrap(nestedRow); + }; + + case MAP: + Types.MapType projectedMap = projectField.type().asMapType(); + Types.MapType originalMap = rowField.type().asMapType(); + + boolean keyProjectable = + !projectedMap.keyType().isNestedType() + || projectedMap.keyType().equals(originalMap.keyType()); + boolean valueProjectable = + !projectedMap.valueType().isNestedType() + || projectedMap.valueType().equals(originalMap.valueType()); + Preconditions.checkArgument( + keyProjectable && valueProjectable, + "Cannot project a partial map key or value with non-primitive type. Trying to project <%s> out of <%s>", + projectField, + rowField); + + return RowData.createFieldGetter(rowType.getTypeAt(position), position); + + case LIST: + Types.ListType projectedList = projectField.type().asListType(); + Types.ListType originalList = rowField.type().asListType(); + + boolean elementProjectable = + !projectedList.elementType().isNestedType() + || projectedList.elementType().equals(originalList.elementType()); + Preconditions.checkArgument( + elementProjectable, + "Cannot project a partial list element with non-primitive type. 
Trying to project <%s> out of <%s>", + projectField, + rowField); + + return RowData.createFieldGetter(rowType.getTypeAt(position), position); + + default: + return RowData.createFieldGetter(rowType.getTypeAt(position), position); + } + } + + public RowData wrap(RowData row) { + // StructProjection allow wrapping null root struct object. + // See more discussions in https://github.com/apache/iceberg/pull/7517. + // RowDataProjection never allowed null root object to be wrapped. + // Hence, it is fine to enforce strict Preconditions check here. + Preconditions.checkArgument(row != null, "Invalid row data: null"); + this.rowData = row; + return this; + } + + private Object getValue(int pos) { + Preconditions.checkState(rowData != null, "Row data not wrapped"); + return getters[pos].getFieldOrNull(rowData); + } + + @Override + public int getArity() { + return getters.length; + } + + @Override + public RowKind getRowKind() { + Preconditions.checkState(rowData != null, "Row data not wrapped"); + return rowData.getRowKind(); + } + + @Override + public void setRowKind(RowKind kind) { + throw new UnsupportedOperationException("Cannot set row kind in the RowDataProjection"); + } + + @Override + public boolean isNullAt(int pos) { + return getValue(pos) == null; + } + + @Override + public boolean getBoolean(int pos) { + return (boolean) getValue(pos); + } + + @Override + public byte getByte(int pos) { + return (byte) getValue(pos); + } + + @Override + public short getShort(int pos) { + return (short) getValue(pos); + } + + @Override + public int getInt(int pos) { + return (int) getValue(pos); + } + + @Override + public long getLong(int pos) { + return (long) getValue(pos); + } + + @Override + public float getFloat(int pos) { + return (float) getValue(pos); + } + + @Override + public double getDouble(int pos) { + return (double) getValue(pos); + } + + @Override + public StringData getString(int pos) { + return (StringData) getValue(pos); + } + + @Override + public DecimalData getDecimal(int pos, int precision, int scale) { + return (DecimalData) getValue(pos); + } + + @Override + public TimestampData getTimestamp(int pos, int precision) { + return (TimestampData) getValue(pos); + } + + @Override + @SuppressWarnings("unchecked") + public RawValueData getRawValue(int pos) { + return (RawValueData) getValue(pos); + } + + @Override + public byte[] getBinary(int pos) { + return (byte[]) getValue(pos); + } + + @Override + public ArrayData getArray(int pos) { + return (ArrayData) getValue(pos); + } + + @Override + public MapData getMap(int pos) { + return (MapData) getValue(pos); + } + + @Override + public RowData getRow(int pos, int numFields) { + return (RowData) getValue(pos); + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + + if (!(o instanceof RowDataProjection)) { + return false; + } + + RowDataProjection that = (RowDataProjection) o; + return deepEquals(that); + } + + @Override + public int hashCode() { + int result = Objects.hashCode(getRowKind()); + for (int pos = 0; pos < getArity(); pos++) { + if (!isNullAt(pos)) { + // Arrays.deepHashCode handles array object properly + result = 31 * result + Arrays.deepHashCode(new Object[] {getValue(pos)}); + } + } + + return result; + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append(getRowKind().shortString()).append("("); + for (int pos = 0; pos < getArity(); pos++) { + if (pos != 0) { + sb.append(","); + } + // copied the behavior from Flink GenericRowData + 
sb.append(StringUtils.arrayAwareToString(getValue(pos))); + } + + sb.append(")"); + return sb.toString(); + } + + private boolean deepEquals(RowDataProjection other) { + if (getRowKind() != other.getRowKind()) { + return false; + } + + if (getArity() != other.getArity()) { + return false; + } + + for (int pos = 0; pos < getArity(); ++pos) { + if (isNullAt(pos) && other.isNullAt(pos)) { + continue; + } + + if ((isNullAt(pos) && !other.isNullAt(pos)) || (!isNullAt(pos) && other.isNullAt(pos))) { + return false; + } + + // Objects.deepEquals handles array object properly + if (!Objects.deepEquals(getValue(pos), other.getValue(pos))) { + return false; + } + } + + return true; + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java new file mode 100644 index 000000000000..3a8f5ccc6c03 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java @@ -0,0 +1,123 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.data; + +import java.math.BigDecimal; +import java.nio.ByteBuffer; +import org.apache.avro.generic.GenericData; +import org.apache.avro.util.Utf8; +import org.apache.flink.api.common.typeutils.TypeSerializer; +import org.apache.flink.table.data.DecimalData; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.data.TimestampData; +import org.apache.flink.table.runtime.typeutils.RowDataSerializer; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.types.Type; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.util.ByteBuffers; +import org.apache.iceberg.util.DateTimeUtil; + +public class RowDataUtil { + + private RowDataUtil() {} + + public static Object convertConstant(Type type, Object value) { + if (value == null) { + return null; + } + + switch (type.typeId()) { + case DECIMAL: // DecimalData + Types.DecimalType decimal = (Types.DecimalType) type; + return DecimalData.fromBigDecimal((BigDecimal) value, decimal.precision(), decimal.scale()); + case STRING: // StringData + if (value instanceof Utf8) { + Utf8 utf8 = (Utf8) value; + return StringData.fromBytes(utf8.getBytes(), 0, utf8.getByteLength()); + } + return StringData.fromString(value.toString()); + case FIXED: // byte[] + if (value instanceof byte[]) { + return value; + } else if (value instanceof GenericData.Fixed) { + return ((GenericData.Fixed) value).bytes(); + } + return ByteBuffers.toByteArray((ByteBuffer) value); + case BINARY: // byte[] + return ByteBuffers.toByteArray((ByteBuffer) value); + case TIME: // int mills instead of long + return (int) ((Long) value / 1000); + case TIMESTAMP: // TimestampData + return TimestampData.fromLocalDateTime(DateTimeUtil.timestampFromMicros((Long) value)); + default: + } + return value; + } + + /** + * Similar to the private {@link RowDataSerializer#copyRowData(RowData, RowData)} method. This + * skips the check the arity of rowType and from, because the from RowData may contains additional + * column for position deletes. Using {@link RowDataSerializer#copy(RowData, RowData)} will fail + * the arity check. + */ + public static RowData clone( + RowData from, + RowData reuse, + RowType rowType, + TypeSerializer[] fieldSerializers, + RowData.FieldGetter[] fieldGetters) { + GenericRowData ret; + if (reuse instanceof GenericRowData) { + ret = (GenericRowData) reuse; + } else { + ret = new GenericRowData(from.getArity()); + } + + ret.setRowKind(from.getRowKind()); + for (int i = 0; i < rowType.getFieldCount(); i++) { + if (!from.isNullAt(i)) { + ret.setField(i, fieldSerializers[i].copy(fieldGetters[i].getFieldOrNull(from))); + } else { + ret.setField(i, null); + } + } + + return ret; + } + + /** + * @deprecated will be removed in 1.7.0; Not reusing FieldGetter in this method could lead to + * performance degradation, use {@link #clone(RowData, RowData, RowType, TypeSerializer[], + * RowData.FieldGetter[])} instead. 
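(A hedged sketch, not part of this patch, of how a caller might prepare the reusable serializer and getter arrays expected by the getter-reusing clone(...) overload referenced just above; icebergSchema, rowType, and row are illustrative names, and InternalSerializers is Flink's org.apache.flink.table.runtime.typeutils.InternalSerializers utility assumed here to supply per-field serializers.)

    // Illustrative only: build the per-field helpers once, then clone many rows cheaply.
    RowType rowType = FlinkSchemaUtil.convert(icebergSchema);
    TypeSerializer[] serializers = new TypeSerializer[rowType.getFieldCount()];
    RowData.FieldGetter[] getters = new RowData.FieldGetter[rowType.getFieldCount()];
    for (int i = 0; i < rowType.getFieldCount(); i++) {
      serializers[i] = InternalSerializers.create(rowType.getTypeAt(i));
      getters[i] = RowData.createFieldGetter(rowType.getTypeAt(i), i);
    }
    // Passing null as the reuse row lets clone(...) allocate a fresh GenericRowData.
    RowData copy = RowDataUtil.clone(row, null, rowType, serializers, getters);
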
+ */ + @Deprecated + public static RowData clone( + RowData from, RowData reuse, RowType rowType, TypeSerializer[] fieldSerializers) { + RowData.FieldGetter[] fieldGetters = new RowData.FieldGetter[rowType.getFieldCount()]; + for (int i = 0; i < rowType.getFieldCount(); ++i) { + if (!from.isNullAt(i)) { + fieldGetters[i] = RowData.createFieldGetter(rowType.getTypeAt(i), i); + } + } + + return clone(from, reuse, rowType, fieldSerializers, fieldGetters); + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/StructRowData.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/StructRowData.java new file mode 100644 index 000000000000..1019285018d0 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/StructRowData.java @@ -0,0 +1,300 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.data; + +import java.math.BigDecimal; +import java.nio.ByteBuffer; +import java.time.Duration; +import java.time.Instant; +import java.time.LocalDate; +import java.time.LocalDateTime; +import java.time.LocalTime; +import java.time.OffsetDateTime; +import java.time.ZoneOffset; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.UUID; +import org.apache.flink.annotation.Internal; +import org.apache.flink.table.data.ArrayData; +import org.apache.flink.table.data.DecimalData; +import org.apache.flink.table.data.GenericArrayData; +import org.apache.flink.table.data.GenericMapData; +import org.apache.flink.table.data.MapData; +import org.apache.flink.table.data.RawValueData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.data.TimestampData; +import org.apache.flink.types.RowKind; +import org.apache.iceberg.StructLike; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.types.Type; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.util.ByteBuffers; + +@Internal +public class StructRowData implements RowData { + private final Types.StructType type; + private RowKind kind; + private StructLike struct; + + public StructRowData(Types.StructType type) { + this(type, RowKind.INSERT); + } + + public StructRowData(Types.StructType type, RowKind kind) { + this(type, null, kind); + } + + private StructRowData(Types.StructType type, StructLike struct) { + this(type, struct, RowKind.INSERT); + } + + private StructRowData(Types.StructType type, StructLike struct, RowKind kind) { + this.type = type; + this.struct = struct; + this.kind = kind; + } + + public StructRowData setStruct(StructLike newStruct) { + this.struct = newStruct; 
+ return this; + } + + @Override + public int getArity() { + return struct.size(); + } + + @Override + public RowKind getRowKind() { + return kind; + } + + @Override + public void setRowKind(RowKind newKind) { + Preconditions.checkNotNull(newKind, "kind can not be null"); + this.kind = newKind; + } + + @Override + public boolean isNullAt(int pos) { + return struct.get(pos, Object.class) == null; + } + + @Override + public boolean getBoolean(int pos) { + return struct.get(pos, Boolean.class); + } + + @Override + public byte getByte(int pos) { + return (byte) (int) struct.get(pos, Integer.class); + } + + @Override + public short getShort(int pos) { + return (short) (int) struct.get(pos, Integer.class); + } + + @Override + public int getInt(int pos) { + Object integer = struct.get(pos, Object.class); + + if (integer instanceof Integer) { + return (int) integer; + } else if (integer instanceof LocalDate) { + return (int) ((LocalDate) integer).toEpochDay(); + } else if (integer instanceof LocalTime) { + return (int) (((LocalTime) integer).toNanoOfDay() / 1000_000); + } else { + throw new IllegalStateException( + "Unknown type for int field. Type name: " + integer.getClass().getName()); + } + } + + @Override + public long getLong(int pos) { + Object longVal = struct.get(pos, Object.class); + + if (longVal instanceof Long) { + return (long) longVal; + } else if (longVal instanceof OffsetDateTime) { + return Duration.between(Instant.EPOCH, (OffsetDateTime) longVal).toNanos() / 1000; + } else if (longVal instanceof LocalDate) { + return ((LocalDate) longVal).toEpochDay(); + } else if (longVal instanceof LocalTime) { + return ((LocalTime) longVal).toNanoOfDay(); + } else if (longVal instanceof LocalDateTime) { + return Duration.between(Instant.EPOCH, ((LocalDateTime) longVal).atOffset(ZoneOffset.UTC)) + .toNanos() + / 1000; + } else { + throw new IllegalStateException( + "Unknown type for long field. Type name: " + longVal.getClass().getName()); + } + } + + @Override + public float getFloat(int pos) { + return struct.get(pos, Float.class); + } + + @Override + public double getDouble(int pos) { + return struct.get(pos, Double.class); + } + + @Override + public StringData getString(int pos) { + return isNullAt(pos) ? null : getStringDataInternal(pos); + } + + private StringData getStringDataInternal(int pos) { + CharSequence seq = struct.get(pos, CharSequence.class); + return StringData.fromString(seq.toString()); + } + + @Override + public DecimalData getDecimal(int pos, int precision, int scale) { + return isNullAt(pos) + ? null + : DecimalData.fromBigDecimal(getDecimalInternal(pos), precision, scale); + } + + private BigDecimal getDecimalInternal(int pos) { + return struct.get(pos, BigDecimal.class); + } + + @Override + public TimestampData getTimestamp(int pos, int precision) { + long timeLong = getLong(pos); + return TimestampData.fromEpochMillis(timeLong / 1000, (int) (timeLong % 1000) * 1000); + } + + @Override + public RawValueData getRawValue(int pos) { + throw new UnsupportedOperationException("Not supported yet."); + } + + @Override + public byte[] getBinary(int pos) { + return isNullAt(pos) ? 
null : getBinaryInternal(pos); + } + + private byte[] getBinaryInternal(int pos) { + Object bytes = struct.get(pos, Object.class); + + // should only be either ByteBuffer or byte[] + if (bytes instanceof ByteBuffer) { + return ByteBuffers.toByteArray((ByteBuffer) bytes); + } else if (bytes instanceof byte[]) { + return (byte[]) bytes; + } else if (bytes instanceof UUID) { + UUID uuid = (UUID) bytes; + ByteBuffer bb = ByteBuffer.allocate(16); + bb.putLong(uuid.getMostSignificantBits()); + bb.putLong(uuid.getLeastSignificantBits()); + return bb.array(); + } else { + throw new IllegalStateException( + "Unknown type for binary field. Type name: " + bytes.getClass().getName()); + } + } + + @Override + public ArrayData getArray(int pos) { + return isNullAt(pos) + ? null + : (ArrayData) + convertValue(type.fields().get(pos).type().asListType(), struct.get(pos, List.class)); + } + + @Override + public MapData getMap(int pos) { + return isNullAt(pos) + ? null + : (MapData) + convertValue(type.fields().get(pos).type().asMapType(), struct.get(pos, Map.class)); + } + + @Override + public RowData getRow(int pos, int numFields) { + return isNullAt(pos) ? null : getStructRowData(pos, numFields); + } + + private StructRowData getStructRowData(int pos, int numFields) { + return new StructRowData( + type.fields().get(pos).type().asStructType(), struct.get(pos, StructLike.class)); + } + + private Object convertValue(Type elementType, Object value) { + switch (elementType.typeId()) { + case BOOLEAN: + case INTEGER: + case DATE: + case TIME: + case LONG: + case FLOAT: + case DOUBLE: + case DECIMAL: + return value; + case TIMESTAMP: + long millisecond = (long) value / 1000; + int nanoOfMillisecond = (int) ((Long) value % 1000) * 1000; + return TimestampData.fromEpochMillis(millisecond, nanoOfMillisecond); + case STRING: + return StringData.fromString(value.toString()); + case FIXED: + case BINARY: + return ByteBuffers.toByteArray((ByteBuffer) value); + case STRUCT: + return new StructRowData(elementType.asStructType(), (StructLike) value); + case LIST: + List list = (List) value; + Object[] array = new Object[list.size()]; + + int index = 0; + for (Object element : list) { + if (element == null) { + array[index] = null; + } else { + array[index] = convertValue(elementType.asListType().elementType(), element); + } + + index += 1; + } + return new GenericArrayData(array); + case MAP: + Types.MapType mapType = elementType.asMapType(); + Set> entries = ((Map) value).entrySet(); + Map result = Maps.newHashMap(); + for (Map.Entry entry : entries) { + final Object keyValue = convertValue(mapType.keyType(), entry.getKey()); + final Object valueValue = convertValue(mapType.valueType(), entry.getValue()); + result.put(keyValue, valueValue); + } + + return new GenericMapData(result); + default: + throw new UnsupportedOperationException("Unsupported element type: " + elementType); + } + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/MonitorSource.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/MonitorSource.java new file mode 100644 index 000000000000..d74b2349b1de --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/MonitorSource.java @@ -0,0 +1,206 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.maintenance.operator; + +import java.io.IOException; +import java.util.Iterator; +import org.apache.flink.annotation.Internal; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.connector.source.Boundedness; +import org.apache.flink.api.connector.source.SourceReader; +import org.apache.flink.api.connector.source.SourceReaderContext; +import org.apache.flink.api.connector.source.util.ratelimit.RateLimitedSourceReader; +import org.apache.flink.api.connector.source.util.ratelimit.RateLimiter; +import org.apache.flink.api.connector.source.util.ratelimit.RateLimiterStrategy; +import org.apache.flink.core.io.SimpleVersionedSerializer; +import org.apache.flink.core.memory.DataInputDeserializer; +import org.apache.flink.core.memory.DataOutputSerializer; +import org.apache.iceberg.DataOperations; +import org.apache.iceberg.Snapshot; +import org.apache.iceberg.Table; +import org.apache.iceberg.flink.TableLoader; +import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; +import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** Monitors an Iceberg table for changes */ +@Internal +public class MonitorSource extends SingleThreadedIteratorSource { + private static final Logger LOG = LoggerFactory.getLogger(MonitorSource.class); + + private final TableLoader tableLoader; + private final RateLimiterStrategy rateLimiterStrategy; + private final long maxReadBack; + + /** + * Creates a {@link org.apache.flink.api.connector.source.Source} which monitors an Iceberg table + * for changes. 
+ * + * @param tableLoader used for accessing the table + * @param rateLimiterStrategy limits the frequency the table is checked + * @param maxReadBack sets the number of snapshots read before stopping change collection + */ + public MonitorSource( + TableLoader tableLoader, RateLimiterStrategy rateLimiterStrategy, long maxReadBack) { + Preconditions.checkNotNull(tableLoader, "Table loader should not be null"); + Preconditions.checkNotNull(rateLimiterStrategy, "Rate limiter strategy should not be null"); + Preconditions.checkArgument(maxReadBack > 0, "Need to read at least 1 snapshot to work"); + + this.tableLoader = tableLoader; + this.rateLimiterStrategy = rateLimiterStrategy; + this.maxReadBack = maxReadBack; + } + + @Override + public Boundedness getBoundedness() { + return Boundedness.CONTINUOUS_UNBOUNDED; + } + + @Override + public TypeInformation<TableChange> getProducedType() { + return TypeInformation.of(TableChange.class); + } + + @Override + Iterator<TableChange> createIterator() { + return new TableChangeIterator(tableLoader, null, maxReadBack); + } + + @Override + SimpleVersionedSerializer<Iterator<TableChange>> iteratorSerializer() { + return new TableChangeIteratorSerializer(tableLoader, maxReadBack); + } + + @Override + public SourceReader<TableChange, GlobalSplit<TableChange>> createReader( + SourceReaderContext readerContext) throws Exception { + RateLimiter rateLimiter = rateLimiterStrategy.createRateLimiter(1); + return new RateLimitedSourceReader<>(super.createReader(readerContext), rateLimiter); + } + + /** The Iterator which returns the latest changes on an Iceberg table. */ + @VisibleForTesting + static class TableChangeIterator implements Iterator<TableChange> { + private Long lastSnapshotId; + private final long maxReadBack; + private final Table table; + + TableChangeIterator(TableLoader tableLoader, Long lastSnapshotId, long maxReadBack) { + this.lastSnapshotId = lastSnapshotId; + this.maxReadBack = maxReadBack; + tableLoader.open(); + this.table = tableLoader.loadTable(); + } + + @Override + public boolean hasNext() { + return true; + } + + @Override + public TableChange next() { + try { + table.refresh(); + Snapshot currentSnapshot = table.currentSnapshot(); + Long current = currentSnapshot != null ?
currentSnapshot.snapshotId() : null; + Long checking = current; + TableChange event = TableChange.empty(); + long readBack = 0; + while (checking != null && !checking.equals(lastSnapshotId) && ++readBack <= maxReadBack) { + Snapshot snapshot = table.snapshot(checking); + if (snapshot != null) { + if (!DataOperations.REPLACE.equals(snapshot.operation())) { + LOG.debug("Reading snapshot {}", snapshot.snapshotId()); + event.merge(new TableChange(snapshot, table.io())); + } else { + LOG.debug("Skipping replace snapshot {}", snapshot.snapshotId()); + } + + checking = snapshot.parentId(); + } else { + // If the last snapshot has been removed from the history + checking = null; + } + } + + lastSnapshotId = current; + return event; + } catch (Exception e) { + LOG.warn("Failed to fetch table changes for {}", table, e); + return TableChange.empty(); + } + } + + @Override + public String toString() { + return MoreObjects.toStringHelper(this) + .add("lastSnapshotId", lastSnapshotId) + .add("maxReadBack", maxReadBack) + .add("table", table) + .toString(); + } + } + + private static final class TableChangeIteratorSerializer + implements SimpleVersionedSerializer> { + + private static final int CURRENT_VERSION = 1; + private final TableLoader tableLoader; + private final long maxReadBack; + + TableChangeIteratorSerializer(TableLoader tableLoader, long maxReadBack) { + this.tableLoader = tableLoader; + this.maxReadBack = maxReadBack; + } + + @Override + public int getVersion() { + return CURRENT_VERSION; + } + + @Override + public byte[] serialize(Iterator iterator) throws IOException { + Preconditions.checkArgument( + iterator instanceof TableChangeIterator, + "Use TableChangeIterator iterator. Found incompatible type: %s", + iterator.getClass()); + + TableChangeIterator tableChangeIterator = (TableChangeIterator) iterator; + DataOutputSerializer out = new DataOutputSerializer(8); + long toStore = + tableChangeIterator.lastSnapshotId != null ? tableChangeIterator.lastSnapshotId : -1L; + out.writeLong(toStore); + return out.getCopyOfBuffer(); + } + + @Override + public TableChangeIterator deserialize(int version, byte[] serialized) throws IOException { + if (version == CURRENT_VERSION) { + DataInputDeserializer in = new DataInputDeserializer(serialized); + long fromStore = in.readLong(); + return new TableChangeIterator( + tableLoader, fromStore != -1 ? fromStore : null, maxReadBack); + } else { + throw new IOException("Unrecognized version or corrupt state: " + version); + } + } + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/SingleThreadedIteratorSource.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/SingleThreadedIteratorSource.java new file mode 100644 index 000000000000..20c7684d9700 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/SingleThreadedIteratorSource.java @@ -0,0 +1,197 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
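Editor's note: MonitorSource above is a Source V2 implementation that polls an Iceberg table and emits one TableChange per poll. A hypothetical wiring sketch, not part of this patch; it assumes the code sits in the same org.apache.iceberg.flink.maintenance.operator package (TableChange is package-private), that RateLimiterStrategy.perSecond is available in the Flink version in use, and that the table path is a placeholder:

package org.apache.iceberg.flink.maintenance.operator;

import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.connector.source.util.ratelimit.RateLimiterStrategy;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.iceberg.flink.TableLoader;

public class MonitorSourceExample {
  public static void main(String[] args) throws Exception {
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    // placeholder table location; the iterator opens the loader itself
    TableLoader tableLoader = TableLoader.fromHadoopTable("hdfs://nn:8020/warehouse/db/tbl");

    // Poll roughly once every 10 seconds and walk back at most 100 snapshots per poll,
    // mirroring the constructor arguments documented above.
    MonitorSource source =
        new MonitorSource(tableLoader, RateLimiterStrategy.perSecond(0.1), 100);

    DataStream<TableChange> changes =
        env.fromSource(source, WatermarkStrategy.noWatermarks(), "table-change-monitor")
            .setParallelism(1); // SingleThreadedIteratorSource requires parallelism 1

    changes.print();
    env.execute("monitor-example");
  }
}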
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.maintenance.operator; + +import java.io.IOException; +import java.util.Collection; +import java.util.Iterator; +import org.apache.flink.annotation.Internal; +import org.apache.flink.api.connector.source.Source; +import org.apache.flink.api.connector.source.SourceReader; +import org.apache.flink.api.connector.source.SourceReaderContext; +import org.apache.flink.api.connector.source.SplitEnumerator; +import org.apache.flink.api.connector.source.SplitEnumeratorContext; +import org.apache.flink.api.connector.source.lib.util.IteratorSourceEnumerator; +import org.apache.flink.api.connector.source.lib.util.IteratorSourceReader; +import org.apache.flink.api.connector.source.lib.util.IteratorSourceSplit; +import org.apache.flink.api.java.typeutils.ResultTypeQueryable; +import org.apache.flink.core.io.SimpleVersionedSerializer; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; + +/** + * Implementation of the Source V2 API which uses an iterator to read the elements, and uses a + * single thread to do so. + * + * @param The return type of the source + */ +@Internal +public abstract class SingleThreadedIteratorSource + implements Source< + T, + SingleThreadedIteratorSource.GlobalSplit, + Collection>>, + ResultTypeQueryable { + private static final String PARALLELISM_ERROR = "Parallelism should be set to 1"; + + /** + * Creates the iterator to return the elements which then emitted by the source. + * + * @return iterator for the elements + */ + abstract Iterator createIterator(); + + /** + * Serializes the iterator, which is used to save and restore the state of the source. + * + * @return serializer for the iterator + */ + abstract SimpleVersionedSerializer> iteratorSerializer(); + + @Override + public SplitEnumerator, Collection>> createEnumerator( + SplitEnumeratorContext> enumContext) { + Preconditions.checkArgument(enumContext.currentParallelism() == 1, PARALLELISM_ERROR); + return new IteratorSourceEnumerator<>( + enumContext, ImmutableList.of(new GlobalSplit<>(createIterator()))); + } + + @Override + public SplitEnumerator, Collection>> restoreEnumerator( + SplitEnumeratorContext> enumContext, Collection> checkpoint) { + Preconditions.checkArgument(enumContext.currentParallelism() == 1, PARALLELISM_ERROR); + return new IteratorSourceEnumerator<>(enumContext, checkpoint); + } + + @Override + public SimpleVersionedSerializer> getSplitSerializer() { + return new SplitSerializer<>(iteratorSerializer()); + } + + @Override + public SimpleVersionedSerializer>> getEnumeratorCheckpointSerializer() { + return new EnumeratorSerializer<>(iteratorSerializer()); + } + + @Override + public SourceReader> createReader(SourceReaderContext readerContext) + throws Exception { + Preconditions.checkArgument(readerContext.getIndexOfSubtask() == 0, PARALLELISM_ERROR); + return new IteratorSourceReader<>(readerContext); + } + + /** The single split of the {@link SingleThreadedIteratorSource}. 
*/ + static class GlobalSplit implements IteratorSourceSplit> { + private final Iterator iterator; + + GlobalSplit(Iterator iterator) { + this.iterator = iterator; + } + + @Override + public String splitId() { + return "1"; + } + + @Override + public Iterator getIterator() { + return iterator; + } + + @Override + public IteratorSourceSplit> getUpdatedSplitForIterator( + final Iterator newIterator) { + return new GlobalSplit<>(newIterator); + } + + @Override + public String toString() { + return String.format("GlobalSplit (%s)", iterator); + } + } + + private static final class SplitSerializer + implements SimpleVersionedSerializer> { + private final SimpleVersionedSerializer> iteratorSerializer; + + SplitSerializer(SimpleVersionedSerializer> iteratorSerializer) { + this.iteratorSerializer = iteratorSerializer; + } + + private static final int CURRENT_VERSION = 1; + + @Override + public int getVersion() { + return CURRENT_VERSION; + } + + @Override + public byte[] serialize(GlobalSplit split) throws IOException { + return iteratorSerializer.serialize(split.iterator); + } + + @Override + public GlobalSplit deserialize(int version, byte[] serialized) throws IOException { + return new GlobalSplit<>(iteratorSerializer.deserialize(version, serialized)); + } + } + + private static final class EnumeratorSerializer + implements SimpleVersionedSerializer>> { + private static final int CURRENT_VERSION = 1; + private final SimpleVersionedSerializer> iteratorSerializer; + + EnumeratorSerializer(SimpleVersionedSerializer> iteratorSerializer) { + this.iteratorSerializer = iteratorSerializer; + } + + @Override + public int getVersion() { + return CURRENT_VERSION; + } + + @Override + public byte[] serialize(Collection> checkpoint) throws IOException { + Preconditions.checkArgument(checkpoint.size() < 2, PARALLELISM_ERROR); + if (checkpoint.isEmpty()) { + return new byte[] {0}; + } else { + byte[] iterator = iteratorSerializer.serialize(checkpoint.iterator().next().getIterator()); + byte[] result = new byte[iterator.length + 1]; + result[0] = 1; + System.arraycopy(iterator, 0, result, 1, iterator.length); + return result; + } + } + + @Override + public Collection> deserialize(int version, byte[] serialized) + throws IOException { + if (serialized[0] == 0) { + return Lists.newArrayList(); + } else { + byte[] iterator = new byte[serialized.length - 1]; + System.arraycopy(serialized, 1, iterator, 0, serialized.length - 1); + return Lists.newArrayList( + new GlobalSplit<>(iteratorSerializer.deserialize(version, iterator))); + } + } + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TableChange.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TableChange.java new file mode 100644 index 000000000000..452ed80ed0e5 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/TableChange.java @@ -0,0 +1,133 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
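Editor's note: the EnumeratorSerializer above frames its checkpoint as a single flag byte (0 means no split, 1 means a split follows) followed by the serialized iterator. A small standalone illustration of that framing, independent of Flink and not part of this patch:

import java.util.Arrays;

public class EnumeratorFramingExample {
  public static void main(String[] args) {
    byte[] iteratorState = new byte[] {42, 43, 44}; // pretend-serialized iterator

    // serialize: flag byte followed by the payload
    byte[] framed = new byte[iteratorState.length + 1];
    framed[0] = 1;
    System.arraycopy(iteratorState, 0, framed, 1, iteratorState.length);

    // deserialize: strip the flag byte again
    byte[] restored = Arrays.copyOfRange(framed, 1, framed.length);
    System.out.println(Arrays.equals(iteratorState, restored)); // true
  }
}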
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.maintenance.operator; + +import java.util.Objects; +import org.apache.flink.annotation.Internal; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.DeleteFile; +import org.apache.iceberg.Snapshot; +import org.apache.iceberg.io.FileIO; +import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; + +/** Event describing changes in an Iceberg table */ +@Internal +class TableChange { + private int dataFileNum; + private int deleteFileNum; + private long dataFileSize; + private long deleteFileSize; + private int commitNum; + + TableChange( + int dataFileNum, int deleteFileNum, long dataFileSize, long deleteFileSize, int commitNum) { + this.dataFileNum = dataFileNum; + this.deleteFileNum = deleteFileNum; + this.dataFileSize = dataFileSize; + this.deleteFileSize = deleteFileSize; + this.commitNum = commitNum; + } + + TableChange(Snapshot snapshot, FileIO io) { + Iterable dataFiles = snapshot.addedDataFiles(io); + Iterable deleteFiles = snapshot.addedDeleteFiles(io); + + dataFiles.forEach( + dataFile -> { + this.dataFileNum++; + this.dataFileSize += dataFile.fileSizeInBytes(); + }); + + deleteFiles.forEach( + deleteFile -> { + this.deleteFileNum++; + this.deleteFileSize += deleteFile.fileSizeInBytes(); + }); + + this.commitNum = 1; + } + + static TableChange empty() { + return new TableChange(0, 0, 0L, 0L, 0); + } + + int dataFileNum() { + return dataFileNum; + } + + int deleteFileNum() { + return deleteFileNum; + } + + long dataFileSize() { + return dataFileSize; + } + + long deleteFileSize() { + return deleteFileSize; + } + + public int commitNum() { + return commitNum; + } + + public void merge(TableChange other) { + this.dataFileNum += other.dataFileNum; + this.deleteFileNum += other.deleteFileNum; + this.dataFileSize += other.dataFileSize; + this.deleteFileSize += other.deleteFileSize; + this.commitNum += other.commitNum; + } + + TableChange copy() { + return new TableChange(dataFileNum, deleteFileNum, dataFileSize, deleteFileSize, commitNum); + } + + @Override + public String toString() { + return MoreObjects.toStringHelper(this) + .add("dataFileNum", dataFileNum) + .add("deleteFileNum", deleteFileNum) + .add("dataFileSize", dataFileSize) + .add("deleteFileSize", deleteFileSize) + .add("commitNum", commitNum) + .toString(); + } + + @Override + public boolean equals(Object other) { + if (this == other) { + return true; + } else if (other == null || getClass() != other.getClass()) { + return false; + } + + TableChange that = (TableChange) other; + return dataFileNum == that.dataFileNum + && deleteFileNum == that.deleteFileNum + && dataFileSize == that.dataFileSize + && deleteFileSize == that.deleteFileSize + && commitNum == that.commitNum; + } + + @Override + public int hashCode() { + return Objects.hash(dataFileNum, deleteFileNum, dataFileSize, deleteFileSize, commitNum); + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/AvroGenericRecordToRowDataMapper.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/AvroGenericRecordToRowDataMapper.java new file mode 100644 
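Editor's note: TableChange above accumulates per-snapshot counters and exposes merge() so the monitor can fold several snapshots into one event. A hypothetical sketch of that accumulation, not part of this patch; it assumes same-package access because the class is package-private:

package org.apache.iceberg.flink.maintenance.operator;

public class TableChangeMergeExample {
  public static void main(String[] args) {
    // arguments: dataFileNum, deleteFileNum, dataFileSize, deleteFileSize, commitNum
    TableChange first = new TableChange(2, 0, 2_048L, 0L, 1);
    TableChange second = new TableChange(1, 1, 1_024L, 256L, 1);

    TableChange total = first.copy(); // keep 'first' untouched
    total.merge(second);

    // total now reports 3 data files, 1 delete file, 3_072 data bytes, 256 delete bytes, 2 commits
    System.out.println(total);
  }
}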
index 000000000000..f7e8e0c884cf --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/AvroGenericRecordToRowDataMapper.java @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.flink.api.common.functions.MapFunction; +import org.apache.flink.formats.avro.AvroToRowDataConverters; +import org.apache.flink.formats.avro.typeutils.AvroSchemaConverter; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.DataType; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.table.types.utils.TypeConversions; +import org.apache.iceberg.avro.AvroSchemaUtil; + +/** + * This util class converts Avro GenericRecord to Flink RowData.
+ *
+ * Internally it uses Flink {@link AvroToRowDataConverters}. Because of the precision difference + * between how Iceberg schema (micro) and Flink {@link AvroToRowDataConverters} (milli) deal with + * time type, we can't directly use the Avro Schema converted from Iceberg schema via {@link + * AvroSchemaUtil#convert(org.apache.iceberg.Schema, String)}. + */ +public class AvroGenericRecordToRowDataMapper implements MapFunction { + + private final AvroToRowDataConverters.AvroToRowDataConverter converter; + + AvroGenericRecordToRowDataMapper(RowType rowType) { + this.converter = AvroToRowDataConverters.createRowConverter(rowType); + } + + @Override + public RowData map(GenericRecord genericRecord) throws Exception { + return (RowData) converter.convert(genericRecord); + } + + /** Create a mapper based on Avro schema. */ + public static AvroGenericRecordToRowDataMapper forAvroSchema(Schema avroSchema) { + DataType dataType = AvroSchemaConverter.convertToDataType(avroSchema.toString()); + LogicalType logicalType = TypeConversions.fromDataToLogicalType(dataType); + RowType rowType = RowType.of(logicalType.getChildren().toArray(new LogicalType[0])); + return new AvroGenericRecordToRowDataMapper(rowType); + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/BaseDeltaTaskWriter.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/BaseDeltaTaskWriter.java new file mode 100644 index 000000000000..e8a46c5becd7 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/BaseDeltaTaskWriter.java @@ -0,0 +1,126 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
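Editor's note: AvroGenericRecordToRowDataMapper above is public and exposes a forAvroSchema factory plus a plain MapFunction, so it can also be exercised outside a running Flink job. A small usage sketch, not part of this patch; the Avro schema literal is an arbitrary example:

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.flink.table.data.RowData;
import org.apache.iceberg.flink.sink.AvroGenericRecordToRowDataMapper;

public class AvroToRowDataExample {
  public static void main(String[] args) throws Exception {
    Schema avroSchema =
        new Schema.Parser()
            .parse(
                "{\"type\":\"record\",\"name\":\"sample\",\"fields\":["
                    + "{\"name\":\"id\",\"type\":\"long\"},"
                    + "{\"name\":\"data\",\"type\":\"string\"}]}");

    GenericRecord record = new GenericData.Record(avroSchema);
    record.put("id", 1L);
    record.put("data", "a");

    AvroGenericRecordToRowDataMapper mapper =
        AvroGenericRecordToRowDataMapper.forAvroSchema(avroSchema);
    RowData row = mapper.map(record);
    System.out.println(row.getArity()); // 2
  }
}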
+ */ +package org.apache.iceberg.flink.sink; + +import java.io.IOException; +import java.util.List; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.PartitionKey; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.StructLike; +import org.apache.iceberg.deletes.DeleteGranularity; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.RowDataWrapper; +import org.apache.iceberg.flink.data.RowDataProjection; +import org.apache.iceberg.io.BaseTaskWriter; +import org.apache.iceberg.io.FileAppenderFactory; +import org.apache.iceberg.io.FileIO; +import org.apache.iceberg.io.OutputFileFactory; +import org.apache.iceberg.relocated.com.google.common.collect.Sets; +import org.apache.iceberg.types.TypeUtil; + +abstract class BaseDeltaTaskWriter extends BaseTaskWriter { + + private final Schema schema; + private final Schema deleteSchema; + private final RowDataWrapper wrapper; + private final RowDataWrapper keyWrapper; + private final RowDataProjection keyProjection; + private final boolean upsert; + + BaseDeltaTaskWriter( + PartitionSpec spec, + FileFormat format, + FileAppenderFactory appenderFactory, + OutputFileFactory fileFactory, + FileIO io, + long targetFileSize, + Schema schema, + RowType flinkSchema, + List equalityFieldIds, + boolean upsert) { + super(spec, format, appenderFactory, fileFactory, io, targetFileSize); + this.schema = schema; + this.deleteSchema = TypeUtil.select(schema, Sets.newHashSet(equalityFieldIds)); + this.wrapper = new RowDataWrapper(flinkSchema, schema.asStruct()); + this.keyWrapper = + new RowDataWrapper(FlinkSchemaUtil.convert(deleteSchema), deleteSchema.asStruct()); + this.keyProjection = + RowDataProjection.create(flinkSchema, schema.asStruct(), deleteSchema.asStruct()); + this.upsert = upsert; + } + + abstract RowDataDeltaWriter route(RowData row); + + RowDataWrapper wrapper() { + return wrapper; + } + + @Override + public void write(RowData row) throws IOException { + RowDataDeltaWriter writer = route(row); + + switch (row.getRowKind()) { + case INSERT: + case UPDATE_AFTER: + if (upsert) { + writer.deleteKey(keyProjection.wrap(row)); + } + writer.write(row); + break; + + case UPDATE_BEFORE: + if (upsert) { + break; // UPDATE_BEFORE is not necessary for UPSERT, we do nothing to prevent delete one + // row twice + } + writer.delete(row); + break; + case DELETE: + if (upsert) { + writer.deleteKey(keyProjection.wrap(row)); + } else { + writer.delete(row); + } + break; + + default: + throw new UnsupportedOperationException("Unknown row kind: " + row.getRowKind()); + } + } + + protected class RowDataDeltaWriter extends BaseEqualityDeltaWriter { + RowDataDeltaWriter(PartitionKey partition) { + super(partition, schema, deleteSchema, DeleteGranularity.FILE); + } + + @Override + protected StructLike asStructLike(RowData data) { + return wrapper.wrap(data); + } + + @Override + protected StructLike asStructLikeKey(RowData data) { + return keyWrapper.wrap(data); + } + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitionKeySelector.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitionKeySelector.java new file mode 100644 index 000000000000..1cb6e013bd2c --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitionKeySelector.java @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache 
Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink; + +import java.util.stream.IntStream; +import org.apache.flink.api.java.functions.KeySelector; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.PartitionKey; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.flink.RowDataWrapper; + +/** + * A {@link KeySelector} that extracts the bucketId from a data row's bucket partition as the key. + * To be used with the {@link BucketPartitioner}. + */ +class BucketPartitionKeySelector implements KeySelector { + + private final Schema schema; + private final PartitionKey partitionKey; + private final RowType flinkSchema; + private final int bucketFieldPosition; + + private transient RowDataWrapper rowDataWrapper; + + BucketPartitionKeySelector(PartitionSpec partitionSpec, Schema schema, RowType flinkSchema) { + this.schema = schema; + this.partitionKey = new PartitionKey(partitionSpec, schema); + this.flinkSchema = flinkSchema; + this.bucketFieldPosition = getBucketFieldPosition(partitionSpec); + } + + private int getBucketFieldPosition(PartitionSpec partitionSpec) { + int bucketFieldId = BucketPartitionerUtil.getBucketFieldId(partitionSpec); + return IntStream.range(0, partitionSpec.fields().size()) + .filter(i -> partitionSpec.fields().get(i).fieldId() == bucketFieldId) + .toArray()[0]; + } + + private RowDataWrapper lazyRowDataWrapper() { + if (rowDataWrapper == null) { + rowDataWrapper = new RowDataWrapper(flinkSchema, schema.asStruct()); + } + + return rowDataWrapper; + } + + @Override + public Integer getKey(RowData rowData) { + partitionKey.partition(lazyRowDataWrapper().wrap(rowData)); + return partitionKey.get(bucketFieldPosition, Integer.class); + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitioner.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitioner.java new file mode 100644 index 000000000000..9c9a117906e2 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitioner.java @@ -0,0 +1,103 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
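Editor's note: BucketPartitionKeySelector above extracts the bucket id so it can be paired with the BucketPartitioner introduced next in this patch. A hypothetical wiring sketch, not part of this patch; it is placed in org.apache.iceberg.flink.sink because both classes are package-private:

package org.apache.iceberg.flink.sink;

import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.table.data.RowData;
import org.apache.flink.table.types.logical.RowType;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.flink.FlinkSchemaUtil;

public class BucketShuffleExample {
  // Redistributes rows so that all rows of one bucket land on the same writer subtask.
  public static DataStream<RowData> shuffleByBucket(
      DataStream<RowData> rows, PartitionSpec spec, Schema schema) {
    RowType flinkRowType = FlinkSchemaUtil.convert(schema);
    return rows.partitionCustom(
        new BucketPartitioner(spec), new BucketPartitionKeySelector(spec, schema, flinkRowType));
  }
}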
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink; + +import org.apache.flink.api.common.functions.Partitioner; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; + +/** + * This partitioner will redirect records to writers deterministically based on the Bucket partition + * spec. It'll attempt to optimize the file size written depending on whether numPartitions is + * greater, less or equal than the maxNumBuckets. Note: The current implementation only supports ONE + * bucket in the partition spec. + */ +class BucketPartitioner implements Partitioner { + + static final String BUCKET_NULL_MESSAGE = "bucketId cannot be null"; + static final String BUCKET_LESS_THAN_LOWER_BOUND_MESSAGE = + "Invalid bucket ID %s: must be non-negative."; + static final String BUCKET_GREATER_THAN_UPPER_BOUND_MESSAGE = + "Invalid bucket ID %s: must be less than bucket limit: %s."; + + private final int maxNumBuckets; + + // To hold the OFFSET of the next writer to use for any bucket, only used when writers > the + // number of buckets + private final int[] currentBucketWriterOffset; + + BucketPartitioner(PartitionSpec partitionSpec) { + this.maxNumBuckets = BucketPartitionerUtil.getMaxNumBuckets(partitionSpec); + this.currentBucketWriterOffset = new int[maxNumBuckets]; + } + + /** + * Determine the partition id based on the following criteria: If the number of writers <= the + * number of buckets, an evenly distributed number of buckets will be assigned to each writer (one + * writer -> many buckets). Conversely, if the number of writers > the number of buckets the logic + * is handled by the {@link #getPartitionWithMoreWritersThanBuckets + * getPartitionWritersGreaterThanBuckets} method. + * + * @param bucketId the bucketId for each request + * @param numPartitions the total number of partitions + * @return the partition id (writer) to use for each request + */ + @Override + public int partition(Integer bucketId, int numPartitions) { + Preconditions.checkNotNull(bucketId, BUCKET_NULL_MESSAGE); + Preconditions.checkArgument(bucketId >= 0, BUCKET_LESS_THAN_LOWER_BOUND_MESSAGE, bucketId); + Preconditions.checkArgument( + bucketId < maxNumBuckets, BUCKET_GREATER_THAN_UPPER_BOUND_MESSAGE, bucketId, maxNumBuckets); + + if (numPartitions <= maxNumBuckets) { + return bucketId % numPartitions; + } else { + return getPartitionWithMoreWritersThanBuckets(bucketId, numPartitions); + } + } + + /*- + * If the number of writers > the number of buckets each partitioner will keep a state of multiple + * writers per bucket as evenly as possible, and will round-robin the requests across them, in this + * case each writer will target only one bucket at all times (many writers -> one bucket). 
Example: + * Configuration: numPartitions (writers) = 5, maxBuckets = 2 + * Expected behavior: + * - Records for Bucket 0 will be "round robin" between Writers 0, 2 and 4 + * - Records for Bucket 1 will always use Writer 1 and 3 + * Notes: + * - maxNumWritersPerBucket determines when to reset the currentBucketWriterOffset to 0 for this bucketId + * - When numPartitions is not evenly divisible by maxBuckets, some buckets will have one more writer (extraWriter). + * In this example Bucket 0 has an "extra writer" to consider before resetting its offset to 0. + * + * @return the destination partition index (writer subtask id) + */ + private int getPartitionWithMoreWritersThanBuckets(int bucketId, int numPartitions) { + int currentOffset = currentBucketWriterOffset[bucketId]; + // Determine if this bucket requires an "extra writer" + int extraWriter = bucketId < (numPartitions % maxNumBuckets) ? 1 : 0; + // The max number of writers this bucket can have + int maxNumWritersPerBucket = (numPartitions / maxNumBuckets) + extraWriter; + + // Increment the writer offset or reset if it's reached the max for this bucket + int nextOffset = currentOffset == maxNumWritersPerBucket - 1 ? 0 : currentOffset + 1; + currentBucketWriterOffset[bucketId] = nextOffset; + + return bucketId + (maxNumBuckets * currentOffset); + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitionerUtil.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitionerUtil.java new file mode 100644 index 000000000000..c33207728d3e --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitionerUtil.java @@ -0,0 +1,125 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink; + +import java.util.List; +import java.util.Objects; +import java.util.stream.Collectors; +import org.apache.flink.api.java.tuple.Tuple2; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.transforms.PartitionSpecVisitor; + +final class BucketPartitionerUtil { + static final String BAD_NUMBER_OF_BUCKETS_ERROR_MESSAGE = + "Invalid number of buckets: %s (must be 1)"; + + private BucketPartitionerUtil() {} + + /** + * Determines whether the PartitionSpec has one and only one Bucket definition + * + * @param partitionSpec the partition spec in question + * @return whether the PartitionSpec has only one Bucket + */ + static boolean hasOneBucketField(PartitionSpec partitionSpec) { + List> bucketFields = getBucketFields(partitionSpec); + return bucketFields != null && bucketFields.size() == 1; + } + + /** + * Extracts the Bucket definition from a PartitionSpec. 
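Editor's note: the worked example in the comment above (5 writers, 2 buckets) can be re-computed with the same arithmetic as getPartitionWithMoreWritersThanBuckets. A standalone sketch, not part of this patch:

public class BucketPartitionerMathExample {
  public static void main(String[] args) {
    int numPartitions = 5; // writers
    int maxNumBuckets = 2;
    int[] currentBucketWriterOffset = new int[maxNumBuckets];

    for (int round = 0; round < 3; round++) {
      for (int bucketId = 0; bucketId < maxNumBuckets; bucketId++) {
        int currentOffset = currentBucketWriterOffset[bucketId];
        // Bucket 0 gets the "extra writer" because 5 % 2 == 1
        int extraWriter = bucketId < (numPartitions % maxNumBuckets) ? 1 : 0;
        int maxNumWritersPerBucket = (numPartitions / maxNumBuckets) + extraWriter;
        int nextOffset = currentOffset == maxNumWritersPerBucket - 1 ? 0 : currentOffset + 1;
        currentBucketWriterOffset[bucketId] = nextOffset;

        int writer = bucketId + (maxNumBuckets * currentOffset);
        System.out.printf("bucket %d -> writer %d%n", bucketId, writer);
      }
    }
    // bucket 0 cycles through writers 0, 2, 4; bucket 1 alternates between writers 1 and 3
  }
}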
+ * + * @param partitionSpec the partition spec in question + * @return the Bucket definition in the form of a tuple (fieldId, maxNumBuckets) + */ + private static Tuple2 getBucketFieldInfo(PartitionSpec partitionSpec) { + List> bucketFields = getBucketFields(partitionSpec); + Preconditions.checkArgument( + bucketFields.size() == 1, + BucketPartitionerUtil.BAD_NUMBER_OF_BUCKETS_ERROR_MESSAGE, + bucketFields.size()); + return bucketFields.get(0); + } + + static int getBucketFieldId(PartitionSpec partitionSpec) { + return getBucketFieldInfo(partitionSpec).f0; + } + + static int getMaxNumBuckets(PartitionSpec partitionSpec) { + return getBucketFieldInfo(partitionSpec).f1; + } + + private static List> getBucketFields(PartitionSpec spec) { + return PartitionSpecVisitor.visit(spec, new BucketPartitionSpecVisitor()).stream() + .filter(Objects::nonNull) + .collect(Collectors.toList()); + } + + private static class BucketPartitionSpecVisitor + implements PartitionSpecVisitor> { + @Override + public Tuple2 identity(int fieldId, String sourceName, int sourceId) { + return null; + } + + @Override + public Tuple2 bucket( + int fieldId, String sourceName, int sourceId, int numBuckets) { + return new Tuple2<>(fieldId, numBuckets); + } + + @Override + public Tuple2 truncate( + int fieldId, String sourceName, int sourceId, int width) { + return null; + } + + @Override + public Tuple2 year(int fieldId, String sourceName, int sourceId) { + return null; + } + + @Override + public Tuple2 month(int fieldId, String sourceName, int sourceId) { + return null; + } + + @Override + public Tuple2 day(int fieldId, String sourceName, int sourceId) { + return null; + } + + @Override + public Tuple2 hour(int fieldId, String sourceName, int sourceId) { + return null; + } + + @Override + public Tuple2 alwaysNull(int fieldId, String sourceName, int sourceId) { + return null; + } + + @Override + public Tuple2 unknown( + int fieldId, String sourceName, int sourceId, String transform) { + return null; + } + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/CachingTableSupplier.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/CachingTableSupplier.java new file mode 100644 index 000000000000..e9f9786f9190 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/CachingTableSupplier.java @@ -0,0 +1,91 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink; + +import java.time.Duration; +import org.apache.flink.util.Preconditions; +import org.apache.iceberg.SerializableTable; +import org.apache.iceberg.Table; +import org.apache.iceberg.flink.TableLoader; +import org.apache.iceberg.util.DateTimeUtil; +import org.apache.iceberg.util.SerializableSupplier; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * A table loader that will only reload a table after a certain interval has passed. WARNING: This + * table loader should be used carefully when used with writer tasks. It could result in heavy load + * on a catalog for jobs with many writers. + */ +class CachingTableSupplier implements SerializableSupplier { + + private static final Logger LOG = LoggerFactory.getLogger(CachingTableSupplier.class); + + private final Table initialTable; + private final TableLoader tableLoader; + private final Duration tableRefreshInterval; + private long lastLoadTimeMillis; + private transient Table table; + + CachingTableSupplier( + SerializableTable initialTable, TableLoader tableLoader, Duration tableRefreshInterval) { + Preconditions.checkArgument(initialTable != null, "initialTable cannot be null"); + Preconditions.checkArgument(tableLoader != null, "tableLoader cannot be null"); + Preconditions.checkArgument( + tableRefreshInterval != null, "tableRefreshInterval cannot be null"); + this.initialTable = initialTable; + this.table = initialTable; + this.tableLoader = tableLoader; + this.tableRefreshInterval = tableRefreshInterval; + this.lastLoadTimeMillis = System.currentTimeMillis(); + } + + @Override + public Table get() { + if (table == null) { + this.table = initialTable; + } + return table; + } + + Table initialTable() { + return initialTable; + } + + void refreshTable() { + if (System.currentTimeMillis() > lastLoadTimeMillis + tableRefreshInterval.toMillis()) { + try { + if (!tableLoader.isOpen()) { + tableLoader.open(); + } + + this.table = tableLoader.loadTable(); + this.lastLoadTimeMillis = System.currentTimeMillis(); + + LOG.info( + "Table {} reloaded, next min load time threshold is {}", + table.name(), + DateTimeUtil.formatTimestampMillis( + lastLoadTimeMillis + tableRefreshInterval.toMillis())); + } catch (Exception e) { + LOG.warn("An error occurred reloading table {}, table was not reloaded", table.name(), e); + } + } + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/CommitSummary.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/CommitSummary.java new file mode 100644 index 000000000000..9a2f57181708 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/CommitSummary.java @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
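Editor's note: CachingTableSupplier above only reloads the table after the configured interval has elapsed. A hypothetical usage sketch, not part of this patch; it assumes same-package access (the class is package-private), and casting the result of SerializableTable.copyOf is an assumption about how the initial table is obtained:

package org.apache.iceberg.flink.sink;

import java.time.Duration;
import org.apache.iceberg.SerializableTable;
import org.apache.iceberg.Table;
import org.apache.iceberg.flink.TableLoader;

public class CachingTableSupplierExample {
  public static void demo(TableLoader tableLoader) {
    tableLoader.open();
    Table table = tableLoader.loadTable();

    CachingTableSupplier supplier =
        new CachingTableSupplier(
            (SerializableTable) SerializableTable.copyOf(table),
            tableLoader,
            Duration.ofMinutes(1));

    supplier.get();          // returns the cached table immediately
    supplier.refreshTable(); // no-op here: the refresh interval has not elapsed yet
  }
}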
+ */ +package org.apache.iceberg.flink.sink; + +import java.util.Arrays; +import java.util.NavigableMap; +import java.util.concurrent.atomic.AtomicLong; +import org.apache.iceberg.io.WriteResult; +import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; + +class CommitSummary { + + private final AtomicLong dataFilesCount = new AtomicLong(); + private final AtomicLong dataFilesRecordCount = new AtomicLong(); + private final AtomicLong dataFilesByteCount = new AtomicLong(); + private final AtomicLong deleteFilesCount = new AtomicLong(); + private final AtomicLong deleteFilesRecordCount = new AtomicLong(); + private final AtomicLong deleteFilesByteCount = new AtomicLong(); + + CommitSummary(NavigableMap pendingResults) { + pendingResults + .values() + .forEach( + writeResult -> { + dataFilesCount.addAndGet(writeResult.dataFiles().length); + Arrays.stream(writeResult.dataFiles()) + .forEach( + dataFile -> { + dataFilesRecordCount.addAndGet(dataFile.recordCount()); + dataFilesByteCount.addAndGet(dataFile.fileSizeInBytes()); + }); + deleteFilesCount.addAndGet(writeResult.deleteFiles().length); + Arrays.stream(writeResult.deleteFiles()) + .forEach( + deleteFile -> { + deleteFilesRecordCount.addAndGet(deleteFile.recordCount()); + deleteFilesByteCount.addAndGet(deleteFile.fileSizeInBytes()); + }); + }); + } + + long dataFilesCount() { + return dataFilesCount.get(); + } + + long dataFilesRecordCount() { + return dataFilesRecordCount.get(); + } + + long dataFilesByteCount() { + return dataFilesByteCount.get(); + } + + long deleteFilesCount() { + return deleteFilesCount.get(); + } + + long deleteFilesRecordCount() { + return deleteFilesRecordCount.get(); + } + + long deleteFilesByteCount() { + return deleteFilesByteCount.get(); + } + + @Override + public String toString() { + return MoreObjects.toStringHelper(this) + .add("dataFilesCount", dataFilesCount) + .add("dataFilesRecordCount", dataFilesRecordCount) + .add("dataFilesByteCount", dataFilesByteCount) + .add("deleteFilesCount", deleteFilesCount) + .add("deleteFilesRecordCount", deleteFilesRecordCount) + .add("deleteFilesByteCount", deleteFilesByteCount) + .toString(); + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifests.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifests.java new file mode 100644 index 000000000000..036970c06d5b --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifests.java @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink; + +import java.util.List; +import org.apache.iceberg.ManifestFile; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; + +class DeltaManifests { + + private static final CharSequence[] EMPTY_REF_DATA_FILES = new CharSequence[0]; + + private final ManifestFile dataManifest; + private final ManifestFile deleteManifest; + private final CharSequence[] referencedDataFiles; + + DeltaManifests(ManifestFile dataManifest, ManifestFile deleteManifest) { + this(dataManifest, deleteManifest, EMPTY_REF_DATA_FILES); + } + + DeltaManifests( + ManifestFile dataManifest, ManifestFile deleteManifest, CharSequence[] referencedDataFiles) { + Preconditions.checkNotNull(referencedDataFiles, "Referenced data files shouldn't be null."); + + this.dataManifest = dataManifest; + this.deleteManifest = deleteManifest; + this.referencedDataFiles = referencedDataFiles; + } + + ManifestFile dataManifest() { + return dataManifest; + } + + ManifestFile deleteManifest() { + return deleteManifest; + } + + CharSequence[] referencedDataFiles() { + return referencedDataFiles; + } + + List manifests() { + List manifests = Lists.newArrayListWithCapacity(2); + if (dataManifest != null) { + manifests.add(dataManifest); + } + + if (deleteManifest != null) { + manifests.add(deleteManifest); + } + + return manifests; + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifestsSerializer.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifestsSerializer.java new file mode 100644 index 000000000000..92ca284b12ba --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifestsSerializer.java @@ -0,0 +1,122 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.IOException; +import org.apache.flink.core.io.SimpleVersionedSerializer; +import org.apache.iceberg.ManifestFile; +import org.apache.iceberg.ManifestFiles; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; + +class DeltaManifestsSerializer implements SimpleVersionedSerializer { + private static final int VERSION_1 = 1; + private static final int VERSION_2 = 2; + private static final byte[] EMPTY_BINARY = new byte[0]; + + static final DeltaManifestsSerializer INSTANCE = new DeltaManifestsSerializer(); + + @Override + public int getVersion() { + return VERSION_2; + } + + @Override + public byte[] serialize(DeltaManifests deltaManifests) throws IOException { + Preconditions.checkNotNull( + deltaManifests, "DeltaManifests to be serialized should not be null"); + + ByteArrayOutputStream binaryOut = new ByteArrayOutputStream(); + DataOutputStream out = new DataOutputStream(binaryOut); + + byte[] dataManifestBinary = EMPTY_BINARY; + if (deltaManifests.dataManifest() != null) { + dataManifestBinary = ManifestFiles.encode(deltaManifests.dataManifest()); + } + + out.writeInt(dataManifestBinary.length); + out.write(dataManifestBinary); + + byte[] deleteManifestBinary = EMPTY_BINARY; + if (deltaManifests.deleteManifest() != null) { + deleteManifestBinary = ManifestFiles.encode(deltaManifests.deleteManifest()); + } + + out.writeInt(deleteManifestBinary.length); + out.write(deleteManifestBinary); + + CharSequence[] referencedDataFiles = deltaManifests.referencedDataFiles(); + out.writeInt(referencedDataFiles.length); + for (CharSequence referencedDataFile : referencedDataFiles) { + out.writeUTF(referencedDataFile.toString()); + } + + return binaryOut.toByteArray(); + } + + @Override + public DeltaManifests deserialize(int version, byte[] serialized) throws IOException { + if (version == VERSION_1) { + return deserializeV1(serialized); + } else if (version == VERSION_2) { + return deserializeV2(serialized); + } else { + throw new RuntimeException("Unknown serialize version: " + version); + } + } + + private DeltaManifests deserializeV1(byte[] serialized) throws IOException { + return new DeltaManifests(ManifestFiles.decode(serialized), null); + } + + private DeltaManifests deserializeV2(byte[] serialized) throws IOException { + ManifestFile dataManifest = null; + ManifestFile deleteManifest = null; + + ByteArrayInputStream binaryIn = new ByteArrayInputStream(serialized); + DataInputStream in = new DataInputStream(binaryIn); + + int dataManifestSize = in.readInt(); + if (dataManifestSize > 0) { + byte[] dataManifestBinary = new byte[dataManifestSize]; + Preconditions.checkState(in.read(dataManifestBinary) == dataManifestSize); + + dataManifest = ManifestFiles.decode(dataManifestBinary); + } + + int deleteManifestSize = in.readInt(); + if (deleteManifestSize > 0) { + byte[] deleteManifestBinary = new byte[deleteManifestSize]; + Preconditions.checkState(in.read(deleteManifestBinary) == deleteManifestSize); + + deleteManifest = ManifestFiles.decode(deleteManifestBinary); + } + + int referenceDataFileNum = in.readInt(); + CharSequence[] referencedDataFiles = new CharSequence[referenceDataFileNum]; + for (int i = 0; i < referenceDataFileNum; i++) { + referencedDataFiles[i] = in.readUTF(); + } + + return new DeltaManifests(dataManifest, deleteManifest, referencedDataFiles); + } +} 
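Editor's note: DeltaManifestsSerializer above writes version 2 payloads that frame the data manifest, the delete manifest, and any referenced data files. A hypothetical round-trip sketch, not part of this patch; it assumes same-package access (both classes are package-private) and a ManifestFile obtained elsewhere:

package org.apache.iceberg.flink.sink;

import java.io.IOException;
import org.apache.iceberg.ManifestFile;

public class DeltaManifestsRoundTripExample {
  public static DeltaManifests roundTrip(ManifestFile dataManifest) throws IOException {
    // Only a data manifest; the delete manifest slot stays empty.
    DeltaManifests manifests = new DeltaManifests(dataManifest, null);

    byte[] bytes = DeltaManifestsSerializer.INSTANCE.serialize(manifests);

    // Deserialize with the current version (2), which also reads the delete manifest
    // length prefix and the referenced data file list.
    return DeltaManifestsSerializer.INSTANCE.deserialize(
        DeltaManifestsSerializer.INSTANCE.getVersion(), bytes);
  }
}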
diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/EqualityFieldKeySelector.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/EqualityFieldKeySelector.java new file mode 100644 index 000000000000..18b269d6c3e9 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/EqualityFieldKeySelector.java @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink; + +import java.util.List; +import org.apache.flink.api.java.functions.KeySelector; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.Schema; +import org.apache.iceberg.flink.RowDataWrapper; +import org.apache.iceberg.relocated.com.google.common.collect.Sets; +import org.apache.iceberg.types.TypeUtil; +import org.apache.iceberg.util.StructLikeWrapper; +import org.apache.iceberg.util.StructProjection; + +/** + * Create a {@link KeySelector} to shuffle by equality fields, to ensure same equality fields record + * will be emitted to same writer in order. + */ +class EqualityFieldKeySelector implements KeySelector { + + private final Schema schema; + private final RowType flinkSchema; + private final Schema deleteSchema; + + private transient RowDataWrapper rowDataWrapper; + private transient StructProjection structProjection; + private transient StructLikeWrapper structLikeWrapper; + + EqualityFieldKeySelector(Schema schema, RowType flinkSchema, List equalityFieldIds) { + this.schema = schema; + this.flinkSchema = flinkSchema; + this.deleteSchema = TypeUtil.select(schema, Sets.newHashSet(equalityFieldIds)); + } + + /** + * Construct the {@link RowDataWrapper} lazily here because few members in it are not + * serializable. In this way, we don't have to serialize them with forcing. + */ + protected RowDataWrapper lazyRowDataWrapper() { + if (rowDataWrapper == null) { + rowDataWrapper = new RowDataWrapper(flinkSchema, schema.asStruct()); + } + return rowDataWrapper; + } + + /** Construct the {@link StructProjection} lazily because it is not serializable. */ + protected StructProjection lazyStructProjection() { + if (structProjection == null) { + structProjection = StructProjection.create(schema, deleteSchema); + } + return structProjection; + } + + /** Construct the {@link StructLikeWrapper} lazily because it is not serializable. 
*/ + protected StructLikeWrapper lazyStructLikeWrapper() { + if (structLikeWrapper == null) { + structLikeWrapper = StructLikeWrapper.forType(deleteSchema.asStruct()); + } + return structLikeWrapper; + } + + @Override + public Integer getKey(RowData row) { + RowDataWrapper wrappedRowData = lazyRowDataWrapper().wrap(row); + StructProjection projectedRowData = lazyStructProjection().wrap(wrappedRowData); + StructLikeWrapper wrapper = lazyStructLikeWrapper().set(projectedRowData); + return wrapper.hashCode(); + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkAppenderFactory.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkAppenderFactory.java new file mode 100644 index 000000000000..b6f1392d1562 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkAppenderFactory.java @@ -0,0 +1,274 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink; + +import java.io.IOException; +import java.io.Serializable; +import java.io.UncheckedIOException; +import java.util.Map; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.MetricsConfig; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.StructLike; +import org.apache.iceberg.Table; +import org.apache.iceberg.avro.Avro; +import org.apache.iceberg.deletes.EqualityDeleteWriter; +import org.apache.iceberg.deletes.PositionDeleteWriter; +import org.apache.iceberg.encryption.EncryptedOutputFile; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.data.FlinkAvroWriter; +import org.apache.iceberg.flink.data.FlinkOrcWriter; +import org.apache.iceberg.flink.data.FlinkParquetWriters; +import org.apache.iceberg.io.DataWriter; +import org.apache.iceberg.io.DeleteSchemaUtil; +import org.apache.iceberg.io.FileAppender; +import org.apache.iceberg.io.FileAppenderFactory; +import org.apache.iceberg.io.OutputFile; +import org.apache.iceberg.orc.ORC; +import org.apache.iceberg.parquet.Parquet; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; + +public class FlinkAppenderFactory implements FileAppenderFactory, Serializable { + private final Schema schema; + private final RowType flinkSchema; + private final Map props; + private final PartitionSpec spec; + private final int[] equalityFieldIds; + private final Schema eqDeleteRowSchema; + private final Schema posDeleteRowSchema; + private final Table table; + + private RowType eqDeleteFlinkSchema = null; + private RowType posDeleteFlinkSchema = null; + + public 
FlinkAppenderFactory( + Table table, + Schema schema, + RowType flinkSchema, + Map props, + PartitionSpec spec, + int[] equalityFieldIds, + Schema eqDeleteRowSchema, + Schema posDeleteRowSchema) { + Preconditions.checkNotNull(table, "Table shouldn't be null"); + this.table = table; + this.schema = schema; + this.flinkSchema = flinkSchema; + this.props = props; + this.spec = spec; + this.equalityFieldIds = equalityFieldIds; + this.eqDeleteRowSchema = eqDeleteRowSchema; + this.posDeleteRowSchema = posDeleteRowSchema; + } + + private RowType lazyEqDeleteFlinkSchema() { + if (eqDeleteFlinkSchema == null) { + Preconditions.checkNotNull(eqDeleteRowSchema, "Equality delete row schema shouldn't be null"); + this.eqDeleteFlinkSchema = FlinkSchemaUtil.convert(eqDeleteRowSchema); + } + return eqDeleteFlinkSchema; + } + + private RowType lazyPosDeleteFlinkSchema() { + if (posDeleteFlinkSchema == null) { + Preconditions.checkNotNull(posDeleteRowSchema, "Pos-delete row schema shouldn't be null"); + this.posDeleteFlinkSchema = FlinkSchemaUtil.convert(posDeleteRowSchema); + } + return this.posDeleteFlinkSchema; + } + + @Override + public FileAppender newAppender(OutputFile outputFile, FileFormat format) { + MetricsConfig metricsConfig = MetricsConfig.forTable(table); + try { + switch (format) { + case AVRO: + return Avro.write(outputFile) + .createWriterFunc(ignore -> new FlinkAvroWriter(flinkSchema)) + .setAll(props) + .schema(schema) + .metricsConfig(metricsConfig) + .overwrite() + .build(); + + case ORC: + return ORC.write(outputFile) + .createWriterFunc( + (iSchema, typDesc) -> FlinkOrcWriter.buildWriter(flinkSchema, iSchema)) + .setAll(props) + .metricsConfig(metricsConfig) + .schema(schema) + .overwrite() + .build(); + + case PARQUET: + return Parquet.write(outputFile) + .createWriterFunc(msgType -> FlinkParquetWriters.buildWriter(flinkSchema, msgType)) + .setAll(props) + .metricsConfig(metricsConfig) + .schema(schema) + .overwrite() + .build(); + + default: + throw new UnsupportedOperationException("Cannot write unknown file format: " + format); + } + } catch (IOException e) { + throw new UncheckedIOException(e); + } + } + + @Override + public DataWriter newDataWriter( + EncryptedOutputFile file, FileFormat format, StructLike partition) { + return new DataWriter<>( + newAppender(file.encryptingOutputFile(), format), + format, + file.encryptingOutputFile().location(), + spec, + partition, + file.keyMetadata()); + } + + @Override + public EqualityDeleteWriter newEqDeleteWriter( + EncryptedOutputFile outputFile, FileFormat format, StructLike partition) { + Preconditions.checkState( + equalityFieldIds != null && equalityFieldIds.length > 0, + "Equality field ids shouldn't be null or empty when creating equality-delete writer"); + Preconditions.checkNotNull( + eqDeleteRowSchema, + "Equality delete row schema shouldn't be null when creating equality-delete writer"); + + MetricsConfig metricsConfig = MetricsConfig.forTable(table); + try { + switch (format) { + case AVRO: + return Avro.writeDeletes(outputFile.encryptingOutputFile()) + .createWriterFunc(ignore -> new FlinkAvroWriter(lazyEqDeleteFlinkSchema())) + .withPartition(partition) + .overwrite() + .setAll(props) + .metricsConfig(metricsConfig) + .rowSchema(eqDeleteRowSchema) + .withSpec(spec) + .withKeyMetadata(outputFile.keyMetadata()) + .equalityFieldIds(equalityFieldIds) + .buildEqualityWriter(); + + case ORC: + return ORC.writeDeletes(outputFile.encryptingOutputFile()) + .createWriterFunc( + (iSchema, typDesc) -> 
FlinkOrcWriter.buildWriter(flinkSchema, iSchema)) + .withPartition(partition) + .overwrite() + .setAll(props) + .metricsConfig(metricsConfig) + .rowSchema(eqDeleteRowSchema) + .withSpec(spec) + .withKeyMetadata(outputFile.keyMetadata()) + .equalityFieldIds(equalityFieldIds) + .buildEqualityWriter(); + + case PARQUET: + return Parquet.writeDeletes(outputFile.encryptingOutputFile()) + .createWriterFunc( + msgType -> FlinkParquetWriters.buildWriter(lazyEqDeleteFlinkSchema(), msgType)) + .withPartition(partition) + .overwrite() + .setAll(props) + .metricsConfig(metricsConfig) + .rowSchema(eqDeleteRowSchema) + .withSpec(spec) + .withKeyMetadata(outputFile.keyMetadata()) + .equalityFieldIds(equalityFieldIds) + .buildEqualityWriter(); + + default: + throw new UnsupportedOperationException( + "Cannot write equality-deletes for unsupported file format: " + format); + } + } catch (IOException e) { + throw new UncheckedIOException(e); + } + } + + @Override + public PositionDeleteWriter newPosDeleteWriter( + EncryptedOutputFile outputFile, FileFormat format, StructLike partition) { + MetricsConfig metricsConfig = MetricsConfig.forPositionDelete(table); + try { + switch (format) { + case AVRO: + return Avro.writeDeletes(outputFile.encryptingOutputFile()) + .createWriterFunc(ignore -> new FlinkAvroWriter(lazyPosDeleteFlinkSchema())) + .withPartition(partition) + .overwrite() + .setAll(props) + .metricsConfig(metricsConfig) + .rowSchema(posDeleteRowSchema) + .withSpec(spec) + .withKeyMetadata(outputFile.keyMetadata()) + .buildPositionWriter(); + + case ORC: + RowType orcPosDeleteSchema = + FlinkSchemaUtil.convert(DeleteSchemaUtil.posDeleteSchema(posDeleteRowSchema)); + return ORC.writeDeletes(outputFile.encryptingOutputFile()) + .createWriterFunc( + (iSchema, typDesc) -> FlinkOrcWriter.buildWriter(orcPosDeleteSchema, iSchema)) + .withPartition(partition) + .overwrite() + .setAll(props) + .metricsConfig(metricsConfig) + .rowSchema(posDeleteRowSchema) + .withSpec(spec) + .withKeyMetadata(outputFile.keyMetadata()) + .transformPaths(path -> StringData.fromString(path.toString())) + .buildPositionWriter(); + + case PARQUET: + RowType flinkPosDeleteSchema = + FlinkSchemaUtil.convert(DeleteSchemaUtil.posDeleteSchema(posDeleteRowSchema)); + return Parquet.writeDeletes(outputFile.encryptingOutputFile()) + .createWriterFunc( + msgType -> FlinkParquetWriters.buildWriter(flinkPosDeleteSchema, msgType)) + .withPartition(partition) + .overwrite() + .setAll(props) + .metricsConfig(metricsConfig) + .rowSchema(posDeleteRowSchema) + .withSpec(spec) + .withKeyMetadata(outputFile.keyMetadata()) + .transformPaths(path -> StringData.fromString(path.toString())) + .buildPositionWriter(); + + default: + throw new UnsupportedOperationException( + "Cannot write pos-deletes for unsupported file format: " + format); + } + } catch (IOException e) { + throw new UncheckedIOException(e); + } + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkFileWriterFactory.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkFileWriterFactory.java new file mode 100644 index 000000000000..2183fe062af4 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkFileWriterFactory.java @@ -0,0 +1,293 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink; + +import static org.apache.iceberg.MetadataColumns.DELETE_FILE_ROW_FIELD_NAME; +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT_DEFAULT; +import static org.apache.iceberg.TableProperties.DELETE_DEFAULT_FILE_FORMAT; + +import java.io.Serializable; +import java.util.Map; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SortOrder; +import org.apache.iceberg.Table; +import org.apache.iceberg.avro.Avro; +import org.apache.iceberg.data.BaseFileWriterFactory; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.data.FlinkAvroWriter; +import org.apache.iceberg.flink.data.FlinkOrcWriter; +import org.apache.iceberg.flink.data.FlinkParquetWriters; +import org.apache.iceberg.io.DeleteSchemaUtil; +import org.apache.iceberg.orc.ORC; +import org.apache.iceberg.parquet.Parquet; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; + +class FlinkFileWriterFactory extends BaseFileWriterFactory implements Serializable { + private RowType dataFlinkType; + private RowType equalityDeleteFlinkType; + private RowType positionDeleteFlinkType; + + FlinkFileWriterFactory( + Table table, + FileFormat dataFileFormat, + Schema dataSchema, + RowType dataFlinkType, + SortOrder dataSortOrder, + FileFormat deleteFileFormat, + int[] equalityFieldIds, + Schema equalityDeleteRowSchema, + RowType equalityDeleteFlinkType, + SortOrder equalityDeleteSortOrder, + Schema positionDeleteRowSchema, + RowType positionDeleteFlinkType) { + + super( + table, + dataFileFormat, + dataSchema, + dataSortOrder, + deleteFileFormat, + equalityFieldIds, + equalityDeleteRowSchema, + equalityDeleteSortOrder, + positionDeleteRowSchema); + + this.dataFlinkType = dataFlinkType; + this.equalityDeleteFlinkType = equalityDeleteFlinkType; + this.positionDeleteFlinkType = positionDeleteFlinkType; + } + + static Builder builderFor(Table table) { + return new Builder(table); + } + + @Override + protected void configureDataWrite(Avro.DataWriteBuilder builder) { + builder.createWriterFunc(ignore -> new FlinkAvroWriter(dataFlinkType())); + } + + @Override + protected void configureEqualityDelete(Avro.DeleteWriteBuilder builder) { + builder.createWriterFunc(ignored -> new FlinkAvroWriter(equalityDeleteFlinkType())); + } + + @Override + protected void configurePositionDelete(Avro.DeleteWriteBuilder builder) { + int rowFieldIndex = positionDeleteFlinkType().getFieldIndex(DELETE_FILE_ROW_FIELD_NAME); + if (rowFieldIndex >= 0) { + // FlinkAvroWriter accepts just the Flink type of the row ignoring the path and pos + RowType positionDeleteRowFlinkType = + (RowType) 
positionDeleteFlinkType().getTypeAt(rowFieldIndex); + builder.createWriterFunc(ignored -> new FlinkAvroWriter(positionDeleteRowFlinkType)); + } + } + + @Override + protected void configureDataWrite(Parquet.DataWriteBuilder builder) { + builder.createWriterFunc(msgType -> FlinkParquetWriters.buildWriter(dataFlinkType(), msgType)); + } + + @Override + protected void configureEqualityDelete(Parquet.DeleteWriteBuilder builder) { + builder.createWriterFunc( + msgType -> FlinkParquetWriters.buildWriter(equalityDeleteFlinkType(), msgType)); + } + + @Override + protected void configurePositionDelete(Parquet.DeleteWriteBuilder builder) { + builder.createWriterFunc( + msgType -> FlinkParquetWriters.buildWriter(positionDeleteFlinkType(), msgType)); + builder.transformPaths(path -> StringData.fromString(path.toString())); + } + + @Override + protected void configureDataWrite(ORC.DataWriteBuilder builder) { + builder.createWriterFunc( + (iSchema, typDesc) -> FlinkOrcWriter.buildWriter(dataFlinkType(), iSchema)); + } + + @Override + protected void configureEqualityDelete(ORC.DeleteWriteBuilder builder) { + builder.createWriterFunc( + (iSchema, typDesc) -> FlinkOrcWriter.buildWriter(equalityDeleteFlinkType(), iSchema)); + } + + @Override + protected void configurePositionDelete(ORC.DeleteWriteBuilder builder) { + builder.createWriterFunc( + (iSchema, typDesc) -> FlinkOrcWriter.buildWriter(positionDeleteFlinkType(), iSchema)); + builder.transformPaths(path -> StringData.fromString(path.toString())); + } + + private RowType dataFlinkType() { + if (dataFlinkType == null) { + Preconditions.checkNotNull(dataSchema(), "Data schema must not be null"); + this.dataFlinkType = FlinkSchemaUtil.convert(dataSchema()); + } + + return dataFlinkType; + } + + private RowType equalityDeleteFlinkType() { + if (equalityDeleteFlinkType == null) { + Preconditions.checkNotNull( + equalityDeleteRowSchema(), "Equality delete schema must not be null"); + this.equalityDeleteFlinkType = FlinkSchemaUtil.convert(equalityDeleteRowSchema()); + } + + return equalityDeleteFlinkType; + } + + private RowType positionDeleteFlinkType() { + if (positionDeleteFlinkType == null) { + // wrap the optional row schema into the position delete schema that contains path and + // position + Schema positionDeleteSchema = DeleteSchemaUtil.posDeleteSchema(positionDeleteRowSchema()); + this.positionDeleteFlinkType = FlinkSchemaUtil.convert(positionDeleteSchema); + } + + return positionDeleteFlinkType; + } + + static class Builder { + private final Table table; + private FileFormat dataFileFormat; + private Schema dataSchema; + private RowType dataFlinkType; + private SortOrder dataSortOrder; + private FileFormat deleteFileFormat; + private int[] equalityFieldIds; + private Schema equalityDeleteRowSchema; + private RowType equalityDeleteFlinkType; + private SortOrder equalityDeleteSortOrder; + private Schema positionDeleteRowSchema; + private RowType positionDeleteFlinkType; + + Builder(Table table) { + this.table = table; + + Map properties = table.properties(); + + String dataFileFormatName = + properties.getOrDefault(DEFAULT_FILE_FORMAT, DEFAULT_FILE_FORMAT_DEFAULT); + this.dataFileFormat = FileFormat.fromString(dataFileFormatName); + + String deleteFileFormatName = + properties.getOrDefault(DELETE_DEFAULT_FILE_FORMAT, dataFileFormatName); + this.deleteFileFormat = FileFormat.fromString(deleteFileFormatName); + } + + Builder dataFileFormat(FileFormat newDataFileFormat) { + this.dataFileFormat = newDataFileFormat; + return this; + } + + Builder 
dataSchema(Schema newDataSchema) { + this.dataSchema = newDataSchema; + return this; + } + + /** + * Sets a Flink type for data. + * + *
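   * <p>A minimal construction sketch (illustrative only, not part of this patch; {@code table} is
   * assumed to be an already loaded Iceberg {@link Table}):
   *
   * <pre>{@code
   * FlinkFileWriterFactory writerFactory =
   *     FlinkFileWriterFactory.builderFor(table)
   *         .dataSchema(table.schema())
   *         .build();
   * }</pre>
   *
   * <p>Because no Flink type is set in this sketch, {@code dataFlinkType()} falls back to
   * {@code FlinkSchemaUtil.convert(dataSchema())}.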
<p>
If not set, the value is derived from the provided Iceberg schema. + */ + Builder dataFlinkType(RowType newDataFlinkType) { + this.dataFlinkType = newDataFlinkType; + return this; + } + + Builder dataSortOrder(SortOrder newDataSortOrder) { + this.dataSortOrder = newDataSortOrder; + return this; + } + + Builder deleteFileFormat(FileFormat newDeleteFileFormat) { + this.deleteFileFormat = newDeleteFileFormat; + return this; + } + + Builder equalityFieldIds(int[] newEqualityFieldIds) { + this.equalityFieldIds = newEqualityFieldIds; + return this; + } + + Builder equalityDeleteRowSchema(Schema newEqualityDeleteRowSchema) { + this.equalityDeleteRowSchema = newEqualityDeleteRowSchema; + return this; + } + + /** + * Sets a Flink type for equality deletes. + * + *
<p>
If not set, the value is derived from the provided Iceberg schema. + */ + Builder equalityDeleteFlinkType(RowType newEqualityDeleteFlinkType) { + this.equalityDeleteFlinkType = newEqualityDeleteFlinkType; + return this; + } + + Builder equalityDeleteSortOrder(SortOrder newEqualityDeleteSortOrder) { + this.equalityDeleteSortOrder = newEqualityDeleteSortOrder; + return this; + } + + Builder positionDeleteRowSchema(Schema newPositionDeleteRowSchema) { + this.positionDeleteRowSchema = newPositionDeleteRowSchema; + return this; + } + + /** + * Sets a Flink type for position deletes. + * + *
<p>
If not set, the value is derived from the provided Iceberg schema. + */ + Builder positionDeleteFlinkType(RowType newPositionDeleteFlinkType) { + this.positionDeleteFlinkType = newPositionDeleteFlinkType; + return this; + } + + FlinkFileWriterFactory build() { + boolean noEqualityDeleteConf = equalityFieldIds == null && equalityDeleteRowSchema == null; + boolean fullEqualityDeleteConf = equalityFieldIds != null && equalityDeleteRowSchema != null; + Preconditions.checkArgument( + noEqualityDeleteConf || fullEqualityDeleteConf, + "Equality field IDs and equality delete row schema must be set together"); + + return new FlinkFileWriterFactory( + table, + dataFileFormat, + dataSchema, + dataFlinkType, + dataSortOrder, + deleteFileFormat, + equalityFieldIds, + equalityDeleteRowSchema, + equalityDeleteFlinkType, + equalityDeleteSortOrder, + positionDeleteRowSchema, + positionDeleteFlinkType); + } + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkManifestUtil.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkManifestUtil.java new file mode 100644 index 000000000000..c7e8a2dea7cb --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkManifestUtil.java @@ -0,0 +1,132 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink; + +import java.io.IOException; +import java.util.List; +import java.util.Map; +import java.util.function.Supplier; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.DeleteFile; +import org.apache.iceberg.ManifestFile; +import org.apache.iceberg.ManifestFiles; +import org.apache.iceberg.ManifestWriter; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Table; +import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.io.FileIO; +import org.apache.iceberg.io.OutputFile; +import org.apache.iceberg.io.WriteResult; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; + +class FlinkManifestUtil { + private static final int FORMAT_V2 = 2; + private static final Long DUMMY_SNAPSHOT_ID = 0L; + + private FlinkManifestUtil() {} + + static ManifestFile writeDataFiles( + OutputFile outputFile, PartitionSpec spec, List dataFiles) throws IOException { + ManifestWriter writer = + ManifestFiles.write(FORMAT_V2, spec, outputFile, DUMMY_SNAPSHOT_ID); + + try (ManifestWriter closeableWriter = writer) { + closeableWriter.addAll(dataFiles); + } + + return writer.toManifestFile(); + } + + static List readDataFiles( + ManifestFile manifestFile, FileIO io, Map specsById) + throws IOException { + try (CloseableIterable dataFiles = ManifestFiles.read(manifestFile, io, specsById)) { + return Lists.newArrayList(dataFiles); + } + } + + static ManifestOutputFileFactory createOutputFileFactory( + Supplier
<Table>
tableSupplier, + Map tableProps, + String flinkJobId, + String operatorUniqueId, + int subTaskId, + long attemptNumber) { + return new ManifestOutputFileFactory( + tableSupplier, tableProps, flinkJobId, operatorUniqueId, subTaskId, attemptNumber); + } + + /** + * Write the {@link WriteResult} to temporary manifest files. + * + * @param result all those DataFiles/DeleteFiles in this WriteResult should be written with same + * partition spec + */ + static DeltaManifests writeCompletedFiles( + WriteResult result, Supplier outputFileSupplier, PartitionSpec spec) + throws IOException { + + ManifestFile dataManifest = null; + ManifestFile deleteManifest = null; + + // Write the completed data files into a newly created data manifest file. + if (result.dataFiles() != null && result.dataFiles().length > 0) { + dataManifest = + writeDataFiles(outputFileSupplier.get(), spec, Lists.newArrayList(result.dataFiles())); + } + + // Write the completed delete files into a newly created delete manifest file. + if (result.deleteFiles() != null && result.deleteFiles().length > 0) { + OutputFile deleteManifestFile = outputFileSupplier.get(); + + ManifestWriter deleteManifestWriter = + ManifestFiles.writeDeleteManifest(FORMAT_V2, spec, deleteManifestFile, DUMMY_SNAPSHOT_ID); + try (ManifestWriter writer = deleteManifestWriter) { + for (DeleteFile deleteFile : result.deleteFiles()) { + writer.add(deleteFile); + } + } + + deleteManifest = deleteManifestWriter.toManifestFile(); + } + + return new DeltaManifests(dataManifest, deleteManifest, result.referencedDataFiles()); + } + + static WriteResult readCompletedFiles( + DeltaManifests deltaManifests, FileIO io, Map specsById) + throws IOException { + WriteResult.Builder builder = WriteResult.builder(); + + // Read the completed data files from persisted data manifest file. + if (deltaManifests.dataManifest() != null) { + builder.addDataFiles(readDataFiles(deltaManifests.dataManifest(), io, specsById)); + } + + // Read the completed delete files from persisted delete manifests file. + if (deltaManifests.deleteManifest() != null) { + try (CloseableIterable deleteFiles = + ManifestFiles.readDeleteManifest(deltaManifests.deleteManifest(), io, specsById)) { + builder.addDeleteFiles(deleteFiles); + } + } + + return builder.addReferencedDataFiles(deltaManifests.referencedDataFiles()).build(); + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkSink.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkSink.java new file mode 100644 index 000000000000..769af7d77140 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkSink.java @@ -0,0 +1,654 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink; + +import static org.apache.iceberg.TableProperties.AVRO_COMPRESSION; +import static org.apache.iceberg.TableProperties.AVRO_COMPRESSION_LEVEL; +import static org.apache.iceberg.TableProperties.ORC_COMPRESSION; +import static org.apache.iceberg.TableProperties.ORC_COMPRESSION_STRATEGY; +import static org.apache.iceberg.TableProperties.PARQUET_COMPRESSION; +import static org.apache.iceberg.TableProperties.PARQUET_COMPRESSION_LEVEL; +import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.time.Duration; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.function.Function; +import org.apache.flink.api.common.functions.MapFunction; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.common.typeinfo.Types; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.configuration.ReadableConfig; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.datastream.DataStreamSink; +import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator; +import org.apache.flink.streaming.api.functions.sink.DiscardingSink; +import org.apache.flink.table.api.TableSchema; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.util.DataFormatConverters; +import org.apache.flink.table.types.DataType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.types.Row; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.DistributionMode; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.PartitionField; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SerializableTable; +import org.apache.iceberg.Table; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.FlinkWriteConf; +import org.apache.iceberg.flink.FlinkWriteOptions; +import org.apache.iceberg.flink.TableLoader; +import org.apache.iceberg.flink.util.FlinkCompatibilityUtil; +import org.apache.iceberg.io.WriteResult; +import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.relocated.com.google.common.collect.Sets; +import org.apache.iceberg.types.TypeUtil; +import org.apache.iceberg.util.SerializableSupplier; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class FlinkSink { + private static final Logger LOG = LoggerFactory.getLogger(FlinkSink.class); + + private static final String ICEBERG_STREAM_WRITER_NAME = + IcebergStreamWriter.class.getSimpleName(); + private static final String ICEBERG_FILES_COMMITTER_NAME = + IcebergFilesCommitter.class.getSimpleName(); + + private FlinkSink() {} + + /** + * Initialize a {@link Builder} to export the data from generic input data stream into iceberg + * table. We use {@link RowData} inside the sink connector, so users need to provide a mapper + * function and a {@link TypeInformation} to convert those generic records to a RowData + * DataStream. + * + * @param input the generic source input data stream. 
+ * @param mapper function to convert the generic data to {@link RowData} + * @param outputType to define the {@link TypeInformation} for the input data. + * @param the data type of records. + * @return {@link Builder} to connect the iceberg table. + */ + public static Builder builderFor( + DataStream input, MapFunction mapper, TypeInformation outputType) { + return new Builder().forMapperOutputType(input, mapper, outputType); + } + + /** + * Initialize a {@link Builder} to export the data from input data stream with {@link Row}s into + * iceberg table. We use {@link RowData} inside the sink connector, so users need to provide a + * {@link TableSchema} for builder to convert those {@link Row}s to a {@link RowData} DataStream. + * + * @param input the source input data stream with {@link Row}s. + * @param tableSchema defines the {@link TypeInformation} for input data. + * @return {@link Builder} to connect the iceberg table. + */ + public static Builder forRow(DataStream input, TableSchema tableSchema) { + RowType rowType = (RowType) tableSchema.toRowDataType().getLogicalType(); + DataType[] fieldDataTypes = tableSchema.getFieldDataTypes(); + + DataFormatConverters.RowConverter rowConverter = + new DataFormatConverters.RowConverter(fieldDataTypes); + return builderFor(input, rowConverter::toInternal, FlinkCompatibilityUtil.toTypeInfo(rowType)) + .tableSchema(tableSchema); + } + + /** + * Initialize a {@link Builder} to export the data from input data stream with {@link RowData}s + * into iceberg table. + * + * @param input the source input data stream with {@link RowData}s. + * @return {@link Builder} to connect the iceberg table. + */ + public static Builder forRowData(DataStream input) { + return new Builder().forRowData(input); + } + + public static class Builder { + private Function> inputCreator = null; + private TableLoader tableLoader; + private Table table; + private TableSchema tableSchema; + private List equalityFieldColumns = null; + private String uidPrefix = null; + private final Map snapshotProperties = Maps.newHashMap(); + private ReadableConfig readableConfig = new Configuration(); + private final Map writeOptions = Maps.newHashMap(); + private FlinkWriteConf flinkWriteConf = null; + + private Builder() {} + + private Builder forRowData(DataStream newRowDataInput) { + this.inputCreator = ignored -> newRowDataInput; + return this; + } + + private Builder forMapperOutputType( + DataStream input, MapFunction mapper, TypeInformation outputType) { + this.inputCreator = + newUidPrefix -> { + // Input stream order is crucial for some situation(e.g. in cdc case). Therefore, we + // need to set the parallelism + // of map operator same as its input to keep map operator chaining its input, and avoid + // rebalanced by default. + SingleOutputStreamOperator inputStream = + input.map(mapper, outputType).setParallelism(input.getParallelism()); + if (newUidPrefix != null) { + inputStream.name(operatorName(newUidPrefix)).uid(newUidPrefix + "-mapper"); + } + return inputStream; + }; + return this; + } + + /** + * This iceberg {@link Table} instance is used for initializing {@link IcebergStreamWriter} + * which will write all the records into {@link DataFile}s and emit them to downstream operator. + * Providing a table would avoid so many table loading from each separate task. + * + * @param newTable the loaded iceberg table instance. + * @return {@link Builder} to connect the iceberg table. 
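   * <p>A usage sketch (illustrative only, not part of this patch; {@code dataStream}, {@code table}
   * and {@code tableLoader} are assumed to exist in the caller):
   *
   * <pre>{@code
   * FlinkSink.forRowData(dataStream)
   *     .table(table)              // avoids each task re-loading the table
   *     .tableLoader(tableLoader)  // still needed: the committer loads the table lazily
   *     .upsert(true)
   *     .equalityFieldColumns(Lists.newArrayList("id"))
   *     .uidPrefix("iceberg-sink")
   *     .append();
   * }</pre>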
+ */ + public Builder table(Table newTable) { + this.table = newTable; + return this; + } + + /** + * The table loader is used for loading tables in {@link IcebergFilesCommitter} lazily, we need + * this loader because {@link Table} is not serializable and could not just use the loaded table + * from Builder#table in the remote task manager. + * + * @param newTableLoader to load iceberg table inside tasks. + * @return {@link Builder} to connect the iceberg table. + */ + public Builder tableLoader(TableLoader newTableLoader) { + this.tableLoader = newTableLoader; + return this; + } + + /** + * Set the write properties for Flink sink. View the supported properties in {@link + * FlinkWriteOptions} + */ + public Builder set(String property, String value) { + writeOptions.put(property, value); + return this; + } + + /** + * Set the write properties for Flink sink. View the supported properties in {@link + * FlinkWriteOptions} + */ + public Builder setAll(Map properties) { + writeOptions.putAll(properties); + return this; + } + + public Builder tableSchema(TableSchema newTableSchema) { + this.tableSchema = newTableSchema; + return this; + } + + public Builder overwrite(boolean newOverwrite) { + writeOptions.put(FlinkWriteOptions.OVERWRITE_MODE.key(), Boolean.toString(newOverwrite)); + return this; + } + + public Builder flinkConf(ReadableConfig config) { + this.readableConfig = config; + return this; + } + + /** + * Configure the write {@link DistributionMode} that the flink sink will use. Currently, flink + * support {@link DistributionMode#NONE} and {@link DistributionMode#HASH}. + * + * @param mode to specify the write distribution mode. + * @return {@link Builder} to connect the iceberg table. + */ + public Builder distributionMode(DistributionMode mode) { + Preconditions.checkArgument( + !DistributionMode.RANGE.equals(mode), + "Flink does not support 'range' write distribution mode now."); + if (mode != null) { + writeOptions.put(FlinkWriteOptions.DISTRIBUTION_MODE.key(), mode.modeName()); + } + return this; + } + + /** + * Configuring the write parallel number for iceberg stream writer. + * + * @param newWriteParallelism the number of parallel iceberg stream writer. + * @return {@link Builder} to connect the iceberg table. + */ + public Builder writeParallelism(int newWriteParallelism) { + writeOptions.put( + FlinkWriteOptions.WRITE_PARALLELISM.key(), Integer.toString(newWriteParallelism)); + return this; + } + + /** + * All INSERT/UPDATE_AFTER events from input stream will be transformed to UPSERT events, which + * means it will DELETE the old records and then INSERT the new records. In partitioned table, + * the partition fields should be a subset of equality fields, otherwise the old row that + * located in partition-A could not be deleted by the new row that located in partition-B. + * + * @param enabled indicate whether it should transform all INSERT/UPDATE_AFTER events to UPSERT. + * @return {@link Builder} to connect the iceberg table. + */ + public Builder upsert(boolean enabled) { + writeOptions.put(FlinkWriteOptions.WRITE_UPSERT_ENABLED.key(), Boolean.toString(enabled)); + return this; + } + + /** + * Configuring the equality field columns for iceberg table that accept CDC or UPSERT events. + * + * @param columns defines the iceberg table's key. + * @return {@link Builder} to connect the iceberg table. + */ + public Builder equalityFieldColumns(List columns) { + this.equalityFieldColumns = columns; + return this; + } + + /** + * Set the uid prefix for FlinkSink operators. 
Note that FlinkSink internally consists of + multiple operators (like writer, committer, dummy sink, etc.). The actual operator uids are formed + by appending a suffix to this prefix, e.g. "uidPrefix-writer".
+ *
+ * If provided, this prefix is also applied to operator names.
+ *
+ * Flink auto-generates operator uids if they are not set explicitly. It is a recommended + best practice to set uids for all operators before deploying to production. Flink provides the + option {@code pipeline.auto-generate-uid=false} to disable auto-generation and force + explicit setting of all operator uids.
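   * <p>A hedged illustration of that option (assuming Flink's {@code PipelineOptions}; not part of
   * this patch):
   *
   * <pre>{@code
   * Configuration flinkConf = new Configuration();
   * flinkConf.set(PipelineOptions.AUTO_GENERATE_UIDS, false);
   * }</pre>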
+ *
+ * Be careful with setting this for an existing job, because now we are changing the operator + * uid from an auto-generated one to this new value. When deploying the change with a + * checkpoint, Flink won't be able to restore the previous Flink sink operator state (more + * specifically the committer operator state). You need to use {@code --allowNonRestoredState} + * to ignore the previous sink state. During restore Flink sink state is used to check if last + * commit was actually successful or not. {@code --allowNonRestoredState} can lead to data loss + * if the Iceberg commit failed in the last completed checkpoint. + * + * @param newPrefix prefix for Flink sink operator uid and name + * @return {@link Builder} to connect the iceberg table. + */ + public Builder uidPrefix(String newPrefix) { + this.uidPrefix = newPrefix; + return this; + } + + public Builder setSnapshotProperties(Map properties) { + snapshotProperties.putAll(properties); + return this; + } + + public Builder setSnapshotProperty(String property, String value) { + snapshotProperties.put(property, value); + return this; + } + + public Builder toBranch(String branch) { + writeOptions.put(FlinkWriteOptions.BRANCH.key(), branch); + return this; + } + + private DataStreamSink chainIcebergOperators() { + Preconditions.checkArgument( + inputCreator != null, + "Please use forRowData() or forMapperOutputType() to initialize the input DataStream."); + Preconditions.checkNotNull(tableLoader, "Table loader shouldn't be null"); + + DataStream rowDataInput = inputCreator.apply(uidPrefix); + + if (table == null) { + if (!tableLoader.isOpen()) { + tableLoader.open(); + } + + try (TableLoader loader = tableLoader) { + this.table = loader.loadTable(); + } catch (IOException e) { + throw new UncheckedIOException( + "Failed to load iceberg table from table loader: " + tableLoader, e); + } + } + + flinkWriteConf = new FlinkWriteConf(table, writeOptions, readableConfig); + + // Find out the equality field id list based on the user-provided equality field column names. + List equalityFieldIds = checkAndGetEqualityFieldIds(); + + // Convert the requested flink table schema to flink row type. + RowType flinkRowType = toFlinkRowType(table.schema(), tableSchema); + + // Distribute the records from input data stream based on the write.distribution-mode and + // equality fields. + DataStream distributeStream = + distributeDataStream( + rowDataInput, equalityFieldIds, table.spec(), table.schema(), flinkRowType); + + // Add parallel writers that append rows to files + SingleOutputStreamOperator writerStream = + appendWriter(distributeStream, flinkRowType, equalityFieldIds); + + // Add single-parallelism committer that commits files + // after successful checkpoint or end of input + SingleOutputStreamOperator committerStream = appendCommitter(writerStream); + + // Add dummy discard sink + return appendDummySink(committerStream); + } + + /** + * Append the iceberg sink operators to write records to iceberg table. + * + * @return {@link DataStreamSink} for sink. + */ + public DataStreamSink append() { + return chainIcebergOperators(); + } + + private String operatorName(String suffix) { + return uidPrefix != null ? 
uidPrefix + "-" + suffix : suffix; + } + + @VisibleForTesting + List checkAndGetEqualityFieldIds() { + List equalityFieldIds = Lists.newArrayList(table.schema().identifierFieldIds()); + if (equalityFieldColumns != null && !equalityFieldColumns.isEmpty()) { + Set equalityFieldSet = + Sets.newHashSetWithExpectedSize(equalityFieldColumns.size()); + for (String column : equalityFieldColumns) { + org.apache.iceberg.types.Types.NestedField field = table.schema().findField(column); + Preconditions.checkNotNull( + field, + "Missing required equality field column '%s' in table schema %s", + column, + table.schema()); + equalityFieldSet.add(field.fieldId()); + } + + if (!equalityFieldSet.equals(table.schema().identifierFieldIds())) { + LOG.warn( + "The configured equality field column IDs {} are not matched with the schema identifier field IDs" + + " {}, use job specified equality field columns as the equality fields by default.", + equalityFieldSet, + table.schema().identifierFieldIds()); + } + equalityFieldIds = Lists.newArrayList(equalityFieldSet); + } + return equalityFieldIds; + } + + @SuppressWarnings("unchecked") + private DataStreamSink appendDummySink( + SingleOutputStreamOperator committerStream) { + DataStreamSink resultStream = + committerStream + .addSink(new DiscardingSink()) + .name(operatorName(String.format("IcebergSink %s", this.table.name()))) + .setParallelism(1); + if (uidPrefix != null) { + resultStream = resultStream.uid(uidPrefix + "-dummysink"); + } + return resultStream; + } + + private SingleOutputStreamOperator appendCommitter( + SingleOutputStreamOperator writerStream) { + IcebergFilesCommitter filesCommitter = + new IcebergFilesCommitter( + tableLoader, + flinkWriteConf.overwriteMode(), + snapshotProperties, + flinkWriteConf.workerPoolSize(), + flinkWriteConf.branch(), + table.spec()); + SingleOutputStreamOperator committerStream = + writerStream + .transform(operatorName(ICEBERG_FILES_COMMITTER_NAME), Types.VOID, filesCommitter) + .setParallelism(1) + .setMaxParallelism(1); + if (uidPrefix != null) { + committerStream = committerStream.uid(uidPrefix + "-committer"); + } + return committerStream; + } + + private SingleOutputStreamOperator appendWriter( + DataStream input, RowType flinkRowType, List equalityFieldIds) { + // Validate the equality fields and partition fields if we enable the upsert mode. + if (flinkWriteConf.upsertMode()) { + Preconditions.checkState( + !flinkWriteConf.overwriteMode(), + "OVERWRITE mode shouldn't be enable when configuring to use UPSERT data stream."); + Preconditions.checkState( + !equalityFieldIds.isEmpty(), + "Equality field columns shouldn't be empty when configuring to use UPSERT data stream."); + if (!table.spec().isUnpartitioned()) { + for (PartitionField partitionField : table.spec().fields()) { + Preconditions.checkState( + equalityFieldIds.contains(partitionField.sourceId()), + "In UPSERT mode, partition field '%s' should be included in equality fields: '%s'", + partitionField, + equalityFieldColumns); + } + } + } + + SerializableTable serializableTable = (SerializableTable) SerializableTable.copyOf(table); + Duration tableRefreshInterval = flinkWriteConf.tableRefreshInterval(); + + SerializableSupplier
tableSupplier; + if (tableRefreshInterval != null) { + tableSupplier = + new CachingTableSupplier(serializableTable, tableLoader, tableRefreshInterval); + } else { + tableSupplier = () -> serializableTable; + } + + IcebergStreamWriter streamWriter = + createStreamWriter(tableSupplier, flinkWriteConf, flinkRowType, equalityFieldIds); + + int parallelism = + flinkWriteConf.writeParallelism() == null + ? input.getParallelism() + : flinkWriteConf.writeParallelism(); + SingleOutputStreamOperator writerStream = + input + .transform( + operatorName(ICEBERG_STREAM_WRITER_NAME), + TypeInformation.of(WriteResult.class), + streamWriter) + .setParallelism(parallelism); + if (uidPrefix != null) { + writerStream = writerStream.uid(uidPrefix + "-writer"); + } + return writerStream; + } + + private DataStream distributeDataStream( + DataStream input, + List equalityFieldIds, + PartitionSpec partitionSpec, + Schema iSchema, + RowType flinkRowType) { + DistributionMode writeMode = flinkWriteConf.distributionMode(); + + LOG.info("Write distribution mode is '{}'", writeMode.modeName()); + switch (writeMode) { + case NONE: + if (equalityFieldIds.isEmpty()) { + return input; + } else { + LOG.info("Distribute rows by equality fields, because there are equality fields set"); + return input.keyBy( + new EqualityFieldKeySelector(iSchema, flinkRowType, equalityFieldIds)); + } + + case HASH: + if (equalityFieldIds.isEmpty()) { + if (partitionSpec.isUnpartitioned()) { + LOG.warn( + "Fallback to use 'none' distribution mode, because there are no equality fields set " + + "and table is unpartitioned"); + return input; + } else { + return input.keyBy(new PartitionKeySelector(partitionSpec, iSchema, flinkRowType)); + } + } else { + if (partitionSpec.isUnpartitioned()) { + LOG.info( + "Distribute rows by equality fields, because there are equality fields set " + + "and table is unpartitioned"); + return input.keyBy( + new EqualityFieldKeySelector(iSchema, flinkRowType, equalityFieldIds)); + } else { + for (PartitionField partitionField : partitionSpec.fields()) { + Preconditions.checkState( + equalityFieldIds.contains(partitionField.sourceId()), + "In 'hash' distribution mode with equality fields set, partition field '%s' " + + "should be included in equality fields: '%s'", + partitionField, + equalityFieldColumns); + } + return input.keyBy(new PartitionKeySelector(partitionSpec, iSchema, flinkRowType)); + } + } + + case RANGE: + if (equalityFieldIds.isEmpty()) { + LOG.warn( + "Fallback to use 'none' distribution mode, because there are no equality fields set " + + "and {}=range is not supported yet in flink", + WRITE_DISTRIBUTION_MODE); + return input; + } else { + LOG.info( + "Distribute rows by equality fields, because there are equality fields set " + + "and{}=range is not supported yet in flink", + WRITE_DISTRIBUTION_MODE); + return input.keyBy( + new EqualityFieldKeySelector(iSchema, flinkRowType, equalityFieldIds)); + } + + default: + throw new RuntimeException("Unrecognized " + WRITE_DISTRIBUTION_MODE + ": " + writeMode); + } + } + } + + static RowType toFlinkRowType(Schema schema, TableSchema requestedSchema) { + if (requestedSchema != null) { + // Convert the flink schema to iceberg schema firstly, then reassign ids to match the existing + // iceberg schema. + Schema writeSchema = TypeUtil.reassignIds(FlinkSchemaUtil.convert(requestedSchema), schema); + TypeUtil.validateWriteSchema(schema, writeSchema, true, true); + + // We use this flink schema to read values from RowData. 
The flink's TINYINT and SMALLINT will + // be promoted to + // iceberg INTEGER, that means if we use iceberg's table schema to read TINYINT (backend by 1 + // 'byte'), we will + // read 4 bytes rather than 1 byte, it will mess up the byte array in BinaryRowData. So here + // we must use flink + // schema. + return (RowType) requestedSchema.toRowDataType().getLogicalType(); + } else { + return FlinkSchemaUtil.convert(schema); + } + } + + static IcebergStreamWriter createStreamWriter( + SerializableSupplier
tableSupplier, + FlinkWriteConf flinkWriteConf, + RowType flinkRowType, + List equalityFieldIds) { + Preconditions.checkArgument(tableSupplier != null, "Iceberg table supplier shouldn't be null"); + + Table initTable = tableSupplier.get(); + FileFormat format = flinkWriteConf.dataFileFormat(); + TaskWriterFactory taskWriterFactory = + new RowDataTaskWriterFactory( + tableSupplier, + flinkRowType, + flinkWriteConf.targetDataFileSize(), + format, + writeProperties(initTable, format, flinkWriteConf), + equalityFieldIds, + flinkWriteConf.upsertMode()); + + return new IcebergStreamWriter<>(initTable.name(), taskWriterFactory); + } + + /** + * Based on the {@link FileFormat} overwrites the table level compression properties for the table + * write. + * + * @param table The table to get the table level settings + * @param format The FileFormat to use + * @param conf The write configuration + * @return The properties to use for writing + */ + private static Map writeProperties( + Table table, FileFormat format, FlinkWriteConf conf) { + Map writeProperties = Maps.newHashMap(table.properties()); + + switch (format) { + case PARQUET: + writeProperties.put(PARQUET_COMPRESSION, conf.parquetCompressionCodec()); + String parquetCompressionLevel = conf.parquetCompressionLevel(); + if (parquetCompressionLevel != null) { + writeProperties.put(PARQUET_COMPRESSION_LEVEL, parquetCompressionLevel); + } + + break; + case AVRO: + writeProperties.put(AVRO_COMPRESSION, conf.avroCompressionCodec()); + String avroCompressionLevel = conf.avroCompressionLevel(); + if (avroCompressionLevel != null) { + writeProperties.put(AVRO_COMPRESSION_LEVEL, conf.avroCompressionLevel()); + } + + break; + case ORC: + writeProperties.put(ORC_COMPRESSION, conf.orcCompressionCodec()); + writeProperties.put(ORC_COMPRESSION_STRATEGY, conf.orcCompressionStrategy()); + break; + default: + throw new IllegalArgumentException(String.format("Unknown file format %s", format)); + } + + return writeProperties; + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitter.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitter.java new file mode 100644 index 000000000000..b9bceaa9311d --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitter.java @@ -0,0 +1,516 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Comparator; +import java.util.List; +import java.util.Map; +import java.util.NavigableMap; +import java.util.SortedMap; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.TimeUnit; +import org.apache.flink.api.common.state.ListState; +import org.apache.flink.api.common.state.ListStateDescriptor; +import org.apache.flink.api.common.typeinfo.BasicTypeInfo; +import org.apache.flink.api.common.typeinfo.PrimitiveArrayTypeInfo; +import org.apache.flink.core.io.SimpleVersionedSerialization; +import org.apache.flink.runtime.state.StateInitializationContext; +import org.apache.flink.runtime.state.StateSnapshotContext; +import org.apache.flink.streaming.api.operators.AbstractStreamOperator; +import org.apache.flink.streaming.api.operators.BoundedOneInput; +import org.apache.flink.streaming.api.operators.OneInputStreamOperator; +import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; +import org.apache.flink.table.runtime.typeutils.SortedMapTypeInfo; +import org.apache.iceberg.AppendFiles; +import org.apache.iceberg.ManifestFile; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.ReplacePartitions; +import org.apache.iceberg.RowDelta; +import org.apache.iceberg.Snapshot; +import org.apache.iceberg.SnapshotUpdate; +import org.apache.iceberg.Table; +import org.apache.iceberg.flink.TableLoader; +import org.apache.iceberg.io.WriteResult; +import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; +import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.base.Strings; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.types.Comparators; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.util.PropertyUtil; +import org.apache.iceberg.util.ThreadPools; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +class IcebergFilesCommitter extends AbstractStreamOperator + implements OneInputStreamOperator, BoundedOneInput { + + private static final long serialVersionUID = 1L; + private static final long INITIAL_CHECKPOINT_ID = -1L; + private static final byte[] EMPTY_MANIFEST_DATA = new byte[0]; + + private static final Logger LOG = LoggerFactory.getLogger(IcebergFilesCommitter.class); + private static final String FLINK_JOB_ID = "flink.job-id"; + private static final String OPERATOR_ID = "flink.operator-id"; + + // The max checkpoint id we've committed to iceberg table. As the flink's checkpoint is always + // increasing, so we could correctly commit all the data files whose checkpoint id is greater than + // the max committed one to iceberg table, for avoiding committing the same data files twice. This + // id will be attached to iceberg's meta when committing the iceberg transaction. + private static final String MAX_COMMITTED_CHECKPOINT_ID = "flink.max-committed-checkpoint-id"; + static final String MAX_CONTINUOUS_EMPTY_COMMITS = "flink.max-continuous-empty-commits"; + + // TableLoader to load iceberg table lazily. 
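  // Illustrative sketch, not part of this patch: the committed checkpoint id is stored in the
  // snapshot summary under MAX_COMMITTED_CHECKPOINT_ID, so it can be read back roughly as below.
  // The real lookup in this class additionally matches flink.job-id and flink.operator-id in the
  // summary before trusting the value.
  private static long committedCheckpointIdFromSummary(Snapshot snapshot) {
    String value = snapshot.summary().get(MAX_COMMITTED_CHECKPOINT_ID);
    return value == null ? INITIAL_CHECKPOINT_ID : Long.parseLong(value);
  }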
+ private final TableLoader tableLoader; + private final boolean replacePartitions; + private final Map snapshotProperties; + + // A sorted map to maintain the completed data files for each pending checkpointId (which have not + // been committed to iceberg table). We need a sorted map here because there's possible that few + // checkpoints snapshot failed, for example: the 1st checkpoint have 2 data files <1, >, the 2st checkpoint have 1 data files <2, >. Snapshot for checkpoint#1 + // interrupted because of network/disk failure etc, while we don't expect any data loss in iceberg + // table. So we keep the finished files <1, > in memory and retry to commit iceberg + // table when the next checkpoint happen. + private final NavigableMap dataFilesPerCheckpoint = Maps.newTreeMap(); + + // The completed files cache for current checkpoint. Once the snapshot barrier received, it will + // be flushed to the 'dataFilesPerCheckpoint'. + private final List writeResultsOfCurrentCkpt = Lists.newArrayList(); + private final String branch; + + // It will have an unique identifier for one job. + private transient String flinkJobId; + private transient String operatorUniqueId; + private transient Table table; + private transient IcebergFilesCommitterMetrics committerMetrics; + private transient ManifestOutputFileFactory manifestOutputFileFactory; + private transient long maxCommittedCheckpointId; + private transient int continuousEmptyCheckpoints; + private transient int maxContinuousEmptyCommits; + // There're two cases that we restore from flink checkpoints: the first case is restoring from + // snapshot created by the same flink job; another case is restoring from snapshot created by + // another different job. For the second case, we need to maintain the old flink job's id in flink + // state backend to find the max-committed-checkpoint-id when traversing iceberg table's + // snapshots. + private static final ListStateDescriptor JOB_ID_DESCRIPTOR = + new ListStateDescriptor<>("iceberg-flink-job-id", BasicTypeInfo.STRING_TYPE_INFO); + private transient ListState jobIdState; + // All pending checkpoints states for this function. + private static final ListStateDescriptor> STATE_DESCRIPTOR = + buildStateDescriptor(); + private transient ListState> checkpointsState; + + private final Integer workerPoolSize; + private final PartitionSpec spec; + private transient ExecutorService workerPool; + + IcebergFilesCommitter( + TableLoader tableLoader, + boolean replacePartitions, + Map snapshotProperties, + Integer workerPoolSize, + String branch, + PartitionSpec spec) { + this.tableLoader = tableLoader; + this.replacePartitions = replacePartitions; + this.snapshotProperties = snapshotProperties; + this.workerPoolSize = workerPoolSize; + this.branch = branch; + this.spec = spec; + } + + @Override + public void initializeState(StateInitializationContext context) throws Exception { + super.initializeState(context); + this.flinkJobId = getContainingTask().getEnvironment().getJobID().toString(); + this.operatorUniqueId = getRuntimeContext().getOperatorUniqueID(); + + // Open the table loader and load the table. 
+ this.tableLoader.open(); + this.table = tableLoader.loadTable(); + this.committerMetrics = new IcebergFilesCommitterMetrics(super.metrics, table.name()); + + maxContinuousEmptyCommits = + PropertyUtil.propertyAsInt(table.properties(), MAX_CONTINUOUS_EMPTY_COMMITS, 10); + Preconditions.checkArgument( + maxContinuousEmptyCommits > 0, MAX_CONTINUOUS_EMPTY_COMMITS + " must be positive"); + + int subTaskId = getRuntimeContext().getIndexOfThisSubtask(); + int attemptId = getRuntimeContext().getAttemptNumber(); + this.manifestOutputFileFactory = + FlinkManifestUtil.createOutputFileFactory( + () -> table, table.properties(), flinkJobId, operatorUniqueId, subTaskId, attemptId); + this.maxCommittedCheckpointId = INITIAL_CHECKPOINT_ID; + + this.checkpointsState = context.getOperatorStateStore().getListState(STATE_DESCRIPTOR); + this.jobIdState = context.getOperatorStateStore().getListState(JOB_ID_DESCRIPTOR); + if (context.isRestored()) { + Iterable jobIdIterable = jobIdState.get(); + if (jobIdIterable == null || !jobIdIterable.iterator().hasNext()) { + LOG.warn( + "Failed to restore committer state. This can happen when operator uid changed and Flink " + + "allowNonRestoredState is enabled. Best practice is to explicitly set the operator id " + + "via FlinkSink#Builder#uidPrefix() so that the committer operator uid is stable. " + + "Otherwise, Flink auto generate an operator uid based on job topology." + + "With that, operator uid is subjective to change upon topology change."); + return; + } + + String restoredFlinkJobId = jobIdIterable.iterator().next(); + Preconditions.checkState( + !Strings.isNullOrEmpty(restoredFlinkJobId), + "Flink job id parsed from checkpoint snapshot shouldn't be null or empty"); + + // Since flink's checkpoint id will start from the max-committed-checkpoint-id + 1 in the new + // flink job even if it's restored from a snapshot created by another different flink job, so + // it's safe to assign the max committed checkpoint id from restored flink job to the current + // flink job. + this.maxCommittedCheckpointId = + getMaxCommittedCheckpointId(table, restoredFlinkJobId, operatorUniqueId, branch); + + NavigableMap uncommittedDataFiles = + Maps.newTreeMap(checkpointsState.get().iterator().next()) + .tailMap(maxCommittedCheckpointId, false); + if (!uncommittedDataFiles.isEmpty()) { + // Committed all uncommitted data files from the old flink job to iceberg table. + long maxUncommittedCheckpointId = uncommittedDataFiles.lastKey(); + commitUpToCheckpoint( + uncommittedDataFiles, restoredFlinkJobId, operatorUniqueId, maxUncommittedCheckpointId); + } + } + } + + @Override + public void snapshotState(StateSnapshotContext context) throws Exception { + super.snapshotState(context); + long checkpointId = context.getCheckpointId(); + LOG.info( + "Start to flush snapshot state to state backend, table: {}, checkpointId: {}", + table, + checkpointId); + + // Update the checkpoint state. + long startNano = System.nanoTime(); + dataFilesPerCheckpoint.put(checkpointId, writeToManifest(checkpointId)); + // Reset the snapshot state to the latest state. + checkpointsState.clear(); + checkpointsState.add(dataFilesPerCheckpoint); + + jobIdState.clear(); + jobIdState.add(flinkJobId); + + // Clear the local buffer for current checkpoint. 
+ writeResultsOfCurrentCkpt.clear(); + committerMetrics.checkpointDuration( + TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startNano)); + } + + @Override + public void notifyCheckpointComplete(long checkpointId) throws Exception { + super.notifyCheckpointComplete(checkpointId); + // It's possible that we have the following events: + // 1. snapshotState(ckpId); + // 2. snapshotState(ckpId+1); + // 3. notifyCheckpointComplete(ckpId+1); + // 4. notifyCheckpointComplete(ckpId); + // For step#4, we don't need to commit iceberg table again because in step#3 we've committed all + // the files, + // Besides, we need to maintain the max-committed-checkpoint-id to be increasing. + if (checkpointId > maxCommittedCheckpointId) { + LOG.info("Checkpoint {} completed. Attempting commit.", checkpointId); + commitUpToCheckpoint(dataFilesPerCheckpoint, flinkJobId, operatorUniqueId, checkpointId); + this.maxCommittedCheckpointId = checkpointId; + } else { + LOG.info( + "Skipping committing checkpoint {}. {} is already committed.", + checkpointId, + maxCommittedCheckpointId); + } + + // reload the table in case new configuration is needed + this.table = tableLoader.loadTable(); + } + + private void commitUpToCheckpoint( + NavigableMap deltaManifestsMap, + String newFlinkJobId, + String operatorId, + long checkpointId) + throws IOException { + NavigableMap pendingMap = deltaManifestsMap.headMap(checkpointId, true); + List manifests = Lists.newArrayList(); + NavigableMap pendingResults = Maps.newTreeMap(); + for (Map.Entry e : pendingMap.entrySet()) { + if (Arrays.equals(EMPTY_MANIFEST_DATA, e.getValue())) { + // Skip the empty flink manifest. + continue; + } + + DeltaManifests deltaManifests = + SimpleVersionedSerialization.readVersionAndDeSerialize( + DeltaManifestsSerializer.INSTANCE, e.getValue()); + pendingResults.put( + e.getKey(), + FlinkManifestUtil.readCompletedFiles(deltaManifests, table.io(), table.specs())); + manifests.addAll(deltaManifests.manifests()); + } + + CommitSummary summary = new CommitSummary(pendingResults); + commitPendingResult(pendingResults, summary, newFlinkJobId, operatorId, checkpointId); + committerMetrics.updateCommitSummary(summary); + pendingMap.clear(); + deleteCommittedManifests(manifests, newFlinkJobId, checkpointId); + } + + private void commitPendingResult( + NavigableMap pendingResults, + CommitSummary summary, + String newFlinkJobId, + String operatorId, + long checkpointId) { + long totalFiles = summary.dataFilesCount() + summary.deleteFilesCount(); + continuousEmptyCheckpoints = totalFiles == 0 ? continuousEmptyCheckpoints + 1 : 0; + if (totalFiles != 0 || continuousEmptyCheckpoints % maxContinuousEmptyCommits == 0) { + if (replacePartitions) { + replacePartitions(pendingResults, summary, newFlinkJobId, operatorId, checkpointId); + } else { + commitDeltaTxn(pendingResults, summary, newFlinkJobId, operatorId, checkpointId); + } + continuousEmptyCheckpoints = 0; + } else { + LOG.info("Skip commit for checkpoint {} due to no data files or delete files.", checkpointId); + } + } + + private void deleteCommittedManifests( + List manifests, String newFlinkJobId, long checkpointId) { + for (ManifestFile manifest : manifests) { + try { + table.io().deleteFile(manifest.path()); + } catch (Exception e) { + // The flink manifests cleaning failure shouldn't abort the completed checkpoint. 
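+        // Log the job id, checkpoint id and manifest path so the leftover manifest can be
+        // identified and cleaned up later.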
+ String details = + MoreObjects.toStringHelper(this) + .add("flinkJobId", newFlinkJobId) + .add("checkpointId", checkpointId) + .add("manifestPath", manifest.path()) + .toString(); + LOG.warn( + "The iceberg transaction has been committed, but we failed to clean the temporary flink manifests: {}", + details, + e); + } + } + } + + private void replacePartitions( + NavigableMap pendingResults, + CommitSummary summary, + String newFlinkJobId, + String operatorId, + long checkpointId) { + Preconditions.checkState( + summary.deleteFilesCount() == 0, "Cannot overwrite partitions with delete files."); + // Commit the overwrite transaction. + ReplacePartitions dynamicOverwrite = table.newReplacePartitions().scanManifestsWith(workerPool); + for (WriteResult result : pendingResults.values()) { + Preconditions.checkState( + result.referencedDataFiles().length == 0, "Should have no referenced data files."); + Arrays.stream(result.dataFiles()).forEach(dynamicOverwrite::addFile); + } + + commitOperation( + dynamicOverwrite, + summary, + "dynamic partition overwrite", + newFlinkJobId, + operatorId, + checkpointId); + } + + private void commitDeltaTxn( + NavigableMap pendingResults, + CommitSummary summary, + String newFlinkJobId, + String operatorId, + long checkpointId) { + if (summary.deleteFilesCount() == 0) { + // To be compatible with iceberg format V1. + AppendFiles appendFiles = table.newAppend().scanManifestsWith(workerPool); + for (WriteResult result : pendingResults.values()) { + Preconditions.checkState( + result.referencedDataFiles().length == 0, + "Should have no referenced data files for append."); + Arrays.stream(result.dataFiles()).forEach(appendFiles::appendFile); + } + commitOperation(appendFiles, summary, "append", newFlinkJobId, operatorId, checkpointId); + } else { + // To be compatible with iceberg format V2. + for (Map.Entry e : pendingResults.entrySet()) { + // We don't commit the merged result into a single transaction because for the sequential + // transaction txn1 and txn2, the equality-delete files of txn2 are required to be applied + // to data files from txn1. Committing the merged one will lead to the incorrect delete + // semantic. + WriteResult result = e.getValue(); + + // Row delta validations are not needed for streaming changes that write equality deletes. + // Equality deletes are applied to data in all previous sequence numbers, so retries may + // push deletes further in the future, but do not affect correctness. Position deletes + // committed to the table in this path are used only to delete rows from data files that are + // being added in this commit. There is no way for data files added along with the delete + // files to be concurrently removed, so there is no need to validate the files referenced by + // the position delete files that are being committed. 
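+        // scanManifestsWith(workerPool) points the commit at the shared worker pool created in
+        // open() for any manifest reads it needs to perform.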
+ RowDelta rowDelta = table.newRowDelta().scanManifestsWith(workerPool); + + Arrays.stream(result.dataFiles()).forEach(rowDelta::addRows); + Arrays.stream(result.deleteFiles()).forEach(rowDelta::addDeletes); + commitOperation(rowDelta, summary, "rowDelta", newFlinkJobId, operatorId, e.getKey()); + } + } + } + + private void commitOperation( + SnapshotUpdate operation, + CommitSummary summary, + String description, + String newFlinkJobId, + String operatorId, + long checkpointId) { + LOG.info( + "Committing {} for checkpoint {} to table {} branch {} with summary: {}", + description, + checkpointId, + table.name(), + branch, + summary); + snapshotProperties.forEach(operation::set); + // custom snapshot metadata properties will be overridden if they conflict with internal ones + // used by the sink. + operation.set(MAX_COMMITTED_CHECKPOINT_ID, Long.toString(checkpointId)); + operation.set(FLINK_JOB_ID, newFlinkJobId); + operation.set(OPERATOR_ID, operatorId); + operation.toBranch(branch); + + long startNano = System.nanoTime(); + operation.commit(); // abort is automatically called if this fails. + long durationMs = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startNano); + LOG.info( + "Committed {} to table: {}, branch: {}, checkpointId {} in {} ms", + description, + table.name(), + branch, + checkpointId, + durationMs); + committerMetrics.commitDuration(durationMs); + } + + @Override + public void processElement(StreamRecord element) { + this.writeResultsOfCurrentCkpt.add(element.getValue()); + } + + @Override + public void endInput() throws IOException { + // Flush the buffered data files into 'dataFilesPerCheckpoint' firstly. + long currentCheckpointId = Long.MAX_VALUE; + dataFilesPerCheckpoint.put(currentCheckpointId, writeToManifest(currentCheckpointId)); + writeResultsOfCurrentCkpt.clear(); + + commitUpToCheckpoint(dataFilesPerCheckpoint, flinkJobId, operatorUniqueId, currentCheckpointId); + } + + /** + * Write all the complete data files to a newly created manifest file and return the manifest's + * avro serialized bytes. + */ + private byte[] writeToManifest(long checkpointId) throws IOException { + if (writeResultsOfCurrentCkpt.isEmpty()) { + return EMPTY_MANIFEST_DATA; + } + + WriteResult result = WriteResult.builder().addAll(writeResultsOfCurrentCkpt).build(); + DeltaManifests deltaManifests = + FlinkManifestUtil.writeCompletedFiles( + result, () -> manifestOutputFileFactory.create(checkpointId), spec); + + return SimpleVersionedSerialization.writeVersionAndSerialize( + DeltaManifestsSerializer.INSTANCE, deltaManifests); + } + + @Override + public void open() throws Exception { + super.open(); + + final String operatorID = getRuntimeContext().getOperatorUniqueID(); + this.workerPool = + ThreadPools.newWorkerPool("iceberg-worker-pool-" + operatorID, workerPoolSize); + } + + @Override + public void close() throws Exception { + if (tableLoader != null) { + tableLoader.close(); + } + + if (workerPool != null) { + workerPool.shutdown(); + } + } + + @VisibleForTesting + static ListStateDescriptor> buildStateDescriptor() { + Comparator longComparator = Comparators.forType(Types.LongType.get()); + // Construct a SortedMapTypeInfo. 
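+    // Keys are checkpoint ids and values are the serialized manifest bytes; the explicit Long
+    // comparator keeps the restored state ordered by checkpoint id.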
+ SortedMapTypeInfo sortedMapTypeInfo = + new SortedMapTypeInfo<>( + BasicTypeInfo.LONG_TYPE_INFO, + PrimitiveArrayTypeInfo.BYTE_PRIMITIVE_ARRAY_TYPE_INFO, + longComparator); + return new ListStateDescriptor<>("iceberg-files-committer-state", sortedMapTypeInfo); + } + + static long getMaxCommittedCheckpointId( + Table table, String flinkJobId, String operatorId, String branch) { + Snapshot snapshot = table.snapshot(branch); + long lastCommittedCheckpointId = INITIAL_CHECKPOINT_ID; + + while (snapshot != null) { + Map summary = snapshot.summary(); + String snapshotFlinkJobId = summary.get(FLINK_JOB_ID); + String snapshotOperatorId = summary.get(OPERATOR_ID); + if (flinkJobId.equals(snapshotFlinkJobId) + && (snapshotOperatorId == null || snapshotOperatorId.equals(operatorId))) { + String value = summary.get(MAX_COMMITTED_CHECKPOINT_ID); + if (value != null) { + lastCommittedCheckpointId = Long.parseLong(value); + break; + } + } + Long parentSnapshotId = snapshot.parentId(); + snapshot = parentSnapshotId != null ? table.snapshot(parentSnapshotId) : null; + } + + return lastCommittedCheckpointId; + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitterMetrics.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitterMetrics.java new file mode 100644 index 000000000000..5b28c4acb1c5 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitterMetrics.java @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink; + +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicLong; +import org.apache.flink.metrics.Counter; +import org.apache.flink.metrics.MetricGroup; +import org.apache.iceberg.flink.util.ElapsedTimeGauge; + +class IcebergFilesCommitterMetrics { + private final AtomicLong lastCheckpointDurationMs = new AtomicLong(); + private final AtomicLong lastCommitDurationMs = new AtomicLong(); + private final ElapsedTimeGauge elapsedSecondsSinceLastSuccessfulCommit; + private final Counter committedDataFilesCount; + private final Counter committedDataFilesRecordCount; + private final Counter committedDataFilesByteCount; + private final Counter committedDeleteFilesCount; + private final Counter committedDeleteFilesRecordCount; + private final Counter committedDeleteFilesByteCount; + + IcebergFilesCommitterMetrics(MetricGroup metrics, String fullTableName) { + MetricGroup committerMetrics = + metrics.addGroup("IcebergFilesCommitter").addGroup("table", fullTableName); + committerMetrics.gauge("lastCheckpointDurationMs", lastCheckpointDurationMs::get); + committerMetrics.gauge("lastCommitDurationMs", lastCommitDurationMs::get); + this.elapsedSecondsSinceLastSuccessfulCommit = new ElapsedTimeGauge(TimeUnit.SECONDS); + committerMetrics.gauge( + "elapsedSecondsSinceLastSuccessfulCommit", elapsedSecondsSinceLastSuccessfulCommit); + this.committedDataFilesCount = committerMetrics.counter("committedDataFilesCount"); + this.committedDataFilesRecordCount = committerMetrics.counter("committedDataFilesRecordCount"); + this.committedDataFilesByteCount = committerMetrics.counter("committedDataFilesByteCount"); + this.committedDeleteFilesCount = committerMetrics.counter("committedDeleteFilesCount"); + this.committedDeleteFilesRecordCount = + committerMetrics.counter("committedDeleteFilesRecordCount"); + this.committedDeleteFilesByteCount = committerMetrics.counter("committedDeleteFilesByteCount"); + } + + void checkpointDuration(long checkpointDurationMs) { + lastCheckpointDurationMs.set(checkpointDurationMs); + } + + void commitDuration(long commitDurationMs) { + lastCommitDurationMs.set(commitDurationMs); + } + + /** This is called upon a successful commit. */ + void updateCommitSummary(CommitSummary stats) { + elapsedSecondsSinceLastSuccessfulCommit.refreshLastRecordedTime(); + committedDataFilesCount.inc(stats.dataFilesCount()); + committedDataFilesRecordCount.inc(stats.dataFilesRecordCount()); + committedDataFilesByteCount.inc(stats.dataFilesByteCount()); + committedDeleteFilesCount.inc(stats.deleteFilesCount()); + committedDeleteFilesRecordCount.inc(stats.deleteFilesRecordCount()); + committedDeleteFilesByteCount.inc(stats.deleteFilesByteCount()); + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriter.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriter.java new file mode 100644 index 000000000000..9ea0349fb057 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriter.java @@ -0,0 +1,120 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink; + +import java.io.IOException; +import java.util.concurrent.TimeUnit; +import org.apache.flink.streaming.api.operators.AbstractStreamOperator; +import org.apache.flink.streaming.api.operators.BoundedOneInput; +import org.apache.flink.streaming.api.operators.ChainingStrategy; +import org.apache.flink.streaming.api.operators.OneInputStreamOperator; +import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; +import org.apache.iceberg.io.TaskWriter; +import org.apache.iceberg.io.WriteResult; +import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; + +class IcebergStreamWriter extends AbstractStreamOperator + implements OneInputStreamOperator, BoundedOneInput { + + private static final long serialVersionUID = 1L; + + private final String fullTableName; + private final TaskWriterFactory taskWriterFactory; + + private transient TaskWriter writer; + private transient int subTaskId; + private transient int attemptId; + private transient IcebergStreamWriterMetrics writerMetrics; + + IcebergStreamWriter(String fullTableName, TaskWriterFactory taskWriterFactory) { + this.fullTableName = fullTableName; + this.taskWriterFactory = taskWriterFactory; + setChainingStrategy(ChainingStrategy.ALWAYS); + } + + @Override + public void open() { + this.subTaskId = getRuntimeContext().getIndexOfThisSubtask(); + this.attemptId = getRuntimeContext().getAttemptNumber(); + this.writerMetrics = new IcebergStreamWriterMetrics(super.metrics, fullTableName); + + // Initialize the task writer factory. + this.taskWriterFactory.initialize(subTaskId, attemptId); + + // Initialize the task writer. + this.writer = taskWriterFactory.create(); + } + + @Override + public void prepareSnapshotPreBarrier(long checkpointId) throws Exception { + flush(); + this.writer = taskWriterFactory.create(); + } + + @Override + public void processElement(StreamRecord element) throws Exception { + writer.write(element.getValue()); + } + + @Override + public void close() throws Exception { + super.close(); + if (writer != null) { + writer.close(); + writer = null; + } + } + + @Override + public void endInput() throws IOException { + // For bounded stream, it may don't enable the checkpoint mechanism so we'd better to emit the + // remaining completed files to downstream before closing the writer so that we won't miss any + // of them. + // Note that if the task is not closed after calling endInput, checkpoint may be triggered again + // causing files to be sent repeatedly, the writer is marked as null after the last file is sent + // to guard against duplicated writes. 
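+    // flush() returns early once the writer has been set to null, so a late call cannot emit the
+    // same files twice.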
+ flush(); + } + + @Override + public String toString() { + return MoreObjects.toStringHelper(this) + .add("table_name", fullTableName) + .add("subtask_id", subTaskId) + .add("attempt_id", attemptId) + .toString(); + } + + /** close all open files and emit files to downstream committer operator */ + private void flush() throws IOException { + if (writer == null) { + return; + } + + long startNano = System.nanoTime(); + WriteResult result = writer.complete(); + writerMetrics.updateFlushResult(result); + output.collect(new StreamRecord<>(result)); + writerMetrics.flushDuration(TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startNano)); + + // Set writer to null to prevent duplicate flushes in the corner case of + // prepareSnapshotPreBarrier happening after endInput. + writer = null; + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriterMetrics.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriterMetrics.java new file mode 100644 index 000000000000..ce2a6c583fdf --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriterMetrics.java @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink; + +import com.codahale.metrics.SlidingWindowReservoir; +import java.util.Arrays; +import java.util.concurrent.atomic.AtomicLong; +import org.apache.flink.dropwizard.metrics.DropwizardHistogramWrapper; +import org.apache.flink.metrics.Counter; +import org.apache.flink.metrics.Histogram; +import org.apache.flink.metrics.MetricGroup; +import org.apache.iceberg.io.WriteResult; + +class IcebergStreamWriterMetrics { + // 1,024 reservoir size should cost about 8KB, which is quite small. + // It should also produce good accuracy for histogram distribution (like percentiles). 
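+  // A sliding-window reservoir of this size keeps only the most recent 1,024 file-size samples,
+  // which bounds the memory used by the two histograms below.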
+ private static final int HISTOGRAM_RESERVOIR_SIZE = 1024; + + private final Counter flushedDataFiles; + private final Counter flushedDeleteFiles; + private final Counter flushedReferencedDataFiles; + private final AtomicLong lastFlushDurationMs; + private final Histogram dataFilesSizeHistogram; + private final Histogram deleteFilesSizeHistogram; + + IcebergStreamWriterMetrics(MetricGroup metrics, String fullTableName) { + MetricGroup writerMetrics = + metrics.addGroup("IcebergStreamWriter").addGroup("table", fullTableName); + this.flushedDataFiles = writerMetrics.counter("flushedDataFiles"); + this.flushedDeleteFiles = writerMetrics.counter("flushedDeleteFiles"); + this.flushedReferencedDataFiles = writerMetrics.counter("flushedReferencedDataFiles"); + this.lastFlushDurationMs = new AtomicLong(); + writerMetrics.gauge("lastFlushDurationMs", lastFlushDurationMs::get); + + com.codahale.metrics.Histogram dropwizardDataFilesSizeHistogram = + new com.codahale.metrics.Histogram(new SlidingWindowReservoir(HISTOGRAM_RESERVOIR_SIZE)); + this.dataFilesSizeHistogram = + writerMetrics.histogram( + "dataFilesSizeHistogram", + new DropwizardHistogramWrapper(dropwizardDataFilesSizeHistogram)); + com.codahale.metrics.Histogram dropwizardDeleteFilesSizeHistogram = + new com.codahale.metrics.Histogram(new SlidingWindowReservoir(HISTOGRAM_RESERVOIR_SIZE)); + this.deleteFilesSizeHistogram = + writerMetrics.histogram( + "deleteFilesSizeHistogram", + new DropwizardHistogramWrapper(dropwizardDeleteFilesSizeHistogram)); + } + + void updateFlushResult(WriteResult result) { + flushedDataFiles.inc(result.dataFiles().length); + flushedDeleteFiles.inc(result.deleteFiles().length); + flushedReferencedDataFiles.inc(result.referencedDataFiles().length); + + // For file size distribution histogram, we don't have to update them after successful commits. + // This should works equally well and we avoided the overhead of tracking the list of file sizes + // in the {@link CommitSummary}, which currently stores simple stats for counters and gauges + // metrics. + Arrays.stream(result.dataFiles()) + .forEach( + dataFile -> { + dataFilesSizeHistogram.update(dataFile.fileSizeInBytes()); + }); + Arrays.stream(result.deleteFiles()) + .forEach( + deleteFile -> { + deleteFilesSizeHistogram.update(deleteFile.fileSizeInBytes()); + }); + } + + void flushDuration(long flushDurationMs) { + lastFlushDurationMs.set(flushDurationMs); + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/ManifestOutputFileFactory.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/ManifestOutputFileFactory.java new file mode 100644 index 000000000000..da5e6e7627ae --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/ManifestOutputFileFactory.java @@ -0,0 +1,94 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink; + +import java.util.Map; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.function.Supplier; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.HasTableOperations; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableOperations; +import org.apache.iceberg.io.OutputFile; +import org.apache.iceberg.relocated.com.google.common.base.Strings; + +class ManifestOutputFileFactory { + // Users could define their own flink manifests directory by setting this value in table + // properties. + static final String FLINK_MANIFEST_LOCATION = "flink.manifests.location"; + + private final Supplier
tableSupplier; + private final Map props; + private final String flinkJobId; + private final String operatorUniqueId; + private final int subTaskId; + private final long attemptNumber; + private final AtomicInteger fileCount = new AtomicInteger(0); + + ManifestOutputFileFactory( + Supplier
tableSupplier, + Map props, + String flinkJobId, + String operatorUniqueId, + int subTaskId, + long attemptNumber) { + this.tableSupplier = tableSupplier; + this.props = props; + this.flinkJobId = flinkJobId; + this.operatorUniqueId = operatorUniqueId; + this.subTaskId = subTaskId; + this.attemptNumber = attemptNumber; + } + + private String generatePath(long checkpointId) { + return FileFormat.AVRO.addExtension( + String.format( + "%s-%s-%05d-%d-%d-%05d", + flinkJobId, + operatorUniqueId, + subTaskId, + attemptNumber, + checkpointId, + fileCount.incrementAndGet())); + } + + OutputFile create(long checkpointId) { + String flinkManifestDir = props.get(FLINK_MANIFEST_LOCATION); + TableOperations ops = ((HasTableOperations) tableSupplier.get()).operations(); + + String newManifestFullPath; + if (Strings.isNullOrEmpty(flinkManifestDir)) { + // User don't specify any flink manifest directory, so just use the default metadata path. + newManifestFullPath = ops.metadataFileLocation(generatePath(checkpointId)); + } else { + newManifestFullPath = + String.format("%s/%s", stripTrailingSlash(flinkManifestDir), generatePath(checkpointId)); + } + + return tableSupplier.get().io().newOutputFile(newManifestFullPath); + } + + private static String stripTrailingSlash(String path) { + String result = path; + while (result.endsWith("/")) { + result = result.substring(0, result.length() - 1); + } + return result; + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionKeySelector.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionKeySelector.java new file mode 100644 index 000000000000..df951684b446 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionKeySelector.java @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink; + +import org.apache.flink.api.java.functions.KeySelector; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.PartitionKey; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.flink.RowDataWrapper; + +/** + * Create a {@link KeySelector} to shuffle by partition key, then each partition/bucket will be + * wrote by only one task. That will reduce lots of small files in partitioned fanout write policy + * for {@link FlinkSink}. 
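+ *
+ * <p>The shuffle key is the partition path string produced by {@link PartitionKey#toPath()}, so all
+ * rows with the same partition values are routed to the same writer subtask.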
+ */ +class PartitionKeySelector implements KeySelector { + + private final Schema schema; + private final PartitionKey partitionKey; + private final RowType flinkSchema; + + private transient RowDataWrapper rowDataWrapper; + + PartitionKeySelector(PartitionSpec spec, Schema schema, RowType flinkSchema) { + this.schema = schema; + this.partitionKey = new PartitionKey(spec, schema); + this.flinkSchema = flinkSchema; + } + + /** + * Construct the {@link RowDataWrapper} lazily here because few members in it are not + * serializable. In this way, we don't have to serialize them with forcing. + */ + private RowDataWrapper lazyRowDataWrapper() { + if (rowDataWrapper == null) { + rowDataWrapper = new RowDataWrapper(flinkSchema, schema.asStruct()); + } + return rowDataWrapper; + } + + @Override + public String getKey(RowData row) { + partitionKey.partition(lazyRowDataWrapper().wrap(row)); + return partitionKey.toPath(); + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionedDeltaWriter.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionedDeltaWriter.java new file mode 100644 index 000000000000..38062dd1a2c4 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionedDeltaWriter.java @@ -0,0 +1,97 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.util.List; +import java.util.Map; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.PartitionKey; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.io.FileAppenderFactory; +import org.apache.iceberg.io.FileIO; +import org.apache.iceberg.io.OutputFileFactory; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.util.Tasks; + +class PartitionedDeltaWriter extends BaseDeltaTaskWriter { + + private final PartitionKey partitionKey; + + private final Map writers = Maps.newHashMap(); + + PartitionedDeltaWriter( + PartitionSpec spec, + FileFormat format, + FileAppenderFactory appenderFactory, + OutputFileFactory fileFactory, + FileIO io, + long targetFileSize, + Schema schema, + RowType flinkSchema, + List equalityFieldIds, + boolean upsert) { + super( + spec, + format, + appenderFactory, + fileFactory, + io, + targetFileSize, + schema, + flinkSchema, + equalityFieldIds, + upsert); + this.partitionKey = new PartitionKey(spec, schema); + } + + @Override + RowDataDeltaWriter route(RowData row) { + partitionKey.partition(wrapper().wrap(row)); + + RowDataDeltaWriter writer = writers.get(partitionKey); + if (writer == null) { + // NOTICE: we need to copy a new partition key here, in case of messing up the keys in + // writers. + PartitionKey copiedKey = partitionKey.copy(); + writer = new RowDataDeltaWriter(copiedKey); + writers.put(copiedKey, writer); + } + + return writer; + } + + @Override + public void close() { + try { + Tasks.foreach(writers.values()) + .throwFailureWhenFinished() + .noRetry() + .run(RowDataDeltaWriter::close, IOException.class); + + writers.clear(); + } catch (IOException e) { + throw new UncheckedIOException("Failed to close equality delta writer", e); + } + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/RowDataTaskWriterFactory.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/RowDataTaskWriterFactory.java new file mode 100644 index 000000000000..67422a1afeb1 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/RowDataTaskWriterFactory.java @@ -0,0 +1,244 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink; + +import java.util.List; +import java.util.Map; +import java.util.function.Supplier; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.PartitionKey; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Table; +import org.apache.iceberg.flink.RowDataWrapper; +import org.apache.iceberg.io.FileAppenderFactory; +import org.apache.iceberg.io.FileIO; +import org.apache.iceberg.io.OutputFileFactory; +import org.apache.iceberg.io.PartitionedFanoutWriter; +import org.apache.iceberg.io.TaskWriter; +import org.apache.iceberg.io.UnpartitionedWriter; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Sets; +import org.apache.iceberg.types.TypeUtil; +import org.apache.iceberg.util.ArrayUtil; +import org.apache.iceberg.util.SerializableSupplier; + +public class RowDataTaskWriterFactory implements TaskWriterFactory { + private final Supplier
tableSupplier; + private final Schema schema; + private final RowType flinkSchema; + private final PartitionSpec spec; + private final long targetFileSizeBytes; + private final FileFormat format; + private final List equalityFieldIds; + private final boolean upsert; + private final FileAppenderFactory appenderFactory; + + private transient OutputFileFactory outputFileFactory; + + public RowDataTaskWriterFactory( + Table table, + RowType flinkSchema, + long targetFileSizeBytes, + FileFormat format, + Map writeProperties, + List equalityFieldIds, + boolean upsert) { + this( + () -> table, + flinkSchema, + targetFileSizeBytes, + format, + writeProperties, + equalityFieldIds, + upsert); + } + + public RowDataTaskWriterFactory( + SerializableSupplier
tableSupplier, + RowType flinkSchema, + long targetFileSizeBytes, + FileFormat format, + Map writeProperties, + List equalityFieldIds, + boolean upsert) { + this.tableSupplier = tableSupplier; + + Table table; + if (tableSupplier instanceof CachingTableSupplier) { + // rely on the initial table metadata for schema, etc., until schema evolution is supported + table = ((CachingTableSupplier) tableSupplier).initialTable(); + } else { + table = tableSupplier.get(); + } + + this.schema = table.schema(); + this.flinkSchema = flinkSchema; + this.spec = table.spec(); + this.targetFileSizeBytes = targetFileSizeBytes; + this.format = format; + this.equalityFieldIds = equalityFieldIds; + this.upsert = upsert; + + if (equalityFieldIds == null || equalityFieldIds.isEmpty()) { + this.appenderFactory = + new FlinkAppenderFactory( + table, schema, flinkSchema, writeProperties, spec, null, null, null); + } else if (upsert) { + // In upsert mode, only the new row is emitted using INSERT row kind. Therefore, any column of + // the inserted row + // may differ from the deleted row other than the primary key fields, and the delete file must + // contain values + // that are correct for the deleted row. Therefore, only write the equality delete fields. + this.appenderFactory = + new FlinkAppenderFactory( + table, + schema, + flinkSchema, + writeProperties, + spec, + ArrayUtil.toIntArray(equalityFieldIds), + TypeUtil.select(schema, Sets.newHashSet(equalityFieldIds)), + null); + } else { + this.appenderFactory = + new FlinkAppenderFactory( + table, + schema, + flinkSchema, + writeProperties, + spec, + ArrayUtil.toIntArray(equalityFieldIds), + schema, + null); + } + } + + @Override + public void initialize(int taskId, int attemptId) { + Table table; + if (tableSupplier instanceof CachingTableSupplier) { + // rely on the initial table metadata for schema, etc., until schema evolution is supported + table = ((CachingTableSupplier) tableSupplier).initialTable(); + } else { + table = tableSupplier.get(); + } + + refreshTable(); + + this.outputFileFactory = + OutputFileFactory.builderFor(table, taskId, attemptId) + .format(format) + .ioSupplier(() -> tableSupplier.get().io()) + .build(); + } + + @Override + public TaskWriter create() { + Preconditions.checkNotNull( + outputFileFactory, + "The outputFileFactory shouldn't be null if we have invoked the initialize()."); + + refreshTable(); + + if (equalityFieldIds == null || equalityFieldIds.isEmpty()) { + // Initialize a task writer to write INSERT only. + if (spec.isUnpartitioned()) { + return new UnpartitionedWriter<>( + spec, + format, + appenderFactory, + outputFileFactory, + tableSupplier.get().io(), + targetFileSizeBytes); + } else { + return new RowDataPartitionedFanoutWriter( + spec, + format, + appenderFactory, + outputFileFactory, + tableSupplier.get().io(), + targetFileSizeBytes, + schema, + flinkSchema); + } + } else { + // Initialize a task writer to write both INSERT and equality DELETE. 
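+      // The delta writers receive the equalityFieldIds and the upsert flag so they can emit
+      // equality deletes alongside the inserted rows.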
+ if (spec.isUnpartitioned()) { + return new UnpartitionedDeltaWriter( + spec, + format, + appenderFactory, + outputFileFactory, + tableSupplier.get().io(), + targetFileSizeBytes, + schema, + flinkSchema, + equalityFieldIds, + upsert); + } else { + return new PartitionedDeltaWriter( + spec, + format, + appenderFactory, + outputFileFactory, + tableSupplier.get().io(), + targetFileSizeBytes, + schema, + flinkSchema, + equalityFieldIds, + upsert); + } + } + } + + void refreshTable() { + if (tableSupplier instanceof CachingTableSupplier) { + ((CachingTableSupplier) tableSupplier).refreshTable(); + } + } + + private static class RowDataPartitionedFanoutWriter extends PartitionedFanoutWriter { + + private final PartitionKey partitionKey; + private final RowDataWrapper rowDataWrapper; + + RowDataPartitionedFanoutWriter( + PartitionSpec spec, + FileFormat format, + FileAppenderFactory appenderFactory, + OutputFileFactory fileFactory, + FileIO io, + long targetFileSize, + Schema schema, + RowType flinkSchema) { + super(spec, format, appenderFactory, fileFactory, io, targetFileSize); + this.partitionKey = new PartitionKey(spec, schema); + this.rowDataWrapper = new RowDataWrapper(flinkSchema, schema.asStruct()); + } + + @Override + protected PartitionKey partition(RowData row) { + partitionKey.partition(rowDataWrapper.wrap(row)); + return partitionKey; + } + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/TaskWriterFactory.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/TaskWriterFactory.java new file mode 100644 index 000000000000..e3a1245e8cbd --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/TaskWriterFactory.java @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink; + +import java.io.Serializable; +import org.apache.iceberg.io.TaskWriter; + +/** + * Factory to create {@link TaskWriter} + * + * @param data type of record. + */ +public interface TaskWriterFactory extends Serializable { + + /** + * Initialize the factory with a given taskId and attemptId. + * + * @param taskId the identifier of task. + * @param attemptId the attempt id of this task. + */ + void initialize(int taskId, int attemptId); + + /** + * Initialize a {@link TaskWriter} with given task id and attempt id. + * + * @return a newly created task writer. 
+ */ + TaskWriter create(); +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/UnpartitionedDeltaWriter.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/UnpartitionedDeltaWriter.java new file mode 100644 index 000000000000..7680fb933b20 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/UnpartitionedDeltaWriter.java @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink; + +import java.io.IOException; +import java.util.List; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.io.FileAppenderFactory; +import org.apache.iceberg.io.FileIO; +import org.apache.iceberg.io.OutputFileFactory; + +class UnpartitionedDeltaWriter extends BaseDeltaTaskWriter { + private final RowDataDeltaWriter writer; + + UnpartitionedDeltaWriter( + PartitionSpec spec, + FileFormat format, + FileAppenderFactory appenderFactory, + OutputFileFactory fileFactory, + FileIO io, + long targetFileSize, + Schema schema, + RowType flinkSchema, + List equalityFieldIds, + boolean upsert) { + super( + spec, + format, + appenderFactory, + fileFactory, + io, + targetFileSize, + schema, + flinkSchema, + equalityFieldIds, + upsert); + this.writer = new RowDataDeltaWriter(null); + } + + @Override + RowDataDeltaWriter route(RowData row) { + return writer; + } + + @Override + public void close() throws IOException { + writer.close(); + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/AggregatedStatisticsTracker.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/AggregatedStatisticsTracker.java new file mode 100644 index 000000000000..5525f02c873e --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/AggregatedStatisticsTracker.java @@ -0,0 +1,262 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.shuffle; + +import java.util.Map; +import java.util.NavigableMap; +import java.util.Set; +import javax.annotation.Nullable; +import org.apache.datasketches.sampling.ReservoirItemsSketch; +import org.apache.datasketches.sampling.ReservoirItemsUnion; +import org.apache.flink.api.common.typeutils.TypeSerializer; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SortKey; +import org.apache.iceberg.SortOrder; +import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.relocated.com.google.common.collect.Sets; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * AggregatedStatisticsTracker tracks the statistics aggregation received from {@link + * DataStatisticsOperator} subtasks for every checkpoint. + */ +class AggregatedStatisticsTracker { + private static final Logger LOG = LoggerFactory.getLogger(AggregatedStatisticsTracker.class); + + private final String operatorName; + private final int parallelism; + private final TypeSerializer statisticsSerializer; + private final int downstreamParallelism; + private final StatisticsType statisticsType; + private final int switchToSketchThreshold; + private final NavigableMap aggregationsPerCheckpoint; + + private CompletedStatistics completedStatistics; + + AggregatedStatisticsTracker( + String operatorName, + int parallelism, + Schema schema, + SortOrder sortOrder, + int downstreamParallelism, + StatisticsType statisticsType, + int switchToSketchThreshold, + @Nullable CompletedStatistics restoredStatistics) { + this.operatorName = operatorName; + this.parallelism = parallelism; + this.statisticsSerializer = + new DataStatisticsSerializer(new SortKeySerializer(schema, sortOrder)); + this.downstreamParallelism = downstreamParallelism; + this.statisticsType = statisticsType; + this.switchToSketchThreshold = switchToSketchThreshold; + this.completedStatistics = restoredStatistics; + + this.aggregationsPerCheckpoint = Maps.newTreeMap(); + } + + CompletedStatistics updateAndCheckCompletion(int subtask, StatisticsEvent event) { + long checkpointId = event.checkpointId(); + LOG.debug( + "Handling statistics event from subtask {} of operator {} for checkpoint {}", + subtask, + operatorName, + checkpointId); + + if (completedStatistics != null && completedStatistics.checkpointId() > checkpointId) { + LOG.info( + "Ignore stale statistics event from operator {} subtask {} for older checkpoint {}. 
" + + "Was expecting data statistics from checkpoint higher than {}", + operatorName, + subtask, + checkpointId, + completedStatistics.checkpointId()); + return null; + } + + Aggregation aggregation = + aggregationsPerCheckpoint.computeIfAbsent( + checkpointId, + ignored -> + new Aggregation( + parallelism, + downstreamParallelism, + switchToSketchThreshold, + statisticsType, + StatisticsUtil.collectType(statisticsType, completedStatistics))); + DataStatistics dataStatistics = + StatisticsUtil.deserializeDataStatistics(event.statisticsBytes(), statisticsSerializer); + if (!aggregation.merge(subtask, dataStatistics)) { + LOG.debug( + "Ignore duplicate data statistics from operator {} subtask {} for checkpoint {}.", + operatorName, + subtask, + checkpointId); + } + + if (aggregation.isComplete()) { + this.completedStatistics = aggregation.completedStatistics(checkpointId); + // clean up aggregations up to the completed checkpoint id + aggregationsPerCheckpoint.headMap(checkpointId, true).clear(); + return completedStatistics; + } + + return null; + } + + @VisibleForTesting + NavigableMap aggregationsPerCheckpoint() { + return aggregationsPerCheckpoint; + } + + static class Aggregation { + private static final Logger LOG = LoggerFactory.getLogger(Aggregation.class); + + private final Set subtaskSet; + private final int parallelism; + private final int downstreamParallelism; + private final int switchToSketchThreshold; + private final StatisticsType configuredType; + private StatisticsType currentType; + private Map mapStatistics; + private ReservoirItemsUnion sketchStatistics; + + Aggregation( + int parallelism, + int downstreamParallelism, + int switchToSketchThreshold, + StatisticsType configuredType, + StatisticsType currentType) { + this.subtaskSet = Sets.newHashSet(); + this.parallelism = parallelism; + this.downstreamParallelism = downstreamParallelism; + this.switchToSketchThreshold = switchToSketchThreshold; + this.configuredType = configuredType; + this.currentType = currentType; + + if (currentType == StatisticsType.Map) { + this.mapStatistics = Maps.newHashMap(); + this.sketchStatistics = null; + } else { + this.mapStatistics = null; + this.sketchStatistics = + ReservoirItemsUnion.newInstance( + SketchUtil.determineCoordinatorReservoirSize(downstreamParallelism)); + } + } + + @VisibleForTesting + Set subtaskSet() { + return subtaskSet; + } + + @VisibleForTesting + StatisticsType currentType() { + return currentType; + } + + @VisibleForTesting + Map mapStatistics() { + return mapStatistics; + } + + @VisibleForTesting + ReservoirItemsUnion sketchStatistics() { + return sketchStatistics; + } + + private boolean isComplete() { + return subtaskSet.size() == parallelism; + } + + /** @return false if duplicate */ + private boolean merge(int subtask, DataStatistics taskStatistics) { + if (subtaskSet.contains(subtask)) { + return false; + } + + subtaskSet.add(subtask); + merge(taskStatistics); + return true; + } + + @SuppressWarnings("unchecked") + private void merge(DataStatistics taskStatistics) { + if (taskStatistics.type() == StatisticsType.Map) { + Map taskMapStats = (Map) taskStatistics.result(); + if (currentType == StatisticsType.Map) { + taskMapStats.forEach((key, count) -> mapStatistics.merge(key, count, Long::sum)); + if (configuredType == StatisticsType.Auto + && mapStatistics.size() > switchToSketchThreshold) { + convertCoordinatorToSketch(); + } + } else { + // convert task stats to sketch first + ReservoirItemsSketch taskSketch = + ReservoirItemsSketch.newInstance( + 
SketchUtil.determineOperatorReservoirSize(parallelism, downstreamParallelism)); + SketchUtil.convertMapToSketch(taskMapStats, taskSketch::update); + sketchStatistics.update(taskSketch); + } + } else { + ReservoirItemsSketch taskSketch = + (ReservoirItemsSketch) taskStatistics.result(); + if (currentType == StatisticsType.Map) { + // convert global stats to sketch first + convertCoordinatorToSketch(); + } + + if (taskSketch.getNumSamples() > 0) { + sketchStatistics.update(taskSketch); + } + } + } + + private void convertCoordinatorToSketch() { + this.sketchStatistics = + ReservoirItemsUnion.newInstance( + SketchUtil.determineCoordinatorReservoirSize(downstreamParallelism)); + SketchUtil.convertMapToSketch(mapStatistics, sketchStatistics::update); + this.currentType = StatisticsType.Sketch; + this.mapStatistics = null; + } + + private CompletedStatistics completedStatistics(long checkpointId) { + if (currentType == StatisticsType.Map) { + LOG.info("Completed map statistics aggregation with {} keys", mapStatistics.size()); + return CompletedStatistics.fromKeyFrequency(checkpointId, mapStatistics); + } else { + ReservoirItemsSketch sketch = sketchStatistics.getResult(); + if (sketch != null) { + LOG.info( + "Completed sketch statistics aggregation: " + + "reservoir size = {}, number of items seen = {}, number of samples = {}", + sketch.getK(), + sketch.getN(), + sketch.getNumSamples()); + return CompletedStatistics.fromKeySamples(checkpointId, sketch.getSamples()); + } else { + LOG.info("Empty sketch statistics."); + return CompletedStatistics.fromKeySamples(checkpointId, new SortKey[0]); + } + } + } + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/CompletedStatistics.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/CompletedStatistics.java new file mode 100644 index 000000000000..e4cba174f0f2 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/CompletedStatistics.java @@ -0,0 +1,111 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.shuffle; + +import java.util.Arrays; +import java.util.Map; +import org.apache.iceberg.SortKey; +import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; +import org.apache.iceberg.relocated.com.google.common.base.Objects; + +/** + * This is what {@link AggregatedStatisticsTracker} returns upon a completed statistics aggregation + * from all subtasks. It contains the raw statistics (Map or reservoir samples). 
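+ *
+ * <p>Exactly one of the two payloads is populated: {@code keyFrequency} for {@link
+ * StatisticsType#Map} statistics, or {@code keySamples} for {@link StatisticsType#Sketch} statistics.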
+ */ +class CompletedStatistics { + private final long checkpointId; + private final StatisticsType type; + private final Map keyFrequency; + private final SortKey[] keySamples; + + static CompletedStatistics fromKeyFrequency(long checkpointId, Map stats) { + return new CompletedStatistics(checkpointId, StatisticsType.Map, stats, null); + } + + static CompletedStatistics fromKeySamples(long checkpointId, SortKey[] keySamples) { + return new CompletedStatistics(checkpointId, StatisticsType.Sketch, null, keySamples); + } + + CompletedStatistics( + long checkpointId, + StatisticsType type, + Map keyFrequency, + SortKey[] keySamples) { + this.checkpointId = checkpointId; + this.type = type; + this.keyFrequency = keyFrequency; + this.keySamples = keySamples; + } + + @Override + public String toString() { + return MoreObjects.toStringHelper(this) + .add("checkpointId", checkpointId) + .add("type", type) + .add("keyFrequency", keyFrequency) + .add("keySamples", keySamples) + .toString(); + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + + if (!(o instanceof CompletedStatistics)) { + return false; + } + + CompletedStatistics other = (CompletedStatistics) o; + return Objects.equal(checkpointId, other.checkpointId) + && Objects.equal(type, other.type) + && Objects.equal(keyFrequency, other.keyFrequency()) + && Arrays.equals(keySamples, other.keySamples()); + } + + @Override + public int hashCode() { + return Objects.hashCode(checkpointId, type, keyFrequency, keySamples); + } + + long checkpointId() { + return checkpointId; + } + + StatisticsType type() { + return type; + } + + Map keyFrequency() { + return keyFrequency; + } + + SortKey[] keySamples() { + return keySamples; + } + + boolean isEmpty() { + if (type == StatisticsType.Sketch) { + return keySamples.length == 0; + } else { + return keyFrequency().isEmpty(); + } + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/CompletedStatisticsSerializer.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/CompletedStatisticsSerializer.java new file mode 100644 index 000000000000..1ac0e386a011 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/CompletedStatisticsSerializer.java @@ -0,0 +1,176 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink.shuffle; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import org.apache.flink.api.common.typeutils.CompositeTypeSerializerSnapshot; +import org.apache.flink.api.common.typeutils.TypeSerializer; +import org.apache.flink.api.common.typeutils.TypeSerializerSnapshot; +import org.apache.flink.api.common.typeutils.base.EnumSerializer; +import org.apache.flink.api.common.typeutils.base.ListSerializer; +import org.apache.flink.api.common.typeutils.base.LongSerializer; +import org.apache.flink.api.common.typeutils.base.MapSerializer; +import org.apache.flink.core.memory.DataInputView; +import org.apache.flink.core.memory.DataOutputView; +import org.apache.iceberg.SortKey; + +class CompletedStatisticsSerializer extends TypeSerializer { + private final TypeSerializer sortKeySerializer; + private final EnumSerializer statisticsTypeSerializer; + private final MapSerializer keyFrequencySerializer; + private final ListSerializer keySamplesSerializer; + + CompletedStatisticsSerializer(TypeSerializer sortKeySerializer) { + this.sortKeySerializer = sortKeySerializer; + this.statisticsTypeSerializer = new EnumSerializer<>(StatisticsType.class); + this.keyFrequencySerializer = new MapSerializer<>(sortKeySerializer, LongSerializer.INSTANCE); + this.keySamplesSerializer = new ListSerializer<>(sortKeySerializer); + } + + @Override + public boolean isImmutableType() { + return false; + } + + @Override + public TypeSerializer duplicate() { + return new CompletedStatisticsSerializer(sortKeySerializer); + } + + @Override + public CompletedStatistics createInstance() { + return CompletedStatistics.fromKeyFrequency(0L, Collections.emptyMap()); + } + + @Override + public CompletedStatistics copy(CompletedStatistics from) { + return new CompletedStatistics( + from.checkpointId(), from.type(), from.keyFrequency(), from.keySamples()); + } + + @Override + public CompletedStatistics copy(CompletedStatistics from, CompletedStatistics reuse) { + // no benefit of reuse + return copy(from); + } + + @Override + public int getLength() { + return -1; + } + + @Override + public void serialize(CompletedStatistics record, DataOutputView target) throws IOException { + target.writeLong(record.checkpointId()); + statisticsTypeSerializer.serialize(record.type(), target); + if (record.type() == StatisticsType.Map) { + keyFrequencySerializer.serialize(record.keyFrequency(), target); + } else { + keySamplesSerializer.serialize(Arrays.asList(record.keySamples()), target); + } + } + + @Override + public CompletedStatistics deserialize(DataInputView source) throws IOException { + long checkpointId = source.readLong(); + StatisticsType type = statisticsTypeSerializer.deserialize(source); + if (type == StatisticsType.Map) { + Map keyFrequency = keyFrequencySerializer.deserialize(source); + return CompletedStatistics.fromKeyFrequency(checkpointId, keyFrequency); + } else { + List sortKeys = keySamplesSerializer.deserialize(source); + SortKey[] keySamples = new SortKey[sortKeys.size()]; + keySamples = sortKeys.toArray(keySamples); + return CompletedStatistics.fromKeySamples(checkpointId, keySamples); + } + } + + @Override + public CompletedStatistics deserialize(CompletedStatistics reuse, DataInputView source) + throws IOException { + // not much benefit to reuse + return deserialize(source); + } + + @Override + public void copy(DataInputView source, DataOutputView target) throws 
IOException { + serialize(deserialize(source), target); + } + + @Override + public boolean equals(Object obj) { + if (this == obj) { + return true; + } + + if (obj == null || getClass() != obj.getClass()) { + return false; + } + + CompletedStatisticsSerializer other = (CompletedStatisticsSerializer) obj; + return Objects.equals(sortKeySerializer, other.sortKeySerializer); + } + + @Override + public int hashCode() { + return sortKeySerializer.hashCode(); + } + + @Override + public TypeSerializerSnapshot snapshotConfiguration() { + return new CompletedStatisticsSerializerSnapshot(this); + } + + public static class CompletedStatisticsSerializerSnapshot + extends CompositeTypeSerializerSnapshot { + private static final int CURRENT_VERSION = 1; + + /** Constructor for read instantiation. */ + @SuppressWarnings({"unused", "checkstyle:RedundantModifier"}) + public CompletedStatisticsSerializerSnapshot() {} + + @SuppressWarnings("checkstyle:RedundantModifier") + public CompletedStatisticsSerializerSnapshot(CompletedStatisticsSerializer serializer) { + super(serializer); + } + + @Override + protected int getCurrentOuterSnapshotVersion() { + return CURRENT_VERSION; + } + + @Override + protected TypeSerializer[] getNestedSerializers( + CompletedStatisticsSerializer outerSerializer) { + return new TypeSerializer[] {outerSerializer.sortKeySerializer}; + } + + @Override + protected CompletedStatisticsSerializer createOuterSerializerWithNestedSerializers( + TypeSerializer[] nestedSerializers) { + SortKeySerializer sortKeySerializer = (SortKeySerializer) nestedSerializers[0]; + return new CompletedStatisticsSerializer(sortKeySerializer); + } + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatistics.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatistics.java new file mode 100644 index 000000000000..76c59cd5f4b8 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatistics.java @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.shuffle; + +import java.util.Map; +import org.apache.datasketches.sampling.ReservoirItemsSketch; +import org.apache.flink.annotation.Internal; +import org.apache.iceberg.SortKey; + +/** + * DataStatistics defines the interface to collect data distribution information. + * + *
Data statistics tracks traffic volume distribution across data keys. For low-cardinality key, + * a simple map of (key, count) can be used. For high-cardinality key, probabilistic data structures + * (sketching) can be used. + */ +@Internal +interface DataStatistics { + + StatisticsType type(); + + boolean isEmpty(); + + /** Add row sortKey to data statistics. */ + void add(SortKey sortKey); + + /** + * Get the collected statistics. Could be a {@link Map} (low cardinality) or {@link + * ReservoirItemsSketch} (high cardinality) + */ + Object result(); +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsCoordinator.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsCoordinator.java new file mode 100644 index 000000000000..4bfde7204acf --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsCoordinator.java @@ -0,0 +1,522 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.shuffle; + +import java.util.Comparator; +import java.util.Map; +import java.util.concurrent.Callable; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.ThreadFactory; +import org.apache.flink.annotation.Internal; +import org.apache.flink.api.common.typeutils.TypeSerializer; +import org.apache.flink.runtime.operators.coordination.OperatorCoordinator; +import org.apache.flink.runtime.operators.coordination.OperatorEvent; +import org.apache.flink.util.ExceptionUtils; +import org.apache.flink.util.FatalExitExceptionHandler; +import org.apache.flink.util.FlinkRuntimeException; +import org.apache.flink.util.Preconditions; +import org.apache.flink.util.ThrowableCatchingRunnable; +import org.apache.flink.util.function.ThrowingRunnable; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SortOrder; +import org.apache.iceberg.StructLike; +import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; +import org.apache.iceberg.relocated.com.google.common.collect.Iterables; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.types.Comparators; +import org.jetbrains.annotations.NotNull; +import org.jetbrains.annotations.Nullable; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * DataStatisticsCoordinator receives {@link StatisticsEvent} from {@link DataStatisticsOperator} + * every subtask and then merge them together. 
Once aggregation for all subtasks data statistics + * completes, DataStatisticsCoordinator will send the aggregated data statistics back to {@link + * DataStatisticsOperator}. In the end a custom partitioner will distribute traffic based on the + * aggregated data statistics to improve data clustering. + */ +@Internal +class DataStatisticsCoordinator implements OperatorCoordinator { + private static final Logger LOG = LoggerFactory.getLogger(DataStatisticsCoordinator.class); + + private final String operatorName; + private final OperatorCoordinator.Context context; + private final Schema schema; + private final SortOrder sortOrder; + private final Comparator comparator; + private final int downstreamParallelism; + private final StatisticsType statisticsType; + private final double closeFileCostWeightPercentage; + + private final ExecutorService coordinatorExecutor; + private final SubtaskGateways subtaskGateways; + private final CoordinatorExecutorThreadFactory coordinatorThreadFactory; + private final TypeSerializer completedStatisticsSerializer; + private final TypeSerializer globalStatisticsSerializer; + + private transient boolean started; + private transient AggregatedStatisticsTracker aggregatedStatisticsTracker; + private transient CompletedStatistics completedStatistics; + private transient GlobalStatistics globalStatistics; + + DataStatisticsCoordinator( + String operatorName, + OperatorCoordinator.Context context, + Schema schema, + SortOrder sortOrder, + int downstreamParallelism, + StatisticsType statisticsType, + double closeFileCostWeightPercentage) { + this.operatorName = operatorName; + this.context = context; + this.schema = schema; + this.sortOrder = sortOrder; + this.comparator = Comparators.forType(SortKeyUtil.sortKeySchema(schema, sortOrder).asStruct()); + this.downstreamParallelism = downstreamParallelism; + this.statisticsType = statisticsType; + this.closeFileCostWeightPercentage = closeFileCostWeightPercentage; + + this.coordinatorThreadFactory = + new CoordinatorExecutorThreadFactory( + "DataStatisticsCoordinator-" + operatorName, context.getUserCodeClassloader()); + this.coordinatorExecutor = Executors.newSingleThreadExecutor(coordinatorThreadFactory); + this.subtaskGateways = new SubtaskGateways(operatorName, context.currentParallelism()); + SortKeySerializer sortKeySerializer = new SortKeySerializer(schema, sortOrder); + this.completedStatisticsSerializer = new CompletedStatisticsSerializer(sortKeySerializer); + this.globalStatisticsSerializer = new GlobalStatisticsSerializer(sortKeySerializer); + } + + @Override + public void start() throws Exception { + LOG.info("Starting data statistics coordinator: {}.", operatorName); + this.started = true; + + // statistics are restored already in resetToCheckpoint() before start() called + this.aggregatedStatisticsTracker = + new AggregatedStatisticsTracker( + operatorName, + context.currentParallelism(), + schema, + sortOrder, + downstreamParallelism, + statisticsType, + SketchUtil.COORDINATOR_SKETCH_SWITCH_THRESHOLD, + completedStatistics); + } + + @Override + public void close() throws Exception { + coordinatorExecutor.shutdown(); + this.aggregatedStatisticsTracker = null; + this.started = false; + LOG.info("Closed data statistics coordinator: {}.", operatorName); + } + + @VisibleForTesting + void callInCoordinatorThread(Callable callable, String errorMessage) { + ensureStarted(); + // Ensure the task is done by the coordinator executor. 
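+    // Calls from other threads are submitted to the single-threaded coordinator executor and
+    // awaited; calls already on the coordinator thread run inline, since submitting and blocking
+    // from the coordinator thread itself would deadlock the single-threaded executor.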
+ if (!coordinatorThreadFactory.isCurrentThreadCoordinatorThread()) { + try { + Callable guardedCallable = + () -> { + try { + return callable.call(); + } catch (Throwable t) { + LOG.error( + "Uncaught Exception in data statistics coordinator: {} executor", + operatorName, + t); + ExceptionUtils.rethrowException(t); + return null; + } + }; + + coordinatorExecutor.submit(guardedCallable).get(); + } catch (InterruptedException | ExecutionException e) { + throw new FlinkRuntimeException(errorMessage, e); + } + } else { + try { + callable.call(); + } catch (Throwable t) { + LOG.error( + "Uncaught Exception in data statistics coordinator: {} executor", operatorName, t); + throw new FlinkRuntimeException(errorMessage, t); + } + } + } + + public void runInCoordinatorThread(Runnable runnable) { + this.coordinatorExecutor.execute( + new ThrowableCatchingRunnable( + throwable -> + this.coordinatorThreadFactory.uncaughtException(Thread.currentThread(), throwable), + runnable)); + } + + private void runInCoordinatorThread(ThrowingRunnable action, String actionString) { + ensureStarted(); + runInCoordinatorThread( + () -> { + try { + action.run(); + } catch (Throwable t) { + ExceptionUtils.rethrowIfFatalErrorOrOOM(t); + LOG.error( + "Uncaught exception in the data statistics coordinator: {} while {}. Triggering job failover", + operatorName, + actionString, + t); + context.failJob(t); + } + }); + } + + private void ensureStarted() { + Preconditions.checkState(started, "The coordinator of %s has not started yet.", operatorName); + } + + private void handleDataStatisticRequest(int subtask, StatisticsEvent event) { + CompletedStatistics maybeCompletedStatistics = + aggregatedStatisticsTracker.updateAndCheckCompletion(subtask, event); + + if (maybeCompletedStatistics != null) { + if (maybeCompletedStatistics.isEmpty()) { + LOG.info( + "Skip aggregated statistics for checkpoint {} as it is empty.", event.checkpointId()); + } else { + LOG.info("Completed statistics aggregation for checkpoint {}", event.checkpointId()); + // completedStatistics contains the complete samples, which is needed to compute + // the range bounds in globalStatistics if downstreamParallelism changed. + this.completedStatistics = maybeCompletedStatistics; + // globalStatistics only contains assignment calculated based on Map or Sketch statistics + this.globalStatistics = + globalStatistics( + maybeCompletedStatistics, + downstreamParallelism, + comparator, + closeFileCostWeightPercentage); + sendGlobalStatisticsToSubtasks(globalStatistics); + } + } + } + + private static GlobalStatistics globalStatistics( + CompletedStatistics completedStatistics, + int downstreamParallelism, + Comparator comparator, + double closeFileCostWeightPercentage) { + if (completedStatistics.type() == StatisticsType.Sketch) { + // range bound is a much smaller array compared to the complete samples. + // It helps reduce the amount of data transfer from coordinator to operator subtasks. 
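+      // e.g. with a downstream parallelism of 4, the computed range bounds are expected to hold
+      // 3 sort keys that split the key space into 4 contiguous ranges, one per downstream subtask.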
+ return GlobalStatistics.fromRangeBounds( + completedStatistics.checkpointId(), + SketchUtil.rangeBounds( + downstreamParallelism, comparator, completedStatistics.keySamples())); + } else { + return GlobalStatistics.fromMapAssignment( + completedStatistics.checkpointId(), + MapAssignment.fromKeyFrequency( + downstreamParallelism, + completedStatistics.keyFrequency(), + closeFileCostWeightPercentage, + comparator)); + } + } + + @SuppressWarnings("FutureReturnValueIgnored") + private void sendGlobalStatisticsToSubtasks(GlobalStatistics statistics) { + runInCoordinatorThread( + () -> { + LOG.info( + "Broadcast latest global statistics from checkpoint {} to all subtasks", + statistics.checkpointId()); + // applyImmediately is set to false so that operator subtasks can + // apply the change at checkpoint boundary + StatisticsEvent statisticsEvent = + StatisticsEvent.createGlobalStatisticsEvent( + statistics, globalStatisticsSerializer, false); + for (int i = 0; i < context.currentParallelism(); ++i) { + // Ignore future return value for potential error (e.g. subtask down). + // Upon restart, subtasks send request to coordinator to refresh statistics + // if there is any difference + subtaskGateways.getSubtaskGateway(i).sendEvent(statisticsEvent); + } + }, + String.format( + "Failed to send operator %s coordinator global data statistics for checkpoint %d", + operatorName, statistics.checkpointId())); + } + + @SuppressWarnings("FutureReturnValueIgnored") + private void handleRequestGlobalStatisticsEvent(int subtask, RequestGlobalStatisticsEvent event) { + if (globalStatistics != null) { + runInCoordinatorThread( + () -> { + if (event.signature() != null && event.signature() != globalStatistics.hashCode()) { + LOG.debug( + "Skip responding to statistics request from subtask {}, as hashCode matches or not included in the request", + subtask); + } else { + LOG.info( + "Send latest global statistics from checkpoint {} to subtask {}", + globalStatistics.checkpointId(), + subtask); + StatisticsEvent statisticsEvent = + StatisticsEvent.createGlobalStatisticsEvent( + globalStatistics, globalStatisticsSerializer, true); + subtaskGateways.getSubtaskGateway(subtask).sendEvent(statisticsEvent); + } + }, + String.format( + "Failed to send operator %s coordinator global data statistics to requesting subtask %d for checkpoint %d", + operatorName, subtask, globalStatistics.checkpointId())); + } else { + LOG.info( + "Ignore global statistics request from subtask {} as statistics not available", subtask); + } + } + + @Override + public void handleEventFromOperator(int subtask, int attemptNumber, OperatorEvent event) { + runInCoordinatorThread( + () -> { + LOG.debug( + "Handling event from subtask {} (#{}) of {}: {}", + subtask, + attemptNumber, + operatorName, + event); + if (event instanceof StatisticsEvent) { + handleDataStatisticRequest(subtask, ((StatisticsEvent) event)); + } else if (event instanceof RequestGlobalStatisticsEvent) { + handleRequestGlobalStatisticsEvent(subtask, (RequestGlobalStatisticsEvent) event); + } else { + throw new IllegalArgumentException( + "Invalid operator event type: " + event.getClass().getCanonicalName()); + } + }, + String.format( + "handling operator event %s from subtask %d (#%d)", + event.getClass(), subtask, attemptNumber)); + } + + @Override + public void checkpointCoordinator(long checkpointId, CompletableFuture resultFuture) { + runInCoordinatorThread( + () -> { + LOG.debug( + "Snapshotting data statistics coordinator {} for checkpoint {}", + operatorName, + 
checkpointId); + if (completedStatistics == null) { + // null checkpoint result is not allowed, hence supply an empty byte array + resultFuture.complete(new byte[0]); + } else { + resultFuture.complete( + StatisticsUtil.serializeCompletedStatistics( + completedStatistics, completedStatisticsSerializer)); + } + }, + String.format("taking checkpoint %d", checkpointId)); + } + + @Override + public void notifyCheckpointComplete(long checkpointId) {} + + @Override + public void resetToCheckpoint(long checkpointId, byte[] checkpointData) { + Preconditions.checkState( + !started, "The coordinator %s can only be reset if it was not yet started", operatorName); + if (checkpointData == null || checkpointData.length == 0) { + LOG.info( + "Data statistic coordinator {} has nothing to restore from checkpoint {}", + operatorName, + checkpointId); + return; + } + + LOG.info( + "Restoring data statistic coordinator {} from checkpoint {}", operatorName, checkpointId); + this.completedStatistics = + StatisticsUtil.deserializeCompletedStatistics( + checkpointData, completedStatisticsSerializer); + // recompute global statistics in case downstream parallelism changed + this.globalStatistics = + globalStatistics( + completedStatistics, downstreamParallelism, comparator, closeFileCostWeightPercentage); + } + + @Override + public void subtaskReset(int subtask, long checkpointId) { + runInCoordinatorThread( + () -> { + LOG.info( + "Operator {} subtask {} is reset to checkpoint {}", + operatorName, + subtask, + checkpointId); + Preconditions.checkState( + this.coordinatorThreadFactory.isCurrentThreadCoordinatorThread()); + subtaskGateways.reset(subtask); + }, + String.format("handling subtask %d recovery to checkpoint %d", subtask, checkpointId)); + } + + @Override + public void executionAttemptFailed(int subtask, int attemptNumber, @Nullable Throwable reason) { + runInCoordinatorThread( + () -> { + LOG.info( + "Unregistering gateway after failure for subtask {} (#{}) of data statistics {}", + subtask, + attemptNumber, + operatorName); + Preconditions.checkState( + this.coordinatorThreadFactory.isCurrentThreadCoordinatorThread()); + subtaskGateways.unregisterSubtaskGateway(subtask, attemptNumber); + }, + String.format("handling subtask %d (#%d) failure", subtask, attemptNumber)); + } + + @Override + public void executionAttemptReady(int subtask, int attemptNumber, SubtaskGateway gateway) { + Preconditions.checkArgument(subtask == gateway.getSubtask()); + Preconditions.checkArgument(attemptNumber == gateway.getExecution().getAttemptNumber()); + runInCoordinatorThread( + () -> { + Preconditions.checkState( + this.coordinatorThreadFactory.isCurrentThreadCoordinatorThread()); + subtaskGateways.registerSubtaskGateway(gateway); + }, + String.format( + "making event gateway to subtask %d (#%d) available", subtask, attemptNumber)); + } + + @VisibleForTesting + CompletedStatistics completedStatistics() { + return completedStatistics; + } + + @VisibleForTesting + GlobalStatistics globalStatistics() { + return globalStatistics; + } + + private static class SubtaskGateways { + private final String operatorName; + private final Map[] gateways; + + @SuppressWarnings("unchecked") + private SubtaskGateways(String operatorName, int parallelism) { + this.operatorName = operatorName; + gateways = new Map[parallelism]; + + for (int i = 0; i < parallelism; ++i) { + gateways[i] = Maps.newHashMap(); + } + } + + private void registerSubtaskGateway(OperatorCoordinator.SubtaskGateway gateway) { + int subtaskIndex = gateway.getSubtask(); + 
int attemptNumber = gateway.getExecution().getAttemptNumber(); + Preconditions.checkState( + !gateways[subtaskIndex].containsKey(attemptNumber), + "Coordinator of %s already has a subtask gateway for %d (#%d)", + operatorName, + subtaskIndex, + attemptNumber); + LOG.debug( + "Coordinator of {} registers gateway for subtask {} attempt {}", + operatorName, + subtaskIndex, + attemptNumber); + gateways[subtaskIndex].put(attemptNumber, gateway); + } + + private void unregisterSubtaskGateway(int subtaskIndex, int attemptNumber) { + LOG.debug( + "Coordinator of {} unregisters gateway for subtask {} attempt {}", + operatorName, + subtaskIndex, + attemptNumber); + gateways[subtaskIndex].remove(attemptNumber); + } + + private OperatorCoordinator.SubtaskGateway getSubtaskGateway(int subtaskIndex) { + Preconditions.checkState( + !gateways[subtaskIndex].isEmpty(), + "Coordinator of %s subtask %d is not ready yet to receive events", + operatorName, + subtaskIndex); + return Iterables.getOnlyElement(gateways[subtaskIndex].values()); + } + + private void reset(int subtaskIndex) { + gateways[subtaskIndex].clear(); + } + } + + private static class CoordinatorExecutorThreadFactory + implements ThreadFactory, Thread.UncaughtExceptionHandler { + + private final String coordinatorThreadName; + private final ClassLoader classLoader; + private final Thread.UncaughtExceptionHandler errorHandler; + + @javax.annotation.Nullable private Thread thread; + + CoordinatorExecutorThreadFactory( + final String coordinatorThreadName, final ClassLoader contextClassLoader) { + this(coordinatorThreadName, contextClassLoader, FatalExitExceptionHandler.INSTANCE); + } + + @org.apache.flink.annotation.VisibleForTesting + CoordinatorExecutorThreadFactory( + final String coordinatorThreadName, + final ClassLoader contextClassLoader, + final Thread.UncaughtExceptionHandler errorHandler) { + this.coordinatorThreadName = coordinatorThreadName; + this.classLoader = contextClassLoader; + this.errorHandler = errorHandler; + } + + @Override + public synchronized Thread newThread(@NotNull Runnable runnable) { + thread = new Thread(runnable, coordinatorThreadName); + thread.setContextClassLoader(classLoader); + thread.setUncaughtExceptionHandler(this); + return thread; + } + + @Override + public synchronized void uncaughtException(Thread t, Throwable e) { + errorHandler.uncaughtException(t, e); + } + + boolean isCurrentThreadCoordinatorThread() { + return Thread.currentThread() == thread; + } + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsCoordinatorProvider.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsCoordinatorProvider.java new file mode 100644 index 000000000000..9d7d989c298e --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsCoordinatorProvider.java @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.shuffle; + +import org.apache.flink.annotation.Internal; +import org.apache.flink.runtime.jobgraph.OperatorID; +import org.apache.flink.runtime.operators.coordination.OperatorCoordinator; +import org.apache.flink.runtime.operators.coordination.RecreateOnResetOperatorCoordinator; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SortOrder; + +/** + * DataStatisticsCoordinatorProvider provides the method to create new {@link + * DataStatisticsCoordinator} + */ +@Internal +public class DataStatisticsCoordinatorProvider extends RecreateOnResetOperatorCoordinator.Provider { + + private final String operatorName; + private final Schema schema; + private final SortOrder sortOrder; + private final int downstreamParallelism; + private final StatisticsType type; + private final double closeFileCostWeightPercentage; + + public DataStatisticsCoordinatorProvider( + String operatorName, + OperatorID operatorID, + Schema schema, + SortOrder sortOrder, + int downstreamParallelism, + StatisticsType type, + double closeFileCostWeightPercentage) { + super(operatorID); + this.operatorName = operatorName; + this.schema = schema; + this.sortOrder = sortOrder; + this.downstreamParallelism = downstreamParallelism; + this.type = type; + this.closeFileCostWeightPercentage = closeFileCostWeightPercentage; + } + + @Override + public OperatorCoordinator getCoordinator(OperatorCoordinator.Context context) { + return new DataStatisticsCoordinator( + operatorName, + context, + schema, + sortOrder, + downstreamParallelism, + type, + closeFileCostWeightPercentage); + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsOperator.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsOperator.java new file mode 100644 index 000000000000..7995a8a5b181 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsOperator.java @@ -0,0 +1,266 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink.shuffle; + +import java.util.Map; +import org.apache.flink.annotation.Internal; +import org.apache.flink.api.common.state.ListState; +import org.apache.flink.api.common.state.ListStateDescriptor; +import org.apache.flink.api.common.typeutils.TypeSerializer; +import org.apache.flink.runtime.operators.coordination.OperatorEvent; +import org.apache.flink.runtime.operators.coordination.OperatorEventGateway; +import org.apache.flink.runtime.operators.coordination.OperatorEventHandler; +import org.apache.flink.runtime.state.StateInitializationContext; +import org.apache.flink.runtime.state.StateSnapshotContext; +import org.apache.flink.streaming.api.operators.AbstractStreamOperator; +import org.apache.flink.streaming.api.operators.OneInputStreamOperator; +import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; +import org.apache.flink.table.data.RowData; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SortKey; +import org.apache.iceberg.SortOrder; +import org.apache.iceberg.StructLike; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.RowDataWrapper; +import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; + +/** + * DataStatisticsOperator collects traffic distribution statistics. A custom partitioner shall be + * attached to the DataStatisticsOperator output. The custom partitioner leverages the statistics to + * shuffle record to improve data clustering while maintaining relative balanced traffic + * distribution to downstream subtasks. + */ +@Internal +public class DataStatisticsOperator extends AbstractStreamOperator + implements OneInputStreamOperator, OperatorEventHandler { + + private static final long serialVersionUID = 1L; + + private final String operatorName; + private final RowDataWrapper rowDataWrapper; + private final SortKey sortKey; + private final OperatorEventGateway operatorEventGateway; + private final int downstreamParallelism; + private final StatisticsType statisticsType; + private final TypeSerializer taskStatisticsSerializer; + private final TypeSerializer globalStatisticsSerializer; + + private transient int parallelism; + private transient int subtaskIndex; + private transient ListState globalStatisticsState; + // current statistics type may be different from the config due to possible + // migration from Map statistics to Sketch statistics when high cardinality detected + private transient volatile StatisticsType taskStatisticsType; + private transient volatile DataStatistics localStatistics; + private transient volatile GlobalStatistics globalStatistics; + + DataStatisticsOperator( + String operatorName, + Schema schema, + SortOrder sortOrder, + OperatorEventGateway operatorEventGateway, + int downstreamParallelism, + StatisticsType statisticsType) { + this.operatorName = operatorName; + this.rowDataWrapper = new RowDataWrapper(FlinkSchemaUtil.convert(schema), schema.asStruct()); + this.sortKey = new SortKey(schema, sortOrder); + this.operatorEventGateway = operatorEventGateway; + this.downstreamParallelism = downstreamParallelism; + this.statisticsType = statisticsType; + + SortKeySerializer sortKeySerializer = new SortKeySerializer(schema, sortOrder); + this.taskStatisticsSerializer = new DataStatisticsSerializer(sortKeySerializer); + this.globalStatisticsSerializer = new GlobalStatisticsSerializer(sortKeySerializer); + } + + @Override + public void 
initializeState(StateInitializationContext context) throws Exception { + this.parallelism = getRuntimeContext().getTaskInfo().getNumberOfParallelSubtasks(); + this.subtaskIndex = getRuntimeContext().getTaskInfo().getIndexOfThisSubtask(); + + // Use union state so that new subtasks can also restore global statistics during scale-up. + this.globalStatisticsState = + context + .getOperatorStateStore() + .getUnionListState( + new ListStateDescriptor<>("globalStatisticsState", globalStatisticsSerializer)); + + if (context.isRestored()) { + if (globalStatisticsState.get() == null + || !globalStatisticsState.get().iterator().hasNext()) { + LOG.info( + "Operator {} subtask {} doesn't have global statistics state to restore", + operatorName, + subtaskIndex); + // If Flink deprecates union state in the future, RequestGlobalStatisticsEvent can be + // leveraged to request global statistics from coordinator if new subtasks (scale-up case) + // has nothing to restore from. + } else { + GlobalStatistics restoredStatistics = globalStatisticsState.get().iterator().next(); + LOG.info( + "Operator {} subtask {} restored global statistics state", operatorName, subtaskIndex); + this.globalStatistics = restoredStatistics; + } + + // Always request for new statistics from coordinator upon task initialization. + // There are a few scenarios this is needed + // 1. downstream writer parallelism changed due to rescale. + // 2. coordinator failed to send the aggregated statistics to subtask + // (e.g. due to subtask failure at the time). + // Records may flow before coordinator can respond. Range partitioner should be + // able to continue to operate with potentially suboptimal behavior (in sketch case). + LOG.info( + "Operator {} subtask {} requests new global statistics from coordinator ", + operatorName, + subtaskIndex); + // coordinator can use the hashCode (if available) in the request event to determine + // if operator already has the latest global statistics and respond can be skipped. + // This makes the handling cheap in most situations. + RequestGlobalStatisticsEvent event = + globalStatistics != null + ? 
new RequestGlobalStatisticsEvent(globalStatistics.hashCode()) + : new RequestGlobalStatisticsEvent(); + operatorEventGateway.sendEventToCoordinator(event); + } + + this.taskStatisticsType = StatisticsUtil.collectType(statisticsType, globalStatistics); + this.localStatistics = + StatisticsUtil.createTaskStatistics(taskStatisticsType, parallelism, downstreamParallelism); + } + + @Override + public void open() throws Exception { + if (globalStatistics != null) { + output.collect(new StreamRecord<>(StatisticsOrRecord.fromStatistics(globalStatistics))); + } + } + + @Override + public void handleOperatorEvent(OperatorEvent event) { + Preconditions.checkArgument( + event instanceof StatisticsEvent, + String.format( + "Operator %s subtask %s received unexpected operator event %s", + operatorName, subtaskIndex, event.getClass())); + StatisticsEvent statisticsEvent = (StatisticsEvent) event; + LOG.info( + "Operator {} subtask {} received global data event from coordinator checkpoint {}", + operatorName, + subtaskIndex, + statisticsEvent.checkpointId()); + this.globalStatistics = + StatisticsUtil.deserializeGlobalStatistics( + statisticsEvent.statisticsBytes(), globalStatisticsSerializer); + checkStatisticsTypeMigration(); + // if applyImmediately not set, wait until the checkpoint time to switch + if (statisticsEvent.applyImmediately()) { + output.collect(new StreamRecord<>(StatisticsOrRecord.fromStatistics(globalStatistics))); + } + } + + @Override + public void processElement(StreamRecord streamRecord) { + // collect data statistics + RowData record = streamRecord.getValue(); + StructLike struct = rowDataWrapper.wrap(record); + sortKey.wrap(struct); + localStatistics.add(sortKey); + + checkStatisticsTypeMigration(); + output.collect(new StreamRecord<>(StatisticsOrRecord.fromRecord(record))); + } + + @Override + public void snapshotState(StateSnapshotContext context) throws Exception { + long checkpointId = context.getCheckpointId(); + LOG.info( + "Operator {} subtask {} snapshotting data statistics for checkpoint {}", + operatorName, + subtaskIndex, + checkpointId); + + // Pass global statistics to partitioner so that all the operators refresh statistics + // at same checkpoint barrier + if (globalStatistics != null) { + output.collect(new StreamRecord<>(StatisticsOrRecord.fromStatistics(globalStatistics))); + } + + // Only subtask 0 saves the state so that globalStatisticsState(UnionListState) stores + // an exact copy of globalStatistics + if (globalStatistics != null + && getRuntimeContext().getTaskInfo().getIndexOfThisSubtask() == 0) { + globalStatisticsState.clear(); + LOG.info( + "Operator {} subtask {} saving global statistics to state", operatorName, subtaskIndex); + globalStatisticsState.add(globalStatistics); + LOG.debug( + "Operator {} subtask {} saved global statistics to state: {}", + operatorName, + subtaskIndex, + globalStatistics); + } + + // For now, local statistics are sent to coordinator at checkpoint + LOG.info( + "Operator {} Subtask {} sending local statistics to coordinator for checkpoint {}", + operatorName, + subtaskIndex, + checkpointId); + operatorEventGateway.sendEventToCoordinator( + StatisticsEvent.createTaskStatisticsEvent( + checkpointId, localStatistics, taskStatisticsSerializer)); + + // Recreate the local statistics + localStatistics = + StatisticsUtil.createTaskStatistics(taskStatisticsType, parallelism, downstreamParallelism); + } + + @SuppressWarnings("unchecked") + private void checkStatisticsTypeMigration() { + // only check if the statisticsType config is 
Auto and localStatistics is currently Map type + if (statisticsType == StatisticsType.Auto && localStatistics.type() == StatisticsType.Map) { + Map mapStatistics = (Map) localStatistics.result(); + // convert if local statistics has cardinality over the threshold or + // if received global statistics is already sketch type + if (mapStatistics.size() > SketchUtil.OPERATOR_SKETCH_SWITCH_THRESHOLD + || (globalStatistics != null && globalStatistics.type() == StatisticsType.Sketch)) { + LOG.info( + "Operator {} subtask {} switched local statistics from Map to Sketch.", + operatorName, + subtaskIndex); + this.taskStatisticsType = StatisticsType.Sketch; + this.localStatistics = + StatisticsUtil.createTaskStatistics( + taskStatisticsType, parallelism, downstreamParallelism); + SketchUtil.convertMapToSketch(mapStatistics, localStatistics::add); + } + } + } + + @VisibleForTesting + DataStatistics localStatistics() { + return localStatistics; + } + + @VisibleForTesting + GlobalStatistics globalStatistics() { + return globalStatistics; + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsSerializer.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsSerializer.java new file mode 100644 index 000000000000..8ce99073836d --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsSerializer.java @@ -0,0 +1,204 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink.shuffle; + +import java.io.IOException; +import java.util.Map; +import java.util.Objects; +import org.apache.datasketches.memory.Memory; +import org.apache.datasketches.sampling.ReservoirItemsSketch; +import org.apache.flink.annotation.Internal; +import org.apache.flink.api.common.typeutils.CompositeTypeSerializerSnapshot; +import org.apache.flink.api.common.typeutils.TypeSerializer; +import org.apache.flink.api.common.typeutils.TypeSerializerSnapshot; +import org.apache.flink.api.common.typeutils.base.EnumSerializer; +import org.apache.flink.api.common.typeutils.base.LongSerializer; +import org.apache.flink.api.common.typeutils.base.MapSerializer; +import org.apache.flink.core.memory.DataInputView; +import org.apache.flink.core.memory.DataOutputView; +import org.apache.iceberg.SortKey; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; + +@Internal +class DataStatisticsSerializer extends TypeSerializer { + private final TypeSerializer sortKeySerializer; + private final EnumSerializer statisticsTypeSerializer; + private final MapSerializer mapSerializer; + private final SortKeySketchSerializer sketchSerializer; + + DataStatisticsSerializer(TypeSerializer sortKeySerializer) { + this.sortKeySerializer = sortKeySerializer; + this.statisticsTypeSerializer = new EnumSerializer<>(StatisticsType.class); + this.mapSerializer = new MapSerializer<>(sortKeySerializer, LongSerializer.INSTANCE); + this.sketchSerializer = new SortKeySketchSerializer(sortKeySerializer); + } + + @Override + public boolean isImmutableType() { + return false; + } + + @SuppressWarnings("ReferenceEquality") + @Override + public TypeSerializer duplicate() { + TypeSerializer duplicateSortKeySerializer = sortKeySerializer.duplicate(); + return (duplicateSortKeySerializer == sortKeySerializer) + ? this + : new DataStatisticsSerializer(duplicateSortKeySerializer); + } + + @Override + public DataStatistics createInstance() { + return new MapDataStatistics(); + } + + @SuppressWarnings("unchecked") + @Override + public DataStatistics copy(DataStatistics obj) { + StatisticsType statisticsType = obj.type(); + if (statisticsType == StatisticsType.Map) { + MapDataStatistics from = (MapDataStatistics) obj; + Map fromStats = (Map) from.result(); + Map toStats = Maps.newHashMap(fromStats); + return new MapDataStatistics(toStats); + } else if (statisticsType == StatisticsType.Sketch) { + // because ReservoirItemsSketch doesn't expose enough public methods for cloning, + // this implementation adopted the less efficient serialization and deserialization. 
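+      // Copy by round-tripping: serialize the sketch to bytes with the sort key serde, then
+      // heapify a fresh sketch instance from those same bytes.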
+ SketchDataStatistics from = (SketchDataStatistics) obj; + ReservoirItemsSketch fromStats = (ReservoirItemsSketch) from.result(); + byte[] bytes = fromStats.toByteArray(sketchSerializer); + Memory memory = Memory.wrap(bytes); + ReservoirItemsSketch toStats = + ReservoirItemsSketch.heapify(memory, sketchSerializer); + return new SketchDataStatistics(toStats); + } else { + throw new IllegalArgumentException("Unsupported data statistics type: " + statisticsType); + } + } + + @Override + public DataStatistics copy(DataStatistics from, DataStatistics reuse) { + // not much benefit to reuse + return copy(from); + } + + @Override + public int getLength() { + return -1; + } + + @SuppressWarnings("unchecked") + @Override + public void serialize(DataStatistics obj, DataOutputView target) throws IOException { + StatisticsType statisticsType = obj.type(); + statisticsTypeSerializer.serialize(obj.type(), target); + if (statisticsType == StatisticsType.Map) { + Map mapStatistics = (Map) obj.result(); + mapSerializer.serialize(mapStatistics, target); + } else if (statisticsType == StatisticsType.Sketch) { + ReservoirItemsSketch sketch = (ReservoirItemsSketch) obj.result(); + byte[] sketchBytes = sketch.toByteArray(sketchSerializer); + target.writeInt(sketchBytes.length); + target.write(sketchBytes); + } else { + throw new IllegalArgumentException("Unsupported data statistics type: " + statisticsType); + } + } + + @Override + public DataStatistics deserialize(DataInputView source) throws IOException { + StatisticsType statisticsType = statisticsTypeSerializer.deserialize(source); + if (statisticsType == StatisticsType.Map) { + Map mapStatistics = mapSerializer.deserialize(source); + return new MapDataStatistics(mapStatistics); + } else if (statisticsType == StatisticsType.Sketch) { + int numBytes = source.readInt(); + byte[] sketchBytes = new byte[numBytes]; + source.read(sketchBytes); + Memory sketchMemory = Memory.wrap(sketchBytes); + ReservoirItemsSketch sketch = + ReservoirItemsSketch.heapify(sketchMemory, sketchSerializer); + return new SketchDataStatistics(sketch); + } else { + throw new IllegalArgumentException("Unsupported data statistics type: " + statisticsType); + } + } + + @Override + public DataStatistics deserialize(DataStatistics reuse, DataInputView source) throws IOException { + // not much benefit to reuse + return deserialize(source); + } + + @Override + public void copy(DataInputView source, DataOutputView target) throws IOException { + serialize(deserialize(source), target); + } + + @Override + public boolean equals(Object obj) { + if (!(obj instanceof DataStatisticsSerializer)) { + return false; + } + + DataStatisticsSerializer other = (DataStatisticsSerializer) obj; + return Objects.equals(sortKeySerializer, other.sortKeySerializer); + } + + @Override + public int hashCode() { + return sortKeySerializer.hashCode(); + } + + @Override + public TypeSerializerSnapshot snapshotConfiguration() { + return new DataStatisticsSerializerSnapshot(this); + } + + public static class DataStatisticsSerializerSnapshot + extends CompositeTypeSerializerSnapshot { + private static final int CURRENT_VERSION = 1; + + /** Constructor for read instantiation. 
*/ + @SuppressWarnings({"unused", "checkstyle:RedundantModifier"}) + public DataStatisticsSerializerSnapshot() {} + + @SuppressWarnings("checkstyle:RedundantModifier") + public DataStatisticsSerializerSnapshot(DataStatisticsSerializer serializer) { + super(serializer); + } + + @Override + protected int getCurrentOuterSnapshotVersion() { + return CURRENT_VERSION; + } + + @Override + protected TypeSerializer[] getNestedSerializers(DataStatisticsSerializer outerSerializer) { + return new TypeSerializer[] {outerSerializer.sortKeySerializer}; + } + + @Override + protected DataStatisticsSerializer createOuterSerializerWithNestedSerializers( + TypeSerializer[] nestedSerializers) { + SortKeySerializer sortKeySerializer = (SortKeySerializer) nestedSerializers[0]; + return new DataStatisticsSerializer(sortKeySerializer); + } + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/GlobalStatistics.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/GlobalStatistics.java new file mode 100644 index 000000000000..50ec23e9f7a2 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/GlobalStatistics.java @@ -0,0 +1,114 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.shuffle; + +import java.util.Arrays; +import org.apache.iceberg.SortKey; +import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; +import org.apache.iceberg.relocated.com.google.common.base.Objects; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; + +/** + * This is used by {@link RangePartitioner} for guiding range partitioning. This is what is sent to + * the operator subtasks. For sketch statistics, it only contains much smaller range bounds than the + * complete raw samples. 
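+ *
+ * Exactly one of the map assignment and the range bounds is set, depending on whether the
+ * statistics type is Map or Sketch (enforced by the constructor precondition).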
+ */ +class GlobalStatistics { + private final long checkpointId; + private final StatisticsType type; + private final MapAssignment mapAssignment; + private final SortKey[] rangeBounds; + + private transient Integer hashCode; + + GlobalStatistics( + long checkpointId, StatisticsType type, MapAssignment mapAssignment, SortKey[] rangeBounds) { + Preconditions.checkArgument( + (mapAssignment != null && rangeBounds == null) + || (mapAssignment == null && rangeBounds != null), + "Invalid key assignment or range bounds: both are non-null or null"); + this.checkpointId = checkpointId; + this.type = type; + this.mapAssignment = mapAssignment; + this.rangeBounds = rangeBounds; + } + + static GlobalStatistics fromMapAssignment(long checkpointId, MapAssignment mapAssignment) { + return new GlobalStatistics(checkpointId, StatisticsType.Map, mapAssignment, null); + } + + static GlobalStatistics fromRangeBounds(long checkpointId, SortKey[] rangeBounds) { + return new GlobalStatistics(checkpointId, StatisticsType.Sketch, null, rangeBounds); + } + + @Override + public String toString() { + return MoreObjects.toStringHelper(this) + .add("checkpointId", checkpointId) + .add("type", type) + .add("mapAssignment", mapAssignment) + .add("rangeBounds", rangeBounds) + .toString(); + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + + if (!(o instanceof GlobalStatistics)) { + return false; + } + + GlobalStatistics other = (GlobalStatistics) o; + return Objects.equal(checkpointId, other.checkpointId) + && Objects.equal(type, other.type) + && Objects.equal(mapAssignment, other.mapAssignment()) + && Arrays.equals(rangeBounds, other.rangeBounds()); + } + + @Override + public int hashCode() { + // implemented caching because coordinator can call the hashCode many times. + // when subtasks request statistics refresh upon initialization for reconciliation purpose, + // hashCode is used to check if there is any difference btw coordinator and operator state. + if (hashCode == null) { + this.hashCode = Objects.hashCode(checkpointId, type, mapAssignment, rangeBounds); + } + + return hashCode; + } + + long checkpointId() { + return checkpointId; + } + + StatisticsType type() { + return type; + } + + MapAssignment mapAssignment() { + return mapAssignment; + } + + SortKey[] rangeBounds() { + return rangeBounds; + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/GlobalStatisticsSerializer.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/GlobalStatisticsSerializer.java new file mode 100644 index 000000000000..a7fe2b30b865 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/GlobalStatisticsSerializer.java @@ -0,0 +1,199 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.shuffle; + +import java.io.IOException; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import org.apache.flink.api.common.typeutils.CompositeTypeSerializerSnapshot; +import org.apache.flink.api.common.typeutils.TypeSerializer; +import org.apache.flink.api.common.typeutils.TypeSerializerSnapshot; +import org.apache.flink.api.common.typeutils.base.EnumSerializer; +import org.apache.flink.api.common.typeutils.base.IntSerializer; +import org.apache.flink.api.common.typeutils.base.ListSerializer; +import org.apache.flink.api.common.typeutils.base.LongSerializer; +import org.apache.flink.core.memory.DataInputView; +import org.apache.flink.core.memory.DataOutputView; +import org.apache.iceberg.SortKey; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; + +class GlobalStatisticsSerializer extends TypeSerializer { + private final TypeSerializer sortKeySerializer; + private final EnumSerializer statisticsTypeSerializer; + private final ListSerializer rangeBoundsSerializer; + private final ListSerializer intsSerializer; + private final ListSerializer longsSerializer; + + GlobalStatisticsSerializer(TypeSerializer sortKeySerializer) { + this.sortKeySerializer = sortKeySerializer; + this.statisticsTypeSerializer = new EnumSerializer<>(StatisticsType.class); + this.rangeBoundsSerializer = new ListSerializer<>(sortKeySerializer); + this.intsSerializer = new ListSerializer<>(IntSerializer.INSTANCE); + this.longsSerializer = new ListSerializer<>(LongSerializer.INSTANCE); + } + + @Override + public boolean isImmutableType() { + return false; + } + + @Override + public TypeSerializer duplicate() { + return new GlobalStatisticsSerializer(sortKeySerializer); + } + + @Override + public GlobalStatistics createInstance() { + return GlobalStatistics.fromRangeBounds(0L, new SortKey[0]); + } + + @Override + public GlobalStatistics copy(GlobalStatistics from) { + return new GlobalStatistics( + from.checkpointId(), from.type(), from.mapAssignment(), from.rangeBounds()); + } + + @Override + public GlobalStatistics copy(GlobalStatistics from, GlobalStatistics reuse) { + // no benefit of reuse + return copy(from); + } + + @Override + public int getLength() { + return -1; + } + + @Override + public void serialize(GlobalStatistics record, DataOutputView target) throws IOException { + target.writeLong(record.checkpointId()); + statisticsTypeSerializer.serialize(record.type(), target); + if (record.type() == StatisticsType.Map) { + MapAssignment mapAssignment = record.mapAssignment(); + target.writeInt(mapAssignment.numPartitions()); + target.writeInt(mapAssignment.keyAssignments().size()); + for (Map.Entry entry : mapAssignment.keyAssignments().entrySet()) { + sortKeySerializer.serialize(entry.getKey(), target); + KeyAssignment keyAssignment = entry.getValue(); + intsSerializer.serialize(keyAssignment.assignedSubtasks(), target); + longsSerializer.serialize(keyAssignment.subtaskWeightsWithCloseFileCost(), target); + target.writeLong(keyAssignment.closeFileCostWeight()); + } + } else { + rangeBoundsSerializer.serialize(Arrays.asList(record.rangeBounds()), target); + } + } + + @Override + public GlobalStatistics deserialize(DataInputView source) throws IOException { + long checkpointId = source.readLong(); + StatisticsType type = statisticsTypeSerializer.deserialize(source); + if (type == StatisticsType.Map) 
{ + int numPartitions = source.readInt(); + int mapSize = source.readInt(); + Map keyAssignments = Maps.newHashMapWithExpectedSize(mapSize); + for (int i = 0; i < mapSize; ++i) { + SortKey sortKey = sortKeySerializer.deserialize(source); + List assignedSubtasks = intsSerializer.deserialize(source); + List subtaskWeightsWithCloseFileCost = longsSerializer.deserialize(source); + long closeFileCostWeight = source.readLong(); + keyAssignments.put( + sortKey, + new KeyAssignment( + assignedSubtasks, subtaskWeightsWithCloseFileCost, closeFileCostWeight)); + } + + return GlobalStatistics.fromMapAssignment( + checkpointId, new MapAssignment(numPartitions, keyAssignments)); + } else { + List sortKeys = rangeBoundsSerializer.deserialize(source); + SortKey[] rangeBounds = new SortKey[sortKeys.size()]; + return GlobalStatistics.fromRangeBounds(checkpointId, sortKeys.toArray(rangeBounds)); + } + } + + @Override + public GlobalStatistics deserialize(GlobalStatistics reuse, DataInputView source) + throws IOException { + // not much benefit to reuse + return deserialize(source); + } + + @Override + public void copy(DataInputView source, DataOutputView target) throws IOException { + serialize(deserialize(source), target); + } + + @Override + public boolean equals(Object obj) { + if (this == obj) { + return true; + } + + if (obj == null || getClass() != obj.getClass()) { + return false; + } + + GlobalStatisticsSerializer other = (GlobalStatisticsSerializer) obj; + return Objects.equals(sortKeySerializer, other.sortKeySerializer); + } + + @Override + public int hashCode() { + return sortKeySerializer.hashCode(); + } + + @Override + public TypeSerializerSnapshot snapshotConfiguration() { + return new GlobalStatisticsSerializerSnapshot(this); + } + + public static class GlobalStatisticsSerializerSnapshot + extends CompositeTypeSerializerSnapshot { + private static final int CURRENT_VERSION = 1; + + /** Constructor for read instantiation. */ + @SuppressWarnings({"unused", "checkstyle:RedundantModifier"}) + public GlobalStatisticsSerializerSnapshot() {} + + @SuppressWarnings("checkstyle:RedundantModifier") + public GlobalStatisticsSerializerSnapshot(GlobalStatisticsSerializer serializer) { + super(serializer); + } + + @Override + protected int getCurrentOuterSnapshotVersion() { + return CURRENT_VERSION; + } + + @Override + protected TypeSerializer[] getNestedSerializers(GlobalStatisticsSerializer outerSerializer) { + return new TypeSerializer[] {outerSerializer.sortKeySerializer}; + } + + @Override + protected GlobalStatisticsSerializer createOuterSerializerWithNestedSerializers( + TypeSerializer[] nestedSerializers) { + SortKeySerializer sortKeySerializer = (SortKeySerializer) nestedSerializers[0]; + return new GlobalStatisticsSerializer(sortKeySerializer); + } + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/KeyAssignment.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/KeyAssignment.java new file mode 100644 index 000000000000..781bcc646023 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/KeyAssignment.java @@ -0,0 +1,155 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.shuffle; + +import java.util.Arrays; +import java.util.List; +import java.util.Objects; +import java.util.concurrent.ThreadLocalRandom; +import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; + +/** Subtask assignment for a key for Map statistics based */ +class KeyAssignment { + private final List assignedSubtasks; + private final List subtaskWeightsWithCloseFileCost; + private final long closeFileCostWeight; + private final long[] subtaskWeightsExcludingCloseCost; + private final long keyWeight; + private final long[] cumulativeWeights; + + /** + * @param assignedSubtasks assigned subtasks for this key. It could be a single subtask. It could + * also be multiple subtasks if the key has heavy weight that should be handled by multiple + * subtasks. + * @param subtaskWeightsWithCloseFileCost assigned weight for each subtask. E.g., if the keyWeight + * is 27 and the key is assigned to 3 subtasks, subtaskWeights could contain values as [10, + * 10, 7] for target weight of 10 per subtask. + */ + KeyAssignment( + List assignedSubtasks, + List subtaskWeightsWithCloseFileCost, + long closeFileCostWeight) { + Preconditions.checkArgument( + assignedSubtasks != null && !assignedSubtasks.isEmpty(), + "Invalid assigned subtasks: null or empty"); + Preconditions.checkArgument( + subtaskWeightsWithCloseFileCost != null && !subtaskWeightsWithCloseFileCost.isEmpty(), + "Invalid assigned subtasks weights: null or empty"); + Preconditions.checkArgument( + assignedSubtasks.size() == subtaskWeightsWithCloseFileCost.size(), + "Invalid assignment: size mismatch (tasks length = %s, weights length = %s)", + assignedSubtasks.size(), + subtaskWeightsWithCloseFileCost.size()); + subtaskWeightsWithCloseFileCost.forEach( + weight -> + Preconditions.checkArgument( + weight > closeFileCostWeight, + "Invalid weight: should be larger than close file cost: weight = %s, close file cost = %s", + weight, + closeFileCostWeight)); + + this.assignedSubtasks = assignedSubtasks; + this.subtaskWeightsWithCloseFileCost = subtaskWeightsWithCloseFileCost; + this.closeFileCostWeight = closeFileCostWeight; + // Exclude the close file cost for key routing + this.subtaskWeightsExcludingCloseCost = + subtaskWeightsWithCloseFileCost.stream() + .mapToLong(weightWithCloseFileCost -> weightWithCloseFileCost - closeFileCostWeight) + .toArray(); + this.keyWeight = Arrays.stream(subtaskWeightsExcludingCloseCost).sum(); + this.cumulativeWeights = new long[subtaskWeightsExcludingCloseCost.length]; + long cumulativeWeight = 0; + for (int i = 0; i < subtaskWeightsExcludingCloseCost.length; ++i) { + cumulativeWeight += subtaskWeightsExcludingCloseCost[i]; + cumulativeWeights[i] = cumulativeWeight; + } + } + + List assignedSubtasks() { + return assignedSubtasks; + } + + List subtaskWeightsWithCloseFileCost() { + return 
subtaskWeightsWithCloseFileCost; + } + + long closeFileCostWeight() { + return closeFileCostWeight; + } + + long[] subtaskWeightsExcludingCloseCost() { + return subtaskWeightsExcludingCloseCost; + } + + /** + * Select a subtask for the key. + * + * @return subtask id + */ + int select() { + if (assignedSubtasks.size() == 1) { + // only choice. no need to run random number generator. + return assignedSubtasks.get(0); + } else { + long randomNumber = ThreadLocalRandom.current().nextLong(keyWeight); + int index = Arrays.binarySearch(cumulativeWeights, randomNumber); + // choose the subtask where randomNumber < cumulativeWeights[pos]. + // this works regardless whether index is negative or not. + int position = Math.abs(index + 1); + Preconditions.checkState( + position < assignedSubtasks.size(), + "Invalid selected position: out of range. key weight = %s, random number = %s, cumulative weights array = %s", + keyWeight, + randomNumber, + cumulativeWeights); + return assignedSubtasks.get(position); + } + } + + @Override + public int hashCode() { + return Objects.hash(assignedSubtasks, subtaskWeightsWithCloseFileCost, closeFileCostWeight); + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + + if (o == null || getClass() != o.getClass()) { + return false; + } + + KeyAssignment that = (KeyAssignment) o; + return Objects.equals(assignedSubtasks, that.assignedSubtasks) + && Objects.equals(subtaskWeightsWithCloseFileCost, that.subtaskWeightsWithCloseFileCost) + && closeFileCostWeight == that.closeFileCostWeight; + } + + @Override + public String toString() { + return MoreObjects.toStringHelper(this) + .add("assignedSubtasks", assignedSubtasks) + .add("subtaskWeightsWithCloseFileCost", subtaskWeightsWithCloseFileCost) + .add("closeFileCostWeight", closeFileCostWeight) + .toString(); + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapAssignment.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapAssignment.java new file mode 100644 index 000000000000..9d8167460a1b --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapAssignment.java @@ -0,0 +1,242 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
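For illustration only (this sketch is not part of the patch), the cumulative-weight lookup that KeyAssignment#select() performs can be demonstrated standalone. Class name and the example weights [10, 10, 7] are hypothetical:

import java.util.Arrays;
import java.util.concurrent.ThreadLocalRandom;

public class WeightedSelectSketch {
  public static void main(String[] args) {
    long[] weights = {10L, 10L, 7L};          // hypothetical per-subtask weights, excluding close-file cost
    long[] cumulative = new long[weights.length];
    long running = 0L;
    for (int i = 0; i < weights.length; i++) {
      running += weights[i];
      cumulative[i] = running;                // [10, 20, 27]
    }

    long keyWeight = running;                 // 27
    long[] hits = new long[weights.length];
    for (int trial = 0; trial < 1_000_000; trial++) {
      long r = ThreadLocalRandom.current().nextLong(keyWeight);
      int index = Arrays.binarySearch(cumulative, r);
      int position = Math.abs(index + 1);     // first slot where r < cumulative[position]
      hits[position]++;
    }

    // hit counts come out roughly proportional to 10 : 10 : 7
    System.out.println(Arrays.toString(hits));
  }
}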
+ */ +package org.apache.iceberg.flink.sink.shuffle; + +import java.util.Comparator; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.NavigableMap; +import org.apache.iceberg.SortKey; +import org.apache.iceberg.StructLike; +import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; +import org.apache.iceberg.relocated.com.google.common.base.Objects; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.util.Pair; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** Key assignment to subtasks for Map statistics. */ +class MapAssignment { + private static final Logger LOG = LoggerFactory.getLogger(MapAssignment.class); + + private final int numPartitions; + private final Map keyAssignments; + + MapAssignment(int numPartitions, Map keyAssignments) { + Preconditions.checkArgument(keyAssignments != null, "Invalid key assignments: null"); + this.numPartitions = numPartitions; + this.keyAssignments = keyAssignments; + } + + static MapAssignment fromKeyFrequency( + int numPartitions, + Map mapStatistics, + double closeFileCostWeightPercentage, + Comparator comparator) { + return new MapAssignment( + numPartitions, + assignment(numPartitions, mapStatistics, closeFileCostWeightPercentage, comparator)); + } + + @Override + public int hashCode() { + return Objects.hashCode(numPartitions, keyAssignments); + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + + if (o == null || getClass() != o.getClass()) { + return false; + } + + MapAssignment that = (MapAssignment) o; + return numPartitions == that.numPartitions && keyAssignments.equals(that.keyAssignments); + } + + @Override + public String toString() { + return MoreObjects.toStringHelper(this) + .add("numPartitions", numPartitions) + .add("keyAssignments", keyAssignments) + .toString(); + } + + int numPartitions() { + return numPartitions; + } + + Map keyAssignments() { + return keyAssignments; + } + + /** + * Returns assignment summary for every subtask. + * + * @return assignment summary for every subtask. Key is subtaskId. 
Value pair is (weight assigned + * to the subtask, number of keys assigned to the subtask) + */ + Map> assignmentInfo() { + Map> assignmentInfo = Maps.newTreeMap(); + keyAssignments.forEach( + (key, keyAssignment) -> { + for (int i = 0; i < keyAssignment.assignedSubtasks().size(); ++i) { + int subtaskId = keyAssignment.assignedSubtasks().get(i); + long subtaskWeight = keyAssignment.subtaskWeightsExcludingCloseCost()[i]; + Pair oldValue = assignmentInfo.getOrDefault(subtaskId, Pair.of(0L, 0)); + assignmentInfo.put( + subtaskId, Pair.of(oldValue.first() + subtaskWeight, oldValue.second() + 1)); + } + }); + + return assignmentInfo; + } + + static Map assignment( + int numPartitions, + Map mapStatistics, + double closeFileCostWeightPercentage, + Comparator comparator) { + mapStatistics.forEach( + (key, value) -> + Preconditions.checkArgument( + value > 0, "Invalid statistics: weight is 0 for key %s", key)); + + long totalWeight = mapStatistics.values().stream().mapToLong(l -> l).sum(); + double targetWeightPerSubtask = ((double) totalWeight) / numPartitions; + long closeFileCostWeight = + (long) Math.ceil(targetWeightPerSubtask * closeFileCostWeightPercentage / 100); + + NavigableMap sortedStatsWithCloseFileCost = Maps.newTreeMap(comparator); + mapStatistics.forEach( + (k, v) -> { + int estimatedSplits = (int) Math.ceil(v / targetWeightPerSubtask); + long estimatedCloseFileCost = closeFileCostWeight * estimatedSplits; + sortedStatsWithCloseFileCost.put(k, v + estimatedCloseFileCost); + }); + + long totalWeightWithCloseFileCost = + sortedStatsWithCloseFileCost.values().stream().mapToLong(l -> l).sum(); + long targetWeightPerSubtaskWithCloseFileCost = + (long) Math.ceil(((double) totalWeightWithCloseFileCost) / numPartitions); + return buildAssignment( + numPartitions, + sortedStatsWithCloseFileCost, + targetWeightPerSubtaskWithCloseFileCost, + closeFileCostWeight); + } + + private static Map buildAssignment( + int numPartitions, + NavigableMap sortedStatistics, + long targetWeightPerSubtask, + long closeFileCostWeight) { + Map assignmentMap = + Maps.newHashMapWithExpectedSize(sortedStatistics.size()); + Iterator mapKeyIterator = sortedStatistics.keySet().iterator(); + int subtaskId = 0; + SortKey currentKey = null; + long keyRemainingWeight = 0L; + long subtaskRemainingWeight = targetWeightPerSubtask; + List assignedSubtasks = Lists.newArrayList(); + List subtaskWeights = Lists.newArrayList(); + while (mapKeyIterator.hasNext() || currentKey != null) { + // This should never happen because target weight is calculated using ceil function. + if (subtaskId >= numPartitions) { + LOG.error( + "Internal algorithm error: exhausted subtasks with unassigned keys left. 
number of partitions: {}, " + + "target weight per subtask: {}, close file cost in weight: {}, data statistics: {}", + numPartitions, + targetWeightPerSubtask, + closeFileCostWeight, + sortedStatistics); + throw new IllegalStateException( + "Internal algorithm error: exhausted subtasks with unassigned keys left"); + } + + if (currentKey == null) { + currentKey = mapKeyIterator.next(); + keyRemainingWeight = sortedStatistics.get(currentKey); + } + + assignedSubtasks.add(subtaskId); + if (keyRemainingWeight < subtaskRemainingWeight) { + // assign the remaining weight of the key to the current subtask + subtaskWeights.add(keyRemainingWeight); + subtaskRemainingWeight -= keyRemainingWeight; + keyRemainingWeight = 0L; + } else { + // filled up the current subtask + long assignedWeight = subtaskRemainingWeight; + keyRemainingWeight -= subtaskRemainingWeight; + + // If assigned weight is less than close file cost, pad it up with close file cost. + // This might cause the subtask assigned weight over the target weight. + // But it should be no more than one close file cost. Small skew is acceptable. + if (assignedWeight <= closeFileCostWeight) { + long paddingWeight = Math.min(keyRemainingWeight, closeFileCostWeight); + keyRemainingWeight -= paddingWeight; + assignedWeight += paddingWeight; + } + + subtaskWeights.add(assignedWeight); + // move on to the next subtask + subtaskId += 1; + subtaskRemainingWeight = targetWeightPerSubtask; + } + + Preconditions.checkState( + assignedSubtasks.size() == subtaskWeights.size(), + "List size mismatch: assigned subtasks = %s, subtask weights = %s", + assignedSubtasks, + subtaskWeights); + + // If the remaining key weight is smaller than the close file cost, simply skip the residual + // as it doesn't make sense to assign a weight smaller than close file cost to a new subtask. + // this might lead to some inaccuracy in weight calculation. E.g., assuming the key weight is + // 2 and close file cost is 2. key weight with close cost is 4. Let's assume the previous + // task has a weight of 3 available. So weight of 3 for this key is assigned to the task and + // the residual weight of 1 is dropped. Then the routing weight for this key is 1 (minus the + // close file cost), which is inaccurate as the true key weight should be 2. + // Again, this greedy algorithm is not intended to be perfect. Some small inaccuracy is + // expected and acceptable. Traffic distribution should still be balanced. + if (keyRemainingWeight > 0 && keyRemainingWeight <= closeFileCostWeight) { + keyRemainingWeight = 0; + } + + if (keyRemainingWeight == 0) { + // finishing up the assignment for the current key + KeyAssignment keyAssignment = + new KeyAssignment(assignedSubtasks, subtaskWeights, closeFileCostWeight); + assignmentMap.put(currentKey, keyAssignment); + assignedSubtasks = Lists.newArrayList(); + subtaskWeights = Lists.newArrayList(); + currentKey = null; + } + } + + return assignmentMap; + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapDataStatistics.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapDataStatistics.java new file mode 100644 index 000000000000..05b943f6046f --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapDataStatistics.java @@ -0,0 +1,88 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.shuffle; + +import java.util.Map; +import org.apache.iceberg.SortKey; +import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; +import org.apache.iceberg.relocated.com.google.common.base.Objects; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; + +/** MapDataStatistics uses map to count key frequency */ +class MapDataStatistics implements DataStatistics { + private final Map keyFrequency; + + MapDataStatistics() { + this.keyFrequency = Maps.newHashMap(); + } + + MapDataStatistics(Map keyFrequency) { + this.keyFrequency = keyFrequency; + } + + @Override + public StatisticsType type() { + return StatisticsType.Map; + } + + @Override + public boolean isEmpty() { + return keyFrequency.isEmpty(); + } + + @Override + public void add(SortKey sortKey) { + if (keyFrequency.containsKey(sortKey)) { + keyFrequency.merge(sortKey, 1L, Long::sum); + } else { + // clone the sort key before adding to map because input sortKey object can be reused + SortKey copiedKey = sortKey.copy(); + keyFrequency.put(copiedKey, 1L); + } + } + + @Override + public Object result() { + return keyFrequency; + } + + @Override + public String toString() { + return MoreObjects.toStringHelper(this).add("map", keyFrequency).toString(); + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + + if (!(o instanceof MapDataStatistics)) { + return false; + } + + MapDataStatistics other = (MapDataStatistics) o; + return Objects.equal(keyFrequency, other.keyFrequency); + } + + @Override + public int hashCode() { + return Objects.hashCode(keyFrequency); + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapRangePartitioner.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapRangePartitioner.java new file mode 100644 index 000000000000..f36a078c94e0 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapRangePartitioner.java @@ -0,0 +1,95 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.shuffle; + +import java.util.concurrent.TimeUnit; +import org.apache.flink.api.common.functions.Partitioner; +import org.apache.flink.table.data.RowData; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SortKey; +import org.apache.iceberg.SortOrder; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.RowDataWrapper; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Internal partitioner implementation that supports MapDataStatistics, which is typically used for + * low-cardinality use cases. While MapDataStatistics can keep accurate counters, it can't be used + * for high-cardinality use cases. Otherwise, the memory footprint is too high. + * + *

It is a greedy algorithm for bin packing. With close file cost, the calculation isn't always + * precise when calculating close cost for every file, target weight per subtask, padding residual + * weight, assigned weight without close cost. + * + *

All actions should be executed in a single Flink mailbox thread. So there is no need to make + * it thread safe. + */ +class MapRangePartitioner implements Partitioner { + private static final Logger LOG = LoggerFactory.getLogger(MapRangePartitioner.class); + + private final RowDataWrapper rowDataWrapper; + private final SortKey sortKey; + private final MapAssignment mapAssignment; + + // Counter that tracks how many times a new key encountered + // where there is no traffic statistics learned about it. + private long newSortKeyCounter; + private long lastNewSortKeyLogTimeMilli; + + MapRangePartitioner(Schema schema, SortOrder sortOrder, MapAssignment mapAssignment) { + this.rowDataWrapper = new RowDataWrapper(FlinkSchemaUtil.convert(schema), schema.asStruct()); + this.sortKey = new SortKey(schema, sortOrder); + this.mapAssignment = mapAssignment; + this.newSortKeyCounter = 0; + this.lastNewSortKeyLogTimeMilli = System.currentTimeMillis(); + } + + @Override + public int partition(RowData row, int numPartitions) { + // reuse the sortKey and rowDataWrapper + sortKey.wrap(rowDataWrapper.wrap(row)); + KeyAssignment keyAssignment = mapAssignment.keyAssignments().get(sortKey); + + int partition; + if (keyAssignment == null) { + LOG.trace( + "Encountered new sort key: {}. Fall back to round robin as statistics not learned yet.", + sortKey); + // Ideally unknownKeyCounter should be published as a counter metric. + // It seems difficult to pass in MetricGroup into the partitioner. + // Just log an INFO message every minute. + newSortKeyCounter += 1; + long now = System.currentTimeMillis(); + if (now - lastNewSortKeyLogTimeMilli > TimeUnit.MINUTES.toMillis(1)) { + LOG.info( + "Encounter new sort keys {} times. Fall back to round robin as statistics not learned yet", + newSortKeyCounter); + lastNewSortKeyLogTimeMilli = now; + newSortKeyCounter = 0; + } + partition = (int) (newSortKeyCounter % numPartitions); + } else { + partition = keyAssignment.select(); + } + + return RangePartitioner.adjustPartitionWithRescale( + partition, mapAssignment.numPartitions(), numPartitions); + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/RangePartitioner.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/RangePartitioner.java new file mode 100644 index 000000000000..6608b938f5a8 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/RangePartitioner.java @@ -0,0 +1,110 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink.shuffle; + +import java.util.Random; +import java.util.concurrent.atomic.AtomicLong; +import org.apache.flink.annotation.Internal; +import org.apache.flink.api.common.functions.Partitioner; +import org.apache.flink.table.data.RowData; +import org.apache.iceberg.DistributionMode; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SortOrder; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** This custom partitioner implements the {@link DistributionMode#RANGE} for Flink sink. */ +@Internal +public class RangePartitioner implements Partitioner { + private static final Logger LOG = LoggerFactory.getLogger(RangePartitioner.class); + + private final Schema schema; + private final SortOrder sortOrder; + + private transient AtomicLong roundRobinCounter; + private transient Partitioner delegatePartitioner; + + public RangePartitioner(Schema schema, SortOrder sortOrder) { + this.schema = schema; + this.sortOrder = sortOrder; + } + + @Override + public int partition(StatisticsOrRecord wrapper, int numPartitions) { + if (wrapper.hasStatistics()) { + this.delegatePartitioner = delegatePartitioner(wrapper.statistics()); + return (int) (roundRobinCounter(numPartitions).getAndIncrement() % numPartitions); + } else { + if (delegatePartitioner != null) { + return delegatePartitioner.partition(wrapper.record(), numPartitions); + } else { + int partition = (int) (roundRobinCounter(numPartitions).getAndIncrement() % numPartitions); + LOG.trace("Statistics not available. Round robin to partition {}", partition); + return partition; + } + } + } + + private AtomicLong roundRobinCounter(int numPartitions) { + if (roundRobinCounter == null) { + // randomize the starting point to avoid synchronization across subtasks + this.roundRobinCounter = new AtomicLong(new Random().nextInt(numPartitions)); + } + + return roundRobinCounter; + } + + private Partitioner delegatePartitioner(GlobalStatistics statistics) { + if (statistics.type() == StatisticsType.Map) { + return new MapRangePartitioner(schema, sortOrder, statistics.mapAssignment()); + } else if (statistics.type() == StatisticsType.Sketch) { + return new SketchRangePartitioner(schema, sortOrder, statistics.rangeBounds()); + } else { + throw new IllegalArgumentException( + String.format("Invalid statistics type: %s. Should be Map or Sketch", statistics.type())); + } + } + + /** + * Util method that handles rescale (write parallelism / numPartitions change). + * + * @param partition partition caculated based on the existing statistics + * @param numPartitionsStatsCalculation number of partitions when the assignment was calculated + * based on + * @param numPartitions current number of partitions + * @return adjusted partition if necessary. + */ + static int adjustPartitionWithRescale( + int partition, int numPartitionsStatsCalculation, int numPartitions) { + if (numPartitionsStatsCalculation <= numPartitions) { + // no rescale or scale-up case. + // new subtasks are ignored and not assigned any keys, which is sub-optimal and only + // transient. when rescale is detected, operator requests new statistics from + // coordinator upon initialization. + return partition; + } else { + // scale-down case. + // Use mod % operation to distribution the over-range partitions. + // It can cause skew among subtasks. but the behavior is still better than + // discarding the statistics and falling back to round-robin (no clustering). 
+ // Again, this is transient and stats refresh is requested when rescale is detected. + return partition % numPartitions; + } + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/RequestGlobalStatisticsEvent.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/RequestGlobalStatisticsEvent.java new file mode 100644 index 000000000000..ce17e1964392 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/RequestGlobalStatisticsEvent.java @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.shuffle; + +import org.apache.flink.runtime.operators.coordination.OperatorEvent; + +class RequestGlobalStatisticsEvent implements OperatorEvent { + private final Integer signature; + + RequestGlobalStatisticsEvent() { + this.signature = null; + } + + /** @param signature hashCode of the subtask's existing global statistics */ + RequestGlobalStatisticsEvent(int signature) { + this.signature = signature; + } + + Integer signature() { + return signature; + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchDataStatistics.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchDataStatistics.java new file mode 100644 index 000000000000..35bbb27baf16 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchDataStatistics.java @@ -0,0 +1,87 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
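As an illustrative aside (not part of the patch), the scale-down remapping described above can be exercised with a local copy of the adjustment logic. Class and method names here are hypothetical:

public class RescaleAdjustSketch {
  // local copy of the adjustment logic, for demonstration only
  static int adjustPartitionWithRescale(int partition, int statsParallelism, int currentParallelism) {
    if (statsParallelism <= currentParallelism) {
      // no rescale or scale-up: keep the partition computed from the statistics
      return partition;
    } else {
      // scale-down: wrap partitions that are out of range for the current parallelism
      return partition % currentParallelism;
    }
  }

  public static void main(String[] args) {
    // statistics were computed when the sink ran with 8 subtasks; it now runs with 6
    for (int partition = 0; partition < 8; partition++) {
      System.out.println(partition + " -> " + adjustPartitionWithRescale(partition, 8, 6));
    }
    // prints 0..5 unchanged, then 6 -> 0 and 7 -> 1
  }
}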
+ */ +package org.apache.iceberg.flink.sink.shuffle; + +import java.util.Arrays; +import org.apache.datasketches.sampling.ReservoirItemsSketch; +import org.apache.iceberg.SortKey; +import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; +import org.apache.iceberg.relocated.com.google.common.base.Objects; + +/** MapDataStatistics uses map to count key frequency */ +class SketchDataStatistics implements DataStatistics { + + private final ReservoirItemsSketch sketch; + + SketchDataStatistics(int reservoirSize) { + this.sketch = ReservoirItemsSketch.newInstance(reservoirSize); + } + + SketchDataStatistics(ReservoirItemsSketch sketchStats) { + this.sketch = sketchStats; + } + + @Override + public StatisticsType type() { + return StatisticsType.Sketch; + } + + @Override + public boolean isEmpty() { + return sketch.getNumSamples() == 0; + } + + @Override + public void add(SortKey sortKey) { + // clone the sort key first because input sortKey object can be reused + SortKey copiedKey = sortKey.copy(); + sketch.update(copiedKey); + } + + @Override + public Object result() { + return sketch; + } + + @Override + public String toString() { + return MoreObjects.toStringHelper(this).add("sketch", sketch).toString(); + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + + if (!(o instanceof SketchDataStatistics)) { + return false; + } + + ReservoirItemsSketch otherSketch = ((SketchDataStatistics) o).sketch; + return Objects.equal(sketch.getK(), otherSketch.getK()) + && Objects.equal(sketch.getN(), otherSketch.getN()) + && Arrays.deepEquals(sketch.getSamples(), otherSketch.getSamples()); + } + + @Override + public int hashCode() { + return Objects.hashCode(sketch.getK(), sketch.getN(), sketch.getSamples()); + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchRangePartitioner.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchRangePartitioner.java new file mode 100644 index 000000000000..dddb0d8722c0 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchRangePartitioner.java @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink.shuffle; + +import java.util.Comparator; +import org.apache.flink.api.common.functions.Partitioner; +import org.apache.flink.table.data.RowData; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SortKey; +import org.apache.iceberg.SortOrder; +import org.apache.iceberg.StructLike; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.RowDataWrapper; +import org.apache.iceberg.types.Comparators; + +class SketchRangePartitioner implements Partitioner { + private final SortKey sortKey; + private final Comparator comparator; + private final SortKey[] rangeBounds; + private final RowDataWrapper rowDataWrapper; + + SketchRangePartitioner(Schema schema, SortOrder sortOrder, SortKey[] rangeBounds) { + this.sortKey = new SortKey(schema, sortOrder); + this.comparator = Comparators.forType(SortKeyUtil.sortKeySchema(schema, sortOrder).asStruct()); + this.rangeBounds = rangeBounds; + this.rowDataWrapper = new RowDataWrapper(FlinkSchemaUtil.convert(schema), schema.asStruct()); + } + + @Override + public int partition(RowData row, int numPartitions) { + // reuse the sortKey and rowDataWrapper + sortKey.wrap(rowDataWrapper.wrap(row)); + return SketchUtil.partition(sortKey, numPartitions, rangeBounds, comparator); + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchUtil.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchUtil.java new file mode 100644 index 000000000000..871ef9ef1149 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchUtil.java @@ -0,0 +1,159 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.shuffle; + +import java.util.Arrays; +import java.util.Comparator; +import java.util.Map; +import java.util.function.Consumer; +import org.apache.iceberg.SortKey; +import org.apache.iceberg.StructLike; + +class SketchUtil { + static final int COORDINATOR_MIN_RESERVOIR_SIZE = 10_000; + static final int COORDINATOR_MAX_RESERVOIR_SIZE = 1_000_000; + static final int COORDINATOR_TARGET_PARTITIONS_MULTIPLIER = 100; + static final int OPERATOR_OVER_SAMPLE_RATIO = 10; + + // switch the statistics tracking from map to sketch if the cardinality of the sort key is over + // this threshold. It is hardcoded for now, we can revisit in the future if config is needed. + static final int OPERATOR_SKETCH_SWITCH_THRESHOLD = 10_000; + static final int COORDINATOR_SKETCH_SWITCH_THRESHOLD = 100_000; + + private SketchUtil() {} + + /** + * The larger the reservoir size, the more accurate for range bounds calculation and the more + * balanced range distribution. + * + *

Here are the heuristic rules + *

  • Target size: numPartitions x 100 to achieve good accuracy and is easier to calculate the + * range bounds + *
  • Min is 10K to achieve good accuracy while memory footprint is still relatively small + *
  • Max is 1M to cap the memory footprint on coordinator + * + * @param numPartitions number of range partitions which equals to downstream operator parallelism + * @return reservoir size + */ + static int determineCoordinatorReservoirSize(int numPartitions) { + int reservoirSize = numPartitions * COORDINATOR_TARGET_PARTITIONS_MULTIPLIER; + + if (reservoirSize < COORDINATOR_MIN_RESERVOIR_SIZE) { + // adjust it up and still make reservoirSize divisible by numPartitions + int remainder = COORDINATOR_MIN_RESERVOIR_SIZE % numPartitions; + reservoirSize = COORDINATOR_MIN_RESERVOIR_SIZE + (numPartitions - remainder); + } else if (reservoirSize > COORDINATOR_MAX_RESERVOIR_SIZE) { + // adjust it down and still make reservoirSize divisible by numPartitions + int remainder = COORDINATOR_MAX_RESERVOIR_SIZE % numPartitions; + reservoirSize = COORDINATOR_MAX_RESERVOIR_SIZE - remainder; + } + + return reservoirSize; + } + + /** + * Determine the sampling reservoir size where operator subtasks collect data statistics. + * + *

    Here are the heuristic rules + *

  • Target size is "coordinator reservoir size * over sampling ration (10) / operator + * parallelism" + *
  • Min is 1K to achieve good accuracy while memory footprint is still relatively small + *
  • Max is 100K to cap the memory footprint on coordinator + * + * @param numPartitions number of range partitions which equals to downstream operator parallelism + * @param operatorParallelism data statistics operator parallelism + * @return reservoir size + */ + static int determineOperatorReservoirSize(int operatorParallelism, int numPartitions) { + int coordinatorReservoirSize = determineCoordinatorReservoirSize(numPartitions); + int totalOperatorSamples = coordinatorReservoirSize * OPERATOR_OVER_SAMPLE_RATIO; + return (int) Math.ceil((double) totalOperatorSamples / operatorParallelism); + } + + /** + * To understand how range bounds are used in range partitioning, here is an example for human + * ages with 4 partitions: [15, 32, 60]. The 4 ranges would be + * + *
+ *
    • age <= 15 + *
    • age > 15 && age <= 32 + *
    • age > 32 && age <= 60 + *
    • age > 60 + *
    + * + *
    Assumption is that a single key is not dominant enough to span multiple subtasks. + * + * @param numPartitions number of partitions which maps to downstream operator parallelism + * @param samples sampled keys + * @return array of range partition bounds. It should be a sorted list (ascending). Number of + * items should be {@code numPartitions - 1}. if numPartitions is 1, return an empty list + */ + static SortKey[] rangeBounds( + int numPartitions, Comparator comparator, SortKey[] samples) { + // sort the keys first + Arrays.sort(samples, comparator); + int numCandidates = numPartitions - 1; + SortKey[] candidates = new SortKey[numCandidates]; + int step = (int) Math.ceil((double) samples.length / numPartitions); + int position = step - 1; + int numChosen = 0; + while (position < samples.length && numChosen < numCandidates) { + SortKey candidate = samples[position]; + // skip duplicate values + if (numChosen > 0 && candidate.equals(candidates[numChosen - 1])) { + // linear probe for the next distinct value + position += 1; + } else { + candidates[numChosen] = candidate; + position += step; + numChosen += 1; + } + } + + return candidates; + } + + /** This can be a bit expensive since it is quadratic. */ + static void convertMapToSketch( + Map taskMapStats, Consumer sketchConsumer) { + taskMapStats.forEach( + (sortKey, count) -> { + for (int i = 0; i < count; ++i) { + sketchConsumer.accept(sortKey); + } + }); + } + + static int partition( + SortKey key, int numPartitions, SortKey[] rangeBounds, Comparator comparator) { + int partition = Arrays.binarySearch(rangeBounds, key, comparator); + + // binarySearch either returns the match location or -[insertion point]-1 + if (partition < 0) { + partition = -partition - 1; + } + + if (partition > rangeBounds.length) { + partition = rangeBounds.length; + } + + return RangePartitioner.adjustPartitionWithRescale( + partition, rangeBounds.length + 1, numPartitions); + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeySerializer.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeySerializer.java new file mode 100644 index 000000000000..b82fc8250763 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeySerializer.java @@ -0,0 +1,359 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink.shuffle; + +import java.io.IOException; +import java.math.BigDecimal; +import java.math.BigInteger; +import java.nio.ByteBuffer; +import java.util.List; +import java.util.Objects; +import java.util.Set; +import java.util.UUID; +import java.util.stream.Collectors; +import org.apache.flink.api.common.typeutils.TypeSerializer; +import org.apache.flink.api.common.typeutils.TypeSerializerSchemaCompatibility; +import org.apache.flink.api.common.typeutils.TypeSerializerSnapshot; +import org.apache.flink.core.memory.DataInputView; +import org.apache.flink.core.memory.DataOutputView; +import org.apache.flink.util.Preconditions; +import org.apache.flink.util.StringUtils; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SchemaParser; +import org.apache.iceberg.SortField; +import org.apache.iceberg.SortKey; +import org.apache.iceberg.SortOrder; +import org.apache.iceberg.SortOrderParser; +import org.apache.iceberg.types.CheckCompatibility; +import org.apache.iceberg.types.Type; +import org.apache.iceberg.types.TypeUtil; +import org.apache.iceberg.types.Types; + +class SortKeySerializer extends TypeSerializer { + private final Schema schema; + private final SortOrder sortOrder; + private final int size; + private final Types.NestedField[] transformedFields; + + private transient SortKey sortKey; + + SortKeySerializer(Schema schema, SortOrder sortOrder) { + this.schema = schema; + this.sortOrder = sortOrder; + this.size = sortOrder.fields().size(); + + this.transformedFields = new Types.NestedField[size]; + for (int i = 0; i < size; ++i) { + SortField sortField = sortOrder.fields().get(i); + Types.NestedField sourceField = schema.findField(sortField.sourceId()); + Type resultType = sortField.transform().getResultType(sourceField.type()); + Types.NestedField transformedField = + Types.NestedField.of( + sourceField.fieldId(), + sourceField.isOptional(), + sourceField.name(), + resultType, + sourceField.doc()); + transformedFields[i] = transformedField; + } + } + + private SortKey lazySortKey() { + if (sortKey == null) { + this.sortKey = new SortKey(schema, sortOrder); + } + + return sortKey; + } + + @Override + public boolean isImmutableType() { + return false; + } + + @Override + public TypeSerializer duplicate() { + return new SortKeySerializer(schema, sortOrder); + } + + @Override + public SortKey createInstance() { + return new SortKey(schema, sortOrder); + } + + @Override + public SortKey copy(SortKey from) { + return from.copy(); + } + + @Override + public SortKey copy(SortKey from, SortKey reuse) { + // no benefit of reuse + return copy(from); + } + + @Override + public int getLength() { + return -1; + } + + @Override + public void serialize(SortKey record, DataOutputView target) throws IOException { + Preconditions.checkArgument( + record.size() == size, + "Invalid size of the sort key object: %s. 
Expected %s", + record.size(), + size); + for (int i = 0; i < size; ++i) { + int fieldId = transformedFields[i].fieldId(); + Type.TypeID typeId = transformedFields[i].type().typeId(); + switch (typeId) { + case BOOLEAN: + target.writeBoolean(record.get(i, Boolean.class)); + break; + case INTEGER: + case DATE: + target.writeInt(record.get(i, Integer.class)); + break; + case LONG: + case TIME: + case TIMESTAMP: + target.writeLong(record.get(i, Long.class)); + break; + case FLOAT: + target.writeFloat(record.get(i, Float.class)); + break; + case DOUBLE: + target.writeDouble(record.get(i, Double.class)); + break; + case STRING: + target.writeUTF(record.get(i, CharSequence.class).toString()); + break; + case UUID: + UUID uuid = record.get(i, UUID.class); + target.writeLong(uuid.getMostSignificantBits()); + target.writeLong(uuid.getLeastSignificantBits()); + break; + case FIXED: + case BINARY: + byte[] bytes = record.get(i, ByteBuffer.class).array(); + target.writeInt(bytes.length); + target.write(bytes); + break; + case DECIMAL: + BigDecimal decimal = record.get(i, BigDecimal.class); + byte[] decimalBytes = decimal.unscaledValue().toByteArray(); + target.writeInt(decimalBytes.length); + target.write(decimalBytes); + target.writeInt(decimal.scale()); + break; + case STRUCT: + case MAP: + case LIST: + default: + // SortKey transformation is a flattened struct without list and map + throw new UnsupportedOperationException( + String.format("Field %d has unsupported field type: %s", fieldId, typeId)); + } + } + } + + @Override + public SortKey deserialize(DataInputView source) throws IOException { + // copying is a little faster than constructing a new SortKey object + SortKey deserialized = lazySortKey().copy(); + deserialize(deserialized, source); + return deserialized; + } + + @Override + public SortKey deserialize(SortKey reuse, DataInputView source) throws IOException { + Preconditions.checkArgument( + reuse.size() == size, + "Invalid size of the sort key object: %s. 
Expected %s", + reuse.size(), + size); + for (int i = 0; i < size; ++i) { + int fieldId = transformedFields[i].fieldId(); + Type.TypeID typeId = transformedFields[i].type().typeId(); + switch (typeId) { + case BOOLEAN: + reuse.set(i, source.readBoolean()); + break; + case INTEGER: + case DATE: + reuse.set(i, source.readInt()); + break; + case LONG: + case TIME: + case TIMESTAMP: + reuse.set(i, source.readLong()); + break; + case FLOAT: + reuse.set(i, source.readFloat()); + break; + case DOUBLE: + reuse.set(i, source.readDouble()); + break; + case STRING: + reuse.set(i, source.readUTF()); + break; + case UUID: + long mostSignificantBits = source.readLong(); + long leastSignificantBits = source.readLong(); + reuse.set(i, new UUID(mostSignificantBits, leastSignificantBits)); + break; + case FIXED: + case BINARY: + byte[] bytes = new byte[source.readInt()]; + source.read(bytes); + reuse.set(i, ByteBuffer.wrap(bytes)); + break; + case DECIMAL: + byte[] unscaledBytes = new byte[source.readInt()]; + source.read(unscaledBytes); + int scale = source.readInt(); + BigDecimal decimal = new BigDecimal(new BigInteger(unscaledBytes), scale); + reuse.set(i, decimal); + break; + case STRUCT: + case MAP: + case LIST: + default: + // SortKey transformation is a flattened struct without list and map + throw new UnsupportedOperationException( + String.format("Field %d has unsupported field type: %s", fieldId, typeId)); + } + } + + return reuse; + } + + @Override + public void copy(DataInputView source, DataOutputView target) throws IOException { + // no optimization here + serialize(deserialize(source), target); + } + + @Override + public boolean equals(Object obj) { + if (!(obj instanceof SortKeySerializer)) { + return false; + } + + SortKeySerializer other = (SortKeySerializer) obj; + return Objects.equals(schema.asStruct(), other.schema.asStruct()) + && Objects.equals(sortOrder, other.sortOrder); + } + + @Override + public int hashCode() { + return schema.asStruct().hashCode() * 31 + sortOrder.hashCode(); + } + + @Override + public TypeSerializerSnapshot snapshotConfiguration() { + return new SortKeySerializerSnapshot(schema, sortOrder); + } + + public static class SortKeySerializerSnapshot implements TypeSerializerSnapshot { + private static final int CURRENT_VERSION = 1; + + private Schema schema; + private SortOrder sortOrder; + + /** Constructor for read instantiation. */ + @SuppressWarnings({"unused", "checkstyle:RedundantModifier"}) + public SortKeySerializerSnapshot() { + // this constructor is used when restoring from a checkpoint. 
+ } + + @SuppressWarnings("checkstyle:RedundantModifier") + public SortKeySerializerSnapshot(Schema schema, SortOrder sortOrder) { + this.schema = schema; + this.sortOrder = sortOrder; + } + + @Override + public int getCurrentVersion() { + return CURRENT_VERSION; + } + + @Override + public void writeSnapshot(DataOutputView out) throws IOException { + Preconditions.checkState(schema != null, "Invalid schema: null"); + Preconditions.checkState(sortOrder != null, "Invalid sort order: null"); + + StringUtils.writeString(SchemaParser.toJson(schema), out); + StringUtils.writeString(SortOrderParser.toJson(sortOrder), out); + } + + @Override + public void readSnapshot(int readVersion, DataInputView in, ClassLoader userCodeClassLoader) + throws IOException { + if (readVersion == 1) { + readV1(in); + } else { + throw new IllegalArgumentException("Unknown read version: " + readVersion); + } + } + + @Override + public TypeSerializerSchemaCompatibility resolveSchemaCompatibility( + TypeSerializerSnapshot oldSerializerSnapshot) { + if (!(oldSerializerSnapshot instanceof SortKeySerializerSnapshot)) { + return TypeSerializerSchemaCompatibility.incompatible(); + } + + // Sort order should be identical + SortKeySerializerSnapshot oldSnapshot = (SortKeySerializerSnapshot) oldSerializerSnapshot; + if (!sortOrder.sameOrder(oldSnapshot.sortOrder)) { + return TypeSerializerSchemaCompatibility.incompatible(); + } + + Set sortFieldIds = + sortOrder.fields().stream().map(SortField::sourceId).collect(Collectors.toSet()); + // only care about the schema related to sort fields + Schema sortSchema = TypeUtil.project(schema, sortFieldIds); + Schema oldSortSchema = TypeUtil.project(oldSnapshot.schema, sortFieldIds); + + List compatibilityErrors = + CheckCompatibility.writeCompatibilityErrors(sortSchema, oldSortSchema); + if (compatibilityErrors.isEmpty()) { + return TypeSerializerSchemaCompatibility.compatibleAsIs(); + } + + return TypeSerializerSchemaCompatibility.incompatible(); + } + + @Override + public TypeSerializer restoreSerializer() { + Preconditions.checkState(schema != null, "Invalid schema: null"); + Preconditions.checkState(sortOrder != null, "Invalid sort order: null"); + return new SortKeySerializer(schema, sortOrder); + } + + private void readV1(DataInputView in) throws IOException { + String schemaJson = StringUtils.readString(in); + String sortOrderJson = StringUtils.readString(in); + this.schema = SchemaParser.fromJson(schemaJson); + this.sortOrder = SortOrderParser.fromJson(sortOrderJson).bind(schema); + } + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeySketchSerializer.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeySketchSerializer.java new file mode 100644 index 000000000000..d6c23f035015 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeySketchSerializer.java @@ -0,0 +1,143 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
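As a small aside (not part of the patch), the DECIMAL branch above encodes a sort field as its unscaled bytes followed by a scale; a minimal round-trip sketch of that encoding, with a hypothetical class name:

import java.math.BigDecimal;
import java.math.BigInteger;

public class DecimalEncodingSketch {
  public static void main(String[] args) {
    BigDecimal original = new BigDecimal("1234.56");

    // write side: unscaled value as bytes, followed by the scale
    byte[] unscaledBytes = original.unscaledValue().toByteArray();
    int scale = original.scale();

    // read side: rebuild the value from the two pieces
    BigDecimal restored = new BigDecimal(new BigInteger(unscaledBytes), scale);

    System.out.println(original.equals(restored));  // true
  }
}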
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.shuffle; + +import java.io.IOException; +import java.io.Serializable; +import java.io.UncheckedIOException; +import java.util.Arrays; +import java.util.List; +import org.apache.datasketches.common.ArrayOfItemsSerDe; +import org.apache.datasketches.common.ArrayOfStringsSerDe; +import org.apache.datasketches.common.ByteArrayUtil; +import org.apache.datasketches.common.Util; +import org.apache.datasketches.memory.Memory; +import org.apache.datasketches.sampling.ReservoirItemsSketch; +import org.apache.flink.api.common.typeutils.TypeSerializer; +import org.apache.flink.api.common.typeutils.base.ListSerializer; +import org.apache.flink.core.memory.DataInputDeserializer; +import org.apache.flink.core.memory.DataOutputSerializer; +import org.apache.iceberg.SortKey; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; + +/** + * Only way to implement {@link ReservoirItemsSketch} serializer is to extend from {@link + * ArrayOfItemsSerDe}, as deserialization uses a private constructor from ReservoirItemsSketch. The + * implementation is modeled after {@link ArrayOfStringsSerDe} + */ +class SortKeySketchSerializer extends ArrayOfItemsSerDe implements Serializable { + private static final int DEFAULT_SORT_KEY_SIZE = 128; + + private final TypeSerializer itemSerializer; + private final ListSerializer listSerializer; + private final DataInputDeserializer input; + + SortKeySketchSerializer(TypeSerializer itemSerializer) { + this.itemSerializer = itemSerializer; + this.listSerializer = new ListSerializer<>(itemSerializer); + this.input = new DataInputDeserializer(); + } + + @Override + public byte[] serializeToByteArray(SortKey item) { + try { + DataOutputSerializer output = new DataOutputSerializer(DEFAULT_SORT_KEY_SIZE); + itemSerializer.serialize(item, output); + byte[] itemBytes = output.getSharedBuffer(); + int numBytes = output.length(); + byte[] out = new byte[numBytes + Integer.BYTES]; + ByteArrayUtil.copyBytes(itemBytes, 0, out, 4, numBytes); + ByteArrayUtil.putIntLE(out, 0, numBytes); + return out; + } catch (IOException e) { + throw new UncheckedIOException("Failed to serialize sort key", e); + } + } + + @Override + public byte[] serializeToByteArray(SortKey[] items) { + try { + DataOutputSerializer output = new DataOutputSerializer(DEFAULT_SORT_KEY_SIZE * items.length); + listSerializer.serialize(Arrays.asList(items), output); + byte[] itemsBytes = output.getSharedBuffer(); + int numBytes = output.length(); + byte[] out = new byte[Integer.BYTES + numBytes]; + ByteArrayUtil.putIntLE(out, 0, numBytes); + System.arraycopy(itemsBytes, 0, out, Integer.BYTES, numBytes); + return out; + } catch (IOException e) { + throw new UncheckedIOException("Failed to serialize sort key", e); + } + } + + @Override + public SortKey[] deserializeFromMemory(Memory mem, long startingOffset, int numItems) { + Preconditions.checkArgument(mem != null, "Invalid input memory: null"); + if (numItems <= 0) { + return new SortKey[0]; + } + + long offset = startingOffset; + Util.checkBounds(offset, Integer.BYTES, mem.getCapacity()); + int 
numBytes = mem.getInt(offset); + offset += Integer.BYTES; + + Util.checkBounds(offset, numBytes, mem.getCapacity()); + byte[] sortKeyBytes = new byte[numBytes]; + mem.getByteArray(offset, sortKeyBytes, 0, numBytes); + input.setBuffer(sortKeyBytes); + + try { + List sortKeys = listSerializer.deserialize(input); + SortKey[] array = new SortKey[numItems]; + sortKeys.toArray(array); + input.releaseArrays(); + return array; + } catch (IOException e) { + throw new UncheckedIOException("Failed to deserialize sort key sketch", e); + } + } + + @Override + public int sizeOf(SortKey item) { + return serializeToByteArray(item).length; + } + + @Override + public int sizeOf(Memory mem, long offset, int numItems) { + Preconditions.checkArgument(mem != null, "Invalid input memory: null"); + if (numItems <= 0) { + return 0; + } + + Util.checkBounds(offset, Integer.BYTES, mem.getCapacity()); + int numBytes = mem.getInt(offset); + return Integer.BYTES + numBytes; + } + + @Override + public String toString(SortKey item) { + return item.toString(); + } + + @Override + public Class getClassOfT() { + return SortKey.class; + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeyUtil.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeyUtil.java new file mode 100644 index 000000000000..1e5bdbbac3e4 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeyUtil.java @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.shuffle; + +import java.util.List; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SortField; +import org.apache.iceberg.SortOrder; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.types.Type; +import org.apache.iceberg.types.Types; + +class SortKeyUtil { + private SortKeyUtil() {} + + /** Compute the result schema of {@code SortKey} transformation */ + static Schema sortKeySchema(Schema schema, SortOrder sortOrder) { + List sortFields = sortOrder.fields(); + int size = sortFields.size(); + List transformedFields = Lists.newArrayListWithCapacity(size); + for (int i = 0; i < size; ++i) { + int sourceFieldId = sortFields.get(i).sourceId(); + Types.NestedField sourceField = schema.findField(sourceFieldId); + Preconditions.checkArgument( + sourceField != null, "Cannot find source field: %s", sourceFieldId); + Type transformedType = sortFields.get(i).transform().getResultType(sourceField.type()); + // There could be multiple transformations on the same source column, like in the PartitionKey + // case. 
To resolve the collision, field id is set to transform index and field name is set to + // sourceFieldName_transformIndex + Types.NestedField transformedField = + Types.NestedField.of( + i, + sourceField.isOptional(), + sourceField.name() + '_' + i, + transformedType, + sourceField.doc()); + transformedFields.add(transformedField); + } + + return new Schema(transformedFields); + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsEvent.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsEvent.java new file mode 100644 index 000000000000..f6fcdb8b16ef --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsEvent.java @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.shuffle; + +import org.apache.flink.annotation.Internal; +import org.apache.flink.api.common.typeutils.TypeSerializer; +import org.apache.flink.runtime.operators.coordination.OperatorEvent; + +/** + * DataStatisticsEvent is sent between data statistics coordinator and operator to transmit data + * statistics in bytes + */ +@Internal +class StatisticsEvent implements OperatorEvent { + + private static final long serialVersionUID = 1L; + private final long checkpointId; + private final byte[] statisticsBytes; + private final boolean applyImmediately; + + private StatisticsEvent(long checkpointId, byte[] statisticsBytes, boolean applyImmediately) { + this.checkpointId = checkpointId; + this.statisticsBytes = statisticsBytes; + this.applyImmediately = applyImmediately; + } + + static StatisticsEvent createTaskStatisticsEvent( + long checkpointId, + DataStatistics statistics, + TypeSerializer statisticsSerializer) { + // applyImmediately is really only relevant for coordinator to operator event. + // task reported statistics is always merged immediately by the coordinator. 
+ return new StatisticsEvent( + checkpointId, + StatisticsUtil.serializeDataStatistics(statistics, statisticsSerializer), + true); + } + + static StatisticsEvent createGlobalStatisticsEvent( + GlobalStatistics statistics, + TypeSerializer statisticsSerializer, + boolean applyImmediately) { + return new StatisticsEvent( + statistics.checkpointId(), + StatisticsUtil.serializeGlobalStatistics(statistics, statisticsSerializer), + applyImmediately); + } + + long checkpointId() { + return checkpointId; + } + + byte[] statisticsBytes() { + return statisticsBytes; + } + + boolean applyImmediately() { + return applyImmediately; + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsOrRecord.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsOrRecord.java new file mode 100644 index 000000000000..bc28df2b0e22 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsOrRecord.java @@ -0,0 +1,112 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.shuffle; + +import java.io.Serializable; +import org.apache.flink.annotation.Internal; +import org.apache.flink.api.common.typeutils.TypeSerializer; +import org.apache.flink.table.data.RowData; +import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; + +/** + * The wrapper class for data statistics and record. It is the only way for data statistics operator + * to send global data statistics to custom partitioner to distribute data based on statistics + * + *

    DataStatisticsOrRecord contains either data statistics(globally aggregated) or a record. It is + * sent from {@link DataStatisticsOperator} to partitioner. Once partitioner receives the data + * statistics, it will use that to decide the coming record should send to which writer subtask. + * After shuffling, a filter and mapper are required to filter out the data distribution weight, + * unwrap the object and extract the original record type T. + */ +@Internal +public class StatisticsOrRecord implements Serializable { + + private static final long serialVersionUID = 1L; + + private GlobalStatistics statistics; + private RowData record; + + private StatisticsOrRecord(GlobalStatistics statistics, RowData record) { + Preconditions.checkArgument( + record != null ^ statistics != null, "DataStatistics or record, not neither or both"); + this.statistics = statistics; + this.record = record; + } + + static StatisticsOrRecord fromRecord(RowData record) { + return new StatisticsOrRecord(null, record); + } + + static StatisticsOrRecord fromStatistics(GlobalStatistics statistics) { + return new StatisticsOrRecord(statistics, null); + } + + static StatisticsOrRecord reuseRecord( + StatisticsOrRecord reuse, TypeSerializer recordSerializer) { + if (reuse.hasRecord()) { + return reuse; + } else { + // not reusable + return StatisticsOrRecord.fromRecord(recordSerializer.createInstance()); + } + } + + static StatisticsOrRecord reuseStatistics( + StatisticsOrRecord reuse, TypeSerializer statisticsSerializer) { + if (reuse.hasStatistics()) { + return reuse; + } else { + // not reusable + return StatisticsOrRecord.fromStatistics(statisticsSerializer.createInstance()); + } + } + + boolean hasStatistics() { + return statistics != null; + } + + public boolean hasRecord() { + return record != null; + } + + GlobalStatistics statistics() { + return statistics; + } + + void statistics(GlobalStatistics newStatistics) { + this.statistics = newStatistics; + } + + public RowData record() { + return record; + } + + void record(RowData newRecord) { + this.record = newRecord; + } + + @Override + public String toString() { + return MoreObjects.toStringHelper(this) + .add("statistics", statistics) + .add("record", record) + .toString(); + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsOrRecordSerializer.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsOrRecordSerializer.java new file mode 100644 index 000000000000..d4ae2b359679 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsOrRecordSerializer.java @@ -0,0 +1,208 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink.shuffle; + +import java.io.IOException; +import java.util.Objects; +import org.apache.flink.annotation.Internal; +import org.apache.flink.api.common.typeutils.CompositeTypeSerializerSnapshot; +import org.apache.flink.api.common.typeutils.TypeSerializer; +import org.apache.flink.api.common.typeutils.TypeSerializerSnapshot; +import org.apache.flink.core.memory.DataInputView; +import org.apache.flink.core.memory.DataOutputView; +import org.apache.flink.table.data.RowData; + +@Internal +class StatisticsOrRecordSerializer extends TypeSerializer { + private final TypeSerializer statisticsSerializer; + private final TypeSerializer recordSerializer; + + StatisticsOrRecordSerializer( + TypeSerializer statisticsSerializer, + TypeSerializer recordSerializer) { + this.statisticsSerializer = statisticsSerializer; + this.recordSerializer = recordSerializer; + } + + @Override + public boolean isImmutableType() { + return false; + } + + @SuppressWarnings("ReferenceEquality") + @Override + public TypeSerializer duplicate() { + TypeSerializer duplicateStatisticsSerializer = + statisticsSerializer.duplicate(); + TypeSerializer duplicateRowDataSerializer = recordSerializer.duplicate(); + if ((statisticsSerializer != duplicateStatisticsSerializer) + || (recordSerializer != duplicateRowDataSerializer)) { + return new StatisticsOrRecordSerializer( + duplicateStatisticsSerializer, duplicateRowDataSerializer); + } else { + return this; + } + } + + @Override + public StatisticsOrRecord createInstance() { + // arbitrarily always create RowData value instance + return StatisticsOrRecord.fromRecord(recordSerializer.createInstance()); + } + + @Override + public StatisticsOrRecord copy(StatisticsOrRecord from) { + if (from.hasRecord()) { + return StatisticsOrRecord.fromRecord(recordSerializer.copy(from.record())); + } else { + return StatisticsOrRecord.fromStatistics(statisticsSerializer.copy(from.statistics())); + } + } + + @Override + public StatisticsOrRecord copy(StatisticsOrRecord from, StatisticsOrRecord reuse) { + StatisticsOrRecord to; + if (from.hasRecord()) { + to = StatisticsOrRecord.reuseRecord(reuse, recordSerializer); + RowData record = recordSerializer.copy(from.record(), to.record()); + to.record(record); + } else { + to = StatisticsOrRecord.reuseStatistics(reuse, statisticsSerializer); + GlobalStatistics statistics = statisticsSerializer.copy(from.statistics(), to.statistics()); + to.statistics(statistics); + } + + return to; + } + + @Override + public int getLength() { + return -1; + } + + @Override + public void serialize(StatisticsOrRecord statisticsOrRecord, DataOutputView target) + throws IOException { + if (statisticsOrRecord.hasRecord()) { + target.writeBoolean(true); + recordSerializer.serialize(statisticsOrRecord.record(), target); + } else { + target.writeBoolean(false); + statisticsSerializer.serialize(statisticsOrRecord.statistics(), target); + } + } + + @Override + public StatisticsOrRecord deserialize(DataInputView source) throws IOException { + boolean isRecord = source.readBoolean(); + if (isRecord) { + return StatisticsOrRecord.fromRecord(recordSerializer.deserialize(source)); + } else { + return StatisticsOrRecord.fromStatistics(statisticsSerializer.deserialize(source)); + } + } + + @Override + public StatisticsOrRecord deserialize(StatisticsOrRecord reuse, DataInputView source) + throws IOException { + StatisticsOrRecord to; + boolean isRecord = source.readBoolean(); + if (isRecord) { + to = StatisticsOrRecord.reuseRecord(reuse, 
recordSerializer); + RowData record = recordSerializer.deserialize(to.record(), source); + to.record(record); + } else { + to = StatisticsOrRecord.reuseStatistics(reuse, statisticsSerializer); + GlobalStatistics statistics = statisticsSerializer.deserialize(to.statistics(), source); + to.statistics(statistics); + } + + return to; + } + + @Override + public void copy(DataInputView source, DataOutputView target) throws IOException { + boolean hasRecord = source.readBoolean(); + target.writeBoolean(hasRecord); + if (hasRecord) { + recordSerializer.copy(source, target); + } else { + statisticsSerializer.copy(source, target); + } + } + + @Override + public boolean equals(Object obj) { + if (!(obj instanceof StatisticsOrRecordSerializer)) { + return false; + } + + StatisticsOrRecordSerializer other = (StatisticsOrRecordSerializer) obj; + return Objects.equals(statisticsSerializer, other.statisticsSerializer) + && Objects.equals(recordSerializer, other.recordSerializer); + } + + @Override + public int hashCode() { + return Objects.hash(statisticsSerializer, recordSerializer); + } + + @Override + public TypeSerializerSnapshot snapshotConfiguration() { + return new StatisticsOrRecordSerializerSnapshot(this); + } + + public static class StatisticsOrRecordSerializerSnapshot + extends CompositeTypeSerializerSnapshot { + private static final int CURRENT_VERSION = 1; + + /** Constructor for read instantiation. */ + @SuppressWarnings({"unused", "checkstyle:RedundantModifier"}) + public StatisticsOrRecordSerializerSnapshot() {} + + @SuppressWarnings("checkstyle:RedundantModifier") + public StatisticsOrRecordSerializerSnapshot(StatisticsOrRecordSerializer serializer) { + super(serializer); + } + + @SuppressWarnings("checkstyle:RedundantModifier") + @Override + protected int getCurrentOuterSnapshotVersion() { + return CURRENT_VERSION; + } + + @Override + protected TypeSerializer[] getNestedSerializers( + StatisticsOrRecordSerializer outerSerializer) { + return new TypeSerializer[] { + outerSerializer.statisticsSerializer, outerSerializer.recordSerializer + }; + } + + @SuppressWarnings("unchecked") + @Override + protected StatisticsOrRecordSerializer createOuterSerializerWithNestedSerializers( + TypeSerializer[] nestedSerializers) { + TypeSerializer statisticsSerializer = + (TypeSerializer) nestedSerializers[0]; + TypeSerializer recordSerializer = (TypeSerializer) nestedSerializers[1]; + return new StatisticsOrRecordSerializer(statisticsSerializer, recordSerializer); + } + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsType.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsType.java new file mode 100644 index 000000000000..43f72e336e06 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsType.java @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.shuffle; + +/** + * Range distribution requires gathering statistics on the sort keys to determine proper range + * boundaries to distribute/cluster rows before writer operators. + */ +public enum StatisticsType { + /** + * Tracks the data statistics as {@code Map} frequency. It works better for + * low-cardinality scenarios (like country, event_type, etc.) where the cardinalities are in + * hundreds or thousands. + * + *

      + *
    • Pro: accurate measurement on the statistics/weight of every key. + *
    • Con: memory footprint can be large if the key cardinality is high. + *
    + */ + Map, + + /** + * Sample the sort keys via reservoir sampling. Then split the range partitions via range bounds + * from sampled values. It works better for high-cardinality scenarios (like device_id, user_id, + * uuid etc.) where the cardinalities can be in millions or billions. + * + *
      + *
    • Pro: relatively low memory footprint for high-cardinality sort keys. + *
    • Con: non-precise approximation with potentially lower accuracy. + *
    + */ + Sketch, + + /** + * Initially use Map for statistics tracking. If key cardinality turns out to be high, + * automatically switch to sketch sampling. + */ + Auto +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsUtil.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsUtil.java new file mode 100644 index 000000000000..5d48ec57ca49 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsUtil.java @@ -0,0 +1,126 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.shuffle; + +import java.io.IOException; +import java.io.UncheckedIOException; +import javax.annotation.Nullable; +import org.apache.flink.api.common.typeutils.TypeSerializer; +import org.apache.flink.core.memory.DataInputDeserializer; +import org.apache.flink.core.memory.DataOutputSerializer; + +class StatisticsUtil { + + private StatisticsUtil() {} + + static DataStatistics createTaskStatistics( + StatisticsType type, int operatorParallelism, int numPartitions) { + if (type == StatisticsType.Map) { + return new MapDataStatistics(); + } else { + return new SketchDataStatistics( + SketchUtil.determineOperatorReservoirSize(operatorParallelism, numPartitions)); + } + } + + static byte[] serializeDataStatistics( + DataStatistics dataStatistics, TypeSerializer statisticsSerializer) { + DataOutputSerializer out = new DataOutputSerializer(64); + try { + statisticsSerializer.serialize(dataStatistics, out); + return out.getCopyOfBuffer(); + } catch (IOException e) { + throw new UncheckedIOException("Fail to serialize data statistics", e); + } + } + + static DataStatistics deserializeDataStatistics( + byte[] bytes, TypeSerializer statisticsSerializer) { + DataInputDeserializer input = new DataInputDeserializer(bytes, 0, bytes.length); + try { + return statisticsSerializer.deserialize(input); + } catch (IOException e) { + throw new UncheckedIOException("Fail to deserialize data statistics", e); + } + } + + static byte[] serializeCompletedStatistics( + CompletedStatistics completedStatistics, + TypeSerializer statisticsSerializer) { + try { + DataOutputSerializer out = new DataOutputSerializer(1024); + statisticsSerializer.serialize(completedStatistics, out); + return out.getCopyOfBuffer(); + } catch (IOException e) { + throw new UncheckedIOException("Fail to serialize aggregated statistics", e); + } + } + + static CompletedStatistics deserializeCompletedStatistics( + byte[] bytes, TypeSerializer statisticsSerializer) { + try { + DataInputDeserializer input = new DataInputDeserializer(bytes); + return statisticsSerializer.deserialize(input); + } catch (IOException e) { + throw new UncheckedIOException("Fail to 
deserialize aggregated statistics", e); + } + } + + static byte[] serializeGlobalStatistics( + GlobalStatistics globalStatistics, TypeSerializer statisticsSerializer) { + try { + DataOutputSerializer out = new DataOutputSerializer(1024); + statisticsSerializer.serialize(globalStatistics, out); + return out.getCopyOfBuffer(); + } catch (IOException e) { + throw new UncheckedIOException("Fail to serialize aggregated statistics", e); + } + } + + static GlobalStatistics deserializeGlobalStatistics( + byte[] bytes, TypeSerializer statisticsSerializer) { + try { + DataInputDeserializer input = new DataInputDeserializer(bytes); + return statisticsSerializer.deserialize(input); + } catch (IOException e) { + throw new UncheckedIOException("Fail to deserialize aggregated statistics", e); + } + } + + static StatisticsType collectType(StatisticsType config) { + return config == StatisticsType.Sketch ? StatisticsType.Sketch : StatisticsType.Map; + } + + static StatisticsType collectType(StatisticsType config, @Nullable GlobalStatistics statistics) { + if (statistics != null) { + return statistics.type(); + } + + return collectType(config); + } + + static StatisticsType collectType( + StatisticsType config, @Nullable CompletedStatistics statistics) { + if (statistics != null) { + return statistics.type(); + } + + return collectType(config); + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/AvroGenericRecordFileScanTaskReader.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/AvroGenericRecordFileScanTaskReader.java new file mode 100644 index 000000000000..796434c45136 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/AvroGenericRecordFileScanTaskReader.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source; + +import org.apache.avro.generic.GenericRecord; +import org.apache.iceberg.FileScanTask; +import org.apache.iceberg.encryption.InputFilesDecryptor; +import org.apache.iceberg.io.CloseableIterator; + +public class AvroGenericRecordFileScanTaskReader implements FileScanTaskReader { + private final RowDataFileScanTaskReader rowDataReader; + private final RowDataToAvroGenericRecordConverter converter; + + public AvroGenericRecordFileScanTaskReader( + RowDataFileScanTaskReader rowDataReader, RowDataToAvroGenericRecordConverter converter) { + this.rowDataReader = rowDataReader; + this.converter = converter; + } + + @Override + public CloseableIterator open( + FileScanTask fileScanTask, InputFilesDecryptor inputFilesDecryptor) { + return CloseableIterator.transform( + rowDataReader.open(fileScanTask, inputFilesDecryptor), converter); + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/DataIterator.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/DataIterator.java new file mode 100644 index 000000000000..91d975349b19 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/DataIterator.java @@ -0,0 +1,156 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.util.Iterator; +import org.apache.flink.annotation.Internal; +import org.apache.iceberg.CombinedScanTask; +import org.apache.iceberg.FileScanTask; +import org.apache.iceberg.encryption.EncryptionManager; +import org.apache.iceberg.encryption.InputFilesDecryptor; +import org.apache.iceberg.io.CloseableIterator; +import org.apache.iceberg.io.FileIO; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; + +/** + * Flink data iterator that reads {@link CombinedScanTask} into a {@link CloseableIterator} + * + * @param is the output data type returned by this iterator. 
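+ *
+ * <p>A rough usage sketch (illustrative only; {@code rowDataReader}, {@code combinedScanTask}
+ * and {@code table} are assumed to be provided by the caller, e.g. a split reader):
+ *
+ * <pre>{@code
+ * // iterate a combined scan task and track the restorable position
+ * DataIterator<RowData> iterator =
+ *     new DataIterator<>(rowDataReader, combinedScanTask, table.io(), table.encryption());
+ * iterator.seek(0, 1L); // resume from the 2nd record of the 1st file
+ * while (iterator.hasNext()) {
+ *   RowData row = iterator.next();
+ *   // iterator.fileOffset() / iterator.recordOffset() give the position to checkpoint
+ * }
+ * iterator.close();
+ * }</pre>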
+ */ +@Internal +public class DataIterator implements CloseableIterator { + + private final FileScanTaskReader fileScanTaskReader; + + private final InputFilesDecryptor inputFilesDecryptor; + private final CombinedScanTask combinedTask; + + private Iterator tasks; + private CloseableIterator currentIterator; + private int fileOffset; + private long recordOffset; + + public DataIterator( + FileScanTaskReader fileScanTaskReader, + CombinedScanTask task, + FileIO io, + EncryptionManager encryption) { + this.fileScanTaskReader = fileScanTaskReader; + + this.inputFilesDecryptor = new InputFilesDecryptor(task, io, encryption); + this.combinedTask = task; + + this.tasks = task.files().iterator(); + this.currentIterator = CloseableIterator.empty(); + + // fileOffset starts at -1 because we started + // from an empty iterator that is not from the split files. + this.fileOffset = -1; + // record offset points to the record that next() should return when called + this.recordOffset = 0L; + } + + /** + * (startingFileOffset, startingRecordOffset) points to the next row that reader should resume + * from. E.g., if the seek position is (file=0, record=1), seek moves the iterator position to the + * 2nd row in file 0. When next() is called after seek, 2nd row from file 0 should be returned. + */ + public void seek(int startingFileOffset, long startingRecordOffset) { + Preconditions.checkState( + fileOffset == -1, "Seek should be called before any other iterator actions"); + // skip files + Preconditions.checkState( + startingFileOffset < combinedTask.files().size(), + "Invalid starting file offset %s for combined scan task with %s files: %s", + startingFileOffset, + combinedTask.files().size(), + combinedTask); + for (long i = 0L; i < startingFileOffset; ++i) { + tasks.next(); + } + + updateCurrentIterator(); + // skip records within the file + for (long i = 0; i < startingRecordOffset; ++i) { + if (currentFileHasNext() && hasNext()) { + next(); + } else { + throw new IllegalStateException( + String.format( + "Invalid starting record offset %d for file %d from CombinedScanTask: %s", + startingRecordOffset, startingFileOffset, combinedTask)); + } + } + + fileOffset = startingFileOffset; + recordOffset = startingRecordOffset; + } + + @Override + public boolean hasNext() { + updateCurrentIterator(); + return currentIterator.hasNext(); + } + + @Override + public T next() { + updateCurrentIterator(); + recordOffset += 1; + return currentIterator.next(); + } + + public boolean currentFileHasNext() { + return currentIterator.hasNext(); + } + + /** Updates the current iterator field to ensure that the current Iterator is not exhausted. 
*/ + private void updateCurrentIterator() { + try { + while (!currentIterator.hasNext() && tasks.hasNext()) { + currentIterator.close(); + currentIterator = openTaskIterator(tasks.next()); + fileOffset += 1; + recordOffset = 0L; + } + } catch (IOException e) { + throw new UncheckedIOException(e); + } + } + + private CloseableIterator openTaskIterator(FileScanTask scanTask) { + return fileScanTaskReader.open(scanTask, inputFilesDecryptor); + } + + @Override + public void close() throws IOException { + // close the current iterator + currentIterator.close(); + tasks = null; + } + + public int fileOffset() { + return fileOffset; + } + + public long recordOffset() { + return recordOffset; + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/DataTaskReader.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/DataTaskReader.java new file mode 100644 index 000000000000..4394dab4d4cc --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/DataTaskReader.java @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source; + +import org.apache.flink.annotation.Internal; +import org.apache.flink.table.data.RowData; +import org.apache.iceberg.FileScanTask; +import org.apache.iceberg.Schema; +import org.apache.iceberg.encryption.InputFilesDecryptor; +import org.apache.iceberg.flink.data.StructRowData; +import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.io.CloseableIterator; + +@Internal +public class DataTaskReader implements FileScanTaskReader { + + private final Schema readSchema; + + public DataTaskReader(Schema readSchema) { + this.readSchema = readSchema; + } + + @Override + public CloseableIterator open( + FileScanTask task, InputFilesDecryptor inputFilesDecryptor) { + StructRowData row = new StructRowData(readSchema.asStruct()); + CloseableIterable iterable = + CloseableIterable.transform(task.asDataTask().rows(), row::setStruct); + return iterable.iterator(); + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/FileScanTaskReader.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/FileScanTaskReader.java new file mode 100644 index 000000000000..927a804a4792 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/FileScanTaskReader.java @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source; + +import java.io.Serializable; +import org.apache.flink.annotation.Internal; +import org.apache.iceberg.FileScanTask; +import org.apache.iceberg.encryption.InputFilesDecryptor; +import org.apache.iceberg.io.CloseableIterator; + +/** + * Read a {@link FileScanTask} into a {@link CloseableIterator} + * + * @param is the output data type returned by this iterator. + */ +@Internal +public interface FileScanTaskReader extends Serializable { + CloseableIterator open(FileScanTask fileScanTask, InputFilesDecryptor inputFilesDecryptor); +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputFormat.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputFormat.java new file mode 100644 index 000000000000..9a5123dc489e --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputFormat.java @@ -0,0 +1,141 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source; + +import java.io.IOException; +import java.util.concurrent.ExecutorService; +import org.apache.flink.api.common.io.DefaultInputSplitAssigner; +import org.apache.flink.api.common.io.InputFormat; +import org.apache.flink.api.common.io.LocatableInputSplitAssigner; +import org.apache.flink.api.common.io.RichInputFormat; +import org.apache.flink.api.common.io.statistics.BaseStatistics; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.core.io.InputSplitAssigner; +import org.apache.flink.table.data.RowData; +import org.apache.iceberg.BaseMetadataTable; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Table; +import org.apache.iceberg.encryption.EncryptionManager; +import org.apache.iceberg.flink.TableLoader; +import org.apache.iceberg.io.FileIO; +import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; +import org.apache.iceberg.util.ThreadPools; + +/** Flink {@link InputFormat} for Iceberg. 
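+ *
+ * <p>Instances are normally created through {@link FlinkSource#forRowData()} rather than
+ * constructed directly. A minimal sketch (the table location is illustrative only):
+ *
+ * <pre>{@code
+ * FlinkInputFormat format =
+ *     FlinkSource.forRowData()
+ *         .tableLoader(TableLoader.fromHadoopTable("hdfs://nn:8020/warehouse/db/t"))
+ *         .buildFormat();
+ * }</pre>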
*/ +public class FlinkInputFormat extends RichInputFormat { + + private static final long serialVersionUID = 1L; + + private final TableLoader tableLoader; + private final FileIO io; + private final EncryptionManager encryption; + private final ScanContext context; + private final FileScanTaskReader rowDataReader; + + private transient DataIterator iterator; + private transient long currentReadCount = 0L; + + FlinkInputFormat( + TableLoader tableLoader, + Schema tableSchema, + FileIO io, + EncryptionManager encryption, + ScanContext context) { + this.tableLoader = tableLoader; + this.io = io; + this.encryption = encryption; + this.context = context; + + tableLoader.open(); + Table table = tableLoader.loadTable(); + if (table instanceof BaseMetadataTable) { + this.rowDataReader = new DataTaskReader(context.project()); + } else { + this.rowDataReader = + new RowDataFileScanTaskReader( + tableSchema, + context.project(), + context.nameMapping(), + context.caseSensitive(), + context.filters()); + } + } + + @VisibleForTesting + Schema projectedSchema() { + return context.project(); + } + + @Override + public BaseStatistics getStatistics(BaseStatistics cachedStatistics) { + // Legacy method, not be used. + return null; + } + + @Override + public FlinkInputSplit[] createInputSplits(int minNumSplits) throws IOException { + // Called in Job manager, so it is OK to load table from catalog. + tableLoader.open(); + final ExecutorService workerPool = + ThreadPools.newWorkerPool("iceberg-plan-worker-pool", context.planParallelism()); + try (TableLoader loader = tableLoader) { + Table table = loader.loadTable(); + return FlinkSplitPlanner.planInputSplits(table, context, workerPool); + } finally { + workerPool.shutdown(); + } + } + + @Override + public InputSplitAssigner getInputSplitAssigner(FlinkInputSplit[] inputSplits) { + return context.exposeLocality() + ? new LocatableInputSplitAssigner(inputSplits) + : new DefaultInputSplitAssigner(inputSplits); + } + + @Override + public void configure(Configuration parameters) {} + + @Override + public void open(FlinkInputSplit split) { + this.iterator = new DataIterator<>(rowDataReader, split.getTask(), io, encryption); + } + + @Override + public boolean reachedEnd() { + if (context.limit() > 0 && currentReadCount >= context.limit()) { + return true; + } else { + return !iterator.hasNext(); + } + } + + @Override + public RowData nextRecord(RowData reuse) { + currentReadCount++; + return iterator.next(); + } + + @Override + public void close() throws IOException { + if (iterator != null) { + iterator.close(); + } + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputSplit.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputSplit.java new file mode 100644 index 000000000000..16fd4f39596c --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputSplit.java @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source; + +import java.util.Arrays; +import javax.annotation.Nullable; +import org.apache.flink.core.io.LocatableInputSplit; +import org.apache.iceberg.CombinedScanTask; +import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; + +public class FlinkInputSplit extends LocatableInputSplit { + + private final CombinedScanTask task; + + FlinkInputSplit(int splitNumber, CombinedScanTask task, @Nullable String[] hostnames) { + super(splitNumber, hostnames); + this.task = task; + } + + CombinedScanTask getTask() { + return task; + } + + @Override + public String toString() { + return MoreObjects.toStringHelper(this) + .add("splitNumber", getSplitNumber()) + .add("task", task) + .add("hosts", Arrays.toString(getHostnames())) + .toString(); + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSource.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSource.java new file mode 100644 index 000000000000..b1431a32dd20 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSource.java @@ -0,0 +1,307 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.time.Duration; +import java.util.List; +import java.util.Map; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.configuration.ReadableConfig; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.TableSchema; +import org.apache.flink.table.data.RowData; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.TableScan; +import org.apache.iceberg.encryption.EncryptionManager; +import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.flink.FlinkConfigOptions; +import org.apache.iceberg.flink.FlinkReadOptions; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.TableLoader; +import org.apache.iceberg.flink.util.FlinkCompatibilityUtil; +import org.apache.iceberg.io.FileIO; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.util.PropertyUtil; + +public class FlinkSource { + private FlinkSource() {} + + /** + * Initialize a {@link Builder} to read the data from iceberg table. Equivalent to {@link + * TableScan}. See more options in {@link ScanContext}. + * + *

    The Source can read static data in bounded mode. It can also continuously check the + * arrival of new data and read records incrementally (see the builder example below). + * + *

      + *
    • Without startSnapshotId: Bounded + *
    • With startSnapshotId and with endSnapshotId: Bounded + *
    • With startSnapshotId (-1 means unbounded preceding) and without endSnapshotId: Unbounded + *
    + * + *
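+ *
+ * <p>For example, a bounded incremental read between two known snapshots could be built as
+ * follows (the table location and snapshot ids are illustrative only):
+ *
+ * <pre>{@code
+ * StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
+ * DataStream<RowData> stream =
+ *     FlinkSource.forRowData()
+ *         .env(env)
+ *         .tableLoader(TableLoader.fromHadoopTable("hdfs://nn:8020/warehouse/db/t"))
+ *         .startSnapshotId(123L)
+ *         .endSnapshotId(456L)
+ *         .build();
+ * }</pre>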

    + * + * @return {@link Builder} to connect the iceberg table. + */ + public static Builder forRowData() { + return new Builder(); + } + + /** Source builder to build {@link DataStream}. */ + public static class Builder { + private StreamExecutionEnvironment env; + private Table table; + private TableLoader tableLoader; + private TableSchema projectedSchema; + private ReadableConfig readableConfig = new Configuration(); + private final ScanContext.Builder contextBuilder = ScanContext.builder(); + private Boolean exposeLocality; + + private final Map readOptions = Maps.newHashMap(); + + public Builder tableLoader(TableLoader newLoader) { + this.tableLoader = newLoader; + return this; + } + + public Builder table(Table newTable) { + this.table = newTable; + return this; + } + + public Builder env(StreamExecutionEnvironment newEnv) { + this.env = newEnv; + return this; + } + + public Builder filters(List filters) { + contextBuilder.filters(filters); + return this; + } + + public Builder project(TableSchema schema) { + this.projectedSchema = schema; + return this; + } + + public Builder limit(Long newLimit) { + if (newLimit != null) { + readOptions.put(FlinkReadOptions.LIMIT, Long.toString(newLimit)); + } + return this; + } + + public Builder set(String property, String value) { + readOptions.put(property, value); + return this; + } + + public Builder setAll(Map properties) { + readOptions.putAll(properties); + return this; + } + + /** @deprecated Use {@link #setAll} instead. */ + @Deprecated + public Builder properties(Map properties) { + readOptions.putAll(properties); + return this; + } + + public Builder caseSensitive(boolean caseSensitive) { + readOptions.put(FlinkReadOptions.CASE_SENSITIVE, Boolean.toString(caseSensitive)); + return this; + } + + public Builder snapshotId(Long snapshotId) { + readOptions.put(FlinkReadOptions.SNAPSHOT_ID.key(), Long.toString(snapshotId)); + return this; + } + + public Builder branch(String branch) { + readOptions.put(FlinkReadOptions.BRANCH.key(), branch); + return this; + } + + public Builder tag(String tag) { + readOptions.put(FlinkReadOptions.TAG.key(), tag); + return this; + } + + public Builder startSnapshotId(Long startSnapshotId) { + readOptions.put(FlinkReadOptions.START_SNAPSHOT_ID.key(), Long.toString(startSnapshotId)); + return this; + } + + public Builder endSnapshotId(Long endSnapshotId) { + readOptions.put(FlinkReadOptions.END_SNAPSHOT_ID.key(), Long.toString(endSnapshotId)); + return this; + } + + public Builder startTag(String startTag) { + readOptions.put(FlinkReadOptions.START_TAG.key(), startTag); + return this; + } + + public Builder endTag(String endTag) { + readOptions.put(FlinkReadOptions.END_TAG.key(), endTag); + return this; + } + + public Builder asOfTimestamp(Long asOfTimestamp) { + readOptions.put(FlinkReadOptions.AS_OF_TIMESTAMP.key(), Long.toString(asOfTimestamp)); + return this; + } + + public Builder splitSize(Long splitSize) { + readOptions.put(FlinkReadOptions.SPLIT_SIZE, Long.toString(splitSize)); + return this; + } + + public Builder splitLookback(Integer splitLookback) { + readOptions.put(FlinkReadOptions.SPLIT_LOOKBACK, Integer.toString(splitLookback)); + return this; + } + + public Builder splitOpenFileCost(Long splitOpenFileCost) { + readOptions.put(FlinkReadOptions.SPLIT_FILE_OPEN_COST, Long.toString(splitOpenFileCost)); + return this; + } + + public Builder streaming(boolean streaming) { + readOptions.put(FlinkReadOptions.STREAMING, Boolean.toString(streaming)); + return this; + } + + public Builder 
exposeLocality(boolean newExposeLocality) { + this.exposeLocality = newExposeLocality; + return this; + } + + public Builder nameMapping(String nameMapping) { + readOptions.put(TableProperties.DEFAULT_NAME_MAPPING, nameMapping); + return this; + } + + public Builder monitorInterval(Duration interval) { + readOptions.put(FlinkReadOptions.MONITOR_INTERVAL, interval.toNanos() + " ns"); + return this; + } + + public Builder maxPlanningSnapshotCount(int newMaxPlanningSnapshotCount) { + readOptions.put( + FlinkReadOptions.MAX_PLANNING_SNAPSHOT_COUNT, + Integer.toString(newMaxPlanningSnapshotCount)); + return this; + } + + public Builder flinkConf(ReadableConfig config) { + this.readableConfig = config; + return this; + } + + public FlinkInputFormat buildFormat() { + Preconditions.checkNotNull(tableLoader, "TableLoader should not be null"); + + Schema icebergSchema; + FileIO io; + EncryptionManager encryption; + if (table == null) { + // load required fields by table loader. + tableLoader.open(); + try (TableLoader loader = tableLoader) { + table = loader.loadTable(); + icebergSchema = table.schema(); + io = table.io(); + encryption = table.encryption(); + } catch (IOException e) { + throw new UncheckedIOException(e); + } + } else { + icebergSchema = table.schema(); + io = table.io(); + encryption = table.encryption(); + } + + if (projectedSchema == null) { + contextBuilder.project(icebergSchema); + } else { + contextBuilder.project(FlinkSchemaUtil.convert(icebergSchema, projectedSchema)); + } + + contextBuilder.exposeLocality( + SourceUtil.isLocalityEnabled(table, readableConfig, exposeLocality)); + contextBuilder.planParallelism( + readableConfig.get(FlinkConfigOptions.TABLE_EXEC_ICEBERG_WORKER_POOL_SIZE)); + + contextBuilder.resolveConfig(table, readOptions, readableConfig); + + ScanContext context = contextBuilder.build(); + context.validate(); + return new FlinkInputFormat(tableLoader, icebergSchema, io, encryption, context); + } + + public DataStream build() { + Preconditions.checkNotNull(env, "StreamExecutionEnvironment should not be null"); + FlinkInputFormat format = buildFormat(); + + ScanContext context = contextBuilder.build(); + TypeInformation typeInfo = + FlinkCompatibilityUtil.toTypeInfo(FlinkSchemaUtil.convert(context.project())); + + if (!context.isStreaming()) { + int parallelism = + SourceUtil.inferParallelism( + readableConfig, + context.limit(), + () -> { + try { + return format.createInputSplits(0).length; + } catch (IOException e) { + throw new UncheckedIOException( + "Failed to create iceberg input splits for table: " + table, e); + } + }); + if (env.getMaxParallelism() > 0) { + parallelism = Math.min(parallelism, env.getMaxParallelism()); + } + return env.createInput(format, typeInfo).setParallelism(parallelism); + } else { + StreamingMonitorFunction function = new StreamingMonitorFunction(tableLoader, context); + + String monitorFunctionName = String.format("Iceberg table (%s) monitor", table); + String readerOperatorName = String.format("Iceberg table (%s) reader", table); + + return env.addSource(function, monitorFunctionName) + .transform(readerOperatorName, typeInfo, StreamingReaderOperator.factory(format)); + } + } + } + + public static boolean isBounded(Map properties) { + return !PropertyUtil.propertyAsBoolean(properties, FlinkReadOptions.STREAMING, false); + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSplitPlanner.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSplitPlanner.java new file mode 
100644 index 000000000000..15078809714f --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSplitPlanner.java @@ -0,0 +1,189 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.util.List; +import java.util.concurrent.ExecutorService; +import org.apache.flink.annotation.Internal; +import org.apache.iceberg.CombinedScanTask; +import org.apache.iceberg.FileScanTask; +import org.apache.iceberg.IncrementalAppendScan; +import org.apache.iceberg.Scan; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.TableScan; +import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; +import org.apache.iceberg.hadoop.Util; +import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.util.Tasks; + +@Internal +public class FlinkSplitPlanner { + private FlinkSplitPlanner() {} + + static FlinkInputSplit[] planInputSplits( + Table table, ScanContext context, ExecutorService workerPool) { + try (CloseableIterable tasksIterable = + planTasks(table, context, workerPool)) { + List tasks = Lists.newArrayList(tasksIterable); + FlinkInputSplit[] splits = new FlinkInputSplit[tasks.size()]; + boolean exposeLocality = context.exposeLocality(); + + Tasks.range(tasks.size()) + .stopOnFailure() + .executeWith(exposeLocality ? 
workerPool : null) + .run( + index -> { + CombinedScanTask task = tasks.get(index); + String[] hostnames = null; + if (exposeLocality) { + hostnames = Util.blockLocations(table.io(), task); + } + splits[index] = new FlinkInputSplit(index, task, hostnames); + }); + return splits; + } catch (IOException e) { + throw new UncheckedIOException("Failed to process tasks iterable", e); + } + } + + /** This returns splits for the FLIP-27 source */ + public static List planIcebergSourceSplits( + Table table, ScanContext context, ExecutorService workerPool) { + try (CloseableIterable tasksIterable = + planTasks(table, context, workerPool)) { + return Lists.newArrayList( + CloseableIterable.transform(tasksIterable, IcebergSourceSplit::fromCombinedScanTask)); + } catch (IOException e) { + throw new UncheckedIOException("Failed to process task iterable: ", e); + } + } + + static CloseableIterable planTasks( + Table table, ScanContext context, ExecutorService workerPool) { + ScanMode scanMode = checkScanMode(context); + if (scanMode == ScanMode.INCREMENTAL_APPEND_SCAN) { + IncrementalAppendScan scan = table.newIncrementalAppendScan(); + scan = refineScanWithBaseConfigs(scan, context, workerPool); + + if (context.startTag() != null) { + Preconditions.checkArgument( + table.snapshot(context.startTag()) != null, + "Cannot find snapshot with tag %s", + context.startTag()); + scan = scan.fromSnapshotExclusive(table.snapshot(context.startTag()).snapshotId()); + } + + if (context.startSnapshotId() != null) { + Preconditions.checkArgument( + context.startTag() == null, "START_SNAPSHOT_ID and START_TAG cannot both be set"); + scan = scan.fromSnapshotExclusive(context.startSnapshotId()); + } + + if (context.endTag() != null) { + Preconditions.checkArgument( + table.snapshot(context.endTag()) != null, + "Cannot find snapshot with tag %s", + context.endTag()); + scan = scan.toSnapshot(table.snapshot(context.endTag()).snapshotId()); + } + + if (context.endSnapshotId() != null) { + Preconditions.checkArgument( + context.endTag() == null, "END_SNAPSHOT_ID and END_TAG cannot both be set"); + scan = scan.toSnapshot(context.endSnapshotId()); + } + + return scan.planTasks(); + } else { + TableScan scan = table.newScan(); + scan = refineScanWithBaseConfigs(scan, context, workerPool); + + if (context.snapshotId() != null) { + scan = scan.useSnapshot(context.snapshotId()); + } else if (context.tag() != null) { + scan = scan.useRef(context.tag()); + } else if (context.branch() != null) { + scan = scan.useRef(context.branch()); + } + + if (context.asOfTimestamp() != null) { + scan = scan.asOfTime(context.asOfTimestamp()); + } + + return scan.planTasks(); + } + } + + @VisibleForTesting + enum ScanMode { + BATCH, + INCREMENTAL_APPEND_SCAN + } + + @VisibleForTesting + static ScanMode checkScanMode(ScanContext context) { + if (context.startSnapshotId() != null + || context.endSnapshotId() != null + || context.startTag() != null + || context.endTag() != null) { + return ScanMode.INCREMENTAL_APPEND_SCAN; + } else { + return ScanMode.BATCH; + } + } + + /** refine scan with common configs */ + private static > T refineScanWithBaseConfigs( + T scan, ScanContext context, ExecutorService workerPool) { + T refinedScan = + scan.caseSensitive(context.caseSensitive()).project(context.project()).planWith(workerPool); + + if (context.includeColumnStats()) { + refinedScan = refinedScan.includeColumnStats(); + } + + if (context.includeStatsForColumns() != null) { + refinedScan = refinedScan.includeColumnStats(context.includeStatsForColumns()); + 
} + + refinedScan = refinedScan.option(TableProperties.SPLIT_SIZE, context.splitSize().toString()); + + refinedScan = + refinedScan.option(TableProperties.SPLIT_LOOKBACK, context.splitLookback().toString()); + + refinedScan = + refinedScan.option( + TableProperties.SPLIT_OPEN_FILE_COST, context.splitOpenFileCost().toString()); + + if (context.filters() != null) { + for (Expression filter : context.filters()) { + refinedScan = refinedScan.filter(filter); + } + } + + return refinedScan; + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/IcebergSource.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/IcebergSource.java new file mode 100644 index 000000000000..ccbd0d9997ed --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/IcebergSource.java @@ -0,0 +1,549 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.time.Duration; +import java.util.List; +import java.util.Map; +import java.util.UUID; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.TimeUnit; +import javax.annotation.Nullable; +import org.apache.flink.annotation.Experimental; +import org.apache.flink.api.connector.source.Boundedness; +import org.apache.flink.api.connector.source.Source; +import org.apache.flink.api.connector.source.SourceReader; +import org.apache.flink.api.connector.source.SourceReaderContext; +import org.apache.flink.api.connector.source.SplitEnumerator; +import org.apache.flink.api.connector.source.SplitEnumeratorContext; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.configuration.ReadableConfig; +import org.apache.flink.core.io.SimpleVersionedSerializer; +import org.apache.flink.table.api.TableSchema; +import org.apache.flink.table.data.RowData; +import org.apache.flink.util.Preconditions; +import org.apache.iceberg.BaseMetadataTable; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.flink.FlinkConfigOptions; +import org.apache.iceberg.flink.FlinkReadConf; +import org.apache.iceberg.flink.FlinkReadOptions; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.TableLoader; +import org.apache.iceberg.flink.source.assigner.OrderedSplitAssignerFactory; +import org.apache.iceberg.flink.source.assigner.SimpleSplitAssignerFactory; +import org.apache.iceberg.flink.source.assigner.SplitAssigner; +import org.apache.iceberg.flink.source.assigner.SplitAssignerFactory; +import 
org.apache.iceberg.flink.source.enumerator.ContinuousIcebergEnumerator; +import org.apache.iceberg.flink.source.enumerator.ContinuousSplitPlanner; +import org.apache.iceberg.flink.source.enumerator.ContinuousSplitPlannerImpl; +import org.apache.iceberg.flink.source.enumerator.IcebergEnumeratorState; +import org.apache.iceberg.flink.source.enumerator.IcebergEnumeratorStateSerializer; +import org.apache.iceberg.flink.source.enumerator.StaticIcebergEnumerator; +import org.apache.iceberg.flink.source.reader.ColumnStatsWatermarkExtractor; +import org.apache.iceberg.flink.source.reader.IcebergSourceReader; +import org.apache.iceberg.flink.source.reader.IcebergSourceReaderMetrics; +import org.apache.iceberg.flink.source.reader.MetaDataReaderFunction; +import org.apache.iceberg.flink.source.reader.ReaderFunction; +import org.apache.iceberg.flink.source.reader.RowDataReaderFunction; +import org.apache.iceberg.flink.source.reader.SerializableRecordEmitter; +import org.apache.iceberg.flink.source.reader.SplitWatermarkExtractor; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; +import org.apache.iceberg.flink.source.split.IcebergSourceSplitSerializer; +import org.apache.iceberg.flink.source.split.SerializableComparator; +import org.apache.iceberg.flink.source.split.SplitComparators; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.relocated.com.google.common.collect.Sets; +import org.apache.iceberg.util.ThreadPools; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +@Experimental +public class IcebergSource implements Source { + private static final Logger LOG = LoggerFactory.getLogger(IcebergSource.class); + + // This table loader can be closed, and it is only safe to use this instance for resource + // independent information (e.g. a table name). Copies of this are required to avoid lifecycle + // management conflicts with the user provided table loader. e.g. a copy of this is required for + // split planning, which uses the underlying io, and should be closed after split planning is + // complete. + private final TableLoader tableLoader; + private final ScanContext scanContext; + private final ReaderFunction readerFunction; + private final SplitAssignerFactory assignerFactory; + private final SerializableComparator splitComparator; + private final SerializableRecordEmitter emitter; + private final String tableName; + + IcebergSource( + TableLoader tableLoader, + ScanContext scanContext, + ReaderFunction readerFunction, + SplitAssignerFactory assignerFactory, + SerializableComparator splitComparator, + Table table, + SerializableRecordEmitter emitter) { + Preconditions.checkNotNull(tableLoader, "tableLoader is required."); + Preconditions.checkNotNull(readerFunction, "readerFunction is required."); + Preconditions.checkNotNull(assignerFactory, "assignerFactory is required."); + Preconditions.checkNotNull(table, "table is required."); + this.tableLoader = tableLoader; + this.scanContext = scanContext; + this.readerFunction = readerFunction; + this.assignerFactory = assignerFactory; + this.splitComparator = splitComparator; + this.emitter = emitter; + this.tableName = table.name(); + } + + String name() { + return "IcebergSource-" + tableName; + } + + private String planningThreadName() { + // Ideally, operatorId should be used as the threadPoolName as Flink guarantees its uniqueness + // within a job. SplitEnumeratorContext doesn't expose the OperatorCoordinator.Context, which + // would contain the OperatorID. 
Need to discuss with Flink community whether it is ok to expose + // a public API like the protected method "OperatorCoordinator.Context getCoordinatorContext()" + // from SourceCoordinatorContext implementation. For now, <table name>-<random UUID> is used as + // the unique thread pool name.
+ return tableName + "-" + UUID.randomUUID(); + } + + private List planSplitsForBatch(String threadName) { + ExecutorService workerPool = + ThreadPools.newWorkerPool(threadName, scanContext.planParallelism()); + try (TableLoader loader = tableLoader.clone()) { + loader.open(); + List splits = + FlinkSplitPlanner.planIcebergSourceSplits(loader.loadTable(), scanContext, workerPool); + LOG.info( + "Discovered {} splits from table {} during job initialization", splits.size(), tableName); + return splits; + } catch (IOException e) { + throw new UncheckedIOException("Failed to close table loader", e); + } finally { + workerPool.shutdown(); + } + } + + @Override + public Boundedness getBoundedness() { + return scanContext.isStreaming() ? Boundedness.CONTINUOUS_UNBOUNDED : Boundedness.BOUNDED; + } + + @Override + public SourceReader createReader(SourceReaderContext readerContext) { + IcebergSourceReaderMetrics metrics = + new IcebergSourceReaderMetrics(readerContext.metricGroup(), tableName); + return new IcebergSourceReader<>( + emitter, metrics, readerFunction, splitComparator, readerContext); + } + + @Override + public SplitEnumerator createEnumerator( + SplitEnumeratorContext enumContext) { + return createEnumerator(enumContext, null); + } + + @Override + public SplitEnumerator restoreEnumerator( + SplitEnumeratorContext enumContext, IcebergEnumeratorState enumState) { + return createEnumerator(enumContext, enumState); + } + + @Override + public SimpleVersionedSerializer getSplitSerializer() { + return new IcebergSourceSplitSerializer(scanContext.caseSensitive()); + } + + @Override + public SimpleVersionedSerializer getEnumeratorCheckpointSerializer() { + return new IcebergEnumeratorStateSerializer(scanContext.caseSensitive()); + } + + private SplitEnumerator createEnumerator( + SplitEnumeratorContext enumContext, + @Nullable IcebergEnumeratorState enumState) { + SplitAssigner assigner; + if (enumState == null) { + assigner = assignerFactory.createAssigner(); + } else { + LOG.info( + "Iceberg source restored {} splits from state for table {}", + enumState.pendingSplits().size(), + tableName); + assigner = assignerFactory.createAssigner(enumState.pendingSplits()); + } + if (scanContext.isStreaming()) { + ContinuousSplitPlanner splitPlanner = + new ContinuousSplitPlannerImpl(tableLoader, scanContext, planningThreadName()); + return new ContinuousIcebergEnumerator( + enumContext, assigner, scanContext, splitPlanner, enumState); + } else { + if (enumState == null) { + // Only do scan planning if nothing is restored from checkpoint state + List splits = planSplitsForBatch(planningThreadName()); + assigner.onDiscoveredSplits(splits); + } + + return new StaticIcebergEnumerator(enumContext, assigner); + } + } + + public static Builder builder() { + return new Builder<>(); + } + + public static Builder forRowData() { + return new Builder<>(); + } + + public static class Builder { + private TableLoader tableLoader; + private Table table; + private SplitAssignerFactory splitAssignerFactory; + private SerializableComparator splitComparator; + private ReaderFunction readerFunction; + private ReadableConfig flinkConfig = new Configuration(); + private final ScanContext.Builder contextBuilder = ScanContext.builder(); + private TableSchema projectedFlinkSchema; + private Boolean exposeLocality; + + private final Map readOptions = Maps.newHashMap(); + + Builder() {} + + public Builder tableLoader(TableLoader loader) { + this.tableLoader = loader; + return
this; + } + + public Builder table(Table newTable) { + this.table = newTable; + return this; + } + + public Builder assignerFactory(SplitAssignerFactory assignerFactory) { + this.splitAssignerFactory = assignerFactory; + return this; + } + + public Builder splitComparator( + SerializableComparator newSplitComparator) { + this.splitComparator = newSplitComparator; + return this; + } + + public Builder readerFunction(ReaderFunction newReaderFunction) { + this.readerFunction = newReaderFunction; + return this; + } + + public Builder flinkConfig(ReadableConfig config) { + this.flinkConfig = config; + return this; + } + + public Builder caseSensitive(boolean newCaseSensitive) { + readOptions.put(FlinkReadOptions.CASE_SENSITIVE, Boolean.toString(newCaseSensitive)); + return this; + } + + public Builder useSnapshotId(Long newSnapshotId) { + if (newSnapshotId != null) { + readOptions.put(FlinkReadOptions.SNAPSHOT_ID.key(), Long.toString(newSnapshotId)); + } + return this; + } + + public Builder streamingStartingStrategy(StreamingStartingStrategy newStartingStrategy) { + readOptions.put(FlinkReadOptions.STARTING_STRATEGY, newStartingStrategy.name()); + return this; + } + + public Builder startSnapshotTimestamp(Long newStartSnapshotTimestamp) { + if (newStartSnapshotTimestamp != null) { + readOptions.put( + FlinkReadOptions.START_SNAPSHOT_TIMESTAMP.key(), + Long.toString(newStartSnapshotTimestamp)); + } + return this; + } + + public Builder startSnapshotId(Long newStartSnapshotId) { + if (newStartSnapshotId != null) { + readOptions.put( + FlinkReadOptions.START_SNAPSHOT_ID.key(), Long.toString(newStartSnapshotId)); + } + return this; + } + + public Builder tag(String tag) { + readOptions.put(FlinkReadOptions.TAG.key(), tag); + return this; + } + + public Builder branch(String branch) { + readOptions.put(FlinkReadOptions.BRANCH.key(), branch); + return this; + } + + public Builder startTag(String startTag) { + readOptions.put(FlinkReadOptions.START_TAG.key(), startTag); + return this; + } + + public Builder endTag(String endTag) { + readOptions.put(FlinkReadOptions.END_TAG.key(), endTag); + return this; + } + + public Builder endSnapshotId(Long newEndSnapshotId) { + if (newEndSnapshotId != null) { + readOptions.put(FlinkReadOptions.END_SNAPSHOT_ID.key(), Long.toString(newEndSnapshotId)); + } + return this; + } + + public Builder asOfTimestamp(Long newAsOfTimestamp) { + if (newAsOfTimestamp != null) { + readOptions.put(FlinkReadOptions.AS_OF_TIMESTAMP.key(), Long.toString(newAsOfTimestamp)); + } + return this; + } + + public Builder splitSize(Long newSplitSize) { + if (newSplitSize != null) { + readOptions.put(FlinkReadOptions.SPLIT_SIZE, Long.toString(newSplitSize)); + } + return this; + } + + public Builder splitLookback(Integer newSplitLookback) { + if (newSplitLookback != null) { + readOptions.put(FlinkReadOptions.SPLIT_LOOKBACK, Integer.toString(newSplitLookback)); + } + return this; + } + + public Builder splitOpenFileCost(Long newSplitOpenFileCost) { + if (newSplitOpenFileCost != null) { + readOptions.put(FlinkReadOptions.SPLIT_FILE_OPEN_COST, Long.toString(newSplitOpenFileCost)); + } + + return this; + } + + public Builder streaming(boolean streaming) { + readOptions.put(FlinkReadOptions.STREAMING, Boolean.toString(streaming)); + return this; + } + + public Builder monitorInterval(Duration newMonitorInterval) { + if (newMonitorInterval != null) { + readOptions.put(FlinkReadOptions.MONITOR_INTERVAL, newMonitorInterval.toNanos() + " ns"); + } + return this; + } + + public Builder 
nameMapping(String newNameMapping) { + readOptions.put(TableProperties.DEFAULT_NAME_MAPPING, newNameMapping); + return this; + } + + public Builder project(Schema newProjectedSchema) { + this.contextBuilder.project(newProjectedSchema); + return this; + } + + public Builder project(TableSchema newProjectedFlinkSchema) { + this.projectedFlinkSchema = newProjectedFlinkSchema; + return this; + } + + public Builder filters(List newFilters) { + this.contextBuilder.filters(newFilters); + return this; + } + + public Builder limit(Long newLimit) { + if (newLimit != null) { + readOptions.put(FlinkReadOptions.LIMIT, Long.toString(newLimit)); + } + return this; + } + + public Builder includeColumnStats(boolean newIncludeColumnStats) { + readOptions.put( + FlinkReadOptions.INCLUDE_COLUMN_STATS, Boolean.toString(newIncludeColumnStats)); + return this; + } + + public Builder planParallelism(int planParallelism) { + readOptions.put( + FlinkConfigOptions.TABLE_EXEC_ICEBERG_WORKER_POOL_SIZE.key(), + Integer.toString(planParallelism)); + return this; + } + + public Builder exposeLocality(boolean newExposeLocality) { + this.exposeLocality = newExposeLocality; + return this; + } + + public Builder maxAllowedPlanningFailures(int maxAllowedPlanningFailures) { + readOptions.put( + FlinkReadOptions.MAX_ALLOWED_PLANNING_FAILURES_OPTION.key(), + Integer.toString(maxAllowedPlanningFailures)); + return this; + } + + /** + * Set the read properties for Flink source. View the supported properties in {@link + * FlinkReadOptions} + */ + public Builder set(String property, String value) { + readOptions.put(property, value); + return this; + } + + /** + * Set the read properties for Flink source. View the supported properties in {@link + * FlinkReadOptions} + */ + public Builder setAll(Map properties) { + readOptions.putAll(properties); + return this; + } + + /** + * Emits watermarks once per split based on the min value of column statistics from files + * metadata in the given split. The generated watermarks are also used for ordering the splits + * for read. Accepted column types are timestamp/timestamptz/long. For long columns consider + * setting {@link #watermarkColumnTimeUnit(TimeUnit)}. + * + *
+ * <p>
    Consider setting `read.split.open-file-cost` to prevent combining small files to a single + * split when the watermark is used for watermark alignment. + */ + public Builder watermarkColumn(String columnName) { + Preconditions.checkArgument( + splitAssignerFactory == null, + "Watermark column and SplitAssigner should not be set in the same source"); + readOptions.put(FlinkReadOptions.WATERMARK_COLUMN, columnName); + return this; + } + + /** + * When the type of the {@link #watermarkColumn} is {@link + * org.apache.iceberg.types.Types.LongType}, then sets the {@link TimeUnit} to convert the + * value. The default value is {@link TimeUnit#MICROSECONDS}. + */ + public Builder watermarkColumnTimeUnit(TimeUnit timeUnit) { + readOptions.put(FlinkReadOptions.WATERMARK_COLUMN_TIME_UNIT, timeUnit.name()); + return this; + } + + /** @deprecated Use {@link #setAll} instead. */ + @Deprecated + public Builder properties(Map properties) { + readOptions.putAll(properties); + return this; + } + + public IcebergSource build() { + if (table == null) { + try (TableLoader loader = tableLoader) { + loader.open(); + this.table = tableLoader.loadTable(); + } catch (IOException e) { + throw new UncheckedIOException(e); + } + } + + contextBuilder.resolveConfig(table, readOptions, flinkConfig); + Schema icebergSchema = table.schema(); + if (projectedFlinkSchema != null) { + contextBuilder.project(FlinkSchemaUtil.convert(icebergSchema, projectedFlinkSchema)); + } + + SerializableRecordEmitter emitter = SerializableRecordEmitter.defaultEmitter(); + FlinkReadConf flinkReadConf = new FlinkReadConf(table, readOptions, flinkConfig); + String watermarkColumn = flinkReadConf.watermarkColumn(); + TimeUnit watermarkTimeUnit = flinkReadConf.watermarkColumnTimeUnit(); + + if (watermarkColumn != null) { + // Column statistics is needed for watermark generation + contextBuilder.includeColumnStats(Sets.newHashSet(watermarkColumn)); + + SplitWatermarkExtractor watermarkExtractor = + new ColumnStatsWatermarkExtractor(icebergSchema, watermarkColumn, watermarkTimeUnit); + emitter = SerializableRecordEmitter.emitterWithWatermark(watermarkExtractor); + splitAssignerFactory = + new OrderedSplitAssignerFactory(SplitComparators.watermark(watermarkExtractor)); + } + + ScanContext context = contextBuilder.build(); + context.validate(); + if (readerFunction == null) { + if (table instanceof BaseMetadataTable) { + MetaDataReaderFunction rowDataReaderFunction = + new MetaDataReaderFunction( + flinkConfig, table.schema(), context.project(), table.io(), table.encryption()); + this.readerFunction = (ReaderFunction) rowDataReaderFunction; + } else { + RowDataReaderFunction rowDataReaderFunction = + new RowDataReaderFunction( + flinkConfig, + table.schema(), + context.project(), + context.nameMapping(), + context.caseSensitive(), + table.io(), + table.encryption(), + context.filters(), + context.limit()); + this.readerFunction = (ReaderFunction) rowDataReaderFunction; + } + } + + if (splitAssignerFactory == null) { + if (splitComparator == null) { + splitAssignerFactory = new SimpleSplitAssignerFactory(); + } else { + splitAssignerFactory = new OrderedSplitAssignerFactory(splitComparator); + } + } + + // Since builder already load the table, pass it to the source to avoid double loading + return new IcebergSource<>( + tableLoader, + context, + readerFunction, + splitAssignerFactory, + splitComparator, + table, + emitter); + } + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/IcebergTableSource.java 
b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/IcebergTableSource.java new file mode 100644 index 000000000000..610657e8d47b --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/IcebergTableSource.java @@ -0,0 +1,229 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source; + +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import org.apache.flink.annotation.Internal; +import org.apache.flink.api.common.eventtime.WatermarkStrategy; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.configuration.ReadableConfig; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.datastream.DataStreamSource; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.TableSchema; +import org.apache.flink.table.connector.ChangelogMode; +import org.apache.flink.table.connector.ProviderContext; +import org.apache.flink.table.connector.source.DataStreamScanProvider; +import org.apache.flink.table.connector.source.DynamicTableSource; +import org.apache.flink.table.connector.source.ScanTableSource; +import org.apache.flink.table.connector.source.abilities.SupportsFilterPushDown; +import org.apache.flink.table.connector.source.abilities.SupportsLimitPushDown; +import org.apache.flink.table.connector.source.abilities.SupportsProjectionPushDown; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.expressions.ResolvedExpression; +import org.apache.flink.table.types.DataType; +import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.flink.FlinkConfigOptions; +import org.apache.iceberg.flink.FlinkFilters; +import org.apache.iceberg.flink.TableLoader; +import org.apache.iceberg.flink.source.assigner.SplitAssignerType; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; + +/** Flink Iceberg table source. 
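+ * Supports projection, filter and limit push-down. Depending on {@link FlinkConfigOptions#TABLE_EXEC_ICEBERG_USE_FLIP27_SOURCE}, the scan runs on either the FLIP-27 {@link IcebergSource} or the legacy {@link FlinkSource}.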
*/ +@Internal +public class IcebergTableSource + implements ScanTableSource, + SupportsProjectionPushDown, + SupportsFilterPushDown, + SupportsLimitPushDown { + + private int[] projectedFields; + private Long limit; + private List filters; + + private final TableLoader loader; + private final TableSchema schema; + private final Map properties; + private final boolean isLimitPushDown; + private final ReadableConfig readableConfig; + + private IcebergTableSource(IcebergTableSource toCopy) { + this.loader = toCopy.loader; + this.schema = toCopy.schema; + this.properties = toCopy.properties; + this.projectedFields = toCopy.projectedFields; + this.isLimitPushDown = toCopy.isLimitPushDown; + this.limit = toCopy.limit; + this.filters = toCopy.filters; + this.readableConfig = toCopy.readableConfig; + } + + public IcebergTableSource( + TableLoader loader, + TableSchema schema, + Map properties, + ReadableConfig readableConfig) { + this(loader, schema, properties, null, false, null, ImmutableList.of(), readableConfig); + } + + private IcebergTableSource( + TableLoader loader, + TableSchema schema, + Map properties, + int[] projectedFields, + boolean isLimitPushDown, + Long limit, + List filters, + ReadableConfig readableConfig) { + this.loader = loader; + this.schema = schema; + this.properties = properties; + this.projectedFields = projectedFields; + this.isLimitPushDown = isLimitPushDown; + this.limit = limit; + this.filters = filters; + this.readableConfig = readableConfig; + } + + @Override + public void applyProjection(int[][] projectFields) { + this.projectedFields = new int[projectFields.length]; + for (int i = 0; i < projectFields.length; i++) { + Preconditions.checkArgument( + projectFields[i].length == 1, "Don't support nested projection in iceberg source now."); + this.projectedFields[i] = projectFields[i][0]; + } + } + + private DataStream createDataStream(StreamExecutionEnvironment execEnv) { + return FlinkSource.forRowData() + .env(execEnv) + .tableLoader(loader) + .properties(properties) + .project(getProjectedSchema()) + .limit(limit) + .filters(filters) + .flinkConf(readableConfig) + .build(); + } + + private DataStreamSource createFLIP27Stream(StreamExecutionEnvironment env) { + SplitAssignerType assignerType = + readableConfig.get(FlinkConfigOptions.TABLE_EXEC_SPLIT_ASSIGNER_TYPE); + IcebergSource source = + IcebergSource.forRowData() + .tableLoader(loader) + .assignerFactory(assignerType.factory()) + .properties(properties) + .project(getProjectedSchema()) + .limit(limit) + .filters(filters) + .flinkConfig(readableConfig) + .build(); + DataStreamSource stream = + env.fromSource( + source, + WatermarkStrategy.noWatermarks(), + source.name(), + TypeInformation.of(RowData.class)); + return stream; + } + + private TableSchema getProjectedSchema() { + if (projectedFields == null) { + return schema; + } else { + String[] fullNames = schema.getFieldNames(); + DataType[] fullTypes = schema.getFieldDataTypes(); + return TableSchema.builder() + .fields( + Arrays.stream(projectedFields).mapToObj(i -> fullNames[i]).toArray(String[]::new), + Arrays.stream(projectedFields).mapToObj(i -> fullTypes[i]).toArray(DataType[]::new)) + .build(); + } + } + + @Override + public void applyLimit(long newLimit) { + this.limit = newLimit; + } + + @Override + public Result applyFilters(List flinkFilters) { + List acceptedFilters = Lists.newArrayList(); + List expressions = Lists.newArrayList(); + + for (ResolvedExpression resolvedExpression : flinkFilters) { + Optional icebergExpression = 
FlinkFilters.convert(resolvedExpression); + if (icebergExpression.isPresent()) { + expressions.add(icebergExpression.get()); + acceptedFilters.add(resolvedExpression); + } + } + + this.filters = expressions; + return Result.of(acceptedFilters, flinkFilters); + } + + @Override + public boolean supportsNestedProjection() { + // TODO: support nested projection + return false; + } + + @Override + public ChangelogMode getChangelogMode() { + return ChangelogMode.insertOnly(); + } + + @Override + public ScanRuntimeProvider getScanRuntimeProvider(ScanContext runtimeProviderContext) { + return new DataStreamScanProvider() { + @Override + public DataStream produceDataStream( + ProviderContext providerContext, StreamExecutionEnvironment execEnv) { + if (readableConfig.get(FlinkConfigOptions.TABLE_EXEC_ICEBERG_USE_FLIP27_SOURCE)) { + return createFLIP27Stream(execEnv); + } else { + return createDataStream(execEnv); + } + } + + @Override + public boolean isBounded() { + return FlinkSource.isBounded(properties); + } + }; + } + + @Override + public DynamicTableSource copy() { + return new IcebergTableSource(this); + } + + @Override + public String asSummaryString() { + return "Iceberg table source"; + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/RowDataFileScanTaskReader.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/RowDataFileScanTaskReader.java new file mode 100644 index 000000000000..88364f4e87b1 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/RowDataFileScanTaskReader.java @@ -0,0 +1,243 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source; + +import java.util.List; +import java.util.Map; +import org.apache.flink.annotation.Internal; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.FileScanTask; +import org.apache.iceberg.MetadataColumns; +import org.apache.iceberg.Schema; +import org.apache.iceberg.StructLike; +import org.apache.iceberg.avro.Avro; +import org.apache.iceberg.data.DeleteFilter; +import org.apache.iceberg.encryption.InputFilesDecryptor; +import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.expressions.Expressions; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.FlinkSourceFilter; +import org.apache.iceberg.flink.RowDataWrapper; +import org.apache.iceberg.flink.data.FlinkAvroReader; +import org.apache.iceberg.flink.data.FlinkOrcReader; +import org.apache.iceberg.flink.data.FlinkParquetReaders; +import org.apache.iceberg.flink.data.RowDataProjection; +import org.apache.iceberg.flink.data.RowDataUtil; +import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.io.CloseableIterator; +import org.apache.iceberg.io.InputFile; +import org.apache.iceberg.mapping.NameMappingParser; +import org.apache.iceberg.orc.ORC; +import org.apache.iceberg.parquet.Parquet; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Sets; +import org.apache.iceberg.types.TypeUtil; +import org.apache.iceberg.util.PartitionUtil; + +@Internal +public class RowDataFileScanTaskReader implements FileScanTaskReader { + + private final Schema tableSchema; + private final Schema projectedSchema; + private final String nameMapping; + private final boolean caseSensitive; + private final FlinkSourceFilter rowFilter; + + public RowDataFileScanTaskReader( + Schema tableSchema, + Schema projectedSchema, + String nameMapping, + boolean caseSensitive, + List filters) { + this.tableSchema = tableSchema; + this.projectedSchema = projectedSchema; + this.nameMapping = nameMapping; + this.caseSensitive = caseSensitive; + + if (filters != null && !filters.isEmpty()) { + Expression combinedExpression = + filters.stream().reduce(Expressions.alwaysTrue(), Expressions::and); + this.rowFilter = + new FlinkSourceFilter(this.projectedSchema, combinedExpression, this.caseSensitive); + } else { + this.rowFilter = null; + } + } + + @Override + public CloseableIterator open( + FileScanTask task, InputFilesDecryptor inputFilesDecryptor) { + Schema partitionSchema = TypeUtil.select(projectedSchema, task.spec().identitySourceIds()); + + Map idToConstant = + partitionSchema.columns().isEmpty() + ? ImmutableMap.of() + : PartitionUtil.constantsMap(task, RowDataUtil::convertConstant); + + FlinkDeleteFilter deletes = + new FlinkDeleteFilter(task, tableSchema, projectedSchema, inputFilesDecryptor); + CloseableIterable iterable = + deletes.filter( + newIterable(task, deletes.requiredSchema(), idToConstant, inputFilesDecryptor)); + + // Project the RowData to remove the extra meta columns. 
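+ // The delete filter may require extra columns beyond the requested projection (for example,
+ // fields needed to apply equality or position deletes), so project back down to the requested
+ // schema before handing rows downstream.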
+ if (!projectedSchema.sameSchema(deletes.requiredSchema())) { + RowDataProjection rowDataProjection = + RowDataProjection.create( + deletes.requiredRowType(), + deletes.requiredSchema().asStruct(), + projectedSchema.asStruct()); + iterable = CloseableIterable.transform(iterable, rowDataProjection::wrap); + } + + return iterable.iterator(); + } + + private CloseableIterable newIterable( + FileScanTask task, + Schema schema, + Map idToConstant, + InputFilesDecryptor inputFilesDecryptor) { + CloseableIterable iter; + if (task.isDataTask()) { + throw new UnsupportedOperationException("Cannot read data task."); + } else { + switch (task.file().format()) { + case PARQUET: + iter = newParquetIterable(task, schema, idToConstant, inputFilesDecryptor); + break; + + case AVRO: + iter = newAvroIterable(task, schema, idToConstant, inputFilesDecryptor); + break; + + case ORC: + iter = newOrcIterable(task, schema, idToConstant, inputFilesDecryptor); + break; + + default: + throw new UnsupportedOperationException( + "Cannot read unknown format: " + task.file().format()); + } + } + + if (rowFilter != null) { + return CloseableIterable.filter(iter, rowFilter::filter); + } + return iter; + } + + private CloseableIterable newAvroIterable( + FileScanTask task, + Schema schema, + Map idToConstant, + InputFilesDecryptor inputFilesDecryptor) { + Avro.ReadBuilder builder = + Avro.read(inputFilesDecryptor.getInputFile(task)) + .reuseContainers() + .project(schema) + .split(task.start(), task.length()) + .createReaderFunc(readSchema -> new FlinkAvroReader(schema, readSchema, idToConstant)); + + if (nameMapping != null) { + builder.withNameMapping(NameMappingParser.fromJson(nameMapping)); + } + + return builder.build(); + } + + private CloseableIterable newParquetIterable( + FileScanTask task, + Schema schema, + Map idToConstant, + InputFilesDecryptor inputFilesDecryptor) { + Parquet.ReadBuilder builder = + Parquet.read(inputFilesDecryptor.getInputFile(task)) + .split(task.start(), task.length()) + .project(schema) + .createReaderFunc( + fileSchema -> FlinkParquetReaders.buildReader(schema, fileSchema, idToConstant)) + .filter(task.residual()) + .caseSensitive(caseSensitive) + .reuseContainers(); + + if (nameMapping != null) { + builder.withNameMapping(NameMappingParser.fromJson(nameMapping)); + } + + return builder.build(); + } + + private CloseableIterable newOrcIterable( + FileScanTask task, + Schema schema, + Map idToConstant, + InputFilesDecryptor inputFilesDecryptor) { + Schema readSchemaWithoutConstantAndMetadataFields = + TypeUtil.selectNot( + schema, Sets.union(idToConstant.keySet(), MetadataColumns.metadataFieldIds())); + + ORC.ReadBuilder builder = + ORC.read(inputFilesDecryptor.getInputFile(task)) + .project(readSchemaWithoutConstantAndMetadataFields) + .split(task.start(), task.length()) + .createReaderFunc( + readOrcSchema -> new FlinkOrcReader(schema, readOrcSchema, idToConstant)) + .filter(task.residual()) + .caseSensitive(caseSensitive); + + if (nameMapping != null) { + builder.withNameMapping(NameMappingParser.fromJson(nameMapping)); + } + + return builder.build(); + } + + private static class FlinkDeleteFilter extends DeleteFilter { + private final RowType requiredRowType; + private final RowDataWrapper asStructLike; + private final InputFilesDecryptor inputFilesDecryptor; + + FlinkDeleteFilter( + FileScanTask task, + Schema tableSchema, + Schema requestedSchema, + InputFilesDecryptor inputFilesDecryptor) { + super(task.file().path().toString(), task.deletes(), tableSchema, requestedSchema); + 
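+ // Convert the delete filter's required schema to a Flink row type once; it is reused for
+ // wrapping rows as StructLike and for projecting back to the requested schema.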
this.requiredRowType = FlinkSchemaUtil.convert(requiredSchema()); + this.asStructLike = new RowDataWrapper(requiredRowType, requiredSchema().asStruct()); + this.inputFilesDecryptor = inputFilesDecryptor; + } + + public RowType requiredRowType() { + return requiredRowType; + } + + @Override + protected StructLike asStructLike(RowData row) { + return asStructLike.wrap(row); + } + + @Override + protected InputFile getInputFile(String location) { + return inputFilesDecryptor.getInputFile(location); + } + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/RowDataRewriter.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/RowDataRewriter.java new file mode 100644 index 000000000000..c958604c004a --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/RowDataRewriter.java @@ -0,0 +1,172 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source; + +import static org.apache.iceberg.TableProperties.DEFAULT_NAME_MAPPING; + +import java.util.Collection; +import java.util.Collections; +import java.util.List; +import java.util.stream.Collectors; +import org.apache.flink.api.common.functions.RichMapFunction; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.CombinedScanTask; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SerializableTable; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.encryption.EncryptionManager; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.sink.RowDataTaskWriterFactory; +import org.apache.iceberg.flink.sink.TaskWriterFactory; +import org.apache.iceberg.io.FileIO; +import org.apache.iceberg.io.TaskWriter; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.util.PropertyUtil; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class RowDataRewriter { + + private static final Logger LOG = LoggerFactory.getLogger(RowDataRewriter.class); + + private final Schema schema; + private final String nameMapping; + private final FileIO io; + private final boolean caseSensitive; + private final EncryptionManager encryptionManager; + private final TaskWriterFactory taskWriterFactory; + private final String tableName; + + public RowDataRewriter( + Table table, boolean caseSensitive, FileIO io, EncryptionManager encryptionManager) { + this.schema = table.schema(); + this.caseSensitive = caseSensitive; 
+ this.io = io; + this.encryptionManager = encryptionManager; + this.nameMapping = + PropertyUtil.propertyAsString(table.properties(), DEFAULT_NAME_MAPPING, null); + this.tableName = table.name(); + + String formatString = + PropertyUtil.propertyAsString( + table.properties(), + TableProperties.DEFAULT_FILE_FORMAT, + TableProperties.DEFAULT_FILE_FORMAT_DEFAULT); + FileFormat format = FileFormat.fromString(formatString); + RowType flinkSchema = FlinkSchemaUtil.convert(table.schema()); + this.taskWriterFactory = + new RowDataTaskWriterFactory( + SerializableTable.copyOf(table), + flinkSchema, + Long.MAX_VALUE, + format, + table.properties(), + null, + false); + } + + public List rewriteDataForTasks( + DataStream dataStream, int parallelism) throws Exception { + RewriteMap map = + new RewriteMap( + schema, nameMapping, io, caseSensitive, encryptionManager, taskWriterFactory); + DataStream> ds = dataStream.map(map).setParallelism(parallelism); + return Lists.newArrayList(ds.executeAndCollect("Rewrite table :" + tableName)).stream() + .flatMap(Collection::stream) + .collect(Collectors.toList()); + } + + public static class RewriteMap extends RichMapFunction> { + + private TaskWriter writer; + private int subTaskId; + private int attemptId; + + private final Schema schema; + private final String nameMapping; + private final FileIO io; + private final boolean caseSensitive; + private final EncryptionManager encryptionManager; + private final TaskWriterFactory taskWriterFactory; + private final RowDataFileScanTaskReader rowDataReader; + + public RewriteMap( + Schema schema, + String nameMapping, + FileIO io, + boolean caseSensitive, + EncryptionManager encryptionManager, + TaskWriterFactory taskWriterFactory) { + this.schema = schema; + this.nameMapping = nameMapping; + this.io = io; + this.caseSensitive = caseSensitive; + this.encryptionManager = encryptionManager; + this.taskWriterFactory = taskWriterFactory; + this.rowDataReader = + new RowDataFileScanTaskReader( + schema, schema, nameMapping, caseSensitive, Collections.emptyList()); + } + + @Override + public void open(Configuration parameters) { + this.subTaskId = getRuntimeContext().getIndexOfThisSubtask(); + this.attemptId = getRuntimeContext().getAttemptNumber(); + // Initialize the task writer factory. + this.taskWriterFactory.initialize(subTaskId, attemptId); + } + + @Override + public List map(CombinedScanTask task) throws Exception { + // Initialize the task writer. 
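+ // A new writer is created for every CombinedScanTask so that the data files produced by the
+ // rewrite can be collected and returned per task.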
+ this.writer = taskWriterFactory.create(); + try (DataIterator iterator = + new DataIterator<>(rowDataReader, task, io, encryptionManager)) { + while (iterator.hasNext()) { + RowData rowData = iterator.next(); + writer.write(rowData); + } + return Lists.newArrayList(writer.dataFiles()); + } catch (Throwable originalThrowable) { + try { + LOG.error("Aborting commit for (subTaskId {}, attemptId {})", subTaskId, attemptId); + writer.abort(); + LOG.error("Aborted commit for (subTaskId {}, attemptId {})", subTaskId, attemptId); + } catch (Throwable inner) { + if (originalThrowable != inner) { + originalThrowable.addSuppressed(inner); + LOG.warn("Suppressing exception in catch: {}", inner.getMessage(), inner); + } + } + + if (originalThrowable instanceof Exception) { + throw originalThrowable; + } else { + throw new RuntimeException(originalThrowable); + } + } + } + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/RowDataToAvroGenericRecordConverter.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/RowDataToAvroGenericRecordConverter.java new file mode 100644 index 000000000000..8ef1f1fbb833 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/RowDataToAvroGenericRecordConverter.java @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source; + +import java.io.Serializable; +import java.util.function.Function; +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.flink.annotation.Internal; +import org.apache.flink.formats.avro.RowDataToAvroConverters; +import org.apache.flink.formats.avro.typeutils.AvroSchemaConverter; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.DataType; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.table.types.utils.TypeConversions; +import org.apache.iceberg.avro.AvroSchemaUtil; +import org.apache.iceberg.flink.FlinkSchemaUtil; + +/** + * This is not serializable because Avro {@link Schema} is not actually serializable, even though it + * implements {@link Serializable} interface. 
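+ * <p>Construct it where it is used, e.g. via {@link #fromIcebergSchema} or {@link #fromAvroSchema}, instead of serializing converter instances.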
+ */ +@Internal +public class RowDataToAvroGenericRecordConverter implements Function { + private final RowDataToAvroConverters.RowDataToAvroConverter converter; + private final Schema avroSchema; + + private RowDataToAvroGenericRecordConverter(RowType rowType, Schema avroSchema) { + this.converter = RowDataToAvroConverters.createConverter(rowType); + this.avroSchema = avroSchema; + } + + @Override + public GenericRecord apply(RowData rowData) { + return (GenericRecord) converter.convert(avroSchema, rowData); + } + + /** Create a converter based on Iceberg schema */ + public static RowDataToAvroGenericRecordConverter fromIcebergSchema( + String tableName, org.apache.iceberg.Schema icebergSchema) { + RowType rowType = FlinkSchemaUtil.convert(icebergSchema); + Schema avroSchema = AvroSchemaUtil.convert(icebergSchema, tableName); + return new RowDataToAvroGenericRecordConverter(rowType, avroSchema); + } + + /** Create a mapper based on Avro schema */ + public static RowDataToAvroGenericRecordConverter fromAvroSchema(Schema avroSchema) { + DataType dataType = AvroSchemaConverter.convertToDataType(avroSchema.toString()); + LogicalType logicalType = TypeConversions.fromDataToLogicalType(dataType); + RowType rowType = RowType.of(logicalType.getChildren().toArray(new LogicalType[0])); + return new RowDataToAvroGenericRecordConverter(rowType, avroSchema); + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/ScanContext.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/ScanContext.java new file mode 100644 index 000000000000..ab79a3173933 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/ScanContext.java @@ -0,0 +1,597 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source; + +import java.io.Serializable; +import java.time.Duration; +import java.util.Collection; +import java.util.List; +import java.util.Map; +import java.util.concurrent.TimeUnit; +import org.apache.flink.annotation.Internal; +import org.apache.flink.configuration.ReadableConfig; +import org.apache.flink.util.Preconditions; +import org.apache.flink.util.TimeUtils; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Table; +import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.flink.FlinkConfigOptions; +import org.apache.iceberg.flink.FlinkReadConf; +import org.apache.iceberg.flink.FlinkReadOptions; + +/** Context object with optional arguments for a Flink Scan. 
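+ * It holds the resolved read options (snapshot, branch/tag selection, split sizing, streaming and watermark settings), and {@code validate()} rejects conflicting combinations such as setting both a start snapshot id and a start tag. A typical instance is built via the builder, e.g. {@code ScanContext.builder().useBranch("main").streaming(true).monitorInterval(Duration.ofSeconds(60)).build()}.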
*/ +@Internal +public class ScanContext implements Serializable { + + private static final long serialVersionUID = 1L; + + private final boolean caseSensitive; + private final boolean exposeLocality; + private final Long snapshotId; + private final String branch; + private final String tag; + private final StreamingStartingStrategy startingStrategy; + private final Long startSnapshotId; + private final Long startSnapshotTimestamp; + private final Long endSnapshotId; + private final Long asOfTimestamp; + private final String startTag; + private final String endTag; + private final Long splitSize; + private final Integer splitLookback; + private final Long splitOpenFileCost; + private final boolean isStreaming; + private final Duration monitorInterval; + + private final String nameMapping; + private final Schema schema; + private final List filters; + private final long limit; + private final boolean includeColumnStats; + private final Collection includeStatsForColumns; + private final Integer planParallelism; + private final int maxPlanningSnapshotCount; + private final int maxAllowedPlanningFailures; + private final String watermarkColumn; + private final TimeUnit watermarkColumnTimeUnit; + + private ScanContext( + boolean caseSensitive, + Long snapshotId, + StreamingStartingStrategy startingStrategy, + Long startSnapshotTimestamp, + Long startSnapshotId, + Long endSnapshotId, + Long asOfTimestamp, + Long splitSize, + Integer splitLookback, + Long splitOpenFileCost, + boolean isStreaming, + Duration monitorInterval, + String nameMapping, + Schema schema, + List filters, + long limit, + boolean includeColumnStats, + Collection includeStatsForColumns, + boolean exposeLocality, + Integer planParallelism, + int maxPlanningSnapshotCount, + int maxAllowedPlanningFailures, + String watermarkColumn, + TimeUnit watermarkColumnTimeUnit, + String branch, + String tag, + String startTag, + String endTag) { + this.caseSensitive = caseSensitive; + this.snapshotId = snapshotId; + this.tag = tag; + this.branch = branch; + this.startingStrategy = startingStrategy; + this.startSnapshotTimestamp = startSnapshotTimestamp; + this.startSnapshotId = startSnapshotId; + this.endSnapshotId = endSnapshotId; + this.asOfTimestamp = asOfTimestamp; + this.startTag = startTag; + this.endTag = endTag; + this.splitSize = splitSize; + this.splitLookback = splitLookback; + this.splitOpenFileCost = splitOpenFileCost; + this.isStreaming = isStreaming; + this.monitorInterval = monitorInterval; + + this.nameMapping = nameMapping; + this.schema = schema; + this.filters = filters; + this.limit = limit; + this.includeColumnStats = includeColumnStats; + this.includeStatsForColumns = includeStatsForColumns; + this.exposeLocality = exposeLocality; + this.planParallelism = planParallelism; + this.maxPlanningSnapshotCount = maxPlanningSnapshotCount; + this.maxAllowedPlanningFailures = maxAllowedPlanningFailures; + this.watermarkColumn = watermarkColumn; + this.watermarkColumnTimeUnit = watermarkColumnTimeUnit; + } + + void validate() { + if (isStreaming) { + if (startingStrategy == StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) { + Preconditions.checkArgument( + startSnapshotId != null, + "Invalid starting snapshot id for SPECIFIC_START_SNAPSHOT_ID strategy: null"); + Preconditions.checkArgument( + startSnapshotTimestamp == null, + "Invalid starting snapshot timestamp for SPECIFIC_START_SNAPSHOT_ID strategy: not null"); + } + if (startingStrategy == StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) { + 
Preconditions.checkArgument( + startSnapshotTimestamp != null, + "Invalid starting snapshot timestamp for SPECIFIC_START_SNAPSHOT_TIMESTAMP strategy: null"); + Preconditions.checkArgument( + startSnapshotId == null, + "Invalid starting snapshot id for SPECIFIC_START_SNAPSHOT_ID strategy: not null"); + } + + Preconditions.checkArgument( + tag == null, + String.format("Cannot scan table using ref %s configured for streaming reader", tag)); + Preconditions.checkArgument( + snapshotId == null, "Cannot set snapshot-id option for streaming reader"); + Preconditions.checkArgument( + asOfTimestamp == null, "Cannot set as-of-timestamp option for streaming reader"); + Preconditions.checkArgument( + endSnapshotId == null, "Cannot set end-snapshot-id option for streaming reader"); + Preconditions.checkArgument(endTag == null, "Cannot set end-tag option for streaming reader"); + } + + Preconditions.checkArgument( + !(startTag != null && startSnapshotId() != null), + "START_SNAPSHOT_ID and START_TAG cannot both be set."); + + Preconditions.checkArgument( + !(endTag != null && endSnapshotId() != null), + "END_SNAPSHOT_ID and END_TAG cannot both be set."); + + Preconditions.checkArgument( + maxAllowedPlanningFailures >= -1, + "Cannot set maxAllowedPlanningFailures to a negative number other than -1."); + } + + public boolean caseSensitive() { + return caseSensitive; + } + + public Long snapshotId() { + return snapshotId; + } + + public String branch() { + return branch; + } + + public String tag() { + return tag; + } + + public String startTag() { + return startTag; + } + + public String endTag() { + return endTag; + } + + public StreamingStartingStrategy streamingStartingStrategy() { + return startingStrategy; + } + + public Long startSnapshotTimestamp() { + return startSnapshotTimestamp; + } + + public Long startSnapshotId() { + return startSnapshotId; + } + + public Long endSnapshotId() { + return endSnapshotId; + } + + public Long asOfTimestamp() { + return asOfTimestamp; + } + + public Long splitSize() { + return splitSize; + } + + public Integer splitLookback() { + return splitLookback; + } + + public Long splitOpenFileCost() { + return splitOpenFileCost; + } + + public boolean isStreaming() { + return isStreaming; + } + + public Duration monitorInterval() { + return monitorInterval; + } + + public String nameMapping() { + return nameMapping; + } + + public Schema project() { + return schema; + } + + public List filters() { + return filters; + } + + public long limit() { + return limit; + } + + public boolean includeColumnStats() { + return includeColumnStats; + } + + public Collection includeStatsForColumns() { + return includeStatsForColumns; + } + + public boolean exposeLocality() { + return exposeLocality; + } + + public Integer planParallelism() { + return planParallelism; + } + + public int maxPlanningSnapshotCount() { + return maxPlanningSnapshotCount; + } + + public int maxAllowedPlanningFailures() { + return maxAllowedPlanningFailures; + } + + public String watermarkColumn() { + return watermarkColumn; + } + + public TimeUnit watermarkColumnTimeUnit() { + return watermarkColumnTimeUnit; + } + + public ScanContext copyWithAppendsBetween(Long newStartSnapshotId, long newEndSnapshotId) { + return ScanContext.builder() + .caseSensitive(caseSensitive) + .useSnapshotId(null) + .useBranch(branch) + .useTag(null) + .startSnapshotId(newStartSnapshotId) + .endSnapshotId(newEndSnapshotId) + .startTag(null) + .endTag(null) + .asOfTimestamp(null) + .splitSize(splitSize) + .splitLookback(splitLookback) + 
.splitOpenFileCost(splitOpenFileCost) + .streaming(isStreaming) + .monitorInterval(monitorInterval) + .nameMapping(nameMapping) + .project(schema) + .filters(filters) + .limit(limit) + .includeColumnStats(includeColumnStats) + .includeColumnStats(includeStatsForColumns) + .exposeLocality(exposeLocality) + .planParallelism(planParallelism) + .maxPlanningSnapshotCount(maxPlanningSnapshotCount) + .maxAllowedPlanningFailures(maxAllowedPlanningFailures) + .watermarkColumn(watermarkColumn) + .watermarkColumnTimeUnit(watermarkColumnTimeUnit) + .build(); + } + + public ScanContext copyWithSnapshotId(long newSnapshotId) { + return ScanContext.builder() + .caseSensitive(caseSensitive) + .useSnapshotId(newSnapshotId) + .useBranch(branch) + .useTag(tag) + .startSnapshotId(null) + .endSnapshotId(null) + .startTag(null) + .endTag(null) + .asOfTimestamp(null) + .splitSize(splitSize) + .splitLookback(splitLookback) + .splitOpenFileCost(splitOpenFileCost) + .streaming(isStreaming) + .monitorInterval(monitorInterval) + .nameMapping(nameMapping) + .project(schema) + .filters(filters) + .limit(limit) + .includeColumnStats(includeColumnStats) + .includeColumnStats(includeStatsForColumns) + .exposeLocality(exposeLocality) + .planParallelism(planParallelism) + .maxPlanningSnapshotCount(maxPlanningSnapshotCount) + .maxAllowedPlanningFailures(maxAllowedPlanningFailures) + .watermarkColumn(watermarkColumn) + .watermarkColumnTimeUnit(watermarkColumnTimeUnit) + .build(); + } + + public static Builder builder() { + return new Builder(); + } + + public static class Builder { + private boolean caseSensitive = FlinkReadOptions.CASE_SENSITIVE_OPTION.defaultValue(); + private Long snapshotId = FlinkReadOptions.SNAPSHOT_ID.defaultValue(); + private String branch = FlinkReadOptions.BRANCH.defaultValue(); + private String tag = FlinkReadOptions.TAG.defaultValue(); + private String startTag = FlinkReadOptions.START_TAG.defaultValue(); + private String endTag = FlinkReadOptions.END_TAG.defaultValue(); + private StreamingStartingStrategy startingStrategy = + FlinkReadOptions.STARTING_STRATEGY_OPTION.defaultValue(); + private Long startSnapshotTimestamp = FlinkReadOptions.START_SNAPSHOT_TIMESTAMP.defaultValue(); + private Long startSnapshotId = FlinkReadOptions.START_SNAPSHOT_ID.defaultValue(); + private Long endSnapshotId = FlinkReadOptions.END_SNAPSHOT_ID.defaultValue(); + private Long asOfTimestamp = FlinkReadOptions.AS_OF_TIMESTAMP.defaultValue(); + private Long splitSize = FlinkReadOptions.SPLIT_SIZE_OPTION.defaultValue(); + private Integer splitLookback = FlinkReadOptions.SPLIT_LOOKBACK_OPTION.defaultValue(); + private Long splitOpenFileCost = FlinkReadOptions.SPLIT_FILE_OPEN_COST_OPTION.defaultValue(); + private boolean isStreaming = FlinkReadOptions.STREAMING_OPTION.defaultValue(); + private Duration monitorInterval = + TimeUtils.parseDuration(FlinkReadOptions.MONITOR_INTERVAL_OPTION.defaultValue()); + private String nameMapping; + private Schema projectedSchema; + private List filters; + private long limit = FlinkReadOptions.LIMIT_OPTION.defaultValue(); + private boolean includeColumnStats = + FlinkReadOptions.INCLUDE_COLUMN_STATS_OPTION.defaultValue(); + private Collection includeStatsForColumns = null; + private boolean exposeLocality; + private Integer planParallelism = + FlinkConfigOptions.TABLE_EXEC_ICEBERG_WORKER_POOL_SIZE.defaultValue(); + private int maxPlanningSnapshotCount = + FlinkReadOptions.MAX_PLANNING_SNAPSHOT_COUNT_OPTION.defaultValue(); + private int maxAllowedPlanningFailures = + 
FlinkReadOptions.MAX_ALLOWED_PLANNING_FAILURES_OPTION.defaultValue(); + private String watermarkColumn = FlinkReadOptions.WATERMARK_COLUMN_OPTION.defaultValue(); + private TimeUnit watermarkColumnTimeUnit = + FlinkReadOptions.WATERMARK_COLUMN_TIME_UNIT_OPTION.defaultValue(); + + private Builder() {} + + public Builder caseSensitive(boolean newCaseSensitive) { + this.caseSensitive = newCaseSensitive; + return this; + } + + public Builder useSnapshotId(Long newSnapshotId) { + this.snapshotId = newSnapshotId; + return this; + } + + public Builder useTag(String newTag) { + this.tag = newTag; + return this; + } + + public Builder useBranch(String newBranch) { + this.branch = newBranch; + return this; + } + + public Builder startingStrategy(StreamingStartingStrategy newStartingStrategy) { + this.startingStrategy = newStartingStrategy; + return this; + } + + public Builder startSnapshotTimestamp(Long newStartSnapshotTimestamp) { + this.startSnapshotTimestamp = newStartSnapshotTimestamp; + return this; + } + + public Builder startSnapshotId(Long newStartSnapshotId) { + this.startSnapshotId = newStartSnapshotId; + return this; + } + + public Builder endSnapshotId(Long newEndSnapshotId) { + this.endSnapshotId = newEndSnapshotId; + return this; + } + + public Builder startTag(String newStartTag) { + this.startTag = newStartTag; + return this; + } + + public Builder endTag(String newEndTag) { + this.endTag = newEndTag; + return this; + } + + public Builder asOfTimestamp(Long newAsOfTimestamp) { + this.asOfTimestamp = newAsOfTimestamp; + return this; + } + + public Builder splitSize(Long newSplitSize) { + this.splitSize = newSplitSize; + return this; + } + + public Builder splitLookback(Integer newSplitLookback) { + this.splitLookback = newSplitLookback; + return this; + } + + public Builder splitOpenFileCost(Long newSplitOpenFileCost) { + this.splitOpenFileCost = newSplitOpenFileCost; + return this; + } + + public Builder streaming(boolean streaming) { + this.isStreaming = streaming; + return this; + } + + public Builder monitorInterval(Duration newMonitorInterval) { + this.monitorInterval = newMonitorInterval; + return this; + } + + public Builder nameMapping(String newNameMapping) { + this.nameMapping = newNameMapping; + return this; + } + + public Builder project(Schema newProjectedSchema) { + this.projectedSchema = newProjectedSchema; + return this; + } + + public Builder filters(List newFilters) { + this.filters = newFilters; + return this; + } + + public Builder limit(long newLimit) { + this.limit = newLimit; + return this; + } + + public Builder includeColumnStats(boolean newIncludeColumnStats) { + this.includeColumnStats = newIncludeColumnStats; + return this; + } + + public Builder includeColumnStats(Collection newIncludeStatsForColumns) { + this.includeStatsForColumns = newIncludeStatsForColumns; + return this; + } + + public Builder exposeLocality(boolean newExposeLocality) { + this.exposeLocality = newExposeLocality; + return this; + } + + public Builder planParallelism(Integer parallelism) { + this.planParallelism = parallelism; + return this; + } + + public Builder maxPlanningSnapshotCount(int newMaxPlanningSnapshotCount) { + this.maxPlanningSnapshotCount = newMaxPlanningSnapshotCount; + return this; + } + + public Builder maxAllowedPlanningFailures(int newMaxAllowedPlanningFailures) { + this.maxAllowedPlanningFailures = newMaxAllowedPlanningFailures; + return this; + } + + public Builder watermarkColumn(String newWatermarkColumn) { + this.watermarkColumn = newWatermarkColumn; + return 
this; + } + + public Builder watermarkColumnTimeUnit(TimeUnit newWatermarkTimeUnit) { + this.watermarkColumnTimeUnit = newWatermarkTimeUnit; + return this; + } + + public Builder resolveConfig( + Table table, Map readOptions, ReadableConfig readableConfig) { + FlinkReadConf flinkReadConf = new FlinkReadConf(table, readOptions, readableConfig); + + return this.useSnapshotId(flinkReadConf.snapshotId()) + .useTag(flinkReadConf.tag()) + .useBranch(flinkReadConf.branch()) + .startTag(flinkReadConf.startTag()) + .endTag(flinkReadConf.endTag()) + .caseSensitive(flinkReadConf.caseSensitive()) + .asOfTimestamp(flinkReadConf.asOfTimestamp()) + .startingStrategy(flinkReadConf.startingStrategy()) + .startSnapshotTimestamp(flinkReadConf.startSnapshotTimestamp()) + .startSnapshotId(flinkReadConf.startSnapshotId()) + .endSnapshotId(flinkReadConf.endSnapshotId()) + .splitSize(flinkReadConf.splitSize()) + .splitLookback(flinkReadConf.splitLookback()) + .splitOpenFileCost(flinkReadConf.splitFileOpenCost()) + .streaming(flinkReadConf.streaming()) + .monitorInterval(flinkReadConf.monitorInterval()) + .nameMapping(flinkReadConf.nameMapping()) + .limit(flinkReadConf.limit()) + .planParallelism(flinkReadConf.workerPoolSize()) + .includeColumnStats(flinkReadConf.includeColumnStats()) + .maxPlanningSnapshotCount(flinkReadConf.maxPlanningSnapshotCount()) + .maxAllowedPlanningFailures(maxAllowedPlanningFailures) + .watermarkColumn(flinkReadConf.watermarkColumn()) + .watermarkColumnTimeUnit(flinkReadConf.watermarkColumnTimeUnit()); + } + + public ScanContext build() { + return new ScanContext( + caseSensitive, + snapshotId, + startingStrategy, + startSnapshotTimestamp, + startSnapshotId, + endSnapshotId, + asOfTimestamp, + splitSize, + splitLookback, + splitOpenFileCost, + isStreaming, + monitorInterval, + nameMapping, + projectedSchema, + filters, + limit, + includeColumnStats, + includeStatsForColumns, + exposeLocality, + planParallelism, + maxPlanningSnapshotCount, + maxAllowedPlanningFailures, + watermarkColumn, + watermarkColumnTimeUnit, + branch, + tag, + startTag, + endTag); + } + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/SourceUtil.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/SourceUtil.java new file mode 100644 index 000000000000..7c3a69dbc141 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/SourceUtil.java @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source; + +import java.util.function.Supplier; +import org.apache.flink.configuration.ReadableConfig; +import org.apache.flink.table.api.config.ExecutionConfigOptions; +import org.apache.iceberg.Table; +import org.apache.iceberg.flink.FlinkConfigOptions; +import org.apache.iceberg.hadoop.Util; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; + +class SourceUtil { + private SourceUtil() {} + + static boolean isLocalityEnabled( + Table table, ReadableConfig readableConfig, Boolean exposeLocality) { + Boolean localityEnabled = + exposeLocality != null + ? exposeLocality + : readableConfig.get(FlinkConfigOptions.TABLE_EXEC_ICEBERG_EXPOSE_SPLIT_LOCALITY_INFO); + + if (localityEnabled != null && !localityEnabled) { + return false; + } + + return Util.mayHaveBlockLocations(table.io(), table.location()); + } + + /** + * Infer source parallelism. + * + * @param readableConfig Flink config. + * @param splitCountProvider Split count supplier. As the computation may involve expensive split + * discover, lazy evaluation is performed if inferring parallelism is enabled. + * @param limitCount limited output count. + */ + static int inferParallelism( + ReadableConfig readableConfig, long limitCount, Supplier splitCountProvider) { + int parallelism = + readableConfig.get(ExecutionConfigOptions.TABLE_EXEC_RESOURCE_DEFAULT_PARALLELISM); + if (readableConfig.get(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM)) { + int maxInferParallelism = + readableConfig.get(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM_MAX); + Preconditions.checkState( + maxInferParallelism >= 1, + FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM_MAX.key() + + " cannot be less than 1"); + parallelism = Math.min(splitCountProvider.get(), maxInferParallelism); + } + + if (limitCount > 0) { + int limit = limitCount >= Integer.MAX_VALUE ? Integer.MAX_VALUE : (int) limitCount; + parallelism = Math.min(parallelism, limit); + } + + // parallelism must be positive. + parallelism = Math.max(1, parallelism); + return parallelism; + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/StreamingMonitorFunction.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/StreamingMonitorFunction.java new file mode 100644 index 000000000000..a07613aee59b --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/StreamingMonitorFunction.java @@ -0,0 +1,269 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.util.List; +import java.util.concurrent.ExecutorService; +import org.apache.flink.api.common.functions.RuntimeContext; +import org.apache.flink.api.common.state.ListState; +import org.apache.flink.api.common.state.ListStateDescriptor; +import org.apache.flink.api.common.typeutils.base.LongSerializer; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.runtime.state.FunctionInitializationContext; +import org.apache.flink.runtime.state.FunctionSnapshotContext; +import org.apache.flink.streaming.api.checkpoint.CheckpointedFunction; +import org.apache.flink.streaming.api.functions.source.RichSourceFunction; +import org.apache.flink.streaming.api.operators.StreamingRuntimeContext; +import org.apache.iceberg.Snapshot; +import org.apache.iceberg.Table; +import org.apache.iceberg.exceptions.ValidationException; +import org.apache.iceberg.flink.TableLoader; +import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.util.SnapshotUtil; +import org.apache.iceberg.util.ThreadPools; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * This is the single (non-parallel) monitoring task which takes a {@link FlinkInputFormat}, it is + * responsible for: + * + *

      + *
    1. Monitoring snapshots of the Iceberg table. + *
    2. Creating the {@link FlinkInputSplit splits} corresponding to the incremental files + *
    3. Assigning them to downstream tasks for further processing. + *
    + * + *

    The splits to be read are forwarded to the downstream {@link StreamingReaderOperator} which + * can have parallelism greater than one. + */ +public class StreamingMonitorFunction extends RichSourceFunction + implements CheckpointedFunction { + + private static final Logger LOG = LoggerFactory.getLogger(StreamingMonitorFunction.class); + + private static final long INIT_LAST_SNAPSHOT_ID = -1L; + + private final TableLoader tableLoader; + private final ScanContext scanContext; + + private volatile boolean isRunning = true; + + // The checkpoint thread is not the same thread that running the function for SourceStreamTask + // now. It's necessary to + // mark this as volatile. + private volatile long lastSnapshotId = INIT_LAST_SNAPSHOT_ID; + + private transient SourceContext sourceContext; + private transient Table table; + private transient ListState lastSnapshotIdState; + private transient ExecutorService workerPool; + + public StreamingMonitorFunction(TableLoader tableLoader, ScanContext scanContext) { + Preconditions.checkArgument( + scanContext.snapshotId() == null, "Cannot set snapshot-id option for streaming reader"); + Preconditions.checkArgument( + scanContext.asOfTimestamp() == null, + "Cannot set as-of-timestamp option for streaming reader"); + Preconditions.checkArgument( + scanContext.endSnapshotId() == null, + "Cannot set end-snapshot-id option for streaming reader"); + Preconditions.checkArgument( + scanContext.endTag() == null, "Cannot set end-tag option for streaming reader"); + Preconditions.checkArgument( + scanContext.maxPlanningSnapshotCount() > 0, + "The max-planning-snapshot-count must be greater than zero"); + this.tableLoader = tableLoader; + this.scanContext = scanContext; + } + + @Override + public void open(Configuration parameters) throws Exception { + super.open(parameters); + + final RuntimeContext runtimeContext = getRuntimeContext(); + ValidationException.check( + runtimeContext instanceof StreamingRuntimeContext, + "context should be instance of StreamingRuntimeContext"); + final String operatorID = ((StreamingRuntimeContext) runtimeContext).getOperatorUniqueID(); + this.workerPool = + ThreadPools.newWorkerPool( + "iceberg-worker-pool-" + operatorID, scanContext.planParallelism()); + } + + @Override + public void initializeState(FunctionInitializationContext context) throws Exception { + // Load iceberg table from table loader. + tableLoader.open(); + table = tableLoader.loadTable(); + + // Initialize the flink state for last snapshot id. + lastSnapshotIdState = + context + .getOperatorStateStore() + .getListState(new ListStateDescriptor<>("snapshot-id-state", LongSerializer.INSTANCE)); + + // Restore the last-snapshot-id from flink's state if possible. 
+ if (context.isRestored()) { + LOG.info("Restoring state for the {}.", getClass().getSimpleName()); + lastSnapshotId = lastSnapshotIdState.get().iterator().next(); + } else if (scanContext.startTag() != null || scanContext.startSnapshotId() != null) { + Preconditions.checkArgument( + !(scanContext.startTag() != null && scanContext.startSnapshotId() != null), + "START_SNAPSHOT_ID and START_TAG cannot both be set."); + Preconditions.checkNotNull( + table.currentSnapshot(), "Don't have any available snapshot in table."); + + long startSnapshotId; + if (scanContext.startTag() != null) { + Preconditions.checkArgument( + table.snapshot(scanContext.startTag()) != null, + "Cannot find snapshot with tag %s in table.", + scanContext.startTag()); + startSnapshotId = table.snapshot(scanContext.startTag()).snapshotId(); + } else { + startSnapshotId = scanContext.startSnapshotId(); + } + + long currentSnapshotId = table.currentSnapshot().snapshotId(); + Preconditions.checkState( + SnapshotUtil.isAncestorOf(table, currentSnapshotId, startSnapshotId), + "The option start-snapshot-id %s is not an ancestor of the current snapshot.", + startSnapshotId); + + lastSnapshotId = startSnapshotId; + } + } + + @Override + public void snapshotState(FunctionSnapshotContext context) throws Exception { + lastSnapshotIdState.clear(); + lastSnapshotIdState.add(lastSnapshotId); + } + + @Override + public void run(SourceContext ctx) throws Exception { + this.sourceContext = ctx; + while (isRunning) { + monitorAndForwardSplits(); + Thread.sleep(scanContext.monitorInterval().toMillis()); + } + } + + private long toSnapshotIdInclusive( + long lastConsumedSnapshotId, long currentSnapshotId, int maxPlanningSnapshotCount) { + List snapshotIds = + SnapshotUtil.snapshotIdsBetween(table, lastConsumedSnapshotId, currentSnapshotId); + if (snapshotIds.size() <= maxPlanningSnapshotCount) { + return currentSnapshotId; + } else { + // It uses reverted index since snapshotIdsBetween returns Ids that are ordered by committed + // time descending. + return snapshotIds.get(snapshotIds.size() - maxPlanningSnapshotCount); + } + } + + @VisibleForTesting + void sourceContext(SourceContext ctx) { + this.sourceContext = ctx; + } + + @VisibleForTesting + void monitorAndForwardSplits() { + // Refresh the table to get the latest committed snapshot. + table.refresh(); + + Snapshot snapshot = + scanContext.branch() != null + ? 
table.snapshot(scanContext.branch()) + : table.currentSnapshot(); + if (snapshot != null && snapshot.snapshotId() != lastSnapshotId) { + long snapshotId = snapshot.snapshotId(); + + ScanContext newScanContext; + if (lastSnapshotId == INIT_LAST_SNAPSHOT_ID) { + newScanContext = scanContext.copyWithSnapshotId(snapshotId); + } else { + snapshotId = + toSnapshotIdInclusive( + lastSnapshotId, snapshotId, scanContext.maxPlanningSnapshotCount()); + newScanContext = scanContext.copyWithAppendsBetween(lastSnapshotId, snapshotId); + } + + LOG.debug( + "Start discovering splits from {} (exclusive) to {} (inclusive)", + lastSnapshotId, + snapshotId); + long start = System.currentTimeMillis(); + FlinkInputSplit[] splits = + FlinkSplitPlanner.planInputSplits(table, newScanContext, workerPool); + LOG.debug( + "Discovered {} splits, time elapsed {}ms", + splits.length, + System.currentTimeMillis() - start); + + // only need to hold the checkpoint lock when emitting the splits and updating lastSnapshotId + start = System.currentTimeMillis(); + synchronized (sourceContext.getCheckpointLock()) { + for (FlinkInputSplit split : splits) { + sourceContext.collect(split); + } + + lastSnapshotId = snapshotId; + } + LOG.debug( + "Forwarded {} splits, time elapsed {}ms", + splits.length, + System.currentTimeMillis() - start); + } + } + + @Override + public void cancel() { + // this is to cover the case where cancel() is called before the run() + if (sourceContext != null) { + synchronized (sourceContext.getCheckpointLock()) { + isRunning = false; + } + } else { + isRunning = false; + } + + // Release all the resources here. + if (tableLoader != null) { + try { + tableLoader.close(); + } catch (IOException e) { + throw new UncheckedIOException(e); + } + } + } + + @Override + public void close() { + cancel(); + + if (workerPool != null) { + workerPool.shutdown(); + } + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/StreamingReaderOperator.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/StreamingReaderOperator.java new file mode 100644 index 000000000000..ee6f7b63988d --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/StreamingReaderOperator.java @@ -0,0 +1,246 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source; + +import java.io.IOException; +import java.util.Queue; +import org.apache.flink.api.common.operators.MailboxExecutor; +import org.apache.flink.api.common.state.ListState; +import org.apache.flink.api.common.state.ListStateDescriptor; +import org.apache.flink.runtime.state.JavaSerializer; +import org.apache.flink.runtime.state.StateInitializationContext; +import org.apache.flink.runtime.state.StateSnapshotContext; +import org.apache.flink.streaming.api.functions.source.SourceFunction; +import org.apache.flink.streaming.api.operators.AbstractStreamOperator; +import org.apache.flink.streaming.api.operators.AbstractStreamOperatorFactory; +import org.apache.flink.streaming.api.operators.OneInputStreamOperator; +import org.apache.flink.streaming.api.operators.OneInputStreamOperatorFactory; +import org.apache.flink.streaming.api.operators.StreamOperator; +import org.apache.flink.streaming.api.operators.StreamOperatorParameters; +import org.apache.flink.streaming.api.operators.StreamSourceContexts; +import org.apache.flink.streaming.api.operators.YieldingOperatorFactory; +import org.apache.flink.streaming.api.watermark.Watermark; +import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; +import org.apache.flink.streaming.runtime.tasks.ProcessingTimeService; +import org.apache.flink.table.data.RowData; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * The operator that reads the {@link FlinkInputSplit splits} received from the preceding {@link + * StreamingMonitorFunction}. Contrary to the {@link StreamingMonitorFunction} which has a + * parallelism of 1, this operator can have multiple parallelism. + * + *

    As soon as a split descriptor is received, it is put in a queue, and use {@link + * MailboxExecutor} read the actual data of the split. This architecture allows the separation of + * the reading thread from the one split processing the checkpoint barriers, thus removing any + * potential back-pressure. + */ +public class StreamingReaderOperator extends AbstractStreamOperator + implements OneInputStreamOperator { + + private static final Logger LOG = LoggerFactory.getLogger(StreamingReaderOperator.class); + + // It's the same thread that is running this operator and checkpoint actions. we use this executor + // to schedule only + // one split for future reading, so that a new checkpoint could be triggered without blocking long + // time for exhausting + // all scheduled splits. + private final MailboxExecutor executor; + private FlinkInputFormat format; + + private transient SourceFunction.SourceContext sourceContext; + + private transient ListState inputSplitsState; + private transient Queue splits; + + // Splits are read by the same thread that calls processElement. Each read task is submitted to + // that thread by adding + // them to the executor. This state is used to ensure that only one read task is in that queue at + // a time, so that read + // tasks do not accumulate ahead of checkpoint tasks. When there is a read task in the queue, this + // is set to RUNNING. + // When there are no more files to read, this will be set to IDLE. + private transient SplitState currentSplitState; + + private StreamingReaderOperator( + FlinkInputFormat format, ProcessingTimeService timeService, MailboxExecutor mailboxExecutor) { + this.format = Preconditions.checkNotNull(format, "The InputFormat should not be null."); + this.processingTimeService = timeService; + this.executor = + Preconditions.checkNotNull(mailboxExecutor, "The mailboxExecutor should not be null."); + } + + @Override + public void initializeState(StateInitializationContext context) throws Exception { + super.initializeState(context); + + // TODO Replace Java serialization with Avro approach to keep state compatibility. + // See issue: https://github.com/apache/iceberg/issues/1698 + inputSplitsState = + context + .getOperatorStateStore() + .getListState(new ListStateDescriptor<>("splits", new JavaSerializer<>())); + + // Initialize the current split state to IDLE. + currentSplitState = SplitState.IDLE; + + // Recover splits state from flink state backend if possible. + splits = Lists.newLinkedList(); + if (context.isRestored()) { + int subtaskIdx = getRuntimeContext().getIndexOfThisSubtask(); + LOG.info("Restoring state for the {} (taskIdx: {}).", getClass().getSimpleName(), subtaskIdx); + + for (FlinkInputSplit split : inputSplitsState.get()) { + splits.add(split); + } + } + + this.sourceContext = + StreamSourceContexts.getSourceContext( + getOperatorConfig().getTimeCharacteristic(), + getProcessingTimeService(), + new Object(), // no actual locking needed + output, + getRuntimeContext().getExecutionConfig().getAutoWatermarkInterval(), + -1, + true); + + // Enqueue to process the recovered input splits. 
+ enqueueProcessSplits(); + } + + @Override + public void snapshotState(StateSnapshotContext context) throws Exception { + super.snapshotState(context); + + inputSplitsState.clear(); + inputSplitsState.addAll(Lists.newArrayList(splits)); + } + + @Override + public void processElement(StreamRecord element) { + splits.add(element.getValue()); + enqueueProcessSplits(); + } + + private void enqueueProcessSplits() { + if (currentSplitState == SplitState.IDLE && !splits.isEmpty()) { + currentSplitState = SplitState.RUNNING; + executor.execute(this::processSplits, this.getClass().getSimpleName()); + } + } + + private void processSplits() throws IOException { + FlinkInputSplit split = splits.poll(); + if (split == null) { + currentSplitState = SplitState.IDLE; + return; + } + + format.open(split); + try { + RowData nextElement = null; + while (!format.reachedEnd()) { + nextElement = format.nextRecord(nextElement); + sourceContext.collect(nextElement); + } + } finally { + currentSplitState = SplitState.IDLE; + format.close(); + } + + // Re-schedule to process the next split. + enqueueProcessSplits(); + } + + @Override + public void processWatermark(Watermark mark) { + // we do nothing because we emit our own watermarks if needed. + } + + @Override + public void close() throws Exception { + super.close(); + + if (format != null) { + format.close(); + format.closeInputFormat(); + format = null; + } + + sourceContext = null; + } + + @Override + public void finish() throws Exception { + super.finish(); + output.close(); + if (sourceContext != null) { + sourceContext.emitWatermark(Watermark.MAX_WATERMARK); + sourceContext.close(); + sourceContext = null; + } + } + + static OneInputStreamOperatorFactory factory(FlinkInputFormat format) { + return new OperatorFactory(format); + } + + private enum SplitState { + IDLE, + RUNNING + } + + private static class OperatorFactory extends AbstractStreamOperatorFactory + implements YieldingOperatorFactory, + OneInputStreamOperatorFactory { + + private final FlinkInputFormat format; + + private transient MailboxExecutor mailboxExecutor; + + private OperatorFactory(FlinkInputFormat format) { + this.format = format; + } + + @Override + public void setMailboxExecutor(MailboxExecutor mailboxExecutor) { + this.mailboxExecutor = mailboxExecutor; + } + + @SuppressWarnings("unchecked") + @Override + public > O createStreamOperator( + StreamOperatorParameters parameters) { + StreamingReaderOperator operator = + new StreamingReaderOperator(format, processingTimeService, mailboxExecutor); + operator.setup( + parameters.getContainingTask(), parameters.getStreamConfig(), parameters.getOutput()); + return (O) operator; + } + + @Override + public Class getStreamOperatorClass(ClassLoader classLoader) { + return StreamingReaderOperator.class; + } + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/StreamingStartingStrategy.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/StreamingStartingStrategy.java new file mode 100644 index 000000000000..11707bf82a0f --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/StreamingStartingStrategy.java @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source; + +/** Starting strategy for streaming execution. */ +public enum StreamingStartingStrategy { + /** + * Do a regular table scan then switch to the incremental mode. + * + *

    The incremental mode starts from the current snapshot exclusive. + */ + TABLE_SCAN_THEN_INCREMENTAL, + + /** + * Start incremental mode from the latest snapshot inclusive. + * + *

    If it is an empty map, all future append snapshots should be discovered. + */ + INCREMENTAL_FROM_LATEST_SNAPSHOT, + + /** + * Start incremental mode from the earliest snapshot inclusive. + * + *

    If it is an empty map, all future append snapshots should be discovered. + */ + INCREMENTAL_FROM_EARLIEST_SNAPSHOT, + + /** Start incremental mode from a snapshot with a specific id inclusive. */ + INCREMENTAL_FROM_SNAPSHOT_ID, + + /** + * Start incremental mode from a snapshot with a specific timestamp inclusive. + * + *

    If the timestamp is between two snapshots, it should start from the snapshot after the + * timestamp. + */ + INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/assigner/DefaultSplitAssigner.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/assigner/DefaultSplitAssigner.java new file mode 100644 index 000000000000..e7447d08c985 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/assigner/DefaultSplitAssigner.java @@ -0,0 +1,119 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.assigner; + +import java.util.ArrayDeque; +import java.util.Collection; +import java.util.PriorityQueue; +import java.util.Queue; +import java.util.concurrent.CompletableFuture; +import java.util.stream.Collectors; +import javax.annotation.Nullable; +import org.apache.flink.annotation.Internal; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; +import org.apache.iceberg.flink.source.split.IcebergSourceSplitState; +import org.apache.iceberg.flink.source.split.IcebergSourceSplitStatus; +import org.apache.iceberg.flink.source.split.SerializableComparator; + +/** + * Since all methods are called in the source coordinator thread by enumerator, there is no need for + * locking. + */ +@Internal +public class DefaultSplitAssigner implements SplitAssigner { + + private final Queue pendingSplits; + private CompletableFuture availableFuture; + + public DefaultSplitAssigner(SerializableComparator comparator) { + this.pendingSplits = comparator == null ? new ArrayDeque<>() : new PriorityQueue<>(comparator); + } + + public DefaultSplitAssigner( + SerializableComparator comparator, + Collection assignerState) { + this(comparator); + // Because default assigner only tracks unassigned splits, + // there is no need to filter splits based on status (unassigned) here. 
+ assignerState.forEach(splitState -> pendingSplits.add(splitState.split())); + } + + @Override + public synchronized GetSplitResult getNext(@Nullable String hostname) { + if (pendingSplits.isEmpty()) { + return GetSplitResult.unavailable(); + } else { + IcebergSourceSplit split = pendingSplits.poll(); + return GetSplitResult.forSplit(split); + } + } + + @Override + public void onDiscoveredSplits(Collection splits) { + addSplits(splits); + } + + @Override + public void onUnassignedSplits(Collection splits) { + addSplits(splits); + } + + private synchronized void addSplits(Collection splits) { + if (!splits.isEmpty()) { + pendingSplits.addAll(splits); + // only complete pending future if new splits are discovered + completeAvailableFuturesIfNeeded(); + } + } + + /** Simple assigner only tracks unassigned splits */ + @Override + public synchronized Collection state() { + return pendingSplits.stream() + .map(split -> new IcebergSourceSplitState(split, IcebergSourceSplitStatus.UNASSIGNED)) + .collect(Collectors.toList()); + } + + @Override + public synchronized CompletableFuture isAvailable() { + if (availableFuture == null) { + availableFuture = new CompletableFuture<>(); + } + return availableFuture; + } + + @Override + public synchronized int pendingSplitCount() { + return pendingSplits.size(); + } + + @Override + public long pendingRecords() { + return pendingSplits.stream() + .map(split -> split.task().estimatedRowsCount()) + .reduce(0L, Long::sum); + } + + private synchronized void completeAvailableFuturesIfNeeded() { + if (availableFuture != null && !pendingSplits.isEmpty()) { + availableFuture.complete(null); + } + availableFuture = null; + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/assigner/GetSplitResult.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/assigner/GetSplitResult.java new file mode 100644 index 000000000000..72deaeb890f3 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/assigner/GetSplitResult.java @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.assigner; + +import org.apache.flink.annotation.Internal; +import org.apache.flink.util.Preconditions; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; + +@Internal +public class GetSplitResult { + + public enum Status { + AVAILABLE, + + /** + * There are pending splits. But they can't be assigned due to constraints (like event time + * alignment) + */ + CONSTRAINED, + + /** Assigner doesn't have pending splits. 
*/ + UNAVAILABLE + } + + private final Status status; + private final IcebergSourceSplit split; + + private GetSplitResult(Status status) { + this.status = status; + this.split = null; + } + + private GetSplitResult(IcebergSourceSplit split) { + Preconditions.checkNotNull(split, "Split cannot be null"); + this.status = Status.AVAILABLE; + this.split = split; + } + + public Status status() { + return status; + } + + public IcebergSourceSplit split() { + return split; + } + + private static final GetSplitResult UNAVAILABLE = new GetSplitResult(Status.UNAVAILABLE); + private static final GetSplitResult CONSTRAINED = new GetSplitResult(Status.CONSTRAINED); + + public static GetSplitResult unavailable() { + return UNAVAILABLE; + } + + public static GetSplitResult constrained() { + return CONSTRAINED; + } + + public static GetSplitResult forSplit(IcebergSourceSplit split) { + return new GetSplitResult(split); + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/assigner/OrderedSplitAssignerFactory.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/assigner/OrderedSplitAssignerFactory.java new file mode 100644 index 000000000000..e58478897aef --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/assigner/OrderedSplitAssignerFactory.java @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.assigner; + +import java.util.Collection; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; +import org.apache.iceberg.flink.source.split.IcebergSourceSplitState; +import org.apache.iceberg.flink.source.split.SerializableComparator; + +/** + * Create default assigner with a comparator that hands out splits where the order of the splits + * will be defined by the {@link SerializableComparator}. 
+ */ +public class OrderedSplitAssignerFactory implements SplitAssignerFactory { + private final SerializableComparator comparator; + + public OrderedSplitAssignerFactory(SerializableComparator comparator) { + this.comparator = comparator; + } + + @Override + public SplitAssigner createAssigner() { + return new DefaultSplitAssigner(comparator); + } + + @Override + public SplitAssigner createAssigner(Collection assignerState) { + return new DefaultSplitAssigner(comparator, assignerState); + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SimpleSplitAssignerFactory.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SimpleSplitAssignerFactory.java new file mode 100644 index 000000000000..a2e2ff364d46 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SimpleSplitAssignerFactory.java @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.assigner; + +import java.util.Collection; +import org.apache.iceberg.flink.source.split.IcebergSourceSplitState; + +/** Create simple assigner that hands out splits without any guarantee in order or locality. */ +public class SimpleSplitAssignerFactory implements SplitAssignerFactory { + public SimpleSplitAssignerFactory() {} + + @Override + public SplitAssigner createAssigner() { + return new DefaultSplitAssigner(null); + } + + @Override + public SplitAssigner createAssigner(Collection assignerState) { + return new DefaultSplitAssigner(null, assignerState); + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssigner.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssigner.java new file mode 100644 index 000000000000..dae7c8cca70c --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssigner.java @@ -0,0 +1,124 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source.assigner; + +import java.io.Closeable; +import java.util.Collection; +import java.util.concurrent.CompletableFuture; +import javax.annotation.Nullable; +import org.apache.flink.api.connector.source.SplitEnumeratorContext; +import org.apache.iceberg.flink.source.ScanContext; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; +import org.apache.iceberg.flink.source.split.IcebergSourceSplitState; + +/** + * SplitAssigner interface is extracted out as a separate component so that we can plug in different + * split assignment strategy for different requirements. E.g. + * + *

      + *
    • Simple assigner with no ordering guarantee or locality aware optimization. + *
    • Locality aware assigner that prefer splits that are local. + *
    • Snapshot aware assigner that assign splits based on the order they are committed. + *
    • Event time alignment assigner that assign splits satisfying certain time ordering within a + * single source or across sources. + *
    + * + *

    Assigner implementation needs to be thread safe. Enumerator call the assigner APIs mostly from + * the coordinator thread. But enumerator may call the {@link SplitAssigner#pendingSplitCount()} + * from the I/O threads. + */ +public interface SplitAssigner extends Closeable { + + /** + * Some assigners may need to start background threads or perform other activity such as + * registering as listeners to updates from other event sources e.g., watermark tracker. + */ + default void start() {} + + /** + * Some assigners may need to perform certain actions when their corresponding enumerators are + * closed + */ + @Override + default void close() {} + + /** + * Request a new split from the assigner when enumerator trying to assign splits to awaiting + * readers. + * + *

    If enumerator wasn't able to assign the split (e.g., reader disconnected), enumerator should + * call {@link SplitAssigner#onUnassignedSplits} to return the split. + */ + GetSplitResult getNext(@Nullable String hostname); + + /** Add new splits discovered by enumerator */ + void onDiscoveredSplits(Collection splits); + + /** Forward addSplitsBack event (for failed reader) to assigner */ + void onUnassignedSplits(Collection splits); + + /** + * Some assigner (like event time alignment) may rack in-progress splits to advance watermark upon + * completed splits + */ + default void onCompletedSplits(Collection completedSplitIds) {} + + /** + * Get assigner state for checkpointing. This is a super-set API that works for all currently + * imagined assigners. + */ + Collection state(); + + /** + * Enumerator can get a notification via CompletableFuture when the assigner has more splits + * available later. Enumerator should schedule assignment in the thenAccept action of the future. + * + *

    Assigner will return the same future if this method is called again before the previous + * future is completed. + * + *

    The future can be completed from other thread, e.g. the coordinator thread from another + * thread for event time alignment. + * + *

    If enumerator need to trigger action upon the future completion, it may want to run it in + * the coordinator thread using {@link SplitEnumeratorContext#runInCoordinatorThread(Runnable)}. + */ + CompletableFuture isAvailable(); + + /** + * Return the number of pending splits that haven't been assigned yet. + * + *

    The enumerator can poll this API to publish a metric on the number of pending splits. + * + *

    The enumerator can also use this information to throttle split discovery for streaming read. + * If there are already many pending splits tracked by the assigner, it is undesirable to discover + * more splits and track them in the assigner. That will increase the memory footprint and + * enumerator checkpoint size. + * + *

    Throttling works better together with {@link ScanContext#maxPlanningSnapshotCount()}. + * Otherwise, the next split discovery after throttling will just discover all non-enumerated + * snapshots and splits, which defeats the purpose of throttling. + */ + int pendingSplitCount(); + + /** + * Return the number of pending records, which can act as a measure of the source lag. This value + * could be an estimation if the exact number of records cannot be accurately computed. + */ + long pendingRecords(); +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssignerFactory.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssignerFactory.java new file mode 100644 index 000000000000..6e02a556ffcd --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssignerFactory.java @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.assigner; + +import java.io.Serializable; +import java.util.Collection; +import org.apache.iceberg.flink.source.split.IcebergSourceSplitState; + +public interface SplitAssignerFactory extends Serializable { + + SplitAssigner createAssigner(); + + SplitAssigner createAssigner(Collection assignerState); +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssignerType.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssignerType.java new file mode 100644 index 000000000000..03ba67a554f9 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssignerType.java @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source.assigner; + +import org.apache.flink.annotation.Internal; + +@Internal +public enum SplitAssignerType { + SIMPLE { + @Override + public SplitAssignerFactory factory() { + return new SimpleSplitAssignerFactory(); + } + }; + + public abstract SplitAssignerFactory factory(); +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/AbstractIcebergEnumerator.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/AbstractIcebergEnumerator.java new file mode 100644 index 000000000000..280a126a46ce --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/AbstractIcebergEnumerator.java @@ -0,0 +1,184 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.enumerator; + +import java.io.IOException; +import java.util.Iterator; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.atomic.AtomicReference; +import javax.annotation.Nullable; +import org.apache.flink.api.connector.source.SourceEvent; +import org.apache.flink.api.connector.source.SplitEnumerator; +import org.apache.flink.api.connector.source.SplitEnumeratorContext; +import org.apache.flink.api.connector.source.SupportsHandleExecutionAttemptSourceEvent; +import org.apache.iceberg.flink.source.assigner.GetSplitResult; +import org.apache.iceberg.flink.source.assigner.SplitAssigner; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; +import org.apache.iceberg.flink.source.split.SplitRequestEvent; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +abstract class AbstractIcebergEnumerator + implements SplitEnumerator, + SupportsHandleExecutionAttemptSourceEvent { + private static final Logger LOG = LoggerFactory.getLogger(AbstractIcebergEnumerator.class); + + private final SplitEnumeratorContext enumeratorContext; + private final SplitAssigner assigner; + private final Map readersAwaitingSplit; + private final AtomicReference> availableFuture; + + AbstractIcebergEnumerator( + SplitEnumeratorContext enumeratorContext, SplitAssigner assigner) { + this.enumeratorContext = enumeratorContext; + this.assigner = assigner; + this.readersAwaitingSplit = new LinkedHashMap<>(); + this.availableFuture = new AtomicReference<>(); + this.enumeratorContext + .metricGroup() + // This number may not capture the entire backlog due to split discovery throttling to avoid + // excessive memory footprint. Some pending splits may not have been discovered yet. 
+ .setUnassignedSplitsGauge(() -> Long.valueOf(assigner.pendingSplitCount())); + this.enumeratorContext.metricGroup().gauge("pendingRecords", assigner::pendingRecords); + } + + @Override + public void start() { + assigner.start(); + } + + @Override + public void close() throws IOException { + assigner.close(); + } + + @Override + public void handleSplitRequest(int subtaskId, @Nullable String requesterHostname) { + // Iceberg source uses custom split request event to piggyback finished split ids. + throw new UnsupportedOperationException( + String.format( + "Received invalid default split request event " + + "from subtask %d as Iceberg source uses custom split request event", + subtaskId)); + } + + @Override + public void handleSourceEvent(int subtaskId, SourceEvent sourceEvent) { + if (sourceEvent instanceof SplitRequestEvent) { + SplitRequestEvent splitRequestEvent = (SplitRequestEvent) sourceEvent; + LOG.info("Received request split event from subtask {}", subtaskId); + assigner.onCompletedSplits(splitRequestEvent.finishedSplitIds()); + readersAwaitingSplit.put(subtaskId, splitRequestEvent.requesterHostname()); + assignSplits(); + } else { + throw new IllegalArgumentException( + String.format( + "Received unknown event from subtask %d: %s", + subtaskId, sourceEvent.getClass().getCanonicalName())); + } + } + + // Flink's SourceCoordinator already keeps track of subTask to splits mapping. + // It already takes care of re-assigning splits to speculated attempts as well. + @Override + public void handleSourceEvent(int subTaskId, int attemptNumber, SourceEvent sourceEvent) { + handleSourceEvent(subTaskId, sourceEvent); + } + + @Override + public void addSplitsBack(List splits, int subtaskId) { + LOG.info("Add {} splits back to the pool for failed subtask {}", splits.size(), subtaskId); + assigner.onUnassignedSplits(splits); + assignSplits(); + } + + @Override + public void addReader(int subtaskId) { + LOG.info("Added reader: {}", subtaskId); + } + + private void assignSplits() { + LOG.info("Assigning splits for {} awaiting readers", readersAwaitingSplit.size()); + Iterator> awaitingReader = + readersAwaitingSplit.entrySet().iterator(); + while (awaitingReader.hasNext()) { + Map.Entry nextAwaiting = awaitingReader.next(); + // if the reader that requested another split has failed in the meantime, remove + // it from the list of waiting readers + if (!enumeratorContext.registeredReaders().containsKey(nextAwaiting.getKey())) { + awaitingReader.remove(); + continue; + } + + int awaitingSubtask = nextAwaiting.getKey(); + String hostname = nextAwaiting.getValue(); + GetSplitResult getResult = assigner.getNext(hostname); + if (getResult.status() == GetSplitResult.Status.AVAILABLE) { + LOG.info("Assign split to subtask {}: {}", awaitingSubtask, getResult.split()); + enumeratorContext.assignSplit(getResult.split(), awaitingSubtask); + awaitingReader.remove(); + } else if (getResult.status() == GetSplitResult.Status.CONSTRAINED) { + getAvailableFutureIfNeeded(); + break; + } else if (getResult.status() == GetSplitResult.Status.UNAVAILABLE) { + if (shouldWaitForMoreSplits()) { + getAvailableFutureIfNeeded(); + break; + } else { + LOG.info("No more splits available for subtask {}", awaitingSubtask); + enumeratorContext.signalNoMoreSplits(awaitingSubtask); + awaitingReader.remove(); + } + } else { + throw new IllegalArgumentException("Unsupported status: " + getResult.status()); + } + } + } + + /** return true if enumerator should wait for splits like in the continuous enumerator case */ + protected 
abstract boolean shouldWaitForMoreSplits(); + + private synchronized void getAvailableFutureIfNeeded() { + if (availableFuture.get() != null) { + return; + } + + CompletableFuture future = + assigner + .isAvailable() + .thenAccept( + ignore -> + // Must run assignSplits in coordinator thread + // because the future may be completed from other threads. + // E.g., in event time alignment assigner, + // watermark advancement from another source may + // cause the available future to be completed + enumeratorContext.runInCoordinatorThread( + () -> { + LOG.debug("Executing callback of assignSplits"); + availableFuture.set(null); + assignSplits(); + })); + availableFuture.set(future); + LOG.debug("Registered callback for future available splits"); + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousEnumerationResult.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousEnumerationResult.java new file mode 100644 index 000000000000..41863ffee60b --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousEnumerationResult.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.enumerator; + +import java.util.Collection; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; + +class ContinuousEnumerationResult { + private final Collection splits; + private final IcebergEnumeratorPosition fromPosition; + private final IcebergEnumeratorPosition toPosition; + + /** + * @param splits should never be null. But it can be an empty collection + * @param fromPosition can be null + * @param toPosition should never be null. 
But it can have null snapshotId and snapshotTimestampMs + */ + ContinuousEnumerationResult( + Collection splits, + IcebergEnumeratorPosition fromPosition, + IcebergEnumeratorPosition toPosition) { + Preconditions.checkArgument(splits != null, "Invalid to splits collection: null"); + Preconditions.checkArgument(toPosition != null, "Invalid end position: null"); + this.splits = splits; + this.fromPosition = fromPosition; + this.toPosition = toPosition; + } + + public Collection splits() { + return splits; + } + + public IcebergEnumeratorPosition fromPosition() { + return fromPosition; + } + + public IcebergEnumeratorPosition toPosition() { + return toPosition; + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousIcebergEnumerator.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousIcebergEnumerator.java new file mode 100644 index 000000000000..c7021b9c6847 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousIcebergEnumerator.java @@ -0,0 +1,187 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.enumerator; + +import java.io.IOException; +import java.util.Collections; +import java.util.Objects; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicReference; +import javax.annotation.Nullable; +import org.apache.flink.annotation.Internal; +import org.apache.flink.api.connector.source.SplitEnumeratorContext; +import org.apache.iceberg.flink.source.ScanContext; +import org.apache.iceberg.flink.source.assigner.SplitAssigner; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; +import org.apache.iceberg.flink.util.ElapsedTimeGauge; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +@Internal +public class ContinuousIcebergEnumerator extends AbstractIcebergEnumerator { + + private static final Logger LOG = LoggerFactory.getLogger(ContinuousIcebergEnumerator.class); + /** + * This is hardcoded, as {@link ScanContext#maxPlanningSnapshotCount()} could be the knob to + * control the total number of snapshots worth of splits tracked by assigner. + */ + private static final int ENUMERATION_SPLIT_COUNT_HISTORY_SIZE = 3; + + private final SplitEnumeratorContext enumeratorContext; + private final SplitAssigner assigner; + private final ScanContext scanContext; + private final ContinuousSplitPlanner splitPlanner; + + /** + * snapshotId for the last enumerated snapshot. next incremental enumeration should be based off + * this as the starting position. + */ + private final AtomicReference enumeratorPosition; + + /** Track enumeration result history for split discovery throttling. 
*/ + private final EnumerationHistory enumerationHistory; + + /** Count the consecutive failures and throw exception if the max allowed failres are reached */ + private transient int consecutiveFailures = 0; + + private final ElapsedTimeGauge elapsedSecondsSinceLastSplitDiscovery; + + public ContinuousIcebergEnumerator( + SplitEnumeratorContext enumeratorContext, + SplitAssigner assigner, + ScanContext scanContext, + ContinuousSplitPlanner splitPlanner, + @Nullable IcebergEnumeratorState enumState) { + super(enumeratorContext, assigner); + + this.enumeratorContext = enumeratorContext; + this.assigner = assigner; + this.scanContext = scanContext; + this.splitPlanner = splitPlanner; + this.enumeratorPosition = new AtomicReference<>(); + this.enumerationHistory = new EnumerationHistory(ENUMERATION_SPLIT_COUNT_HISTORY_SIZE); + this.elapsedSecondsSinceLastSplitDiscovery = new ElapsedTimeGauge(TimeUnit.SECONDS); + this.enumeratorContext + .metricGroup() + .gauge("elapsedSecondsSinceLastSplitDiscovery", elapsedSecondsSinceLastSplitDiscovery); + + if (enumState != null) { + this.enumeratorPosition.set(enumState.lastEnumeratedPosition()); + this.enumerationHistory.restore(enumState.enumerationSplitCountHistory()); + } + } + + @Override + public void start() { + super.start(); + enumeratorContext.callAsync( + this::discoverSplits, + this::processDiscoveredSplits, + 0L, + scanContext.monitorInterval().toMillis()); + } + + @Override + public void close() throws IOException { + splitPlanner.close(); + super.close(); + } + + @Override + protected boolean shouldWaitForMoreSplits() { + return true; + } + + @Override + public IcebergEnumeratorState snapshotState(long checkpointId) { + return new IcebergEnumeratorState( + enumeratorPosition.get(), assigner.state(), enumerationHistory.snapshot()); + } + + /** This method is executed in an IO thread pool. */ + private ContinuousEnumerationResult discoverSplits() { + int pendingSplitCountFromAssigner = assigner.pendingSplitCount(); + if (enumerationHistory.shouldPauseSplitDiscovery(pendingSplitCountFromAssigner)) { + // If the assigner already has many pending splits, it is better to pause split discovery. + // Otherwise, eagerly discovering more splits will just increase assigner memory footprint + // and enumerator checkpoint state size. + LOG.info( + "Pause split discovery as the assigner already has too many pending splits: {}", + pendingSplitCountFromAssigner); + return new ContinuousEnumerationResult( + Collections.emptyList(), enumeratorPosition.get(), enumeratorPosition.get()); + } else { + return splitPlanner.planSplits(enumeratorPosition.get()); + } + } + + /** This method is executed in a single coordinator thread. */ + private void processDiscoveredSplits(ContinuousEnumerationResult result, Throwable error) { + if (error == null) { + consecutiveFailures = 0; + if (!Objects.equals(result.fromPosition(), enumeratorPosition.get())) { + // Multiple discoverSplits() may be triggered with the same starting snapshot to the I/O + // thread pool. E.g., the splitDiscoveryInterval is very short (like 10 ms in some unit + // tests) or the thread pool is busy and multiple discovery actions are executed + // concurrently. Discovery result should only be accepted if the starting position + // matches the enumerator position (like compare-and-swap). 
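+ // The stale result is simply dropped; the next scheduled discovery re-plans from the current enumerator position.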
+ LOG.info( + "Skip {} discovered splits because the scan starting position doesn't match " + + "the current enumerator position: enumerator position = {}, scan starting position = {}", + result.splits().size(), + enumeratorPosition.get(), + result.fromPosition()); + } else { + elapsedSecondsSinceLastSplitDiscovery.refreshLastRecordedTime(); + // Sometimes, enumeration may yield no splits for a few reasons. + // - upstream paused or delayed streaming writes to the Iceberg table. + // - enumeration frequency is higher than the upstream write frequency. + if (!result.splits().isEmpty()) { + assigner.onDiscoveredSplits(result.splits()); + // EnumerationHistory makes throttling decision on split discovery + // based on the total number of splits discovered in the last a few cycles. + // Only update enumeration history when there are some discovered splits. + enumerationHistory.add(result.splits().size()); + LOG.info( + "Added {} splits discovered between ({}, {}] to the assigner", + result.splits().size(), + result.fromPosition(), + result.toPosition()); + } else { + LOG.info( + "No new splits discovered between ({}, {}]", + result.fromPosition(), + result.toPosition()); + } + // update the enumerator position even if there is no split discovered + // or the toPosition is empty (e.g. for empty table). + enumeratorPosition.set(result.toPosition()); + LOG.info("Update enumerator position to {}", result.toPosition()); + } + } else { + consecutiveFailures++; + if (scanContext.maxAllowedPlanningFailures() < 0 + || consecutiveFailures <= scanContext.maxAllowedPlanningFailures()) { + LOG.error("Failed to discover new splits", error); + } else { + throw new RuntimeException("Failed to discover new splits", error); + } + } + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousSplitPlanner.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousSplitPlanner.java new file mode 100644 index 000000000000..2a1325178873 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousSplitPlanner.java @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source.enumerator; + +import java.io.Closeable; +import org.apache.flink.annotation.Internal; + +/** This interface is introduced so that we can plug in different split planner for unit test */ +@Internal +public interface ContinuousSplitPlanner extends Closeable { + + /** Discover the files appended between {@code lastPosition} and current table snapshot */ + ContinuousEnumerationResult planSplits(IcebergEnumeratorPosition lastPosition); +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousSplitPlannerImpl.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousSplitPlannerImpl.java new file mode 100644 index 000000000000..fef4ec45ed8a --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousSplitPlannerImpl.java @@ -0,0 +1,240 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.enumerator; + +import java.io.IOException; +import java.util.Collections; +import java.util.List; +import java.util.Optional; +import java.util.concurrent.ExecutorService; +import org.apache.flink.annotation.Internal; +import org.apache.flink.annotation.VisibleForTesting; +import org.apache.flink.util.Preconditions; +import org.apache.iceberg.Snapshot; +import org.apache.iceberg.Table; +import org.apache.iceberg.flink.TableLoader; +import org.apache.iceberg.flink.source.FlinkSplitPlanner; +import org.apache.iceberg.flink.source.ScanContext; +import org.apache.iceberg.flink.source.StreamingStartingStrategy; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.util.SnapshotUtil; +import org.apache.iceberg.util.ThreadPools; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +@Internal +public class ContinuousSplitPlannerImpl implements ContinuousSplitPlanner { + private static final Logger LOG = LoggerFactory.getLogger(ContinuousSplitPlannerImpl.class); + + private final Table table; + private final ScanContext scanContext; + private final boolean isSharedPool; + private final ExecutorService workerPool; + private final TableLoader tableLoader; + + /** + * @param tableLoader A cloned tableLoader. + * @param threadName thread name prefix for worker pool to run the split planning. If null, a + * shared worker pool will be used. 
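+ * @param scanContext scan options used for split planning (e.g. starting strategy, branch, planning parallelism)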
+ */ + public ContinuousSplitPlannerImpl( + TableLoader tableLoader, ScanContext scanContext, String threadName) { + this.tableLoader = tableLoader.clone(); + this.tableLoader.open(); + this.table = this.tableLoader.loadTable(); + this.scanContext = scanContext; + this.isSharedPool = threadName == null; + this.workerPool = + isSharedPool + ? ThreadPools.getWorkerPool() + : ThreadPools.newWorkerPool( + "iceberg-plan-worker-pool-" + threadName, scanContext.planParallelism()); + } + + @Override + public void close() throws IOException { + if (!isSharedPool) { + workerPool.shutdown(); + } + tableLoader.close(); + } + + @Override + public ContinuousEnumerationResult planSplits(IcebergEnumeratorPosition lastPosition) { + table.refresh(); + if (lastPosition != null) { + return discoverIncrementalSplits(lastPosition); + } else { + return discoverInitialSplits(); + } + } + + private Snapshot toSnapshotInclusive( + Long lastConsumedSnapshotId, Snapshot currentSnapshot, int maxPlanningSnapshotCount) { + // snapshots are in reverse order (latest snapshot first) + List snapshots = + Lists.newArrayList( + SnapshotUtil.ancestorsBetween( + table, currentSnapshot.snapshotId(), lastConsumedSnapshotId)); + if (snapshots.size() <= maxPlanningSnapshotCount) { + return currentSnapshot; + } else { + // Because snapshots are in reverse order of commit history, this index returns + // the max allowed number of snapshots from the lastConsumedSnapshotId. + return snapshots.get(snapshots.size() - maxPlanningSnapshotCount); + } + } + + private ContinuousEnumerationResult discoverIncrementalSplits( + IcebergEnumeratorPosition lastPosition) { + Snapshot currentSnapshot = + scanContext.branch() != null + ? table.snapshot(scanContext.branch()) + : table.currentSnapshot(); + + if (currentSnapshot == null) { + // empty table + Preconditions.checkArgument( + lastPosition.snapshotId() == null, + "Invalid last enumerated position for an empty table: not null"); + LOG.info("Skip incremental scan because table is empty"); + return new ContinuousEnumerationResult(Collections.emptyList(), lastPosition, lastPosition); + } else if (lastPosition.snapshotId() != null + && currentSnapshot.snapshotId() == lastPosition.snapshotId()) { + LOG.info("Current table snapshot is already enumerated: {}", currentSnapshot.snapshotId()); + return new ContinuousEnumerationResult(Collections.emptyList(), lastPosition, lastPosition); + } else { + Long lastConsumedSnapshotId = lastPosition.snapshotId(); + Snapshot toSnapshotInclusive = + toSnapshotInclusive( + lastConsumedSnapshotId, currentSnapshot, scanContext.maxPlanningSnapshotCount()); + IcebergEnumeratorPosition newPosition = + IcebergEnumeratorPosition.of( + toSnapshotInclusive.snapshotId(), toSnapshotInclusive.timestampMillis()); + ScanContext incrementalScan = + scanContext.copyWithAppendsBetween( + lastPosition.snapshotId(), toSnapshotInclusive.snapshotId()); + List splits = + FlinkSplitPlanner.planIcebergSourceSplits(table, incrementalScan, workerPool); + LOG.info( + "Discovered {} splits from incremental scan: " + + "from snapshot (exclusive) is {}, to snapshot (inclusive) is {}", + splits.size(), + lastPosition, + newPosition); + return new ContinuousEnumerationResult(splits, lastPosition, newPosition); + } + } + + /** + * Discovery initial set of splits based on {@link StreamingStartingStrategy}. + *
  • {@link ContinuousEnumerationResult#splits()} should contain initial splits discovered from + * table scan for {@link StreamingStartingStrategy#TABLE_SCAN_THEN_INCREMENTAL}. For all other + * strategies, splits collection should be empty. + *
  • {@link ContinuousEnumerationResult#toPosition()} points to the starting position for the + * next incremental split discovery with exclusive behavior. Meaning files committed by the + * snapshot from the position in {@code ContinuousEnumerationResult} won't be included in the + * next incremental scan. + */ + private ContinuousEnumerationResult discoverInitialSplits() { + Optional startSnapshotOptional = startSnapshot(table, scanContext); + if (!startSnapshotOptional.isPresent()) { + return new ContinuousEnumerationResult( + Collections.emptyList(), null, IcebergEnumeratorPosition.empty()); + } + + Snapshot startSnapshot = startSnapshotOptional.get(); + LOG.info( + "Get starting snapshot id {} based on strategy {}", + startSnapshot.snapshotId(), + scanContext.streamingStartingStrategy()); + List splits; + IcebergEnumeratorPosition toPosition; + if (scanContext.streamingStartingStrategy() + == StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) { + // do a batch table scan first + splits = + FlinkSplitPlanner.planIcebergSourceSplits( + table, scanContext.copyWithSnapshotId(startSnapshot.snapshotId()), workerPool); + LOG.info( + "Discovered {} splits from initial batch table scan with snapshot Id {}", + splits.size(), + startSnapshot.snapshotId()); + // For TABLE_SCAN_THEN_INCREMENTAL, incremental mode starts exclusive from the startSnapshot + toPosition = + IcebergEnumeratorPosition.of(startSnapshot.snapshotId(), startSnapshot.timestampMillis()); + } else { + // For all other modes, starting snapshot should be consumed inclusively. + // Use parentId to achieve the inclusive behavior. It is fine if parentId is null. + splits = Collections.emptyList(); + Long parentSnapshotId = startSnapshot.parentId(); + if (parentSnapshotId != null) { + Snapshot parentSnapshot = table.snapshot(parentSnapshotId); + Long parentSnapshotTimestampMs = + parentSnapshot != null ? parentSnapshot.timestampMillis() : null; + toPosition = IcebergEnumeratorPosition.of(parentSnapshotId, parentSnapshotTimestampMs); + } else { + toPosition = IcebergEnumeratorPosition.empty(); + } + + LOG.info( + "Start incremental scan with start snapshot (inclusive): id = {}, timestamp = {}", + startSnapshot.snapshotId(), + startSnapshot.timestampMillis()); + } + + return new ContinuousEnumerationResult(splits, null, toPosition); + } + + /** + * Calculate the starting snapshot based on the {@link StreamingStartingStrategy} defined in + * {@code ScanContext}. + * + *
    If the {@link StreamingStartingStrategy} is not {@link + * StreamingStartingStrategy#TABLE_SCAN_THEN_INCREMENTAL}, the start snapshot should be consumed + * inclusively. + */ + @VisibleForTesting + static Optional startSnapshot(Table table, ScanContext scanContext) { + switch (scanContext.streamingStartingStrategy()) { + case TABLE_SCAN_THEN_INCREMENTAL: + case INCREMENTAL_FROM_LATEST_SNAPSHOT: + return Optional.ofNullable(table.currentSnapshot()); + case INCREMENTAL_FROM_EARLIEST_SNAPSHOT: + return Optional.ofNullable(SnapshotUtil.oldestAncestor(table)); + case INCREMENTAL_FROM_SNAPSHOT_ID: + Snapshot matchedSnapshotById = table.snapshot(scanContext.startSnapshotId()); + Preconditions.checkArgument( + matchedSnapshotById != null, + "Start snapshot id not found in history: " + scanContext.startSnapshotId()); + return Optional.of(matchedSnapshotById); + case INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP: + Snapshot matchedSnapshotByTimestamp = + SnapshotUtil.oldestAncestorAfter(table, scanContext.startSnapshotTimestamp()); + Preconditions.checkArgument( + matchedSnapshotByTimestamp != null, + "Cannot find a snapshot after: " + scanContext.startSnapshotTimestamp()); + return Optional.of(matchedSnapshotByTimestamp); + default: + throw new IllegalArgumentException( + "Unknown starting strategy: " + scanContext.streamingStartingStrategy()); + } + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/EnumerationHistory.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/EnumerationHistory.java new file mode 100644 index 000000000000..ec56a9ecdac1 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/EnumerationHistory.java @@ -0,0 +1,100 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.enumerator; + +import java.util.Arrays; +import javax.annotation.concurrent.ThreadSafe; +import org.apache.flink.annotation.VisibleForTesting; + +/** + * This enumeration history is used for split discovery throttling. It tracks the discovered split + * count per every non-empty enumeration. 
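+ * For example (illustrative): with a history size of 3 and the last three enumerations discovering
+ * 10, 20, and 30 splits, discovery is paused while the assigner still holds 60 or more pending splits.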
+ */ +@ThreadSafe +class EnumerationHistory { + + private final int[] history; + // int (2B) should be enough without overflow for enumeration history + private int count; + + EnumerationHistory(int maxHistorySize) { + this.history = new int[maxHistorySize]; + } + + synchronized void restore(int[] restoredHistory) { + int startingOffset = 0; + int restoreSize = restoredHistory.length; + + if (restoredHistory.length > history.length) { + // keep the newest history + startingOffset = restoredHistory.length - history.length; + // only restore the latest history up to maxHistorySize + restoreSize = history.length; + } + + System.arraycopy(restoredHistory, startingOffset, history, 0, restoreSize); + count = restoreSize; + } + + synchronized int[] snapshot() { + int len = history.length; + if (count > len) { + int[] copy = new int[len]; + // this is like a circular buffer + int indexForOldest = count % len; + System.arraycopy(history, indexForOldest, copy, 0, len - indexForOldest); + System.arraycopy(history, 0, copy, len - indexForOldest, indexForOldest); + return copy; + } else { + return Arrays.copyOfRange(history, 0, count); + } + } + + /** Add the split count from the last enumeration result. */ + synchronized void add(int splitCount) { + int pos = count % history.length; + history[pos] = splitCount; + count += 1; + } + + @VisibleForTesting + synchronized boolean hasFullHistory() { + return count >= history.length; + } + + /** + * Checks whether split discovery should be paused. + * + * @return true if split discovery should pause because assigner has too many splits already. + */ + synchronized boolean shouldPauseSplitDiscovery(int pendingSplitCountFromAssigner) { + if (count < history.length) { + // only check throttling when full history is obtained. + return false; + } else { + // if ScanContext#maxPlanningSnapshotCount() is 10, each split enumeration can + // discovery splits up to 10 snapshots. if maxHistorySize is 3, the max number of + // splits tracked in assigner shouldn't be more than 10 * (3 + 1) snapshots + // worth of splits. +1 because there could be another enumeration when the + // pending splits fall just below the 10 * 3. + int totalSplitCountFromRecentDiscovery = Arrays.stream(history).reduce(0, Integer::sum); + return pendingSplitCountFromAssigner >= totalSplitCountFromRecentDiscovery; + } + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorPosition.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorPosition.java new file mode 100644 index 000000000000..96aba296f8cf --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorPosition.java @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.enumerator; + +import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; +import org.apache.iceberg.relocated.com.google.common.base.Objects; + +class IcebergEnumeratorPosition { + private final Long snapshotId; + // Track snapshot timestamp mainly for info logging + private final Long snapshotTimestampMs; + + static IcebergEnumeratorPosition empty() { + return new IcebergEnumeratorPosition(null, null); + } + + static IcebergEnumeratorPosition of(long snapshotId, Long snapshotTimestampMs) { + return new IcebergEnumeratorPosition(snapshotId, snapshotTimestampMs); + } + + private IcebergEnumeratorPosition(Long snapshotId, Long snapshotTimestampMs) { + this.snapshotId = snapshotId; + this.snapshotTimestampMs = snapshotTimestampMs; + } + + boolean isEmpty() { + return snapshotId == null; + } + + Long snapshotId() { + return snapshotId; + } + + Long snapshotTimestampMs() { + return snapshotTimestampMs; + } + + @Override + public String toString() { + return MoreObjects.toStringHelper(this) + .add("snapshotId", snapshotId) + .add("snapshotTimestampMs", snapshotTimestampMs) + .toString(); + } + + @Override + public int hashCode() { + return Objects.hashCode(snapshotId, snapshotTimestampMs); + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + IcebergEnumeratorPosition other = (IcebergEnumeratorPosition) o; + return Objects.equal(snapshotId, other.snapshotId()) + && Objects.equal(snapshotTimestampMs, other.snapshotTimestampMs()); + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorPositionSerializer.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorPositionSerializer.java new file mode 100644 index 000000000000..1c63807361c5 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorPositionSerializer.java @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source.enumerator; + +import java.io.IOException; +import org.apache.flink.core.io.SimpleVersionedSerializer; +import org.apache.flink.core.memory.DataInputDeserializer; +import org.apache.flink.core.memory.DataOutputSerializer; + +class IcebergEnumeratorPositionSerializer + implements SimpleVersionedSerializer { + + public static final IcebergEnumeratorPositionSerializer INSTANCE = + new IcebergEnumeratorPositionSerializer(); + + private static final int VERSION = 1; + + private static final ThreadLocal SERIALIZER_CACHE = + ThreadLocal.withInitial(() -> new DataOutputSerializer(128)); + + @Override + public int getVersion() { + return VERSION; + } + + @Override + public byte[] serialize(IcebergEnumeratorPosition position) throws IOException { + return serializeV1(position); + } + + @Override + public IcebergEnumeratorPosition deserialize(int version, byte[] serialized) throws IOException { + switch (version) { + case 1: + return deserializeV1(serialized); + default: + throw new IOException("Unknown version: " + version); + } + } + + private byte[] serializeV1(IcebergEnumeratorPosition position) throws IOException { + DataOutputSerializer out = SERIALIZER_CACHE.get(); + out.writeBoolean(position.snapshotId() != null); + if (position.snapshotId() != null) { + out.writeLong(position.snapshotId()); + } + out.writeBoolean(position.snapshotTimestampMs() != null); + if (position.snapshotTimestampMs() != null) { + out.writeLong(position.snapshotTimestampMs()); + } + byte[] result = out.getCopyOfBuffer(); + out.clear(); + return result; + } + + private IcebergEnumeratorPosition deserializeV1(byte[] serialized) throws IOException { + DataInputDeserializer in = new DataInputDeserializer(serialized); + Long snapshotId = null; + if (in.readBoolean()) { + snapshotId = in.readLong(); + } + + Long snapshotTimestampMs = null; + if (in.readBoolean()) { + snapshotTimestampMs = in.readLong(); + } + + if (snapshotId != null) { + return IcebergEnumeratorPosition.of(snapshotId, snapshotTimestampMs); + } else { + return IcebergEnumeratorPosition.empty(); + } + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorState.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorState.java new file mode 100644 index 000000000000..26fbad46c128 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorState.java @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source.enumerator; + +import java.io.Serializable; +import java.util.Collection; +import javax.annotation.Nullable; +import org.apache.flink.annotation.Internal; +import org.apache.iceberg.flink.source.split.IcebergSourceSplitState; + +/** Enumerator state for checkpointing */ +@Internal +public class IcebergEnumeratorState implements Serializable { + @Nullable private final IcebergEnumeratorPosition lastEnumeratedPosition; + private final Collection pendingSplits; + private final int[] enumerationSplitCountHistory; + + public IcebergEnumeratorState(Collection pendingSplits) { + this(null, pendingSplits); + } + + public IcebergEnumeratorState( + @Nullable IcebergEnumeratorPosition lastEnumeratedPosition, + Collection pendingSplits) { + this(lastEnumeratedPosition, pendingSplits, new int[0]); + } + + public IcebergEnumeratorState( + @Nullable IcebergEnumeratorPosition lastEnumeratedPosition, + Collection pendingSplits, + int[] enumerationSplitCountHistory) { + this.lastEnumeratedPosition = lastEnumeratedPosition; + this.pendingSplits = pendingSplits; + this.enumerationSplitCountHistory = enumerationSplitCountHistory; + } + + @Nullable + public IcebergEnumeratorPosition lastEnumeratedPosition() { + return lastEnumeratedPosition; + } + + public Collection pendingSplits() { + return pendingSplits; + } + + public int[] enumerationSplitCountHistory() { + return enumerationSplitCountHistory; + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorStateSerializer.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorStateSerializer.java new file mode 100644 index 000000000000..f76f8a69ff0e --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorStateSerializer.java @@ -0,0 +1,194 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source.enumerator; + +import java.io.IOException; +import java.util.Collection; +import org.apache.flink.annotation.Internal; +import org.apache.flink.annotation.VisibleForTesting; +import org.apache.flink.core.io.SimpleVersionedSerializer; +import org.apache.flink.core.memory.DataInputDeserializer; +import org.apache.flink.core.memory.DataOutputSerializer; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; +import org.apache.iceberg.flink.source.split.IcebergSourceSplitSerializer; +import org.apache.iceberg.flink.source.split.IcebergSourceSplitState; +import org.apache.iceberg.flink.source.split.IcebergSourceSplitStatus; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; + +@Internal +public class IcebergEnumeratorStateSerializer + implements SimpleVersionedSerializer { + + private static final int VERSION = 2; + + private static final ThreadLocal SERIALIZER_CACHE = + ThreadLocal.withInitial(() -> new DataOutputSerializer(1024)); + + private final IcebergEnumeratorPositionSerializer positionSerializer = + IcebergEnumeratorPositionSerializer.INSTANCE; + private final IcebergSourceSplitSerializer splitSerializer; + + public IcebergEnumeratorStateSerializer(boolean caseSensitive) { + this.splitSerializer = new IcebergSourceSplitSerializer(caseSensitive); + } + + @Override + public int getVersion() { + return VERSION; + } + + @Override + public byte[] serialize(IcebergEnumeratorState enumState) throws IOException { + return serializeV2(enumState); + } + + @Override + public IcebergEnumeratorState deserialize(int version, byte[] serialized) throws IOException { + switch (version) { + case 1: + return deserializeV1(serialized); + case 2: + return deserializeV2(serialized); + default: + throw new IOException("Unknown version: " + version); + } + } + + @VisibleForTesting + byte[] serializeV1(IcebergEnumeratorState enumState) throws IOException { + DataOutputSerializer out = SERIALIZER_CACHE.get(); + serializeEnumeratorPosition(out, enumState.lastEnumeratedPosition(), positionSerializer); + serializePendingSplits(out, enumState.pendingSplits(), splitSerializer); + byte[] result = out.getCopyOfBuffer(); + out.clear(); + return result; + } + + @VisibleForTesting + IcebergEnumeratorState deserializeV1(byte[] serialized) throws IOException { + DataInputDeserializer in = new DataInputDeserializer(serialized); + IcebergEnumeratorPosition enumeratorPosition = + deserializeEnumeratorPosition(in, positionSerializer); + Collection pendingSplits = + deserializePendingSplits(in, splitSerializer); + return new IcebergEnumeratorState(enumeratorPosition, pendingSplits); + } + + @VisibleForTesting + byte[] serializeV2(IcebergEnumeratorState enumState) throws IOException { + DataOutputSerializer out = SERIALIZER_CACHE.get(); + serializeEnumeratorPosition(out, enumState.lastEnumeratedPosition(), positionSerializer); + serializePendingSplits(out, enumState.pendingSplits(), splitSerializer); + serializeEnumerationSplitCountHistory(out, enumState.enumerationSplitCountHistory()); + byte[] result = out.getCopyOfBuffer(); + out.clear(); + return result; + } + + @VisibleForTesting + IcebergEnumeratorState deserializeV2(byte[] serialized) throws IOException { + DataInputDeserializer in = new DataInputDeserializer(serialized); + IcebergEnumeratorPosition enumeratorPosition = + deserializeEnumeratorPosition(in, positionSerializer); + Collection pendingSplits = + deserializePendingSplits(in, splitSerializer); + int[] enumerationSplitCountHistory = 
deserializeEnumerationSplitCountHistory(in); + return new IcebergEnumeratorState( + enumeratorPosition, pendingSplits, enumerationSplitCountHistory); + } + + private static void serializeEnumeratorPosition( + DataOutputSerializer out, + IcebergEnumeratorPosition enumeratorPosition, + IcebergEnumeratorPositionSerializer positionSerializer) + throws IOException { + out.writeBoolean(enumeratorPosition != null); + if (enumeratorPosition != null) { + out.writeInt(positionSerializer.getVersion()); + byte[] positionBytes = positionSerializer.serialize(enumeratorPosition); + out.writeInt(positionBytes.length); + out.write(positionBytes); + } + } + + private static IcebergEnumeratorPosition deserializeEnumeratorPosition( + DataInputDeserializer in, IcebergEnumeratorPositionSerializer positionSerializer) + throws IOException { + IcebergEnumeratorPosition enumeratorPosition = null; + if (in.readBoolean()) { + int version = in.readInt(); + byte[] positionBytes = new byte[in.readInt()]; + in.read(positionBytes); + enumeratorPosition = positionSerializer.deserialize(version, positionBytes); + } + return enumeratorPosition; + } + + private static void serializePendingSplits( + DataOutputSerializer out, + Collection pendingSplits, + IcebergSourceSplitSerializer splitSerializer) + throws IOException { + out.writeInt(splitSerializer.getVersion()); + out.writeInt(pendingSplits.size()); + for (IcebergSourceSplitState splitState : pendingSplits) { + byte[] splitBytes = splitSerializer.serialize(splitState.split()); + out.writeInt(splitBytes.length); + out.write(splitBytes); + out.writeUTF(splitState.status().name()); + } + } + + private static Collection deserializePendingSplits( + DataInputDeserializer in, IcebergSourceSplitSerializer splitSerializer) throws IOException { + int splitSerializerVersion = in.readInt(); + int splitCount = in.readInt(); + Collection pendingSplits = Lists.newArrayListWithCapacity(splitCount); + for (int i = 0; i < splitCount; ++i) { + byte[] splitBytes = new byte[in.readInt()]; + in.read(splitBytes); + IcebergSourceSplit split = splitSerializer.deserialize(splitSerializerVersion, splitBytes); + String statusName = in.readUTF(); + pendingSplits.add( + new IcebergSourceSplitState(split, IcebergSourceSplitStatus.valueOf(statusName))); + } + return pendingSplits; + } + + private static void serializeEnumerationSplitCountHistory( + DataOutputSerializer out, int[] enumerationSplitCountHistory) throws IOException { + out.writeInt(enumerationSplitCountHistory.length); + for (int enumerationSplitCount : enumerationSplitCountHistory) { + out.writeInt(enumerationSplitCount); + } + } + + private static int[] deserializeEnumerationSplitCountHistory(DataInputDeserializer in) + throws IOException { + int historySize = in.readInt(); + int[] history = new int[historySize]; + if (historySize > 0) { + for (int i = 0; i < historySize; ++i) { + history[i] = in.readInt(); + } + } + + return history; + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/StaticIcebergEnumerator.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/StaticIcebergEnumerator.java new file mode 100644 index 000000000000..4e55ea5d5fd6 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/StaticIcebergEnumerator.java @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.enumerator; + +import org.apache.flink.annotation.Internal; +import org.apache.flink.api.connector.source.SplitEnumeratorContext; +import org.apache.iceberg.flink.source.assigner.SplitAssigner; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; + +/** One-time split enumeration at the start-up for batch execution */ +@Internal +public class StaticIcebergEnumerator extends AbstractIcebergEnumerator { + private final SplitAssigner assigner; + + public StaticIcebergEnumerator( + SplitEnumeratorContext enumeratorContext, SplitAssigner assigner) { + super(enumeratorContext, assigner); + this.assigner = assigner; + } + + @Override + public void start() { + super.start(); + } + + @Override + protected boolean shouldWaitForMoreSplits() { + return false; + } + + @Override + public IcebergEnumeratorState snapshotState(long checkpointId) { + return new IcebergEnumeratorState(null, assigner.state(), new int[0]); + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayBatchRecords.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayBatchRecords.java new file mode 100644 index 000000000000..7b94c364c976 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayBatchRecords.java @@ -0,0 +1,171 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.reader; + +import java.util.Collections; +import java.util.Set; +import javax.annotation.Nullable; +import org.apache.flink.annotation.VisibleForTesting; +import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; +import org.apache.flink.connector.file.src.util.Pool; +import org.apache.flink.table.data.RowData; +import org.apache.iceberg.flink.source.DataIterator; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; + +/** + * {@link RecordsWithSplitIds} is used to pass a batch of records from fetcher to source reader. 
+ * Batching improves the efficiency of record handover. + * + *
    {@link RecordsWithSplitIds} interface can encapsulate batches from multiple splits. This is + * the case for Kafka source where fetchers can retrieve records from multiple Kafka partitions at + * the same time. + * + *
    For file-based sources like Iceberg, readers always read one split/file at a time. Hence, we + * will only have a batch of records for one split here. + * + *
    This class uses array to store a batch of records from the same file (with the same + * fileOffset). + */ +class ArrayBatchRecords implements RecordsWithSplitIds> { + @Nullable private String splitId; + @Nullable private final Pool.Recycler recycler; + @Nullable private final T[] records; + private final int numberOfRecords; + private final Set finishedSplits; + private final RecordAndPosition recordAndPosition; + + // point to current read position within the records array + private int position; + + private ArrayBatchRecords( + @Nullable String splitId, + @Nullable Pool.Recycler recycler, + @Nullable T[] records, + int numberOfRecords, + int fileOffset, + long startingRecordOffset, + Set finishedSplits) { + Preconditions.checkArgument(numberOfRecords >= 0, "numberOfRecords can't be negative"); + Preconditions.checkArgument(fileOffset >= 0, "fileOffset can't be negative"); + Preconditions.checkArgument(startingRecordOffset >= 0, "numberOfRecords can't be negative"); + + this.splitId = splitId; + this.recycler = recycler; + this.records = records; + this.numberOfRecords = numberOfRecords; + this.finishedSplits = + Preconditions.checkNotNull(finishedSplits, "finishedSplits can be empty but not null"); + this.recordAndPosition = new RecordAndPosition<>(); + + recordAndPosition.set(null, fileOffset, startingRecordOffset); + this.position = 0; + } + + @Nullable + @Override + public String nextSplit() { + String nextSplit = this.splitId; + // set the splitId to null to indicate no more splits + // this class only contains record for one split + this.splitId = null; + return nextSplit; + } + + @Nullable + @Override + public RecordAndPosition nextRecordFromSplit() { + if (position < numberOfRecords) { + recordAndPosition.record(records[position]); + position++; + return recordAndPosition; + } else { + return null; + } + } + + /** + * This method is called when all records from this batch has been emitted. If recycler is set, it + * should be called to return the records array back to pool. + */ + @Override + public void recycle() { + if (recycler != null) { + recycler.recycle(records); + } + } + + @Override + public Set finishedSplits() { + return finishedSplits; + } + + @VisibleForTesting + T[] records() { + return records; + } + + @VisibleForTesting + int numberOfRecords() { + return numberOfRecords; + } + + /** + * Create a ArrayBatchRecords backed up an array with records from the same file + * + * @param splitId Iceberg source only read from one split a time. We never have multiple records + * from multiple splits. + * @param recycler Because {@link DataIterator} with {@link RowData} returns an iterator of reused + * RowData object, we need to clone RowData eagerly when constructing a batch of records. We + * can use object pool to reuse the RowData array object which can be expensive to create. + * This recycler can be provided to recycle the array object back to pool after read is + * exhausted. If the {@link DataIterator} returns an iterator of non-reused objects, we don't + * need to clone objects. It is cheap to just create the batch array. Hence, we don't need + * object pool and recycler can be set to null. 
+ * @param records an array (maybe reused) holding a batch of records + * @param numberOfRecords actual number of records in the array + * @param fileOffset fileOffset for all records in this batch + * @param startingRecordOffset starting recordOffset + * @param record type + */ + public static ArrayBatchRecords forRecords( + String splitId, + Pool.Recycler recycler, + T[] records, + int numberOfRecords, + int fileOffset, + long startingRecordOffset) { + return new ArrayBatchRecords<>( + splitId, + recycler, + records, + numberOfRecords, + fileOffset, + startingRecordOffset, + Collections.emptySet()); + } + + /** + * Create ab ArrayBatchRecords with only finished split id + * + * @param splitId for the split that is just exhausted + */ + public static ArrayBatchRecords finishedSplit(String splitId) { + return new ArrayBatchRecords<>(null, null, null, 0, 0, 0, Collections.singleton(splitId)); + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayPoolDataIteratorBatcher.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayPoolDataIteratorBatcher.java new file mode 100644 index 000000000000..306afd1811be --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayPoolDataIteratorBatcher.java @@ -0,0 +1,130 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source.reader; + +import java.io.IOException; +import java.util.NoSuchElementException; +import org.apache.flink.configuration.ReadableConfig; +import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; +import org.apache.flink.connector.base.source.reader.SourceReaderOptions; +import org.apache.flink.connector.file.src.util.Pool; +import org.apache.iceberg.flink.FlinkConfigOptions; +import org.apache.iceberg.flink.source.DataIterator; +import org.apache.iceberg.io.CloseableIterator; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; + +/** This implementation stores record batch in array from recyclable pool */ +class ArrayPoolDataIteratorBatcher implements DataIteratorBatcher { + private final int batchSize; + private final int handoverQueueSize; + private final RecordFactory recordFactory; + + private transient Pool pool; + + ArrayPoolDataIteratorBatcher(ReadableConfig config, RecordFactory recordFactory) { + this.batchSize = config.get(FlinkConfigOptions.SOURCE_READER_FETCH_BATCH_RECORD_COUNT); + this.handoverQueueSize = config.get(SourceReaderOptions.ELEMENT_QUEUE_CAPACITY); + this.recordFactory = recordFactory; + } + + @Override + public CloseableIterator>> batch( + String splitId, DataIterator inputIterator) { + Preconditions.checkArgument(inputIterator != null, "Input data iterator can't be null"); + // lazily create pool as it is not serializable + if (pool == null) { + this.pool = createPoolOfBatches(handoverQueueSize); + } + return new ArrayPoolBatchIterator(splitId, inputIterator, pool); + } + + private Pool createPoolOfBatches(int numBatches) { + Pool poolOfBatches = new Pool<>(numBatches); + for (int batchId = 0; batchId < numBatches; batchId++) { + T[] batch = recordFactory.createBatch(batchSize); + poolOfBatches.add(batch); + } + + return poolOfBatches; + } + + private class ArrayPoolBatchIterator + implements CloseableIterator>> { + + private final String splitId; + private final DataIterator inputIterator; + private final Pool pool; + + ArrayPoolBatchIterator(String splitId, DataIterator inputIterator, Pool pool) { + this.splitId = splitId; + this.inputIterator = inputIterator; + this.pool = pool; + } + + @Override + public boolean hasNext() { + return inputIterator.hasNext(); + } + + @Override + public RecordsWithSplitIds> next() { + if (!inputIterator.hasNext()) { + throw new NoSuchElementException(); + } + + T[] batch = getCachedEntry(); + int recordCount = 0; + while (inputIterator.hasNext() && recordCount < batchSize) { + // The record produced by inputIterator can be reused like for the RowData case. + // inputIterator.next() can't be called again until the copy is made + // since the record is not consumed immediately. + T nextRecord = inputIterator.next(); + recordFactory.clone(nextRecord, batch, recordCount); + recordCount++; + if (!inputIterator.currentFileHasNext()) { + // break early so that records in the ArrayResultIterator + // have the same fileOffset. 
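+ // The next batch will then start from the next file with its own fileOffset.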
+ break; + } + } + + return ArrayBatchRecords.forRecords( + splitId, + pool.recycler(), + batch, + recordCount, + inputIterator.fileOffset(), + inputIterator.recordOffset() - recordCount); + } + + @Override + public void close() throws IOException { + inputIterator.close(); + } + + private T[] getCachedEntry() { + try { + return pool.pollEntry(); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + throw new RuntimeException("Interrupted while waiting for array pool entry", e); + } + } + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/AvroGenericRecordReaderFunction.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/AvroGenericRecordReaderFunction.java new file mode 100644 index 000000000000..66e59633fff2 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/AvroGenericRecordReaderFunction.java @@ -0,0 +1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.reader; + +import java.util.List; +import org.apache.avro.generic.GenericRecord; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.configuration.ReadableConfig; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Table; +import org.apache.iceberg.encryption.EncryptionManager; +import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.flink.source.AvroGenericRecordFileScanTaskReader; +import org.apache.iceberg.flink.source.DataIterator; +import org.apache.iceberg.flink.source.RowDataFileScanTaskReader; +import org.apache.iceberg.flink.source.RowDataToAvroGenericRecordConverter; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; +import org.apache.iceberg.io.FileIO; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; + +/** Read Iceberg rows as {@link GenericRecord}. */ +public class AvroGenericRecordReaderFunction extends DataIteratorReaderFunction { + private final String tableName; + private final Schema readSchema; + private final FileIO io; + private final EncryptionManager encryption; + private final RowDataFileScanTaskReader rowDataReader; + + private transient RowDataToAvroGenericRecordConverter converter; + + /** + * Create a reader function without projection and name mapping. Column name is case-insensitive. 
+ */ + public static AvroGenericRecordReaderFunction fromTable(Table table) { + return new AvroGenericRecordReaderFunction( + table.name(), + new Configuration(), + table.schema(), + null, + null, + false, + table.io(), + table.encryption(), + null); + } + + public AvroGenericRecordReaderFunction( + String tableName, + ReadableConfig config, + Schema tableSchema, + Schema projectedSchema, + String nameMapping, + boolean caseSensitive, + FileIO io, + EncryptionManager encryption, + List filters) { + super(new ListDataIteratorBatcher<>(config)); + this.tableName = tableName; + this.readSchema = readSchema(tableSchema, projectedSchema); + this.io = io; + this.encryption = encryption; + this.rowDataReader = + new RowDataFileScanTaskReader(tableSchema, readSchema, nameMapping, caseSensitive, filters); + } + + @Override + protected DataIterator createDataIterator(IcebergSourceSplit split) { + return new DataIterator<>( + new AvroGenericRecordFileScanTaskReader(rowDataReader, lazyConverter()), + split.task(), + io, + encryption); + } + + private RowDataToAvroGenericRecordConverter lazyConverter() { + if (converter == null) { + this.converter = RowDataToAvroGenericRecordConverter.fromIcebergSchema(tableName, readSchema); + } + return converter; + } + + private static Schema readSchema(Schema tableSchema, Schema projectedSchema) { + Preconditions.checkNotNull(tableSchema, "Table schema can't be null"); + return projectedSchema == null ? tableSchema : projectedSchema; + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/ColumnStatsWatermarkExtractor.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/ColumnStatsWatermarkExtractor.java new file mode 100644 index 000000000000..4bb6f0a98c4c --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/ColumnStatsWatermarkExtractor.java @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.reader; + +import java.io.Serializable; +import java.util.Comparator; +import java.util.concurrent.TimeUnit; +import org.apache.flink.annotation.Internal; +import org.apache.iceberg.Schema; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; +import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.types.Conversions; +import org.apache.iceberg.types.Type.TypeID; +import org.apache.iceberg.types.Types; + +/** + * {@link SplitWatermarkExtractor} implementation which uses an Iceberg timestamp column statistics + * to get the watermarks for the {@link IcebergSourceSplit}. 
This watermark is emitted by the {@link + * WatermarkExtractorRecordEmitter} along with the actual records. + */ +@Internal +public class ColumnStatsWatermarkExtractor implements SplitWatermarkExtractor, Serializable { + private final int eventTimeFieldId; + private final String eventTimeFieldName; + private final TimeUnit timeUnit; + + /** + * Creates the extractor. + * + * @param schema The schema of the Table + * @param eventTimeFieldName The column which should be used as an event time + * @param timeUnit Used for converting the long value to epoch milliseconds + */ + public ColumnStatsWatermarkExtractor( + Schema schema, String eventTimeFieldName, TimeUnit timeUnit) { + Types.NestedField field = schema.findField(eventTimeFieldName); + TypeID typeID = field.type().typeId(); + Preconditions.checkArgument( + typeID.equals(TypeID.LONG) || typeID.equals(TypeID.TIMESTAMP), + "Found %s, expected a LONG or TIMESTAMP column for watermark generation.", + typeID); + this.eventTimeFieldId = field.fieldId(); + this.eventTimeFieldName = eventTimeFieldName; + // Use the timeUnit only for Long columns. + this.timeUnit = typeID.equals(TypeID.LONG) ? timeUnit : TimeUnit.MICROSECONDS; + } + + @VisibleForTesting + ColumnStatsWatermarkExtractor(int eventTimeFieldId, String eventTimeFieldName) { + this.eventTimeFieldId = eventTimeFieldId; + this.eventTimeFieldName = eventTimeFieldName; + this.timeUnit = TimeUnit.MICROSECONDS; + } + + /** + * Get the watermark for a split using column statistics. + * + * @param split The split + * @return The watermark + * @throws IllegalArgumentException if there is no statistics for the column + */ + @Override + public long extractWatermark(IcebergSourceSplit split) { + return split.task().files().stream() + .map( + scanTask -> { + Preconditions.checkArgument( + scanTask.file().lowerBounds() != null + && scanTask.file().lowerBounds().get(eventTimeFieldId) != null, + "Missing statistics for column name = %s in file = %s", + eventTimeFieldName, + eventTimeFieldId, + scanTask.file()); + return timeUnit.toMillis( + Conversions.fromByteBuffer( + Types.LongType.get(), scanTask.file().lowerBounds().get(eventTimeFieldId))); + }) + .min(Comparator.comparingLong(l -> l)) + .get(); + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorBatcher.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorBatcher.java new file mode 100644 index 000000000000..c376e359c600 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorBatcher.java @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source.reader; + +import java.io.Serializable; +import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; +import org.apache.flink.connector.base.source.reader.splitreader.SplitReader; +import org.apache.iceberg.flink.source.DataIterator; +import org.apache.iceberg.io.CloseableIterator; + +/** + * Batcher converts iterator of T into iterator of batched {@code + * RecordsWithSplitIds>}, as FLIP-27's {@link SplitReader#fetch()} returns + * batched records. + */ +@FunctionalInterface +public interface DataIteratorBatcher extends Serializable { + CloseableIterator>> batch( + String splitId, DataIterator inputIterator); +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorReaderFunction.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorReaderFunction.java new file mode 100644 index 000000000000..bbf797ef4aa8 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorReaderFunction.java @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.reader; + +import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; +import org.apache.iceberg.flink.source.DataIterator; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; +import org.apache.iceberg.io.CloseableIterator; + +/** A {@link ReaderFunction} implementation that uses {@link DataIterator}. */ +public abstract class DataIteratorReaderFunction implements ReaderFunction { + private final DataIteratorBatcher batcher; + + public DataIteratorReaderFunction(DataIteratorBatcher batcher) { + this.batcher = batcher; + } + + protected abstract DataIterator createDataIterator(IcebergSourceSplit split); + + @Override + public CloseableIterator>> apply( + IcebergSourceSplit split) { + DataIterator inputIterator = createDataIterator(split); + inputIterator.seek(split.fileOffset(), split.recordOffset()); + return batcher.batch(split.splitId(), inputIterator); + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceReader.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceReader.java new file mode 100644 index 000000000000..f143b8d2df2e --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceReader.java @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.reader; + +import java.util.Collection; +import java.util.Collections; +import java.util.Map; +import org.apache.flink.annotation.Internal; +import org.apache.flink.api.connector.source.SourceReaderContext; +import org.apache.flink.connector.base.source.reader.SingleThreadMultiplexSourceReaderBase; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; +import org.apache.iceberg.flink.source.split.SerializableComparator; +import org.apache.iceberg.flink.source.split.SplitRequestEvent; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; + +@Internal +public class IcebergSourceReader + extends SingleThreadMultiplexSourceReaderBase< + RecordAndPosition, T, IcebergSourceSplit, IcebergSourceSplit> { + + public IcebergSourceReader( + SerializableRecordEmitter emitter, + IcebergSourceReaderMetrics metrics, + ReaderFunction readerFunction, + SerializableComparator splitComparator, + SourceReaderContext context) { + super( + () -> new IcebergSourceSplitReader<>(metrics, readerFunction, splitComparator, context), + emitter, + context.getConfiguration(), + context); + } + + @Override + public void start() { + // We request a split only if we did not get splits during the checkpoint restore. + // Otherwise, reader restarts will keep requesting more and more splits. + if (getNumberOfCurrentlyAssignedSplits() == 0) { + requestSplit(Collections.emptyList()); + } + } + + @Override + protected void onSplitFinished(Map finishedSplitIds) { + requestSplit(Lists.newArrayList(finishedSplitIds.keySet())); + } + + @Override + protected IcebergSourceSplit initializedState(IcebergSourceSplit split) { + return split; + } + + @Override + protected IcebergSourceSplit toSplitType(String splitId, IcebergSourceSplit splitState) { + return splitState; + } + + private void requestSplit(Collection finishedSplitIds) { + context.sendSourceEventToCoordinator(new SplitRequestEvent(finishedSplitIds)); + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceReaderMetrics.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceReaderMetrics.java new file mode 100644 index 000000000000..2a3e1dd86b95 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceReaderMetrics.java @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.reader; + +import org.apache.flink.metrics.Counter; +import org.apache.flink.metrics.MetricGroup; + +public class IcebergSourceReaderMetrics { + private final Counter assignedSplits; + private final Counter assignedBytes; + private final Counter finishedSplits; + private final Counter finishedBytes; + private final Counter splitReaderFetchCalls; + + public IcebergSourceReaderMetrics(MetricGroup metrics, String fullTableName) { + MetricGroup readerMetrics = + metrics.addGroup("IcebergSourceReader").addGroup("table", fullTableName); + + this.assignedSplits = readerMetrics.counter("assignedSplits"); + this.assignedBytes = readerMetrics.counter("assignedBytes"); + this.finishedSplits = readerMetrics.counter("finishedSplits"); + this.finishedBytes = readerMetrics.counter("finishedBytes"); + this.splitReaderFetchCalls = readerMetrics.counter("splitReaderFetchCalls"); + } + + public void incrementAssignedSplits(long count) { + assignedSplits.inc(count); + } + + public void incrementAssignedBytes(long count) { + assignedBytes.inc(count); + } + + public void incrementFinishedSplits(long count) { + finishedSplits.inc(count); + } + + public void incrementFinishedBytes(long count) { + finishedBytes.inc(count); + } + + public void incrementSplitReaderFetchCalls(long count) { + splitReaderFetchCalls.inc(count); + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceSplitReader.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceSplitReader.java new file mode 100644 index 000000000000..9c20494fdbcd --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceSplitReader.java @@ -0,0 +1,167 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source.reader; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.util.Collection; +import java.util.Collections; +import java.util.List; +import java.util.Queue; +import org.apache.flink.api.connector.source.SourceReaderContext; +import org.apache.flink.connector.base.source.reader.RecordsBySplits; +import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; +import org.apache.flink.connector.base.source.reader.splitreader.SplitReader; +import org.apache.flink.connector.base.source.reader.splitreader.SplitsAddition; +import org.apache.flink.connector.base.source.reader.splitreader.SplitsChange; +import org.apache.iceberg.FileScanTask; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; +import org.apache.iceberg.flink.source.split.SerializableComparator; +import org.apache.iceberg.io.CloseableIterator; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Queues; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +class IcebergSourceSplitReader implements SplitReader, IcebergSourceSplit> { + private static final Logger LOG = LoggerFactory.getLogger(IcebergSourceSplitReader.class); + + private final IcebergSourceReaderMetrics metrics; + private final ReaderFunction openSplitFunction; + private final SerializableComparator splitComparator; + private final int indexOfSubtask; + private final Queue splits; + + private CloseableIterator>> currentReader; + private IcebergSourceSplit currentSplit; + private String currentSplitId; + + IcebergSourceSplitReader( + IcebergSourceReaderMetrics metrics, + ReaderFunction openSplitFunction, + SerializableComparator splitComparator, + SourceReaderContext context) { + this.metrics = metrics; + this.openSplitFunction = openSplitFunction; + this.splitComparator = splitComparator; + this.indexOfSubtask = context.getIndexOfSubtask(); + this.splits = Queues.newArrayDeque(); + } + + /** + * The method reads a batch of records from the assigned splits. If all the records from the + * current split are returned then it will emit a {@link ArrayBatchRecords#finishedSplit(String)} + * batch to signal this event. In the next fetch loop the reader will continue with the next split + * (if any). + * + * @return The fetched records + * @throws IOException If there is an error during reading + */ + @Override + public RecordsWithSplitIds> fetch() throws IOException { + metrics.incrementSplitReaderFetchCalls(1); + if (currentReader == null) { + IcebergSourceSplit nextSplit = splits.poll(); + if (nextSplit != null) { + currentSplit = nextSplit; + currentSplitId = nextSplit.splitId(); + currentReader = openSplitFunction.apply(currentSplit); + } else { + // return an empty result, which will lead to split fetch to be idle. + // SplitFetcherManager will then close idle fetcher. 
+ return new RecordsBySplits(Collections.emptyMap(), Collections.emptySet()); + } + } + + if (currentReader.hasNext()) { + // Because Iterator#next() doesn't support checked exception, + // we need to wrap and unwrap the checked IOException with UncheckedIOException + try { + return currentReader.next(); + } catch (UncheckedIOException e) { + throw e.getCause(); + } + } else { + return finishSplit(); + } + } + + @Override + public void handleSplitsChanges(SplitsChange splitsChange) { + if (!(splitsChange instanceof SplitsAddition)) { + throw new UnsupportedOperationException( + String.format("Unsupported split change: %s", splitsChange.getClass())); + } + + if (splitComparator != null) { + List newSplits = Lists.newArrayList(splitsChange.splits()); + newSplits.sort(splitComparator); + LOG.info("Add {} splits to reader: {}", newSplits.size(), newSplits); + splits.addAll(newSplits); + } else { + LOG.info("Add {} splits to reader", splitsChange.splits().size()); + splits.addAll(splitsChange.splits()); + } + metrics.incrementAssignedSplits(splitsChange.splits().size()); + metrics.incrementAssignedBytes(calculateBytes(splitsChange)); + } + + @Override + public void wakeUp() {} + + @Override + public void close() throws Exception { + currentSplitId = null; + if (currentReader != null) { + currentReader.close(); + } + } + + @Override + public void pauseOrResumeSplits( + Collection splitsToPause, Collection splitsToResume) { + // IcebergSourceSplitReader only reads splits sequentially. When waiting for watermark alignment + // the SourceOperator will stop processing and recycling the fetched batches. This exhausts the + // {@link ArrayPoolDataIteratorBatcher#pool} and the `currentReader.next()` call will be + // blocked even without split-level watermark alignment. Based on this the + // `pauseOrResumeSplits` and the `wakeUp` are left empty. + } + + private long calculateBytes(IcebergSourceSplit split) { + return split.task().files().stream().map(FileScanTask::length).reduce(0L, Long::sum); + } + + private long calculateBytes(SplitsChange splitsChanges) { + return splitsChanges.splits().stream().map(this::calculateBytes).reduce(0L, Long::sum); + } + + private ArrayBatchRecords finishSplit() throws IOException { + if (currentReader != null) { + currentReader.close(); + currentReader = null; + } + + ArrayBatchRecords finishRecords = ArrayBatchRecords.finishedSplit(currentSplitId); + LOG.info("Split reader {} finished split: {}", indexOfSubtask, currentSplitId); + metrics.incrementFinishedSplits(1); + metrics.incrementFinishedBytes(calculateBytes(currentSplit)); + currentSplitId = null; + return finishRecords; + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/LimitableDataIterator.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/LimitableDataIterator.java new file mode 100644 index 000000000000..020e87646d05 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/LimitableDataIterator.java @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.reader; + +import org.apache.iceberg.CombinedScanTask; +import org.apache.iceberg.encryption.EncryptionManager; +import org.apache.iceberg.flink.source.DataIterator; +import org.apache.iceberg.flink.source.FileScanTaskReader; +import org.apache.iceberg.io.FileIO; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; + +class LimitableDataIterator extends DataIterator { + private final RecordLimiter limiter; + + LimitableDataIterator( + FileScanTaskReader fileScanTaskReader, + CombinedScanTask task, + FileIO io, + EncryptionManager encryption, + RecordLimiter limiter) { + super(fileScanTaskReader, task, io, encryption); + Preconditions.checkArgument(limiter != null, "Invalid record limiter: null"); + this.limiter = limiter; + } + + @Override + public boolean hasNext() { + if (limiter.reachedLimit()) { + return false; + } + + return super.hasNext(); + } + + @Override + public T next() { + limiter.increment(); + return super.next(); + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/ListBatchRecords.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/ListBatchRecords.java new file mode 100644 index 000000000000..1acb3df76102 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/ListBatchRecords.java @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source.reader; + +import java.util.Collections; +import java.util.List; +import java.util.Set; +import javax.annotation.Nullable; +import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; + +class ListBatchRecords implements RecordsWithSplitIds> { + private String splitId; + private final List records; + private final Set finishedSplits; + private final RecordAndPosition recordAndPosition; + + // point to current read position within the records list + private int position; + + ListBatchRecords( + String splitId, + List records, + int fileOffset, + long startingRecordOffset, + Set finishedSplits) { + this.splitId = splitId; + this.records = records; + this.finishedSplits = + Preconditions.checkNotNull(finishedSplits, "finishedSplits can be empty but not null"); + this.recordAndPosition = new RecordAndPosition<>(); + this.recordAndPosition.set(null, fileOffset, startingRecordOffset); + + this.position = 0; + } + + @Nullable + @Override + public String nextSplit() { + String nextSplit = this.splitId; + // set the splitId to null to indicate no more splits + // this class only contains record for one split + this.splitId = null; + return nextSplit; + } + + @Nullable + @Override + public RecordAndPosition nextRecordFromSplit() { + if (position < records.size()) { + recordAndPosition.record(records.get(position)); + position++; + return recordAndPosition; + } else { + return null; + } + } + + @Override + public Set finishedSplits() { + return finishedSplits; + } + + public static ListBatchRecords forRecords( + String splitId, List records, int fileOffset, long startingRecordOffset) { + return new ListBatchRecords<>( + splitId, records, fileOffset, startingRecordOffset, Collections.emptySet()); + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/ListDataIteratorBatcher.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/ListDataIteratorBatcher.java new file mode 100644 index 000000000000..365416239d37 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/ListDataIteratorBatcher.java @@ -0,0 +1,94 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source.reader; + +import java.io.IOException; +import java.util.List; +import java.util.NoSuchElementException; +import org.apache.flink.configuration.ReadableConfig; +import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; +import org.apache.iceberg.flink.FlinkConfigOptions; +import org.apache.iceberg.flink.source.DataIterator; +import org.apache.iceberg.io.CloseableIterator; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; + +/** + * FlinkRecordReaderFunction essentially cloned objects already. So there is no need to use array + * pool to clone objects. Simply create a new ArrayList for each batch. + */ +class ListDataIteratorBatcher implements DataIteratorBatcher { + + private final int batchSize; + + ListDataIteratorBatcher(ReadableConfig config) { + this.batchSize = config.get(FlinkConfigOptions.SOURCE_READER_FETCH_BATCH_RECORD_COUNT); + } + + @Override + public CloseableIterator>> batch( + String splitId, DataIterator dataIterator) { + return new ListBatchIterator(splitId, dataIterator); + } + + private class ListBatchIterator + implements CloseableIterator>> { + + private final String splitId; + private final DataIterator inputIterator; + + ListBatchIterator(String splitId, DataIterator inputIterator) { + this.splitId = splitId; + this.inputIterator = inputIterator; + } + + @Override + public boolean hasNext() { + return inputIterator.hasNext(); + } + + @Override + public RecordsWithSplitIds> next() { + if (!inputIterator.hasNext()) { + throw new NoSuchElementException(); + } + + final List batch = Lists.newArrayListWithCapacity(batchSize); + int recordCount = 0; + while (inputIterator.hasNext() && recordCount < batchSize) { + T nextRecord = inputIterator.next(); + batch.add(nextRecord); + recordCount++; + if (!inputIterator.currentFileHasNext()) { + // break early so that records have the same fileOffset. + break; + } + } + + return ListBatchRecords.forRecords( + splitId, batch, inputIterator.fileOffset(), inputIterator.recordOffset() - recordCount); + } + + @Override + public void close() throws IOException { + if (inputIterator != null) { + inputIterator.close(); + } + } + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/MetaDataReaderFunction.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/MetaDataReaderFunction.java new file mode 100644 index 000000000000..fb4466913b90 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/MetaDataReaderFunction.java @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source.reader; + +import org.apache.flink.annotation.Internal; +import org.apache.flink.configuration.ReadableConfig; +import org.apache.flink.table.data.RowData; +import org.apache.iceberg.Schema; +import org.apache.iceberg.encryption.EncryptionManager; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.source.DataIterator; +import org.apache.iceberg.flink.source.DataTaskReader; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; +import org.apache.iceberg.io.FileIO; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; + +/** Reading metadata tables (like snapshots, manifests, etc.) */ +@Internal +public class MetaDataReaderFunction extends DataIteratorReaderFunction { + private final Schema readSchema; + private final FileIO io; + private final EncryptionManager encryption; + + public MetaDataReaderFunction( + ReadableConfig config, + Schema tableSchema, + Schema projectedSchema, + FileIO io, + EncryptionManager encryption) { + super( + new ArrayPoolDataIteratorBatcher<>( + config, + new RowDataRecordFactory( + FlinkSchemaUtil.convert(readSchema(tableSchema, projectedSchema))))); + this.readSchema = readSchema(tableSchema, projectedSchema); + this.io = io; + this.encryption = encryption; + } + + @Override + public DataIterator createDataIterator(IcebergSourceSplit split) { + return new DataIterator<>(new DataTaskReader(readSchema), split.task(), io, encryption); + } + + private static Schema readSchema(Schema tableSchema, Schema projectedSchema) { + Preconditions.checkNotNull(tableSchema, "Table schema can't be null"); + return projectedSchema == null ? tableSchema : projectedSchema; + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/ReaderFunction.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/ReaderFunction.java new file mode 100644 index 000000000000..1ea91f10b4e7 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/ReaderFunction.java @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source.reader; + +import java.io.Serializable; +import java.util.function.Function; +import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; +import org.apache.iceberg.io.CloseableIterator; + +@FunctionalInterface +public interface ReaderFunction + extends Serializable, + Function< + IcebergSourceSplit, CloseableIterator>>> {} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordAndPosition.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordAndPosition.java new file mode 100644 index 000000000000..6ac92592b6aa --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordAndPosition.java @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.reader; + +import org.apache.flink.annotation.Internal; + +/** + * A record along with the reader position to be stored in the checkpoint. + * + *
<p>
    The position defines the point in the reader AFTER the record. Record processing and updating + * checkpointed state happens atomically. The position points to where the reader should resume + * after this record is processed. + * + *
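+ * <p>A reuse sketch for illustration only (the {@code fileOffset}, {@code startingRecordOffset} and
+ * {@code row} values are assumed); it mirrors how {@code ListBatchRecords} in this package positions records:
+ *
+ * <pre>{@code
+ * RecordAndPosition<RowData> reuse = new RecordAndPosition<>();
+ * reuse.set(null, fileOffset, startingRecordOffset); // position before the first record of the batch
+ * reuse.record(row);                                 // sets the record and increments recordOffset by one
+ * }</pre>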
<p>
    This mutable object is useful in cases where only one instance of a {@code RecordAndPosition} + * is needed at a time. Then the same instance of RecordAndPosition can be reused. + */ +@Internal +public class RecordAndPosition { + private T record; + private int fileOffset; + private long recordOffset; + + public RecordAndPosition(T record, int fileOffset, long recordOffset) { + this.record = record; + this.fileOffset = fileOffset; + this.recordOffset = recordOffset; + } + + public RecordAndPosition() {} + + // ------------------------------------------------------------------------ + + public T record() { + return record; + } + + public int fileOffset() { + return fileOffset; + } + + public long recordOffset() { + return recordOffset; + } + + /** Updates the record and position in this object. */ + public void set(T newRecord, int newFileOffset, long newRecordOffset) { + this.record = newRecord; + this.fileOffset = newFileOffset; + this.recordOffset = newRecordOffset; + } + + /** Sets the next record of a sequence. This increments the {@code recordOffset} by one. */ + public void record(T nextRecord) { + this.record = nextRecord; + this.recordOffset++; + } + + @Override + public String toString() { + return String.format("%s @ %d + %d", record, fileOffset, recordOffset); + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordFactory.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordFactory.java new file mode 100644 index 000000000000..ef92e2e6b81f --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordFactory.java @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.reader; + +import java.io.Serializable; + +/** + * In FLIP-27 source, SplitReader#fetch() returns a batch of records. Since DataIterator for RowData + * returns an iterator of reused RowData objects, RecordFactory is needed to (1) create object array + * that is recyclable via pool. (2) clone RowData element from DataIterator to the batch array. 
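+ * <p>A minimal sketch for illustration only ({@code rowType} and {@code reusedRow} are assumed), using the
+ * RowData implementation in this package:
+ *
+ * <pre>{@code
+ * RecordFactory<RowData> factory = new RowDataRecordFactory(rowType);
+ * RowData[] batch = factory.createBatch(32); // recyclable array backed by GenericRowData instances
+ * factory.clone(reusedRow, batch, 0);        // deep-copies the reused record into slot 0
+ * }</pre>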
+ */ +interface RecordFactory extends Serializable { + /** Create a batch of records */ + T[] createBatch(int batchSize); + + /** Clone record into the specified position of the batch array */ + void clone(T from, T[] batch, int position); +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordLimiter.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordLimiter.java new file mode 100644 index 000000000000..f260a53089ff --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordLimiter.java @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.reader; + +import java.util.concurrent.atomic.AtomicLong; +import org.apache.flink.annotation.Internal; + +@Internal +class RecordLimiter { + private final long limit; + private final AtomicLong counter; + + static RecordLimiter create(long limit) { + return new RecordLimiter(limit); + } + + private RecordLimiter(long limit) { + this.limit = limit; + this.counter = new AtomicLong(0); + } + + public boolean reachedLimit() { + return limit > 0 && counter.get() >= limit; + } + + public void increment() { + counter.incrementAndGet(); + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataReaderFunction.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataReaderFunction.java new file mode 100644 index 000000000000..c9208a0e1834 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataReaderFunction.java @@ -0,0 +1,115 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source.reader; + +import java.util.List; +import org.apache.flink.configuration.ReadableConfig; +import org.apache.flink.table.data.RowData; +import org.apache.iceberg.Schema; +import org.apache.iceberg.encryption.EncryptionManager; +import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.source.DataIterator; +import org.apache.iceberg.flink.source.RowDataFileScanTaskReader; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; +import org.apache.iceberg.io.FileIO; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; + +public class RowDataReaderFunction extends DataIteratorReaderFunction { + private final Schema tableSchema; + private final Schema readSchema; + private final String nameMapping; + private final boolean caseSensitive; + private final FileIO io; + private final EncryptionManager encryption; + private final List filters; + private final long limit; + + private transient RecordLimiter recordLimiter = null; + + public RowDataReaderFunction( + ReadableConfig config, + Schema tableSchema, + Schema projectedSchema, + String nameMapping, + boolean caseSensitive, + FileIO io, + EncryptionManager encryption, + List filters) { + this( + config, + tableSchema, + projectedSchema, + nameMapping, + caseSensitive, + io, + encryption, + filters, + -1L); + } + + public RowDataReaderFunction( + ReadableConfig config, + Schema tableSchema, + Schema projectedSchema, + String nameMapping, + boolean caseSensitive, + FileIO io, + EncryptionManager encryption, + List filters, + long limit) { + super( + new ArrayPoolDataIteratorBatcher<>( + config, + new RowDataRecordFactory( + FlinkSchemaUtil.convert(readSchema(tableSchema, projectedSchema))))); + this.tableSchema = tableSchema; + this.readSchema = readSchema(tableSchema, projectedSchema); + this.nameMapping = nameMapping; + this.caseSensitive = caseSensitive; + this.io = io; + this.encryption = encryption; + this.filters = filters; + this.limit = limit; + } + + @Override + public DataIterator createDataIterator(IcebergSourceSplit split) { + return new LimitableDataIterator<>( + new RowDataFileScanTaskReader(tableSchema, readSchema, nameMapping, caseSensitive, filters), + split.task(), + io, + encryption, + lazyLimiter()); + } + + private static Schema readSchema(Schema tableSchema, Schema projectedSchema) { + Preconditions.checkNotNull(tableSchema, "Table schema can't be null"); + return projectedSchema == null ? tableSchema : projectedSchema; + } + + /** Lazily create RecordLimiter to avoid the need to make it serializable */ + private RecordLimiter lazyLimiter() { + if (recordLimiter == null) { + this.recordLimiter = RecordLimiter.create(limit); + } + + return recordLimiter; + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataRecordFactory.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataRecordFactory.java new file mode 100644 index 000000000000..40d5c28d7bc7 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataRecordFactory.java @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.reader; + +import org.apache.flink.api.common.typeutils.TypeSerializer; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.runtime.typeutils.InternalSerializers; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.flink.data.RowDataUtil; + +class RowDataRecordFactory implements RecordFactory { + private final RowType rowType; + private final TypeSerializer[] fieldSerializers; + private final RowData.FieldGetter[] fieldGetters; + + RowDataRecordFactory(RowType rowType) { + this.rowType = rowType; + this.fieldSerializers = createFieldSerializers(rowType); + this.fieldGetters = createFieldGetters(rowType); + } + + static TypeSerializer[] createFieldSerializers(RowType rowType) { + return rowType.getChildren().stream() + .map(InternalSerializers::create) + .toArray(TypeSerializer[]::new); + } + + static RowData.FieldGetter[] createFieldGetters(RowType rowType) { + RowData.FieldGetter[] fieldGetters = new RowData.FieldGetter[rowType.getFieldCount()]; + for (int i = 0; i < rowType.getFieldCount(); ++i) { + fieldGetters[i] = RowData.createFieldGetter(rowType.getTypeAt(i), i); + } + + return fieldGetters; + } + + @Override + public RowData[] createBatch(int batchSize) { + RowData[] arr = new RowData[batchSize]; + for (int i = 0; i < batchSize; ++i) { + arr[i] = new GenericRowData(rowType.getFieldCount()); + } + return arr; + } + + @Override + public void clone(RowData from, RowData[] batch, int position) { + // Set the return value from RowDataUtil.clone back to the array. + // Clone method returns same clone target object (reused) if it is a GenericRowData. + // Clone method will allocate a new GenericRowData object + // if the target object is NOT a GenericRowData. + // So we should always set the clone return value back to the array. + batch[position] = + RowDataUtil.clone(from, batch[position], rowType, fieldSerializers, fieldGetters); + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/SerializableRecordEmitter.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/SerializableRecordEmitter.java new file mode 100644 index 000000000000..a6e2c1dae243 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/SerializableRecordEmitter.java @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.reader; + +import java.io.Serializable; +import org.apache.flink.annotation.Internal; +import org.apache.flink.connector.base.source.reader.RecordEmitter; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; + +@Internal +@FunctionalInterface +public interface SerializableRecordEmitter + extends RecordEmitter, T, IcebergSourceSplit>, Serializable { + static SerializableRecordEmitter defaultEmitter() { + return (element, output, split) -> { + output.collect(element.record()); + split.updatePosition(element.fileOffset(), element.recordOffset()); + }; + } + + static SerializableRecordEmitter emitterWithWatermark(SplitWatermarkExtractor extractor) { + return new WatermarkExtractorRecordEmitter<>(extractor); + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/SplitWatermarkExtractor.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/SplitWatermarkExtractor.java new file mode 100644 index 000000000000..d1c50ac8ca52 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/SplitWatermarkExtractor.java @@ -0,0 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.reader; + +import java.io.Serializable; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; + +/** The interface used to extract watermarks from splits. */ +public interface SplitWatermarkExtractor extends Serializable { + /** Get the watermark for a split. */ + long extractWatermark(IcebergSourceSplit split); +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/WatermarkExtractorRecordEmitter.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/WatermarkExtractorRecordEmitter.java new file mode 100644 index 000000000000..02ef57d344b1 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/reader/WatermarkExtractorRecordEmitter.java @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.reader; + +import org.apache.flink.api.common.eventtime.Watermark; +import org.apache.flink.api.connector.source.SourceOutput; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Emitter which emits the watermarks, records and updates the split position. + * + *
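+ * <p>Illustrative wiring only (the {@code schema} variable and the column name are assumed); the emitter is
+ * obtained through {@link SerializableRecordEmitter#emitterWithWatermark(SplitWatermarkExtractor)}:
+ *
+ * <pre>{@code
+ * SplitWatermarkExtractor extractor =
+ *     new ColumnStatsWatermarkExtractor(schema, "event_ts", TimeUnit.MILLISECONDS);
+ * SerializableRecordEmitter<RowData> emitter = SerializableRecordEmitter.emitterWithWatermark(extractor);
+ * }</pre>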
<p>
    The Emitter emits watermarks at the beginning of every split provided by the {@link + * SplitWatermarkExtractor}. + */ +class WatermarkExtractorRecordEmitter implements SerializableRecordEmitter { + private static final Logger LOG = LoggerFactory.getLogger(WatermarkExtractorRecordEmitter.class); + private final SplitWatermarkExtractor timeExtractor; + private String lastSplitId = null; + private long watermark; + + WatermarkExtractorRecordEmitter(SplitWatermarkExtractor timeExtractor) { + this.timeExtractor = timeExtractor; + } + + @Override + public void emitRecord( + RecordAndPosition element, SourceOutput output, IcebergSourceSplit split) { + if (!split.splitId().equals(lastSplitId)) { + long newWatermark = timeExtractor.extractWatermark(split); + if (newWatermark < watermark) { + LOG.info( + "Received a new split with lower watermark. Previous watermark = {}, current watermark = {}, previous split = {}, current split = {}", + watermark, + newWatermark, + lastSplitId, + split.splitId()); + } else { + watermark = newWatermark; + output.emitWatermark(new Watermark(watermark)); + LOG.debug("Watermark = {} emitted based on split = {}", watermark, lastSplitId); + } + + lastSplitId = split.splitId(); + } + + output.collect(element.record()); + split.updatePosition(element.fileOffset(), element.recordOffset()); + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplit.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplit.java new file mode 100644 index 000000000000..344f64833b62 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplit.java @@ -0,0 +1,220 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source.split; + +import java.io.IOException; +import java.io.Serializable; +import java.util.Collection; +import java.util.List; +import java.util.stream.Collectors; +import javax.annotation.Nullable; +import org.apache.flink.annotation.Internal; +import org.apache.flink.api.connector.source.SourceSplit; +import org.apache.flink.core.memory.DataInputDeserializer; +import org.apache.flink.core.memory.DataOutputSerializer; +import org.apache.flink.util.InstantiationUtil; +import org.apache.iceberg.BaseCombinedScanTask; +import org.apache.iceberg.CombinedScanTask; +import org.apache.iceberg.FileScanTask; +import org.apache.iceberg.ScanTaskParser; +import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Iterables; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; + +@Internal +public class IcebergSourceSplit implements SourceSplit, Serializable { + private static final long serialVersionUID = 1L; + private static final ThreadLocal SERIALIZER_CACHE = + ThreadLocal.withInitial(() -> new DataOutputSerializer(1024)); + + private final CombinedScanTask task; + + private int fileOffset; + private long recordOffset; + + // The splits are frequently serialized into checkpoints. + // Caching the byte representation makes repeated serialization cheap. + @Nullable private transient byte[] serializedBytesCache; + + private IcebergSourceSplit(CombinedScanTask task, int fileOffset, long recordOffset) { + this.task = task; + this.fileOffset = fileOffset; + this.recordOffset = recordOffset; + } + + public static IcebergSourceSplit fromCombinedScanTask(CombinedScanTask combinedScanTask) { + return fromCombinedScanTask(combinedScanTask, 0, 0L); + } + + public static IcebergSourceSplit fromCombinedScanTask( + CombinedScanTask combinedScanTask, int fileOffset, long recordOffset) { + return new IcebergSourceSplit(combinedScanTask, fileOffset, recordOffset); + } + + public CombinedScanTask task() { + return task; + } + + public int fileOffset() { + return fileOffset; + } + + public long recordOffset() { + return recordOffset; + } + + @Override + public String splitId() { + return MoreObjects.toStringHelper(this).add("files", toString(task.files())).toString(); + } + + public void updatePosition(int newFileOffset, long newRecordOffset) { + // invalidate the cache after position change + serializedBytesCache = null; + fileOffset = newFileOffset; + recordOffset = newRecordOffset; + } + + @Override + public String toString() { + return MoreObjects.toStringHelper(this) + .add("files", toString(task.files())) + .add("fileOffset", fileOffset) + .add("recordOffset", recordOffset) + .toString(); + } + + private String toString(Collection files) { + return Iterables.toString( + files.stream() + .map( + fileScanTask -> + MoreObjects.toStringHelper(fileScanTask) + .add("file", fileScanTask.file().path().toString()) + .add("start", fileScanTask.start()) + .add("length", fileScanTask.length()) + .toString()) + .collect(Collectors.toList())); + } + + byte[] serializeV1() throws IOException { + if (serializedBytesCache == null) { + serializedBytesCache = InstantiationUtil.serializeObject(this); + } + + return serializedBytesCache; + } + + static IcebergSourceSplit deserializeV1(byte[] serialized) throws IOException { + try { + return InstantiationUtil.deserializeObject( + serialized, 
IcebergSourceSplit.class.getClassLoader()); + } catch (ClassNotFoundException e) { + throw new RuntimeException("Failed to deserialize the split.", e); + } + } + + byte[] serializeV2() throws IOException { + return serialize(2); + } + + byte[] serializeV3() throws IOException { + return serialize(3); + } + + private byte[] serialize(int version) throws IOException { + if (serializedBytesCache == null) { + DataOutputSerializer out = SERIALIZER_CACHE.get(); + Collection fileScanTasks = task.tasks(); + Preconditions.checkArgument( + fileOffset >= 0 && fileOffset < fileScanTasks.size(), + "Invalid file offset: %s. Should be within the range of [0, %s)", + fileOffset, + fileScanTasks.size()); + + out.writeInt(fileOffset); + out.writeLong(recordOffset); + out.writeInt(fileScanTasks.size()); + + for (FileScanTask fileScanTask : fileScanTasks) { + String taskJson = ScanTaskParser.toJson(fileScanTask); + writeTaskJson(out, taskJson, version); + } + + serializedBytesCache = out.getCopyOfBuffer(); + out.clear(); + } + + return serializedBytesCache; + } + + private static void writeTaskJson(DataOutputSerializer out, String taskJson, int version) + throws IOException { + switch (version) { + case 2: + out.writeUTF(taskJson); + break; + case 3: + SerializerHelper.writeLongUTF(out, taskJson); + break; + default: + throw new IllegalArgumentException("Unsupported version: " + version); + } + } + + static IcebergSourceSplit deserializeV2(byte[] serialized, boolean caseSensitive) + throws IOException { + return deserialize(serialized, caseSensitive, 2); + } + + static IcebergSourceSplit deserializeV3(byte[] serialized, boolean caseSensitive) + throws IOException { + return deserialize(serialized, caseSensitive, 3); + } + + private static IcebergSourceSplit deserialize( + byte[] serialized, boolean caseSensitive, int version) throws IOException { + DataInputDeserializer in = new DataInputDeserializer(serialized); + int fileOffset = in.readInt(); + long recordOffset = in.readLong(); + int taskCount = in.readInt(); + + List tasks = Lists.newArrayListWithCapacity(taskCount); + for (int i = 0; i < taskCount; ++i) { + String taskJson = readTaskJson(in, version); + FileScanTask task = ScanTaskParser.fromJson(taskJson, caseSensitive); + tasks.add(task); + } + + CombinedScanTask combinedScanTask = new BaseCombinedScanTask(tasks); + return IcebergSourceSplit.fromCombinedScanTask(combinedScanTask, fileOffset, recordOffset); + } + + private static String readTaskJson(DataInputDeserializer in, int version) throws IOException { + switch (version) { + case 2: + return in.readUTF(); + case 3: + return SerializerHelper.readLongUTF(in); + default: + throw new IllegalArgumentException("Unsupported version: " + version); + } + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitSerializer.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitSerializer.java new file mode 100644 index 000000000000..d4b0f9e1977d --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitSerializer.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.split; + +import java.io.IOException; +import org.apache.flink.annotation.Internal; +import org.apache.flink.core.io.SimpleVersionedSerializer; + +@Internal +public class IcebergSourceSplitSerializer implements SimpleVersionedSerializer { + private static final int VERSION = 3; + + private final boolean caseSensitive; + + public IcebergSourceSplitSerializer(boolean caseSensitive) { + this.caseSensitive = caseSensitive; + } + + @Override + public int getVersion() { + return VERSION; + } + + @Override + public byte[] serialize(IcebergSourceSplit split) throws IOException { + return split.serializeV3(); + } + + @Override + public IcebergSourceSplit deserialize(int version, byte[] serialized) throws IOException { + switch (version) { + case 1: + return IcebergSourceSplit.deserializeV1(serialized); + case 2: + return IcebergSourceSplit.deserializeV2(serialized, caseSensitive); + case 3: + return IcebergSourceSplit.deserializeV3(serialized, caseSensitive); + default: + throw new IOException( + String.format( + "Failed to deserialize IcebergSourceSplit. " + + "Encountered unsupported version: %d. Supported version are [1]", + version)); + } + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitState.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitState.java new file mode 100644 index 000000000000..d9061e049e00 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitState.java @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
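A minimal round-trip sketch (not part of this change) for the versioned split serializer above, assuming a Table whose scan yields at least one CombinedScanTask; the class and method names below are illustrative only.

package org.apache.iceberg.flink.source.split;

import java.io.IOException;
import org.apache.iceberg.CombinedScanTask;
import org.apache.iceberg.Table;
import org.apache.iceberg.io.CloseableIterable;

class SplitSerializerRoundTrip {
  // Serializes the first split of a table scan with the current (V3) format and reads it back.
  static IcebergSourceSplit roundTrip(Table table) throws IOException {
    IcebergSourceSplitSerializer serializer = new IcebergSourceSplitSerializer(true /* caseSensitive */);
    try (CloseableIterable<CombinedScanTask> tasks = table.newScan().planTasks()) {
      IcebergSourceSplit split = IcebergSourceSplit.fromCombinedScanTask(tasks.iterator().next());
      byte[] bytes = serializer.serialize(split);
      // getVersion() returns 3, so deserialize() dispatches to IcebergSourceSplit.deserializeV3().
      return serializer.deserialize(serializer.getVersion(), bytes);
    }
  }
}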
+ */ +package org.apache.iceberg.flink.source.split; + +public class IcebergSourceSplitState { + private final IcebergSourceSplit split; + private final IcebergSourceSplitStatus status; + + public IcebergSourceSplitState(IcebergSourceSplit split, IcebergSourceSplitStatus status) { + this.split = split; + this.status = status; + } + + public IcebergSourceSplit split() { + return split; + } + + public IcebergSourceSplitStatus status() { + return status; + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitStatus.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitStatus.java new file mode 100644 index 000000000000..d4a84a165e1a --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitStatus.java @@ -0,0 +1,25 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.split; + +public enum IcebergSourceSplitStatus { + UNASSIGNED, + ASSIGNED, + COMPLETED +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/SerializableComparator.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/SerializableComparator.java new file mode 100644 index 000000000000..319648ca275c --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/SerializableComparator.java @@ -0,0 +1,24 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
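For illustration only (not part of this change), a split is typically paired with a status for enumerator bookkeeping; the helper class below and its name are made up.

package org.apache.iceberg.flink.source.split;

class SplitStateExample {
  // A newly discovered split starts out unassigned; its status changes as it is handed to readers.
  static IcebergSourceSplitState newlyDiscovered(IcebergSourceSplit split) {
    return new IcebergSourceSplitState(split, IcebergSourceSplitStatus.UNASSIGNED);
  }
}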
+ */ +package org.apache.iceberg.flink.source.split; + +import java.io.Serializable; +import java.util.Comparator; + +public interface SerializableComparator extends Comparator, Serializable {} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/SerializerHelper.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/SerializerHelper.java new file mode 100644 index 000000000000..841969666ee5 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/SerializerHelper.java @@ -0,0 +1,206 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.split; + +import java.io.IOException; +import java.io.Serializable; +import java.io.UTFDataFormatException; +import org.apache.flink.core.memory.DataInputDeserializer; +import org.apache.flink.core.memory.DataOutputSerializer; + +/** + * Helper class to serialize and deserialize strings longer than 65K. The inspiration is mostly + * taken from the class org.apache.flink.core.memory.DataInputSerializer.readUTF and + * org.apache.flink.core.memory.DataOutputSerializer.writeUTF. + */ +class SerializerHelper implements Serializable { + + private SerializerHelper() {} + + /** + * Similar to {@link DataOutputSerializer#writeUTF(String)}. Except this supports larger payloads + * which is up to max integer value. + * + *
<p>
    Note: this method can be removed once an equivalent method becomes available in {@link + * DataOutputSerializer}; that is expected with the Flink 1.20 release, so use the Flink method instead once it is available. + * + *
<p>
    See * FLINK-34228 * https://github.com/apache/flink/pull/24191 + * + * @param out the output stream to write the string to. + * @param str the string value to be written. + */ + public static void writeLongUTF(DataOutputSerializer out, String str) throws IOException { + int strlen = str.length(); + long utflen = 0; + int ch; + + /* use charAt instead of copying String to char array */ + for (int i = 0; i < strlen; i++) { + ch = str.charAt(i); + utflen += getUTFBytesSize(ch); + + if (utflen > Integer.MAX_VALUE) { + throw new UTFDataFormatException("Encoded string reached maximum length: " + utflen); + } + } + + if (utflen > Integer.MAX_VALUE - 4) { + throw new UTFDataFormatException("Encoded string is too long: " + utflen); + } + + out.writeInt((int) utflen); + writeUTFBytes(out, str, (int) utflen); + } + + /** + * Similar to {@link DataInputDeserializer#readUTF()}. Except this supports larger payloads which + * is up to max integer value. + * + *
<p>
    Note: this method can be removed once an equivalent method becomes available in {@link + * DataOutputSerializer}; that is expected with the Flink 1.20 release, so use the Flink method instead once it is available. + * + *
<p>
    See * FLINK-34228 * https://github.com/apache/flink/pull/24191 + * + * @param in the input stream to read the string from. + * @return the string value read from the input stream. + * @throws IOException if an I/O error occurs when reading from the input stream. + */ + public static String readLongUTF(DataInputDeserializer in) throws IOException { + int utflen = in.readInt(); + byte[] bytearr = new byte[utflen]; + char[] chararr = new char[utflen]; + + int ch; + int char2; + int char3; + int count = 0; + int chararrCount = 0; + + in.readFully(bytearr, 0, utflen); + + while (count < utflen) { + ch = (int) bytearr[count] & 0xff; + if (ch > 127) { + break; + } + count++; + chararr[chararrCount++] = (char) ch; + } + + while (count < utflen) { + ch = (int) bytearr[count] & 0xff; + switch (ch >> 4) { + case 0: + case 1: + case 2: + case 3: + case 4: + case 5: + case 6: + case 7: + /* 0xxxxxxx */ + count++; + chararr[chararrCount++] = (char) ch; + break; + case 12: + case 13: + /* 110x xxxx 10xx xxxx */ + count += 2; + if (count > utflen) { + throw new UTFDataFormatException("malformed input: partial character at end"); + } + char2 = bytearr[count - 1]; + if ((char2 & 0xC0) != 0x80) { + throw new UTFDataFormatException("malformed input around byte " + count); + } + chararr[chararrCount++] = (char) (((ch & 0x1F) << 6) | (char2 & 0x3F)); + break; + case 14: + /* 1110 xxxx 10xx xxxx 10xx xxxx */ + count += 3; + if (count > utflen) { + throw new UTFDataFormatException("malformed input: partial character at end"); + } + char2 = bytearr[count - 2]; + char3 = bytearr[count - 1]; + if (((char2 & 0xC0) != 0x80) || ((char3 & 0xC0) != 0x80)) { + throw new UTFDataFormatException("malformed input around byte " + (count - 1)); + } + chararr[chararrCount++] = + (char) (((ch & 0x0F) << 12) | ((char2 & 0x3F) << 6) | (char3 & 0x3F)); + break; + default: + /* 10xx xxxx, 1111 xxxx */ + throw new UTFDataFormatException("malformed input around byte " + count); + } + } + // The number of chars produced may be less than utflen + return new String(chararr, 0, chararrCount); + } + + private static int getUTFBytesSize(int ch) { + if ((ch >= 0x0001) && (ch <= 0x007F)) { + return 1; + } else if (ch > 0x07FF) { + return 3; + } else { + return 2; + } + } + + private static void writeUTFBytes(DataOutputSerializer out, String str, int utflen) + throws IOException { + int strlen = str.length(); + int ch; + + int len = Math.max(1024, utflen); + + byte[] bytearr = new byte[len]; + int count = 0; + + int index; + for (index = 0; index < strlen; index++) { + ch = str.charAt(index); + if (!((ch >= 0x0001) && (ch <= 0x007F))) { + break; + } + bytearr[count++] = (byte) ch; + } + + for (; index < strlen; index++) { + ch = str.charAt(index); + if ((ch >= 0x0001) && (ch <= 0x007F)) { + bytearr[count++] = (byte) ch; + } else if (ch > 0x07FF) { + bytearr[count++] = (byte) (0xE0 | ((ch >> 12) & 0x0F)); + bytearr[count++] = (byte) (0x80 | ((ch >> 6) & 0x3F)); + bytearr[count++] = (byte) (0x80 | (ch & 0x3F)); + } else { + bytearr[count++] = (byte) (0xC0 | ((ch >> 6) & 0x1F)); + bytearr[count++] = (byte) (0x80 | (ch & 0x3F)); + } + } + + out.write(bytearr, 0, count); + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/SplitComparators.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/SplitComparators.java new file mode 100644 index 000000000000..56ee92014d12 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/SplitComparators.java @@ -0,0 
+1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.split; + +import org.apache.iceberg.flink.source.reader.SplitWatermarkExtractor; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; + +/** + * Provides implementations of {@link org.apache.iceberg.flink.source.split.SerializableComparator} + * which could be used for ordering splits. These are used by the {@link + * org.apache.iceberg.flink.source.assigner.OrderedSplitAssignerFactory} and the {@link + * org.apache.iceberg.flink.source.reader.IcebergSourceReader} + */ +public class SplitComparators { + private SplitComparators() {} + + /** Comparator which orders the splits based on the file sequence number of the data files */ + public static SerializableComparator fileSequenceNumber() { + return (IcebergSourceSplit o1, IcebergSourceSplit o2) -> { + Preconditions.checkArgument( + o1.task().files().size() == 1 && o2.task().files().size() == 1, + "Could not compare combined task. Please use 'split-open-file-cost' to prevent combining multiple files to a split"); + + Long seq1 = o1.task().files().iterator().next().file().fileSequenceNumber(); + Long seq2 = o2.task().files().iterator().next().file().fileSequenceNumber(); + + Preconditions.checkNotNull( + seq1, + "Invalid file sequence number: null. Doesn't support splits written with V1 format: %s", + o1); + Preconditions.checkNotNull( + seq2, + "Invalid file sequence number: null. Doesn't support splits written with V1 format: %s", + o2); + + int temp = Long.compare(seq1, seq2); + if (temp != 0) { + return temp; + } else { + return o1.splitId().compareTo(o2.splitId()); + } + }; + } + + /** Comparator which orders the splits based on watermark of the splits */ + public static SerializableComparator watermark( + SplitWatermarkExtractor watermarkExtractor) { + return (IcebergSourceSplit o1, IcebergSourceSplit o2) -> { + long watermark1 = watermarkExtractor.extractWatermark(o1); + long watermark2 = watermarkExtractor.extractWatermark(o2); + + int temp = Long.compare(watermark1, watermark2); + if (temp != 0) { + return temp; + } else { + return o1.splitId().compareTo(o2.splitId()); + } + }; + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/SplitRequestEvent.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/SplitRequestEvent.java new file mode 100644 index 000000000000..eabd757aa638 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/source/split/SplitRequestEvent.java @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
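Two short usage sketches (not part of this change) for the helpers above: a round trip through the long-UTF routines, which accept task JSON longer than the 64K limit of writeUTF, and ordering splits with the comparator factory. The SplitHelpersExample class is illustrative only.

package org.apache.iceberg.flink.source.split;

import java.io.IOException;
import java.util.List;
import org.apache.flink.core.memory.DataInputDeserializer;
import org.apache.flink.core.memory.DataOutputSerializer;

class SplitHelpersExample {
  // Round-trips a (possibly larger than 64K) task JSON string through the long-UTF helpers.
  static String roundTripTaskJson(String taskJson) throws IOException {
    DataOutputSerializer out = new DataOutputSerializer(1024);
    SerializerHelper.writeLongUTF(out, taskJson);
    DataInputDeserializer in = new DataInputDeserializer(out.getCopyOfBuffer());
    return SerializerHelper.readLongUTF(in);
  }

  // Orders splits by data file sequence number; the comparator requires single-file splits,
  // which is what the Preconditions check in SplitComparators enforces.
  static void sortBySequenceNumber(List<IcebergSourceSplit> splits) {
    splits.sort(SplitComparators.fileSequenceNumber());
  }
}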
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.split; + +import java.util.Collection; +import java.util.Collections; +import org.apache.flink.annotation.Internal; +import org.apache.flink.api.connector.source.SourceEvent; + +/** We can remove this class once FLINK-21364 is resolved. */ +@Internal +public class SplitRequestEvent implements SourceEvent { + private static final long serialVersionUID = 1L; + + private final Collection finishedSplitIds; + private final String requesterHostname; + + public SplitRequestEvent() { + this(Collections.emptyList()); + } + + public SplitRequestEvent(Collection finishedSplitIds) { + this(finishedSplitIds, null); + } + + public SplitRequestEvent(Collection finishedSplitIds, String requesterHostname) { + this.finishedSplitIds = finishedSplitIds; + this.requesterHostname = requesterHostname; + } + + public Collection finishedSplitIds() { + return finishedSplitIds; + } + + public String requesterHostname() { + return requesterHostname; + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/util/ElapsedTimeGauge.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/util/ElapsedTimeGauge.java new file mode 100644 index 000000000000..6306e82d5729 --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/util/ElapsedTimeGauge.java @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.util; + +import java.util.concurrent.TimeUnit; +import org.apache.flink.annotation.Internal; +import org.apache.flink.metrics.Gauge; + +/** + * This gauge measures the elapsed time between now and last recorded time set by {@link + * ElapsedTimeGauge#refreshLastRecordedTime()}. 
+ */ +@Internal +public class ElapsedTimeGauge implements Gauge { + private final TimeUnit reportUnit; + private volatile long lastRecordedTimeNano; + + public ElapsedTimeGauge(TimeUnit timeUnit) { + this.reportUnit = timeUnit; + refreshLastRecordedTime(); + } + + public void refreshLastRecordedTime() { + this.lastRecordedTimeNano = System.nanoTime(); + } + + @Override + public Long getValue() { + return reportUnit.convert(System.nanoTime() - lastRecordedTimeNano, TimeUnit.NANOSECONDS); + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/util/FlinkAlterTableUtil.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/util/FlinkAlterTableUtil.java new file mode 100644 index 000000000000..2bbc9cf208fe --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/util/FlinkAlterTableUtil.java @@ -0,0 +1,248 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.util; + +import java.util.List; +import java.util.Map; +import org.apache.flink.table.catalog.Column; +import org.apache.flink.table.catalog.TableChange; +import org.apache.flink.table.catalog.UniqueConstraint; +import org.apache.iceberg.Table; +import org.apache.iceberg.Transaction; +import org.apache.iceberg.UpdateProperties; +import org.apache.iceberg.UpdateSchema; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.types.Type; + +public class FlinkAlterTableUtil { + private FlinkAlterTableUtil() {} + + public static void commitChanges( + Table table, + String setLocation, + String setSnapshotId, + String pickSnapshotId, + Map setProperties) { + commitManageSnapshots(table, setSnapshotId, pickSnapshotId); + + Transaction transaction = table.newTransaction(); + + if (setLocation != null) { + transaction.updateLocation().setLocation(setLocation).commit(); + } + + if (!setProperties.isEmpty()) { + UpdateProperties updateProperties = transaction.updateProperties(); + setProperties.forEach( + (k, v) -> { + if (v == null) { + updateProperties.remove(k); + } else { + updateProperties.set(k, v); + } + }); + updateProperties.commit(); + } + + transaction.commitTransaction(); + } + + public static void commitChanges( + Table table, + String setLocation, + String setSnapshotId, + String pickSnapshotId, + List schemaChanges, + List propertyChanges) { + commitManageSnapshots(table, setSnapshotId, pickSnapshotId); + + Transaction transaction = table.newTransaction(); + + if (setLocation != null) { + transaction.updateLocation().setLocation(setLocation).commit(); + } + + if (!schemaChanges.isEmpty()) { + UpdateSchema updateSchema = transaction.updateSchema(); + FlinkAlterTableUtil.applySchemaChanges(updateSchema, 
schemaChanges); + updateSchema.commit(); + } + + if (!propertyChanges.isEmpty()) { + UpdateProperties updateProperties = transaction.updateProperties(); + FlinkAlterTableUtil.applyPropertyChanges(updateProperties, propertyChanges); + updateProperties.commit(); + } + + transaction.commitTransaction(); + } + + public static void commitManageSnapshots( + Table table, String setSnapshotId, String cherrypickSnapshotId) { + // don't allow setting the snapshot and picking a commit at the same time because order is + // ambiguous and choosing one order leads to different results + Preconditions.checkArgument( + setSnapshotId == null || cherrypickSnapshotId == null, + "Cannot set the current snapshot ID and cherry-pick snapshot changes"); + + if (setSnapshotId != null) { + long newSnapshotId = Long.parseLong(setSnapshotId); + table.manageSnapshots().setCurrentSnapshot(newSnapshotId).commit(); + } + + // if updating the table snapshot, perform that update first in case it fails + if (cherrypickSnapshotId != null) { + long newSnapshotId = Long.parseLong(cherrypickSnapshotId); + table.manageSnapshots().cherrypick(newSnapshotId).commit(); + } + } + + /** + * Applies a list of Flink table changes to an {@link UpdateSchema} operation. + * + * @param pendingUpdate an uncommitted UpdateSchema operation to configure + * @param schemaChanges a list of Flink table changes + */ + public static void applySchemaChanges( + UpdateSchema pendingUpdate, List schemaChanges) { + for (TableChange change : schemaChanges) { + if (change instanceof TableChange.AddColumn) { + TableChange.AddColumn addColumn = (TableChange.AddColumn) change; + Column flinkColumn = addColumn.getColumn(); + Preconditions.checkArgument( + FlinkCompatibilityUtil.isPhysicalColumn(flinkColumn), + "Unsupported table change: Adding computed column %s.", + flinkColumn.getName()); + Type icebergType = FlinkSchemaUtil.convert(flinkColumn.getDataType().getLogicalType()); + if (flinkColumn.getDataType().getLogicalType().isNullable()) { + pendingUpdate.addColumn( + flinkColumn.getName(), icebergType, flinkColumn.getComment().orElse(null)); + } else { + pendingUpdate.addRequiredColumn( + flinkColumn.getName(), icebergType, flinkColumn.getComment().orElse(null)); + } + } else if (change instanceof TableChange.ModifyColumn) { + TableChange.ModifyColumn modifyColumn = (TableChange.ModifyColumn) change; + applyModifyColumn(pendingUpdate, modifyColumn); + } else if (change instanceof TableChange.DropColumn) { + TableChange.DropColumn dropColumn = (TableChange.DropColumn) change; + pendingUpdate.deleteColumn(dropColumn.getColumnName()); + } else if (change instanceof TableChange.AddWatermark) { + throw new UnsupportedOperationException("Unsupported table change: AddWatermark."); + } else if (change instanceof TableChange.ModifyWatermark) { + throw new UnsupportedOperationException("Unsupported table change: ModifyWatermark."); + } else if (change instanceof TableChange.DropWatermark) { + throw new UnsupportedOperationException("Unsupported table change: DropWatermark."); + } else if (change instanceof TableChange.AddUniqueConstraint) { + TableChange.AddUniqueConstraint addPk = (TableChange.AddUniqueConstraint) change; + applyUniqueConstraint(pendingUpdate, addPk.getConstraint()); + } else if (change instanceof TableChange.ModifyUniqueConstraint) { + TableChange.ModifyUniqueConstraint modifyPk = (TableChange.ModifyUniqueConstraint) change; + applyUniqueConstraint(pendingUpdate, modifyPk.getNewConstraint()); + } else if (change instanceof 
TableChange.DropConstraint) { + throw new UnsupportedOperationException("Unsupported table change: DropConstraint."); + } else { + throw new UnsupportedOperationException("Cannot apply unknown table change: " + change); + } + } + } + + /** + * Applies a list of Flink table property changes to an {@link UpdateProperties} operation. + * + * @param pendingUpdate an uncommitted UpdateProperty operation to configure + * @param propertyChanges a list of Flink table changes + */ + public static void applyPropertyChanges( + UpdateProperties pendingUpdate, List propertyChanges) { + for (TableChange change : propertyChanges) { + if (change instanceof TableChange.SetOption) { + TableChange.SetOption setOption = (TableChange.SetOption) change; + pendingUpdate.set(setOption.getKey(), setOption.getValue()); + } else if (change instanceof TableChange.ResetOption) { + TableChange.ResetOption resetOption = (TableChange.ResetOption) change; + pendingUpdate.remove(resetOption.getKey()); + } else { + throw new UnsupportedOperationException( + "The given table change is not a property change: " + change); + } + } + } + + private static void applyModifyColumn( + UpdateSchema pendingUpdate, TableChange.ModifyColumn modifyColumn) { + if (modifyColumn instanceof TableChange.ModifyColumnName) { + TableChange.ModifyColumnName modifyName = (TableChange.ModifyColumnName) modifyColumn; + pendingUpdate.renameColumn(modifyName.getOldColumnName(), modifyName.getNewColumnName()); + } else if (modifyColumn instanceof TableChange.ModifyColumnPosition) { + TableChange.ModifyColumnPosition modifyPosition = + (TableChange.ModifyColumnPosition) modifyColumn; + applyModifyColumnPosition(pendingUpdate, modifyPosition); + } else if (modifyColumn instanceof TableChange.ModifyPhysicalColumnType) { + TableChange.ModifyPhysicalColumnType modifyType = + (TableChange.ModifyPhysicalColumnType) modifyColumn; + Type type = FlinkSchemaUtil.convert(modifyType.getNewType().getLogicalType()); + String columnName = modifyType.getOldColumn().getName(); + pendingUpdate.updateColumn(columnName, type.asPrimitiveType()); + if (modifyType.getNewColumn().getDataType().getLogicalType().isNullable()) { + pendingUpdate.makeColumnOptional(columnName); + } else { + pendingUpdate.requireColumn(columnName); + } + } else if (modifyColumn instanceof TableChange.ModifyColumnComment) { + TableChange.ModifyColumnComment modifyComment = + (TableChange.ModifyColumnComment) modifyColumn; + pendingUpdate.updateColumnDoc( + modifyComment.getOldColumn().getName(), modifyComment.getNewComment()); + } else { + throw new UnsupportedOperationException( + "Cannot apply unknown modify-column change: " + modifyColumn); + } + } + + private static void applyModifyColumnPosition( + UpdateSchema pendingUpdate, TableChange.ModifyColumnPosition modifyColumnPosition) { + TableChange.ColumnPosition newPosition = modifyColumnPosition.getNewPosition(); + if (newPosition instanceof TableChange.First) { + pendingUpdate.moveFirst(modifyColumnPosition.getOldColumn().getName()); + } else if (newPosition instanceof TableChange.After) { + TableChange.After after = (TableChange.After) newPosition; + pendingUpdate.moveAfter(modifyColumnPosition.getOldColumn().getName(), after.column()); + } else { + throw new UnsupportedOperationException( + "Cannot apply unknown modify-column-position change: " + modifyColumnPosition); + } + } + + private static void applyUniqueConstraint( + UpdateSchema pendingUpdate, UniqueConstraint constraint) { + switch (constraint.getType()) { + case PRIMARY_KEY: + 
pendingUpdate.setIdentifierFields(constraint.getColumns()); + break; + case UNIQUE_KEY: + throw new UnsupportedOperationException( + "Unsupported table change: setting unique key constraints."); + default: + throw new UnsupportedOperationException( + "Cannot apply unknown unique constraint: " + constraint.getType().name()); + } + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/util/FlinkCompatibilityUtil.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/util/FlinkCompatibilityUtil.java new file mode 100644 index 000000000000..f02af894e82b --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/util/FlinkCompatibilityUtil.java @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.util; + +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.table.api.TableColumn; +import org.apache.flink.table.catalog.Column; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.runtime.typeutils.InternalTypeInfo; +import org.apache.flink.table.types.logical.RowType; + +/** + * This is a small util class that try to hide calls to Flink Internal or PublicEvolve interfaces as + * Flink can change those APIs during minor version release. + */ +public class FlinkCompatibilityUtil { + + private FlinkCompatibilityUtil() {} + + public static TypeInformation toTypeInfo(RowType rowType) { + return InternalTypeInfo.of(rowType); + } + + public static boolean isPhysicalColumn(TableColumn column) { + return column.isPhysical(); + } + + public static boolean isPhysicalColumn(Column column) { + return column.isPhysical(); + } +} diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/util/FlinkPackage.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/util/FlinkPackage.java new file mode 100644 index 000000000000..20b33e615e5f --- /dev/null +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/util/FlinkPackage.java @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
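A minimal sketch (not part of this change) of driving the schema-change translation above directly against an Iceberg Table; the column names and data types are made up for illustration.

package org.apache.iceberg.flink.util;

import java.util.Arrays;
import java.util.List;
import org.apache.flink.table.api.DataTypes;
import org.apache.flink.table.catalog.Column;
import org.apache.flink.table.catalog.TableChange;
import org.apache.iceberg.Table;
import org.apache.iceberg.UpdateSchema;

class AlterTableExample {
  // Translates two Flink ALTER TABLE changes into a single Iceberg schema update and commits it.
  static void addAndDropColumns(Table table) {
    List<TableChange> changes =
        Arrays.asList(
            TableChange.add(Column.physical("event_time", DataTypes.TIMESTAMP(6))),
            TableChange.dropColumn("legacy_field"));

    UpdateSchema updateSchema = table.updateSchema();
    FlinkAlterTableUtil.applySchemaChanges(updateSchema, changes);
    updateSchema.commit();
  }
}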
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.util; + +import java.util.concurrent.atomic.AtomicReference; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; + +public class FlinkPackage { + + private static final AtomicReference VERSION = new AtomicReference<>(); + public static final String FLINK_UNKNOWN_VERSION = "FLINK-UNKNOWN-VERSION"; + + private FlinkPackage() {} + + /** Returns Flink version string like x.y.z */ + public static String version() { + if (null == VERSION.get()) { + String detectedVersion; + try { + detectedVersion = versionFromJar(); + // use unknown version in case exact implementation version can't be found from the jar + // (this can happen if the DataStream class appears multiple times in the same classpath + // such as with shading) + detectedVersion = detectedVersion != null ? detectedVersion : FLINK_UNKNOWN_VERSION; + } catch (Exception e) { + detectedVersion = FLINK_UNKNOWN_VERSION; + } + VERSION.set(detectedVersion); + } + + return VERSION.get(); + } + + @VisibleForTesting + static String versionFromJar() { + // Choose {@link DataStream} class because it is one of the core Flink API + return DataStream.class.getPackage().getImplementationVersion(); + } + + @VisibleForTesting + static void setVersion(String version) { + VERSION.set(version); + } +} diff --git a/flink/v1.19/flink/src/main/resources/META-INF/services/org.apache.flink.table.factories.Factory b/flink/v1.19/flink/src/main/resources/META-INF/services/org.apache.flink.table.factories.Factory new file mode 100644 index 000000000000..29a9955a7e20 --- /dev/null +++ b/flink/v1.19/flink/src/main/resources/META-INF/services/org.apache.flink.table.factories.Factory @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +org.apache.iceberg.flink.FlinkDynamicTableFactory diff --git a/flink/v1.19/flink/src/main/resources/META-INF/services/org.apache.flink.table.factories.TableFactory b/flink/v1.19/flink/src/main/resources/META-INF/services/org.apache.flink.table.factories.TableFactory new file mode 100644 index 000000000000..2b6bfa3cd579 --- /dev/null +++ b/flink/v1.19/flink/src/main/resources/META-INF/services/org.apache.flink.table.factories.TableFactory @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
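A small sketch (not part of this change) of surfacing the detected runtime version above, for example in diagnostics; the helper name is made up.

package org.apache.iceberg.flink.util;

class FlinkRuntimeInfo {
  // Reports the detected Flink version, falling back to the sentinel when detection fails,
  // e.g. when DataStream is shaded and the jar manifest carries no implementation version.
  static String describe() {
    String version = FlinkPackage.version();
    return FlinkPackage.FLINK_UNKNOWN_VERSION.equals(version)
        ? "Flink runtime version could not be detected"
        : "Flink runtime version: " + version;
  }
}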
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +org.apache.iceberg.flink.FlinkCatalogFactory diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/AvroGenericRecordConverterBase.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/AvroGenericRecordConverterBase.java new file mode 100644 index 000000000000..4184526a6a1a --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/AvroGenericRecordConverterBase.java @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink; + +import org.junit.jupiter.api.Test; + +public abstract class AvroGenericRecordConverterBase { + protected abstract void testConverter(DataGenerator dataGenerator) throws Exception; + + @Test + public void testPrimitiveTypes() throws Exception { + testConverter(new DataGenerators.Primitives()); + } + + @Test + public void testStructOfPrimitive() throws Exception { + testConverter(new DataGenerators.StructOfPrimitive()); + } + + @Test + public void testStructOfArray() throws Exception { + testConverter(new DataGenerators.StructOfArray()); + } + + @Test + public void testStructOfMap() throws Exception { + testConverter(new DataGenerators.StructOfMap()); + } + + @Test + public void testStructOfStruct() throws Exception { + testConverter(new DataGenerators.StructOfStruct()); + } + + @Test + public void testArrayOfPrimitive() throws Exception { + testConverter(new DataGenerators.ArrayOfPrimitive()); + } + + @Test + public void testArrayOfArray() throws Exception { + testConverter(new DataGenerators.ArrayOfArray()); + } + + @Test + public void testArrayOfMap() throws Exception { + testConverter(new DataGenerators.ArrayOfMap()); + } + + @Test + public void testArrayOfStruct() throws Exception { + testConverter(new DataGenerators.ArrayOfStruct()); + } + + @Test + public void testMapOfPrimitives() throws Exception { + testConverter(new DataGenerators.MapOfPrimitives()); + } + + @Test + public void testMapOfArray() throws Exception { + testConverter(new DataGenerators.MapOfArray()); + } + + @Test + public void testMapOfMap() throws Exception { + testConverter(new DataGenerators.MapOfMap()); + } + + @Test + public void testMapOfStruct() throws Exception { + testConverter(new DataGenerators.MapOfStruct()); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/CatalogTestBase.java 
b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/CatalogTestBase.java new file mode 100644 index 000000000000..062ff68d5d85 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/CatalogTestBase.java @@ -0,0 +1,121 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink; + +import java.io.File; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import org.apache.flink.util.ArrayUtils; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.iceberg.CatalogProperties; +import org.apache.iceberg.Parameter; +import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.catalog.Catalog; +import org.apache.iceberg.catalog.Namespace; +import org.apache.iceberg.catalog.SupportsNamespaces; +import org.apache.iceberg.hadoop.HadoopCatalog; +import org.apache.iceberg.relocated.com.google.common.base.Joiner; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.extension.ExtendWith; +import org.junit.jupiter.api.io.TempDir; + +@ExtendWith(ParameterizedTestExtension.class) +public abstract class CatalogTestBase extends TestBase { + + protected static final String DATABASE = "db"; + @TempDir protected File hiveWarehouse; + @TempDir protected File hadoopWarehouse; + + @Parameter(index = 0) + protected String catalogName; + + @Parameter(index = 1) + protected Namespace baseNamespace; + + protected Catalog validationCatalog; + protected SupportsNamespaces validationNamespaceCatalog; + protected Map config = Maps.newHashMap(); + + protected String flinkDatabase; + protected Namespace icebergNamespace; + protected boolean isHadoopCatalog; + + @Parameters(name = "catalogName={0}, baseNamespace={1}") + protected static List parameters() { + return Arrays.asList( + new Object[] {"testhive", Namespace.empty()}, + new Object[] {"testhadoop", Namespace.empty()}, + new Object[] {"testhadoop_basenamespace", Namespace.of("l0", "l1")}); + } + + @BeforeEach + public void before() { + this.isHadoopCatalog = catalogName.startsWith("testhadoop"); + this.validationCatalog = + isHadoopCatalog + ? 
new HadoopCatalog(hiveConf, "file:" + hadoopWarehouse.getPath()) + : catalog; + this.validationNamespaceCatalog = (SupportsNamespaces) validationCatalog; + + config.put("type", "iceberg"); + if (!baseNamespace.isEmpty()) { + config.put(FlinkCatalogFactory.BASE_NAMESPACE, baseNamespace.toString()); + } + if (isHadoopCatalog) { + config.put(FlinkCatalogFactory.ICEBERG_CATALOG_TYPE, "hadoop"); + } else { + config.put(FlinkCatalogFactory.ICEBERG_CATALOG_TYPE, "hive"); + config.put(CatalogProperties.URI, getURI(hiveConf)); + } + config.put(CatalogProperties.WAREHOUSE_LOCATION, String.format("file://%s", warehouseRoot())); + + this.flinkDatabase = catalogName + "." + DATABASE; + this.icebergNamespace = + Namespace.of(ArrayUtils.concat(baseNamespace.levels(), new String[] {DATABASE})); + sql("CREATE CATALOG %s WITH %s", catalogName, toWithClause(config)); + } + + @AfterEach + public void clean() { + dropCatalog(catalogName, true); + } + + protected String warehouseRoot() { + if (isHadoopCatalog) { + return hadoopWarehouse.getAbsolutePath(); + } else { + return hiveWarehouse.getAbsolutePath(); + } + } + + protected String getFullQualifiedTableName(String tableName) { + final List levels = Lists.newArrayList(icebergNamespace.levels()); + levels.add(tableName); + return Joiner.on('.').join(levels); + } + + static String getURI(HiveConf conf) { + return conf.get(HiveConf.ConfVars.METASTOREURIS.varname); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/DataGenerator.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/DataGenerator.java new file mode 100644 index 000000000000..b1e3b20ff7ac --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/DataGenerator.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink; + +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.Schema; +import org.apache.iceberg.data.GenericRecord; + +/** + * This interface defines test data generator. Different implementations for primitive and complex + * nested fields are defined in {@link DataGenerators}. 
+ */ +public interface DataGenerator { + Schema icebergSchema(); + + RowType flinkRowType(); + + org.apache.avro.Schema avroSchema(); + + GenericRecord generateIcebergGenericRecord(); + + GenericRowData generateFlinkRowData(); + + org.apache.avro.generic.GenericRecord generateAvroGenericRecord(); +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/DataGenerators.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/DataGenerators.java new file mode 100644 index 000000000000..e2cd411d7069 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/DataGenerators.java @@ -0,0 +1,1172 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink; + +import static org.apache.iceberg.types.Types.NestedField.required; + +import com.fasterxml.jackson.databind.node.IntNode; +import java.math.BigDecimal; +import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; +import java.time.LocalDate; +import java.time.LocalDateTime; +import java.time.LocalTime; +import java.time.OffsetDateTime; +import java.time.ZoneOffset; +import java.util.Arrays; +import java.util.List; +import java.util.UUID; +import java.util.concurrent.TimeUnit; +import java.util.stream.Collectors; +import org.apache.avro.LogicalTypes; +import org.apache.avro.SchemaBuilder; +import org.apache.avro.generic.GenericData; +import org.apache.avro.util.Utf8; +import org.apache.flink.table.data.DecimalData; +import org.apache.flink.table.data.GenericArrayData; +import org.apache.flink.table.data.GenericMapData; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.data.TimestampData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.Schema; +import org.apache.iceberg.avro.AvroSchemaUtil; +import org.apache.iceberg.data.GenericRecord; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.types.Types; +import org.joda.time.DateTime; +import org.joda.time.DateTimeZone; +import org.joda.time.Days; + +/** + * Util class to generate test data with extensive coverage different field types: from primitives + * to complex nested types. 
+ */ +public class DataGenerators { + + public static class Primitives implements DataGenerator { + private static final DateTime JODA_DATETIME_EPOC = + new DateTime(1970, 1, 1, 0, 0, 0, 0, DateTimeZone.UTC); + private static final DateTime JODA_DATETIME_20220110 = + new DateTime(2022, 1, 10, 0, 0, 0, 0, DateTimeZone.UTC); + private static final int DAYS_BTW_EPOC_AND_20220110 = + Days.daysBetween(JODA_DATETIME_EPOC, JODA_DATETIME_20220110).getDays(); + private static final int HOUR_8_IN_MILLI = (int) TimeUnit.HOURS.toMillis(8); + + private static final LocalDate JAVA_LOCAL_DATE_20220110 = LocalDate.of(2022, 1, 10); + private static final LocalTime JAVA_LOCAL_TIME_HOUR8 = LocalTime.of(8, 0); + private static final OffsetDateTime JAVA_OFFSET_DATE_TIME_20220110 = + OffsetDateTime.of(2022, 1, 10, 0, 0, 0, 0, ZoneOffset.UTC); + private static final LocalDateTime JAVA_LOCAL_DATE_TIME_20220110 = + LocalDateTime.of(2022, 1, 10, 0, 0, 0); + private static final BigDecimal BIG_DECIMAL_NEGATIVE = new BigDecimal("-1.50"); + private static final byte[] FIXED_BYTES = "012345689012345".getBytes(StandardCharsets.UTF_8); + + private final Schema icebergSchema = + new Schema( + Types.NestedField.required(1, "row_id", Types.StringType.get()), + // primitive types + Types.NestedField.optional(2, "boolean_field", Types.BooleanType.get()), + Types.NestedField.optional(3, "int_field", Types.IntegerType.get()), + Types.NestedField.optional(4, "long_field", Types.LongType.get()), + Types.NestedField.optional(5, "float_field", Types.FloatType.get()), + Types.NestedField.optional(6, "double_field", Types.DoubleType.get()), + Types.NestedField.required(7, "string_field", Types.StringType.get()), + Types.NestedField.required(8, "date_field", Types.DateType.get()), + Types.NestedField.required(9, "time_field", Types.TimeType.get()), + Types.NestedField.required(10, "ts_with_zone_field", Types.TimestampType.withZone()), + Types.NestedField.required( + 11, "ts_without_zone_field", Types.TimestampType.withoutZone()), + Types.NestedField.required(12, "uuid_field", Types.UUIDType.get()), + Types.NestedField.required(13, "binary_field", Types.BinaryType.get()), + Types.NestedField.required(14, "decimal_field", Types.DecimalType.of(9, 2)), + Types.NestedField.required(15, "fixed_field", Types.FixedType.ofLength(16))); + + private final RowType flinkRowType = FlinkSchemaUtil.convert(icebergSchema); + + /** + * Fix up Avro Schema that is converted from Iceberg Schema. + * + * @param schemaConvertedFromIceberg Avro Schema converted from Iceberg schema via {@link + * AvroSchemaUtil#convert(Schema, String)} + */ + private org.apache.avro.Schema fixupAvroSchemaConvertedFromIcebergSchema( + org.apache.avro.Schema schemaConvertedFromIceberg) { + List fixedFields = + schemaConvertedFromIceberg.getFields().stream() + .map( + field -> { + org.apache.avro.Schema.Field updatedField = field; + if (field.name().equals("time_field")) { + // Iceberg's AvroSchemaUtil uses timestamp-micros with Long value for time + // field, while AvroToRowDataConverters#convertToTime() always looks for + // Integer value assuming millis. The root problem is that + // AvroToRowDataConverters#createConverter() uses LogicalTypeRoot to + // determine converter and LogicalTypeRoot lost the timestamp precision + // carried by LogicalType like Time(6). 
+ org.apache.avro.Schema fieldSchema = + LogicalTypes.timeMillis() + .addToSchema( + org.apache.avro.Schema.create(org.apache.avro.Schema.Type.INT)); + updatedField = new org.apache.avro.Schema.Field("time_field", fieldSchema); + } + + return new org.apache.avro.Schema.Field(updatedField, updatedField.schema()); + }) + .collect(Collectors.toList()); + return org.apache.avro.Schema.createRecord( + schemaConvertedFromIceberg.getName(), + schemaConvertedFromIceberg.getDoc(), + schemaConvertedFromIceberg.getNamespace(), + schemaConvertedFromIceberg.isError(), + fixedFields); + } + + private final org.apache.avro.Schema avroSchema = + fixupAvroSchemaConvertedFromIcebergSchema(AvroSchemaUtil.convert(icebergSchema, "table")); + + @Override + public Schema icebergSchema() { + return icebergSchema; + } + + @Override + public RowType flinkRowType() { + return flinkRowType; + } + + @Override + public org.apache.avro.Schema avroSchema() { + return avroSchema; + } + + @Override + public GenericRecord generateIcebergGenericRecord() { + GenericRecord genericRecord = GenericRecord.create(icebergSchema); + genericRecord.setField("row_id", "row_id_value"); + genericRecord.setField("boolean_field", false); + genericRecord.setField("int_field", Integer.MAX_VALUE); + genericRecord.setField("long_field", Long.MAX_VALUE); + genericRecord.setField("float_field", Float.MAX_VALUE); + genericRecord.setField("double_field", Double.MAX_VALUE); + genericRecord.setField("string_field", "str"); + + genericRecord.setField("date_field", JAVA_LOCAL_DATE_20220110); + genericRecord.setField("time_field", JAVA_LOCAL_TIME_HOUR8); + genericRecord.setField("ts_with_zone_field", JAVA_OFFSET_DATE_TIME_20220110); + genericRecord.setField("ts_without_zone_field", JAVA_LOCAL_DATE_TIME_20220110); + + byte[] uuidBytes = new byte[16]; + for (int i = 0; i < 16; ++i) { + uuidBytes[i] = (byte) i; + } + + genericRecord.setField("uuid_field", UUID.nameUUIDFromBytes(uuidBytes)); + + byte[] binaryBytes = new byte[7]; + for (int i = 0; i < 7; ++i) { + binaryBytes[i] = (byte) i; + } + genericRecord.setField("binary_field", ByteBuffer.wrap(binaryBytes)); + + genericRecord.setField("decimal_field", BIG_DECIMAL_NEGATIVE); + genericRecord.setField("fixed_field", FIXED_BYTES); + + return genericRecord; + } + + @Override + public GenericRowData generateFlinkRowData() { + byte[] uuidBytes = new byte[16]; + for (int i = 0; i < 16; ++i) { + uuidBytes[i] = (byte) i; + } + + byte[] binaryBytes = new byte[7]; + for (int i = 0; i < 7; ++i) { + binaryBytes[i] = (byte) i; + } + + return GenericRowData.of( + StringData.fromString("row_id_value"), + false, + Integer.MAX_VALUE, + Long.MAX_VALUE, + Float.MAX_VALUE, + Double.MAX_VALUE, + StringData.fromString("str"), + DAYS_BTW_EPOC_AND_20220110, + HOUR_8_IN_MILLI, + // Although Avro logical type for timestamp fields are in micro seconds, + // AvroToRowDataConverters only looks for long value in milliseconds. 
+ TimestampData.fromEpochMillis(JODA_DATETIME_20220110.getMillis()), + TimestampData.fromEpochMillis(JODA_DATETIME_20220110.getMillis()), + uuidBytes, + binaryBytes, + DecimalData.fromBigDecimal(BIG_DECIMAL_NEGATIVE, 9, 2), + FIXED_BYTES); + } + + @Override + public org.apache.avro.generic.GenericRecord generateAvroGenericRecord() { + org.apache.avro.generic.GenericRecord genericRecord = new GenericData.Record(avroSchema); + genericRecord.put("row_id", new Utf8("row_id_value")); + genericRecord.put("boolean_field", false); + genericRecord.put("int_field", Integer.MAX_VALUE); + genericRecord.put("long_field", Long.MAX_VALUE); + genericRecord.put("float_field", Float.MAX_VALUE); + genericRecord.put("double_field", Double.MAX_VALUE); + genericRecord.put("string_field", new Utf8("str")); + + genericRecord.put("date_field", DAYS_BTW_EPOC_AND_20220110); + genericRecord.put("time_field", HOUR_8_IN_MILLI); + // Although Avro logical type for timestamp fields are in micro seconds, + // AvroToRowDataConverters only looks for long value in milliseconds. + genericRecord.put("ts_with_zone_field", JODA_DATETIME_20220110.getMillis()); + genericRecord.put("ts_without_zone_field", JODA_DATETIME_20220110.getMillis()); + + byte[] uuidBytes = new byte[16]; + for (int i = 0; i < 16; ++i) { + uuidBytes[i] = (byte) i; + } + genericRecord.put("uuid_field", ByteBuffer.wrap(uuidBytes)); + + byte[] binaryBytes = new byte[7]; + for (int i = 0; i < 7; ++i) { + binaryBytes[i] = (byte) i; + } + genericRecord.put("binary_field", ByteBuffer.wrap(binaryBytes)); + + BigDecimal bigDecimal = new BigDecimal("-1.50"); + // unscaledValue().toByteArray() is to match the behavior of RowDataToAvroConverters from + // Flink for decimal type + genericRecord.put("decimal_field", ByteBuffer.wrap(bigDecimal.unscaledValue().toByteArray())); + + genericRecord.put("fixed_field", ByteBuffer.wrap(FIXED_BYTES)); + + return genericRecord; + } + } + + public static class StructOfPrimitive implements DataGenerator { + private final Schema icebergSchema = + new Schema( + Types.NestedField.required(1, "row_id", Types.StringType.get()), + Types.NestedField.required( + 2, + "struct_of_primitive", + Types.StructType.of( + required(101, "id", Types.IntegerType.get()), + required(102, "name", Types.StringType.get())))); + + private final RowType flinkRowType = FlinkSchemaUtil.convert(icebergSchema); + + private final org.apache.avro.Schema avroSchema = + AvroSchemaUtil.convert(icebergSchema, "table"); + + @Override + public Schema icebergSchema() { + return icebergSchema; + } + + @Override + public RowType flinkRowType() { + return flinkRowType; + } + + @Override + public org.apache.avro.Schema avroSchema() { + return avroSchema; + } + + @Override + public GenericRecord generateIcebergGenericRecord() { + Schema structSchema = + new Schema(icebergSchema.findField("struct_of_primitive").type().asStructType().fields()); + GenericRecord struct = GenericRecord.create(structSchema); + struct.setField("id", 1); + struct.setField("name", "Jane"); + GenericRecord genericRecord = GenericRecord.create(icebergSchema); + genericRecord.setField("row_id", "row_id_value"); + genericRecord.setField("struct_of_primitive", struct); + return genericRecord; + } + + @Override + public GenericRowData generateFlinkRowData() { + return GenericRowData.of( + StringData.fromString("row_id_value"), + GenericRowData.of(1, StringData.fromString("Jane"))); + } + + @Override + public org.apache.avro.generic.GenericRecord generateAvroGenericRecord() { + org.apache.avro.Schema 
structSchema = avroSchema.getField("struct_of_primitive").schema(); + org.apache.avro.generic.GenericRecord struct = new GenericData.Record(structSchema); + struct.put("id", 1); + struct.put("name", "Jane"); + org.apache.avro.generic.GenericRecord genericRecord = new GenericData.Record(avroSchema); + genericRecord.put("row_id", "row_id_value"); + genericRecord.put("struct_of_primitive", struct); + return genericRecord; + } + } + + public static class StructOfArray implements DataGenerator { + private final Schema icebergSchema = + new Schema( + Types.NestedField.required(1, "row_id", Types.StringType.get()), + Types.NestedField.required( + 2, + "struct_of_array", + Types.StructType.of( + required(101, "id", Types.IntegerType.get()), + required( + 102, "names", Types.ListType.ofRequired(201, Types.StringType.get()))))); + + private final RowType flinkRowType = FlinkSchemaUtil.convert(icebergSchema); + + private final org.apache.avro.Schema avroSchema = + AvroSchemaUtil.convert(icebergSchema, "table"); + + @Override + public Schema icebergSchema() { + return icebergSchema; + } + + @Override + public RowType flinkRowType() { + return flinkRowType; + } + + @Override + public org.apache.avro.Schema avroSchema() { + return avroSchema; + } + + @Override + public GenericRecord generateIcebergGenericRecord() { + Schema structSchema = + new Schema(icebergSchema.findField("struct_of_array").type().asStructType().fields()); + GenericRecord struct = GenericRecord.create(structSchema); + struct.setField("id", 1); + struct.setField("names", Arrays.asList("Jane", "Joe")); + GenericRecord genericRecord = GenericRecord.create(icebergSchema); + genericRecord.setField("row_id", "row_id_value"); + genericRecord.setField("struct_of_array", struct); + return genericRecord; + } + + @Override + public GenericRowData generateFlinkRowData() { + StringData[] names = {StringData.fromString("Jane"), StringData.fromString("Joe")}; + return GenericRowData.of( + StringData.fromString("row_id_value"), GenericRowData.of(1, new GenericArrayData(names))); + } + + @Override + public org.apache.avro.generic.GenericRecord generateAvroGenericRecord() { + org.apache.avro.Schema structSchema = avroSchema.getField("struct_of_array").schema(); + org.apache.avro.generic.GenericRecord struct = new GenericData.Record(structSchema); + struct.put("id", 1); + struct.put("names", Arrays.asList("Jane", "Joe")); + org.apache.avro.generic.GenericRecord genericRecord = new GenericData.Record(avroSchema); + genericRecord.put("row_id", "row_id_value"); + genericRecord.put("struct_of_array", struct); + return genericRecord; + } + } + + public static class StructOfMap implements DataGenerator { + private final Schema icebergSchema = + new Schema( + Types.NestedField.required(1, "row_id", Types.StringType.get()), + Types.NestedField.required( + 2, + "struct_of_map", + Types.StructType.of( + required(101, "id", Types.IntegerType.get()), + required( + 102, + "names", + Types.MapType.ofRequired( + 201, 202, Types.StringType.get(), Types.StringType.get()))))); + + private final RowType flinkRowType = FlinkSchemaUtil.convert(icebergSchema); + + private final org.apache.avro.Schema avroSchema = + AvroSchemaUtil.convert(icebergSchema, "table"); + + @Override + public Schema icebergSchema() { + return icebergSchema; + } + + @Override + public RowType flinkRowType() { + return flinkRowType; + } + + @Override + public org.apache.avro.Schema avroSchema() { + return avroSchema; + } + + @Override + public GenericRecord generateIcebergGenericRecord() { + Schema 
structSchema = + new Schema(icebergSchema.findField("struct_of_map").type().asStructType().fields()); + GenericRecord struct = GenericRecord.create(structSchema); + struct.setField("id", 1); + struct.setField("names", ImmutableMap.of("Jane", "female", "Joe", "male")); + GenericRecord genericRecord = GenericRecord.create(icebergSchema); + genericRecord.setField("row_id", "row_id_value"); + genericRecord.setField("struct_of_map", struct); + return genericRecord; + } + + @Override + public GenericRowData generateFlinkRowData() { + return GenericRowData.of( + StringData.fromString("row_id_value"), + GenericRowData.of( + 1, + new GenericMapData( + ImmutableMap.of( + StringData.fromString("Jane"), + StringData.fromString("female"), + StringData.fromString("Joe"), + StringData.fromString("male"))))); + } + + @Override + public org.apache.avro.generic.GenericRecord generateAvroGenericRecord() { + org.apache.avro.Schema structSchema = avroSchema.getField("struct_of_map").schema(); + org.apache.avro.generic.GenericRecord struct = new GenericData.Record(structSchema); + struct.put("id", 1); + struct.put("names", ImmutableMap.of("Jane", new Utf8("female"), "Joe", new Utf8("male"))); + org.apache.avro.generic.GenericRecord genericRecord = new GenericData.Record(avroSchema); + genericRecord.put("row_id", "row_id_value"); + genericRecord.put("struct_of_map", struct); + return genericRecord; + } + } + + public static class StructOfStruct implements DataGenerator { + private final Schema icebergSchema = + new Schema( + Types.NestedField.required(1, "row_id", Types.StringType.get()), + Types.NestedField.required( + 2, + "struct_of_struct", + Types.StructType.of( + required(101, "id", Types.IntegerType.get()), + required( + 102, + "person_struct", + Types.StructType.of( + Types.NestedField.required(201, "name", Types.StringType.get()), + Types.NestedField.required(202, "address", Types.StringType.get())))))); + + private final RowType flinkRowType = FlinkSchemaUtil.convert(icebergSchema); + + private final org.apache.avro.Schema avroSchema = + AvroSchemaUtil.convert(icebergSchema, "table"); + + @Override + public Schema icebergSchema() { + return icebergSchema; + } + + @Override + public RowType flinkRowType() { + return flinkRowType; + } + + @Override + public org.apache.avro.Schema avroSchema() { + return avroSchema; + } + + @Override + public GenericRecord generateIcebergGenericRecord() { + Schema structSchema = + new Schema(icebergSchema.findField("struct_of_struct").type().asStructType().fields()); + Schema personSchema = + new Schema(structSchema.findField("person_struct").type().asStructType().fields()); + GenericRecord person = GenericRecord.create(personSchema); + person.setField("name", "Jane"); + person.setField("address", "Apple Park"); + GenericRecord struct = GenericRecord.create(structSchema); + struct.setField("id", 1); + struct.setField("person_struct", person); + GenericRecord genericRecord = GenericRecord.create(icebergSchema); + genericRecord.setField("row_id", "row_id_value"); + genericRecord.setField("struct_of_struct", struct); + return genericRecord; + } + + @Override + public GenericRowData generateFlinkRowData() { + return GenericRowData.of( + StringData.fromString("row_id_value"), + GenericRowData.of( + 1, + GenericRowData.of( + StringData.fromString("Jane"), StringData.fromString("Apple Park")))); + } + + @Override + public org.apache.avro.generic.GenericRecord generateAvroGenericRecord() { + org.apache.avro.Schema structSchema = avroSchema.getField("struct_of_struct").schema(); + 
org.apache.avro.Schema personSchema = structSchema.getField("person_struct").schema(); + org.apache.avro.generic.GenericRecord person = new GenericData.Record(personSchema); + person.put("name", "Jane"); + person.put("address", "Apple Park"); + org.apache.avro.generic.GenericRecord struct = new GenericData.Record(structSchema); + struct.put("id", 1); + struct.put("person_struct", person); + org.apache.avro.generic.GenericRecord genericRecord = new GenericData.Record(avroSchema); + genericRecord.put("row_id", "row_id_value"); + genericRecord.put("struct_of_struct", struct); + return genericRecord; + } + } + + public static class ArrayOfPrimitive implements DataGenerator { + private final Schema icebergSchema = + new Schema( + Types.NestedField.required(1, "row_id", Types.StringType.get()), + Types.NestedField.required( + 2, "array_of_int", Types.ListType.ofOptional(101, Types.IntegerType.get()))); + + private final RowType flinkRowType = FlinkSchemaUtil.convert(icebergSchema); + + private final org.apache.avro.Schema avroSchema = + AvroSchemaUtil.convert(icebergSchema, "table"); + + @Override + public Schema icebergSchema() { + return icebergSchema; + } + + @Override + public RowType flinkRowType() { + return flinkRowType; + } + + @Override + public org.apache.avro.Schema avroSchema() { + return avroSchema; + } + + @Override + public GenericRecord generateIcebergGenericRecord() { + GenericRecord genericRecord = GenericRecord.create(icebergSchema); + genericRecord.setField("row_id", "row_id_value"); + genericRecord.setField("array_of_int", Arrays.asList(1, 2, 3)); + return genericRecord; + } + + @Override + public GenericRowData generateFlinkRowData() { + Integer[] arr = {1, 2, 3}; + return GenericRowData.of(StringData.fromString("row_id_value"), new GenericArrayData(arr)); + } + + @Override + public org.apache.avro.generic.GenericRecord generateAvroGenericRecord() { + org.apache.avro.generic.GenericRecord genericRecord = new GenericData.Record(avroSchema); + genericRecord.put("row_id", "row_id_value"); + genericRecord.put("array_of_int", Arrays.asList(1, 2, 3)); + return genericRecord; + } + } + + public static class ArrayOfArray implements DataGenerator { + private final Schema icebergSchema = + new Schema( + Types.NestedField.required(1, "row_id", Types.StringType.get()), + Types.NestedField.required( + 2, + "array_of_array", + Types.ListType.ofRequired( + 101, Types.ListType.ofRequired(201, Types.IntegerType.get())))); + + private final RowType flinkRowType = FlinkSchemaUtil.convert(icebergSchema); + + private final org.apache.avro.Schema avroSchema = + AvroSchemaUtil.convert(icebergSchema, "table"); + + @Override + public Schema icebergSchema() { + return icebergSchema; + } + + @Override + public RowType flinkRowType() { + return flinkRowType; + } + + @Override + public org.apache.avro.Schema avroSchema() { + return avroSchema; + } + + @Override + public GenericRecord generateIcebergGenericRecord() { + GenericRecord genericRecord = GenericRecord.create(icebergSchema); + genericRecord.setField("row_id", "row_id_value"); + genericRecord.setField( + "array_of_array", Arrays.asList(Arrays.asList(1, 2, 3), Arrays.asList(4, 5, 6))); + return genericRecord; + } + + @Override + public GenericRowData generateFlinkRowData() { + // non-primitive + Integer[] array1 = {1, 2, 3}; + Integer[] array2 = {4, 5, 6}; + GenericArrayData[] arrayOfArrays = { + new GenericArrayData(array1), new GenericArrayData(array2) + }; + return GenericRowData.of( + StringData.fromString("row_id_value"), new 
GenericArrayData(arrayOfArrays)); + } + + @Override + public org.apache.avro.generic.GenericRecord generateAvroGenericRecord() { + org.apache.avro.generic.GenericRecord genericRecord = new GenericData.Record(avroSchema); + genericRecord.put("row_id", "row_id_value"); + genericRecord.put( + "array_of_array", Arrays.asList(Arrays.asList(1, 2, 3), Arrays.asList(4, 5, 6))); + return genericRecord; + } + } + + public static class ArrayOfMap implements DataGenerator { + private final Schema icebergSchema = + new Schema( + Types.NestedField.required(1, "row_id", Types.StringType.get()), + Types.NestedField.required( + 2, + "array_of_map", + Types.ListType.ofRequired( + 101, + Types.MapType.ofRequired( + 201, 202, Types.StringType.get(), Types.IntegerType.get())))); + + private final RowType flinkRowType = FlinkSchemaUtil.convert(icebergSchema); + + private final org.apache.avro.Schema avroSchema = + AvroSchemaUtil.convert(icebergSchema, "table"); + + @Override + public Schema icebergSchema() { + return icebergSchema; + } + + @Override + public RowType flinkRowType() { + return flinkRowType; + } + + @Override + public org.apache.avro.Schema avroSchema() { + return avroSchema; + } + + @Override + public GenericRecord generateIcebergGenericRecord() { + GenericRecord genericRecord = GenericRecord.create(icebergSchema); + genericRecord.setField("row_id", "row_id_value"); + genericRecord.setField( + "array_of_map", + Arrays.asList( + ImmutableMap.of("Jane", 1, "Joe", 2), ImmutableMap.of("Alice", 3, "Bob", 4))); + return genericRecord; + } + + @Override + public GenericRowData generateFlinkRowData() { + GenericMapData[] array = { + new GenericMapData( + ImmutableMap.of(StringData.fromString("Jane"), 1, StringData.fromString("Joe"), 2)), + new GenericMapData( + ImmutableMap.of(StringData.fromString("Alice"), 3, StringData.fromString("Bob"), 4)) + }; + return GenericRowData.of(StringData.fromString("row_id_value"), new GenericArrayData(array)); + } + + @Override + public org.apache.avro.generic.GenericRecord generateAvroGenericRecord() { + org.apache.avro.generic.GenericRecord genericRecord = new GenericData.Record(avroSchema); + genericRecord.put("row_id", "row_id_value"); + genericRecord.put( + "array_of_map", + Arrays.asList( + ImmutableMap.of("Jane", 1, "Joe", 2), ImmutableMap.of("Alice", 3, "Bob", 4))); + return genericRecord; + } + } + + public static class ArrayOfStruct implements DataGenerator { + private final Types.StructType structType = + Types.StructType.of( + required(201, "id", Types.IntegerType.get()), + required(202, "name", Types.StringType.get())); + private final Schema structIcebergSchema = new Schema(structType.fields()); + private final org.apache.avro.Schema structAvroSchema = + AvroSchemaUtil.convert(structIcebergSchema, "struct"); + + private final Schema icebergSchema = + new Schema( + Types.NestedField.required(1, "row_id", Types.StringType.get()), + Types.NestedField.optional( + 2, "array_of_struct", Types.ListType.ofRequired(101, structType))); + + private final RowType flinkRowType = FlinkSchemaUtil.convert(icebergSchema); + + private final org.apache.avro.Schema avroSchema = + AvroSchemaUtil.convert(icebergSchema, "table"); + + @Override + public Schema icebergSchema() { + return icebergSchema; + } + + @Override + public RowType flinkRowType() { + return flinkRowType; + } + + @Override + public org.apache.avro.Schema avroSchema() { + return avroSchema; + } + + @Override + public GenericRecord generateIcebergGenericRecord() { + GenericRecord struct1 = 
GenericRecord.create(structIcebergSchema); + struct1.setField("id", 1); + struct1.setField("name", "Jane"); + GenericRecord struct2 = GenericRecord.create(structIcebergSchema); + struct2.setField("id", 2); + struct2.setField("name", "Joe"); + GenericRecord genericRecord = GenericRecord.create(icebergSchema); + genericRecord.setField("row_id", "row_id_value"); + genericRecord.setField("array_of_struct", Arrays.asList(struct1, struct2)); + return genericRecord; + } + + @Override + public GenericRowData generateFlinkRowData() { + GenericRowData[] structArray = { + GenericRowData.of(1, StringData.fromString("Jane")), + GenericRowData.of(2, StringData.fromString("Joe")) + }; + return GenericRowData.of( + StringData.fromString("row_id_value"), new GenericArrayData(structArray)); + } + + @Override + public org.apache.avro.generic.GenericRecord generateAvroGenericRecord() { + org.apache.avro.generic.GenericRecord struct1 = new GenericData.Record(structAvroSchema); + struct1.put("id", 1); + struct1.put("name", "Jane"); + org.apache.avro.generic.GenericRecord struct2 = new GenericData.Record(structAvroSchema); + struct2.put("id", 2); + struct2.put("name", "Joe"); + org.apache.avro.generic.GenericRecord genericRecord = new GenericData.Record(avroSchema); + genericRecord.put("row_id", "row_id_value"); + genericRecord.put("array_of_struct", Arrays.asList(struct1, struct2)); + return genericRecord; + } + } + + public static class MapOfPrimitives implements DataGenerator { + private final Schema icebergSchema = + new Schema( + Types.NestedField.required(1, "row_id", Types.StringType.get()), + Types.NestedField.optional( + 2, + "map_of_primitives", + Types.MapType.ofRequired( + 101, 102, Types.StringType.get(), Types.IntegerType.get()))); + + private final RowType flinkRowType = FlinkSchemaUtil.convert(icebergSchema); + + private final org.apache.avro.Schema avroSchema = + AvroSchemaUtil.convert(icebergSchema, "table"); + + @Override + public Schema icebergSchema() { + return icebergSchema; + } + + @Override + public RowType flinkRowType() { + return flinkRowType; + } + + @Override + public org.apache.avro.Schema avroSchema() { + return avroSchema; + } + + @Override + public GenericRecord generateIcebergGenericRecord() { + GenericRecord genericRecord = GenericRecord.create(icebergSchema); + genericRecord.setField("row_id", "row_id_value"); + genericRecord.setField("map_of_primitives", ImmutableMap.of("Jane", 1, "Joe", 2)); + return genericRecord; + } + + @Override + public GenericRowData generateFlinkRowData() { + return GenericRowData.of( + StringData.fromString("row_id_value"), + new GenericMapData( + ImmutableMap.of(StringData.fromString("Jane"), 1, StringData.fromString("Joe"), 2))); + } + + @Override + public org.apache.avro.generic.GenericRecord generateAvroGenericRecord() { + org.apache.avro.generic.GenericRecord genericRecord = new GenericData.Record(avroSchema); + genericRecord.put("row_id", "row_id_value"); + genericRecord.put("map_of_primitives", ImmutableMap.of("Jane", 1, "Joe", 2)); + return genericRecord; + } + } + + public static class MapOfArray implements DataGenerator { + private final Schema icebergSchema = + new Schema( + Types.NestedField.required(1, "row_id", Types.StringType.get()), + Types.NestedField.required( + 2, + "map_of_array", + Types.MapType.ofRequired( + 101, + 102, + Types.StringType.get(), + Types.ListType.ofRequired(201, Types.IntegerType.get())))); + + private final RowType rowType = FlinkSchemaUtil.convert(icebergSchema); + + private final org.apache.avro.Schema 
avroSchema = + AvroSchemaUtil.convert(icebergSchema, "table"); + + @Override + public Schema icebergSchema() { + return icebergSchema; + } + + @Override + public RowType flinkRowType() { + return rowType; + } + + @Override + public org.apache.avro.Schema avroSchema() { + return avroSchema; + } + + @Override + public GenericRecord generateIcebergGenericRecord() { + GenericRecord genericRecord = GenericRecord.create(icebergSchema); + genericRecord.setField("row_id", "row_id_value"); + genericRecord.setField( + "map_of_array", + ImmutableMap.of( + "Jane", Arrays.asList(1, 2, 3), + "Joe", Arrays.asList(4, 5, 6))); + return genericRecord; + } + + @Override + public GenericRowData generateFlinkRowData() { + Integer[] janeArray = {1, 2, 3}; + Integer[] joeArray = {4, 5, 6}; + return GenericRowData.of( + StringData.fromString("row_id_value"), + new GenericMapData( + ImmutableMap.of( + StringData.fromString("Jane"), + new GenericArrayData(janeArray), + StringData.fromString("Joe"), + new GenericArrayData(joeArray)))); + } + + @Override + public org.apache.avro.generic.GenericRecord generateAvroGenericRecord() { + org.apache.avro.generic.GenericRecord genericRecord = new GenericData.Record(avroSchema); + genericRecord.put("row_id", "row_id_value"); + genericRecord.put( + "map_of_array", + ImmutableMap.of( + "Jane", Arrays.asList(1, 2, 3), + "Joe", Arrays.asList(4, 5, 6))); + return genericRecord; + } + } + + public static class MapOfMap implements DataGenerator { + private final Schema icebergSchema = + new Schema( + Types.NestedField.required(1, "row_id", Types.StringType.get()), + Types.NestedField.required( + 2, + "map_of_map", + Types.MapType.ofRequired( + 101, + 102, + Types.StringType.get(), + Types.MapType.ofRequired( + 301, 302, Types.StringType.get(), Types.IntegerType.get())))); + + private final RowType flinkRowType = FlinkSchemaUtil.convert(icebergSchema); + + private final org.apache.avro.Schema avroSchema = + AvroSchemaUtil.convert(icebergSchema, "table"); + + @Override + public Schema icebergSchema() { + return icebergSchema; + } + + @Override + public RowType flinkRowType() { + return flinkRowType; + } + + @Override + public org.apache.avro.Schema avroSchema() { + return avroSchema; + } + + @Override + public GenericRecord generateIcebergGenericRecord() { + GenericRecord genericRecord = GenericRecord.create(icebergSchema); + genericRecord.setField("row_id", "row_id_value"); + genericRecord.setField( + "map_of_map", + ImmutableMap.of( + "female", ImmutableMap.of("Jane", 1, "Alice", 2), + "male", ImmutableMap.of("Joe", 3, "Bob", 4))); + return genericRecord; + } + + @Override + public GenericRowData generateFlinkRowData() { + return GenericRowData.of( + StringData.fromString("row_id_value"), + new GenericMapData( + ImmutableMap.of( + StringData.fromString("female"), + new GenericMapData( + ImmutableMap.of( + StringData.fromString("Jane"), 1, StringData.fromString("Alice"), 2)), + StringData.fromString("male"), + new GenericMapData( + ImmutableMap.of( + StringData.fromString("Joe"), 3, StringData.fromString("Bob"), 4))))); + } + + @Override + public org.apache.avro.generic.GenericRecord generateAvroGenericRecord() { + org.apache.avro.generic.GenericRecord genericRecord = new GenericData.Record(avroSchema); + genericRecord.put("row_id", "row_id_value"); + genericRecord.put( + "map_of_map", + ImmutableMap.of( + "female", ImmutableMap.of("Jane", 1, "Alice", 2), + "male", ImmutableMap.of("Joe", 3, "Bob", 4))); + return genericRecord; + } + } + + public static class MapOfStruct implements 
DataGenerator { + private org.apache.avro.Schema createAvroSchemaIdField() { + org.apache.avro.Schema schema = SchemaBuilder.builder().intType(); + // this is needed to match the converter generated schema props + schema.addProp("field-id", IntNode.valueOf(201)); + return schema; + } + + private org.apache.avro.Schema createAvroSchemaNameField() { + org.apache.avro.Schema schema = SchemaBuilder.builder().stringType(); + // this is needed to match the converter generated schema props + schema.addProp("field-id", IntNode.valueOf(202)); + return schema; + } + + private final Types.StructType structType = + Types.StructType.of( + required(201, "id", Types.IntegerType.get()), + required(202, "name", Types.StringType.get())); + private final Schema structIcebergSchema = new Schema(structType.fields()); + + private final org.apache.avro.Schema structAvroSchema = + SchemaBuilder.builder() + .record("struct") + .fields() + .name("id") + .type(createAvroSchemaIdField()) + .noDefault() + .name("name") + .type(createAvroSchemaNameField()) + .noDefault() + .endRecord(); + + private final Schema icebergSchema = + new Schema( + Types.NestedField.required(1, "row_id", Types.StringType.get()), + Types.NestedField.required( + 2, + "map_of_struct", + Types.MapType.ofRequired(101, 102, Types.StringType.get(), structType))); + + private final RowType flinkRowType = FlinkSchemaUtil.convert(icebergSchema); + + // Can't use AvroSchemaUtil.convert otherwise the nested schema will have generated name like + // `r102` not the specified name like `struct`. + org.apache.avro.Schema avroSchema = + SchemaBuilder.builder() + .record("table") + .fields() + .requiredString("row_id") + .name("map_of_struct") + .type(SchemaBuilder.builder().map().values(structAvroSchema)) + .noDefault() + .endRecord(); + + @Override + public Schema icebergSchema() { + return icebergSchema; + } + + @Override + public RowType flinkRowType() { + return flinkRowType; + } + + @Override + public org.apache.avro.Schema avroSchema() { + return avroSchema; + } + + @Override + public GenericRecord generateIcebergGenericRecord() { + GenericRecord struct1 = GenericRecord.create(structIcebergSchema); + struct1.setField("id", 1); + struct1.setField("name", "Jane"); + GenericRecord struct2 = GenericRecord.create(structIcebergSchema); + struct2.setField("id", 2); + struct2.setField("name", "Joe"); + GenericRecord genericRecord = GenericRecord.create(icebergSchema); + genericRecord.setField("row_id", "row_id_value"); + genericRecord.setField( + "map_of_struct", ImmutableMap.of("struct1", struct1, "struct2", struct2)); + return genericRecord; + } + + @Override + public GenericRowData generateFlinkRowData() { + return GenericRowData.of( + StringData.fromString("row_id_value"), + new GenericMapData( + ImmutableMap.of( + StringData.fromString("struct1"), + GenericRowData.of(1, StringData.fromString("Jane")), + StringData.fromString("struct2"), + GenericRowData.of(2, StringData.fromString("Joe"))))); + } + + @Override + public org.apache.avro.generic.GenericRecord generateAvroGenericRecord() { + org.apache.avro.generic.GenericRecord struct1 = new GenericData.Record(structAvroSchema); + struct1.put("id", 1); + struct1.put("name", new Utf8("Jane")); + org.apache.avro.generic.GenericRecord struct2 = new GenericData.Record(structAvroSchema); + struct2.put("id", 2); + struct2.put("name", new Utf8("Joe")); + org.apache.avro.generic.GenericRecord genericRecord = new GenericData.Record(avroSchema); + genericRecord.put("row_id", new Utf8("row_id_value")); + 
genericRecord.put("map_of_struct", ImmutableMap.of("struct1", struct1, "struct2", struct2)); + return genericRecord; + } + } + + public static class MapOfStructStruct implements DataGenerator { + private final Schema icebergSchema = + new Schema( + Types.NestedField.required(1, "row_id", Types.StringType.get()), + Types.NestedField.optional( + 2, + "map", + Types.MapType.ofOptional( + 101, + 102, + Types.StructType.of( + Types.NestedField.required(201, "key", Types.LongType.get()), + Types.NestedField.optional(202, "keyData", Types.StringType.get())), + Types.StructType.of( + Types.NestedField.required(203, "value", Types.LongType.get()), + Types.NestedField.optional(204, "valueData", Types.StringType.get()))))); + + private final RowType flinkRowType = FlinkSchemaUtil.convert(icebergSchema); + + @Override + public Schema icebergSchema() { + return icebergSchema; + } + + @Override + public RowType flinkRowType() { + return flinkRowType; + } + + @Override + public org.apache.avro.Schema avroSchema() { + throw new UnsupportedOperationException( + "Not applicable as Avro Map only support string key type"); + } + + @Override + public GenericRecord generateIcebergGenericRecord() { + throw new UnsupportedOperationException("Not implemented yet"); + } + + @Override + public GenericRowData generateFlinkRowData() { + return GenericRowData.of( + StringData.fromString("row_id_value"), + new GenericMapData( + ImmutableMap.of( + GenericRowData.of(1L, StringData.fromString("key_data")), + GenericRowData.of(1L, StringData.fromString("value_data"))))); + } + + @Override + public org.apache.avro.generic.GenericRecord generateAvroGenericRecord() { + throw new UnsupportedOperationException("Avro Map only support string key type"); + } + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/HadoopCatalogExtension.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/HadoopCatalogExtension.java new file mode 100644 index 000000000000..fd5c6b76b683 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/HadoopCatalogExtension.java @@ -0,0 +1,105 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.UUID; +import org.apache.commons.io.FileUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.iceberg.CatalogProperties; +import org.apache.iceberg.catalog.Catalog; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.hadoop.HadoopCatalog; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.junit.jupiter.api.extension.AfterAllCallback; +import org.junit.jupiter.api.extension.AfterEachCallback; +import org.junit.jupiter.api.extension.BeforeAllCallback; +import org.junit.jupiter.api.extension.BeforeEachCallback; +import org.junit.jupiter.api.extension.ExtensionContext; + +public class HadoopCatalogExtension + implements BeforeAllCallback, BeforeEachCallback, AfterAllCallback, AfterEachCallback { + protected final String database; + protected final String tableName; + + protected Path temporaryFolder; + protected Catalog catalog; + protected CatalogLoader catalogLoader; + protected String warehouse; + protected TableLoader tableLoader; + + public HadoopCatalogExtension(String database, String tableName) { + this.database = database; + this.tableName = tableName; + } + + @Override + public void beforeAll(ExtensionContext context) throws Exception { + this.temporaryFolder = Files.createTempDirectory("junit5_hadoop_catalog-"); + } + + @Override + public void afterAll(ExtensionContext context) throws Exception { + FileUtils.deleteDirectory(temporaryFolder.toFile()); + } + + @Override + public void beforeEach(ExtensionContext context) throws Exception { + assertThat(temporaryFolder).exists().isDirectory(); + this.warehouse = "file:" + temporaryFolder + "/" + UUID.randomUUID(); + this.catalogLoader = + CatalogLoader.hadoop( + "hadoop", + new Configuration(), + ImmutableMap.of(CatalogProperties.WAREHOUSE_LOCATION, warehouse)); + this.catalog = catalogLoader.loadCatalog(); + this.tableLoader = + TableLoader.fromCatalog(catalogLoader, TableIdentifier.of(database, tableName)); + } + + @Override + public void afterEach(ExtensionContext context) throws Exception { + try { + catalog.dropTable(TableIdentifier.of(database, tableName)); + ((HadoopCatalog) catalog).close(); + tableLoader.close(); + } catch (Exception e) { + throw new RuntimeException("Failed to close catalog resource"); + } + } + + public TableLoader tableLoader() { + return tableLoader; + } + + public Catalog catalog() { + return catalog; + } + + public CatalogLoader catalogLoader() { + return catalogLoader; + } + + public String warehouse() { + return warehouse; + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/HadoopTableExtension.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/HadoopTableExtension.java new file mode 100644 index 000000000000..dc6ef400a4a9 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/HadoopTableExtension.java @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
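The HadoopCatalogExtension just shown drives the whole catalog lifecycle through JUnit 5 callbacks: one temporary warehouse per test class, a fresh HadoopCatalog and TableLoader per test method, and table cleanup in afterEach(). A hypothetical registration sketch (not part of this patch; the database and table names are arbitrary, and SimpleDataUtil.SCHEMA refers to the simple id/data schema introduced later in this patch):

package org.apache.iceberg.flink;

import static org.assertj.core.api.Assertions.assertThat;

import org.apache.iceberg.Table;
import org.apache.iceberg.catalog.TableIdentifier;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.RegisterExtension;

class HadoopCatalogExtensionSketch {
  // Static registration so beforeAll()/afterAll() manage the shared warehouse directory.
  @RegisterExtension
  static final HadoopCatalogExtension CATALOG_EXTENSION =
      new HadoopCatalogExtension("default", "sketch_table");

  @Test
  void createTableInFreshCatalog() {
    Table table =
        CATALOG_EXTENSION
            .catalog()
            .createTable(TableIdentifier.of("default", "sketch_table"), SimpleDataUtil.SCHEMA);
    assertThat(table).isNotNull();

    // The pre-built TableLoader points at the same identifier and can be handed to Flink sources/sinks.
    assertThat(CATALOG_EXTENSION.tableLoader()).isNotNull();
  }
}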
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink; + +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Table; +import org.apache.iceberg.catalog.TableIdentifier; +import org.junit.jupiter.api.extension.ExtensionContext; + +public class HadoopTableExtension extends HadoopCatalogExtension { + private final Schema schema; + private final PartitionSpec partitionSpec; + + private Table table; + + public HadoopTableExtension(String database, String tableName, Schema schema) { + this(database, tableName, schema, null); + } + + public HadoopTableExtension( + String database, String tableName, Schema schema, PartitionSpec partitionSpec) { + super(database, tableName); + this.schema = schema; + this.partitionSpec = partitionSpec; + } + + @Override + public void beforeEach(ExtensionContext context) throws Exception { + super.beforeEach(context); + if (partitionSpec == null) { + this.table = catalog.createTable(TableIdentifier.of(database, tableName), schema); + } else { + this.table = + catalog.createTable(TableIdentifier.of(database, tableName), schema, partitionSpec); + } + tableLoader.open(); + } + + public Table table() { + return table; + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/MiniFlinkClusterExtension.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/MiniFlinkClusterExtension.java new file mode 100644 index 000000000000..d2e086aa448e --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/MiniFlinkClusterExtension.java @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink; + +import org.apache.flink.configuration.Configuration; +import org.apache.flink.configuration.CoreOptions; +import org.apache.flink.runtime.testutils.InMemoryReporter; +import org.apache.flink.runtime.testutils.MiniClusterResourceConfiguration; +import org.apache.flink.test.junit5.MiniClusterExtension; + +public class MiniFlinkClusterExtension { + + private static final int DEFAULT_TM_NUM = 1; + private static final int DEFAULT_PARALLELISM = 4; + + public static final Configuration DISABLE_CLASSLOADER_CHECK_CONFIG = + new Configuration() + // disable classloader check as Avro may cache class/object in the serializers. 
+ .set(CoreOptions.CHECK_LEAKED_CLASSLOADER, false); + + private MiniFlinkClusterExtension() {} + + /** + * It will start a mini cluster with classloader.check-leaked-classloader=false, so that we won't + * break the unit tests because of the class loader leak issue. In our iceberg integration tests, + * there're some that will assert the results after finished the flink jobs, so actually we may + * access the class loader that has been closed by the flink task managers if we enable the switch + * classloader.check-leaked-classloader by default. + */ + public static MiniClusterExtension createWithClassloaderCheckDisabled() { + return new MiniClusterExtension( + new MiniClusterResourceConfiguration.Builder() + .setNumberTaskManagers(DEFAULT_TM_NUM) + .setNumberSlotsPerTaskManager(DEFAULT_PARALLELISM) + .setConfiguration(DISABLE_CLASSLOADER_CHECK_CONFIG) + .build()); + } + + public static MiniClusterExtension createWithClassloaderCheckDisabled( + InMemoryReporter inMemoryReporter) { + Configuration configuration = new Configuration(DISABLE_CLASSLOADER_CHECK_CONFIG); + inMemoryReporter.addToConfiguration(configuration); + + return new MiniClusterExtension( + new MiniClusterResourceConfiguration.Builder() + .setNumberTaskManagers(DEFAULT_TM_NUM) + .setNumberSlotsPerTaskManager(DEFAULT_PARALLELISM) + .setConfiguration(configuration) + .build()); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/RowDataConverter.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/RowDataConverter.java new file mode 100644 index 000000000000..e532fb62615c --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/RowDataConverter.java @@ -0,0 +1,135 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
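MiniFlinkClusterExtension is the JUnit 5 counterpart of the removed MiniClusterResource: it hands back a MiniClusterExtension configured with one task manager, four slots, and the leaked-classloader check disabled for the reason spelled out in its Javadoc. A hypothetical test wiring (not part of this patch):

package org.apache.iceberg.flink;

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.test.junit5.MiniClusterExtension;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.RegisterExtension;

class MiniFlinkClusterSketch {
  // All tests in the class share one mini cluster started by the extension.
  @RegisterExtension
  public static final MiniClusterExtension MINI_CLUSTER_EXTENSION =
      MiniFlinkClusterExtension.createWithClassloaderCheckDisabled();

  @Test
  void runsJobOnSharedMiniCluster() throws Exception {
    // With the extension registered, this environment is routed to the shared mini cluster.
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.fromElements(1, 2, 3).print();
    env.execute("mini-cluster-sketch");
  }
}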
+ */ +package org.apache.iceberg.flink; + +import java.math.BigDecimal; +import java.nio.ByteBuffer; +import java.time.Instant; +import java.time.LocalDate; +import java.time.LocalDateTime; +import java.time.LocalTime; +import java.time.OffsetDateTime; +import java.time.ZoneOffset; +import java.time.temporal.ChronoUnit; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.UUID; +import java.util.concurrent.TimeUnit; +import org.apache.flink.table.data.DecimalData; +import org.apache.flink.table.data.GenericArrayData; +import org.apache.flink.table.data.GenericMapData; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.data.TimestampData; +import org.apache.iceberg.Schema; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.types.Type; +import org.apache.iceberg.types.Types; + +public class RowDataConverter { + private static final OffsetDateTime EPOCH = Instant.ofEpochSecond(0).atOffset(ZoneOffset.UTC); + private static final LocalDate EPOCH_DAY = EPOCH.toLocalDate(); + + private RowDataConverter() {} + + public static RowData convert(Schema iSchema, Record record) { + return convert(iSchema.asStruct(), record); + } + + private static RowData convert(Types.StructType struct, Record record) { + GenericRowData rowData = new GenericRowData(struct.fields().size()); + List fields = struct.fields(); + for (int i = 0; i < fields.size(); i += 1) { + Types.NestedField field = fields.get(i); + + Type fieldType = field.type(); + rowData.setField(i, convert(fieldType, record.get(i))); + } + return rowData; + } + + private static Object convert(Type type, Object object) { + if (object == null) { + return null; + } + + switch (type.typeId()) { + case BOOLEAN: + case INTEGER: + case LONG: + case FLOAT: + case DOUBLE: + case FIXED: + return object; + case DATE: + return (int) ChronoUnit.DAYS.between(EPOCH_DAY, (LocalDate) object); + case TIME: + // Iceberg's time is in microseconds, while flink's time is in milliseconds. 
+ LocalTime localTime = (LocalTime) object; + return (int) TimeUnit.NANOSECONDS.toMillis(localTime.toNanoOfDay()); + case TIMESTAMP: + if (((Types.TimestampType) type).shouldAdjustToUTC()) { + return TimestampData.fromInstant(((OffsetDateTime) object).toInstant()); + } else { + return TimestampData.fromLocalDateTime((LocalDateTime) object); + } + case STRING: + return StringData.fromString((String) object); + case UUID: + UUID uuid = (UUID) object; + ByteBuffer bb = ByteBuffer.allocate(16); + bb.putLong(uuid.getMostSignificantBits()); + bb.putLong(uuid.getLeastSignificantBits()); + return bb.array(); + case BINARY: + ByteBuffer buffer = (ByteBuffer) object; + return Arrays.copyOfRange( + buffer.array(), + buffer.arrayOffset() + buffer.position(), + buffer.arrayOffset() + buffer.remaining()); + case DECIMAL: + Types.DecimalType decimalType = (Types.DecimalType) type; + return DecimalData.fromBigDecimal( + (BigDecimal) object, decimalType.precision(), decimalType.scale()); + case STRUCT: + return convert(type.asStructType(), (Record) object); + case LIST: + List list = (List) object; + Object[] convertedArray = new Object[list.size()]; + for (int i = 0; i < convertedArray.length; i++) { + convertedArray[i] = convert(type.asListType().elementType(), list.get(i)); + } + return new GenericArrayData(convertedArray); + case MAP: + Map convertedMap = Maps.newLinkedHashMap(); + Map map = (Map) object; + for (Map.Entry entry : map.entrySet()) { + convertedMap.put( + convert(type.asMapType().keyType(), entry.getKey()), + convert(type.asMapType().valueType(), entry.getValue())); + } + return new GenericMapData(convertedMap); + default: + throw new UnsupportedOperationException("Not a supported type: " + type); + } + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/SimpleDataUtil.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/SimpleDataUtil.java new file mode 100644 index 000000000000..1767f774922a --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/SimpleDataUtil.java @@ -0,0 +1,439 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
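RowDataConverter above performs the Iceberg-to-Flink mapping type by type: dates become epoch days, times are converted from Iceberg's microsecond precision to Flink's milliseconds, strings become StringData, and structs, lists, and maps are converted recursively. A minimal hypothetical call (not from this patch), using a trivial id/data schema:

package org.apache.iceberg.flink;

import org.apache.flink.table.data.RowData;
import org.apache.iceberg.Schema;
import org.apache.iceberg.data.GenericRecord;
import org.apache.iceberg.types.Types;

public class RowDataConverterSketch {
  public static void main(String[] args) {
    Schema schema =
        new Schema(
            Types.NestedField.required(1, "id", Types.IntegerType.get()),
            Types.NestedField.optional(2, "data", Types.StringType.get()));

    GenericRecord record = GenericRecord.create(schema);
    record.setField("id", 1);
    record.setField("data", "hello");

    // The int passes through unchanged; the String is wrapped into Flink's StringData.
    RowData row = RowDataConverter.convert(schema, record);
    System.out.println(row.getInt(0) + " -> " + row.getString(1));
  }
}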
+ */ +package org.apache.iceberg.flink; + +import static org.apache.iceberg.hadoop.HadoopOutputFile.fromPath; +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.IOException; +import java.time.Duration; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; +import org.apache.flink.table.api.DataTypes; +import org.apache.flink.table.api.TableSchema; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.types.RowKind; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.DataFiles; +import org.apache.iceberg.DeleteFile; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.FileScanTask; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Snapshot; +import org.apache.iceberg.SnapshotRef; +import org.apache.iceberg.StructLike; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableScan; +import org.apache.iceberg.data.GenericRecord; +import org.apache.iceberg.data.IcebergGenerics; +import org.apache.iceberg.data.InternalRecordWrapper; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.deletes.EqualityDeleteWriter; +import org.apache.iceberg.deletes.PositionDelete; +import org.apache.iceberg.deletes.PositionDeleteWriter; +import org.apache.iceberg.encryption.EncryptedOutputFile; +import org.apache.iceberg.flink.sink.FlinkAppenderFactory; +import org.apache.iceberg.hadoop.HadoopInputFile; +import org.apache.iceberg.hadoop.HadoopTables; +import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.io.FileAppender; +import org.apache.iceberg.io.FileAppenderFactory; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Iterables; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.util.Pair; +import org.apache.iceberg.util.StructLikeSet; +import org.apache.iceberg.util.StructLikeWrapper; +import org.awaitility.Awaitility; + +public class SimpleDataUtil { + + private SimpleDataUtil() {} + + public static final Schema SCHEMA = + new Schema( + Types.NestedField.optional(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get())); + + public static final TableSchema FLINK_SCHEMA = + TableSchema.builder().field("id", DataTypes.INT()).field("data", DataTypes.STRING()).build(); + + public static final RowType ROW_TYPE = (RowType) FLINK_SCHEMA.toRowDataType().getLogicalType(); + + public static final Record RECORD = GenericRecord.create(SCHEMA); + + public static Table createTable( + String path, Map properties, boolean partitioned) { + PartitionSpec spec; + if (partitioned) { + spec = PartitionSpec.builderFor(SCHEMA).identity("data").build(); + } else { + spec = PartitionSpec.unpartitioned(); + } + return new HadoopTables().create(SCHEMA, spec, properties, path); + } + + public static Record createRecord(Integer id, String data) { + Record record = RECORD.copy(); + 
record.setField("id", id); + record.setField("data", data); + return record; + } + + public static RowData createRowData(Integer id, String data) { + return GenericRowData.of(id, StringData.fromString(data)); + } + + public static RowData createInsert(Integer id, String data) { + return GenericRowData.ofKind(RowKind.INSERT, id, StringData.fromString(data)); + } + + public static RowData createDelete(Integer id, String data) { + return GenericRowData.ofKind(RowKind.DELETE, id, StringData.fromString(data)); + } + + public static RowData createUpdateBefore(Integer id, String data) { + return GenericRowData.ofKind(RowKind.UPDATE_BEFORE, id, StringData.fromString(data)); + } + + public static RowData createUpdateAfter(Integer id, String data) { + return GenericRowData.ofKind(RowKind.UPDATE_AFTER, id, StringData.fromString(data)); + } + + public static DataFile writeFile( + Table table, + Schema schema, + PartitionSpec spec, + Configuration conf, + String location, + String filename, + List rows) + throws IOException { + return writeFile(table, schema, spec, conf, location, filename, rows, null); + } + + /** Write the list of {@link RowData} to the given path and with the given partition data */ + public static DataFile writeFile( + Table table, + Schema schema, + PartitionSpec spec, + Configuration conf, + String location, + String filename, + List rows, + StructLike partition) + throws IOException { + Path path = new Path(location, filename); + FileFormat fileFormat = FileFormat.fromFileName(filename); + Preconditions.checkNotNull(fileFormat, "Cannot determine format for file: %s", filename); + + RowType flinkSchema = FlinkSchemaUtil.convert(schema); + FileAppenderFactory appenderFactory = + new FlinkAppenderFactory( + table, schema, flinkSchema, ImmutableMap.of(), spec, null, null, null); + + FileAppender appender = appenderFactory.newAppender(fromPath(path, conf), fileFormat); + try (FileAppender closeableAppender = appender) { + closeableAppender.addAll(rows); + } + + DataFiles.Builder builder = + DataFiles.builder(spec) + .withInputFile(HadoopInputFile.fromPath(path, conf)) + .withMetrics(appender.metrics()); + + if (partition != null) { + builder = builder.withPartition(partition); + } + + return builder.build(); + } + + public static DeleteFile writeEqDeleteFile( + Table table, + FileFormat format, + String filename, + FileAppenderFactory appenderFactory, + List deletes) + throws IOException { + EncryptedOutputFile outputFile = + table + .encryption() + .encrypt(fromPath(new Path(table.location(), filename), new Configuration())); + + EqualityDeleteWriter eqWriter = + appenderFactory.newEqDeleteWriter(outputFile, format, null); + try (EqualityDeleteWriter writer = eqWriter) { + writer.write(deletes); + } + return eqWriter.toDeleteFile(); + } + + public static DeleteFile writePosDeleteFile( + Table table, + FileFormat format, + String filename, + FileAppenderFactory appenderFactory, + List> positions) + throws IOException { + EncryptedOutputFile outputFile = + table + .encryption() + .encrypt(fromPath(new Path(table.location(), filename), new Configuration())); + + PositionDeleteWriter posWriter = + appenderFactory.newPosDeleteWriter(outputFile, format, null); + PositionDelete posDelete = PositionDelete.create(); + try (PositionDeleteWriter writer = posWriter) { + for (Pair p : positions) { + writer.write(posDelete.set(p.first(), p.second(), null)); + } + } + return posWriter.toDeleteFile(); + } + + private static List convertToRecords(List rows) { + List records = Lists.newArrayList(); + 
for (RowData row : rows) { + Integer id = row.isNullAt(0) ? null : row.getInt(0); + String data = row.isNullAt(1) ? null : row.getString(1).toString(); + records.add(createRecord(id, data)); + } + return records; + } + + public static void assertTableRows(String tablePath, List expected, String branch) + throws IOException { + assertTableRecords(tablePath, convertToRecords(expected), branch); + } + + public static void assertTableRows(Table table, List expected) throws IOException { + assertTableRecords(table, convertToRecords(expected), SnapshotRef.MAIN_BRANCH); + } + + public static void assertTableRows(Table table, List expected, String branch) + throws IOException { + assertTableRecords(table, convertToRecords(expected), branch); + } + + /** Get all rows for a table */ + public static List tableRecords(Table table) throws IOException { + table.refresh(); + List records = Lists.newArrayList(); + try (CloseableIterable iterable = IcebergGenerics.read(table).build()) { + for (Record record : iterable) { + records.add(record); + } + } + return records; + } + + public static boolean equalsRecords(List expected, List actual, Schema schema) { + if (expected.size() != actual.size()) { + return false; + } + Types.StructType type = schema.asStruct(); + StructLikeSet expectedSet = StructLikeSet.create(type); + expectedSet.addAll(expected); + StructLikeSet actualSet = StructLikeSet.create(type); + actualSet.addAll(actual); + return expectedSet.equals(actualSet); + } + + public static void assertRecordsEqual(List expected, List actual, Schema schema) { + assertThat(actual).hasSameSizeAs(expected); + Types.StructType type = schema.asStruct(); + StructLikeSet expectedSet = StructLikeSet.create(type); + expectedSet.addAll(expected); + StructLikeSet actualSet = StructLikeSet.create(type); + actualSet.addAll(actual); + assertThat(actualSet).containsExactlyInAnyOrderElementsOf(expectedSet); + } + + /** + * Assert table contains the expected list of records after waiting up to the configured {@code + * timeout} + */ + public static void assertTableRecords(Table table, List expected, Duration timeout) { + Awaitility.await("expected list of records should be produced") + .atMost(timeout) + .untilAsserted(() -> assertRecordsEqual(expected, tableRecords(table), table.schema())); + } + + public static void assertTableRecords(Table table, List expected) throws IOException { + assertTableRecords(table, expected, SnapshotRef.MAIN_BRANCH); + } + + public static void assertTableRecords(Table table, List expected, String branch) + throws IOException { + table.refresh(); + Snapshot snapshot = latestSnapshot(table, branch); + + if (snapshot == null) { + assertThat(expected).isEmpty(); + return; + } + + Types.StructType type = table.schema().asStruct(); + StructLikeSet expectedSet = StructLikeSet.create(type); + expectedSet.addAll(expected); + + try (CloseableIterable iterable = + IcebergGenerics.read(table).useSnapshot(snapshot.snapshotId()).build()) { + StructLikeSet actualSet = StructLikeSet.create(type); + + for (Record record : iterable) { + actualSet.add(record); + } + + assertThat(actualSet).containsExactlyInAnyOrderElementsOf(expectedSet); + } + } + + // Returns the latest snapshot of the given branch in the table + public static Snapshot latestSnapshot(Table table, String branch) { + // For the main branch, currentSnapshot() is used to validate that the API behavior has + // not changed since that was the API used for validation prior to addition of branches. 
+ if (branch.equals(SnapshotRef.MAIN_BRANCH)) { + return table.currentSnapshot(); + } + + return table.snapshot(branch); + } + + public static void assertTableRecords(String tablePath, List expected) + throws IOException { + Preconditions.checkArgument(expected != null, "expected records shouldn't be null"); + assertTableRecords(new HadoopTables().load(tablePath), expected, SnapshotRef.MAIN_BRANCH); + } + + public static void assertTableRecords(String tablePath, List expected, String branch) + throws IOException { + Preconditions.checkArgument(expected != null, "expected records shouldn't be null"); + assertTableRecords(new HadoopTables().load(tablePath), expected, branch); + } + + public static StructLikeSet expectedRowSet(Table table, Record... records) { + StructLikeSet set = StructLikeSet.create(table.schema().asStruct()); + InternalRecordWrapper wrapper = new InternalRecordWrapper(table.schema().asStruct()); + for (Record record : records) { + set.add(wrapper.copyFor(record)); + } + return set; + } + + public static StructLikeSet actualRowSet(Table table, String... columns) throws IOException { + return actualRowSet(table, null, columns); + } + + public static StructLikeSet actualRowSet(Table table, Long snapshotId, String... columns) + throws IOException { + table.refresh(); + StructLikeSet set = StructLikeSet.create(table.schema().asStruct()); + InternalRecordWrapper wrapper = new InternalRecordWrapper(table.schema().asStruct()); + try (CloseableIterable reader = + IcebergGenerics.read(table) + .useSnapshot(snapshotId == null ? table.currentSnapshot().snapshotId() : snapshotId) + .select(columns) + .build()) { + reader.forEach(record -> set.add(wrapper.copyFor(record))); + } + return set; + } + + public static List partitionDataFiles(Table table, Map partitionValues) + throws IOException { + table.refresh(); + Types.StructType partitionType = table.spec().partitionType(); + + Record partitionRecord = GenericRecord.create(partitionType).copy(partitionValues); + StructLikeWrapper expectedWrapper = + StructLikeWrapper.forType(partitionType).set(partitionRecord); + + List dataFiles = Lists.newArrayList(); + try (CloseableIterable fileScanTasks = table.newScan().planFiles()) { + for (FileScanTask scanTask : fileScanTasks) { + StructLikeWrapper wrapper = + StructLikeWrapper.forType(partitionType).set(scanTask.file().partition()); + + if (expectedWrapper.equals(wrapper)) { + dataFiles.add(scanTask.file()); + } + } + } + + return dataFiles; + } + + public static Map> snapshotToDataFiles(Table table) throws IOException { + table.refresh(); + + Map> result = Maps.newHashMap(); + Snapshot current = table.currentSnapshot(); + while (current != null) { + TableScan tableScan = table.newScan(); + if (current.parentId() != null) { + // Collect the data files that was added only in current snapshot. + tableScan = tableScan.appendsBetween(current.parentId(), current.snapshotId()); + } else { + // Collect the data files that was added in the oldest snapshot. + tableScan = tableScan.useSnapshot(current.snapshotId()); + } + try (CloseableIterable scanTasks = tableScan.planFiles()) { + result.put( + current.snapshotId(), + ImmutableList.copyOf(Iterables.transform(scanTasks, FileScanTask::file))); + } + + // Continue to traverse the parent snapshot if exists. + if (current.parentId() == null) { + break; + } + // Iterate to the parent snapshot. 
+ current = table.snapshot(current.parentId()); + } + return result; + } + + public static List matchingPartitions( + List dataFiles, PartitionSpec partitionSpec, Map partitionValues) { + Types.StructType partitionType = partitionSpec.partitionType(); + Record partitionRecord = GenericRecord.create(partitionType).copy(partitionValues); + StructLikeWrapper expected = StructLikeWrapper.forType(partitionType).set(partitionRecord); + return dataFiles.stream() + .filter( + df -> { + StructLikeWrapper wrapper = + StructLikeWrapper.forType(partitionType).set(df.partition()); + return wrapper.equals(expected); + }) + .collect(Collectors.toList()); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/SqlBase.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/SqlBase.java new file mode 100644 index 000000000000..9411ea4f7d71 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/SqlBase.java @@ -0,0 +1,110 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink; + +import static org.apache.iceberg.flink.FlinkCatalogFactory.DEFAULT_CATALOG_NAME; +import static org.assertj.core.api.Assertions.assertThat; + +import java.util.List; +import java.util.Map; +import org.apache.flink.table.api.TableEnvironment; +import org.apache.flink.table.api.TableResult; +import org.apache.flink.types.Row; +import org.apache.flink.util.CloseableIterator; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; + +public abstract class SqlBase { + protected abstract TableEnvironment getTableEnv(); + + protected static TableResult exec(TableEnvironment env, String query, Object... args) { + return env.executeSql(String.format(query, args)); + } + + protected TableResult exec(String query, Object... args) { + return exec(getTableEnv(), query, args); + } + + protected List sql(String query, Object... args) { + TableResult tableResult = exec(query, args); + try (CloseableIterator iter = tableResult.collect()) { + return Lists.newArrayList(iter); + } catch (Exception e) { + throw new RuntimeException("Failed to collect table result", e); + } + } + + protected void assertSameElements(Iterable expected, Iterable actual) { + assertThat(actual).isNotNull().containsExactlyInAnyOrderElementsOf(expected); + } + + protected void assertSameElements(String message, Iterable expected, Iterable actual) { + assertThat(actual).isNotNull().as(message).containsExactlyInAnyOrderElementsOf(expected); + } + + /** + * We can not drop currently used catalog after FLINK-29677, so we have make sure that we do not + * use the current catalog before dropping it. This method switches to the 'default_catalog' and + * drops the one requested. 
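+ * For example, {@code dropCatalog("test_catalog", true)} issues {@code USE CATALOG default_catalog}
+ * followed by {@code DROP CATALOG IF EXISTS test_catalog} (catalog name shown here for illustration only).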
+ * + * @param catalogName The catalog to drop + * @param ifExists If we should use the 'IF EXISTS' when dropping the catalog + */ + protected void dropCatalog(String catalogName, boolean ifExists) { + sql("USE CATALOG %s", DEFAULT_CATALOG_NAME); + sql("DROP CATALOG %s %s", ifExists ? "IF EXISTS" : "", catalogName); + } + + /** + * We can not drop currently used database after FLINK-33226, so we have make sure that we do not + * use the current database before dropping it. This method switches to the default database in + * the default catalog, and then it and drops the one requested. + * + * @param database The database to drop + * @param ifExists If we should use the 'IF EXISTS' when dropping the database + */ + protected void dropDatabase(String database, boolean ifExists) { + String currentCatalog = getTableEnv().getCurrentCatalog(); + sql("USE CATALOG %s", DEFAULT_CATALOG_NAME); + sql("USE %s", getTableEnv().listDatabases()[0]); + sql("USE CATALOG %s", currentCatalog); + sql("DROP DATABASE %s %s", ifExists ? "IF EXISTS" : "", database); + } + + protected static String toWithClause(Map props) { + StringBuilder builder = new StringBuilder(); + builder.append("("); + int propCount = 0; + for (Map.Entry entry : props.entrySet()) { + if (propCount > 0) { + builder.append(","); + } + builder + .append("'") + .append(entry.getKey()) + .append("'") + .append("=") + .append("'") + .append(entry.getValue()) + .append("'"); + propCount++; + } + builder.append(")"); + return builder.toString(); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestBase.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestBase.java new file mode 100644 index 000000000000..401960c3591b --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestBase.java @@ -0,0 +1,145 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink; + +import static org.apache.iceberg.flink.FlinkCatalogFactory.DEFAULT_CATALOG_NAME; +import static org.assertj.core.api.Assertions.assertThat; + +import java.nio.file.Path; +import java.util.List; +import org.apache.flink.table.api.EnvironmentSettings; +import org.apache.flink.table.api.TableEnvironment; +import org.apache.flink.table.api.TableResult; +import org.apache.flink.test.junit5.MiniClusterExtension; +import org.apache.flink.types.Row; +import org.apache.flink.util.CloseableIterator; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.iceberg.CatalogUtil; +import org.apache.iceberg.hive.HiveCatalog; +import org.apache.iceberg.hive.TestHiveMetastore; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.extension.RegisterExtension; +import org.junit.jupiter.api.io.TempDir; + +public abstract class TestBase extends SqlBase { + + @RegisterExtension + public static MiniClusterExtension miniClusterExtension = + MiniFlinkClusterExtension.createWithClassloaderCheckDisabled(); + + @TempDir protected Path temporaryDirectory; + + private static TestHiveMetastore metastore = null; + protected static HiveConf hiveConf = null; + protected static HiveCatalog catalog = null; + + private volatile TableEnvironment tEnv = null; + + @BeforeAll + public static void startMetastore() { + TestBase.metastore = new TestHiveMetastore(); + metastore.start(); + TestBase.hiveConf = metastore.hiveConf(); + TestBase.catalog = + (HiveCatalog) + CatalogUtil.loadCatalog( + HiveCatalog.class.getName(), "hive", ImmutableMap.of(), hiveConf); + } + + @AfterAll + public static void stopMetastore() throws Exception { + metastore.stop(); + TestBase.catalog = null; + } + + @Override + protected TableEnvironment getTableEnv() { + if (tEnv == null) { + synchronized (this) { + if (tEnv == null) { + EnvironmentSettings settings = EnvironmentSettings.newInstance().inBatchMode().build(); + + TableEnvironment env = TableEnvironment.create(settings); + env.getConfig() + .getConfiguration() + .set(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM, false); + tEnv = env; + } + } + } + return tEnv; + } + + protected static TableResult exec(TableEnvironment env, String query, Object... args) { + return env.executeSql(String.format(query, args)); + } + + protected TableResult exec(String query, Object... args) { + return exec(getTableEnv(), query, args); + } + + protected List sql(String query, Object... args) { + TableResult tableResult = exec(query, args); + try (CloseableIterator iter = tableResult.collect()) { + return Lists.newArrayList(iter); + } catch (Exception e) { + throw new RuntimeException("Failed to collect table result", e); + } + } + + protected void assertSameElements(Iterable expected, Iterable actual) { + assertThat(actual).isNotNull().containsExactlyInAnyOrderElementsOf(expected); + } + + protected void assertSameElements(String message, Iterable expected, Iterable actual) { + assertThat(actual).isNotNull().as(message).containsExactlyInAnyOrderElementsOf(expected); + } + + /** + * We can not drop currently used catalog after FLINK-29677, so we have make sure that we do not + * use the current catalog before dropping it. This method switches to the 'default_catalog' and + * drops the one requested. 
+ * + * @param catalogName The catalog to drop + * @param ifExists If we should use the 'IF EXISTS' when dropping the catalog + */ + protected void dropCatalog(String catalogName, boolean ifExists) { + sql("USE CATALOG %s", DEFAULT_CATALOG_NAME); + sql("DROP CATALOG %s %s", ifExists ? "IF EXISTS" : "", catalogName); + } + + /** + * We can not drop currently used database after FLINK-33226, so we have make sure that we do not + * use the current database before dropping it. This method switches to the default database in + * the default catalog, and then it and drops the one requested. + * + * @param database The database to drop + * @param ifExists If we should use the 'IF EXISTS' when dropping the database + */ + protected void dropDatabase(String database, boolean ifExists) { + String currentCatalog = getTableEnv().getCurrentCatalog(); + sql("USE CATALOG %s", DEFAULT_CATALOG_NAME); + sql("USE %s", getTableEnv().listDatabases()[0]); + sql("USE CATALOG %s", currentCatalog); + sql("DROP DATABASE %s %s", ifExists ? "IF EXISTS" : "", database); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestCatalogLoader.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestCatalogLoader.java new file mode 100644 index 000000000000..e8f65921c19a --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestCatalogLoader.java @@ -0,0 +1,116 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink; + +import static org.apache.iceberg.CatalogProperties.URI; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.entry; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.IOException; +import java.io.ObjectInputStream; +import java.io.ObjectOutputStream; +import java.util.Map; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.iceberg.CatalogProperties; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Table; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.hadoop.HadoopFileIO; +import org.apache.iceberg.io.FileIO; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; + +/** Test for {@link CatalogLoader}. 
*/ +public class TestCatalogLoader extends TestBase { + + private static File warehouse = null; + private static final TableIdentifier IDENTIFIER = TableIdentifier.of("default", "my_table"); + private static final Schema SCHEMA = + new Schema(Types.NestedField.required(1, "f1", Types.StringType.get())); + + @BeforeAll + public static void createWarehouse() throws IOException { + warehouse = File.createTempFile("warehouse", null); + assertThat(warehouse.delete()).isTrue(); + hiveConf.set("my_key", "my_value"); + } + + @AfterAll + public static void dropWarehouse() throws IOException { + if (warehouse != null && warehouse.exists()) { + Path warehousePath = new Path(warehouse.getAbsolutePath()); + FileSystem fs = warehousePath.getFileSystem(hiveConf); + assertThat(fs.delete(warehousePath, true)).as("Failed to delete " + warehousePath).isTrue(); + } + } + + @Test + public void testHadoopCatalogLoader() throws IOException, ClassNotFoundException { + Map properties = Maps.newHashMap(); + properties.put(CatalogProperties.WAREHOUSE_LOCATION, "file:" + warehouse); + CatalogLoader loader = CatalogLoader.hadoop("my_catalog", hiveConf, properties); + validateCatalogLoader(loader); + } + + @Test + public void testHiveCatalogLoader() throws IOException, ClassNotFoundException { + CatalogLoader loader = CatalogLoader.hive("my_catalog", hiveConf, Maps.newHashMap()); + validateCatalogLoader(loader); + } + + @Test + public void testRESTCatalogLoader() { + Map properties = Maps.newHashMap(); + properties.put(URI, "http://localhost/"); + CatalogLoader.rest("my_catalog", hiveConf, Maps.newHashMap()); + } + + private static void validateCatalogLoader(CatalogLoader loader) + throws IOException, ClassNotFoundException { + Table table = javaSerAndDeSer(loader).loadCatalog().createTable(IDENTIFIER, SCHEMA); + validateHadoopConf(table); + } + + private static void validateHadoopConf(Table table) { + FileIO io = table.io(); + assertThat(io).as("FileIO should be a HadoopFileIO").isInstanceOf(HadoopFileIO.class); + HadoopFileIO hadoopIO = (HadoopFileIO) io; + assertThat(hadoopIO.conf()).contains(entry("my_key", "my_value")); + } + + @SuppressWarnings("unchecked") + private static T javaSerAndDeSer(T object) throws IOException, ClassNotFoundException { + ByteArrayOutputStream bytes = new ByteArrayOutputStream(); + try (ObjectOutputStream out = new ObjectOutputStream(bytes)) { + out.writeObject(object); + } + + try (ObjectInputStream in = + new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) { + return (T) in.readObject(); + } + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestCatalogTableLoader.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestCatalogTableLoader.java new file mode 100644 index 000000000000..f719c7bc0001 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestCatalogTableLoader.java @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.IOException; +import java.io.ObjectInputStream; +import java.io.ObjectOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Table; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.hadoop.HadoopFileIO; +import org.apache.iceberg.hadoop.HadoopTables; +import org.apache.iceberg.io.FileIO; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; + +/** Test for {@link TableLoader}. */ +public class TestCatalogTableLoader extends TestBase { + + private static File warehouse = null; + private static final TableIdentifier IDENTIFIER = TableIdentifier.of("default", "my_table"); + private static final Schema SCHEMA = + new Schema(Types.NestedField.required(1, "f1", Types.StringType.get())); + + @BeforeAll + public static void createWarehouse() throws IOException { + warehouse = File.createTempFile("warehouse", null); + assertThat(warehouse.delete()).isTrue(); + hiveConf.set("my_key", "my_value"); + } + + @AfterAll + public static void dropWarehouse() throws IOException { + if (warehouse != null && warehouse.exists()) { + Path warehousePath = new Path(warehouse.getAbsolutePath()); + FileSystem fs = warehousePath.getFileSystem(hiveConf); + assertThat(fs.delete(warehousePath, true)).as("Failed to delete " + warehousePath).isTrue(); + } + } + + @Test + public void testHadoopTableLoader() throws IOException, ClassNotFoundException { + String location = "file:" + warehouse + "/my_table"; + new HadoopTables(hiveConf).create(SCHEMA, location); + validateTableLoader(TableLoader.fromHadoopTable(location, hiveConf)); + } + + @Test + public void testHiveCatalogTableLoader() throws IOException, ClassNotFoundException { + CatalogLoader loader = CatalogLoader.hive("my_catalog", hiveConf, Maps.newHashMap()); + javaSerdes(loader).loadCatalog().createTable(IDENTIFIER, SCHEMA); + + CatalogLoader catalogLoader = CatalogLoader.hive("my_catalog", hiveConf, Maps.newHashMap()); + validateTableLoader(TableLoader.fromCatalog(catalogLoader, IDENTIFIER)); + } + + private static void validateTableLoader(TableLoader loader) + throws IOException, ClassNotFoundException { + TableLoader copied = javaSerdes(loader); + copied.open(); + try { + validateHadoopConf(copied.loadTable()); + } finally { + copied.close(); + } + } + + private static void validateHadoopConf(Table table) { + FileIO io = table.io(); + assertThat(io).as("FileIO should be a HadoopFileIO").isInstanceOf(HadoopFileIO.class); + HadoopFileIO hadoopIO = (HadoopFileIO) io; + assertThat(hadoopIO.conf().get("my_key")).isEqualTo("my_value"); + } + + @SuppressWarnings("unchecked") + private static T javaSerdes(T object) throws IOException, 
ClassNotFoundException { + ByteArrayOutputStream bytes = new ByteArrayOutputStream(); + try (ObjectOutputStream out = new ObjectOutputStream(bytes)) { + out.writeObject(object); + } + + try (ObjectInputStream in = + new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) { + return (T) in.readObject(); + } + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestChangeLogTable.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestChangeLogTable.java new file mode 100644 index 000000000000..1997ef6998a2 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestChangeLogTable.java @@ -0,0 +1,296 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.File; +import java.io.IOException; +import java.util.List; +import org.apache.flink.types.Row; +import org.apache.hadoop.conf.Configuration; +import org.apache.iceberg.BaseTable; +import org.apache.iceberg.CatalogProperties; +import org.apache.iceberg.Parameter; +import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.Snapshot; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableMetadata; +import org.apache.iceberg.TableOperations; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.source.BoundedTableFactory; +import org.apache.iceberg.flink.source.ChangeLogTableTestBase; +import org.apache.iceberg.relocated.com.google.common.base.Joiner; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.util.StructLikeSet; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.extension.ExtendWith; + +/** + * In this test case, we mainly cover the impact of primary key selection, multiple operations + * within a single transaction, and multiple operations between different txn on the correctness of + * the data. 
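+ *
+ * <p>For example (see {@code testSqlChangeLogOnIdKey} below), with primary key {@code id} a checkpoint
+ * containing +I(1,"aaa"), -D(1,"aaa"), +I(1,"bbb"), +I(2,"aaa"), -D(2,"aaa"), +I(2,"bbb") is expected to
+ * produce a snapshot whose rows are (1,"bbb") and (2,"bbb").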
+ */ +@ExtendWith(ParameterizedTestExtension.class) +public class TestChangeLogTable extends ChangeLogTableTestBase { + private static final Configuration CONF = new Configuration(); + private static final String SOURCE_TABLE = "default_catalog.default_database.source_change_logs"; + + private static final String CATALOG_NAME = "test_catalog"; + private static final String DATABASE_NAME = "test_db"; + private static final String TABLE_NAME = "test_table"; + private String warehouse; + + @Parameter private boolean partitioned; + + @Parameters(name = "PartitionedTable={0}") + public static Iterable parameters() { + return ImmutableList.of(new Object[] {true}, new Object[] {false}); + } + + @BeforeEach + public void before() throws IOException { + File warehouseFile = File.createTempFile("junit", null, temporaryDirectory.toFile()); + assertThat(warehouseFile.delete()).isTrue(); + warehouse = String.format("file:%s", warehouseFile); + + sql( + "CREATE CATALOG %s WITH ('type'='iceberg', 'catalog-type'='hadoop', 'warehouse'='%s')", + CATALOG_NAME, warehouse); + sql("USE CATALOG %s", CATALOG_NAME); + sql("CREATE DATABASE %s", DATABASE_NAME); + sql("USE %s", DATABASE_NAME); + // Set the table.exec.sink.upsert-materialize=NONE, so that downstream operators will receive + // the + // records with the same order as the source operator, bypassing Flink's inferred shuffle. + getTableEnv().getConfig().set("table.exec.sink.upsert-materialize", "NONE"); + } + + @AfterEach + @Override + public void clean() { + sql("DROP TABLE IF EXISTS %s", TABLE_NAME); + dropDatabase(DATABASE_NAME, true); + dropCatalog(CATALOG_NAME, true); + BoundedTableFactory.clearDataSets(); + } + + @TestTemplate + public void testSqlChangeLogOnIdKey() throws Exception { + List> inputRowsPerCheckpoint = + ImmutableList.of( + ImmutableList.of( + insertRow(1, "aaa"), + deleteRow(1, "aaa"), + insertRow(1, "bbb"), + insertRow(2, "aaa"), + deleteRow(2, "aaa"), + insertRow(2, "bbb")), + ImmutableList.of( + updateBeforeRow(2, "bbb"), + updateAfterRow(2, "ccc"), + deleteRow(2, "ccc"), + insertRow(2, "ddd")), + ImmutableList.of( + deleteRow(1, "bbb"), + insertRow(1, "ccc"), + deleteRow(1, "ccc"), + insertRow(1, "ddd"))); + + List> expectedRecordsPerCheckpoint = + ImmutableList.of( + ImmutableList.of(insertRow(1, "bbb"), insertRow(2, "bbb")), + ImmutableList.of(insertRow(1, "bbb"), insertRow(2, "ddd")), + ImmutableList.of(insertRow(1, "ddd"), insertRow(2, "ddd"))); + + testSqlChangeLog( + TABLE_NAME, ImmutableList.of("id"), inputRowsPerCheckpoint, expectedRecordsPerCheckpoint); + } + + @TestTemplate + public void testChangeLogOnDataKey() throws Exception { + List> elementsPerCheckpoint = + ImmutableList.of( + ImmutableList.of( + insertRow(1, "aaa"), + deleteRow(1, "aaa"), + insertRow(2, "bbb"), + insertRow(1, "bbb"), + insertRow(2, "aaa")), + ImmutableList.of( + updateBeforeRow(2, "aaa"), updateAfterRow(1, "ccc"), insertRow(1, "aaa")), + ImmutableList.of(deleteRow(1, "bbb"), insertRow(2, "aaa"), insertRow(2, "ccc"))); + + List> expectedRecords = + ImmutableList.of( + ImmutableList.of(insertRow(1, "bbb"), insertRow(2, "aaa")), + ImmutableList.of(insertRow(1, "aaa"), insertRow(1, "bbb"), insertRow(1, "ccc")), + ImmutableList.of( + insertRow(1, "aaa"), + insertRow(1, "ccc"), + insertRow(2, "aaa"), + insertRow(2, "ccc"))); + + testSqlChangeLog(TABLE_NAME, ImmutableList.of("data"), elementsPerCheckpoint, expectedRecords); + } + + @TestTemplate + public void testChangeLogOnIdDataKey() throws Exception { + List> elementsPerCheckpoint = + 
ImmutableList.of( + ImmutableList.of( + insertRow(1, "aaa"), + deleteRow(1, "aaa"), + insertRow(2, "bbb"), + insertRow(1, "bbb"), + insertRow(2, "aaa")), + ImmutableList.of( + updateBeforeRow(2, "aaa"), updateAfterRow(1, "ccc"), insertRow(1, "aaa")), + ImmutableList.of(deleteRow(1, "bbb"), insertRow(2, "aaa"))); + + List> expectedRecords = + ImmutableList.of( + ImmutableList.of(insertRow(1, "bbb"), insertRow(2, "aaa"), insertRow(2, "bbb")), + ImmutableList.of( + insertRow(1, "aaa"), insertRow(1, "bbb"), insertRow(1, "ccc"), insertRow(2, "bbb")), + ImmutableList.of( + insertRow(1, "aaa"), + insertRow(1, "ccc"), + insertRow(2, "aaa"), + insertRow(2, "bbb"))); + + testSqlChangeLog( + TABLE_NAME, ImmutableList.of("data", "id"), elementsPerCheckpoint, expectedRecords); + } + + @TestTemplate + public void testPureInsertOnIdKey() throws Exception { + List> elementsPerCheckpoint = + ImmutableList.of( + ImmutableList.of(insertRow(1, "aaa"), insertRow(2, "bbb")), + ImmutableList.of(insertRow(3, "ccc"), insertRow(4, "ddd")), + ImmutableList.of(insertRow(5, "eee"), insertRow(6, "fff"))); + + List> expectedRecords = + ImmutableList.of( + ImmutableList.of(insertRow(1, "aaa"), insertRow(2, "bbb")), + ImmutableList.of( + insertRow(1, "aaa"), insertRow(2, "bbb"), insertRow(3, "ccc"), insertRow(4, "ddd")), + ImmutableList.of( + insertRow(1, "aaa"), + insertRow(2, "bbb"), + insertRow(3, "ccc"), + insertRow(4, "ddd"), + insertRow(5, "eee"), + insertRow(6, "fff"))); + + testSqlChangeLog(TABLE_NAME, ImmutableList.of("data"), elementsPerCheckpoint, expectedRecords); + } + + private static Record record(int id, String data) { + return SimpleDataUtil.createRecord(id, data); + } + + private Table createTable(String tableName, List key, boolean isPartitioned) { + String partitionByCause = isPartitioned ? "PARTITIONED BY (data)" : ""; + sql( + "CREATE TABLE %s(id INT, data VARCHAR, PRIMARY KEY(%s) NOT ENFORCED) %s", + tableName, Joiner.on(',').join(key), partitionByCause); + + // Upgrade the iceberg table to format v2. 
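+ // (Format v2 is required here because the change-log writes emit row-level delete files for
+ // DELETE/UPDATE_BEFORE rows; v1 tables only support rewriting whole data files.)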
+ CatalogLoader loader = + CatalogLoader.hadoop( + "my_catalog", CONF, ImmutableMap.of(CatalogProperties.WAREHOUSE_LOCATION, warehouse)); + Table table = loader.loadCatalog().loadTable(TableIdentifier.of(DATABASE_NAME, TABLE_NAME)); + TableOperations ops = ((BaseTable) table).operations(); + TableMetadata meta = ops.current(); + ops.commit(meta, meta.upgradeToFormatVersion(2)); + + return table; + } + + private void testSqlChangeLog( + String tableName, + List key, + List> inputRowsPerCheckpoint, + List> expectedRecordsPerCheckpoint) + throws Exception { + String dataId = BoundedTableFactory.registerDataSet(inputRowsPerCheckpoint); + sql( + "CREATE TABLE %s(id INT NOT NULL, data STRING NOT NULL)" + + " WITH ('connector'='BoundedSource', 'data-id'='%s')", + SOURCE_TABLE, dataId); + + assertThat(sql("SELECT * FROM %s", SOURCE_TABLE)).isEqualTo(listJoin(inputRowsPerCheckpoint)); + + Table table = createTable(tableName, key, partitioned); + sql("INSERT INTO %s SELECT * FROM %s", tableName, SOURCE_TABLE); + + table.refresh(); + List snapshots = findValidSnapshots(table); + int expectedSnapshotNum = expectedRecordsPerCheckpoint.size(); + assertThat(snapshots) + .as("Should have the expected snapshot number") + .hasSameSizeAs(expectedRecordsPerCheckpoint); + + for (int i = 0; i < expectedSnapshotNum; i++) { + long snapshotId = snapshots.get(i).snapshotId(); + List expectedRows = expectedRecordsPerCheckpoint.get(i); + assertThat(actualRowSet(table, snapshotId)) + .as("Should have the expected records for the checkpoint#" + i) + .isEqualTo(expectedRowSet(table, expectedRows)); + } + + if (expectedSnapshotNum > 0) { + assertThat(sql("SELECT * FROM %s", tableName)) + .as("Should have the expected rows in the final table") + .containsExactlyInAnyOrderElementsOf( + expectedRecordsPerCheckpoint.get(expectedSnapshotNum - 1)); + } + } + + private List findValidSnapshots(Table table) { + List validSnapshots = Lists.newArrayList(); + for (Snapshot snapshot : table.snapshots()) { + if (snapshot.allManifests(table.io()).stream() + .anyMatch(m -> snapshot.snapshotId() == m.snapshotId())) { + validSnapshots.add(snapshot); + } + } + return validSnapshots; + } + + private static StructLikeSet expectedRowSet(Table table, List rows) { + Record[] records = new Record[rows.size()]; + for (int i = 0; i < records.length; i++) { + records[i] = record((int) rows.get(i).getField(0), (String) rows.get(i).getField(1)); + } + return SimpleDataUtil.expectedRowSet(table, records); + } + + private static StructLikeSet actualRowSet(Table table, long snapshotId) throws IOException { + return SimpleDataUtil.actualRowSet(table, snapshotId, "*"); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestDataFileSerialization.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestDataFileSerialization.java new file mode 100644 index 000000000000..8992cbd75187 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestDataFileSerialization.java @@ -0,0 +1,203 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink; + +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.ObjectInputStream; +import java.io.ObjectOutputStream; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.util.Map; +import org.apache.flink.api.common.ExecutionConfig; +import org.apache.flink.api.java.typeutils.runtime.kryo.KryoSerializer; +import org.apache.flink.core.memory.DataInputDeserializer; +import org.apache.flink.core.memory.DataOutputSerializer; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.DataFiles; +import org.apache.iceberg.DeleteFile; +import org.apache.iceberg.FileMetadata; +import org.apache.iceberg.Metrics; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SortOrder; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.Test; + +public class TestDataFileSerialization { + + private static final Schema DATE_SCHEMA = + new Schema( + required(1, "id", Types.LongType.get()), + optional(2, "data", Types.StringType.get()), + required(3, "date", Types.StringType.get()), + optional(4, "double", Types.DoubleType.get())); + + private static final PartitionSpec PARTITION_SPEC = + PartitionSpec.builderFor(DATE_SCHEMA).identity("date").build(); + + private static final Map COLUMN_SIZES = Maps.newHashMap(); + private static final Map VALUE_COUNTS = Maps.newHashMap(); + private static final Map NULL_VALUE_COUNTS = Maps.newHashMap(); + private static final Map NAN_VALUE_COUNTS = Maps.newHashMap(); + private static final Map LOWER_BOUNDS = Maps.newHashMap(); + private static final Map UPPER_BOUNDS = Maps.newHashMap(); + + static { + COLUMN_SIZES.put(1, 2L); + COLUMN_SIZES.put(2, 3L); + VALUE_COUNTS.put(1, 5L); + VALUE_COUNTS.put(2, 3L); + VALUE_COUNTS.put(4, 2L); + NULL_VALUE_COUNTS.put(1, 0L); + NULL_VALUE_COUNTS.put(2, 2L); + NAN_VALUE_COUNTS.put(4, 1L); + LOWER_BOUNDS.put(1, longToBuffer(0L)); + UPPER_BOUNDS.put(1, longToBuffer(4L)); + } + + private static final Metrics METRICS = + new Metrics( + 5L, null, VALUE_COUNTS, NULL_VALUE_COUNTS, NAN_VALUE_COUNTS, LOWER_BOUNDS, UPPER_BOUNDS); + + private static final DataFile DATA_FILE = + DataFiles.builder(PARTITION_SPEC) + .withPath("/path/to/data-1.parquet") + .withFileSizeInBytes(1234) + .withPartitionPath("date=2018-06-08") + .withMetrics(METRICS) + .withSplitOffsets(ImmutableList.of(4L)) + .withEncryptionKeyMetadata(ByteBuffer.allocate(4).putInt(34)) + .withSortOrder(SortOrder.unsorted()) + .build(); + + private static final DeleteFile POS_DELETE_FILE = + FileMetadata.deleteFileBuilder(PARTITION_SPEC) + .ofPositionDeletes() + .withPath("/path/to/pos-delete.parquet") + .withFileSizeInBytes(10) + 
.withPartitionPath("date=2018-06-08") + .withMetrics(METRICS) + .withEncryptionKeyMetadata(ByteBuffer.allocate(4).putInt(35)) + .withRecordCount(23) + .build(); + + private static final DeleteFile EQ_DELETE_FILE = + FileMetadata.deleteFileBuilder(PARTITION_SPEC) + .ofEqualityDeletes(2, 3) + .withPath("/path/to/equality-delete.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("date=2018-06-08") + .withMetrics(METRICS) + .withEncryptionKeyMetadata(ByteBuffer.allocate(4).putInt(35)) + .withRecordCount(23) + .withSortOrder(SortOrder.unsorted()) + .build(); + + @Test + public void testJavaSerialization() throws Exception { + ByteArrayOutputStream bytes = new ByteArrayOutputStream(); + try (ObjectOutputStream out = new ObjectOutputStream(bytes)) { + out.writeObject(DATA_FILE); + out.writeObject(DATA_FILE.copy()); + + out.writeObject(POS_DELETE_FILE); + out.writeObject(POS_DELETE_FILE.copy()); + + out.writeObject(EQ_DELETE_FILE); + out.writeObject(EQ_DELETE_FILE.copy()); + } + + try (ObjectInputStream in = + new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) { + for (int i = 0; i < 2; i += 1) { + Object obj = in.readObject(); + assertThat(obj).as("Should be a DataFile").isInstanceOf(DataFile.class); + TestHelpers.assertEquals(DATA_FILE, (DataFile) obj); + } + + for (int i = 0; i < 2; i += 1) { + Object obj = in.readObject(); + assertThat(obj).as("Should be a position DeleteFile").isInstanceOf(DeleteFile.class); + TestHelpers.assertEquals(POS_DELETE_FILE, (DeleteFile) obj); + } + + for (int i = 0; i < 2; i += 1) { + Object obj = in.readObject(); + assertThat(obj).as("Should be a equality DeleteFile").isInstanceOf(DeleteFile.class); + TestHelpers.assertEquals(EQ_DELETE_FILE, (DeleteFile) obj); + } + } + } + + @Test + public void testDataFileKryoSerialization() throws IOException { + KryoSerializer kryo = new KryoSerializer<>(DataFile.class, new ExecutionConfig()); + + DataOutputSerializer outputView = new DataOutputSerializer(1024); + + kryo.serialize(DATA_FILE, outputView); + kryo.serialize(DATA_FILE.copy(), outputView); + + DataInputDeserializer inputView = new DataInputDeserializer(outputView.getCopyOfBuffer()); + DataFile dataFile1 = kryo.deserialize(inputView); + DataFile dataFile2 = kryo.deserialize(inputView); + + TestHelpers.assertEquals(DATA_FILE, dataFile1); + TestHelpers.assertEquals(DATA_FILE, dataFile2); + } + + @Test + public void testDeleteFileKryoSerialization() throws IOException { + KryoSerializer kryo = new KryoSerializer<>(DeleteFile.class, new ExecutionConfig()); + + DataOutputSerializer outputView = new DataOutputSerializer(1024); + + kryo.serialize(POS_DELETE_FILE, outputView); + kryo.serialize(POS_DELETE_FILE.copy(), outputView); + + kryo.serialize(EQ_DELETE_FILE, outputView); + kryo.serialize(EQ_DELETE_FILE.copy(), outputView); + + DataInputDeserializer inputView = new DataInputDeserializer(outputView.getCopyOfBuffer()); + + DeleteFile posDeleteFile1 = kryo.deserialize(inputView); + DeleteFile posDeleteFile2 = kryo.deserialize(inputView); + + TestHelpers.assertEquals(POS_DELETE_FILE, posDeleteFile1); + TestHelpers.assertEquals(POS_DELETE_FILE, posDeleteFile2); + + DeleteFile eqDeleteFile1 = kryo.deserialize(inputView); + DeleteFile eqDeleteFile2 = kryo.deserialize(inputView); + + TestHelpers.assertEquals(EQ_DELETE_FILE, eqDeleteFile1); + TestHelpers.assertEquals(EQ_DELETE_FILE, eqDeleteFile2); + } + + private static ByteBuffer longToBuffer(long value) { + return ByteBuffer.allocate(8).order(ByteOrder.LITTLE_ENDIAN).putLong(0, value); + } +} diff 
--git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFixtures.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFixtures.java new file mode 100644 index 000000000000..b9a7d5b1d589 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFixtures.java @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink; + +import static org.apache.iceberg.types.Types.NestedField.required; + +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.types.Types; + +public class TestFixtures { + + private TestFixtures() {} + + public static final Schema SCHEMA = + new Schema( + required(1, "data", Types.StringType.get()), + required(2, "id", Types.LongType.get()), + required(3, "dt", Types.StringType.get())); + + public static final PartitionSpec SPEC = + PartitionSpec.builderFor(SCHEMA).identity("dt").bucket("id", 1).build(); + + public static final RowType ROW_TYPE = FlinkSchemaUtil.convert(SCHEMA); + + public static final String DATABASE = "default"; + public static final String TABLE = "t"; + public static final String SINK_TABLE = "t_sink"; + + public static final TableIdentifier TABLE_IDENTIFIER = TableIdentifier.of(DATABASE, TABLE); + public static final TableIdentifier SINK_TABLE_IDENTIFIER = + TableIdentifier.of(DATABASE, SINK_TABLE); + + public static final Schema TS_SCHEMA = + new Schema( + required(1, "ts", Types.TimestampType.withoutZone()), + required(2, "str", Types.StringType.get())); + + public static final PartitionSpec TS_SPEC = + PartitionSpec.builderFor(TS_SCHEMA).hour("ts").build(); + + public static final RowType TS_ROW_TYPE = FlinkSchemaUtil.convert(TS_SCHEMA); +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkAnonymousTable.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkAnonymousTable.java new file mode 100644 index 000000000000..70c8043f8fbb --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkAnonymousTable.java @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.File; +import java.nio.file.Files; +import java.util.concurrent.TimeUnit; +import org.apache.flink.table.api.DataTypes; +import org.apache.flink.table.api.Schema; +import org.apache.flink.table.api.Table; +import org.apache.flink.table.api.TableDescriptor; +import org.apache.flink.table.api.TableEnvironment; +import org.awaitility.Awaitility; +import org.junit.jupiter.api.Test; + +public class TestFlinkAnonymousTable extends TestBase { + + @Test + public void testWriteAnonymousTable() throws Exception { + File warehouseDir = Files.createTempDirectory(temporaryDirectory, "junit").toFile(); + TableEnvironment tEnv = getTableEnv(); + Table table = + tEnv.from( + TableDescriptor.forConnector("datagen") + .schema(Schema.newBuilder().column("f0", DataTypes.STRING()).build()) + .option("number-of-rows", "3") + .build()); + + TableDescriptor descriptor = + TableDescriptor.forConnector("iceberg") + .schema(Schema.newBuilder().column("f0", DataTypes.STRING()).build()) + .option("catalog-name", "hadoop_test") + .option("catalog-type", "hadoop") + .option("catalog-database", "test_db") + .option("catalog-table", "test") + .option("warehouse", warehouseDir.getAbsolutePath()) + .build(); + + table.insertInto(descriptor).execute(); + Awaitility.await() + .atMost(3, TimeUnit.SECONDS) + .untilAsserted( + () -> + assertThat(warehouseDir.toPath().resolve("test_db").resolve("test").toFile()) + .exists()); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogDatabase.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogDatabase.java new file mode 100644 index 000000000000..bd07087756ad --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogDatabase.java @@ -0,0 +1,253 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; +import static org.assertj.core.api.Assumptions.assumeThat; + +import java.nio.file.Path; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import org.apache.flink.table.catalog.exceptions.DatabaseNotEmptyException; +import org.apache.flink.types.Row; +import org.apache.iceberg.Schema; +import org.apache.iceberg.catalog.Namespace; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.TestTemplate; + +public class TestFlinkCatalogDatabase extends CatalogTestBase { + + @AfterEach + @Override + public void clean() { + sql("DROP TABLE IF EXISTS %s.tl", flinkDatabase); + dropDatabase(flinkDatabase, true); + super.clean(); + } + + @TestTemplate + public void testCreateNamespace() { + assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) + .as("Database should not already exist") + .isFalse(); + + sql("CREATE DATABASE %s", flinkDatabase); + assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) + .as("Database should exist") + .isTrue(); + + sql("CREATE DATABASE IF NOT EXISTS %s", flinkDatabase); + assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) + .as("Database should still exist") + .isTrue(); + + dropDatabase(flinkDatabase, true); + assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) + .as("Database should be dropped") + .isFalse(); + + sql("CREATE DATABASE IF NOT EXISTS %s", flinkDatabase); + assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) + .as("Database should be created") + .isTrue(); + } + + @TestTemplate + public void testDropEmptyDatabase() { + assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) + .as("Namespace should not already exist") + .isFalse(); + sql("CREATE DATABASE %s", flinkDatabase); + assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) + .as("Namespace should exist") + .isTrue(); + dropDatabase(flinkDatabase, true); + assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) + .as("Namespace should have been dropped") + .isFalse(); + } + + @TestTemplate + public void testDropNonEmptyNamespace() { + assumeThat(isHadoopCatalog) + .as("Hadoop catalog throws IOException: Directory is not empty.") + .isFalse(); + assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) + .as("Namespace should not already exist") + .isFalse(); + sql("CREATE DATABASE %s", flinkDatabase); + validationCatalog.createTable( + TableIdentifier.of(icebergNamespace, "tl"), + new Schema(Types.NestedField.optional(0, "id", Types.LongType.get()))); + assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) + .as("Namespace should exist") + .isTrue(); + assertThat(validationCatalog.tableExists(TableIdentifier.of(icebergNamespace, "tl"))) + .as("Table should exist") + .isTrue(); + assertThatThrownBy(() -> dropDatabase(flinkDatabase, true)) + .cause() + .isInstanceOf(DatabaseNotEmptyException.class) + .hasMessage( + String.format("Database %s in catalog %s is not empty.", DATABASE, catalogName)); + sql("DROP TABLE %s.tl", flinkDatabase); + } + + @TestTemplate + public void testListTables() { + assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) + .as("Namespace should not already exist") + .isFalse(); 
+ sql("CREATE DATABASE %s", flinkDatabase); + sql("USE CATALOG %s", catalogName); + sql("USE %s", DATABASE); + assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) + .as("Namespace should exist") + .isTrue(); + assertThat(sql("SHOW TABLES")).isEmpty(); + validationCatalog.createTable( + TableIdentifier.of(icebergNamespace, "tl"), + new Schema(Types.NestedField.optional(0, "id", Types.LongType.get()))); + + List tables = sql("SHOW TABLES"); + assertThat(tables).hasSize(1); + assertThat("tl").as("Table name should match").isEqualTo(tables.get(0).getField(0)); + } + + @TestTemplate + public void testListNamespace() { + assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) + .as("Namespace should not already exist") + .isFalse(); + sql("CREATE DATABASE %s", flinkDatabase); + sql("USE CATALOG %s", catalogName); + assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) + .as("Namespace should exist") + .isTrue(); + + List databases = sql("SHOW DATABASES"); + + if (isHadoopCatalog) { + assertThat(databases).hasSize(1); + assertThat(databases.get(0).getField(0)).as("Should have db database").isEqualTo("db"); + if (!baseNamespace.isEmpty()) { + // test namespace not belongs to this catalog + validationNamespaceCatalog.createNamespace( + Namespace.of(baseNamespace.level(0), "UNKNOWN_NAMESPACE")); + databases = sql("SHOW DATABASES"); + assertThat(databases).hasSize(1); + assertThat(databases.get(0).getField(0)).as("Should have db database").isEqualTo("db"); + } + } else { + // If there are multiple classes extends FlinkTestBase, TestHiveMetastore may loose the + // creation for default + // database. See HiveMetaStore.HMSHandler.init. + assertThat(databases) + .as("Should have db database") + .anyMatch(d -> Objects.equals(d.getField(0), "db")); + } + } + + @TestTemplate + public void testCreateNamespaceWithMetadata() { + assumeThat(isHadoopCatalog).as("HadoopCatalog does not support namespace metadata").isFalse(); + assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) + .as("Namespace should not already exist") + .isFalse(); + sql("CREATE DATABASE %s WITH ('prop'='value')", flinkDatabase); + assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) + .as("Namespace should exist") + .isTrue(); + Map nsMetadata = + validationNamespaceCatalog.loadNamespaceMetadata(icebergNamespace); + assertThat(nsMetadata).containsEntry("prop", "value"); + } + + @TestTemplate + public void testCreateNamespaceWithComment() { + assumeThat(isHadoopCatalog).as("HadoopCatalog does not support namespace metadata").isFalse(); + assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) + .as("Namespace should not already exist") + .isFalse(); + + sql("CREATE DATABASE %s COMMENT 'namespace doc'", flinkDatabase); + assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) + .as("Namespace should exist") + .isTrue(); + Map nsMetadata = + validationNamespaceCatalog.loadNamespaceMetadata(icebergNamespace); + assertThat(nsMetadata).containsEntry("comment", "namespace doc"); + } + + @TestTemplate + public void testCreateNamespaceWithLocation() throws Exception { + assumeThat(isHadoopCatalog).as("HadoopCatalog does not support namespace metadata").isFalse(); + assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) + .as("Namespace should not already exist") + .isFalse(); + + Path location = temporaryDirectory.getRoot(); + sql("CREATE DATABASE %s WITH ('location'='%s')", flinkDatabase, location); + 
assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) + .as("Namespace should exist") + .isTrue(); + Map nsMetadata = + validationNamespaceCatalog.loadNamespaceMetadata(icebergNamespace); + assertThat(nsMetadata).containsEntry("location", "file:" + location.getRoot()); + } + + @TestTemplate + public void testSetProperties() { + assumeThat(isHadoopCatalog).as("HadoopCatalog does not support namespace metadata").isFalse(); + assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) + .as("Namespace should not already exist") + .isFalse(); + + sql("CREATE DATABASE %s", flinkDatabase); + assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) + .as("Namespace should exist") + .isTrue(); + + Map defaultMetadata = + validationNamespaceCatalog.loadNamespaceMetadata(icebergNamespace); + assertThat(defaultMetadata).doesNotContainKey("prop"); + sql("ALTER DATABASE %s SET ('prop'='value')", flinkDatabase); + Map nsMetadata = + validationNamespaceCatalog.loadNamespaceMetadata(icebergNamespace); + assertThat(nsMetadata).containsEntry("prop", "value"); + } + + @TestTemplate + public void testHadoopNotSupportMeta() { + assumeThat(isHadoopCatalog).as("HadoopCatalog does not support namespace metadata").isTrue(); + assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) + .as("Namespace should not already exist") + .isFalse(); + assertThatThrownBy(() -> sql("CREATE DATABASE %s WITH ('prop'='value')", flinkDatabase)) + .cause() + .isInstanceOf(UnsupportedOperationException.class) + .hasMessage( + String.format( + "Cannot create namespace %s: metadata is not supported", icebergNamespace)); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogFactory.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogFactory.java new file mode 100644 index 000000000000..4c9e95b8fa82 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogFactory.java @@ -0,0 +1,119 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.util.Map; +import org.apache.hadoop.conf.Configuration; +import org.apache.iceberg.CatalogProperties; +import org.apache.iceberg.catalog.Catalog; +import org.apache.iceberg.hadoop.HadoopCatalog; +import org.apache.iceberg.hive.HiveCatalog; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +public class TestFlinkCatalogFactory { + + private Map props; + + @BeforeEach + public void before() { + props = Maps.newHashMap(); + props.put("type", "iceberg"); + props.put(CatalogProperties.WAREHOUSE_LOCATION, "/tmp/location"); + } + + @Test + public void testCreateCatalogHive() { + String catalogName = "hiveCatalog"; + props.put( + FlinkCatalogFactory.ICEBERG_CATALOG_TYPE, FlinkCatalogFactory.ICEBERG_CATALOG_TYPE_HIVE); + + Catalog catalog = + FlinkCatalogFactory.createCatalogLoader(catalogName, props, new Configuration()) + .loadCatalog(); + + assertThat(catalog).isNotNull().isInstanceOf(HiveCatalog.class); + } + + @Test + public void testCreateCatalogHadoop() { + String catalogName = "hadoopCatalog"; + props.put( + FlinkCatalogFactory.ICEBERG_CATALOG_TYPE, FlinkCatalogFactory.ICEBERG_CATALOG_TYPE_HADOOP); + + Catalog catalog = + FlinkCatalogFactory.createCatalogLoader(catalogName, props, new Configuration()) + .loadCatalog(); + + assertThat(catalog).isNotNull().isInstanceOf(HadoopCatalog.class); + } + + @Test + public void testCreateCatalogCustom() { + String catalogName = "customCatalog"; + props.put(CatalogProperties.CATALOG_IMPL, CustomHadoopCatalog.class.getName()); + + Catalog catalog = + FlinkCatalogFactory.createCatalogLoader(catalogName, props, new Configuration()) + .loadCatalog(); + + assertThat(catalog).isNotNull().isInstanceOf(CustomHadoopCatalog.class); + } + + @Test + public void testCreateCatalogCustomWithHiveCatalogTypeSet() { + String catalogName = "customCatalog"; + props.put(CatalogProperties.CATALOG_IMPL, CustomHadoopCatalog.class.getName()); + props.put( + FlinkCatalogFactory.ICEBERG_CATALOG_TYPE, FlinkCatalogFactory.ICEBERG_CATALOG_TYPE_HIVE); + + assertThatThrownBy( + () -> FlinkCatalogFactory.createCatalogLoader(catalogName, props, new Configuration())) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageStartingWith( + "Cannot create catalog customCatalog, both catalog-type and catalog-impl are set"); + } + + @Test + public void testLoadCatalogUnknown() { + String catalogName = "unknownCatalog"; + props.put(FlinkCatalogFactory.ICEBERG_CATALOG_TYPE, "fooType"); + + assertThatThrownBy( + () -> FlinkCatalogFactory.createCatalogLoader(catalogName, props, new Configuration())) + .isInstanceOf(UnsupportedOperationException.class) + .hasMessageStartingWith("Unknown catalog-type: fooType"); + } + + public static class CustomHadoopCatalog extends HadoopCatalog { + + public CustomHadoopCatalog() {} + + public CustomHadoopCatalog(Configuration conf, String warehouseLocation) { + setConf(conf); + initialize( + "custom", ImmutableMap.of(CatalogProperties.WAREHOUSE_LOCATION, warehouseLocation)); + } + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTable.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTable.java new file mode 100644 index 
000000000000..0b9c2193b4d5 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTable.java @@ -0,0 +1,669 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; +import static org.assertj.core.api.Assumptions.assumeThat; + +import java.util.Arrays; +import java.util.Collections; +import java.util.Map; +import java.util.Optional; +import java.util.Set; +import java.util.stream.Collectors; +import java.util.stream.StreamSupport; +import org.apache.flink.table.api.DataTypes; +import org.apache.flink.table.api.TableException; +import org.apache.flink.table.api.TableSchema; +import org.apache.flink.table.api.ValidationException; +import org.apache.flink.table.api.constraints.UniqueConstraint; +import org.apache.flink.table.catalog.CatalogTable; +import org.apache.flink.table.catalog.ObjectPath; +import org.apache.flink.table.catalog.exceptions.TableNotExistException; +import org.apache.iceberg.BaseTable; +import org.apache.iceberg.ContentFile; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.DataFiles; +import org.apache.iceberg.DataOperations; +import org.apache.iceberg.FileScanTask; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Snapshot; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableOperations; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.exceptions.NoSuchTableException; +import org.apache.iceberg.relocated.com.google.common.collect.Iterables; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.relocated.com.google.common.collect.Sets; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.TestTemplate; + +public class TestFlinkCatalogTable extends CatalogTestBase { + + @Override + @BeforeEach + public void before() { + super.before(); + sql("CREATE DATABASE %s", flinkDatabase); + sql("USE CATALOG %s", catalogName); + sql("USE %s", DATABASE); + } + + @AfterEach + public void cleanNamespaces() { + sql("DROP TABLE IF EXISTS %s.tl", flinkDatabase); + sql("DROP TABLE IF EXISTS %s.tl2", flinkDatabase); + dropDatabase(flinkDatabase, true); + super.clean(); + } + + @TestTemplate + public void testGetTable() { + sql("CREATE TABLE tl(id BIGINT, strV STRING)"); + + Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, "tl")); + Schema iSchema = + new Schema( + Types.NestedField.optional(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "strV", 
Types.StringType.get())); + assertThat(table.schema().toString()) + .as("Should load the expected iceberg schema") + .isEqualTo(iSchema.toString()); + } + + @TestTemplate + public void testRenameTable() { + assumeThat(isHadoopCatalog).as("HadoopCatalog does not support rename table").isFalse(); + final Schema tableSchema = + new Schema(Types.NestedField.optional(0, "id", Types.LongType.get())); + validationCatalog.createTable(TableIdentifier.of(icebergNamespace, "tl"), tableSchema); + sql("ALTER TABLE tl RENAME TO tl2"); + + assertThatThrownBy(() -> getTableEnv().from("tl")) + .isInstanceOf(ValidationException.class) + .hasMessage("Table `tl` was not found."); + + Schema actualSchema = FlinkSchemaUtil.convert(getTableEnv().from("tl2").getSchema()); + assertThat(tableSchema.asStruct()).isEqualTo(actualSchema.asStruct()); + } + + @TestTemplate + public void testCreateTable() throws TableNotExistException { + sql("CREATE TABLE tl(id BIGINT)"); + + Table table = table("tl"); + assertThat(table.schema().asStruct()) + .isEqualTo( + new Schema(Types.NestedField.optional(1, "id", Types.LongType.get())).asStruct()); + CatalogTable catalogTable = catalogTable("tl"); + assertThat(catalogTable.getSchema()) + .isEqualTo(TableSchema.builder().field("id", DataTypes.BIGINT()).build()); + } + + @TestTemplate + public void testCreateTableWithPrimaryKey() throws Exception { + sql("CREATE TABLE tl(id BIGINT, data STRING, key STRING PRIMARY KEY NOT ENFORCED)"); + + Table table = table("tl"); + assertThat(table.schema().identifierFieldIds()) + .as("Should have the expected row key.") + .isEqualTo(Sets.newHashSet(table.schema().findField("key").fieldId())); + CatalogTable catalogTable = catalogTable("tl"); + Optional uniqueConstraintOptional = catalogTable.getSchema().getPrimaryKey(); + assertThat(uniqueConstraintOptional).isPresent(); + assertThat(uniqueConstraintOptional.get().getColumns()).containsExactly("key"); + } + + @TestTemplate + public void testCreateTableWithMultiColumnsInPrimaryKey() throws Exception { + sql( + "CREATE TABLE tl(id BIGINT, data STRING, CONSTRAINT pk_constraint PRIMARY KEY(data, id) NOT ENFORCED)"); + + Table table = table("tl"); + assertThat(table.schema().identifierFieldIds()) + .as("Should have the expected RowKey") + .isEqualTo( + Sets.newHashSet( + table.schema().findField("id").fieldId(), + table.schema().findField("data").fieldId())); + CatalogTable catalogTable = catalogTable("tl"); + Optional uniqueConstraintOptional = catalogTable.getSchema().getPrimaryKey(); + assertThat(uniqueConstraintOptional).isPresent(); + assertThat(uniqueConstraintOptional.get().getColumns()).containsExactly("id", "data"); + } + + @TestTemplate + public void testCreateTableIfNotExists() { + sql("CREATE TABLE tl(id BIGINT)"); + + // Assert that table does exist. 
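For context on the two primary-key tests above: Flink's PRIMARY KEY ... NOT ENFORCED clause is represented in Iceberg as identifier fields on the schema, which is what the identifierFieldIds() assertions check. A small sketch using the Schema constructor that accepts identifier field ids; the field ids are illustrative.

import java.util.Arrays;
import java.util.Collections;
import org.apache.iceberg.Schema;
import org.apache.iceberg.types.Types;

public class IdentifierFieldSketch {
  public static void main(String[] args) {
    // Identifier fields must be required, matching the NOT NULL semantics of a Flink primary key.
    Schema schema =
        new Schema(
            Arrays.asList(
                Types.NestedField.optional(1, "id", Types.LongType.get()),
                Types.NestedField.optional(2, "data", Types.StringType.get()),
                Types.NestedField.required(3, "key", Types.StringType.get())),
            Collections.singleton(3));

    System.out.println(schema.identifierFieldIds());   // [3]
    System.out.println(schema.identifierFieldNames()); // [key]
  }
}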
+ assertThat(table("tl")).isNotNull(); + + sql("DROP TABLE tl"); + assertThatThrownBy(() -> table("tl")) + .isInstanceOf(NoSuchTableException.class) + .hasMessage("Table does not exist: " + getFullQualifiedTableName("tl")); + + sql("CREATE TABLE IF NOT EXISTS tl(id BIGINT)"); + assertThat(table("tl").properties()).doesNotContainKey("key"); + + table("tl").updateProperties().set("key", "value").commit(); + assertThat(table("tl").properties()).containsEntry("key", "value"); + + sql("CREATE TABLE IF NOT EXISTS tl(id BIGINT)"); + assertThat(table("tl").properties()).containsEntry("key", "value"); + } + + @TestTemplate + public void testCreateTableLike() throws TableNotExistException { + sql("CREATE TABLE tl(id BIGINT)"); + sql("CREATE TABLE tl2 LIKE tl"); + + Table table = table("tl2"); + assertThat(table.schema().asStruct()) + .isEqualTo( + new Schema(Types.NestedField.optional(1, "id", Types.LongType.get())).asStruct()); + CatalogTable catalogTable = catalogTable("tl2"); + assertThat(catalogTable.getSchema()) + .isEqualTo(TableSchema.builder().field("id", DataTypes.BIGINT()).build()); + } + + @TestTemplate + public void testCreateTableLocation() { + assumeThat(isHadoopCatalog) + .as("HadoopCatalog does not support creating table with location") + .isFalse(); + sql("CREATE TABLE tl(id BIGINT) WITH ('location'='file:///tmp/location')"); + + Table table = table("tl"); + assertThat(table.schema().asStruct()) + .isEqualTo( + new Schema(Types.NestedField.optional(1, "id", Types.LongType.get())).asStruct()); + assertThat(table.location()).isEqualTo("file:///tmp/location"); + } + + @TestTemplate + public void testCreatePartitionTable() throws TableNotExistException { + sql("CREATE TABLE tl(id BIGINT, dt STRING) PARTITIONED BY(dt)"); + + Table table = table("tl"); + assertThat(table.schema().asStruct()) + .isEqualTo( + new Schema( + Types.NestedField.optional(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "dt", Types.StringType.get())) + .asStruct()); + assertThat(table.spec()) + .isEqualTo(PartitionSpec.builderFor(table.schema()).identity("dt").build()); + CatalogTable catalogTable = catalogTable("tl"); + assertThat(catalogTable.getSchema()) + .isEqualTo( + TableSchema.builder() + .field("id", DataTypes.BIGINT()) + .field("dt", DataTypes.STRING()) + .build()); + assertThat(catalogTable.getPartitionKeys()).isEqualTo(Collections.singletonList("dt")); + } + + @TestTemplate + public void testCreateTableWithColumnComment() { + sql("CREATE TABLE tl(id BIGINT COMMENT 'comment - id', data STRING COMMENT 'comment - data')"); + + Table table = table("tl"); + assertThat(table.schema().asStruct()) + .isEqualTo( + new Schema( + Types.NestedField.optional(1, "id", Types.LongType.get(), "comment - id"), + Types.NestedField.optional(2, "data", Types.StringType.get(), "comment - data")) + .asStruct()); + } + + @TestTemplate + public void testCreateTableWithFormatV2ThroughTableProperty() throws Exception { + sql("CREATE TABLE tl(id BIGINT) WITH ('format-version'='2')"); + + Table table = table("tl"); + assertThat(((BaseTable) table).operations().current().formatVersion()).isEqualTo(2); + } + + @TestTemplate + public void testUpgradeTableWithFormatV2ThroughTableProperty() throws Exception { + sql("CREATE TABLE tl(id BIGINT) WITH ('format-version'='1')"); + + Table table = table("tl"); + TableOperations ops = ((BaseTable) table).operations(); + assertThat(ops.refresh().formatVersion()) + .as("should create table using format v1") + .isEqualTo(1); + sql("ALTER TABLE tl SET('format-version'='2')"); + 
assertThat(ops.refresh().formatVersion()) + .as("should update table to use format v2") + .isEqualTo(2); + } + + @TestTemplate + public void testDowngradeTableToFormatV1ThroughTablePropertyFails() throws Exception { + sql("CREATE TABLE tl(id BIGINT) WITH ('format-version'='2')"); + + Table table = table("tl"); + TableOperations ops = ((BaseTable) table).operations(); + assertThat(ops.refresh().formatVersion()) + .as("should create table using format v2") + .isEqualTo(2); + assertThatThrownBy(() -> sql("ALTER TABLE tl SET('format-version'='1')")) + .rootCause() + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Cannot downgrade v2 table to v1"); + } + + @TestTemplate + public void testLoadTransformPartitionTable() throws TableNotExistException { + Schema schema = new Schema(Types.NestedField.optional(0, "id", Types.LongType.get())); + validationCatalog.createTable( + TableIdentifier.of(icebergNamespace, "tl"), + schema, + PartitionSpec.builderFor(schema).bucket("id", 100).build()); + + CatalogTable catalogTable = catalogTable("tl"); + assertThat(catalogTable.getSchema()) + .isEqualTo(TableSchema.builder().field("id", DataTypes.BIGINT()).build()); + assertThat(catalogTable.getPartitionKeys()).isEmpty(); + } + + @TestTemplate + public void testAlterTableProperties() throws TableNotExistException { + sql("CREATE TABLE tl(id BIGINT) WITH ('oldK'='oldV')"); + Map properties = Maps.newHashMap(); + properties.put("oldK", "oldV"); + + // new + sql("ALTER TABLE tl SET('newK'='newV')"); + properties.put("newK", "newV"); + assertThat(table("tl").properties()).containsAllEntriesOf(properties); + + // update old + sql("ALTER TABLE tl SET('oldK'='oldV2')"); + properties.put("oldK", "oldV2"); + assertThat(table("tl").properties()).containsAllEntriesOf(properties); + + // remove property + sql("ALTER TABLE tl RESET('oldK')"); + properties.remove("oldK"); + assertThat(table("tl").properties()).containsAllEntriesOf(properties); + } + + @TestTemplate + public void testAlterTableAddColumn() { + sql("CREATE TABLE tl(id BIGINT)"); + Schema schemaBefore = table("tl").schema(); + assertThat(schemaBefore.asStruct()) + .isEqualTo( + new Schema(Types.NestedField.optional(1, "id", Types.LongType.get())).asStruct()); + sql("ALTER TABLE tl ADD (dt STRING)"); + Schema schemaAfter1 = table("tl").schema(); + assertThat(schemaAfter1.asStruct()) + .isEqualTo( + new Schema( + Types.NestedField.optional(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "dt", Types.StringType.get())) + .asStruct()); + // Add multiple columns + sql("ALTER TABLE tl ADD (col1 STRING COMMENT 'comment for col1', col2 BIGINT)"); + Schema schemaAfter2 = table("tl").schema(); + assertThat(schemaAfter2.asStruct()) + .isEqualTo( + new Schema( + Types.NestedField.optional(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "dt", Types.StringType.get()), + Types.NestedField.optional( + 3, "col1", Types.StringType.get(), "comment for col1"), + Types.NestedField.optional(4, "col2", Types.LongType.get())) + .asStruct()); + // Adding a required field should fail because Iceberg's SchemaUpdate does not allow + // incompatible changes. + assertThatThrownBy(() -> sql("ALTER TABLE tl ADD (pk STRING NOT NULL)")) + .hasRootCauseInstanceOf(IllegalArgumentException.class) + .hasRootCauseMessage("Incompatible change: cannot add required column: pk"); + + // Adding an existing field should fail due to Flink's internal validation. 
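The ALTER TABLE ... ADD statements above are ultimately applied through Iceberg's UpdateSchema API. A sketch of the equivalent direct calls, assuming an already-loaded Table handle named table (an assumption for illustration, not part of this patch):

import org.apache.iceberg.Table;
import org.apache.iceberg.types.Types;

class AddColumnSketch {
  static void addColumns(Table table) {
    // Equivalent of: ALTER TABLE tl ADD (dt STRING), then ADD (col1 ..., col2 BIGINT)
    table
        .updateSchema()
        .addColumn("dt", Types.StringType.get())
        .addColumn("col1", Types.StringType.get(), "comment for col1")
        .addColumn("col2", Types.LongType.get())
        .commit();

    // Adding a required column is only allowed together with allowIncompatibleChanges(), which is
    // why "ALTER TABLE tl ADD (pk STRING NOT NULL)" above fails with
    // "Incompatible change: cannot add required column: pk".
  }
}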
+ assertThatThrownBy(() -> sql("ALTER TABLE tl ADD (id STRING)")) + .isInstanceOf(ValidationException.class) + .hasMessageContaining("Try to add a column `id` which already exists in the table."); + } + + @TestTemplate + public void testAlterTableDropColumn() { + sql("CREATE TABLE tl(id BIGINT, dt STRING, col1 STRING, col2 BIGINT)"); + Schema schemaBefore = table("tl").schema(); + assertThat(schemaBefore.asStruct()) + .isEqualTo( + new Schema( + Types.NestedField.optional(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "dt", Types.StringType.get()), + Types.NestedField.optional(3, "col1", Types.StringType.get()), + Types.NestedField.optional(4, "col2", Types.LongType.get())) + .asStruct()); + sql("ALTER TABLE tl DROP (dt)"); + Schema schemaAfter1 = table("tl").schema(); + assertThat(schemaAfter1.asStruct()) + .isEqualTo( + new Schema( + Types.NestedField.optional(1, "id", Types.LongType.get()), + Types.NestedField.optional(3, "col1", Types.StringType.get()), + Types.NestedField.optional(4, "col2", Types.LongType.get())) + .asStruct()); + // Drop multiple columns + sql("ALTER TABLE tl DROP (col1, col2)"); + Schema schemaAfter2 = table("tl").schema(); + assertThat(schemaAfter2.asStruct()) + .isEqualTo( + new Schema(Types.NestedField.optional(1, "id", Types.LongType.get())).asStruct()); + // Dropping an non-existing field should fail due to Flink's internal validation. + assertThatThrownBy(() -> sql("ALTER TABLE tl DROP (foo)")) + .isInstanceOf(ValidationException.class) + .hasMessageContaining("The column `foo` does not exist in the base table."); + + // Dropping an already-deleted field should fail due to Flink's internal validation. + assertThatThrownBy(() -> sql("ALTER TABLE tl DROP (dt)")) + .isInstanceOf(ValidationException.class) + .hasMessageContaining("The column `dt` does not exist in the base table."); + } + + @TestTemplate + public void testAlterTableModifyColumnName() { + sql("CREATE TABLE tl(id BIGINT, dt STRING)"); + Schema schemaBefore = table("tl").schema(); + assertThat(schemaBefore.asStruct()) + .isEqualTo( + new Schema( + Types.NestedField.optional(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "dt", Types.StringType.get())) + .asStruct()); + sql("ALTER TABLE tl RENAME dt TO data"); + Schema schemaAfter = table("tl").schema(); + assertThat(schemaAfter.asStruct()) + .isEqualTo( + new Schema( + Types.NestedField.optional(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get())) + .asStruct()); + } + + @TestTemplate + public void testAlterTableModifyColumnType() { + sql("CREATE TABLE tl(id INTEGER, dt STRING)"); + Schema schemaBefore = table("tl").schema(); + assertThat(schemaBefore.asStruct()) + .isEqualTo( + new Schema( + Types.NestedField.optional(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "dt", Types.StringType.get())) + .asStruct()); + // Promote type from Integer to Long + sql("ALTER TABLE tl MODIFY (id BIGINT)"); + Schema schemaAfter = table("tl").schema(); + assertThat(schemaAfter.asStruct()) + .isEqualTo( + new Schema( + Types.NestedField.optional(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "dt", Types.StringType.get())) + .asStruct()); + // Type change that doesn't follow the type-promotion rule should fail due to Iceberg's + // validation. 
+ assertThatThrownBy(() -> sql("ALTER TABLE tl MODIFY (dt INTEGER)")) + .isInstanceOf(TableException.class) + .hasRootCauseInstanceOf(IllegalArgumentException.class) + .hasRootCauseMessage("Cannot change column type: dt: string -> int"); + } + + @TestTemplate + public void testAlterTableModifyColumnNullability() { + sql("CREATE TABLE tl(id INTEGER NOT NULL, dt STRING)"); + Schema schemaBefore = table("tl").schema(); + assertThat(schemaBefore.asStruct()) + .isEqualTo( + new Schema( + Types.NestedField.required(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "dt", Types.StringType.get())) + .asStruct()); + // Changing nullability from optional to required should fail + // because Iceberg's SchemaUpdate does not allow incompatible changes. + assertThatThrownBy(() -> sql("ALTER TABLE tl MODIFY (dt STRING NOT NULL)")) + .isInstanceOf(TableException.class) + .hasRootCauseInstanceOf(IllegalArgumentException.class) + .hasRootCauseMessage("Cannot change column nullability: dt: optional -> required"); + + // Set nullability from required to optional + sql("ALTER TABLE tl MODIFY (id INTEGER)"); + Schema schemaAfter = table("tl").schema(); + assertThat(schemaAfter.asStruct()) + .isEqualTo( + new Schema( + Types.NestedField.optional(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "dt", Types.StringType.get())) + .asStruct()); + } + + @TestTemplate + public void testAlterTableModifyColumnPosition() { + sql("CREATE TABLE tl(id BIGINT, dt STRING)"); + Schema schemaBefore = table("tl").schema(); + assertThat(schemaBefore.asStruct()) + .isEqualTo( + new Schema( + Types.NestedField.optional(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "dt", Types.StringType.get())) + .asStruct()); + + sql("ALTER TABLE tl MODIFY (dt STRING FIRST)"); + Schema schemaAfter = table("tl").schema(); + assertThat(schemaAfter.asStruct()) + .isEqualTo( + new Schema( + Types.NestedField.optional(2, "dt", Types.StringType.get()), + Types.NestedField.optional(1, "id", Types.LongType.get())) + .asStruct()); + + sql("ALTER TABLE tl MODIFY (dt STRING AFTER id)"); + Schema schemaAfterAfter = table("tl").schema(); + assertThat(schemaAfterAfter.asStruct()) + .isEqualTo( + new Schema( + Types.NestedField.optional(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "dt", Types.StringType.get())) + .asStruct()); + // Modifying the position of a non-existing column should fail due to Flink's internal + // validation. + assertThatThrownBy(() -> sql("ALTER TABLE tl MODIFY (non_existing STRING FIRST)")) + .isInstanceOf(ValidationException.class) + .hasMessageContaining( + "Try to modify a column `non_existing` which does not exist in the table."); + + // Moving a column after a non-existing column should fail due to Flink's internal validation. 
+ assertThatThrownBy(() -> sql("ALTER TABLE tl MODIFY (dt STRING AFTER non_existing)")) + .isInstanceOf(ValidationException.class) + .hasMessageContaining( + "Referenced column `non_existing` by 'AFTER' does not exist in the table."); + } + + @TestTemplate + public void testAlterTableModifyColumnComment() { + sql("CREATE TABLE tl(id BIGINT, dt STRING)"); + Schema schemaBefore = table("tl").schema(); + assertThat(schemaBefore.asStruct()) + .isEqualTo( + new Schema( + Types.NestedField.optional(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "dt", Types.StringType.get())) + .asStruct()); + + sql("ALTER TABLE tl MODIFY (dt STRING COMMENT 'comment for dt field')"); + Schema schemaAfter = table("tl").schema(); + assertThat(schemaAfter.asStruct()) + .isEqualTo( + new Schema( + Types.NestedField.optional(1, "id", Types.LongType.get()), + Types.NestedField.optional( + 2, "dt", Types.StringType.get(), "comment for dt field")) + .asStruct()); + } + + @TestTemplate + public void testAlterTableConstraint() { + sql("CREATE TABLE tl(id BIGINT NOT NULL, dt STRING NOT NULL, col1 STRING)"); + Schema schemaBefore = table("tl").schema(); + assertThat(schemaBefore.asStruct()) + .isEqualTo( + new Schema( + Types.NestedField.required(1, "id", Types.LongType.get()), + Types.NestedField.required(2, "dt", Types.StringType.get()), + Types.NestedField.optional(3, "col1", Types.StringType.get())) + .asStruct()); + assertThat(schemaBefore.identifierFieldNames()).isEmpty(); + sql("ALTER TABLE tl ADD (PRIMARY KEY (id) NOT ENFORCED)"); + Schema schemaAfterAdd = table("tl").schema(); + assertThat(schemaAfterAdd.identifierFieldNames()).containsExactly("id"); + sql("ALTER TABLE tl MODIFY (PRIMARY KEY (dt) NOT ENFORCED)"); + Schema schemaAfterModify = table("tl").schema(); + assertThat(schemaAfterModify.asStruct()) + .isEqualTo( + new Schema( + Types.NestedField.required(1, "id", Types.LongType.get()), + Types.NestedField.required(2, "dt", Types.StringType.get()), + Types.NestedField.optional(3, "col1", Types.StringType.get())) + .asStruct()); + assertThat(schemaAfterModify.identifierFieldNames()).containsExactly("dt"); + // Composite primary key + sql("ALTER TABLE tl MODIFY (PRIMARY KEY (id, dt) NOT ENFORCED)"); + Schema schemaAfterComposite = table("tl").schema(); + assertThat(schemaAfterComposite.asStruct()) + .isEqualTo( + new Schema( + Types.NestedField.required(1, "id", Types.LongType.get()), + Types.NestedField.required(2, "dt", Types.StringType.get()), + Types.NestedField.optional(3, "col1", Types.StringType.get())) + .asStruct()); + assertThat(schemaAfterComposite.identifierFieldNames()).containsExactlyInAnyOrder("id", "dt"); + // Setting an optional field as primary key should fail + // because Iceberg's SchemaUpdate does not allow incompatible changes. + assertThatThrownBy(() -> sql("ALTER TABLE tl MODIFY (PRIMARY KEY (col1) NOT ENFORCED)")) + .isInstanceOf(TableException.class) + .hasRootCauseInstanceOf(IllegalArgumentException.class) + .hasRootCauseMessage("Cannot add field col1 as an identifier field: not a required field"); + + // Setting a composite key containing an optional field should fail + // because Iceberg's SchemaUpdate does not allow incompatible changes. 
+ assertThatThrownBy(() -> sql("ALTER TABLE tl MODIFY (PRIMARY KEY (id, col1) NOT ENFORCED)")) + .isInstanceOf(TableException.class) + .hasRootCauseInstanceOf(IllegalArgumentException.class) + .hasRootCauseMessage("Cannot add field col1 as an identifier field: not a required field"); + + // Dropping constraints is not supported yet + assertThatThrownBy(() -> sql("ALTER TABLE tl DROP PRIMARY KEY")) + .isInstanceOf(TableException.class) + .hasRootCauseInstanceOf(UnsupportedOperationException.class) + .hasRootCauseMessage("Unsupported table change: DropConstraint."); + } + + @TestTemplate + public void testRelocateTable() { + assumeThat(isHadoopCatalog).as("HadoopCatalog does not support relocate table").isFalse(); + sql("CREATE TABLE tl(id BIGINT)"); + sql("ALTER TABLE tl SET('location'='file:///tmp/location')"); + assertThat(table("tl").location()).isEqualTo("file:///tmp/location"); + } + + @TestTemplate + public void testSetCurrentAndCherryPickSnapshotId() { + sql("CREATE TABLE tl(c1 INT, c2 STRING, c3 STRING) PARTITIONED BY (c1)"); + + Table table = table("tl"); + + DataFile fileA = + DataFiles.builder(table.spec()) + .withPath("/path/to/data-a.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("c1=0") // easy way to set partition data for now + .withRecordCount(1) + .build(); + DataFile fileB = + DataFiles.builder(table.spec()) + .withPath("/path/to/data-b.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("c1=1") // easy way to set partition data for now + .withRecordCount(1) + .build(); + DataFile replacementFile = + DataFiles.builder(table.spec()) + .withPath("/path/to/data-a-replacement.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("c1=0") // easy way to set partition data for now + .withRecordCount(1) + .build(); + + table.newAppend().appendFile(fileA).commit(); + long snapshotId = table.currentSnapshot().snapshotId(); + + // stage an overwrite that replaces FILE_A + table.newReplacePartitions().addFile(replacementFile).stageOnly().commit(); + + Snapshot staged = Iterables.getLast(table.snapshots()); + assertThat(staged.operation()) + .as("Should find the staged overwrite snapshot") + .isEqualTo(DataOperations.OVERWRITE); + // add another append so that the original commit can't be fast-forwarded + table.newAppend().appendFile(fileB).commit(); + + // test cherry pick + sql("ALTER TABLE tl SET('cherry-pick-snapshot-id'='%s')", staged.snapshotId()); + validateTableFiles(table, fileB, replacementFile); + + // test set current snapshot + sql("ALTER TABLE tl SET('current-snapshot-id'='%s')", snapshotId); + validateTableFiles(table, fileA); + } + + private void validateTableFiles(Table tbl, DataFile... 
expectedFiles) { + tbl.refresh(); + Set expectedFilePaths = + Arrays.stream(expectedFiles).map(DataFile::path).collect(Collectors.toSet()); + Set actualFilePaths = + StreamSupport.stream(tbl.newScan().planFiles().spliterator(), false) + .map(FileScanTask::file) + .map(ContentFile::path) + .collect(Collectors.toSet()); + assertThat(actualFilePaths).as("Files should match").isEqualTo(expectedFilePaths); + } + + private Table table(String name) { + return validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, name)); + } + + private CatalogTable catalogTable(String name) throws TableNotExistException { + return (CatalogTable) + getTableEnv() + .getCatalog(getTableEnv().getCurrentCatalog()) + .get() + .getTable(new ObjectPath(DATABASE, name)); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTablePartitions.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTablePartitions.java new file mode 100644 index 000000000000..e69e1ac4d713 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTablePartitions.java @@ -0,0 +1,119 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.util.List; +import org.apache.flink.table.catalog.CatalogPartitionSpec; +import org.apache.flink.table.catalog.ObjectPath; +import org.apache.flink.table.catalog.exceptions.TableNotExistException; +import org.apache.flink.table.catalog.exceptions.TableNotPartitionedException; +import org.apache.iceberg.CatalogProperties; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Parameter; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.catalog.Namespace; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.TestTemplate; + +public class TestFlinkCatalogTablePartitions extends CatalogTestBase { + + private final String tableName = "test_table"; + + @Parameter(index = 2) + private FileFormat format; + + @Parameter(index = 3) + private Boolean cacheEnabled; + + @Parameters(name = "catalogName={0}, baseNamespace={1}, format={2}, cacheEnabled={3}") + protected static List parameters() { + List parameters = Lists.newArrayList(); + for (FileFormat format : + new FileFormat[] {FileFormat.ORC, FileFormat.AVRO, FileFormat.PARQUET}) { + for (Boolean cacheEnabled : new Boolean[] {true, false}) { + for (Object[] catalogParams : CatalogTestBase.parameters()) { + String catalogName = (String) catalogParams[0]; + Namespace baseNamespace = (Namespace) catalogParams[1]; + parameters.add(new Object[] {catalogName, baseNamespace, format, cacheEnabled}); + } + } + } + return parameters; + } + + @Override + @BeforeEach + public void before() { + super.before(); + config.put(CatalogProperties.CACHE_ENABLED, String.valueOf(cacheEnabled)); + sql("CREATE DATABASE %s", flinkDatabase); + sql("USE CATALOG %s", catalogName); + sql("USE %s", DATABASE); + } + + @AfterEach + public void cleanNamespaces() { + sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, tableName); + dropDatabase(flinkDatabase, true); + super.clean(); + } + + @TestTemplate + public void testListPartitionsWithUnpartitionedTable() { + sql( + "CREATE TABLE %s (id INT, data VARCHAR) with ('write.format.default'='%s')", + tableName, format.name()); + sql("INSERT INTO %s SELECT 1,'a'", tableName); + + ObjectPath objectPath = new ObjectPath(DATABASE, tableName); + FlinkCatalog flinkCatalog = (FlinkCatalog) getTableEnv().getCatalog(catalogName).get(); + assertThatThrownBy(() -> flinkCatalog.listPartitions(objectPath)) + .isInstanceOf(TableNotPartitionedException.class) + .hasMessageStartingWith("Table db.test_table in catalog") + .hasMessageEndingWith("is not partitioned."); + } + + @TestTemplate + public void testListPartitionsWithPartitionedTable() + throws TableNotExistException, TableNotPartitionedException { + sql( + "CREATE TABLE %s (id INT, data VARCHAR) PARTITIONED BY (data) " + + "with ('write.format.default'='%s')", + tableName, format.name()); + sql("INSERT INTO %s SELECT 1,'a'", tableName); + sql("INSERT INTO %s SELECT 2,'b'", tableName); + + ObjectPath objectPath = new ObjectPath(DATABASE, tableName); + FlinkCatalog flinkCatalog = (FlinkCatalog) getTableEnv().getCatalog(catalogName).get(); + List list = flinkCatalog.listPartitions(objectPath); + assertThat(list).hasSize(2); + List expected = Lists.newArrayList(); + CatalogPartitionSpec 
partitionSpec1 = new CatalogPartitionSpec(ImmutableMap.of("data", "a")); + CatalogPartitionSpec partitionSpec2 = new CatalogPartitionSpec(ImmutableMap.of("data", "b")); + expected.add(partitionSpec1); + expected.add(partitionSpec2); + assertThat(list).as("Should produce the expected catalog partition specs.").isEqualTo(expected); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkConfParser.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkConfParser.java new file mode 100644 index 000000000000..4b6ac25ab8e3 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkConfParser.java @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +import java.time.Duration; +import java.util.Map; +import org.apache.flink.configuration.ConfigOption; +import org.apache.flink.configuration.ConfigOptions; +import org.apache.flink.configuration.Configuration; +import org.apache.iceberg.Table; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.junit.jupiter.api.Test; + +public class TestFlinkConfParser { + + @Test + public void testDurationConf() { + Map writeOptions = ImmutableMap.of("write-prop", "111s"); + + ConfigOption configOption = + ConfigOptions.key("conf-prop").durationType().noDefaultValue(); + Configuration flinkConf = new Configuration(); + flinkConf.setString(configOption.key(), "222s"); + + Table table = mock(Table.class); + when(table.properties()).thenReturn(ImmutableMap.of("table-prop", "333s")); + + FlinkConfParser confParser = new FlinkConfParser(table, writeOptions, flinkConf); + Duration defaultVal = Duration.ofMillis(999); + + Duration result = + confParser.durationConf().option("write-prop").defaultValue(defaultVal).parse(); + assertThat(result).isEqualTo(Duration.ofSeconds(111)); + + result = confParser.durationConf().flinkConfig(configOption).defaultValue(defaultVal).parse(); + assertThat(result).isEqualTo(Duration.ofSeconds(222)); + + result = confParser.durationConf().tableProperty("table-prop").defaultValue(defaultVal).parse(); + assertThat(result).isEqualTo(Duration.ofSeconds(333)); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkFilters.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkFilters.java new file mode 100644 index 000000000000..838b0ea0e1a9 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkFilters.java @@ -0,0 +1,462 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more 
contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.math.BigDecimal; +import java.nio.ByteBuffer; +import java.time.Instant; +import java.time.LocalDate; +import java.time.LocalDateTime; +import java.time.LocalTime; +import java.util.List; +import java.util.Optional; +import java.util.stream.Collectors; +import org.apache.flink.table.api.DataTypes; +import org.apache.flink.table.api.Expressions; +import org.apache.flink.table.api.TableColumn; +import org.apache.flink.table.api.TableSchema; +import org.apache.flink.table.expressions.ApiExpressionUtils; +import org.apache.flink.table.expressions.CallExpression; +import org.apache.flink.table.expressions.Expression; +import org.apache.flink.table.expressions.FieldReferenceExpression; +import org.apache.flink.table.expressions.ResolvedExpression; +import org.apache.flink.table.expressions.UnresolvedCallExpression; +import org.apache.flink.table.expressions.UnresolvedReferenceExpression; +import org.apache.flink.table.expressions.ValueLiteralExpression; +import org.apache.flink.table.expressions.utils.ApiExpressionDefaultVisitor; +import org.apache.flink.table.functions.BuiltInFunctionDefinitions; +import org.apache.iceberg.expressions.And; +import org.apache.iceberg.expressions.BoundLiteralPredicate; +import org.apache.iceberg.expressions.Not; +import org.apache.iceberg.expressions.Or; +import org.apache.iceberg.expressions.UnboundPredicate; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.util.DateTimeUtil; +import org.apache.iceberg.util.Pair; +import org.junit.jupiter.api.Test; + +public class TestFlinkFilters { + + private static final TableSchema TABLE_SCHEMA = + TableSchema.builder() + .field("field1", DataTypes.INT()) + .field("field2", DataTypes.BIGINT()) + .field("field3", DataTypes.FLOAT()) + .field("field4", DataTypes.DOUBLE()) + .field("field5", DataTypes.STRING()) + .field("field6", DataTypes.BOOLEAN()) + .field("field7", DataTypes.BINARY(2)) + .field("field8", DataTypes.DECIMAL(10, 2)) + .field("field9", DataTypes.DATE()) + .field("field10", DataTypes.TIME()) + .field("field11", DataTypes.TIMESTAMP()) + .field("field12", DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE()) + .build(); + + // A map list of fields and values used to verify the conversion of flink expression to iceberg + // expression + private static final List> FIELD_VALUE_LIST = + ImmutableList.of( + Pair.of("field1", 1), + Pair.of("field2", 2L), + Pair.of("field3", 3F), + Pair.of("field4", 4D), + Pair.of("field5", "iceberg"), + Pair.of("field6", true), + Pair.of("field7", new byte[] {'a', 'b'}), + Pair.of("field8", BigDecimal.valueOf(10.12)), + Pair.of("field9", DateTimeUtil.daysFromDate(LocalDate.now())), + Pair.of("field10", 
DateTimeUtil.microsFromTime(LocalTime.now())), + Pair.of("field11", DateTimeUtil.microsFromTimestamp(LocalDateTime.now())), + Pair.of("field12", DateTimeUtil.microsFromInstant(Instant.now()))); + + @Test + public void testFlinkDataTypeEqual() { + matchLiteral("field1", 1, 1); + matchLiteral("field2", 10L, 10L); + matchLiteral("field3", 1.2F, 1.2F); + matchLiteral("field4", 3.4D, 3.4D); + matchLiteral("field5", "abcd", "abcd"); + matchLiteral("field6", true, true); + matchLiteral("field7", new byte[] {'a', 'b'}, ByteBuffer.wrap(new byte[] {'a', 'b'})); + matchLiteral("field8", BigDecimal.valueOf(10.12), BigDecimal.valueOf(10.12)); + + LocalDate date = LocalDate.parse("2020-12-23"); + matchLiteral("field9", date, DateTimeUtil.daysFromDate(date)); + + LocalTime time = LocalTime.parse("12:13:14"); + matchLiteral("field10", time, DateTimeUtil.microsFromTime(time)); + + LocalDateTime dateTime = LocalDateTime.parse("2020-12-23T12:13:14"); + matchLiteral("field11", dateTime, DateTimeUtil.microsFromTimestamp(dateTime)); + + Instant instant = Instant.parse("2020-12-23T12:13:14.00Z"); + matchLiteral("field12", instant, DateTimeUtil.microsFromInstant(instant)); + } + + @Test + public void testEquals() { + for (Pair pair : FIELD_VALUE_LIST) { + UnboundPredicate expected = + org.apache.iceberg.expressions.Expressions.equal(pair.first(), pair.second()); + + Optional actual = + FlinkFilters.convert( + resolve(Expressions.$(pair.first()).isEqual(Expressions.lit(pair.second())))); + assertThat(actual).isPresent(); + assertPredicatesMatch(expected, actual.get()); + + Optional actual1 = + FlinkFilters.convert( + resolve(Expressions.lit(pair.second()).isEqual(Expressions.$(pair.first())))); + assertThat(actual1).isPresent(); + assertPredicatesMatch(expected, actual1.get()); + } + } + + @Test + public void testEqualsNaN() { + UnboundPredicate expected = org.apache.iceberg.expressions.Expressions.isNaN("field3"); + + Optional actual = + FlinkFilters.convert(resolve(Expressions.$("field3").isEqual(Expressions.lit(Float.NaN)))); + assertThat(actual).isPresent(); + assertPredicatesMatch(expected, actual.get()); + + Optional actual1 = + FlinkFilters.convert(resolve(Expressions.lit(Float.NaN).isEqual(Expressions.$("field3")))); + assertThat(actual1).isPresent(); + assertPredicatesMatch(expected, actual1.get()); + } + + @Test + public void testNotEquals() { + for (Pair pair : FIELD_VALUE_LIST) { + UnboundPredicate expected = + org.apache.iceberg.expressions.Expressions.notEqual(pair.first(), pair.second()); + + Optional actual = + FlinkFilters.convert( + resolve(Expressions.$(pair.first()).isNotEqual(Expressions.lit(pair.second())))); + assertThat(actual).isPresent(); + assertPredicatesMatch(expected, actual.get()); + + Optional actual1 = + FlinkFilters.convert( + resolve(Expressions.lit(pair.second()).isNotEqual(Expressions.$(pair.first())))); + assertThat(actual1).isPresent(); + assertPredicatesMatch(expected, actual1.get()); + } + } + + @Test + public void testNotEqualsNaN() { + UnboundPredicate expected = org.apache.iceberg.expressions.Expressions.notNaN("field3"); + + Optional actual = + FlinkFilters.convert( + resolve(Expressions.$("field3").isNotEqual(Expressions.lit(Float.NaN)))); + assertThat(actual).isPresent(); + assertPredicatesMatch(expected, actual.get()); + + Optional actual1 = + FlinkFilters.convert( + resolve(Expressions.lit(Float.NaN).isNotEqual(Expressions.$("field3")))); + assertThat(actual1).isPresent(); + assertPredicatesMatch(expected, actual1.get()); + } + + @Test + public void 
testGreaterThan() { + UnboundPredicate expected = + org.apache.iceberg.expressions.Expressions.greaterThan("field1", 1); + + Optional actual = + FlinkFilters.convert(resolve(Expressions.$("field1").isGreater(Expressions.lit(1)))); + assertThat(actual).isPresent(); + assertPredicatesMatch(expected, actual.get()); + + Optional actual1 = + FlinkFilters.convert(resolve(Expressions.lit(1).isLess(Expressions.$("field1")))); + assertThat(actual1).isPresent(); + assertPredicatesMatch(expected, actual1.get()); + } + + @Test + public void testGreaterThanEquals() { + UnboundPredicate expected = + org.apache.iceberg.expressions.Expressions.greaterThanOrEqual("field1", 1); + + Optional actual = + FlinkFilters.convert(resolve(Expressions.$("field1").isGreaterOrEqual(Expressions.lit(1)))); + assertThat(actual).isPresent(); + assertPredicatesMatch(expected, actual.get()); + + Optional actual1 = + FlinkFilters.convert(resolve(Expressions.lit(1).isLessOrEqual(Expressions.$("field1")))); + assertThat(actual1).isPresent(); + assertPredicatesMatch(expected, actual1.get()); + } + + @Test + public void testLessThan() { + UnboundPredicate expected = + org.apache.iceberg.expressions.Expressions.lessThan("field1", 1); + + Optional actual = + FlinkFilters.convert(resolve(Expressions.$("field1").isLess(Expressions.lit(1)))); + assertThat(actual).isPresent(); + assertPredicatesMatch(expected, actual.get()); + + Optional actual1 = + FlinkFilters.convert(resolve(Expressions.lit(1).isGreater(Expressions.$("field1")))); + assertThat(actual1).isPresent(); + assertPredicatesMatch(expected, actual1.get()); + } + + @Test + public void testLessThanEquals() { + UnboundPredicate expected = + org.apache.iceberg.expressions.Expressions.lessThanOrEqual("field1", 1); + + Optional actual = + FlinkFilters.convert(resolve(Expressions.$("field1").isLessOrEqual(Expressions.lit(1)))); + assertThat(actual).isPresent(); + assertPredicatesMatch(expected, actual.get()); + + Optional actual1 = + FlinkFilters.convert(resolve(Expressions.lit(1).isGreaterOrEqual(Expressions.$("field1")))); + assertThat(actual1).isPresent(); + assertPredicatesMatch(expected, actual1.get()); + } + + @Test + public void testIsNull() { + Expression expr = resolve(Expressions.$("field1").isNull()); + Optional actual = FlinkFilters.convert(expr); + assertThat(actual).isPresent(); + UnboundPredicate expected = org.apache.iceberg.expressions.Expressions.isNull("field1"); + assertPredicatesMatch(expected, actual.get()); + } + + @Test + public void testIsNotNull() { + Expression expr = resolve(Expressions.$("field1").isNotNull()); + Optional actual = FlinkFilters.convert(expr); + assertThat(actual).isPresent(); + UnboundPredicate expected = + org.apache.iceberg.expressions.Expressions.notNull("field1"); + assertPredicatesMatch(expected, actual.get()); + } + + @Test + public void testAnd() { + Expression expr = + resolve( + Expressions.$("field1") + .isEqual(Expressions.lit(1)) + .and(Expressions.$("field2").isEqual(Expressions.lit(2L)))); + Optional actual = FlinkFilters.convert(expr); + assertThat(actual).isPresent(); + And and = (And) actual.get(); + And expected = + (And) + org.apache.iceberg.expressions.Expressions.and( + org.apache.iceberg.expressions.Expressions.equal("field1", 1), + org.apache.iceberg.expressions.Expressions.equal("field2", 2L)); + + assertPredicatesMatch(expected.left(), and.left()); + assertPredicatesMatch(expected.right(), and.right()); + } + + @Test + public void testOr() { + Expression expr = + resolve( + Expressions.$("field1") + 
.isEqual(Expressions.lit(1)) + .or(Expressions.$("field2").isEqual(Expressions.lit(2L)))); + Optional actual = FlinkFilters.convert(expr); + assertThat(actual).isPresent(); + Or or = (Or) actual.get(); + Or expected = + (Or) + org.apache.iceberg.expressions.Expressions.or( + org.apache.iceberg.expressions.Expressions.equal("field1", 1), + org.apache.iceberg.expressions.Expressions.equal("field2", 2L)); + + assertPredicatesMatch(expected.left(), or.left()); + assertPredicatesMatch(expected.right(), or.right()); + } + + @Test + public void testNot() { + Expression expr = + resolve( + ApiExpressionUtils.unresolvedCall( + BuiltInFunctionDefinitions.NOT, + Expressions.$("field1").isEqual(Expressions.lit(1)))); + Optional actual = FlinkFilters.convert(expr); + assertThat(actual).isPresent(); + Not not = (Not) actual.get(); + Not expected = + (Not) + org.apache.iceberg.expressions.Expressions.not( + org.apache.iceberg.expressions.Expressions.equal("field1", 1)); + + assertThat(not.op()).as("Predicate operation should match").isEqualTo(expected.op()); + assertPredicatesMatch(expected.child(), not.child()); + } + + @Test + public void testLike() { + UnboundPredicate expected = + org.apache.iceberg.expressions.Expressions.startsWith("field5", "abc"); + Expression expr = + resolve( + ApiExpressionUtils.unresolvedCall( + BuiltInFunctionDefinitions.LIKE, Expressions.$("field5"), Expressions.lit("abc%"))); + Optional actual = FlinkFilters.convert(expr); + assertThat(actual).isPresent(); + assertPredicatesMatch(expected, actual.get()); + + expr = + resolve( + ApiExpressionUtils.unresolvedCall( + BuiltInFunctionDefinitions.LIKE, Expressions.$("field5"), Expressions.lit("%abc"))); + actual = FlinkFilters.convert(expr); + assertThat(actual).isNotPresent(); + + expr = + resolve( + ApiExpressionUtils.unresolvedCall( + BuiltInFunctionDefinitions.LIKE, + Expressions.$("field5"), + Expressions.lit("%abc%"))); + actual = FlinkFilters.convert(expr); + assertThat(actual).isNotPresent(); + + expr = + resolve( + ApiExpressionUtils.unresolvedCall( + BuiltInFunctionDefinitions.LIKE, + Expressions.$("field5"), + Expressions.lit("abc%d"))); + actual = FlinkFilters.convert(expr); + assertThat(actual).isNotPresent(); + + expr = + resolve( + ApiExpressionUtils.unresolvedCall( + BuiltInFunctionDefinitions.LIKE, Expressions.$("field5"), Expressions.lit("%"))); + actual = FlinkFilters.convert(expr); + assertThat(actual).isNotPresent(); + + expr = + resolve( + ApiExpressionUtils.unresolvedCall( + BuiltInFunctionDefinitions.LIKE, Expressions.$("field5"), Expressions.lit("a_"))); + actual = FlinkFilters.convert(expr); + assertThat(actual).isNotPresent(); + + expr = + resolve( + ApiExpressionUtils.unresolvedCall( + BuiltInFunctionDefinitions.LIKE, Expressions.$("field5"), Expressions.lit("a%b"))); + actual = FlinkFilters.convert(expr); + assertThat(actual).isNotPresent(); + } + + @SuppressWarnings("unchecked") + private void matchLiteral(String fieldName, Object flinkLiteral, T icebergLiteral) { + Expression expr = resolve(Expressions.$(fieldName).isEqual(Expressions.lit(flinkLiteral))); + Optional actual = FlinkFilters.convert(expr); + assertThat(actual).isPresent(); + org.apache.iceberg.expressions.Expression expression = actual.get(); + assertThat(expression) + .as("The expression should be a UnboundPredicate") + .isInstanceOf(UnboundPredicate.class); + UnboundPredicate unboundPredicate = (UnboundPredicate) expression; + + org.apache.iceberg.expressions.Expression expression1 = + 
unboundPredicate.bind(FlinkSchemaUtil.convert(TABLE_SCHEMA).asStruct(), false); + assertThat(expression1) + .as("The expression should be a BoundLiteralPredicate") + .isInstanceOf(BoundLiteralPredicate.class); + + BoundLiteralPredicate predicate = (BoundLiteralPredicate) expression1; + assertThat(predicate.test(icebergLiteral)).isTrue(); + } + + private static Expression resolve(Expression originalExpression) { + return originalExpression.accept( + new ApiExpressionDefaultVisitor() { + @Override + public Expression visit(UnresolvedReferenceExpression unresolvedReference) { + String name = unresolvedReference.getName(); + Optional field = TABLE_SCHEMA.getTableColumn(name); + if (field.isPresent()) { + int index = TABLE_SCHEMA.getTableColumns().indexOf(field.get()); + return new FieldReferenceExpression(name, field.get().getType(), 0, index); + } else { + return null; + } + } + + @Override + public Expression visit(UnresolvedCallExpression unresolvedCall) { + List children = + unresolvedCall.getChildren().stream() + .map(e -> (ResolvedExpression) e.accept(this)) + .collect(Collectors.toList()); + return new CallExpression( + unresolvedCall.getFunctionDefinition(), children, DataTypes.STRING()); + } + + @Override + public Expression visit(ValueLiteralExpression valueLiteral) { + return valueLiteral; + } + + @Override + protected Expression defaultMethod(Expression expression) { + throw new UnsupportedOperationException( + String.format("unsupported expression: %s", expression)); + } + }); + } + + private void assertPredicatesMatch( + org.apache.iceberg.expressions.Expression expected, + org.apache.iceberg.expressions.Expression actual) { + assertThat(expected) + .as("The expected expression should be a UnboundPredicate") + .isInstanceOf(UnboundPredicate.class); + assertThat(actual) + .as("The actual expression should be a UnboundPredicate") + .isInstanceOf(UnboundPredicate.class); + UnboundPredicate predicateExpected = (UnboundPredicate) expected; + UnboundPredicate predicateActual = (UnboundPredicate) actual; + assertThat(predicateActual.op()).isEqualTo(predicateExpected.op()); + assertThat(predicateActual.literal()).isEqualTo(predicateExpected.literal()); + assertThat(predicateActual.ref().name()).isEqualTo(predicateExpected.ref().name()); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkHiveCatalog.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkHiveCatalog.java new file mode 100644 index 000000000000..91343ab1ee72 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkHiveCatalog.java @@ -0,0 +1,101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Map; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.iceberg.CatalogProperties; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.junit.jupiter.api.Test; + +public class TestFlinkHiveCatalog extends TestBase { + + @Test + public void testCreateCatalogWithWarehouseLocation() throws IOException { + Map props = Maps.newHashMap(); + props.put("type", "iceberg"); + props.put(FlinkCatalogFactory.ICEBERG_CATALOG_TYPE, "hive"); + props.put(CatalogProperties.URI, CatalogTestBase.getURI(hiveConf)); + + File warehouseDir = Files.createTempDirectory(temporaryDirectory, "junit").toFile(); + props.put(CatalogProperties.WAREHOUSE_LOCATION, "file://" + warehouseDir.getAbsolutePath()); + + checkSQLQuery(props, warehouseDir); + } + + @Test + public void testCreateCatalogWithHiveConfDir() throws IOException { + // Dump the hive conf into a local file. + File hiveConfDir = Files.createTempDirectory(temporaryDirectory, "junit").toFile(); + File hiveSiteXML = new File(hiveConfDir, "hive-site.xml"); + File warehouseDir = Files.createTempDirectory(temporaryDirectory, "junit").toFile(); + try (FileOutputStream fos = new FileOutputStream(hiveSiteXML)) { + Configuration newConf = new Configuration(hiveConf); + // Set another new directory which is different with the hive metastore's warehouse path. + newConf.set( + HiveConf.ConfVars.METASTOREWAREHOUSE.varname, "file://" + warehouseDir.getAbsolutePath()); + newConf.writeXml(fos); + } + assertThat(hiveSiteXML.toPath()).exists(); + + // Construct the catalog attributions. 
+ Map props = Maps.newHashMap(); + props.put("type", "iceberg"); + props.put(FlinkCatalogFactory.ICEBERG_CATALOG_TYPE, "hive"); + props.put(CatalogProperties.URI, CatalogTestBase.getURI(hiveConf)); + // Set the 'hive-conf-dir' instead of 'warehouse' + props.put(FlinkCatalogFactory.HIVE_CONF_DIR, hiveConfDir.getAbsolutePath()); + + checkSQLQuery(props, warehouseDir); + } + + private void checkSQLQuery(Map catalogProperties, File warehouseDir) + throws IOException { + sql("CREATE CATALOG test_catalog WITH %s", CatalogTestBase.toWithClause(catalogProperties)); + sql("USE CATALOG test_catalog"); + sql("CREATE DATABASE test_db"); + sql("USE test_db"); + sql("CREATE TABLE test_table(c1 INT, c2 STRING)"); + sql("INSERT INTO test_table SELECT 1, 'a'"); + + Path databasePath = warehouseDir.toPath().resolve("test_db.db"); + assertThat(databasePath).exists(); + + Path tablePath = databasePath.resolve("test_table"); + assertThat(tablePath).exists(); + + Path dataPath = tablePath.resolve("data"); + assertThat(dataPath).exists(); + assertThat(Files.list(dataPath).count()) + .as("Should have a .crc file and a .parquet file") + .isEqualTo(2); + + sql("DROP TABLE test_table"); + dropDatabase("test_db", false); + dropCatalog("test_catalog", false); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkSchemaUtil.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkSchemaUtil.java new file mode 100644 index 000000000000..eab60d886ada --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkSchemaUtil.java @@ -0,0 +1,416 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import org.apache.flink.table.api.DataTypes; +import org.apache.flink.table.api.TableSchema; +import org.apache.flink.table.api.ValidationException; +import org.apache.flink.table.types.logical.BinaryType; +import org.apache.flink.table.types.logical.CharType; +import org.apache.flink.table.types.logical.LocalZonedTimestampType; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.table.types.logical.TimeType; +import org.apache.flink.table.types.logical.TimestampType; +import org.apache.flink.table.types.logical.VarBinaryType; +import org.apache.flink.table.types.logical.VarCharType; +import org.apache.iceberg.Schema; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Sets; +import org.apache.iceberg.types.Type; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.Test; + +public class TestFlinkSchemaUtil { + + @Test + public void testConvertFlinkSchemaToIcebergSchema() { + TableSchema flinkSchema = + TableSchema.builder() + .field("id", DataTypes.INT().notNull()) + .field("name", DataTypes.STRING()) /* optional by default */ + .field("salary", DataTypes.DOUBLE().notNull()) + .field( + "locations", + DataTypes.MAP( + DataTypes.STRING(), + DataTypes.ROW( + DataTypes.FIELD("posX", DataTypes.DOUBLE().notNull(), "X field"), + DataTypes.FIELD("posY", DataTypes.DOUBLE().notNull(), "Y field")))) + .field("strArray", DataTypes.ARRAY(DataTypes.STRING()).nullable()) + .field("intArray", DataTypes.ARRAY(DataTypes.INT()).nullable()) + .field("char", DataTypes.CHAR(10).notNull()) + .field("varchar", DataTypes.VARCHAR(10).notNull()) + .field("boolean", DataTypes.BOOLEAN().nullable()) + .field("tinyint", DataTypes.TINYINT()) + .field("smallint", DataTypes.SMALLINT()) + .field("bigint", DataTypes.BIGINT()) + .field("varbinary", DataTypes.VARBINARY(10)) + .field("binary", DataTypes.BINARY(10)) + .field("time", DataTypes.TIME()) + .field("timestampWithoutZone", DataTypes.TIMESTAMP()) + .field("timestampWithZone", DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE()) + .field("date", DataTypes.DATE()) + .field("decimal", DataTypes.DECIMAL(2, 2)) + .field("decimal2", DataTypes.DECIMAL(38, 2)) + .field("decimal3", DataTypes.DECIMAL(10, 1)) + .field("multiset", DataTypes.MULTISET(DataTypes.STRING().notNull())) + .build(); + + Schema icebergSchema = + new Schema( + Types.NestedField.required(0, "id", Types.IntegerType.get(), null), + Types.NestedField.optional(1, "name", Types.StringType.get(), null), + Types.NestedField.required(2, "salary", Types.DoubleType.get(), null), + Types.NestedField.optional( + 3, + "locations", + Types.MapType.ofOptional( + 24, + 25, + Types.StringType.get(), + Types.StructType.of( + Types.NestedField.required(22, "posX", Types.DoubleType.get(), "X field"), + Types.NestedField.required( + 23, "posY", Types.DoubleType.get(), "Y field")))), + Types.NestedField.optional( + 4, "strArray", Types.ListType.ofOptional(26, Types.StringType.get())), + Types.NestedField.optional( + 5, "intArray", Types.ListType.ofOptional(27, Types.IntegerType.get())), + Types.NestedField.required(6, "char", Types.StringType.get()), + Types.NestedField.required(7, "varchar", Types.StringType.get()), + Types.NestedField.optional(8, "boolean", Types.BooleanType.get()), + 
Types.NestedField.optional(9, "tinyint", Types.IntegerType.get()), + Types.NestedField.optional(10, "smallint", Types.IntegerType.get()), + Types.NestedField.optional(11, "bigint", Types.LongType.get()), + Types.NestedField.optional(12, "varbinary", Types.BinaryType.get()), + Types.NestedField.optional(13, "binary", Types.FixedType.ofLength(10)), + Types.NestedField.optional(14, "time", Types.TimeType.get()), + Types.NestedField.optional( + 15, "timestampWithoutZone", Types.TimestampType.withoutZone()), + Types.NestedField.optional(16, "timestampWithZone", Types.TimestampType.withZone()), + Types.NestedField.optional(17, "date", Types.DateType.get()), + Types.NestedField.optional(18, "decimal", Types.DecimalType.of(2, 2)), + Types.NestedField.optional(19, "decimal2", Types.DecimalType.of(38, 2)), + Types.NestedField.optional(20, "decimal3", Types.DecimalType.of(10, 1)), + Types.NestedField.optional( + 21, + "multiset", + Types.MapType.ofRequired(28, 29, Types.StringType.get(), Types.IntegerType.get()))); + + checkSchema(flinkSchema, icebergSchema); + } + + @Test + public void testMapField() { + TableSchema flinkSchema = + TableSchema.builder() + .field( + "map_int_long", + DataTypes.MAP(DataTypes.INT(), DataTypes.BIGINT()).notNull()) /* Required */ + .field( + "map_int_array_string", + DataTypes.MAP(DataTypes.ARRAY(DataTypes.INT()), DataTypes.STRING())) + .field( + "map_decimal_string", DataTypes.MAP(DataTypes.DECIMAL(10, 2), DataTypes.STRING())) + .field( + "map_fields_fields", + DataTypes.MAP( + DataTypes.ROW( + DataTypes.FIELD("field_int", DataTypes.INT(), "doc - int"), + DataTypes.FIELD("field_string", DataTypes.STRING(), "doc - string")) + .notNull(), /* Required */ + DataTypes.ROW( + DataTypes.FIELD( + "field_array", + DataTypes.ARRAY(DataTypes.STRING()), + "doc - array")) + .notNull() /* Required */) + .notNull() /* Required */) + .build(); + + Schema icebergSchema = + new Schema( + Types.NestedField.required( + 0, + "map_int_long", + Types.MapType.ofOptional(4, 5, Types.IntegerType.get(), Types.LongType.get()), + null), + Types.NestedField.optional( + 1, + "map_int_array_string", + Types.MapType.ofOptional( + 7, + 8, + Types.ListType.ofOptional(6, Types.IntegerType.get()), + Types.StringType.get()), + null), + Types.NestedField.optional( + 2, + "map_decimal_string", + Types.MapType.ofOptional( + 9, 10, Types.DecimalType.of(10, 2), Types.StringType.get())), + Types.NestedField.required( + 3, + "map_fields_fields", + Types.MapType.ofRequired( + 15, + 16, + Types.StructType.of( + Types.NestedField.optional( + 11, "field_int", Types.IntegerType.get(), "doc - int"), + Types.NestedField.optional( + 12, "field_string", Types.StringType.get(), "doc - string")), + Types.StructType.of( + Types.NestedField.optional( + 14, + "field_array", + Types.ListType.ofOptional(13, Types.StringType.get()), + "doc - array"))))); + + checkSchema(flinkSchema, icebergSchema); + } + + @Test + public void testStructField() { + TableSchema flinkSchema = + TableSchema.builder() + .field( + "struct_int_string_decimal", + DataTypes.ROW( + DataTypes.FIELD("field_int", DataTypes.INT()), + DataTypes.FIELD("field_string", DataTypes.STRING()), + DataTypes.FIELD("field_decimal", DataTypes.DECIMAL(19, 2)), + DataTypes.FIELD( + "field_struct", + DataTypes.ROW( + DataTypes.FIELD("inner_struct_int", DataTypes.INT()), + DataTypes.FIELD( + "inner_struct_float_array", + DataTypes.ARRAY(DataTypes.FLOAT()))) + .notNull()) /* Row is required */) + .notNull()) /* Required */ + .field( + "struct_map_int_int", + DataTypes.ROW( + 
DataTypes.FIELD( + "field_map", DataTypes.MAP(DataTypes.INT(), DataTypes.INT()))) + .nullable()) /* Optional */ + .build(); + + Schema icebergSchema = + new Schema( + Types.NestedField.required( + 0, + "struct_int_string_decimal", + Types.StructType.of( + Types.NestedField.optional(5, "field_int", Types.IntegerType.get()), + Types.NestedField.optional(6, "field_string", Types.StringType.get()), + Types.NestedField.optional(7, "field_decimal", Types.DecimalType.of(19, 2)), + Types.NestedField.required( + 8, + "field_struct", + Types.StructType.of( + Types.NestedField.optional( + 3, "inner_struct_int", Types.IntegerType.get()), + Types.NestedField.optional( + 4, + "inner_struct_float_array", + Types.ListType.ofOptional(2, Types.FloatType.get())))))), + Types.NestedField.optional( + 1, + "struct_map_int_int", + Types.StructType.of( + Types.NestedField.optional( + 11, + "field_map", + Types.MapType.ofOptional( + 9, 10, Types.IntegerType.get(), Types.IntegerType.get()))))); + + checkSchema(flinkSchema, icebergSchema); + } + + @Test + public void testListField() { + TableSchema flinkSchema = + TableSchema.builder() + .field( + "list_struct_fields", + DataTypes.ARRAY(DataTypes.ROW(DataTypes.FIELD("field_int", DataTypes.INT()))) + .notNull()) /* Required */ + .field( + "list_optional_struct_fields", + DataTypes.ARRAY( + DataTypes.ROW( + DataTypes.FIELD( + "field_timestamp_with_local_time_zone", + DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE()))) + .nullable()) /* Optional */ + .field( + "list_map_fields", + DataTypes.ARRAY( + DataTypes.MAP( + DataTypes.ARRAY( + DataTypes.INT().notNull()), /* Key of map must be required */ + DataTypes.ROW( + DataTypes.FIELD("field_0", DataTypes.INT(), "doc - int"))) + .notNull()) + .notNull()) /* Required */ + .build(); + + Schema icebergSchema = + new Schema( + Types.NestedField.required( + 0, + "list_struct_fields", + Types.ListType.ofOptional( + 4, + Types.StructType.of( + Types.NestedField.optional(3, "field_int", Types.IntegerType.get())))), + Types.NestedField.optional( + 1, + "list_optional_struct_fields", + Types.ListType.ofOptional( + 6, + Types.StructType.of( + Types.NestedField.optional( + 5, + "field_timestamp_with_local_time_zone", + Types.TimestampType.withZone())))), + Types.NestedField.required( + 2, + "list_map_fields", + Types.ListType.ofRequired( + 11, + Types.MapType.ofOptional( + 9, + 10, + Types.ListType.ofRequired(7, Types.IntegerType.get()), + Types.StructType.of( + Types.NestedField.optional( + 8, "field_0", Types.IntegerType.get(), "doc - int")))))); + + checkSchema(flinkSchema, icebergSchema); + } + + private void checkSchema(TableSchema flinkSchema, Schema icebergSchema) { + assertThat(FlinkSchemaUtil.convert(flinkSchema).asStruct()).isEqualTo(icebergSchema.asStruct()); + // The conversion is not a 1:1 mapping, so we just check iceberg types. 
+ assertThat( + FlinkSchemaUtil.convert( + FlinkSchemaUtil.toSchema(FlinkSchemaUtil.convert(icebergSchema))) + .asStruct()) + .isEqualTo(icebergSchema.asStruct()); + } + + @Test + public void testInconsistentTypes() { + checkInconsistentType( + Types.UUIDType.get(), new BinaryType(16), new BinaryType(16), Types.FixedType.ofLength(16)); + checkInconsistentType( + Types.StringType.get(), + new VarCharType(VarCharType.MAX_LENGTH), + new CharType(100), + Types.StringType.get()); + checkInconsistentType( + Types.BinaryType.get(), + new VarBinaryType(VarBinaryType.MAX_LENGTH), + new VarBinaryType(100), + Types.BinaryType.get()); + checkInconsistentType( + Types.TimeType.get(), new TimeType(), new TimeType(3), Types.TimeType.get()); + checkInconsistentType( + Types.TimestampType.withoutZone(), + new TimestampType(6), + new TimestampType(3), + Types.TimestampType.withoutZone()); + checkInconsistentType( + Types.TimestampType.withZone(), + new LocalZonedTimestampType(6), + new LocalZonedTimestampType(3), + Types.TimestampType.withZone()); + } + + private void checkInconsistentType( + Type icebergType, + LogicalType flinkExpectedType, + LogicalType flinkType, + Type icebergExpectedType) { + assertThat(FlinkSchemaUtil.convert(icebergType)).isEqualTo(flinkExpectedType); + assertThat(FlinkSchemaUtil.convert(FlinkSchemaUtil.toSchema(RowType.of(flinkType))).asStruct()) + .isEqualTo(Types.StructType.of(Types.NestedField.optional(0, "f0", icebergExpectedType))); + } + + @Test + public void testConvertFlinkSchemaBaseOnIcebergSchema() { + Schema baseSchema = + new Schema( + Lists.newArrayList( + Types.NestedField.required(101, "int", Types.IntegerType.get()), + Types.NestedField.optional(102, "string", Types.StringType.get())), + Sets.newHashSet(101)); + + TableSchema flinkSchema = + TableSchema.builder() + .field("int", DataTypes.INT().notNull()) + .field("string", DataTypes.STRING().nullable()) + .primaryKey("int") + .build(); + Schema convertedSchema = FlinkSchemaUtil.convert(baseSchema, flinkSchema); + assertThat(convertedSchema.asStruct()).isEqualTo(baseSchema.asStruct()); + assertThat(convertedSchema.identifierFieldIds()).containsExactly(101); + } + + @Test + public void testConvertFlinkSchemaWithPrimaryKeys() { + Schema icebergSchema = + new Schema( + Lists.newArrayList( + Types.NestedField.required(1, "int", Types.IntegerType.get()), + Types.NestedField.required(2, "string", Types.StringType.get())), + Sets.newHashSet(1, 2)); + + TableSchema tableSchema = FlinkSchemaUtil.toSchema(icebergSchema); + assertThat(tableSchema.getPrimaryKey()) + .isPresent() + .get() + .satisfies(k -> assertThat(k.getColumns()).containsExactly("int", "string")); + } + + @Test + public void testConvertFlinkSchemaWithNestedColumnInPrimaryKeys() { + Schema icebergSchema = + new Schema( + Lists.newArrayList( + Types.NestedField.required( + 1, + "struct", + Types.StructType.of( + Types.NestedField.required(2, "inner", Types.IntegerType.get())))), + Sets.newHashSet(2)); + + assertThatThrownBy(() -> FlinkSchemaUtil.toSchema(icebergSchema)) + .isInstanceOf(ValidationException.class) + .hasMessageStartingWith("Could not create a PRIMARY KEY") + .hasMessageContaining("Column 'struct.inner' does not exist."); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSink.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSink.java new file mode 100644 index 000000000000..2978a92945a2 --- /dev/null +++ 
b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSink.java @@ -0,0 +1,244 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink; + +import static org.assertj.core.api.Assumptions.assumeThat; + +import java.util.List; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.EnvironmentSettings; +import org.apache.flink.table.api.Expressions; +import org.apache.flink.table.api.TableEnvironment; +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Parameter; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.Table; +import org.apache.iceberg.catalog.Namespace; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.flink.source.BoundedTableFactory; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.TestTemplate; + +public class TestFlinkTableSink extends CatalogTestBase { + + private static final String SOURCE_TABLE = "default_catalog.default_database.bounded_source"; + private static final String TABLE_NAME = "test_table"; + private TableEnvironment tEnv; + private Table icebergTable; + + @Parameter(index = 2) + private FileFormat format; + + @Parameter(index = 3) + private boolean isStreamingJob; + + @Parameters(name = "catalogName={0}, baseNamespace={1}, format={2}, isStreaming={3}") + public static List parameters() { + List parameters = Lists.newArrayList(); + for (FileFormat format : + new FileFormat[] {FileFormat.ORC, FileFormat.AVRO, FileFormat.PARQUET}) { + for (Boolean isStreaming : new Boolean[] {true, false}) { + for (Object[] catalogParams : CatalogTestBase.parameters()) { + String catalogName = (String) catalogParams[0]; + Namespace baseNamespace = (Namespace) catalogParams[1]; + parameters.add(new Object[] {catalogName, baseNamespace, format, isStreaming}); + } + } + } + return parameters; + } + + @Override + protected TableEnvironment getTableEnv() { + if (tEnv == null) { + synchronized (this) { + EnvironmentSettings.Builder settingsBuilder = EnvironmentSettings.newInstance(); + if (isStreamingJob) { + settingsBuilder.inStreamingMode(); + StreamExecutionEnvironment env = + StreamExecutionEnvironment.getExecutionEnvironment( + MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG); + env.enableCheckpointing(400); + env.setMaxParallelism(2); + env.setParallelism(2); + tEnv = StreamTableEnvironment.create(env, settingsBuilder.build()); + } else { + settingsBuilder.inBatchMode(); + tEnv = TableEnvironment.create(settingsBuilder.build()); + } + } + } + return 
tEnv; + } + + @Override + @BeforeEach + public void before() { + super.before(); + sql("CREATE DATABASE %s", flinkDatabase); + sql("USE CATALOG %s", catalogName); + sql("USE %s", DATABASE); + sql( + "CREATE TABLE %s (id int, data varchar) with ('write.format.default'='%s')", + TABLE_NAME, format.name()); + icebergTable = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME)); + } + + @Override + @AfterEach + public void clean() { + sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, TABLE_NAME); + dropDatabase(flinkDatabase, true); + BoundedTableFactory.clearDataSets(); + super.clean(); + } + + @TestTemplate + public void testInsertFromSourceTable() throws Exception { + // Register the rows into a temporary table. + getTableEnv() + .createTemporaryView( + "sourceTable", + getTableEnv() + .fromValues( + SimpleDataUtil.FLINK_SCHEMA.toRowDataType(), + Expressions.row(1, "hello"), + Expressions.row(2, "world"), + Expressions.row(3, (String) null), + Expressions.row(null, "bar"))); + + // Redirect the records from source table to destination table. + sql("INSERT INTO %s SELECT id,data from sourceTable", TABLE_NAME); + + // Assert the table records as expected. + SimpleDataUtil.assertTableRecords( + icebergTable, + Lists.newArrayList( + SimpleDataUtil.createRecord(1, "hello"), + SimpleDataUtil.createRecord(2, "world"), + SimpleDataUtil.createRecord(3, null), + SimpleDataUtil.createRecord(null, "bar"))); + } + + @TestTemplate + public void testOverwriteTable() throws Exception { + assumeThat(isStreamingJob) + .as("Flink unbounded streaming does not support overwrite operation") + .isFalse(); + + sql("INSERT INTO %s SELECT 1, 'a'", TABLE_NAME); + SimpleDataUtil.assertTableRecords( + icebergTable, Lists.newArrayList(SimpleDataUtil.createRecord(1, "a"))); + + sql("INSERT OVERWRITE %s SELECT 2, 'b'", TABLE_NAME); + SimpleDataUtil.assertTableRecords( + icebergTable, Lists.newArrayList(SimpleDataUtil.createRecord(2, "b"))); + } + + @TestTemplate + public void testReplacePartitions() throws Exception { + assumeThat(isStreamingJob) + .as("Flink unbounded streaming does not support overwrite operation") + .isFalse(); + String tableName = "test_partition"; + sql( + "CREATE TABLE %s(id INT, data VARCHAR) PARTITIONED BY (data) WITH ('write.format.default'='%s')", + tableName, format.name()); + + try { + Table partitionedTable = + validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, tableName)); + + sql("INSERT INTO %s SELECT 1, 'a'", tableName); + sql("INSERT INTO %s SELECT 2, 'b'", tableName); + sql("INSERT INTO %s SELECT 3, 'c'", tableName); + + SimpleDataUtil.assertTableRecords( + partitionedTable, + Lists.newArrayList( + SimpleDataUtil.createRecord(1, "a"), + SimpleDataUtil.createRecord(2, "b"), + SimpleDataUtil.createRecord(3, "c"))); + + sql("INSERT OVERWRITE %s SELECT 4, 'b'", tableName); + sql("INSERT OVERWRITE %s SELECT 5, 'a'", tableName); + + SimpleDataUtil.assertTableRecords( + partitionedTable, + Lists.newArrayList( + SimpleDataUtil.createRecord(5, "a"), + SimpleDataUtil.createRecord(4, "b"), + SimpleDataUtil.createRecord(3, "c"))); + + sql("INSERT OVERWRITE %s PARTITION (data='a') SELECT 6", tableName); + + SimpleDataUtil.assertTableRecords( + partitionedTable, + Lists.newArrayList( + SimpleDataUtil.createRecord(6, "a"), + SimpleDataUtil.createRecord(4, "b"), + SimpleDataUtil.createRecord(3, "c"))); + } finally { + sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, tableName); + } + } + + @TestTemplate + public void testInsertIntoPartition() throws Exception { + 
String tableName = "test_insert_into_partition"; + sql( + "CREATE TABLE %s(id INT, data VARCHAR) PARTITIONED BY (data) WITH ('write.format.default'='%s')", + tableName, format.name()); + + try { + Table partitionedTable = + validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, tableName)); + + // Full partition. + sql("INSERT INTO %s PARTITION (data='a') SELECT 1", tableName); + sql("INSERT INTO %s PARTITION (data='a') SELECT 2", tableName); + sql("INSERT INTO %s PARTITION (data='b') SELECT 3", tableName); + + SimpleDataUtil.assertTableRecords( + partitionedTable, + Lists.newArrayList( + SimpleDataUtil.createRecord(1, "a"), + SimpleDataUtil.createRecord(2, "a"), + SimpleDataUtil.createRecord(3, "b"))); + + // Partial partition. + sql("INSERT INTO %s SELECT 4, 'c'", tableName); + sql("INSERT INTO %s SELECT 5, 'd'", tableName); + + SimpleDataUtil.assertTableRecords( + partitionedTable, + Lists.newArrayList( + SimpleDataUtil.createRecord(1, "a"), + SimpleDataUtil.createRecord(2, "a"), + SimpleDataUtil.createRecord(3, "b"), + SimpleDataUtil.createRecord(4, "c"), + SimpleDataUtil.createRecord(5, "d"))); + } finally { + sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, tableName); + } + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSinkExtended.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSinkExtended.java new file mode 100644 index 000000000000..482cfd110bde --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSinkExtended.java @@ -0,0 +1,244 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink; + +import static org.apache.iceberg.flink.FlinkCatalogFactory.ICEBERG_CATALOG_TYPE_HADOOP; +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.File; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; +import java.util.stream.IntStream; +import org.apache.flink.api.dag.Transformation; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.EnvironmentSettings; +import org.apache.flink.table.api.TableEnvironment; +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; +import org.apache.flink.table.api.internal.TableEnvironmentImpl; +import org.apache.flink.table.operations.ModifyOperation; +import org.apache.flink.table.planner.delegation.PlannerBase; +import org.apache.flink.test.junit5.MiniClusterExtension; +import org.apache.flink.types.Row; +import org.apache.hadoop.conf.Configuration; +import org.apache.iceberg.CatalogProperties; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.DistributionMode; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Parameter; +import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.catalog.Namespace; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.flink.source.BoundedTableFactory; +import org.apache.iceberg.hadoop.HadoopCatalog; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.extension.ExtendWith; +import org.junit.jupiter.api.extension.RegisterExtension; +import org.junit.jupiter.api.io.TempDir; + +/** + * This class tests the more extended features of Flink sink. Extract them separately since it is + * unnecessary to test all the parameters combinations in {@link TestFlinkTableSink}, like catalog + * types, namespaces, file format, streaming/batch. Those combinations explode exponentially. Each + * test method in {@link TestFlinkTableSink} runs 21 combinations, which are expensive and slow. + */ +@ExtendWith(ParameterizedTestExtension.class) +public class TestFlinkTableSinkExtended extends SqlBase { + protected static final String CATALOG = "testhadoop"; + protected static final String DATABASE = "db"; + protected static final String TABLE = "tbl"; + + @RegisterExtension + public static MiniClusterExtension miniClusterResource = + MiniFlinkClusterExtension.createWithClassloaderCheckDisabled(); + + private static final String SOURCE_TABLE = "default_catalog.default_database.bounded_source"; + private static final String FLINK_DATABASE = CATALOG + "." 
+ DATABASE; + private static final Namespace ICEBERG_NAMESPACE = Namespace.of(new String[] {DATABASE}); + + @TempDir protected File warehouseRoot; + + protected HadoopCatalog catalog = null; + + private TableEnvironment tEnv; + + @Parameter protected boolean isStreamingJob; + + @Parameters(name = "isStreamingJob={0}") + protected static List parameters() { + return Arrays.asList(new Boolean[] {true}, new Boolean[] {false}); + } + + protected synchronized TableEnvironment getTableEnv() { + if (tEnv == null) { + EnvironmentSettings.Builder settingsBuilder = EnvironmentSettings.newInstance(); + if (isStreamingJob) { + settingsBuilder.inStreamingMode(); + StreamExecutionEnvironment env = + StreamExecutionEnvironment.getExecutionEnvironment( + MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG); + env.enableCheckpointing(400); + env.setMaxParallelism(2); + env.setParallelism(2); + tEnv = StreamTableEnvironment.create(env, settingsBuilder.build()); + } else { + settingsBuilder.inBatchMode(); + tEnv = TableEnvironment.create(settingsBuilder.build()); + } + } + return tEnv; + } + + @BeforeEach + public void before() { + String warehouseLocation = "file:" + warehouseRoot.getPath(); + this.catalog = new HadoopCatalog(new Configuration(), warehouseLocation); + Map config = Maps.newHashMap(); + config.put("type", "iceberg"); + config.put(FlinkCatalogFactory.ICEBERG_CATALOG_TYPE, ICEBERG_CATALOG_TYPE_HADOOP); + config.put(CatalogProperties.WAREHOUSE_LOCATION, warehouseLocation); + sql("CREATE CATALOG %s WITH %s", CATALOG, toWithClause(config)); + + sql("CREATE DATABASE %s", FLINK_DATABASE); + sql("USE CATALOG %s", CATALOG); + sql("USE %s", DATABASE); + sql( + "CREATE TABLE %s (id int, data varchar) with ('write.format.default'='%s')", + TABLE, FileFormat.PARQUET.name()); + } + + @AfterEach + public void clean() throws Exception { + sql("DROP TABLE IF EXISTS %s.%s", FLINK_DATABASE, TABLE); + dropDatabase(FLINK_DATABASE, true); + BoundedTableFactory.clearDataSets(); + + dropCatalog(CATALOG, true); + catalog.close(); + } + + @TestTemplate + public void testWriteParallelism() { + List dataSet = + IntStream.range(1, 1000) + .mapToObj(i -> ImmutableList.of(Row.of(i, "aaa"), Row.of(i, "bbb"), Row.of(i, "ccc"))) + .flatMap(List::stream) + .collect(Collectors.toList()); + String dataId = BoundedTableFactory.registerDataSet(ImmutableList.of(dataSet)); + sql( + "CREATE TABLE %s(id INT NOT NULL, data STRING NOT NULL)" + + " WITH ('connector'='BoundedSource', 'data-id'='%s')", + SOURCE_TABLE, dataId); + + PlannerBase planner = (PlannerBase) ((TableEnvironmentImpl) getTableEnv()).getPlanner(); + String insertSQL = + String.format( + "INSERT INTO %s /*+ OPTIONS('write-parallelism'='1') */ SELECT * FROM %s", + TABLE, SOURCE_TABLE); + ModifyOperation operation = (ModifyOperation) planner.getParser().parse(insertSQL).get(0); + Transformation dummySink = planner.translate(Collections.singletonList(operation)).get(0); + Transformation committer = dummySink.getInputs().get(0); + Transformation writer = committer.getInputs().get(0); + + assertThat(writer.getParallelism()).as("Should have the expected 1 parallelism.").isEqualTo(1); + writer + .getInputs() + .forEach( + input -> + assertThat(input.getParallelism()) + .as("Should have the expected parallelism.") + .isEqualTo(isStreamingJob ? 2 : 4)); + } + + @TestTemplate + public void testHashDistributeMode() throws Exception { + // Initialize a BoundedSource table to precisely emit those rows in only one checkpoint. 
+ List dataSet = + IntStream.range(1, 1000) + .mapToObj(i -> ImmutableList.of(Row.of(i, "aaa"), Row.of(i, "bbb"), Row.of(i, "ccc"))) + .flatMap(List::stream) + .collect(Collectors.toList()); + String dataId = BoundedTableFactory.registerDataSet(ImmutableList.of(dataSet)); + sql( + "CREATE TABLE %s(id INT NOT NULL, data STRING NOT NULL)" + + " WITH ('connector'='BoundedSource', 'data-id'='%s')", + SOURCE_TABLE, dataId); + + assertThat(sql("SELECT * FROM %s", SOURCE_TABLE)) + .as("Should have the expected rows in source table.") + .containsExactlyInAnyOrderElementsOf(dataSet); + + Map tableProps = + ImmutableMap.of( + "write.format.default", + FileFormat.PARQUET.name(), + TableProperties.WRITE_DISTRIBUTION_MODE, + DistributionMode.HASH.modeName()); + + String tableName = "test_hash_distribution_mode"; + sql( + "CREATE TABLE %s(id INT, data VARCHAR) PARTITIONED BY (data) WITH %s", + tableName, toWithClause(tableProps)); + + try { + // Insert data set. + sql("INSERT INTO %s SELECT * FROM %s", tableName, SOURCE_TABLE); + + assertThat(sql("SELECT * FROM %s", tableName)) + .as("Should have the expected rows in sink table.") + .containsExactlyInAnyOrderElementsOf(dataSet); + + // Sometimes we will have more than one checkpoint if we pass the auto checkpoint interval, + // thus producing multiple snapshots. Here we assert that each snapshot has only 1 file per + // partition. + Table table = catalog.loadTable(TableIdentifier.of(ICEBERG_NAMESPACE, tableName)); + Map> snapshotToDataFiles = SimpleDataUtil.snapshotToDataFiles(table); + for (List dataFiles : snapshotToDataFiles.values()) { + if (dataFiles.isEmpty()) { + continue; + } + + assertThat( + SimpleDataUtil.matchingPartitions( + dataFiles, table.spec(), ImmutableMap.of("data", "aaa"))) + .hasSize(1); + assertThat( + SimpleDataUtil.matchingPartitions( + dataFiles, table.spec(), ImmutableMap.of("data", "bbb"))) + .hasSize(1); + assertThat( + SimpleDataUtil.matchingPartitions( + dataFiles, table.spec(), ImmutableMap.of("data", "ccc"))) + .hasSize(1); + } + } finally { + sql("DROP TABLE IF EXISTS %s.%s", FLINK_DATABASE, tableName); + } + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkUpsert.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkUpsert.java new file mode 100644 index 000000000000..c5becb6caca1 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestFlinkUpsert.java @@ -0,0 +1,334 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink; + +import java.time.LocalDate; +import java.util.List; +import java.util.Map; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.EnvironmentSettings; +import org.apache.flink.table.api.TableEnvironment; +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; +import org.apache.flink.types.Row; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Parameter; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.catalog.Namespace; +import org.apache.iceberg.relocated.com.google.common.collect.Iterables; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.TestTemplate; + +public class TestFlinkUpsert extends CatalogTestBase { + + @Parameter(index = 2) + private FileFormat format; + + @Parameter(index = 3) + private boolean isStreamingJob; + + private final Map tableUpsertProps = Maps.newHashMap(); + private TableEnvironment tEnv; + + @Parameters(name = "catalogName={0}, baseNamespace={1}, format={2}, isStreaming={3}") + public static List parameters() { + List parameters = Lists.newArrayList(); + for (FileFormat format : + new FileFormat[] {FileFormat.PARQUET, FileFormat.AVRO, FileFormat.ORC}) { + for (Boolean isStreaming : new Boolean[] {true, false}) { + // Only test with one catalog as this is a file operation concern. + // FlinkCatalogTestBase requires the catalog name start with testhadoop if using hadoop + // catalog. + String catalogName = "testhadoop"; + Namespace baseNamespace = Namespace.of("default"); + parameters.add(new Object[] {catalogName, baseNamespace, format, isStreaming}); + } + } + return parameters; + } + + @Override + protected TableEnvironment getTableEnv() { + if (tEnv == null) { + synchronized (this) { + EnvironmentSettings.Builder settingsBuilder = EnvironmentSettings.newInstance(); + if (isStreamingJob) { + settingsBuilder.inStreamingMode(); + StreamExecutionEnvironment env = + StreamExecutionEnvironment.getExecutionEnvironment( + MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG); + env.enableCheckpointing(400); + env.setMaxParallelism(2); + env.setParallelism(2); + tEnv = StreamTableEnvironment.create(env, settingsBuilder.build()); + } else { + settingsBuilder.inBatchMode(); + tEnv = TableEnvironment.create(settingsBuilder.build()); + } + } + } + return tEnv; + } + + @Override + @BeforeEach + public void before() { + super.before(); + sql("CREATE DATABASE IF NOT EXISTS %s", flinkDatabase); + sql("USE CATALOG %s", catalogName); + sql("USE %s", DATABASE); + tableUpsertProps.put(TableProperties.FORMAT_VERSION, "2"); + tableUpsertProps.put(TableProperties.UPSERT_ENABLED, "true"); + tableUpsertProps.put(TableProperties.DEFAULT_FILE_FORMAT, format.name()); + } + + @Override + @AfterEach + public void clean() { + dropDatabase(flinkDatabase, true); + super.clean(); + } + + @TestTemplate + public void testUpsertAndQuery() { + String tableName = "test_upsert_query"; + LocalDate dt20220301 = LocalDate.of(2022, 3, 1); + LocalDate dt20220302 = LocalDate.of(2022, 3, 2); + + sql( + "CREATE TABLE %s(id INT NOT NULL, name STRING NOT NULL, dt DATE, PRIMARY KEY(id,dt) NOT ENFORCED) " + + "PARTITIONED BY (dt) WITH %s", + tableName, toWithClause(tableUpsertProps)); + + try { + sql( + "INSERT INTO 
%s VALUES " + + "(1, 'Bill', DATE '2022-03-01')," + + "(1, 'Jane', DATE '2022-03-01')," + + "(2, 'Jane', DATE '2022-03-01')", + tableName); + + sql( + "INSERT INTO %s VALUES " + + "(2, 'Bill', DATE '2022-03-01')," + + "(1, 'Jane', DATE '2022-03-02')," + + "(2, 'Jane', DATE '2022-03-02')", + tableName); + + List rowsOn20220301 = + Lists.newArrayList(Row.of(1, "Jane", dt20220301), Row.of(2, "Bill", dt20220301)); + TestHelpers.assertRows( + sql("SELECT * FROM %s WHERE dt < '2022-03-02'", tableName), rowsOn20220301); + + List rowsOn20220302 = + Lists.newArrayList(Row.of(1, "Jane", dt20220302), Row.of(2, "Jane", dt20220302)); + TestHelpers.assertRows( + sql("SELECT * FROM %s WHERE dt = '2022-03-02'", tableName), rowsOn20220302); + + TestHelpers.assertRows( + sql("SELECT * FROM %s", tableName), + Lists.newArrayList(Iterables.concat(rowsOn20220301, rowsOn20220302))); + } finally { + sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, tableName); + } + } + + @TestTemplate + public void testUpsertOptions() { + String tableName = "test_upsert_options"; + LocalDate dt20220301 = LocalDate.of(2022, 3, 1); + LocalDate dt20220302 = LocalDate.of(2022, 3, 2); + + Map optionsUpsertProps = Maps.newHashMap(tableUpsertProps); + optionsUpsertProps.remove(TableProperties.UPSERT_ENABLED); + sql( + "CREATE TABLE %s(id INT NOT NULL, name STRING NOT NULL, dt DATE, PRIMARY KEY(id,dt) NOT ENFORCED) " + + "PARTITIONED BY (dt) WITH %s", + tableName, toWithClause(optionsUpsertProps)); + + try { + sql( + "INSERT INTO %s /*+ OPTIONS('upsert-enabled'='true')*/ VALUES " + + "(1, 'Bill', DATE '2022-03-01')," + + "(1, 'Jane', DATE '2022-03-01')," + + "(2, 'Jane', DATE '2022-03-01')", + tableName); + + sql( + "INSERT INTO %s /*+ OPTIONS('upsert-enabled'='true')*/ VALUES " + + "(2, 'Bill', DATE '2022-03-01')," + + "(1, 'Jane', DATE '2022-03-02')," + + "(2, 'Jane', DATE '2022-03-02')", + tableName); + + List rowsOn20220301 = + Lists.newArrayList(Row.of(1, "Jane", dt20220301), Row.of(2, "Bill", dt20220301)); + TestHelpers.assertRows( + sql("SELECT * FROM %s WHERE dt < '2022-03-02'", tableName), rowsOn20220301); + + List rowsOn20220302 = + Lists.newArrayList(Row.of(1, "Jane", dt20220302), Row.of(2, "Jane", dt20220302)); + TestHelpers.assertRows( + sql("SELECT * FROM %s WHERE dt = '2022-03-02'", tableName), rowsOn20220302); + + TestHelpers.assertRows( + sql("SELECT * FROM %s", tableName), + Lists.newArrayList(Iterables.concat(rowsOn20220301, rowsOn20220302))); + } finally { + sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, tableName); + } + } + + @TestTemplate + public void testPrimaryKeyEqualToPartitionKey() { + // This is an SQL based reproduction of TestFlinkIcebergSinkV2#testUpsertOnDataKey + String tableName = "upsert_on_id_key"; + try { + sql( + "CREATE TABLE %s(id INT NOT NULL, name STRING NOT NULL, PRIMARY KEY(id) NOT ENFORCED) " + + "PARTITIONED BY (id) WITH %s", + tableName, toWithClause(tableUpsertProps)); + + sql("INSERT INTO %s VALUES " + "(1, 'Bill')," + "(1, 'Jane')," + "(2, 'Bill')", tableName); + + TestHelpers.assertRows( + sql("SELECT * FROM %s", tableName), + Lists.newArrayList(Row.of(1, "Jane"), Row.of(2, "Bill"))); + + sql("INSERT INTO %s VALUES " + "(1, 'Bill')," + "(2, 'Jane')", tableName); + + TestHelpers.assertRows( + sql("SELECT * FROM %s", tableName), + Lists.newArrayList(Row.of(1, "Bill"), Row.of(2, "Jane"))); + + sql("INSERT INTO %s VALUES " + "(3, 'Bill')," + "(4, 'Jane')", tableName); + + TestHelpers.assertRows( + sql("SELECT * FROM %s", tableName), + Lists.newArrayList( + Row.of(1, "Bill"), Row.of(2, 
"Jane"), Row.of(3, "Bill"), Row.of(4, "Jane"))); + } finally { + sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, tableName); + } + } + + @TestTemplate + public void testPrimaryKeyFieldsAtBeginningOfSchema() { + String tableName = "upsert_on_pk_at_schema_start"; + LocalDate dt = LocalDate.of(2022, 3, 1); + try { + sql( + "CREATE TABLE %s(id INT, dt DATE NOT NULL, name STRING NOT NULL, PRIMARY KEY(id,dt) NOT ENFORCED) " + + "PARTITIONED BY (dt) WITH %s", + tableName, toWithClause(tableUpsertProps)); + + sql( + "INSERT INTO %s VALUES " + + "(1, DATE '2022-03-01', 'Andy')," + + "(1, DATE '2022-03-01', 'Bill')," + + "(2, DATE '2022-03-01', 'Jane')", + tableName); + + TestHelpers.assertRows( + sql("SELECT * FROM %s", tableName), + Lists.newArrayList(Row.of(1, dt, "Bill"), Row.of(2, dt, "Jane"))); + + sql( + "INSERT INTO %s VALUES " + + "(1, DATE '2022-03-01', 'Jane')," + + "(2, DATE '2022-03-01', 'Bill')", + tableName); + + TestHelpers.assertRows( + sql("SELECT * FROM %s", tableName), + Lists.newArrayList(Row.of(1, dt, "Jane"), Row.of(2, dt, "Bill"))); + + sql( + "INSERT INTO %s VALUES " + + "(3, DATE '2022-03-01', 'Duke')," + + "(4, DATE '2022-03-01', 'Leon')", + tableName); + + TestHelpers.assertRows( + sql("SELECT * FROM %s", tableName), + Lists.newArrayList( + Row.of(1, dt, "Jane"), + Row.of(2, dt, "Bill"), + Row.of(3, dt, "Duke"), + Row.of(4, dt, "Leon"))); + } finally { + sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, tableName); + } + } + + @TestTemplate + public void testPrimaryKeyFieldsAtEndOfTableSchema() { + // This is the same test case as testPrimaryKeyFieldsAtBeginningOfSchema, but the primary key + // fields + // are located at the end of the flink schema. + String tableName = "upsert_on_pk_at_schema_end"; + LocalDate dt = LocalDate.of(2022, 3, 1); + try { + sql( + "CREATE TABLE %s(name STRING NOT NULL, id INT, dt DATE NOT NULL, PRIMARY KEY(id,dt) NOT ENFORCED) " + + "PARTITIONED BY (dt) WITH %s", + tableName, toWithClause(tableUpsertProps)); + + sql( + "INSERT INTO %s VALUES " + + "('Andy', 1, DATE '2022-03-01')," + + "('Bill', 1, DATE '2022-03-01')," + + "('Jane', 2, DATE '2022-03-01')", + tableName); + + TestHelpers.assertRows( + sql("SELECT * FROM %s", tableName), + Lists.newArrayList(Row.of("Bill", 1, dt), Row.of("Jane", 2, dt))); + + sql( + "INSERT INTO %s VALUES " + + "('Jane', 1, DATE '2022-03-01')," + + "('Bill', 2, DATE '2022-03-01')", + tableName); + + TestHelpers.assertRows( + sql("SELECT * FROM %s", tableName), + Lists.newArrayList(Row.of("Jane", 1, dt), Row.of("Bill", 2, dt))); + + sql( + "INSERT INTO %s VALUES " + + "('Duke', 3, DATE '2022-03-01')," + + "('Leon', 4, DATE '2022-03-01')", + tableName); + + TestHelpers.assertRows( + sql("SELECT * FROM %s", tableName), + Lists.newArrayList( + Row.of("Jane", 1, dt), + Row.of("Bill", 2, dt), + Row.of("Duke", 3, dt), + Row.of("Leon", 4, dt))); + } finally { + sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, tableName); + } + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestHelpers.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestHelpers.java new file mode 100644 index 000000000000..8cebf950c5f0 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestHelpers.java @@ -0,0 +1,632 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.IOException; +import java.math.BigDecimal; +import java.nio.ByteBuffer; +import java.time.LocalDate; +import java.time.LocalDateTime; +import java.time.LocalTime; +import java.time.OffsetDateTime; +import java.util.Collection; +import java.util.List; +import java.util.Map; +import java.util.UUID; +import java.util.function.Consumer; +import java.util.stream.Collectors; +import org.apache.avro.generic.GenericData; +import org.apache.flink.api.common.ExecutionConfig; +import org.apache.flink.api.common.typeutils.TypeSerializer; +import org.apache.flink.api.java.typeutils.runtime.kryo.KryoSerializer; +import org.apache.flink.core.memory.DataInputDeserializer; +import org.apache.flink.core.memory.DataOutputSerializer; +import org.apache.flink.table.data.ArrayData; +import org.apache.flink.table.data.DecimalData; +import org.apache.flink.table.data.GenericArrayData; +import org.apache.flink.table.data.GenericMapData; +import org.apache.flink.table.data.MapData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.TimestampData; +import org.apache.flink.table.data.conversion.DataStructureConverter; +import org.apache.flink.table.data.conversion.DataStructureConverters; +import org.apache.flink.table.runtime.typeutils.InternalSerializers; +import org.apache.flink.table.types.logical.ArrayType; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.MapType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.table.types.utils.TypeConversions; +import org.apache.flink.types.Row; +import org.apache.iceberg.ContentFile; +import org.apache.iceberg.ManifestFile; +import org.apache.iceberg.Schema; +import org.apache.iceberg.StructLike; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.data.RowDataUtil; +import org.apache.iceberg.flink.source.FlinkInputFormat; +import org.apache.iceberg.flink.source.FlinkInputSplit; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Streams; +import org.apache.iceberg.types.Type; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.util.DateTimeUtil; + +public class TestHelpers { + private TestHelpers() {} + + public static T roundTripKryoSerialize(Class clazz, T table) throws IOException { + KryoSerializer kryo = new KryoSerializer<>(clazz, new ExecutionConfig()); + + DataOutputSerializer outputView = new DataOutputSerializer(1024); + kryo.serialize(table, outputView); + + DataInputDeserializer inputView = new DataInputDeserializer(outputView.getCopyOfBuffer()); + return kryo.deserialize(inputView); + } + + public static RowData copyRowData(RowData from, RowType rowType) { + TypeSerializer[] fieldSerializers = + rowType.getChildren().stream() + 
.map((LogicalType type) -> InternalSerializers.create(type)) + .toArray(TypeSerializer[]::new); + RowData.FieldGetter[] fieldGetters = new RowData.FieldGetter[rowType.getFieldCount()]; + for (int i = 0; i < rowType.getFieldCount(); ++i) { + fieldGetters[i] = RowData.createFieldGetter(rowType.getTypeAt(i), i); + } + + return RowDataUtil.clone(from, null, rowType, fieldSerializers, fieldGetters); + } + + public static void readRowData(FlinkInputFormat input, Consumer visitor) + throws IOException { + for (FlinkInputSplit s : input.createInputSplits(0)) { + input.open(s); + try { + while (!input.reachedEnd()) { + RowData row = input.nextRecord(null); + visitor.accept(row); + } + } finally { + input.close(); + } + } + } + + public static List readRowData(FlinkInputFormat inputFormat, RowType rowType) + throws IOException { + List results = Lists.newArrayList(); + readRowData(inputFormat, row -> results.add(copyRowData(row, rowType))); + return results; + } + + public static List readRows(FlinkInputFormat inputFormat, RowType rowType) + throws IOException { + return convertRowDataToRow(readRowData(inputFormat, rowType), rowType); + } + + public static List convertRowDataToRow(List rowDataList, RowType rowType) { + DataStructureConverter converter = + DataStructureConverters.getConverter(TypeConversions.fromLogicalToDataType(rowType)); + return rowDataList.stream() + .map(converter::toExternal) + .map(Row.class::cast) + .collect(Collectors.toList()); + } + + private static List convertRecordToRow(List expectedRecords, Schema schema) { + List expected = Lists.newArrayList(); + @SuppressWarnings("unchecked") + DataStructureConverter converter = + (DataStructureConverter) + DataStructureConverters.getConverter( + TypeConversions.fromLogicalToDataType(FlinkSchemaUtil.convert(schema))); + expectedRecords.forEach( + r -> expected.add(converter.toExternal(RowDataConverter.convert(schema, r)))); + return expected; + } + + public static void assertRecordsWithOrder( + List results, List expectedRecords, Schema schema) { + List expected = convertRecordToRow(expectedRecords, schema); + assertRowsWithOrder(results, expected); + } + + public static void assertRecords(List results, List expectedRecords, Schema schema) { + List expected = convertRecordToRow(expectedRecords, schema); + assertRows(results, expected); + } + + public static void assertRows(List results, List expected, RowType rowType) { + assertRows(convertRowDataToRow(results, rowType), convertRowDataToRow(expected, rowType)); + } + + public static void assertRows(List results, List expected) { + assertThat(results).containsExactlyInAnyOrderElementsOf(expected); + } + + public static void assertRowsWithOrder(List results, List expected) { + assertThat(results).containsExactlyElementsOf(expected); + } + + public static void assertRowData(Schema schema, StructLike expected, RowData actual) { + assertRowData(schema.asStruct(), FlinkSchemaUtil.convert(schema), expected, actual); + } + + public static void assertRowData( + Types.StructType structType, + LogicalType rowType, + StructLike expectedRecord, + RowData actualRowData) { + if (expectedRecord == null && actualRowData == null) { + return; + } + + assertThat(expectedRecord).isNotNull(); + assertThat(actualRowData).isNotNull(); + + List types = Lists.newArrayList(); + for (Types.NestedField field : structType.fields()) { + types.add(field.type()); + } + + for (int i = 0; i < types.size(); i += 1) { + LogicalType logicalType = ((RowType) rowType).getTypeAt(i); + Object expected = 
expectedRecord.get(i, Object.class); + // The RowData.createFieldGetter won't return null for the required field. But in the + // projection case, if we are + // projecting a nested required field from an optional struct, then we should give a null for + // the projected field + // if the outer struct value is null. So we need to check the nullable for actualRowData here. + // For more details + // please see issue #2738. + Object actual = + actualRowData.isNullAt(i) + ? null + : RowData.createFieldGetter(logicalType, i).getFieldOrNull(actualRowData); + assertEquals(types.get(i), logicalType, expected, actual); + } + } + + private static void assertEquals( + Type type, LogicalType logicalType, Object expected, Object actual) { + + if (expected == null && actual == null) { + return; + } + + assertThat(expected).isNotNull(); + assertThat(actual).isNotNull(); + + switch (type.typeId()) { + case BOOLEAN: + assertThat(actual).as("boolean value should be equal").isEqualTo(expected); + break; + case INTEGER: + assertThat(actual).as("int value should be equal").isEqualTo(expected); + break; + case LONG: + assertThat(actual).as("long value should be equal").isEqualTo(expected); + break; + case FLOAT: + assertThat(actual).as("float value should be equal").isEqualTo(expected); + break; + case DOUBLE: + assertThat(actual).as("double value should be equal").isEqualTo(expected); + break; + case STRING: + assertThat(expected).as("Should expect a CharSequence").isInstanceOf(CharSequence.class); + assertThat(actual.toString()) + .as("string should be equal") + .isEqualTo(String.valueOf(expected)); + break; + case DATE: + assertThat(expected).as("Should expect a Date").isInstanceOf(LocalDate.class); + LocalDate date = DateTimeUtil.dateFromDays((int) actual); + assertThat(date).as("date should be equal").isEqualTo(expected); + break; + case TIME: + assertThat(expected).as("Should expect a LocalTime").isInstanceOf(LocalTime.class); + int milliseconds = (int) (((LocalTime) expected).toNanoOfDay() / 1000_000); + assertThat(actual).as("time millis should be equal").isEqualTo(milliseconds); + break; + case TIMESTAMP: + if (((Types.TimestampType) type).shouldAdjustToUTC()) { + assertThat(expected) + .as("Should expect a OffsetDataTime") + .isInstanceOf(OffsetDateTime.class); + OffsetDateTime ts = (OffsetDateTime) expected; + assertThat(((TimestampData) actual).toLocalDateTime()) + .as("OffsetDataTime should be equal") + .isEqualTo(ts.toLocalDateTime()); + } else { + assertThat(expected) + .as("Should expect a LocalDataTime") + .isInstanceOf(LocalDateTime.class); + LocalDateTime ts = (LocalDateTime) expected; + assertThat(((TimestampData) actual).toLocalDateTime()) + .as("LocalDataTime should be equal") + .isEqualTo(ts); + } + break; + case BINARY: + assertThat(ByteBuffer.wrap((byte[]) actual)) + .as("Should expect a ByteBuffer") + .isInstanceOf(ByteBuffer.class) + .isEqualTo(expected); + break; + case DECIMAL: + assertThat(expected).as("Should expect a BigDecimal").isInstanceOf(BigDecimal.class); + BigDecimal bd = (BigDecimal) expected; + assertThat(((DecimalData) actual).toBigDecimal()) + .as("decimal value should be equal") + .isEqualTo(bd); + break; + case LIST: + assertThat(expected).as("Should expect a Collection").isInstanceOf(Collection.class); + Collection expectedArrayData = (Collection) expected; + ArrayData actualArrayData = (ArrayData) actual; + LogicalType elementType = ((ArrayType) logicalType).getElementType(); + assertThat(actualArrayData.size()) + .as("array length should be equal") + 
.isEqualTo(expectedArrayData.size()); + assertArrayValues( + type.asListType().elementType(), elementType, expectedArrayData, actualArrayData); + break; + case MAP: + assertThat(expected).as("Should expect a Map").isInstanceOf(Map.class); + assertMapValues(type.asMapType(), logicalType, (Map) expected, (MapData) actual); + break; + case STRUCT: + assertThat(expected).as("Should expect a Record").isInstanceOf(StructLike.class); + assertRowData(type.asStructType(), logicalType, (StructLike) expected, (RowData) actual); + break; + case UUID: + assertThat(expected).as("Should expect a UUID").isInstanceOf(UUID.class); + ByteBuffer bb = ByteBuffer.wrap((byte[]) actual); + long firstLong = bb.getLong(); + long secondLong = bb.getLong(); + assertThat(new UUID(firstLong, secondLong).toString()) + .as("UUID should be equal") + .isEqualTo(expected.toString()); + break; + case FIXED: + assertThat(actual) + .as("Should expect byte[]") + .isInstanceOf(byte[].class) + .isEqualTo(expected); + break; + default: + throw new IllegalArgumentException("Not a supported type: " + type); + } + } + + public static void assertEquals(Schema schema, List records, List rows) { + Streams.forEachPair( + records.stream(), rows.stream(), (record, row) -> assertEquals(schema, record, row)); + } + + public static void assertEquals(Schema schema, GenericData.Record record, Row row) { + List fields = schema.asStruct().fields(); + assertThat(fields).hasSameSizeAs(record.getSchema().getFields()); + assertThat(fields).hasSize(row.getArity()); + + RowType rowType = FlinkSchemaUtil.convert(schema); + for (int i = 0; i < fields.size(); ++i) { + Type fieldType = fields.get(i).type(); + Object expectedValue = record.get(i); + Object actualValue = row.getField(i); + LogicalType logicalType = rowType.getTypeAt(i); + assertAvroEquals(fieldType, logicalType, expectedValue, actualValue); + } + } + + private static void assertEquals(Types.StructType struct, GenericData.Record record, Row row) { + List fields = struct.fields(); + for (int i = 0; i < fields.size(); i += 1) { + Type fieldType = fields.get(i).type(); + Object expectedValue = record.get(i); + Object actualValue = row.getField(i); + assertAvroEquals(fieldType, null, expectedValue, actualValue); + } + } + + private static void assertAvroEquals( + Type type, LogicalType logicalType, Object expected, Object actual) { + + if (expected == null && actual == null) { + return; + } + assertThat(expected).isNotNull(); + assertThat(actual).isNotNull(); + + switch (type.typeId()) { + case BOOLEAN: + case INTEGER: + case LONG: + case FLOAT: + case DOUBLE: + assertThat(expected) + .as("Should expect a " + type.typeId().javaClass()) + .isInstanceOf(type.typeId().javaClass()); + assertThat(actual) + .as("Should expect a " + type.typeId().javaClass()) + .isInstanceOf(type.typeId().javaClass()); + assertThat(actual).as(type.typeId() + " value should be equal").isEqualTo(expected); + break; + case STRING: + assertThat(expected).as("Should expect a CharSequence").isInstanceOf(CharSequence.class); + assertThat(actual).as("Should expect a CharSequence").isInstanceOf(CharSequence.class); + assertThat(actual.toString()).as("string should be equal").isEqualTo(expected.toString()); + break; + case DATE: + assertThat(expected).as("Should expect a Date").isInstanceOf(LocalDate.class); + LocalDate date = DateTimeUtil.dateFromDays((int) actual); + assertThat(date).as("date should be equal").isEqualTo(expected); + break; + case TIME: + assertThat(expected).as("Should expect a 
LocalTime").isInstanceOf(LocalTime.class); + int milliseconds = (int) (((LocalTime) expected).toNanoOfDay() / 1000_000); + assertThat(actual).as("time millis should be equal").isEqualTo(milliseconds); + break; + case TIMESTAMP: + if (((Types.TimestampType) type).shouldAdjustToUTC()) { + assertThat(expected) + .as("Should expect a OffsetDataTime") + .isInstanceOf(OffsetDateTime.class); + OffsetDateTime ts = (OffsetDateTime) expected; + assertThat(((TimestampData) actual).toLocalDateTime()) + .as("OffsetDataTime should be equal") + .isEqualTo(ts.toLocalDateTime()); + } else { + assertThat(expected) + .as("Should expect a LocalDataTime") + .isInstanceOf(LocalDateTime.class); + LocalDateTime ts = (LocalDateTime) expected; + assertThat(((TimestampData) actual).toLocalDateTime()) + .as("LocalDataTime should be equal") + .isEqualTo(ts); + } + break; + case BINARY: + assertThat(ByteBuffer.wrap((byte[]) actual)) + .as("Should expect a ByteBuffer") + .isInstanceOf(ByteBuffer.class) + .isEqualTo(expected); + break; + case DECIMAL: + assertThat(expected).as("Should expect a BigDecimal").isInstanceOf(BigDecimal.class); + BigDecimal bd = (BigDecimal) expected; + assertThat(((DecimalData) actual).toBigDecimal()) + .as("decimal value should be equal") + .isEqualTo(bd); + break; + case LIST: + assertThat(expected).as("Should expect a Collection").isInstanceOf(Collection.class); + Collection expectedArrayData = (Collection) expected; + ArrayData actualArrayData; + try { + actualArrayData = (ArrayData) actual; + } catch (ClassCastException e) { + actualArrayData = new GenericArrayData((Object[]) actual); + } + LogicalType elementType = ((ArrayType) logicalType).getElementType(); + assertThat(actualArrayData.size()) + .as("array length should be equal") + .isEqualTo(expectedArrayData.size()); + assertArrayValues( + type.asListType().elementType(), elementType, expectedArrayData, actualArrayData); + break; + case MAP: + assertThat(expected).as("Should expect a Map").isInstanceOf(Map.class); + MapData actualMap; + try { + actualMap = (MapData) actual; + } catch (ClassCastException e) { + actualMap = new GenericMapData((Map) actual); + } + assertMapValues(type.asMapType(), logicalType, (Map) expected, actualMap); + break; + case STRUCT: + assertThat(expected).as("Should expect a Record").isInstanceOf(GenericData.Record.class); + assertEquals( + type.asNestedType().asStructType(), (GenericData.Record) expected, (Row) actual); + break; + case UUID: + assertThat(expected).as("Should expect a UUID").isInstanceOf(UUID.class); + ByteBuffer bb = ByteBuffer.wrap((byte[]) actual); + long firstLong = bb.getLong(); + long secondLong = bb.getLong(); + assertThat(new UUID(firstLong, secondLong).toString()) + .as("UUID should be equal") + .isEqualTo(expected.toString()); + break; + case FIXED: + assertThat(actual) + .as("Should expect byte[]") + .isInstanceOf(byte[].class) + .isEqualTo(expected); + break; + default: + throw new IllegalArgumentException("Not a supported type: " + type); + } + } + + private static void assertArrayValues( + Type type, LogicalType logicalType, Collection expectedArray, ArrayData actualArray) { + List expectedElements = Lists.newArrayList(expectedArray); + for (int i = 0; i < expectedArray.size(); i += 1) { + if (expectedElements.get(i) == null) { + assertThat(actualArray.isNullAt(i)).isTrue(); + continue; + } + + Object expected = expectedElements.get(i); + + assertEquals( + type, + logicalType, + expected, + ArrayData.createElementGetter(logicalType).getElementOrNull(actualArray, i)); + } + } + 
+ private static void assertMapValues( + Types.MapType mapType, LogicalType type, Map expected, MapData actual) { + assertThat(actual.size()).as("map size should be equal").isEqualTo(expected.size()); + + ArrayData actualKeyArrayData = actual.keyArray(); + ArrayData actualValueArrayData = actual.valueArray(); + LogicalType actualKeyType = ((MapType) type).getKeyType(); + LogicalType actualValueType = ((MapType) type).getValueType(); + Type keyType = mapType.keyType(); + Type valueType = mapType.valueType(); + + ArrayData.ElementGetter keyGetter = ArrayData.createElementGetter(actualKeyType); + ArrayData.ElementGetter valueGetter = ArrayData.createElementGetter(actualValueType); + + for (Map.Entry entry : expected.entrySet()) { + Object matchedActualKey = null; + int matchedKeyIndex = 0; + for (int i = 0; i < actual.size(); i += 1) { + try { + Object key = keyGetter.getElementOrNull(actualKeyArrayData, i); + assertEquals(keyType, actualKeyType, entry.getKey(), key); + matchedActualKey = key; + matchedKeyIndex = i; + break; + } catch (AssertionError e) { + // not found + } + } + assertThat(matchedActualKey).as("Should have a matching key").isNotNull(); + final int valueIndex = matchedKeyIndex; + assertEquals( + valueType, + actualValueType, + entry.getValue(), + valueGetter.getElementOrNull(actualValueArrayData, valueIndex)); + } + } + + public static void assertEquals(ManifestFile expected, ManifestFile actual) { + if (expected == actual) { + return; + } + assertThat(expected).isNotNull(); + assertThat(actual).isNotNull(); + assertThat(actual.path()).as("Path must match").isEqualTo(expected.path()); + assertThat(actual.length()).as("Length must match").isEqualTo(expected.length()); + assertThat(actual.partitionSpecId()) + .as("Spec id must match") + .isEqualTo(expected.partitionSpecId()); + assertThat(actual.content()).as("ManifestContent must match").isEqualTo(expected.content()); + assertThat(actual.sequenceNumber()) + .as("SequenceNumber must match") + .isEqualTo(expected.sequenceNumber()); + assertThat(actual.minSequenceNumber()) + .as("MinSequenceNumber must match") + .isEqualTo(expected.minSequenceNumber()); + assertThat(actual.snapshotId()).as("Snapshot id must match").isEqualTo(expected.snapshotId()); + assertThat(actual.hasAddedFiles()) + .as("Added files flag must match") + .isEqualTo(expected.hasAddedFiles()); + assertThat(actual.addedFilesCount()) + .as("Added files count must match") + .isEqualTo(expected.addedFilesCount()); + assertThat(actual.addedRowsCount()) + .as("Added rows count must match") + .isEqualTo(expected.addedRowsCount()); + assertThat(actual.hasExistingFiles()) + .as("Existing files flag must match") + .isEqualTo(expected.hasExistingFiles()); + assertThat(actual.existingFilesCount()) + .as("Existing files count must match") + .isEqualTo(expected.existingFilesCount()); + assertThat(actual.existingRowsCount()) + .as("Existing rows count must match") + .isEqualTo(expected.existingRowsCount()); + assertThat(actual.hasDeletedFiles()) + .as("Deleted files flag must match") + .isEqualTo(expected.hasDeletedFiles()); + assertThat(actual.deletedFilesCount()) + .as("Deleted files count must match") + .isEqualTo(expected.deletedFilesCount()); + assertThat(actual.deletedRowsCount()) + .as("Deleted rows count must match") + .isEqualTo(expected.deletedRowsCount()); + + List expectedSummaries = expected.partitions(); + List actualSummaries = actual.partitions(); + assertThat(actualSummaries) + .as("PartitionFieldSummary size does not match") + 
.hasSameSizeAs(expectedSummaries); + for (int i = 0; i < expectedSummaries.size(); i++) { + assertThat(actualSummaries.get(i).containsNull()) + .as("Null flag in partition must match") + .isEqualTo(expectedSummaries.get(i).containsNull()); + assertThat(actualSummaries.get(i).containsNaN()) + .as("NaN flag in partition must match") + .isEqualTo(expectedSummaries.get(i).containsNaN()); + assertThat(actualSummaries.get(i).lowerBound()) + .as("Lower bounds in partition must match") + .isEqualTo(expectedSummaries.get(i).lowerBound()); + assertThat(actualSummaries.get(i).upperBound()) + .as("Upper bounds in partition must match") + .isEqualTo(expectedSummaries.get(i).upperBound()); + } + } + + public static void assertEquals(ContentFile expected, ContentFile actual) { + if (expected == actual) { + return; + } + assertThat(expected).isNotNull(); + assertThat(actual).isNotNull(); + assertThat(actual.specId()).as("SpecId").isEqualTo(expected.specId()); + assertThat(actual.content()).as("Content").isEqualTo(expected.content()); + assertThat(actual.path()).as("Path").isEqualTo(expected.path()); + assertThat(actual.format()).as("Format").isEqualTo(expected.format()); + assertThat(actual.partition().size()) + .as("Partition size") + .isEqualTo(expected.partition().size()); + for (int i = 0; i < expected.partition().size(); i++) { + assertThat(actual.partition().get(i, Object.class)) + .as("Partition data at index " + i) + .isEqualTo(expected.partition().get(i, Object.class)); + } + assertThat(actual.recordCount()).as("Record count").isEqualTo(expected.recordCount()); + assertThat(actual.fileSizeInBytes()) + .as("File size in bytes") + .isEqualTo(expected.fileSizeInBytes()); + assertThat(actual.columnSizes()).as("Column sizes").isEqualTo(expected.columnSizes()); + assertThat(actual.valueCounts()).as("Value counts").isEqualTo(expected.valueCounts()); + assertThat(actual.nullValueCounts()) + .as("Null value counts") + .isEqualTo(expected.nullValueCounts()); + assertThat(actual.lowerBounds()).as("Lower bounds").isEqualTo(expected.lowerBounds()); + assertThat(actual.upperBounds()).as("Upper bounds").isEqualTo(expected.upperBounds()); + assertThat(actual.keyMetadata()).as("Key metadata").isEqualTo(expected.keyMetadata()); + assertThat(actual.splitOffsets()).as("Split offsets").isEqualTo(expected.splitOffsets()); + assertThat(actual.equalityFieldIds()) + .as("Equality field id list") + .isEqualTo(expected.equalityFieldIds()); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestIcebergConnector.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestIcebergConnector.java new file mode 100644 index 000000000000..47f5485df879 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestIcebergConnector.java @@ -0,0 +1,331 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.nio.file.Files; +import java.util.Map; +import org.apache.flink.configuration.CoreOptions; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.EnvironmentSettings; +import org.apache.flink.table.api.TableEnvironment; +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; +import org.apache.flink.table.catalog.Catalog; +import org.apache.flink.table.catalog.ObjectPath; +import org.apache.flink.types.Row; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.metastore.HiveMetaStoreClient; +import org.apache.iceberg.CatalogProperties; +import org.apache.iceberg.Parameter; +import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.thrift.TException; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.extension.ExtendWith; + +@ExtendWith(ParameterizedTestExtension.class) +public class TestIcebergConnector extends TestBase { + + private static final String TABLE_NAME = "test_table"; + + @Parameter(index = 0) + private String catalogName; + + @Parameter(index = 1) + private Map properties; + + @Parameter(index = 2) + private boolean isStreaming; + + private volatile TableEnvironment tEnv; + + @Parameters(name = "catalogName = {0}, properties = {1}, isStreaming = {2}") + public static Iterable parameters() { + return Lists.newArrayList( + // Create iceberg table in the hadoop catalog and default database. + new Object[] { + "testhadoop", + ImmutableMap.of( + "connector", "iceberg", + "catalog-type", "hadoop"), + true + }, + new Object[] { + "testhadoop", + ImmutableMap.of( + "connector", "iceberg", + "catalog-type", "hadoop", + "catalog-table", "not_existing_table"), + true + }, + new Object[] { + "testhadoop", + ImmutableMap.of( + "connector", "iceberg", + "catalog-type", "hadoop"), + false + }, + // Create iceberg table in the hadoop catalog and not_existing_db. + new Object[] { + "testhadoop", + ImmutableMap.of( + "connector", "iceberg", + "catalog-type", "hadoop", + "catalog-database", "not_existing_db"), + true + }, + new Object[] { + "testhadoop", + ImmutableMap.of( + "connector", "iceberg", + "catalog-type", "hadoop", + "catalog-database", "not_existing_db", + "catalog-table", "not_existing_table"), + true + }, + new Object[] { + "testhadoop", + ImmutableMap.of( + "connector", "iceberg", + "catalog-type", "hadoop", + "catalog-database", "not_existing_db"), + false + }, + // Create iceberg table in the hive catalog and default database. 
+ new Object[] { + "testhive", + ImmutableMap.of( + "connector", "iceberg", + "catalog-type", "hive"), + true + }, + new Object[] { + "testhive", + ImmutableMap.of( + "connector", "iceberg", + "catalog-type", "hive", + "catalog-table", "not_existing_table"), + true + }, + new Object[] { + "testhive", + ImmutableMap.of( + "connector", "iceberg", + "catalog-type", "hive"), + false + }, + // Create iceberg table in the hive catalog and not_existing_db. + new Object[] { + "testhive", + ImmutableMap.of( + "connector", "iceberg", + "catalog-type", "hive", + "catalog-database", "not_existing_db"), + true + }, + new Object[] { + "testhive", + ImmutableMap.of( + "connector", "iceberg", + "catalog-type", "hive", + "catalog-database", "not_existing_db", + "catalog-table", "not_existing_table"), + true + }, + new Object[] { + "testhive", + ImmutableMap.of( + "connector", "iceberg", + "catalog-type", "hive", + "catalog-database", "not_existing_db"), + false + }); + } + + @Override + protected TableEnvironment getTableEnv() { + if (tEnv == null) { + synchronized (this) { + if (tEnv == null) { + EnvironmentSettings.Builder settingsBuilder = EnvironmentSettings.newInstance(); + if (isStreaming) { + settingsBuilder.inStreamingMode(); + StreamExecutionEnvironment env = + StreamExecutionEnvironment.getExecutionEnvironment( + MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG); + env.enableCheckpointing(400); + env.setMaxParallelism(2); + env.setParallelism(2); + tEnv = StreamTableEnvironment.create(env, settingsBuilder.build()); + } else { + settingsBuilder.inBatchMode(); + tEnv = TableEnvironment.create(settingsBuilder.build()); + } + // Set only one parallelism. + tEnv.getConfig() + .getConfiguration() + .set(CoreOptions.DEFAULT_PARALLELISM, 1) + .set(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM, false); + } + } + } + return tEnv; + } + + @AfterEach + public void after() throws TException { + sql("DROP TABLE IF EXISTS %s", TABLE_NAME); + + // Clean the created orphan databases and tables from hive-metastore. + if (isHiveCatalog()) { + HiveMetaStoreClient metaStoreClient = new HiveMetaStoreClient(hiveConf); + try { + metaStoreClient.dropTable(databaseName(), tableName()); + if (!isDefaultDatabaseName()) { + try { + metaStoreClient.dropDatabase(databaseName()); + } catch (Exception ignored) { + // Ignore + } + } + } finally { + metaStoreClient.close(); + } + } + } + + private void testCreateConnectorTable() { + Map tableProps = createTableProps(); + + // Create table under the flink's current database. + sql("CREATE TABLE %s (id BIGINT, data STRING) WITH %s", TABLE_NAME, toWithClause(tableProps)); + sql("INSERT INTO %s VALUES (1, 'AAA'), (2, 'BBB'), (3, 'CCC')", TABLE_NAME); + assertThat(sql("SELECT * FROM %s", TABLE_NAME)) + .containsExactlyInAnyOrder(Row.of(1L, "AAA"), Row.of(2L, "BBB"), Row.of(3L, "CCC")); + + FlinkCatalogFactory factory = new FlinkCatalogFactory(); + Catalog flinkCatalog = factory.createCatalog(catalogName, tableProps, new Configuration()); + assertThat(flinkCatalog.databaseExists(databaseName())).isTrue(); + assertThat(flinkCatalog.tableExists(new ObjectPath(databaseName(), tableName()))).isTrue(); + + // Drop and create it again. 
+ sql("DROP TABLE %s", TABLE_NAME); + sql("CREATE TABLE %s (id BIGINT, data STRING) WITH %s", TABLE_NAME, toWithClause(tableProps)); + assertThat(sql("SELECT * FROM %s", TABLE_NAME)) + .containsExactlyInAnyOrder(Row.of(1L, "AAA"), Row.of(2L, "BBB"), Row.of(3L, "CCC")); + } + + @TestTemplate + public void testCreateTableUnderDefaultDatabase() { + testCreateConnectorTable(); + } + + @TestTemplate + public void testCatalogDatabaseConflictWithFlinkDatabase() { + sql("CREATE DATABASE IF NOT EXISTS `%s`", databaseName()); + sql("USE `%s`", databaseName()); + testCreateConnectorTable(); + // Ensure that the table was created under the specific database. + assertThatThrownBy( + () -> sql("CREATE TABLE `default_catalog`.`%s`.`%s`", databaseName(), TABLE_NAME)) + .isInstanceOf(org.apache.flink.table.api.TableException.class) + .hasMessageStartingWith("Could not execute CreateTable in path"); + } + + @TestTemplate + public void testConnectorTableInIcebergCatalog() { + // Create the catalog properties + Map catalogProps = Maps.newHashMap(); + catalogProps.put("type", "iceberg"); + if (isHiveCatalog()) { + catalogProps.put("catalog-type", "hive"); + catalogProps.put(CatalogProperties.URI, CatalogTestBase.getURI(hiveConf)); + } else { + catalogProps.put("catalog-type", "hadoop"); + } + catalogProps.put(CatalogProperties.WAREHOUSE_LOCATION, createWarehouse()); + + // Create the table properties + Map tableProps = createTableProps(); + + // Create a connector table in an iceberg catalog. + sql("CREATE CATALOG `test_catalog` WITH %s", toWithClause(catalogProps)); + try { + assertThatThrownBy( + () -> + sql( + "CREATE TABLE `test_catalog`.`%s`.`%s` (id BIGINT, data STRING) WITH %s", + FlinkCatalogFactory.DEFAULT_DATABASE_NAME, + TABLE_NAME, + toWithClause(tableProps))) + .cause() + .isInstanceOf(IllegalArgumentException.class) + .hasMessage( + "Cannot create the table with 'connector'='iceberg' table property in an iceberg catalog, " + + "Please create table with 'connector'='iceberg' property in a non-iceberg catalog or " + + "create table without 'connector'='iceberg' related properties in an iceberg table."); + } finally { + sql("DROP CATALOG IF EXISTS `test_catalog`"); + } + } + + private Map createTableProps() { + Map tableProps = Maps.newHashMap(properties); + tableProps.put("catalog-name", catalogName); + tableProps.put(CatalogProperties.WAREHOUSE_LOCATION, createWarehouse()); + if (isHiveCatalog()) { + tableProps.put(CatalogProperties.URI, CatalogTestBase.getURI(hiveConf)); + } + return tableProps; + } + + private boolean isHiveCatalog() { + return "testhive".equalsIgnoreCase(catalogName); + } + + private boolean isDefaultDatabaseName() { + return FlinkCatalogFactory.DEFAULT_DATABASE_NAME.equalsIgnoreCase(databaseName()); + } + + private String tableName() { + return properties.getOrDefault("catalog-table", TABLE_NAME); + } + + private String databaseName() { + return properties.getOrDefault("catalog-database", "default_database"); + } + + private String createWarehouse() { + try { + return String.format( + "file://%s", + Files.createTempDirectory(temporaryDirectory, "junit").toFile().getAbsolutePath()); + } catch (IOException e) { + throw new UncheckedIOException(e); + } + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestManifestFileSerialization.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestManifestFileSerialization.java new file mode 100644 index 000000000000..8f1f129e183b --- /dev/null +++ 
b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestManifestFileSerialization.java @@ -0,0 +1,173 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink; + +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.IOException; +import java.io.ObjectInputStream; +import java.io.ObjectOutputStream; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.nio.file.Path; +import org.apache.flink.api.common.ExecutionConfig; +import org.apache.flink.api.java.typeutils.runtime.kryo.KryoSerializer; +import org.apache.flink.core.memory.DataInputDeserializer; +import org.apache.flink.core.memory.DataOutputSerializer; +import org.apache.hadoop.conf.Configuration; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.DataFiles; +import org.apache.iceberg.GenericManifestFile; +import org.apache.iceberg.ManifestFile; +import org.apache.iceberg.ManifestFiles; +import org.apache.iceberg.ManifestWriter; +import org.apache.iceberg.Metrics; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.hadoop.HadoopFileIO; +import org.apache.iceberg.io.FileIO; +import org.apache.iceberg.io.OutputFile; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +public class TestManifestFileSerialization { + + private static final Schema SCHEMA = + new Schema( + required(1, "id", Types.LongType.get()), + optional(2, "data", Types.StringType.get()), + required(3, "date", Types.StringType.get()), + required(4, "double", Types.DoubleType.get())); + + private static final PartitionSpec SPEC = + PartitionSpec.builderFor(SCHEMA).identity("double").build(); + + private static final DataFile FILE_A = + DataFiles.builder(SPEC) + .withPath("/path/to/data-1.parquet") + .withFileSizeInBytes(0) + .withPartition(org.apache.iceberg.TestHelpers.Row.of(1D)) + .withPartitionPath("double=1") + .withMetrics( + new Metrics( + 5L, + null, // no column sizes + ImmutableMap.of(1, 5L, 2, 3L), // value count + ImmutableMap.of(1, 0L, 2, 2L), // null count + ImmutableMap.of(), // nan count + ImmutableMap.of(1, longToBuffer(0L)), // lower bounds + ImmutableMap.of(1, longToBuffer(4L)) // upper bounds + )) + .build(); + + private static final DataFile FILE_B = + DataFiles.builder(SPEC) + .withPath("/path/to/data-2.parquet") + .withFileSizeInBytes(0) + 
.withPartition(org.apache.iceberg.TestHelpers.Row.of(Double.NaN)) + .withPartitionPath("double=NaN") + .withMetrics( + new Metrics( + 1L, + null, // no column sizes + ImmutableMap.of(1, 1L, 4, 1L), // value count + ImmutableMap.of(1, 0L, 2, 0L), // null count + ImmutableMap.of(4, 1L), // nan count + ImmutableMap.of(1, longToBuffer(0L)), // lower bounds + ImmutableMap.of(1, longToBuffer(1L)) // upper bounds + )) + .build(); + + private static final FileIO FILE_IO = new HadoopFileIO(new Configuration()); + + @TempDir private Path temp; + + @Test + public void testKryoSerialization() throws IOException { + KryoSerializer kryo = + new KryoSerializer<>(ManifestFile.class, new ExecutionConfig()); + + DataOutputSerializer outputView = new DataOutputSerializer(1024); + + ManifestFile manifest = writeManifest(FILE_A, FILE_B); + + kryo.serialize(manifest, outputView); + kryo.serialize(manifest.copy(), outputView); + kryo.serialize(GenericManifestFile.copyOf(manifest).build(), outputView); + + DataInputDeserializer inputView = new DataInputDeserializer(outputView.getCopyOfBuffer()); + ManifestFile m1 = kryo.deserialize(inputView); + ManifestFile m2 = kryo.deserialize(inputView); + ManifestFile m3 = kryo.deserialize(inputView); + + TestHelpers.assertEquals(manifest, m1); + TestHelpers.assertEquals(manifest, m2); + TestHelpers.assertEquals(manifest, m3); + } + + @Test + public void testJavaSerialization() throws Exception { + ByteArrayOutputStream bytes = new ByteArrayOutputStream(); + + ManifestFile manifest = writeManifest(FILE_A, FILE_B); + + try (ObjectOutputStream out = new ObjectOutputStream(bytes)) { + out.writeObject(manifest); + out.writeObject(manifest.copy()); + out.writeObject(GenericManifestFile.copyOf(manifest).build()); + } + + try (ObjectInputStream in = + new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) { + for (int i = 0; i < 3; i += 1) { + Object obj = in.readObject(); + assertThat(obj).as("Should be a ManifestFile").isInstanceOf(ManifestFile.class); + TestHelpers.assertEquals(manifest, (ManifestFile) obj); + } + } + } + + private ManifestFile writeManifest(DataFile... files) throws IOException { + File manifestFile = File.createTempFile("input", "m0.avro", temp.toFile()); + assertThat(manifestFile.delete()).isTrue(); + OutputFile outputFile = FILE_IO.newOutputFile(manifestFile.getCanonicalPath()); + + ManifestWriter writer = ManifestFiles.write(SPEC, outputFile); + try { + for (DataFile file : files) { + writer.add(file); + } + } finally { + writer.close(); + } + + return writer.toManifestFile(); + } + + private static ByteBuffer longToBuffer(long value) { + return ByteBuffer.allocate(8).order(ByteOrder.LITTLE_ENDIAN).putLong(0, value); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestRowDataWrapper.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestRowDataWrapper.java new file mode 100644 index 000000000000..0af49e9e2365 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestRowDataWrapper.java @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.util.Iterator; +import org.apache.flink.table.data.RowData; +import org.apache.iceberg.RecordWrapperTest; +import org.apache.iceberg.Schema; +import org.apache.iceberg.StructLike; +import org.apache.iceberg.data.InternalRecordWrapper; +import org.apache.iceberg.data.RandomGenericData; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.data.RandomRowData; +import org.apache.iceberg.util.StructLikeWrapper; + +public class TestRowDataWrapper extends RecordWrapperTest { + + /** + * Flink's time type has been truncated to millis seconds, so we need a customized assert method + * to check the values. + */ + @Override + public void testTime() { + generateAndValidate( + new Schema(TIME.fields()), + (message, expectedWrapper, actualWrapper) -> { + for (int pos = 0; pos < TIME.fields().size(); pos++) { + Object expected = expectedWrapper.get().get(pos, Object.class); + Object actual = actualWrapper.get().get(pos, Object.class); + if (expected == actual) { + return; + } + + assertThat(actual).isNotNull(); + assertThat(expected).isNotNull(); + + int expectedMilliseconds = (int) ((long) expected / 1000_000); + int actualMilliseconds = (int) ((long) actual / 1000_000); + assertThat(actualMilliseconds).as(message).isEqualTo(expectedMilliseconds); + } + }); + } + + @Override + protected void generateAndValidate(Schema schema, RecordWrapperTest.AssertMethod assertMethod) { + int numRecords = 100; + Iterable recordList = RandomGenericData.generate(schema, numRecords, 101L); + Iterable rowDataList = RandomRowData.generate(schema, numRecords, 101L); + + InternalRecordWrapper recordWrapper = new InternalRecordWrapper(schema.asStruct()); + RowDataWrapper rowDataWrapper = + new RowDataWrapper(FlinkSchemaUtil.convert(schema), schema.asStruct()); + + Iterator actual = recordList.iterator(); + Iterator expected = rowDataList.iterator(); + + StructLikeWrapper actualWrapper = StructLikeWrapper.forType(schema.asStruct()); + StructLikeWrapper expectedWrapper = StructLikeWrapper.forType(schema.asStruct()); + for (int i = 0; i < numRecords; i++) { + assertThat(actual).hasNext(); + assertThat(expected).hasNext(); + + StructLike recordStructLike = recordWrapper.wrap(actual.next()); + StructLike rowDataStructLike = rowDataWrapper.wrap(expected.next()); + + assertMethod.assertEquals( + "Should have expected StructLike values", + expectedWrapper.set(rowDataStructLike), + actualWrapper.set(recordStructLike)); + } + + assertThat(actual).isExhausted(); + assertThat(expected).isExhausted(); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestTableLoader.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestTableLoader.java new file mode 100644 index 000000000000..a7c58e551112 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestTableLoader.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink; + +import java.io.File; +import org.apache.iceberg.Table; +import org.apache.iceberg.TestTables; + +public class TestTableLoader implements TableLoader { + private final File dir; + + public static TableLoader of(String dir) { + return new TestTableLoader(dir); + } + + public TestTableLoader(String dir) { + this.dir = new File(dir); + } + + @Override + public void open() {} + + @Override + public boolean isOpen() { + return true; + } + + @Override + public Table loadTable() { + return TestTables.load(dir, "test"); + } + + @Override + @SuppressWarnings({"checkstyle:NoClone", "checkstyle:SuperClone"}) + public TableLoader clone() { + return new TestTableLoader(dir.getAbsolutePath()); + } + + @Override + public void close() {} +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestTableSerialization.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestTableSerialization.java new file mode 100644 index 000000000000..7f0e7acaa822 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/TestTableSerialization.java @@ -0,0 +1,110 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink; + +import static org.apache.iceberg.flink.TestHelpers.roundTripKryoSerialize; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Path; +import java.util.Map; +import org.apache.iceberg.HasTableOperations; +import org.apache.iceberg.MetadataTableType; +import org.apache.iceberg.MetadataTableUtils; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SerializableTable; +import org.apache.iceberg.SortOrder; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableOperations; +import org.apache.iceberg.TestHelpers; +import org.apache.iceberg.Transaction; +import org.apache.iceberg.hadoop.HadoopTables; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +public class TestTableSerialization { + private static final HadoopTables TABLES = new HadoopTables(); + + private static final Schema SCHEMA = + new Schema( + required(1, "id", Types.LongType.get()), + optional(2, "data", Types.StringType.get()), + required(3, "date", Types.StringType.get()), + optional(4, "double", Types.DoubleType.get())); + + private static final PartitionSpec SPEC = + PartitionSpec.builderFor(SCHEMA).identity("date").build(); + + private static final SortOrder SORT_ORDER = SortOrder.builderFor(SCHEMA).asc("id").build(); + + @TempDir private Path temp; + private Table table; + + @BeforeEach + public void initTable() throws IOException { + Map props = ImmutableMap.of("k1", "v1", "k2", "v2"); + + File tableLocation = File.createTempFile("junit", null, temp.toFile()); + assertThat(tableLocation.delete()).isTrue(); + + this.table = TABLES.create(SCHEMA, SPEC, SORT_ORDER, props, tableLocation.toString()); + } + + @Test + public void testSerializableTableKryoSerialization() throws IOException { + SerializableTable serializableTable = (SerializableTable) SerializableTable.copyOf(table); + TestHelpers.assertSerializedAndLoadedMetadata( + table, roundTripKryoSerialize(SerializableTable.class, serializableTable)); + } + + @Test + public void testSerializableMetadataTableKryoSerialization() throws IOException { + for (MetadataTableType type : MetadataTableType.values()) { + TableOperations ops = ((HasTableOperations) table).operations(); + Table metadataTable = + MetadataTableUtils.createMetadataTableInstance(ops, table.name(), "meta", type); + SerializableTable serializableMetadataTable = + (SerializableTable) SerializableTable.copyOf(metadataTable); + + TestHelpers.assertSerializedAndLoadedMetadata( + metadataTable, + roundTripKryoSerialize(SerializableTable.class, serializableMetadataTable)); + } + } + + @Test + public void testSerializableTransactionTableKryoSerialization() throws IOException { + Transaction txn = table.newTransaction(); + + txn.updateProperties().set("k1", "v1").commit(); + + Table txnTable = txn.table(); + SerializableTable serializableTxnTable = (SerializableTable) SerializableTable.copyOf(txnTable); + + TestHelpers.assertSerializedMetadata( + txnTable, roundTripKryoSerialize(SerializableTable.class, serializableTxnTable)); + } +} diff --git 
a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/actions/TestRewriteDataFilesAction.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/actions/TestRewriteDataFilesAction.java new file mode 100644 index 000000000000..6b8399f666d4 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/actions/TestRewriteDataFilesAction.java @@ -0,0 +1,481 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.actions; + +import static org.apache.iceberg.flink.SimpleDataUtil.RECORD; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Path; +import java.util.Collection; +import java.util.List; +import java.util.Set; +import java.util.UUID; +import java.util.stream.Collectors; +import org.apache.commons.lang3.StringUtils; +import org.apache.flink.configuration.CoreOptions; +import org.apache.flink.table.api.TableEnvironment; +import org.apache.flink.types.Row; +import org.apache.iceberg.ContentFile; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.DataFiles; +import org.apache.iceberg.DeleteFile; +import org.apache.iceberg.FileContent; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.FileScanTask; +import org.apache.iceberg.Files; +import org.apache.iceberg.Parameter; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Table; +import org.apache.iceberg.actions.RewriteDataFilesActionResult; +import org.apache.iceberg.catalog.Namespace; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.data.GenericAppenderFactory; +import org.apache.iceberg.data.GenericRecord; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.exceptions.ValidationException; +import org.apache.iceberg.expressions.Expressions; +import org.apache.iceberg.flink.CatalogTestBase; +import org.apache.iceberg.flink.SimpleDataUtil; +import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.io.FileAppender; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.Iterables; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.util.Pair; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.io.TempDir; + +public class TestRewriteDataFilesAction extends CatalogTestBase { + + private static final String TABLE_NAME_UNPARTITIONED = "test_table_unpartitioned"; + private static 
final String TABLE_NAME_PARTITIONED = "test_table_partitioned"; + private static final String TABLE_NAME_WITH_PK = "test_table_with_pk"; + + @Parameter(index = 2) + private FileFormat format; + + private Table icebergTableUnPartitioned; + private Table icebergTablePartitioned; + private Table icebergTableWithPk; + + @Override + protected TableEnvironment getTableEnv() { + super.getTableEnv().getConfig().getConfiguration().set(CoreOptions.DEFAULT_PARALLELISM, 1); + return super.getTableEnv(); + } + + @Parameters(name = "catalogName={0}, baseNamespace={1}, format={2}") + public static List parameters() { + List parameters = Lists.newArrayList(); + for (FileFormat format : + new FileFormat[] {FileFormat.AVRO, FileFormat.ORC, FileFormat.PARQUET}) { + for (Object[] catalogParams : CatalogTestBase.parameters()) { + String catalogName = (String) catalogParams[0]; + Namespace baseNamespace = (Namespace) catalogParams[1]; + parameters.add(new Object[] {catalogName, baseNamespace, format}); + } + } + return parameters; + } + + private @TempDir Path temp; + + @Override + @BeforeEach + public void before() { + super.before(); + sql("CREATE DATABASE %s", flinkDatabase); + sql("USE CATALOG %s", catalogName); + sql("USE %s", DATABASE); + sql( + "CREATE TABLE %s (id int, data varchar) with ('write.format.default'='%s')", + TABLE_NAME_UNPARTITIONED, format.name()); + icebergTableUnPartitioned = + validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME_UNPARTITIONED)); + + sql( + "CREATE TABLE %s (id int, data varchar,spec varchar) " + + " PARTITIONED BY (data,spec) with ('write.format.default'='%s')", + TABLE_NAME_PARTITIONED, format.name()); + icebergTablePartitioned = + validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME_PARTITIONED)); + + sql( + "CREATE TABLE %s (id int, data varchar, PRIMARY KEY(`id`) NOT ENFORCED) with ('write.format.default'='%s', 'format-version'='2')", + TABLE_NAME_WITH_PK, format.name()); + icebergTableWithPk = + validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME_WITH_PK)); + } + + @Override + @AfterEach + public void clean() { + sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, TABLE_NAME_UNPARTITIONED); + sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, TABLE_NAME_PARTITIONED); + sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, TABLE_NAME_WITH_PK); + dropDatabase(flinkDatabase, true); + super.clean(); + } + + @TestTemplate + public void testRewriteDataFilesEmptyTable() throws Exception { + assertThat(icebergTableUnPartitioned.currentSnapshot()).isNull(); + Actions.forTable(icebergTableUnPartitioned).rewriteDataFiles().execute(); + assertThat(icebergTableUnPartitioned.currentSnapshot()).isNull(); + } + + @TestTemplate + public void testRewriteDataFilesUnpartitionedTable() throws Exception { + sql("INSERT INTO %s SELECT 1, 'hello'", TABLE_NAME_UNPARTITIONED); + sql("INSERT INTO %s SELECT 2, 'world'", TABLE_NAME_UNPARTITIONED); + + icebergTableUnPartitioned.refresh(); + + CloseableIterable tasks = icebergTableUnPartitioned.newScan().planFiles(); + List dataFiles = + Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); + assertThat(dataFiles).hasSize(2); + RewriteDataFilesActionResult result = + Actions.forTable(icebergTableUnPartitioned).rewriteDataFiles().execute(); + + assertThat(result.deletedDataFiles()).hasSize(2); + assertThat(result.addedDataFiles()).hasSize(1); + + icebergTableUnPartitioned.refresh(); + + CloseableIterable tasks1 = icebergTableUnPartitioned.newScan().planFiles(); + 
List dataFiles1 = + Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file)); + assertThat(dataFiles1).hasSize(1); + // Assert the table records as expected. + SimpleDataUtil.assertTableRecords( + icebergTableUnPartitioned, + Lists.newArrayList( + SimpleDataUtil.createRecord(1, "hello"), SimpleDataUtil.createRecord(2, "world"))); + } + + @TestTemplate + public void testRewriteDataFilesPartitionedTable() throws Exception { + sql("INSERT INTO %s SELECT 1, 'hello' ,'a'", TABLE_NAME_PARTITIONED); + sql("INSERT INTO %s SELECT 2, 'hello' ,'a'", TABLE_NAME_PARTITIONED); + sql("INSERT INTO %s SELECT 3, 'world' ,'b'", TABLE_NAME_PARTITIONED); + sql("INSERT INTO %s SELECT 4, 'world' ,'b'", TABLE_NAME_PARTITIONED); + + icebergTablePartitioned.refresh(); + + CloseableIterable tasks = icebergTablePartitioned.newScan().planFiles(); + List dataFiles = + Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); + assertThat(dataFiles).hasSize(4); + RewriteDataFilesActionResult result = + Actions.forTable(icebergTablePartitioned).rewriteDataFiles().execute(); + + assertThat(result.deletedDataFiles()).hasSize(4); + assertThat(result.addedDataFiles()).hasSize(2); + + icebergTablePartitioned.refresh(); + + CloseableIterable tasks1 = icebergTablePartitioned.newScan().planFiles(); + List dataFiles1 = + Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file)); + assertThat(dataFiles1).hasSize(2); + // Assert the table records as expected. + Schema schema = + new Schema( + Types.NestedField.optional(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get()), + Types.NestedField.optional(3, "spec", Types.StringType.get())); + + Record record = GenericRecord.create(schema); + SimpleDataUtil.assertTableRecords( + icebergTablePartitioned, + Lists.newArrayList( + record.copy("id", 1, "data", "hello", "spec", "a"), + record.copy("id", 2, "data", "hello", "spec", "a"), + record.copy("id", 3, "data", "world", "spec", "b"), + record.copy("id", 4, "data", "world", "spec", "b"))); + } + + @TestTemplate + public void testRewriteDataFilesWithFilter() throws Exception { + sql("INSERT INTO %s SELECT 1, 'hello' ,'a'", TABLE_NAME_PARTITIONED); + sql("INSERT INTO %s SELECT 2, 'hello' ,'a'", TABLE_NAME_PARTITIONED); + sql("INSERT INTO %s SELECT 3, 'world' ,'a'", TABLE_NAME_PARTITIONED); + sql("INSERT INTO %s SELECT 4, 'world' ,'b'", TABLE_NAME_PARTITIONED); + sql("INSERT INTO %s SELECT 5, 'world' ,'b'", TABLE_NAME_PARTITIONED); + + icebergTablePartitioned.refresh(); + + CloseableIterable tasks = icebergTablePartitioned.newScan().planFiles(); + List dataFiles = + Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); + assertThat(dataFiles).hasSize(5); + RewriteDataFilesActionResult result = + Actions.forTable(icebergTablePartitioned) + .rewriteDataFiles() + .filter(Expressions.equal("spec", "a")) + .filter(Expressions.startsWith("data", "he")) + .execute(); + assertThat(result.deletedDataFiles()).hasSize(2); + assertThat(result.addedDataFiles()).hasSize(1); + + icebergTablePartitioned.refresh(); + + CloseableIterable tasks1 = icebergTablePartitioned.newScan().planFiles(); + List dataFiles1 = + Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file)); + assertThat(dataFiles1).hasSize(4); + // Assert the table records as expected. 
+ Schema schema = + new Schema( + Types.NestedField.optional(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get()), + Types.NestedField.optional(3, "spec", Types.StringType.get())); + + Record record = GenericRecord.create(schema); + SimpleDataUtil.assertTableRecords( + icebergTablePartitioned, + Lists.newArrayList( + record.copy("id", 1, "data", "hello", "spec", "a"), + record.copy("id", 2, "data", "hello", "spec", "a"), + record.copy("id", 3, "data", "world", "spec", "a"), + record.copy("id", 4, "data", "world", "spec", "b"), + record.copy("id", 5, "data", "world", "spec", "b"))); + } + + @TestTemplate + public void testRewriteLargeTableHasResiduals() throws IOException { + // all records belong to the same partition + List records1 = Lists.newArrayList(); + List records2 = Lists.newArrayList(); + List expected = Lists.newArrayList(); + for (int i = 0; i < 100; i++) { + int id = i; + String data = String.valueOf(i % 3); + if (i % 2 == 0) { + records1.add("(" + id + ",'" + data + "')"); + } else { + records2.add("(" + id + ",'" + data + "')"); + } + Record record = RECORD.copy(); + record.setField("id", id); + record.setField("data", data); + expected.add(record); + } + + sql("INSERT INTO %s values " + StringUtils.join(records1, ","), TABLE_NAME_UNPARTITIONED); + sql("INSERT INTO %s values " + StringUtils.join(records2, ","), TABLE_NAME_UNPARTITIONED); + + icebergTableUnPartitioned.refresh(); + + CloseableIterable tasks = + icebergTableUnPartitioned + .newScan() + .ignoreResiduals() + .filter(Expressions.equal("data", "0")) + .planFiles(); + for (FileScanTask task : tasks) { + assertThat(task.residual()) + .as("Residuals must be ignored") + .isEqualTo(Expressions.alwaysTrue()); + } + List dataFiles = + Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); + assertThat(dataFiles).hasSize(2); + Actions actions = Actions.forTable(icebergTableUnPartitioned); + + RewriteDataFilesActionResult result = + actions.rewriteDataFiles().filter(Expressions.equal("data", "0")).execute(); + assertThat(result.deletedDataFiles()).hasSize(2); + assertThat(result.addedDataFiles()).hasSize(1); + // Assert the table records as expected. + SimpleDataUtil.assertTableRecords(icebergTableUnPartitioned, expected); + } + + /** + * a test case to test avoid repeate compress + * + *
    If a data file cannot be combined into a CombinedScanTask with other data files, the + * resulting CombinedScanTask list has a size of 1, so we remove such CombinedScanTasks to avoid + * compressing them repeatedly. + * + *
    In this test case,we generated 3 data files and set targetSizeInBytes greater than the + * largest file size so that it cannot be combined a CombinedScanTask with other datafiles. The + * datafile with the largest file size will not be compressed. + * + * @throws IOException IOException + */ + @TestTemplate + public void testRewriteAvoidRepeateCompress() throws IOException { + List expected = Lists.newArrayList(); + Schema schema = icebergTableUnPartitioned.schema(); + GenericAppenderFactory genericAppenderFactory = new GenericAppenderFactory(schema); + File file = File.createTempFile("junit", null, temp.toFile()); + int count = 0; + try (FileAppender fileAppender = + genericAppenderFactory.newAppender(Files.localOutput(file), format)) { + long filesize = 20000; + for (; fileAppender.length() < filesize; count++) { + Record record = SimpleDataUtil.createRecord(count, UUID.randomUUID().toString()); + fileAppender.add(record); + expected.add(record); + } + } + + DataFile dataFile = + DataFiles.builder(icebergTableUnPartitioned.spec()) + .withPath(file.getAbsolutePath()) + .withFileSizeInBytes(file.length()) + .withFormat(format) + .withRecordCount(count) + .build(); + + icebergTableUnPartitioned.newAppend().appendFile(dataFile).commit(); + + sql("INSERT INTO %s SELECT 1,'a' ", TABLE_NAME_UNPARTITIONED); + sql("INSERT INTO %s SELECT 2,'b' ", TABLE_NAME_UNPARTITIONED); + + icebergTableUnPartitioned.refresh(); + + CloseableIterable tasks = icebergTableUnPartitioned.newScan().planFiles(); + List dataFiles = + Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); + assertThat(dataFiles).hasSize(3); + Actions actions = Actions.forTable(icebergTableUnPartitioned); + + long targetSizeInBytes = file.length() + 10; + RewriteDataFilesActionResult result = + actions + .rewriteDataFiles() + .targetSizeInBytes(targetSizeInBytes) + .splitOpenFileCost(1) + .execute(); + assertThat(result.deletedDataFiles()).hasSize(2); + assertThat(result.addedDataFiles()).hasSize(1); + icebergTableUnPartitioned.refresh(); + + CloseableIterable tasks1 = icebergTableUnPartitioned.newScan().planFiles(); + List dataFilesRewrote = + Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file)); + assertThat(dataFilesRewrote).hasSize(2); + // the biggest file do not be rewrote + List rewroteDataFileNames = + dataFilesRewrote.stream().map(ContentFile::path).collect(Collectors.toList()); + assertThat(rewroteDataFileNames).contains(file.getAbsolutePath()); + + // Assert the table records as expected. 
+ expected.add(SimpleDataUtil.createRecord(1, "a")); + expected.add(SimpleDataUtil.createRecord(2, "b")); + SimpleDataUtil.assertTableRecords(icebergTableUnPartitioned, expected); + } + + @TestTemplate + public void testRewriteNoConflictWithEqualityDeletes() throws IOException { + // Add 2 data files + sql("INSERT INTO %s SELECT 1, 'hello'", TABLE_NAME_WITH_PK); + sql("INSERT INTO %s SELECT 2, 'world'", TABLE_NAME_WITH_PK); + + // Load 2 stale tables to pass to rewrite actions + // Since the first rewrite will refresh stale1, we need another stale2 for the second rewrite + Table stale1 = + validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME_WITH_PK)); + Table stale2 = + validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME_WITH_PK)); + + // Add 1 data file and 1 equality-delete file + sql("INSERT INTO %s /*+ OPTIONS('upsert-enabled'='true')*/ SELECT 1, 'hi'", TABLE_NAME_WITH_PK); + + icebergTableWithPk.refresh(); + assertThat(icebergTableWithPk.currentSnapshot().sequenceNumber()) + .as("The latest sequence number should be greater than that of the stale snapshot") + .isEqualTo(stale1.currentSnapshot().sequenceNumber() + 1); + CloseableIterable tasks = icebergTableWithPk.newScan().planFiles(); + List dataFiles = + Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); + Set deleteFiles = + Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::deletes)).stream() + .flatMap(Collection::stream) + .collect(Collectors.toSet()); + assertThat(dataFiles).hasSize(3); + assertThat(deleteFiles).hasSize(1); + assertThat(Iterables.getOnlyElement(deleteFiles).content()) + .isEqualTo(FileContent.EQUALITY_DELETES); + shouldHaveDataAndFileSequenceNumbers( + TABLE_NAME_WITH_PK, + ImmutableList.of(Pair.of(1L, 1L), Pair.of(2L, 2L), Pair.of(3L, 3L), Pair.of(3L, 3L))); + + assertThatThrownBy( + () -> + Actions.forTable(stale1) + .rewriteDataFiles() + .useStartingSequenceNumber(false) + .execute(), + "Rewrite using new sequence number should fail") + .isInstanceOf(ValidationException.class); + + // Rewrite using the starting sequence number should succeed + RewriteDataFilesActionResult result = + Actions.forTable(stale2).rewriteDataFiles().useStartingSequenceNumber(true).execute(); + + // Should not rewrite files from the new commit + assertThat(result.deletedDataFiles()).hasSize(2); + assertThat(result.addedDataFiles()).hasSize(1); + // The 2 older files with file-sequence-number <= 2 should be rewritten into a new file. + // The new file is the one with file-sequence-number == 4. + // The new file should use rewrite's starting-sequence-number 2 as its data-sequence-number. + shouldHaveDataAndFileSequenceNumbers( + TABLE_NAME_WITH_PK, ImmutableList.of(Pair.of(3L, 3L), Pair.of(3L, 3L), Pair.of(2L, 4L))); + + // Assert the table records as expected. + SimpleDataUtil.assertTableRecords( + icebergTableWithPk, + Lists.newArrayList( + SimpleDataUtil.createRecord(1, "hi"), SimpleDataUtil.createRecord(2, "world"))); + } + + /** + * Assert that data files and delete files in the table should have expected data sequence numbers + * and file sequence numbers + * + * @param tableName table name + * @param expectedSequenceNumbers list of {@link Pair}'s. Each {@link Pair} contains + * (expectedDataSequenceNumber, expectedFileSequenceNumber) of a file. 
+ */ + private void shouldHaveDataAndFileSequenceNumbers( + String tableName, List> expectedSequenceNumbers) { + // "status < 2" for added or existing entries + List liveEntries = sql("SELECT * FROM %s$entries WHERE status < 2", tableName); + + List> actualSequenceNumbers = + liveEntries.stream() + .map( + row -> + Pair.of( + row.getFieldAs("sequence_number"), row.getFieldAs("file_sequence_number"))) + .collect(Collectors.toList()); + assertThat(actualSequenceNumbers).hasSameElementsAs(expectedSequenceNumbers); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/RandomRowData.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/RandomRowData.java new file mode 100644 index 000000000000..cc58d9817ac6 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/RandomRowData.java @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.data; + +import org.apache.flink.table.data.RowData; +import org.apache.iceberg.Schema; +import org.apache.iceberg.data.RandomGenericData; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.RowDataConverter; +import org.apache.iceberg.relocated.com.google.common.collect.Iterables; + +public class RandomRowData { + private RandomRowData() {} + + public static Iterable generate(Schema schema, int numRecords, long seed) { + return convert(schema, RandomGenericData.generate(schema, numRecords, seed)); + } + + public static Iterable convert(Schema schema, Iterable records) { + return Iterables.transform(records, record -> RowDataConverter.convert(schema, record)); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/RowDataToRowMapper.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/RowDataToRowMapper.java new file mode 100644 index 000000000000..74b1da6007e6 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/RowDataToRowMapper.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.data; + +import org.apache.flink.api.common.functions.RichMapFunction; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.conversion.DataStructureConverter; +import org.apache.flink.table.data.conversion.DataStructureConverters; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.table.types.utils.TypeConversions; +import org.apache.flink.types.Row; + +public class RowDataToRowMapper extends RichMapFunction { + + private final RowType rowType; + + private transient DataStructureConverter converter; + + public RowDataToRowMapper(RowType rowType) { + this.rowType = rowType; + } + + @Override + public void open(Configuration parameters) throws Exception { + this.converter = + DataStructureConverters.getConverter(TypeConversions.fromLogicalToDataType(rowType)); + } + + @Override + public Row map(RowData value) throws Exception { + return (Row) converter.toExternal(value); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkAvroReaderWriter.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkAvroReaderWriter.java new file mode 100644 index 000000000000..a1039d27d888 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkAvroReaderWriter.java @@ -0,0 +1,185 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.data; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.File; +import java.io.IOException; +import java.math.BigDecimal; +import java.sql.Date; +import java.sql.Time; +import java.util.Iterator; +import java.util.List; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.Files; +import org.apache.iceberg.Schema; +import org.apache.iceberg.avro.Avro; +import org.apache.iceberg.data.DataTest; +import org.apache.iceberg.data.GenericRecord; +import org.apache.iceberg.data.RandomGenericData; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.data.avro.DataReader; +import org.apache.iceberg.data.avro.DataWriter; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.TestHelpers; +import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.io.FileAppender; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.util.DateTimeUtil; +import org.junit.jupiter.api.Test; + +public class TestFlinkAvroReaderWriter extends DataTest { + + private static final int NUM_RECORDS = 100; + + private static final Schema SCHEMA_NUM_TYPE = + new Schema( + Types.NestedField.optional(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "int", Types.IntegerType.get()), + Types.NestedField.optional(3, "float", Types.FloatType.get()), + Types.NestedField.optional(4, "double", Types.DoubleType.get()), + Types.NestedField.optional(5, "date", Types.DateType.get()), + Types.NestedField.optional(6, "time", Types.TimeType.get()), + Types.NestedField.optional(7, "timestamp", Types.TimestampType.withoutZone()), + Types.NestedField.optional(8, "bigint", Types.LongType.get()), + Types.NestedField.optional(9, "decimal", Types.DecimalType.of(4, 2))); + + @Override + protected void writeAndValidate(Schema schema) throws IOException { + List expectedRecords = RandomGenericData.generate(schema, NUM_RECORDS, 1991L); + writeAndValidate(schema, expectedRecords, NUM_RECORDS); + } + + private void writeAndValidate(Schema schema, List expectedRecords, int numRecord) + throws IOException { + RowType flinkSchema = FlinkSchemaUtil.convert(schema); + List expectedRows = Lists.newArrayList(RandomRowData.convert(schema, expectedRecords)); + + File recordsFile = File.createTempFile("junit", null, temp.toFile()); + assertThat(recordsFile.delete()).isTrue(); + + // Write the expected records into AVRO file, then read them into RowData and assert with the + // expected Record list. 
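+    // (the generic Iceberg DataWriter is used on the write side and FlinkAvroReader on the read
+    // side, so this half of the test exercises Flink's Avro read path against generic data)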
+ try (FileAppender writer = + Avro.write(Files.localOutput(recordsFile)) + .schema(schema) + .createWriterFunc(DataWriter::create) + .build()) { + writer.addAll(expectedRecords); + } + + try (CloseableIterable reader = + Avro.read(Files.localInput(recordsFile)) + .project(schema) + .createReaderFunc(FlinkAvroReader::new) + .build()) { + Iterator expected = expectedRecords.iterator(); + Iterator rows = reader.iterator(); + for (int i = 0; i < numRecord; i++) { + assertThat(rows).hasNext(); + TestHelpers.assertRowData(schema.asStruct(), flinkSchema, expected.next(), rows.next()); + } + assertThat(rows).isExhausted(); + } + + File rowDataFile = File.createTempFile("junit", null, temp.toFile()); + assertThat(rowDataFile.delete()).isTrue(); + + // Write the expected RowData into AVRO file, then read them into Record and assert with the + // expected RowData list. + try (FileAppender writer = + Avro.write(Files.localOutput(rowDataFile)) + .schema(schema) + .createWriterFunc(ignore -> new FlinkAvroWriter(flinkSchema)) + .build()) { + writer.addAll(expectedRows); + } + + try (CloseableIterable reader = + Avro.read(Files.localInput(rowDataFile)) + .project(schema) + .createReaderFunc(DataReader::create) + .build()) { + Iterator expected = expectedRows.iterator(); + Iterator records = reader.iterator(); + for (int i = 0; i < numRecord; i += 1) { + assertThat(records).hasNext(); + TestHelpers.assertRowData(schema.asStruct(), flinkSchema, records.next(), expected.next()); + } + assertThat(records).isExhausted(); + } + } + + private Record recordNumType( + int id, + int intV, + float floatV, + double doubleV, + long date, + long time, + long timestamp, + long bigint, + double decimal) { + Record record = GenericRecord.create(SCHEMA_NUM_TYPE); + record.setField("id", id); + record.setField("int", intV); + record.setField("float", floatV); + record.setField("double", doubleV); + record.setField( + "date", DateTimeUtil.dateFromDays((int) new Date(date).toLocalDate().toEpochDay())); + record.setField("time", new Time(time).toLocalTime()); + record.setField("timestamp", DateTimeUtil.timestampFromMicros(timestamp * 1000)); + record.setField("bigint", bigint); + record.setField("decimal", BigDecimal.valueOf(decimal)); + return record; + } + + @Test + public void testNumericTypes() throws IOException { + + List expected = + ImmutableList.of( + recordNumType( + 2, + Integer.MAX_VALUE, + Float.MAX_VALUE, + Double.MAX_VALUE, + Long.MAX_VALUE, + 1643811742000L, + 1643811742000L, + 1643811742000L, + 10.24d), + recordNumType( + 2, + Integer.MIN_VALUE, + Float.MIN_VALUE, + Double.MIN_VALUE, + Long.MIN_VALUE, + 1643811742000L, + 1643811742000L, + 1643811742000L, + 10.24d)); + + writeAndValidate(SCHEMA_NUM_TYPE, expected, 2); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkOrcReaderWriter.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkOrcReaderWriter.java new file mode 100644 index 000000000000..72f2ce4f4bce --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkOrcReaderWriter.java @@ -0,0 +1,107 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.data; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.File; +import java.io.IOException; +import java.util.Iterator; +import java.util.List; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.Files; +import org.apache.iceberg.Schema; +import org.apache.iceberg.data.DataTest; +import org.apache.iceberg.data.RandomGenericData; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.data.orc.GenericOrcReader; +import org.apache.iceberg.data.orc.GenericOrcWriter; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.TestHelpers; +import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.io.FileAppender; +import org.apache.iceberg.orc.ORC; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; + +public class TestFlinkOrcReaderWriter extends DataTest { + private static final int NUM_RECORDS = 100; + + @Override + protected void writeAndValidate(Schema schema) throws IOException { + RowType flinkSchema = FlinkSchemaUtil.convert(schema); + List expectedRecords = RandomGenericData.generate(schema, NUM_RECORDS, 1990L); + List expectedRows = Lists.newArrayList(RandomRowData.convert(schema, expectedRecords)); + + File recordsFile = File.createTempFile("junit", null, temp.toFile()); + assertThat(recordsFile.delete()).isTrue(); + + // Write the expected records into ORC file, then read them into RowData and assert with the + // expected Record list. + try (FileAppender writer = + ORC.write(Files.localOutput(recordsFile)) + .schema(schema) + .createWriterFunc(GenericOrcWriter::buildWriter) + .build()) { + writer.addAll(expectedRecords); + } + + try (CloseableIterable reader = + ORC.read(Files.localInput(recordsFile)) + .project(schema) + .createReaderFunc(type -> new FlinkOrcReader(schema, type)) + .build()) { + Iterator expected = expectedRecords.iterator(); + Iterator rows = reader.iterator(); + for (int i = 0; i < NUM_RECORDS; i++) { + assertThat(rows).hasNext(); + TestHelpers.assertRowData(schema.asStruct(), flinkSchema, expected.next(), rows.next()); + } + assertThat(rows).isExhausted(); + } + + File rowDataFile = File.createTempFile("junit", null, temp.toFile()); + assertThat(rowDataFile.delete()).isTrue(); + + // Write the expected RowData into ORC file, then read them into Record and assert with the + // expected RowData list. 
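+    // (the direction is reversed here: FlinkOrcWriter writes the RowData and GenericOrcReader
+    // reads it back, so this half of the test exercises Flink's ORC write path)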
+ RowType rowType = FlinkSchemaUtil.convert(schema); + try (FileAppender writer = + ORC.write(Files.localOutput(rowDataFile)) + .schema(schema) + .createWriterFunc((iSchema, typeDesc) -> FlinkOrcWriter.buildWriter(rowType, iSchema)) + .build()) { + writer.addAll(expectedRows); + } + + try (CloseableIterable reader = + ORC.read(Files.localInput(rowDataFile)) + .project(schema) + .createReaderFunc(type -> GenericOrcReader.buildReader(schema, type)) + .build()) { + Iterator expected = expectedRows.iterator(); + Iterator records = reader.iterator(); + for (int i = 0; i < NUM_RECORDS; i += 1) { + assertThat(records.hasNext()).isTrue(); + TestHelpers.assertRowData(schema.asStruct(), flinkSchema, records.next(), expected.next()); + } + assertThat(records).isExhausted(); + } + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetReader.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetReader.java new file mode 100644 index 000000000000..4cfb24f62921 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetReader.java @@ -0,0 +1,239 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.data; + +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.parquet.schema.Types.primitive; +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.File; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.Iterator; +import java.util.List; +import org.apache.avro.generic.GenericData; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.GenericRecordBuilder; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.hadoop.fs.Path; +import org.apache.iceberg.Files; +import org.apache.iceberg.Schema; +import org.apache.iceberg.avro.AvroSchemaUtil; +import org.apache.iceberg.data.DataTest; +import org.apache.iceberg.data.RandomGenericData; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.data.parquet.GenericParquetWriter; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.TestHelpers; +import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.io.FileAppender; +import org.apache.iceberg.parquet.Parquet; +import org.apache.iceberg.parquet.ParquetValueReader; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.types.Types; +import org.apache.parquet.avro.AvroParquetWriter; +import org.apache.parquet.hadoop.ParquetWriter; +import org.apache.parquet.schema.LogicalTypeAnnotation; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Type; +import org.junit.jupiter.api.Test; + +public class TestFlinkParquetReader extends DataTest { + private static final int NUM_RECORDS = 100; + + @Test + public void testBuildReader() { + MessageType fileSchema = + new MessageType( + "test", + // 0: required(100, "id", LongType.get()) + primitive(PrimitiveType.PrimitiveTypeName.INT64, Type.Repetition.REQUIRED) + .id(100) + .named("id"), + // 1: optional(101, "data", Types.StringType.get()) + primitive(PrimitiveType.PrimitiveTypeName.BINARY, Type.Repetition.OPTIONAL) + .id(101) + .named("data"), + // 2: required(102, "b", Types.BooleanType.get()) + primitive(PrimitiveType.PrimitiveTypeName.BOOLEAN, Type.Repetition.REQUIRED) + .id(102) + .named("b"), + // 3: optional(103, "i", Types.IntegerType.get()) + primitive(PrimitiveType.PrimitiveTypeName.INT32, Type.Repetition.OPTIONAL) + .id(103) + .named("i"), + // 4: optional(105, "f", Types.FloatType.get()) + primitive(PrimitiveType.PrimitiveTypeName.INT64, Type.Repetition.REQUIRED) + .id(104) + .named("l"), + // 5: required(106, "d", Types.DoubleType.get()) + primitive(PrimitiveType.PrimitiveTypeName.FLOAT, Type.Repetition.OPTIONAL) + .id(105) + .named("f"), + // 6: required(106, "d", Types.DoubleType.get()) + primitive(PrimitiveType.PrimitiveTypeName.DOUBLE, Type.Repetition.REQUIRED) + .id(106) + .named("d"), + // 7: optional(107, "date", Types.DateType.get()) + primitive(PrimitiveType.PrimitiveTypeName.INT32, Type.Repetition.OPTIONAL) + .id(107) + .as(LogicalTypeAnnotation.dateType()) + .named("date"), + // 8: required(108, "ts_tz", Types.TimestampType.withZone()) + primitive(PrimitiveType.PrimitiveTypeName.INT64, Type.Repetition.REQUIRED) + .id(108) + .as( + LogicalTypeAnnotation.timestampType( + true, LogicalTypeAnnotation.TimeUnit.MICROS)) + .named("ts_tz"), + // 9: required(109, "ts", Types.TimestampType.withoutZone()) + primitive(PrimitiveType.PrimitiveTypeName.INT64, 
Type.Repetition.REQUIRED) + .id(109) + .as( + LogicalTypeAnnotation.timestampType( + false, LogicalTypeAnnotation.TimeUnit.MICROS)) + .named("ts"), + // 10: required(110, "s", Types.StringType.get()) + primitive(PrimitiveType.PrimitiveTypeName.BINARY, Type.Repetition.REQUIRED) + .id(110) + .as(LogicalTypeAnnotation.stringType()) + .named("s"), + // 11: required(112, "fixed", Types.FixedType.ofLength(7)) + primitive( + PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY, Type.Repetition.REQUIRED) + .id(112) + .length(7) + .named("f"), + // 12: optional(113, "bytes", Types.BinaryType.get()) + primitive(PrimitiveType.PrimitiveTypeName.BINARY, Type.Repetition.OPTIONAL) + .id(113) + .named("bytes"), + // 13: required(114, "dec_9_0", Types.DecimalType.of(9, 0)) + primitive(PrimitiveType.PrimitiveTypeName.INT64, Type.Repetition.REQUIRED) + .id(114) + .as(LogicalTypeAnnotation.decimalType(0, 9)) + .named("dec_9_0"), + // 14: required(115, "dec_11_2", Types.DecimalType.of(11, 2)) + primitive(PrimitiveType.PrimitiveTypeName.INT64, Type.Repetition.REQUIRED) + .id(115) + .as(LogicalTypeAnnotation.decimalType(2, 11)) + .named("dec_11_2"), + // 15: required(116, "dec_38_10", Types.DecimalType.of(38, 10)) // maximum precision + primitive( + PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY, Type.Repetition.REQUIRED) + .id(116) + .length(16) + .as(LogicalTypeAnnotation.decimalType(10, 38)) + .named("dec_38_10"), + // 16: required(117, "time", Types.TimeType.get()) + primitive(PrimitiveType.PrimitiveTypeName.INT64, Type.Repetition.OPTIONAL) + .id(117) + .as(LogicalTypeAnnotation.timeType(true, LogicalTypeAnnotation.TimeUnit.MICROS)) + .named("time")); + ParquetValueReader reader = + FlinkParquetReaders.buildReader(new Schema(SUPPORTED_PRIMITIVES.fields()), fileSchema); + + assertThat(reader.columns().size()).isEqualTo(SUPPORTED_PRIMITIVES.fields().size()); + } + + @Test + public void testTwoLevelList() throws IOException { + Schema schema = + new Schema( + optional(1, "arraybytes", Types.ListType.ofRequired(3, Types.BinaryType.get())), + optional(2, "topbytes", Types.BinaryType.get())); + org.apache.avro.Schema avroSchema = AvroSchemaUtil.convert(schema.asStruct()); + + File testFile = File.createTempFile("junit", null, temp.toFile()); + assertThat(testFile.delete()).isTrue(); + + ParquetWriter writer = + AvroParquetWriter.builder(new Path(testFile.toURI())) + .withDataModel(GenericData.get()) + .withSchema(avroSchema) + .config("parquet.avro.add-list-element-records", "true") + .config("parquet.avro.write-old-list-structure", "true") + .build(); + + GenericRecordBuilder recordBuilder = new GenericRecordBuilder(avroSchema); + List expectedByteList = Lists.newArrayList(); + byte[] expectedByte = {0x00, 0x01}; + ByteBuffer expectedBinary = ByteBuffer.wrap(expectedByte); + expectedByteList.add(expectedBinary); + recordBuilder.set("arraybytes", expectedByteList); + recordBuilder.set("topbytes", expectedBinary); + GenericData.Record expectedRecord = recordBuilder.build(); + + writer.write(expectedRecord); + writer.close(); + + try (CloseableIterable reader = + Parquet.read(Files.localInput(testFile)) + .project(schema) + .createReaderFunc(type -> FlinkParquetReaders.buildReader(schema, type)) + .build()) { + Iterator rows = reader.iterator(); + assertThat(rows).hasNext(); + RowData rowData = rows.next(); + assertThat(rowData.getArray(0).getBinary(0)).isEqualTo(expectedByte); + assertThat(rowData.getBinary(1)).isEqualTo(expectedByte); + assertThat(rows).isExhausted(); + } + } + + private void 
writeAndValidate(Iterable iterable, Schema schema) throws IOException { + File testFile = File.createTempFile("junit", null, temp.toFile()); + assertThat(testFile.delete()).isTrue(); + + try (FileAppender writer = + Parquet.write(Files.localOutput(testFile)) + .schema(schema) + .createWriterFunc(GenericParquetWriter::buildWriter) + .build()) { + writer.addAll(iterable); + } + + try (CloseableIterable reader = + Parquet.read(Files.localInput(testFile)) + .project(schema) + .createReaderFunc(type -> FlinkParquetReaders.buildReader(schema, type)) + .build()) { + Iterator expected = iterable.iterator(); + Iterator rows = reader.iterator(); + LogicalType rowType = FlinkSchemaUtil.convert(schema); + for (int i = 0; i < NUM_RECORDS; i += 1) { + assertThat(rows).hasNext(); + TestHelpers.assertRowData(schema.asStruct(), rowType, expected.next(), rows.next()); + } + assertThat(rows).isExhausted(); + } + } + + @Override + protected void writeAndValidate(Schema schema) throws IOException { + writeAndValidate(RandomGenericData.generate(schema, NUM_RECORDS, 19981), schema); + writeAndValidate( + RandomGenericData.generateDictionaryEncodableRecords(schema, NUM_RECORDS, 21124), schema); + writeAndValidate( + RandomGenericData.generateFallbackRecords(schema, NUM_RECORDS, 21124, NUM_RECORDS / 20), + schema); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetWriter.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetWriter.java new file mode 100644 index 000000000000..b1e6f5aa00ff --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetWriter.java @@ -0,0 +1,94 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.data; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Path; +import java.util.Iterator; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.iceberg.Files; +import org.apache.iceberg.Schema; +import org.apache.iceberg.data.DataTest; +import org.apache.iceberg.data.RandomGenericData; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.data.parquet.GenericParquetReaders; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.TestHelpers; +import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.io.FileAppender; +import org.apache.iceberg.parquet.Parquet; +import org.junit.jupiter.api.io.TempDir; + +public class TestFlinkParquetWriter extends DataTest { + private static final int NUM_RECORDS = 100; + + @TempDir private Path temp; + + private void writeAndValidate(Iterable iterable, Schema schema) throws IOException { + File testFile = File.createTempFile("junit", null, temp.toFile()); + assertThat(testFile.delete()).isTrue(); + + LogicalType logicalType = FlinkSchemaUtil.convert(schema); + + try (FileAppender writer = + Parquet.write(Files.localOutput(testFile)) + .schema(schema) + .createWriterFunc(msgType -> FlinkParquetWriters.buildWriter(logicalType, msgType)) + .build()) { + writer.addAll(iterable); + } + + try (CloseableIterable reader = + Parquet.read(Files.localInput(testFile)) + .project(schema) + .createReaderFunc(msgType -> GenericParquetReaders.buildReader(schema, msgType)) + .build()) { + Iterator expected = iterable.iterator(); + Iterator actual = reader.iterator(); + LogicalType rowType = FlinkSchemaUtil.convert(schema); + for (int i = 0; i < NUM_RECORDS; i += 1) { + assertThat(actual).hasNext(); + TestHelpers.assertRowData(schema.asStruct(), rowType, actual.next(), expected.next()); + } + assertThat(actual).isExhausted(); + } + } + + @Override + protected void writeAndValidate(Schema schema) throws IOException { + writeAndValidate(RandomRowData.generate(schema, NUM_RECORDS, 19981), schema); + + writeAndValidate( + RandomRowData.convert( + schema, + RandomGenericData.generateDictionaryEncodableRecords(schema, NUM_RECORDS, 21124)), + schema); + + writeAndValidate( + RandomRowData.convert( + schema, + RandomGenericData.generateFallbackRecords( + schema, NUM_RECORDS, 21124, NUM_RECORDS / 20)), + schema); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/TestRowDataProjection.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/TestRowDataProjection.java new file mode 100644 index 000000000000..d078b2228456 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/TestRowDataProjection.java @@ -0,0 +1,593 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.data; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatNoException; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.util.List; +import org.apache.flink.table.data.GenericArrayData; +import org.apache.flink.table.data.GenericMapData; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.iceberg.Schema; +import org.apache.iceberg.StructLike; +import org.apache.iceberg.data.RandomGenericData; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.DataGenerator; +import org.apache.iceberg.flink.DataGenerators; +import org.apache.iceberg.flink.TestHelpers; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.util.StructProjection; +import org.junit.jupiter.api.Test; + +public class TestRowDataProjection { + @Test + public void testNullRootRowData() { + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); + + RowDataProjection projection = RowDataProjection.create(schema, schema.select("id")); + + assertThatThrownBy(() -> projection.wrap(null)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Invalid row data: null"); + } + + @Test + public void testFullProjection() { + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); + + generateAndValidate(schema, schema); + + GenericRowData rowData = GenericRowData.of(1L, StringData.fromString("a")); + GenericRowData copyRowData = GenericRowData.of(1L, StringData.fromString("a")); + GenericRowData otherRowData = GenericRowData.of(2L, StringData.fromString("b")); + testEqualsAndHashCode(schema, schema, rowData, copyRowData, otherRowData); + } + + @Test + public void testReorderedFullProjection() { + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); + + Schema reordered = + new Schema( + Types.NestedField.optional(1, "data", Types.StringType.get()), + Types.NestedField.required(0, "id", Types.LongType.get())); + + generateAndValidate(schema, reordered); + + GenericRowData rowData = GenericRowData.of(1L, StringData.fromString("a")); + GenericRowData copyRowData = GenericRowData.of(1L, StringData.fromString("a")); + GenericRowData otherRowData = GenericRowData.of(2L, StringData.fromString("b")); + testEqualsAndHashCode(schema, reordered, rowData, copyRowData, otherRowData); + } + + @Test + public void testBasicProjection() { + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); + Schema idOnly 
= new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); + Schema dataOnly = new Schema(Types.NestedField.optional(1, "data", Types.StringType.get())); + generateAndValidate(schema, idOnly); + generateAndValidate(schema, dataOnly); + + GenericRowData rowData = GenericRowData.of(1L, StringData.fromString("a")); + GenericRowData copyRowData = GenericRowData.of(1L, StringData.fromString("a")); + GenericRowData otherRowData = GenericRowData.of(2L, StringData.fromString("b")); + testEqualsAndHashCode(schema, idOnly, rowData, copyRowData, otherRowData); + testEqualsAndHashCode(schema, dataOnly, rowData, copyRowData, otherRowData); + } + + @Test + public void testEmptyProjection() { + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); + generateAndValidate(schema, schema.select()); + + GenericRowData rowData = GenericRowData.of(1L, StringData.fromString("a")); + GenericRowData copyRowData = GenericRowData.of(1L, StringData.fromString("a")); + GenericRowData otherRowData = GenericRowData.of(2L, StringData.fromString("b")); + testEqualsAndHashCode(schema, schema.select(), rowData, copyRowData, otherRowData, true); + } + + @Test + public void testRename() { + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); + + Schema renamed = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "renamed", Types.StringType.get())); + generateAndValidate(schema, renamed); + + GenericRowData rowData = GenericRowData.of(1L, StringData.fromString("a")); + GenericRowData copyRowData = GenericRowData.of(1L, StringData.fromString("a")); + GenericRowData otherRowData = GenericRowData.of(2L, StringData.fromString("b")); + testEqualsAndHashCode(schema, renamed, rowData, copyRowData, otherRowData); + } + + @Test + public void testNestedProjection() { + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 3, + "location", + Types.StructType.of( + Types.NestedField.required(1, "lat", Types.FloatType.get()), + Types.NestedField.required(2, "long", Types.FloatType.get())))); + + GenericRowData rowData = GenericRowData.of(1L, GenericRowData.of(1.0f, 1.0f)); + GenericRowData copyRowData = GenericRowData.of(1L, GenericRowData.of(1.0f, 1.0f)); + GenericRowData otherRowData = GenericRowData.of(2L, GenericRowData.of(2.0f, 2.0f)); + + GenericRowData rowDataNullStruct = GenericRowData.of(1L, null); + GenericRowData copyRowDataNullStruct = GenericRowData.of(1L, null); + GenericRowData otherRowDataNullStruct = GenericRowData.of(2L, null); + + // Project id only. + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); + assertThat(idOnly.columns().size()).isGreaterThan(0); + generateAndValidate(schema, idOnly); + testEqualsAndHashCode(schema, idOnly, rowData, copyRowData, otherRowData); + testEqualsAndHashCode( + schema, idOnly, rowDataNullStruct, copyRowDataNullStruct, otherRowDataNullStruct); + + // Project lat only. 
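+    // Projecting a single nested field keeps its enclosing struct, so the projected schema below
+    // is effectively location: struct<lat: float>.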
+ Schema latOnly = + new Schema( + Types.NestedField.optional( + 3, + "location", + Types.StructType.of(Types.NestedField.required(1, "lat", Types.FloatType.get())))); + assertThat(latOnly.columns().size()).isGreaterThan(0); + generateAndValidate(schema, latOnly); + testEqualsAndHashCode(schema, latOnly, rowData, copyRowData, otherRowData); + testEqualsAndHashCode( + schema, latOnly, rowDataNullStruct, copyRowDataNullStruct, otherRowDataNullStruct, true); + + // Project long only. + Schema longOnly = + new Schema( + Types.NestedField.optional( + 3, + "location", + Types.StructType.of(Types.NestedField.required(2, "long", Types.FloatType.get())))); + assertThat(longOnly.columns().size()).isGreaterThan(0); + generateAndValidate(schema, longOnly); + testEqualsAndHashCode(schema, longOnly, rowData, copyRowData, otherRowData); + testEqualsAndHashCode( + schema, longOnly, rowDataNullStruct, copyRowDataNullStruct, otherRowDataNullStruct, true); + + // Project location. + Schema locationOnly = schema.select("location"); + assertThat(locationOnly.columns().size()).isGreaterThan(0); + generateAndValidate(schema, locationOnly); + testEqualsAndHashCode(schema, locationOnly, rowData, copyRowData, otherRowData); + testEqualsAndHashCode( + schema, + locationOnly, + rowDataNullStruct, + copyRowDataNullStruct, + otherRowDataNullStruct, + true); + } + + @Test + public void testPrimitivesFullProjection() { + DataGenerator dataGenerator = new DataGenerators.Primitives(); + Schema schema = dataGenerator.icebergSchema(); + generateAndValidate(schema, schema); + + GenericRowData rowData = dataGenerator.generateFlinkRowData(); + GenericRowData copyRowData = dataGenerator.generateFlinkRowData(); + GenericRowData otherRowData = dataGenerator.generateFlinkRowData(); + // modify the string field value (position 6) + otherRowData.setField(6, StringData.fromString("foo_bar")); + testEqualsAndHashCode(schema, schema, rowData, copyRowData, otherRowData); + + GenericRowData rowDataNullOptionalFields = dataGenerator.generateFlinkRowData(); + setOptionalFieldsNullForPrimitives(rowDataNullOptionalFields); + GenericRowData copyRowDataNullOptionalFields = dataGenerator.generateFlinkRowData(); + setOptionalFieldsNullForPrimitives(copyRowDataNullOptionalFields); + GenericRowData otherRowDataNullOptionalFields = dataGenerator.generateFlinkRowData(); + // modify the string field value (position 6) + otherRowDataNullOptionalFields.setField(6, StringData.fromString("foo_bar")); + setOptionalFieldsNullForPrimitives(otherRowData); + testEqualsAndHashCode( + schema, + schema, + rowDataNullOptionalFields, + copyRowDataNullOptionalFields, + otherRowDataNullOptionalFields); + } + + private void setOptionalFieldsNullForPrimitives(GenericRowData rowData) { + // fields from [1, 5] range are optional + for (int pos = 1; pos <= 5; ++pos) { + rowData.setField(pos, null); + } + } + + @Test + public void testMapOfPrimitivesProjection() { + DataGenerator dataGenerator = new DataGenerators.MapOfPrimitives(); + Schema schema = dataGenerator.icebergSchema(); + + // Project id only. + Schema idOnly = schema.select("row_id"); + assertThat(idOnly.columns().size()).isGreaterThan(0); + generateAndValidate(schema, idOnly); + + // Project map only. + Schema mapOnly = schema.select("map_of_primitives"); + assertThat(mapOnly.columns().size()).isGreaterThan(0); + generateAndValidate(schema, mapOnly); + + // Project all. 
+    generateAndValidate(schema, schema);
+
+    GenericRowData rowData = dataGenerator.generateFlinkRowData();
+    GenericRowData copyRowData = dataGenerator.generateFlinkRowData();
+    // modify the map field value
+    GenericRowData otherRowData =
+        GenericRowData.of(
+            StringData.fromString("row_id_value"),
+            new GenericMapData(
+                ImmutableMap.of(StringData.fromString("foo"), 1, StringData.fromString("bar"), 2)));
+    testEqualsAndHashCode(schema, idOnly, rowData, copyRowData, otherRowData, true);
+    testEqualsAndHashCode(schema, mapOnly, rowData, copyRowData, otherRowData);
+    testEqualsAndHashCode(schema, schema, rowData, copyRowData, otherRowData);
+
+    GenericRowData rowDataNullOptionalFields =
+        GenericRowData.of(StringData.fromString("row_id_value"), null);
+    GenericRowData copyRowDataNullOptionalFields =
+        GenericRowData.of(StringData.fromString("row_id_value"), null);
+    // modify the row_id value (the map field here is null)
+    GenericRowData otherRowDataNullOptionalFields =
+        GenericRowData.of(StringData.fromString("other_row_id_value"), null);
+    testEqualsAndHashCode(
+        schema,
+        idOnly,
+        rowDataNullOptionalFields,
+        copyRowDataNullOptionalFields,
+        otherRowDataNullOptionalFields);
+    testEqualsAndHashCode(
+        schema,
+        mapOnly,
+        rowDataNullOptionalFields,
+        copyRowDataNullOptionalFields,
+        otherRowDataNullOptionalFields,
+        true);
+    testEqualsAndHashCode(
+        schema,
+        schema,
+        rowDataNullOptionalFields,
+        copyRowDataNullOptionalFields,
+        otherRowDataNullOptionalFields);
+  }
+
+  @Test
+  public void testMapOfStructStructProjection() {
+    DataGenerator dataGenerator = new DataGenerators.MapOfStructStruct();
+    Schema schema = dataGenerator.icebergSchema();
+
+    // Project id only.
+    Schema idOnly = schema.select("row_id");
+    assertThat(idOnly.columns().size()).isGreaterThan(0);
+    generateAndValidate(schema, idOnly);
+
+    // Project map only.
+    Schema mapOnly = schema.select("map");
+    assertThat(mapOnly.columns().size()).isGreaterThan(0);
+    generateAndValidate(schema, mapOnly);
+
+    // Project all.
+    generateAndValidate(schema, schema);
+
+    // Project partial map key.
+    Schema partialMapKey =
+        new Schema(
+            Types.NestedField.optional(
+                2,
+                "map",
+                Types.MapType.ofOptional(
+                    101,
+                    102,
+                    Types.StructType.of(
+                        Types.NestedField.required(201, "key", Types.LongType.get())),
+                    Types.StructType.of(
+                        Types.NestedField.required(203, "value", Types.LongType.get()),
+                        Types.NestedField.required(204, "valueData", Types.StringType.get())))));
+    assertThatThrownBy(() -> generateAndValidate(schema, partialMapKey))
+        .isInstanceOf(IllegalArgumentException.class)
+        .hasMessageContaining("Cannot project a partial map key or value struct.");
+
+    // Project partial map value.
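+    // Like the partial key case above, selecting only a subset of the value struct's fields is
+    // expected to be rejected: map keys and values can only be projected as whole structs.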
+ Schema partialMapValue = + new Schema( + Types.NestedField.optional( + 2, + "map", + Types.MapType.ofOptional( + 101, + 102, + Types.StructType.of( + Types.NestedField.required(201, "key", Types.LongType.get()), + Types.NestedField.required(202, "keyData", Types.StringType.get())), + Types.StructType.of( + Types.NestedField.required(203, "value", Types.LongType.get()))))); + assertThatThrownBy(() -> generateAndValidate(schema, partialMapValue)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("Cannot project a partial map key or value struct."); + + GenericRowData rowData = dataGenerator.generateFlinkRowData(); + GenericRowData copyRowData = dataGenerator.generateFlinkRowData(); + // modify the map field value + GenericRowData otherRowData = + GenericRowData.of( + StringData.fromString("other_row_id_value"), + new GenericMapData( + ImmutableMap.of( + GenericRowData.of(1L, StringData.fromString("other_key_data")), + GenericRowData.of(1L, StringData.fromString("other_value_data"))))); + testEqualsAndHashCode(schema, idOnly, rowData, copyRowData, otherRowData); + testEqualsAndHashCode(schema, mapOnly, rowData, copyRowData, otherRowData); + testEqualsAndHashCode(schema, schema, rowData, copyRowData, otherRowData); + + GenericRowData rowDataNullOptionalFields = + GenericRowData.of( + StringData.fromString("row_id_value"), + new GenericMapData( + ImmutableMap.of(GenericRowData.of(1L, null), GenericRowData.of(1L, null)))); + GenericRowData copyRowDataNullOptionalFields = + GenericRowData.of( + StringData.fromString("row_id_value"), + new GenericMapData( + ImmutableMap.of(GenericRowData.of(1L, null), GenericRowData.of(1L, null)))); + // modify the map field value + GenericRowData otherRowDataNullOptionalFields = + GenericRowData.of( + StringData.fromString("other_row_id_value"), + new GenericMapData( + ImmutableMap.of(GenericRowData.of(2L, null), GenericRowData.of(2L, null)))); + testEqualsAndHashCode( + schema, + idOnly, + rowDataNullOptionalFields, + copyRowDataNullOptionalFields, + otherRowDataNullOptionalFields); + testEqualsAndHashCode( + schema, + mapOnly, + rowDataNullOptionalFields, + copyRowDataNullOptionalFields, + otherRowDataNullOptionalFields); + testEqualsAndHashCode( + schema, + schema, + rowDataNullOptionalFields, + copyRowDataNullOptionalFields, + otherRowDataNullOptionalFields); + } + + @Test + public void testArrayOfPrimitiveProjection() { + DataGenerator dataGenerator = new DataGenerators.ArrayOfPrimitive(); + Schema schema = dataGenerator.icebergSchema(); + + // Project id only. + Schema idOnly = schema.select("row_id"); + assertThat(idOnly.columns().size()).isGreaterThan(0); + generateAndValidate(schema, idOnly); + + // Project list only. + Schema arrayOnly = schema.select("array_of_int"); + assertThat(arrayOnly.columns().size()).isGreaterThan(0); + generateAndValidate(schema, arrayOnly); + + // Project all. 
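+    // Note that a primitive-element array has no "partial element" case: array_of_int is either
+    // selected as a whole or dropped, unlike the struct elements tested further below.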
+ generateAndValidate(schema, schema); + + GenericRowData rowData = dataGenerator.generateFlinkRowData(); + GenericRowData copyRowData = dataGenerator.generateFlinkRowData(); + // modify the map field value + GenericRowData otherRowData = + GenericRowData.of( + StringData.fromString("other_row_id_value"), + new GenericArrayData(new Integer[] {4, 5, 6})); + testEqualsAndHashCode(schema, idOnly, rowData, copyRowData, otherRowData); + testEqualsAndHashCode(schema, arrayOnly, rowData, copyRowData, otherRowData); + testEqualsAndHashCode(schema, schema, rowData, copyRowData, otherRowData); + + GenericRowData rowDataNullOptionalFields = + GenericRowData.of( + StringData.fromString("row_id_value"), + new GenericArrayData(new Integer[] {1, null, 3})); + GenericRowData copyRowDataNullOptionalFields = + GenericRowData.of( + StringData.fromString("row_id_value"), + new GenericArrayData(new Integer[] {1, null, 3})); + // modify the map field value + GenericRowData otherRowDataNullOptionalFields = + GenericRowData.of( + StringData.fromString("other_row_id_value"), + new GenericArrayData(new Integer[] {4, null, 6})); + testEqualsAndHashCode( + schema, + idOnly, + rowDataNullOptionalFields, + copyRowDataNullOptionalFields, + otherRowDataNullOptionalFields); + testEqualsAndHashCode( + schema, + arrayOnly, + rowDataNullOptionalFields, + copyRowDataNullOptionalFields, + otherRowDataNullOptionalFields); + testEqualsAndHashCode( + schema, + schema, + rowDataNullOptionalFields, + copyRowDataNullOptionalFields, + otherRowDataNullOptionalFields); + } + + @Test + public void testArrayOfStructProjection() { + DataGenerator dataGenerator = new DataGenerators.ArrayOfStruct(); + Schema schema = dataGenerator.icebergSchema(); + + // Project id only. + Schema idOnly = schema.select("row_id"); + assertThat(idOnly.columns().size()).isGreaterThan(0); + generateAndValidate(schema, idOnly); + + // Project list only. + Schema arrayOnly = schema.select("array_of_struct"); + assertThat(arrayOnly.columns().size()).isGreaterThan(0); + generateAndValidate(schema, arrayOnly); + + // Project all. + generateAndValidate(schema, schema); + + // Project partial list value. 
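+    // As with partial map key/value structs, selecting only some fields of the element struct is
+    // expected to fail, because list elements can only be projected as complete structs.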
+ Schema partialList = + new Schema( + Types.NestedField.optional( + 2, + "array_of_struct", + Types.ListType.ofOptional( + 101, + Types.StructType.of( + Types.NestedField.required(202, "name", Types.StringType.get()))))); + + assertThatThrownBy(() -> generateAndValidate(schema, partialList)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("Cannot project a partial list element struct."); + + GenericRowData rowData = dataGenerator.generateFlinkRowData(); + GenericRowData copyRowData = dataGenerator.generateFlinkRowData(); + // modify the map field value + GenericRowData otherRowData = + GenericRowData.of( + StringData.fromString("row_id_value"), new GenericArrayData(new Integer[] {4, 5, 6})); + testEqualsAndHashCode(schema, schema, rowData, copyRowData, otherRowData); + + GenericRowData rowDataNullOptionalFields = + GenericRowData.of( + StringData.fromString("row_id_value"), + new GenericArrayData(new Integer[] {1, null, 3})); + GenericRowData copyRowDataNullOptionalFields = + GenericRowData.of( + StringData.fromString("row_id_value"), + new GenericArrayData(new Integer[] {1, null, 3})); + // modify the map field value + GenericRowData otherRowDataNullOptionalFields = + GenericRowData.of( + StringData.fromString("row_id_value"), + new GenericArrayData(new Integer[] {4, null, 6})); + testEqualsAndHashCode( + schema, + schema, + rowDataNullOptionalFields, + copyRowDataNullOptionalFields, + otherRowDataNullOptionalFields); + } + + private void generateAndValidate(Schema schema, Schema projectSchema) { + int numRecords = 100; + List recordList = RandomGenericData.generate(schema, numRecords, 102L); + List rowDataList = + Lists.newArrayList(RandomRowData.generate(schema, numRecords, 102L).iterator()); + assertThat(rowDataList).hasSize(recordList.size()); + + StructProjection structProjection = StructProjection.create(schema, projectSchema); + RowDataProjection rowDataProjection = RowDataProjection.create(schema, projectSchema); + + for (int i = 0; i < numRecords; i++) { + StructLike expected = structProjection.wrap(recordList.get(i)); + RowData projected = rowDataProjection.wrap(rowDataList.get(i)); + TestHelpers.assertRowData(projectSchema, expected, projected); + + assertThat(projected).isEqualTo(projected); + assertThat(projected).hasSameHashCodeAs(projected); + // make sure toString doesn't throw NPE for null values + assertThatNoException().isThrownBy(projected::toString); + } + } + + private void testEqualsAndHashCode( + Schema schema, + Schema projectionSchema, + RowData rowData, + RowData copyRowData, + RowData otherRowData) { + testEqualsAndHashCode(schema, projectionSchema, rowData, copyRowData, otherRowData, false); + } + + /** + * @param isOtherRowDataSameAsRowData sometimes projection on otherRowData can result in the same + * RowData, e.g. 
due to empty projection or null struct + */ + private void testEqualsAndHashCode( + Schema schema, + Schema projectionSchema, + RowData rowData, + RowData copyRowData, + RowData otherRowData, + boolean isOtherRowDataSameAsRowData) { + RowDataProjection projection = RowDataProjection.create(schema, projectionSchema); + RowDataProjection copyProjection = RowDataProjection.create(schema, projectionSchema); + RowDataProjection otherProjection = RowDataProjection.create(schema, projectionSchema); + + assertThat(projection.wrap(rowData)).isEqualTo(copyProjection.wrap(copyRowData)); + assertThat(projection.wrap(rowData)).hasSameHashCodeAs(copyProjection.wrap(copyRowData)); + + if (isOtherRowDataSameAsRowData) { + assertThat(projection.wrap(rowData)).isEqualTo(otherProjection.wrap(otherRowData)); + assertThat(projection.wrap(rowData)).hasSameHashCodeAs(otherProjection.wrap(otherRowData)); + } else { + assertThat(projection.wrap(rowData)).isNotEqualTo(otherProjection.wrap(otherRowData)); + assertThat(projection.wrap(rowData)) + .doesNotHaveSameHashCodeAs(otherProjection.wrap(otherRowData)); + } + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/TestRowProjection.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/TestRowProjection.java new file mode 100644 index 000000000000..7dd4e8759c0e --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/TestRowProjection.java @@ -0,0 +1,596 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.data; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.withPrecision; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Path; +import java.util.Map; +import org.apache.flink.table.data.ArrayData; +import org.apache.flink.table.data.GenericArrayData; +import org.apache.flink.table.data.GenericMapData; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.iceberg.Files; +import org.apache.iceberg.Schema; +import org.apache.iceberg.avro.Avro; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.io.FileAppender; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Iterables; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.types.Comparators; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +public class TestRowProjection { + + @TempDir private Path temp; + + private RowData writeAndRead(String desc, Schema writeSchema, Schema readSchema, RowData row) + throws IOException { + File file = File.createTempFile("junit", desc + ".avro", temp.toFile()); + assertThat(file.delete()).isTrue(); + + try (FileAppender appender = + Avro.write(Files.localOutput(file)) + .schema(writeSchema) + .createWriterFunc(ignore -> new FlinkAvroWriter(FlinkSchemaUtil.convert(writeSchema))) + .build()) { + appender.add(row); + } + + Iterable records = + Avro.read(Files.localInput(file)) + .project(readSchema) + .createReaderFunc(FlinkAvroReader::new) + .build(); + + return Iterables.getOnlyElement(records); + } + + @Test + public void testFullProjection() throws Exception { + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); + + RowData row = GenericRowData.of(34L, StringData.fromString("test")); + + RowData projected = writeAndRead("full_projection", schema, schema, row); + + assertThat(projected.getLong(0)).isEqualTo(34); + assertThat(projected.getString(1)).asString().isEqualTo("test"); + } + + @Test + public void testSpecialCharacterProjection() throws Exception { + Schema schema = + new Schema( + Types.NestedField.required(0, "user id", Types.LongType.get()), + Types.NestedField.optional(1, "data%0", Types.StringType.get())); + + RowData row = GenericRowData.of(34L, StringData.fromString("test")); + + RowData full = writeAndRead("special_chars", schema, schema, row); + + assertThat(full.getLong(0)).isEqualTo(34L); + assertThat(full.getString(1)).asString().isEqualTo("test"); + + RowData projected = writeAndRead("special_characters", schema, schema.select("data%0"), full); + + assertThat(projected.getArity()).isEqualTo(1); + assertThat(projected.getString(0)).asString().isEqualTo("test"); + } + + @Test + public void testReorderedFullProjection() throws Exception { + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); + + RowData row = GenericRowData.of(34L, StringData.fromString("test")); + + Schema reordered = + new Schema( + Types.NestedField.optional(1, "data", Types.StringType.get()), + Types.NestedField.required(0, "id", Types.LongType.get())); + + 
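+    // Reading with the reordered schema should return the same values with the columns in the
+    // read schema's order: data at position 0 and id at position 1, as asserted below.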
RowData projected = writeAndRead("full_projection", schema, reordered, row); + + assertThat(projected.getString(0)).asString().isEqualTo("test"); + assertThat(projected.getLong(1)).isEqualTo(34); + } + + @Test + public void testReorderedProjection() throws Exception { + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); + + RowData row = GenericRowData.of(34L, StringData.fromString("test")); + + Schema reordered = + new Schema( + Types.NestedField.optional(2, "missing_1", Types.StringType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get()), + Types.NestedField.optional(3, "missing_2", Types.LongType.get())); + + RowData projected = writeAndRead("full_projection", schema, reordered, row); + + assertThat(projected.isNullAt(0)).isTrue(); + assertThat(projected.getString(1)).asString().isEqualTo("test"); + assertThat(projected.isNullAt(2)).isTrue(); + } + + @Test + public void testRenamedAddedField() throws Exception { + Schema schema = + new Schema( + Types.NestedField.required(1, "a", Types.LongType.get()), + Types.NestedField.required(2, "b", Types.LongType.get()), + Types.NestedField.required(3, "d", Types.LongType.get())); + + RowData row = GenericRowData.of(100L, 200L, 300L); + + Schema renamedAdded = + new Schema( + Types.NestedField.optional(1, "a", Types.LongType.get()), + Types.NestedField.optional(2, "b", Types.LongType.get()), + Types.NestedField.optional(3, "c", Types.LongType.get()), + Types.NestedField.optional(4, "d", Types.LongType.get())); + + RowData projected = writeAndRead("rename_and_add_column_projection", schema, renamedAdded, row); + assertThat(projected.getLong(0)) + .as("Should contain the correct value in column 1") + .isEqualTo(100L); + assertThat(projected.getLong(1)) + .as("Should contain the correct value in column 2") + .isEqualTo(200L); + assertThat(projected.getLong(2)) + .as("Should contain the correct value in column 1") + .isEqualTo(300L); + assertThat(projected.isNullAt(3)).as("Should contain empty value on new column 4").isTrue(); + } + + @Test + public void testEmptyProjection() throws Exception { + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); + + RowData row = GenericRowData.of(34L, StringData.fromString("test")); + + RowData projected = writeAndRead("empty_projection", schema, schema.select(), row); + + assertThat(projected).isNotNull(); + assertThat(projected.getArity()).isEqualTo(0); + } + + @Test + public void testBasicProjection() throws Exception { + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); + + RowData row = GenericRowData.of(34L, StringData.fromString("test")); + + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); + + RowData projected = writeAndRead("basic_projection_id", writeSchema, idOnly, row); + assertThat(projected.getArity()).as("Should not project data").isEqualTo(1); + assertThat(projected.getLong(0)).isEqualTo(34L); + + Schema dataOnly = new Schema(Types.NestedField.optional(1, "data", Types.StringType.get())); + + projected = writeAndRead("basic_projection_data", writeSchema, dataOnly, row); + + assertThat(projected.getArity()).as("Should not project id").isEqualTo(1); + int cmp = Comparators.charSequences().compare("test", 
projected.getString(0).toString()); + assertThat(projected.getString(0)).asString().isEqualTo("test"); + } + + @Test + public void testRename() throws Exception { + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); + + RowData row = GenericRowData.of(34L, StringData.fromString("test")); + + Schema readSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "renamed", Types.StringType.get())); + + RowData projected = writeAndRead("project_and_rename", writeSchema, readSchema, row); + + assertThat(projected.getLong(0)).isEqualTo(34L); + assertThat(projected.getString(1)) + .as("Should contain the correct data/renamed value") + .asString() + .isEqualTo("test"); + } + + @Test + public void testNestedStructProjection() throws Exception { + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 3, + "location", + Types.StructType.of( + Types.NestedField.required(1, "lat", Types.FloatType.get()), + Types.NestedField.required(2, "long", Types.FloatType.get())))); + + RowData location = GenericRowData.of(52.995143f, -1.539054f); + RowData record = GenericRowData.of(34L, location); + + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); + + RowData projected = writeAndRead("id_only", writeSchema, idOnly, record); + assertThat(projected.getArity()).isEqualTo(1); + assertThat(projected.getLong(0)).as("Should contain the correct id value").isEqualTo(34L); + + Schema latOnly = + new Schema( + Types.NestedField.optional( + 3, + "location", + Types.StructType.of(Types.NestedField.required(1, "lat", Types.FloatType.get())))); + + projected = writeAndRead("latitude_only", writeSchema, latOnly, record); + RowData projectedLocation = projected.getRow(0, 1); + assertThat(projected.getArity()).as("Should not project id").isEqualTo(1); + assertThat(projected.isNullAt(0)).as("Should project location").isFalse(); + assertThat(projectedLocation.getArity()).as("Should not project longitude").isEqualTo(1); + assertThat(projectedLocation.getFloat(0)) + .as("Should project latitude") + .isEqualTo(52.995143f, withPrecision(0.000001f)); + + Schema longOnly = + new Schema( + Types.NestedField.optional( + 3, + "location", + Types.StructType.of(Types.NestedField.required(2, "long", Types.FloatType.get())))); + + projected = writeAndRead("longitude_only", writeSchema, longOnly, record); + projectedLocation = projected.getRow(0, 1); + assertThat(projected.getArity()).as("Should not project id").isEqualTo(1); + assertThat(projected.isNullAt(0)).as("Should project location").isFalse(); + assertThat(projectedLocation.getArity()).as("Should not project latitutde").isEqualTo(1); + assertThat(projectedLocation.getFloat(0)) + .as("Should project longitude") + .isEqualTo(-1.539054f, withPrecision(0.000001f)); + + Schema locationOnly = writeSchema.select("location"); + projected = writeAndRead("location_only", writeSchema, locationOnly, record); + projectedLocation = projected.getRow(0, 1); + assertThat(projected.getArity()).as("Should not project id").isEqualTo(1); + assertThat(projected.isNullAt(0)).as("Should project location").isFalse(); + assertThat(projectedLocation.getFloat(0)) + .as("Should project latitude") + .isEqualTo(52.995143f, withPrecision(0.000001f)); + assertThat(projectedLocation.getFloat(1)) + .as("Should project longitude") + 
.isEqualTo(-1.539054f, withPrecision(0.000001f)); + } + + @Test + public void testMapProjection() throws IOException { + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 5, + "properties", + Types.MapType.ofOptional(6, 7, Types.StringType.get(), Types.StringType.get()))); + + GenericMapData properties = + new GenericMapData( + ImmutableMap.of( + StringData.fromString("a"), + StringData.fromString("A"), + StringData.fromString("b"), + StringData.fromString("B"))); + + RowData row = GenericRowData.of(34L, properties); + + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); + + RowData projected = writeAndRead("id_only", writeSchema, idOnly, row); + assertThat(projected.getLong(0)).isEqualTo(34L); + assertThat(projected.getArity()).as("Should not project properties map").isEqualTo(1); + + Schema keyOnly = writeSchema.select("properties.key"); + projected = writeAndRead("key_only", writeSchema, keyOnly, row); + assertThat(projected.getArity()).as("Should not project id").isEqualTo(1); + assertThat(projected.getMap(0)).isEqualTo(properties); + + Schema valueOnly = writeSchema.select("properties.value"); + projected = writeAndRead("value_only", writeSchema, valueOnly, row); + assertThat(projected.getArity()).as("Should not project id").isEqualTo(1); + assertThat(projected.getMap(0)).isEqualTo(properties); + + Schema mapOnly = writeSchema.select("properties"); + projected = writeAndRead("map_only", writeSchema, mapOnly, row); + assertThat(projected.getArity()).as("Should not project id").isEqualTo(1); + assertThat(projected.getMap(0)).isEqualTo(properties); + } + + private Map toStringMap(Map map) { + Map stringMap = Maps.newHashMap(); + for (Map.Entry entry : map.entrySet()) { + if (entry.getValue() instanceof CharSequence) { + stringMap.put(entry.getKey().toString(), entry.getValue().toString()); + } else { + stringMap.put(entry.getKey().toString(), entry.getValue()); + } + } + return stringMap; + } + + @Test + public void testMapOfStructsProjection() throws IOException { + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 5, + "locations", + Types.MapType.ofOptional( + 6, + 7, + Types.StringType.get(), + Types.StructType.of( + Types.NestedField.required(1, "lat", Types.FloatType.get()), + Types.NestedField.required(2, "long", Types.FloatType.get()))))); + + RowData l1 = GenericRowData.of(53.992811f, -1.542616f); + RowData l2 = GenericRowData.of(52.995143f, -1.539054f); + GenericMapData map = + new GenericMapData( + ImmutableMap.of(StringData.fromString("L1"), l1, StringData.fromString("L2"), l2)); + RowData row = GenericRowData.of(34L, map); + + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); + + RowData projected = writeAndRead("id_only", writeSchema, idOnly, row); + assertThat(projected.getLong(0)).isEqualTo(34L); + assertThat(projected.getArity()).as("Should not project locations map").isEqualTo(1); + + projected = writeAndRead("all_locations", writeSchema, writeSchema.select("locations"), row); + assertThat(projected.getArity()).as("Should not project id").isEqualTo(1); + assertThat(projected.getMap(0)).isEqualTo(row.getMap(1)); + + projected = writeAndRead("lat_only", writeSchema, writeSchema.select("locations.lat"), row); + GenericMapData locations = (GenericMapData) projected.getMap(0); + assertThat(locations).isNotNull(); + GenericArrayData l1l2Array = + new 
GenericArrayData( + new Object[] {StringData.fromString("L2"), StringData.fromString("L1")}); + assertThat(locations.keyArray()).isEqualTo(l1l2Array); + RowData projectedL1 = (RowData) locations.get(StringData.fromString("L1")); + assertThat(projectedL1).isNotNull(); + assertThat(projectedL1.getFloat(0)) + .as("L1 should contain lat") + .isEqualTo(53.992811f, withPrecision(0.000001f)); + assertThat(projectedL1.getArity()).as("L1 should not contain long").isEqualTo(1); + RowData projectedL2 = (RowData) locations.get(StringData.fromString("L2")); + assertThat(projectedL2).isNotNull(); + assertThat(projectedL2.getFloat(0)) + .as("L2 should contain lat") + .isEqualTo(52.995143f, withPrecision(0.000001f)); + assertThat(projectedL2.getArity()).as("L2 should not contain long").isEqualTo(1); + + projected = writeAndRead("long_only", writeSchema, writeSchema.select("locations.long"), row); + assertThat(projected.getArity()).as("Should not project id").isEqualTo(1); + locations = (GenericMapData) projected.getMap(0); + assertThat(locations).isNotNull(); + assertThat(locations.keyArray()).isEqualTo(l1l2Array); + projectedL1 = (RowData) locations.get(StringData.fromString("L1")); + assertThat(projectedL1).isNotNull(); + assertThat(projectedL1.getArity()).as("L1 should not contain lat").isEqualTo(1); + assertThat(projectedL1.getFloat(0)) + .as("L1 should contain long") + .isEqualTo(-1.542616f, withPrecision(0.000001f)); + projectedL2 = (RowData) locations.get(StringData.fromString("L2")); + assertThat(projectedL2).isNotNull(); + assertThat(projectedL2.getArity()).as("L2 should not contain lat").isEqualTo(1); + assertThat(projectedL2.getFloat(0)) + .as("L2 should contain long") + .isEqualTo(-1.539054f, withPrecision(0.000001f)); + + Schema latitiudeRenamed = + new Schema( + Types.NestedField.optional( + 5, + "locations", + Types.MapType.ofOptional( + 6, + 7, + Types.StringType.get(), + Types.StructType.of( + Types.NestedField.required(1, "latitude", Types.FloatType.get()))))); + + projected = writeAndRead("latitude_renamed", writeSchema, latitiudeRenamed, row); + assertThat(projected.getArity()).as("Should not project id").isEqualTo(1); + locations = (GenericMapData) projected.getMap(0); + assertThat(locations).isNotNull(); + assertThat(locations.keyArray()).isEqualTo(l1l2Array); + projectedL1 = (RowData) locations.get(StringData.fromString("L1")); + assertThat(projectedL1).isNotNull(); + assertThat(projectedL1.getFloat(0)) + .as("L1 should contain latitude") + .isEqualTo(53.992811f, withPrecision(0.000001f)); + projectedL2 = (RowData) locations.get(StringData.fromString("L2")); + assertThat(projectedL2).isNotNull(); + assertThat(projectedL2.getFloat(0)) + .as("L2 should contain latitude") + .isEqualTo(52.995143f, withPrecision(0.000001f)); + } + + @Test + public void testListProjection() throws IOException { + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 10, "values", Types.ListType.ofOptional(11, Types.LongType.get()))); + + GenericArrayData values = new GenericArrayData(new Long[] {56L, 57L, 58L}); + + RowData row = GenericRowData.of(34L, values); + + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); + + RowData projected = writeAndRead("id_only", writeSchema, idOnly, row); + assertThat(projected.getLong(0)).isEqualTo(34L); + assertThat(projected.getArity()).as("Should not project values list").isEqualTo(1); + + Schema elementOnly = writeSchema.select("values.element"); + projected 
= writeAndRead("element_only", writeSchema, elementOnly, row); + assertThat(projected.getArity()).as("Should not project id").isEqualTo(1); + assertThat(projected.getArray(0)).isEqualTo(values); + + Schema listOnly = writeSchema.select("values"); + projected = writeAndRead("list_only", writeSchema, listOnly, row); + assertThat(projected.getArity()).as("Should not project id").isEqualTo(1); + assertThat(projected.getArray(0)).isEqualTo(values); + } + + @Test + @SuppressWarnings("unchecked") + public void testListOfStructsProjection() throws IOException { + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 22, + "points", + Types.ListType.ofOptional( + 21, + Types.StructType.of( + Types.NestedField.required(19, "x", Types.IntegerType.get()), + Types.NestedField.optional(18, "y", Types.IntegerType.get()))))); + + RowData p1 = GenericRowData.of(1, 2); + RowData p2 = GenericRowData.of(3, null); + GenericArrayData arrayData = new GenericArrayData(new RowData[] {p1, p2}); + RowData row = GenericRowData.of(34L, arrayData); + + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); + + RowData projected = writeAndRead("id_only", writeSchema, idOnly, row); + assertThat(projected.getLong(0)).isEqualTo(34L); + assertThat(projected.getArity()).isEqualTo(1); + + projected = writeAndRead("all_points", writeSchema, writeSchema.select("points"), row); + assertThat(projected.getArity()).as("Should not project id").isEqualTo(1); + assertThat(projected.getArray(0)).isEqualTo(row.getArray(1)); + + projected = writeAndRead("x_only", writeSchema, writeSchema.select("points.x"), row); + assertThat(projected.getArity()).as("Should not project id").isEqualTo(1); + assertThat(projected.isNullAt(0)).isFalse(); + ArrayData points = projected.getArray(0); + assertThat(points.size()).isEqualTo(2); + RowData projectedP1 = points.getRow(0, 2); + assertThat(projectedP1.getInt(0)).as("Should project x").isEqualTo(1); + assertThat(projectedP1.getArity()).as("Should not project y").isEqualTo(1); + RowData projectedP2 = points.getRow(1, 2); + assertThat(projectedP2.getArity()).as("Should not project y").isEqualTo(1); + assertThat(projectedP2.getInt(0)).as("Should project x").isEqualTo(3); + + projected = writeAndRead("y_only", writeSchema, writeSchema.select("points.y"), row); + assertThat(projected.getArity()).as("Should not project id").isEqualTo(1); + assertThat(projected.isNullAt(0)).isFalse(); + points = projected.getArray(0); + assertThat(points.size()).isEqualTo(2); + projectedP1 = points.getRow(0, 2); + assertThat(projectedP1.getArity()).as("Should not project x").isEqualTo(1); + assertThat(projectedP1.getInt(0)).as("Should project y").isEqualTo(2); + projectedP2 = points.getRow(1, 2); + assertThat(projectedP2.getArity()).as("Should not project x").isEqualTo(1); + assertThat(projectedP2.isNullAt(0)).as("Should project null y").isTrue(); + + Schema yRenamed = + new Schema( + Types.NestedField.optional( + 22, + "points", + Types.ListType.ofOptional( + 21, + Types.StructType.of( + Types.NestedField.optional(18, "z", Types.IntegerType.get()))))); + + projected = writeAndRead("y_renamed", writeSchema, yRenamed, row); + assertThat(projected.getArity()).as("Should not project id").isEqualTo(1); + assertThat(projected.isNullAt(0)).isFalse(); + points = projected.getArray(0); + assertThat(points.size()).isEqualTo(2); + projectedP1 = points.getRow(0, 2); + assertThat(projectedP1.getArity()).as("Should not project x and 
y").isEqualTo(1); + assertThat(projectedP1.getInt(0)).as("Should project z").isEqualTo(2); + projectedP2 = points.getRow(1, 2); + assertThat(projectedP2.getArity()).as("Should not project x and y").isEqualTo(1); + assertThat(projectedP2.isNullAt(0)).as("Should project null z").isTrue(); + } + + @Test + public void testAddedFieldsWithRequiredChildren() throws Exception { + Schema schema = new Schema(Types.NestedField.required(1, "a", Types.LongType.get())); + + RowData row = GenericRowData.of(100L); + + Schema addedFields = + new Schema( + Types.NestedField.optional(1, "a", Types.LongType.get()), + Types.NestedField.optional( + 2, + "b", + Types.StructType.of(Types.NestedField.required(3, "c", Types.LongType.get()))), + Types.NestedField.optional(4, "d", Types.ListType.ofRequired(5, Types.LongType.get())), + Types.NestedField.optional( + 6, + "e", + Types.MapType.ofRequired(7, 8, Types.LongType.get(), Types.LongType.get()))); + + RowData projected = + writeAndRead("add_fields_with_required_children_projection", schema, addedFields, row); + assertThat(projected.getLong(0)) + .as("Should contain the correct value in column 1") + .isEqualTo(100L); + assertThat(projected.isNullAt(1)).as("Should contain empty value in new column 2").isTrue(); + assertThat(projected.isNullAt(2)).as("Should contain empty value in new column 4").isTrue(); + assertThat(projected.isNullAt(3)).as("Should contain empty value in new column 6").isTrue(); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/TestStructRowData.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/TestStructRowData.java new file mode 100644 index 000000000000..eccab20e04fc --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/data/TestStructRowData.java @@ -0,0 +1,100 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.data; + +import org.apache.iceberg.data.GenericRecord; +import org.apache.iceberg.flink.DataGenerator; +import org.apache.iceberg.flink.DataGenerators; +import org.apache.iceberg.flink.TestHelpers; +import org.junit.jupiter.api.Test; + +public class TestStructRowData { + + protected void testConverter(DataGenerator dataGenerator) { + StructRowData converter = new StructRowData(dataGenerator.icebergSchema().asStruct()); + GenericRecord expected = dataGenerator.generateIcebergGenericRecord(); + StructRowData actual = converter.setStruct(expected); + TestHelpers.assertRowData(dataGenerator.icebergSchema(), expected, actual); + } + + @Test + public void testPrimitiveTypes() { + testConverter(new DataGenerators.Primitives()); + } + + @Test + public void testStructOfPrimitive() { + testConverter(new DataGenerators.StructOfPrimitive()); + } + + @Test + public void testStructOfArray() { + testConverter(new DataGenerators.StructOfArray()); + } + + @Test + public void testStructOfMap() { + testConverter(new DataGenerators.StructOfMap()); + } + + @Test + public void testStructOfStruct() { + testConverter(new DataGenerators.StructOfStruct()); + } + + @Test + public void testArrayOfPrimitive() { + testConverter(new DataGenerators.ArrayOfPrimitive()); + } + + @Test + public void testArrayOfArray() { + testConverter(new DataGenerators.ArrayOfArray()); + } + + @Test + public void testArrayOfMap() { + testConverter(new DataGenerators.ArrayOfMap()); + } + + @Test + public void testArrayOfStruct() { + testConverter(new DataGenerators.ArrayOfStruct()); + } + + @Test + public void testMapOfPrimitives() { + testConverter(new DataGenerators.MapOfPrimitives()); + } + + @Test + public void testMapOfArray() { + testConverter(new DataGenerators.MapOfArray()); + } + + @Test + public void testMapOfMap() { + testConverter(new DataGenerators.MapOfMap()); + } + + @Test + public void testMapOfStruct() { + testConverter(new DataGenerators.MapOfStruct()); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/CollectingSink.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/CollectingSink.java new file mode 100644 index 000000000000..9b6580fad0bf --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/CollectingSink.java @@ -0,0 +1,115 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
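TestStructRowData above boils down to a single conversion path: wrap an Iceberg GenericRecord in a StructRowData and read it through Flink's RowData interface. A minimal sketch, assuming the same test package and the DataGenerators helpers used by the test; the choice of generator is illustrative.

DataGenerator generator = new DataGenerators.Primitives(); // any generator in DataGenerators works the same way
GenericRecord icebergRecord = generator.generateIcebergGenericRecord();

// StructRowData exposes the Iceberg record through Flink's RowData accessors.
StructRowData rowData =
    new StructRowData(generator.icebergSchema().asStruct()).setStruct(icebergRecord);

// The adapted record can be compared field by field against the expected Flink representation.
TestHelpers.assertRowData(generator.icebergSchema(), icebergRecord, rowData);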
+ */ +package org.apache.iceberg.flink.maintenance.operator; + +import java.time.Duration; +import java.util.Collections; +import java.util.List; +import java.util.concurrent.BlockingQueue; +import java.util.concurrent.LinkedBlockingQueue; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; +import java.util.concurrent.atomic.AtomicInteger; +import org.apache.flink.api.connector.sink2.Sink; +import org.apache.flink.api.connector.sink2.SinkWriter; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; + +/** Sink for collecting output during testing. */ +class CollectingSink implements Sink { + private static final long serialVersionUID = 1L; + private static final List> QUEUES = + Collections.synchronizedList(Lists.newArrayListWithExpectedSize(1)); + private static final AtomicInteger NUM_SINKS = new AtomicInteger(-1); + private final int index; + + /** Creates a new sink which collects the elements received. */ + CollectingSink() { + this.index = NUM_SINKS.incrementAndGet(); + QUEUES.add(new LinkedBlockingQueue<>()); + } + + /** + * Gets all the remaining output received by this {@link Sink}. + * + * @return all the remaining output + */ + List remainingOutput() { + return Lists.newArrayList((BlockingQueue) QUEUES.get(this.index)); + } + + /** + * Check if there is no remaining output received by this {@link Sink}. + * + * @return true if there is no remaining output + */ + boolean isEmpty() { + return QUEUES.get(this.index).isEmpty(); + } + + /** + * Wait until the next element received by the {@link Sink}. + * + * @param timeout for the poll + * @return The first element received by this {@link Sink} + * @throws TimeoutException if no element received until the timeout + */ + T poll(Duration timeout) throws TimeoutException { + Object element; + + try { + element = QUEUES.get(this.index).poll(timeout.toMillis(), TimeUnit.MILLISECONDS); + } catch (InterruptedException var4) { + throw new RuntimeException(var4); + } + + if (element == null) { + throw new TimeoutException(); + } else { + return (T) element; + } + } + + @Override + public SinkWriter createWriter(InitContext context) { + return new CollectingWriter<>(index); + } + + private static class CollectingWriter implements SinkWriter { + private final int index; + + CollectingWriter(int index) { + this.index = index; + } + + @Override + public void write(T element, Context context) { + QUEUES.get(index).add(element); + } + + @Override + public void flush(boolean endOfInput) { + // Nothing to do here + } + + @Override + public void close() { + // Nothing to do here + } + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/FlinkSqlExtension.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/FlinkSqlExtension.java new file mode 100644 index 000000000000..91d36aa3e85d --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/FlinkSqlExtension.java @@ -0,0 +1,135 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
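CollectingSink above is consumed by the streaming tests later in this patch roughly as follows. A minimal sketch, assuming a test method declared to throw Exception; the sample stream (built from TableChange, the event type used by the MonitorSource tests below) and the job name are illustrative.

StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
CollectingSink<TableChange> collectingSink = new CollectingSink<>();

// Stands in for whatever DataStream<TableChange> the test under construction produces.
DataStream<TableChange> events = env.fromElements(TableChange.empty());
events.sinkTo(collectingSink);

JobClient jobClient = env.executeAsync("collecting-sink-sketch");
try {
  // poll() blocks until an element arrives or the timeout elapses, then throws TimeoutException.
  TableChange first = collectingSink.poll(Duration.ofSeconds(5));
  assertThat(first).isEqualTo(TableChange.empty());
} finally {
  jobClient.cancel();
}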
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.maintenance.operator; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Comparator; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; +import java.util.stream.Stream; +import org.apache.flink.table.api.EnvironmentSettings; +import org.apache.flink.table.api.TableEnvironment; +import org.apache.flink.table.api.TableResult; +import org.apache.flink.types.Row; +import org.apache.flink.util.CloseableIterator; +import org.apache.hadoop.conf.Configuration; +import org.apache.iceberg.CatalogProperties; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.flink.CatalogLoader; +import org.apache.iceberg.flink.TableLoader; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.junit.jupiter.api.extension.AfterEachCallback; +import org.junit.jupiter.api.extension.BeforeEachCallback; +import org.junit.jupiter.api.extension.ExtensionContext; + +/** + * Junit 5 extension for running Flink SQL queries. {@link + * org.apache.flink.test.junit5.MiniClusterExtension} is used for executing the SQL batch jobs. + */ +public class FlinkSqlExtension implements BeforeEachCallback, AfterEachCallback { + private final String catalogName; + private final Map catalogProperties; + private final String databaseName; + private final Path warehouse; + private final CatalogLoader catalogLoader; + private TableEnvironment tableEnvironment; + + public FlinkSqlExtension( + String catalogName, Map catalogProperties, String databaseName) { + this.catalogName = catalogName; + this.catalogProperties = Maps.newHashMap(catalogProperties); + this.databaseName = databaseName; + + // Add temporary dir as a warehouse location + try { + this.warehouse = Files.createTempDirectory("warehouse"); + } catch (IOException e) { + throw new RuntimeException(e); + } + this.catalogProperties.put( + CatalogProperties.WAREHOUSE_LOCATION, String.format("file://%s", warehouse)); + this.catalogLoader = + CatalogLoader.hadoop(catalogName, new Configuration(), this.catalogProperties); + } + + @Override + public void beforeEach(ExtensionContext context) { + // We need to recreate the tableEnvironment for every test as the minicluster is recreated + this.tableEnvironment = + TableEnvironment.create(EnvironmentSettings.newInstance().inBatchMode().build()); + exec("CREATE CATALOG %s WITH %s", catalogName, toWithClause(catalogProperties)); + exec("CREATE DATABASE %s.%s", catalogName, databaseName); + exec("USE CATALOG %s", catalogName); + exec("USE %s", databaseName); + } + + @Override + public void afterEach(ExtensionContext context) throws IOException { + List tables = exec("SHOW TABLES"); + tables.forEach(t -> exec("DROP TABLE IF EXISTS %s", t.getField(0))); + exec("USE CATALOG default_catalog"); + exec("DROP CATALOG IF EXISTS %s", catalogName); + try (Stream files = Files.walk(warehouse)) { + 
files.sorted(Comparator.reverseOrder()).map(Path::toFile).forEach(File::delete); + } + } + + /** + * Executes an SQL query with the given parameters. The parameter substitution is done by {@link + * String#format(String, Object...)}. + * + * @param query to run + * @param parameters to substitute to the query + * @return The {@link Row}s returned by the query + */ + public List exec(String query, Object... parameters) { + TableResult tableResult = tableEnvironment.executeSql(String.format(query, parameters)); + try (CloseableIterator iter = tableResult.collect()) { + return Lists.newArrayList(iter); + } catch (Exception e) { + throw new RuntimeException("Failed to collect table result", e); + } + } + + /** + * Returns the {@link TableLoader} which could be used to access the given table. + * + * @param tableName of the table + * @return the {@link TableLoader} for the table + */ + public TableLoader tableLoader(String tableName) { + TableLoader tableLoader = + TableLoader.fromCatalog(catalogLoader, TableIdentifier.of(databaseName, tableName)); + tableLoader.open(); + return tableLoader; + } + + private static String toWithClause(Map props) { + return String.format( + "(%s)", + props.entrySet().stream() + .map(e -> String.format("'%s'='%s'", e.getKey(), e.getValue())) + .collect(Collectors.joining(","))); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/FlinkStreamingTestUtils.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/FlinkStreamingTestUtils.java new file mode 100644 index 000000000000..9cdc55cb0cce --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/FlinkStreamingTestUtils.java @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.maintenance.operator; + +import java.io.File; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.core.execution.JobClient; +import org.apache.flink.core.execution.SavepointFormatType; +import org.apache.flink.runtime.jobgraph.SavepointConfigOptions; +import org.awaitility.Awaitility; + +class FlinkStreamingTestUtils { + private FlinkStreamingTestUtils() { + // Do not instantiate + } + + /** + * Close the {@link JobClient} and wait for the job closure. If the savepointDir is specified, it + * stops the job with a savepoint. + * + * @param jobClient the job to close + * @param savepointDir the savepointDir to store the last savepoint. If null then + * stop without a savepoint. 
+ * @return configuration for restarting the job from the savepoint + */ + static Configuration closeJobClient(JobClient jobClient, File savepointDir) { + Configuration conf = new Configuration(); + if (jobClient != null) { + if (savepointDir != null) { + // Stop with savepoint + jobClient.stopWithSavepoint(false, savepointDir.getPath(), SavepointFormatType.CANONICAL); + // Wait until the savepoint is created and the job has been stopped + Awaitility.await().until(() -> savepointDir.listFiles(File::isDirectory).length == 1); + conf.set( + SavepointConfigOptions.SAVEPOINT_PATH, + savepointDir.listFiles(File::isDirectory)[0].getAbsolutePath()); + } else { + jobClient.cancel(); + } + + // Wait until the job has been stopped + Awaitility.await().until(() -> jobClient.getJobStatus().get().isTerminalState()); + return conf; + } + + return null; + } + + /** + * Close the {@link JobClient} and wait for the job closure. + * + * @param jobClient the job to close + */ + static void closeJobClient(JobClient jobClient) { + closeJobClient(jobClient, null); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/ManualSource.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/ManualSource.java new file mode 100644 index 000000000000..38bb9c393fa9 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/ManualSource.java @@ -0,0 +1,316 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
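The closeJobClient helpers above encapsulate the stop-with-savepoint / resume cycle that the MonitorSource state-restore test relies on. A minimal sketch of that cycle, assuming a @TempDir File savepointDir and the static import of closeJobClient that the tests use; the topology and job names are illustrative.

StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
// ... build the streaming topology on `env` ...
JobClient jobClient = env.executeAsync("savepoint-sketch");

// Stop the job with a savepoint, wait for termination, and capture the configuration pointing at it.
Configuration restoreConf = closeJobClient(jobClient, savepointDir);

// A new environment seeded with that configuration resumes from the savepoint.
StreamExecutionEnvironment restored = StreamExecutionEnvironment.getExecutionEnvironment(restoreConf);
// ... rebuild the same topology on `restored`, then restored.executeAsync(...) ...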
+ */ +package org.apache.iceberg.flink.maintenance.operator; + +import java.util.ArrayDeque; +import java.util.Collections; +import java.util.List; +import java.util.concurrent.CompletableFuture; +import org.apache.flink.api.common.eventtime.WatermarkStrategy; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.connector.source.Boundedness; +import org.apache.flink.api.connector.source.ReaderOutput; +import org.apache.flink.api.connector.source.Source; +import org.apache.flink.api.connector.source.SourceReader; +import org.apache.flink.api.connector.source.SourceReaderContext; +import org.apache.flink.api.connector.source.SourceSplit; +import org.apache.flink.api.connector.source.SplitEnumerator; +import org.apache.flink.api.connector.source.SplitEnumeratorContext; +import org.apache.flink.api.java.tuple.Tuple2; +import org.apache.flink.api.java.typeutils.ResultTypeQueryable; +import org.apache.flink.core.io.InputStatus; +import org.apache.flink.core.io.SimpleVersionedSerializer; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Queues; +import org.jetbrains.annotations.Nullable; + +/** Testing source implementation for Flink sources which can be triggered manually. */ +class ManualSource + implements Source, + ResultTypeQueryable { + + private static final long serialVersionUID = 1L; + private static final List>> QUEUES = + Collections.synchronizedList(Lists.newArrayList()); + private static final List> AVAILABILITIES = + Collections.synchronizedList(Lists.newArrayList()); + private static int numSources = 0; + private final TypeInformation type; + private final int index; + private transient DataStream stream; + private final transient StreamExecutionEnvironment env; + + /** + * Creates a new source for testing. + * + * @param env to register the source + * @param type of the events returned by the source + */ + ManualSource(StreamExecutionEnvironment env, TypeInformation type) { + this.type = type; + this.env = env; + this.index = numSources++; + QUEUES.add(Queues.newArrayDeque()); + AVAILABILITIES.add(new CompletableFuture<>()); + } + + /** + * Emit a new record from the source. + * + * @param event to emit + */ + void sendRecord(T event) { + this.sendInternal(Tuple2.of(event, null)); + } + + /** + * Emit a new record with the given event time from the source. + * + * @param event to emit + * @param eventTime of the event + */ + void sendRecord(T event, long eventTime) { + this.sendInternal(Tuple2.of(event, eventTime)); + } + + /** + * Emit a watermark from the source. + * + * @param timeStamp of the watermark + */ + void sendWatermark(long timeStamp) { + this.sendInternal(Tuple2.of(null, timeStamp)); + } + + /** Mark the source as finished. */ + void markFinished() { + this.sendWatermark(Long.MAX_VALUE); + this.sendInternal(Tuple2.of(null, null)); + } + + /** + * Get the {@link DataStream} for this source. 
+ * + * @return the stream emitted by this source + */ + DataStream dataStream() { + if (this.stream == null) { + this.stream = + this.env + .fromSource(this, WatermarkStrategy.noWatermarks(), "ManualSource-" + index, type) + .forceNonParallel(); + } + + return this.stream; + } + + private void sendInternal(Tuple2 tuple) { + QUEUES.get(index).offer(tuple); + AVAILABILITIES.get(index).complete(null); + } + + @Override + public Boundedness getBoundedness() { + return Boundedness.CONTINUOUS_UNBOUNDED; + } + + @Override + public SplitEnumerator createEnumerator( + SplitEnumeratorContext enumContext) { + return new DummyCheckpointEnumerator(); + } + + @Override + public SplitEnumerator restoreEnumerator( + SplitEnumeratorContext enumContext, DummyCheckpoint checkpoint) { + return new DummyCheckpointEnumerator(); + } + + @Override + public SimpleVersionedSerializer getSplitSerializer() { + return new NoOpDummySplitSerializer(); + } + + @Override + public SimpleVersionedSerializer getEnumeratorCheckpointSerializer() { + return new NoOpDummyCheckpointSerializer(); + } + + @Override + public SourceReader createReader(SourceReaderContext sourceReaderContext) { + return new SourceReader() { + @Override + public void start() { + // Do nothing + } + + @Override + public InputStatus pollNext(ReaderOutput output) { + Tuple2 next = (Tuple2) QUEUES.get(index).poll(); + + if (next != null) { + if (next.f0 == null) { + // No more input + return InputStatus.END_OF_INPUT; + } + + if (next.f1 == null) { + // No event time set + output.collect(next.f0); + } else { + // With event time + output.collect(next.f0, next.f1); + } + } + + AVAILABILITIES.set(index, new CompletableFuture<>()); + return QUEUES.get(index).isEmpty() + ? InputStatus.NOTHING_AVAILABLE + : InputStatus.MORE_AVAILABLE; + } + + @Override + public List snapshotState(long checkpointId) { + return Lists.newArrayList(new DummySplit()); + } + + @Override + public CompletableFuture isAvailable() { + return AVAILABILITIES.get(index); + } + + @Override + public void addSplits(List splits) { + // do nothing + } + + @Override + public void notifyNoMoreSplits() { + // do nothing + } + + @Override + public void close() { + // do nothing + } + }; + } + + @Override + public TypeInformation getProducedType() { + return this.type; + } + + /** + * Placeholder because the ManualSource itself implicitly represents the only split and does not + * require an actual split object. + */ + public static class DummySplit implements SourceSplit { + @Override + public String splitId() { + return "dummy"; + } + } + + /** + * Placeholder because the ManualSource does not support fault-tolerance and thus does not require + * actual checkpointing. + */ + public static class DummyCheckpoint {} + + /** Placeholder because the ManualSource does not need enumeration, but checkpointing needs it. */ + private static class DummyCheckpointEnumerator + implements SplitEnumerator { + + @Override + public void start() { + // do nothing + } + + @Override + public void handleSplitRequest(int subtaskId, @Nullable String requesterHostname) { + // do nothing + } + + @Override + public void addSplitsBack(List splits, int subtaskId) { + // do nothing + } + + @Override + public void addReader(int subtaskId) { + // do nothing + } + + @Override + public DummyCheckpoint snapshotState(long checkpointId) { + return new DummyCheckpoint(); + } + + @Override + public void close() { + // do nothing + } + } + + /** + * Not used - only required to avoid NullPointerException. 
The split is not transferred from the + * enumerator, it is implicitly represented by the ManualSource. + */ + private static class NoOpDummySplitSerializer implements SimpleVersionedSerializer { + @Override + public int getVersion() { + return 0; + } + + @Override + public byte[] serialize(DummySplit split) { + return new byte[0]; + } + + @Override + public DummySplit deserialize(int version, byte[] serialized) { + return new DummySplit(); + } + } + + /** + * Not used - only required to avoid NullPointerException. The split is not transferred from the + * enumerator, it is implicitly represented by the ManualSource. + */ + private static class NoOpDummyCheckpointSerializer + implements SimpleVersionedSerializer { + @Override + public int getVersion() { + return 0; + } + + @Override + public byte[] serialize(DummyCheckpoint split) { + return new byte[0]; + } + + @Override + public DummyCheckpoint deserialize(int version, byte[] serialized) { + return new DummyCheckpoint(); + } + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/OperatorTestBase.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/OperatorTestBase.java new file mode 100644 index 000000000000..272e0b693fd3 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/OperatorTestBase.java @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
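ManualSource above lets a test push elements, event times and watermarks into a running pipeline on demand. A minimal usage sketch, assuming a test in the same package with a method declared to throw Exception; the element type, values and the printing sink are illustrative.

StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
ManualSource<String> source = new ManualSource<>(env, TypeInformation.of(String.class));

source.dataStream().print(); // any downstream operator can be attached here
JobClient jobClient = env.executeAsync("manual-source-sketch");

source.sendRecord("a");         // emit without an event time
source.sendRecord("b", 1_000L); // emit with an explicit event time
source.sendWatermark(2_000L);   // advance event time downstream
source.markFinished();          // signals END_OF_INPUT to the reader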
+ */ +package org.apache.iceberg.flink.maintenance.operator; + +import static org.apache.iceberg.flink.MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG; + +import org.apache.flink.configuration.Configuration; +import org.apache.flink.runtime.testutils.MiniClusterResourceConfiguration; +import org.apache.flink.test.junit5.MiniClusterExtension; +import org.apache.iceberg.flink.FlinkCatalogFactory; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.junit.jupiter.api.extension.RegisterExtension; + +class OperatorTestBase { + private static final int NUMBER_TASK_MANAGERS = 1; + private static final int SLOTS_PER_TASK_MANAGER = 8; + + static final String TABLE_NAME = "test_table"; + + @RegisterExtension + protected static final MiniClusterExtension MINI_CLUSTER_EXTENSION = + new MiniClusterExtension( + new MiniClusterResourceConfiguration.Builder() + .setNumberTaskManagers(NUMBER_TASK_MANAGERS) + .setNumberSlotsPerTaskManager(SLOTS_PER_TASK_MANAGER) + .setConfiguration(new Configuration(DISABLE_CLASSLOADER_CHECK_CONFIG)) + .build()); + + @RegisterExtension + final FlinkSqlExtension sql = + new FlinkSqlExtension( + "catalog", + ImmutableMap.of("type", "iceberg", FlinkCatalogFactory.ICEBERG_CATALOG_TYPE, "hadoop"), + "db"); +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestMonitorSource.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestMonitorSource.java new file mode 100644 index 000000000000..876d64214560 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestMonitorSource.java @@ -0,0 +1,362 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
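OperatorTestBase above wires a MiniClusterExtension and a FlinkSqlExtension together so that subclasses only deal with SQL statements and table loaders. A hypothetical subclass sketch; the class name, table layout and assertions are illustrative and not part of this patch.

class TestSomeOperator extends OperatorTestBase {
  @Test
  void testRoundTrip() {
    sql.exec("CREATE TABLE %s (id int, data varchar)", TABLE_NAME);
    sql.exec("INSERT INTO %s VALUES (1, 'a')", TABLE_NAME);

    // Query results come back as Flink Rows ...
    List<Row> rows = sql.exec("SELECT * FROM %s", TABLE_NAME);
    assertThat(rows).hasSize(1);

    // ... and the same table is reachable through the Iceberg TableLoader API.
    TableLoader tableLoader = sql.tableLoader(TABLE_NAME);
    assertThat(tableLoader.loadTable().currentSnapshot()).isNotNull();
  }
}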
+ */ +package org.apache.iceberg.flink.maintenance.operator; + +import static org.apache.iceberg.flink.maintenance.operator.FlinkStreamingTestUtils.closeJobClient; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.io.File; +import java.time.Duration; +import java.util.List; +import java.util.concurrent.atomic.AtomicReference; +import org.apache.flink.api.common.eventtime.WatermarkStrategy; +import org.apache.flink.api.connector.source.util.ratelimit.RateLimiterStrategy; +import org.apache.flink.configuration.CheckpointingOptions; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.core.execution.JobClient; +import org.apache.flink.runtime.client.JobExecutionException; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.hadoop.fs.Path; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.DeleteFile; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.RewriteFiles; +import org.apache.iceberg.Table; +import org.apache.iceberg.data.GenericAppenderHelper; +import org.apache.iceberg.data.RandomGenericData; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.TableLoader; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.awaitility.Awaitility; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; + +class TestMonitorSource extends OperatorTestBase { + private static final TableChange EMPTY_EVENT = TableChange.empty(); + private static final RateLimiterStrategy HIGH_RATE = RateLimiterStrategy.perSecond(100.0); + private static final RateLimiterStrategy LOW_RATE = RateLimiterStrategy.perSecond(1.0 / 10000.0); + + @TempDir private File checkpointDir; + + @ParameterizedTest + @ValueSource(booleans = {true, false}) + void testChangeReaderIterator(boolean withDelete) { + if (withDelete) { + sql.exec( + "CREATE TABLE %s (id int, data varchar, PRIMARY KEY(`id`) NOT ENFORCED) WITH ('format-version'='2', 'write.upsert.enabled'='true')", + TABLE_NAME); + } else { + sql.exec("CREATE TABLE %s (id int, data varchar)", TABLE_NAME); + } + + TableLoader tableLoader = sql.tableLoader(TABLE_NAME); + tableLoader.open(); + Table table = tableLoader.loadTable(); + + MonitorSource.TableChangeIterator iterator = + new MonitorSource.TableChangeIterator(tableLoader, null, Long.MAX_VALUE); + + // For an empty table we get an empty result + assertThat(iterator.next()).isEqualTo(EMPTY_EVENT); + + // Add a single commit and get back the commit data in the event + sql.exec("INSERT INTO %s VALUES (1, 'a')", TABLE_NAME); + table.refresh(); + TableChange expected = tableChangeWithLastSnapshot(table, TableChange.empty()); + assertThat(iterator.next()).isEqualTo(expected); + // Make sure that consecutive calls do not return the data again + assertThat(iterator.next()).isEqualTo(EMPTY_EVENT); + + // Add two more commits, but fetch the data in one loop + sql.exec("INSERT INTO %s VALUES (2, 'b')", TABLE_NAME); + table.refresh(); + expected = tableChangeWithLastSnapshot(table, TableChange.empty()); + + sql.exec("INSERT INTO %s VALUES (3, 'c')", TABLE_NAME); + table.refresh(); + expected = tableChangeWithLastSnapshot(table, expected); + + assertThat(iterator.next()).isEqualTo(expected); + // Make sure that 
consecutive calls do not return the data again + assertThat(iterator.next()).isEqualTo(EMPTY_EVENT); + } + + /** + * Create a table and check that the source returns the data as new commits arrive to the table. + */ + @Test + void testSource() throws Exception { + sql.exec( + "CREATE TABLE %s (id int, data varchar) " + + "WITH ('flink.max-continuous-empty-commits'='100000')", + TABLE_NAME); + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + TableLoader tableLoader = sql.tableLoader(TABLE_NAME); + tableLoader.open(); + Table table = tableLoader.loadTable(); + DataStream events = + env.fromSource( + new MonitorSource(tableLoader, HIGH_RATE, Long.MAX_VALUE), + WatermarkStrategy.noWatermarks(), + "TableChangeSource") + .forceNonParallel(); + + // Sink to collect the results + CollectingSink result = new CollectingSink<>(); + events.sinkTo(result); + + JobClient jobClient = null; + try { + // First result is an empty event + jobClient = env.executeAsync("Table Change Source Test"); + assertThat(result.poll(Duration.ofSeconds(5L))).isEqualTo(EMPTY_EVENT); + + // Insert some data + File dataDir = new File(new Path(table.location(), "data").toUri().getPath()); + dataDir.mkdir(); + GenericAppenderHelper dataAppender = + new GenericAppenderHelper(table, FileFormat.PARQUET, dataDir.toPath()); + List batch1 = RandomGenericData.generate(table.schema(), 2, 1); + dataAppender.appendToTable(batch1); + + // Wait until the changes are committed + Awaitility.await() + .until( + () -> { + table.refresh(); + return table.currentSnapshot() != null; + }); + + table.refresh(); + long size = firstFileLength(table); + + // Wait until the first non-empty event has arrived, and check the expected result + Awaitility.await() + .until( + () -> { + TableChange newEvent = result.poll(Duration.ofSeconds(5L)); + // Fetch every empty event from the beginning + while (newEvent.equals(EMPTY_EVENT)) { + newEvent = result.poll(Duration.ofSeconds(5L)); + } + + // The first non-empty event should contain the expected value + return newEvent.equals(new TableChange(1, 0, size, 0L, 1)); + }); + } finally { + closeJobClient(jobClient); + } + } + + /** Check that the {@link MonitorSource} operator state is restored correctly. 
*/ + @Test + void testStateRestore(@TempDir File savepointDir) throws Exception { + sql.exec("CREATE TABLE %s (id int, data varchar)", TABLE_NAME); + sql.exec("INSERT INTO %s VALUES (1, 'a')", TABLE_NAME); + + Configuration config = new Configuration(); + config.set(CheckpointingOptions.CHECKPOINT_STORAGE, "filesystem"); + config.set(CheckpointingOptions.CHECKPOINTS_DIRECTORY, "file://" + checkpointDir.getPath()); + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(config); + env.enableCheckpointing(1000); + + TableLoader tableLoader = sql.tableLoader(TABLE_NAME); + tableLoader.open(); + DataStream events = + env.fromSource( + new MonitorSource(tableLoader, HIGH_RATE, Long.MAX_VALUE), + WatermarkStrategy.noWatermarks(), + "TableChangeSource") + .forceNonParallel(); + + // Sink to collect the results + CollectingSink result = new CollectingSink<>(); + events.sinkTo(result); + + // Start the job + Configuration conf; + JobClient jobClient = null; + AtomicReference firstNonEmptyEvent = new AtomicReference<>(); + try { + jobClient = env.executeAsync("Table Change Source Test"); + + Awaitility.await() + .until( + () -> { + TableChange newEvent = result.poll(Duration.ofSeconds(5L)); + // Fetch every empty event from the beginning + while (newEvent.equals(EMPTY_EVENT)) { + newEvent = result.poll(Duration.ofSeconds(5L)); + } + + // The first non-empty event should contain the expected value + firstNonEmptyEvent.set(newEvent); + return true; + }); + } finally { + // Stop with savepoint + conf = closeJobClient(jobClient, savepointDir); + } + + // Restore from savepoint, create the same topology with a different env + env = StreamExecutionEnvironment.getExecutionEnvironment(conf); + events = + env.fromSource( + new MonitorSource(tableLoader, LOW_RATE, Long.MAX_VALUE), + WatermarkStrategy.noWatermarks(), + "TableChangeSource") + .forceNonParallel(); + CollectingSink resultWithSavepoint = new CollectingSink<>(); + events.sinkTo(resultWithSavepoint); + + // Make sure that the job with restored source does not read new records from the table + JobClient clientWithSavepoint = null; + try { + clientWithSavepoint = env.executeAsync("Table Change Source test with savepoint"); + + assertThat(resultWithSavepoint.poll(Duration.ofSeconds(5L))).isEqualTo(EMPTY_EVENT); + } finally { + closeJobClient(clientWithSavepoint, null); + } + + // Restore without savepoint + env = StreamExecutionEnvironment.getExecutionEnvironment(); + events = + env.fromSource( + new MonitorSource(tableLoader, LOW_RATE, Long.MAX_VALUE), + WatermarkStrategy.noWatermarks(), + "TableChangeSource") + .forceNonParallel(); + CollectingSink resultWithoutSavepoint = new CollectingSink<>(); + events.sinkTo(resultWithoutSavepoint); + + // Make sure that a new job without state reads the event as expected + JobClient clientWithoutSavepoint = null; + try { + clientWithoutSavepoint = env.executeAsync("Table Change Source Test without savepoint"); + assertThat(resultWithoutSavepoint.poll(Duration.ofSeconds(5L))) + .isEqualTo(firstNonEmptyEvent.get()); + } finally { + closeJobClient(clientWithoutSavepoint); + } + } + + @Test + void testNotOneParallelismThrows() { + sql.exec("CREATE TABLE %s (id int, data varchar)", TABLE_NAME); + + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + TableLoader tableLoader = sql.tableLoader(TABLE_NAME); + tableLoader.open(); + + env.fromSource( + new MonitorSource(tableLoader, HIGH_RATE, Long.MAX_VALUE), + WatermarkStrategy.noWatermarks(), + 
"TableChangeSource") + .setParallelism(2) + .print(); + + assertThatThrownBy(env::execute) + .isInstanceOf(JobExecutionException.class) + .rootCause() + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Parallelism should be set to 1"); + } + + @Test + void testMaxReadBack() { + sql.exec("CREATE TABLE %s (id int, data varchar)", TABLE_NAME); + sql.exec("INSERT INTO %s VALUES (1, 'a')", TABLE_NAME); + sql.exec("INSERT INTO %s VALUES (2, 'b')", TABLE_NAME); + sql.exec("INSERT INTO %s VALUES (3, 'c')", TABLE_NAME); + + TableLoader tableLoader = sql.tableLoader(TABLE_NAME); + tableLoader.open(); + + MonitorSource.TableChangeIterator iterator = + new MonitorSource.TableChangeIterator(tableLoader, null, 1); + + // For a single maxReadBack we only get a single change + assertThat(iterator.next().commitNum()).isEqualTo(1); + + iterator = new MonitorSource.TableChangeIterator(tableLoader, null, 2); + + // Expecting 2 commits/snapshots for maxReadBack=2 + assertThat(iterator.next().commitNum()).isEqualTo(2); + + iterator = new MonitorSource.TableChangeIterator(tableLoader, null, Long.MAX_VALUE); + + // For maxReadBack Long.MAX_VALUE we get every change + assertThat(iterator.next().commitNum()).isEqualTo(3); + } + + @Test + void testSkipReplace() { + sql.exec("CREATE TABLE %s (id int, data varchar)", TABLE_NAME); + sql.exec("INSERT INTO %s VALUES (1, 'a')", TABLE_NAME); + + TableLoader tableLoader = sql.tableLoader(TABLE_NAME); + tableLoader.open(); + + MonitorSource.TableChangeIterator iterator = + new MonitorSource.TableChangeIterator(tableLoader, null, Long.MAX_VALUE); + + // Read the current snapshot + assertThat(iterator.next().commitNum()).isEqualTo(1); + + // Create a DataOperations.REPLACE snapshot + Table table = tableLoader.loadTable(); + DataFile dataFile = + table.snapshots().iterator().next().addedDataFiles(table.io()).iterator().next(); + RewriteFiles rewrite = tableLoader.loadTable().newRewrite(); + // Replace the file with itself for testing purposes + rewrite.deleteFile(dataFile); + rewrite.addFile(dataFile); + rewrite.commit(); + + // Check that the rewrite is ignored + assertThat(iterator.next()).isEqualTo(EMPTY_EVENT); + } + + private static long firstFileLength(Table table) { + return table.currentSnapshot().addedDataFiles(table.io()).iterator().next().fileSizeInBytes(); + } + + private static TableChange tableChangeWithLastSnapshot(Table table, TableChange previous) { + List dataFiles = + Lists.newArrayList(table.currentSnapshot().addedDataFiles(table.io()).iterator()); + List deleteFiles = + Lists.newArrayList(table.currentSnapshot().addedDeleteFiles(table.io()).iterator()); + + long dataSize = dataFiles.stream().mapToLong(d -> d.fileSizeInBytes()).sum(); + long deleteSize = deleteFiles.stream().mapToLong(d -> d.fileSizeInBytes()).sum(); + boolean hasDelete = table.currentSnapshot().addedDeleteFiles(table.io()).iterator().hasNext(); + + return new TableChange( + previous.dataFileNum() + dataFiles.size(), + previous.deleteFileNum() + deleteFiles.size(), + previous.dataFileSize() + dataSize, + previous.deleteFileSize() + deleteSize, + previous.commitNum() + 1); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestAvroGenericRecordToRowDataMapper.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestAvroGenericRecordToRowDataMapper.java new file mode 100644 index 000000000000..44eb907a17aa --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestAvroGenericRecordToRowDataMapper.java @@ -0,0 
+1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink; + +import static org.assertj.core.api.Assertions.assertThat; + +import org.apache.flink.table.data.RowData; +import org.apache.iceberg.flink.AvroGenericRecordConverterBase; +import org.apache.iceberg.flink.DataGenerator; + +public class TestAvroGenericRecordToRowDataMapper extends AvroGenericRecordConverterBase { + @Override + protected void testConverter(DataGenerator dataGenerator) throws Exception { + // Need to use avroSchema from DataGenerator because some primitive types have special Avro + // type handling. Hence the Avro schema converted from Iceberg schema won't work. + AvroGenericRecordToRowDataMapper mapper = + AvroGenericRecordToRowDataMapper.forAvroSchema(dataGenerator.avroSchema()); + RowData expected = dataGenerator.generateFlinkRowData(); + RowData actual = mapper.map(dataGenerator.generateAvroGenericRecord()); + assertThat(actual).isEqualTo(expected); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionKeySelector.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionKeySelector.java new file mode 100644 index 000000000000..abac605f81fd --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionKeySelector.java @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
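The mapper test above hinges on one call chain: build the mapper from the generator's Avro schema (not from a schema converted back from the Iceberg schema, for the reason given in the test comment) and map an Avro GenericRecord into RowData. A condensed sketch, assuming a test method declared to throw Exception; the generator choice is illustrative.

DataGenerator generator = new DataGenerators.Primitives();

AvroGenericRecordToRowDataMapper mapper =
    AvroGenericRecordToRowDataMapper.forAvroSchema(generator.avroSchema());

RowData rowData = mapper.map(generator.generateAvroGenericRecord());
assertThat(rowData).isEqualTo(generator.generateFlinkRowData());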
+ */ +package org.apache.iceberg.flink.sink; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatExceptionOfType; + +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.flink.SimpleDataUtil; +import org.apache.iceberg.flink.sink.TestBucketPartitionerUtil.TableSchemaType; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.EnumSource; + +public class TestBucketPartitionKeySelector { + + @ParameterizedTest + @EnumSource( + value = TableSchemaType.class, + names = {"ONE_BUCKET", "IDENTITY_AND_BUCKET"}) + public void testCorrectKeySelection(TableSchemaType tableSchemaType) { + int numBuckets = 60; + + PartitionSpec partitionSpec = tableSchemaType.getPartitionSpec(numBuckets); + BucketPartitionKeySelector keySelector = + new BucketPartitionKeySelector( + partitionSpec, SimpleDataUtil.SCHEMA, SimpleDataUtil.ROW_TYPE); + + TestBucketPartitionerUtil.generateRowsForBucketIdRange(2, numBuckets) + .forEach( + rowData -> { + int expectedBucketId = + TestBucketPartitionerUtil.computeBucketId( + numBuckets, rowData.getString(1).toString()); + Integer key = keySelector.getKey(rowData); + assertThat(key).isEqualTo(expectedBucketId); + }); + } + + @Test + public void testKeySelectorMultipleBucketsFail() { + PartitionSpec partitionSpec = TableSchemaType.TWO_BUCKETS.getPartitionSpec(1); + + assertThatExceptionOfType(RuntimeException.class) + .isThrownBy( + () -> + new BucketPartitionKeySelector( + partitionSpec, SimpleDataUtil.SCHEMA, SimpleDataUtil.ROW_TYPE)) + .withMessage(BucketPartitionerUtil.BAD_NUMBER_OF_BUCKETS_ERROR_MESSAGE, 2); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitioner.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitioner.java new file mode 100644 index 000000000000..59bdba578ebb --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitioner.java @@ -0,0 +1,108 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink; + +import static org.apache.iceberg.flink.sink.BucketPartitioner.BUCKET_GREATER_THAN_UPPER_BOUND_MESSAGE; +import static org.apache.iceberg.flink.sink.BucketPartitioner.BUCKET_LESS_THAN_LOWER_BOUND_MESSAGE; +import static org.apache.iceberg.flink.sink.BucketPartitioner.BUCKET_NULL_MESSAGE; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatExceptionOfType; + +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.flink.sink.TestBucketPartitionerUtil.TableSchemaType; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.CsvSource; + +public class TestBucketPartitioner { + + static final int DEFAULT_NUM_BUCKETS = 60; + + @ParameterizedTest + @CsvSource({"ONE_BUCKET,50", "IDENTITY_AND_BUCKET,50", "ONE_BUCKET,60", "IDENTITY_AND_BUCKET,60"}) + public void testPartitioningParallelismGreaterThanBuckets( + String schemaTypeStr, String numBucketsStr) { + int numPartitions = 500; + TableSchemaType tableSchemaType = TableSchemaType.valueOf(schemaTypeStr); + int numBuckets = Integer.parseInt(numBucketsStr); + PartitionSpec partitionSpec = tableSchemaType.getPartitionSpec(numBuckets); + BucketPartitioner bucketPartitioner = new BucketPartitioner(partitionSpec); + + int bucketId = 0; + for (int expectedIndex = 0; expectedIndex < numPartitions; expectedIndex++) { + int actualPartitionIndex = bucketPartitioner.partition(bucketId, numPartitions); + assertThat(actualPartitionIndex).isEqualTo(expectedIndex); + bucketId++; + if (bucketId == numBuckets) { + bucketId = 0; + } + } + } + + @ParameterizedTest + @CsvSource({"ONE_BUCKET,50", "IDENTITY_AND_BUCKET,50", "ONE_BUCKET,60", "IDENTITY_AND_BUCKET,60"}) + public void testPartitioningParallelismEqualLessThanBuckets( + String schemaTypeStr, String numBucketsStr) { + int numPartitions = 30; + TableSchemaType tableSchemaType = TableSchemaType.valueOf(schemaTypeStr); + int numBuckets = Integer.parseInt(numBucketsStr); + PartitionSpec partitionSpec = tableSchemaType.getPartitionSpec(numBuckets); + BucketPartitioner bucketPartitioner = new BucketPartitioner(partitionSpec); + + for (int bucketId = 0; bucketId < numBuckets; bucketId++) { + int actualPartitionIndex = bucketPartitioner.partition(bucketId, numPartitions); + assertThat(actualPartitionIndex).isEqualTo(bucketId % numPartitions); + } + } + + @Test + public void testPartitionerBucketIdNullFail() { + PartitionSpec partitionSpec = TableSchemaType.ONE_BUCKET.getPartitionSpec(DEFAULT_NUM_BUCKETS); + BucketPartitioner bucketPartitioner = new BucketPartitioner(partitionSpec); + + assertThatExceptionOfType(RuntimeException.class) + .isThrownBy(() -> bucketPartitioner.partition(null, DEFAULT_NUM_BUCKETS)) + .withMessage(BUCKET_NULL_MESSAGE); + } + + @Test + public void testPartitionerMultipleBucketsFail() { + PartitionSpec partitionSpec = TableSchemaType.TWO_BUCKETS.getPartitionSpec(DEFAULT_NUM_BUCKETS); + + assertThatExceptionOfType(RuntimeException.class) + .isThrownBy(() -> new BucketPartitioner(partitionSpec)) + .withMessage(BucketPartitionerUtil.BAD_NUMBER_OF_BUCKETS_ERROR_MESSAGE, 2); + } + + @Test + public void testPartitionerBucketIdOutOfRangeFail() { + PartitionSpec partitionSpec = TableSchemaType.ONE_BUCKET.getPartitionSpec(DEFAULT_NUM_BUCKETS); + BucketPartitioner bucketPartitioner = new BucketPartitioner(partitionSpec); + + int negativeBucketId = -1; + assertThatExceptionOfType(IllegalArgumentException.class) 
+ .isThrownBy(() -> bucketPartitioner.partition(negativeBucketId, 1)) + .withMessage(BUCKET_LESS_THAN_LOWER_BOUND_MESSAGE, negativeBucketId); + + int tooBigBucketId = DEFAULT_NUM_BUCKETS; + assertThatExceptionOfType(IllegalArgumentException.class) + .isThrownBy(() -> bucketPartitioner.partition(tooBigBucketId, 1)) + .withMessage(BUCKET_GREATER_THAN_UPPER_BOUND_MESSAGE, tooBigBucketId, DEFAULT_NUM_BUCKETS); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionerFlinkIcebergSink.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionerFlinkIcebergSink.java new file mode 100644 index 000000000000..ba0ea867ffb7 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionerFlinkIcebergSink.java @@ -0,0 +1,227 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink; + +import static org.apache.iceberg.flink.MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG; +import static org.apache.iceberg.flink.TestFixtures.DATABASE; +import static org.apache.iceberg.flink.TestFixtures.TABLE_IDENTIFIER; +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.IOException; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.java.typeutils.RowTypeInfo; +import org.apache.flink.runtime.testutils.MiniClusterResourceConfiguration; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.util.DataFormatConverters; +import org.apache.flink.test.junit5.MiniClusterExtension; +import org.apache.flink.types.Row; +import org.apache.iceberg.DistributionMode; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.FileScanTask; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.flink.HadoopCatalogExtension; +import org.apache.iceberg.flink.SimpleDataUtil; +import org.apache.iceberg.flink.TableLoader; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.flink.sink.TestBucketPartitionerUtil.TableSchemaType; +import org.apache.iceberg.flink.source.BoundedTestSource; +import org.apache.iceberg.flink.util.FlinkCompatibilityUtil; +import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; 
+import org.junit.jupiter.api.extension.RegisterExtension; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.EnumSource; + +public class TestBucketPartitionerFlinkIcebergSink { + + private static final int NUMBER_TASK_MANAGERS = 1; + private static final int SLOTS_PER_TASK_MANAGER = 8; + + @RegisterExtension + private static final MiniClusterExtension MINI_CLUSTER_EXTENSION = + new MiniClusterExtension( + new MiniClusterResourceConfiguration.Builder() + .setNumberTaskManagers(NUMBER_TASK_MANAGERS) + .setNumberSlotsPerTaskManager(SLOTS_PER_TASK_MANAGER) + .setConfiguration(DISABLE_CLASSLOADER_CHECK_CONFIG) + .build()); + + @RegisterExtension + private static final HadoopCatalogExtension CATALOG_EXTENSION = + new HadoopCatalogExtension(DATABASE, TestFixtures.TABLE); + + private static final TypeInformation ROW_TYPE_INFO = + new RowTypeInfo(SimpleDataUtil.FLINK_SCHEMA.getFieldTypes()); + + // Parallelism = 8 (parallelism > numBuckets) throughout the test suite + private final int parallelism = NUMBER_TASK_MANAGERS * SLOTS_PER_TASK_MANAGER; + private final FileFormat format = FileFormat.PARQUET; + private final int numBuckets = 4; + + private Table table; + private StreamExecutionEnvironment env; + private TableLoader tableLoader; + + private void setupEnvironment(TableSchemaType tableSchemaType) { + PartitionSpec partitionSpec = tableSchemaType.getPartitionSpec(numBuckets); + table = + CATALOG_EXTENSION + .catalog() + .createTable( + TABLE_IDENTIFIER, + SimpleDataUtil.SCHEMA, + partitionSpec, + ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name())); + env = + StreamExecutionEnvironment.getExecutionEnvironment(DISABLE_CLASSLOADER_CHECK_CONFIG) + .enableCheckpointing(100) + .setParallelism(parallelism) + .setMaxParallelism(parallelism * 2); + tableLoader = CATALOG_EXTENSION.tableLoader(); + } + + private void appendRowsToTable(List allRows) throws Exception { + DataFormatConverters.RowConverter converter = + new DataFormatConverters.RowConverter(SimpleDataUtil.FLINK_SCHEMA.getFieldDataTypes()); + + DataStream dataStream = + env.addSource( + new BoundedTestSource<>( + allRows.stream().map(converter::toExternal).toArray(Row[]::new)), + ROW_TYPE_INFO) + .map(converter::toInternal, FlinkCompatibilityUtil.toTypeInfo(SimpleDataUtil.ROW_TYPE)) + .partitionCustom( + new BucketPartitioner(table.spec()), + new BucketPartitionKeySelector( + table.spec(), + table.schema(), + FlinkSink.toFlinkRowType(table.schema(), SimpleDataUtil.FLINK_SCHEMA))); + + FlinkSink.forRowData(dataStream) + .table(table) + .tableLoader(tableLoader) + .writeParallelism(parallelism) + .distributionMode(DistributionMode.NONE) + .append(); + + env.execute("Test Iceberg DataStream"); + + SimpleDataUtil.assertTableRows(table, allRows); + } + + @ParameterizedTest + @EnumSource( + value = TableSchemaType.class, + names = {"ONE_BUCKET", "IDENTITY_AND_BUCKET"}) + public void testSendRecordsToAllBucketsEvenly(TableSchemaType tableSchemaType) throws Exception { + setupEnvironment(tableSchemaType); + List rows = generateTestDataRows(); + + appendRowsToTable(rows); + TableTestStats stats = extractPartitionResults(tableSchemaType); + + assertThat(stats.totalRowCount).isEqualTo(rows.size()); + // All 4 buckets should've been written to + assertThat(stats.writersPerBucket.size()).isEqualTo(numBuckets); + assertThat(stats.numFilesPerBucket.size()).isEqualTo(numBuckets); + // Writer expectation (2 writers per bucket): + // - Bucket0 -> Writers [0, 4] + // - Bucket1 -> Writers [1, 5] + // - 
Bucket2 -> Writers [2, 6] + // - Bucket3 -> Writers [3, 7] + for (int i = 0, j = numBuckets; i < numBuckets; i++, j++) { + assertThat(stats.writersPerBucket.get(i)).hasSameElementsAs(Arrays.asList(i, j)); + // 2 files per bucket (one file is created by each writer) + assertThat(stats.numFilesPerBucket.get(i)).isEqualTo(2); + // 2 rows per file (total of 16 rows across 8 files) + assertThat(stats.rowsPerWriter.get(i)).isEqualTo(2); + } + } + + /** + * Generating 16 rows to be sent uniformly to all writers (round-robin across 8 writers -> 4 + * buckets) + */ + private List<RowData> generateTestDataRows() { + int totalNumRows = parallelism * 2; + int numRowsPerBucket = totalNumRows / numBuckets; + return TestBucketPartitionerUtil.generateRowsForBucketIdRange(numRowsPerBucket, numBuckets); + } + + private TableTestStats extractPartitionResults(TableSchemaType tableSchemaType) + throws IOException { + int totalRecordCount = 0; + Map<Integer, List<Integer>> writersPerBucket = Maps.newHashMap(); // <BucketId, List<WriterId>> + Map<Integer, Integer> filesPerBucket = Maps.newHashMap(); // <BucketId, NumFiles> + Map<Integer, Long> rowsPerWriter = Maps.newHashMap(); // <WriterId, NumRecords> + + try (CloseableIterable<FileScanTask> fileScanTasks = table.newScan().planFiles()) { + for (FileScanTask scanTask : fileScanTasks) { + long recordCountInFile = scanTask.file().recordCount(); + + String[] splitFilePath = scanTask.file().path().toString().split("/"); + // Filename example: 00007-0-a7d3a29a-33e9-4740-88f4-0f494397d60c-00001.parquet + // Writer ID: .......^^^^^ + String filename = splitFilePath[splitFilePath.length - 1]; + int writerId = Integer.parseInt(filename.split("-")[0]); + + totalRecordCount += recordCountInFile; + int bucketId = + scanTask + .file() + .partition() + .get(tableSchemaType.bucketPartitionColumnPosition(), Integer.class); + writersPerBucket.computeIfAbsent(bucketId, k -> Lists.newArrayList()); + writersPerBucket.get(bucketId).add(writerId); + filesPerBucket.put(bucketId, filesPerBucket.getOrDefault(bucketId, 0) + 1); + rowsPerWriter.put(writerId, rowsPerWriter.getOrDefault(writerId, 0L) + recordCountInFile); + } + } + + return new TableTestStats(totalRecordCount, writersPerBucket, filesPerBucket, rowsPerWriter); + } + + /** DTO to hold Test Stats */ + private static class TableTestStats { + final int totalRowCount; + final Map<Integer, List<Integer>> writersPerBucket; + final Map<Integer, Integer> numFilesPerBucket; + final Map<Integer, Long> rowsPerWriter; + + TableTestStats( + int totalRecordCount, + Map<Integer, List<Integer>> writersPerBucket, + Map<Integer, Integer> numFilesPerBucket, + Map<Integer, Long> rowsPerWriter) { + this.totalRowCount = totalRecordCount; + this.writersPerBucket = writersPerBucket; + this.numFilesPerBucket = numFilesPerBucket; + this.rowsPerWriter = rowsPerWriter; + } + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionerUtil.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionerUtil.java new file mode 100644 index 000000000000..e1309bfac6d5 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionerUtil.java @@ -0,0 +1,126 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink; + +import java.util.List; +import java.util.UUID; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.flink.SimpleDataUtil; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.util.BucketUtil; + +final class TestBucketPartitionerUtil { + + enum TableSchemaType { + ONE_BUCKET { + @Override + public int bucketPartitionColumnPosition() { + return 0; + } + + @Override + public PartitionSpec getPartitionSpec(int numBuckets) { + return PartitionSpec.builderFor(SimpleDataUtil.SCHEMA).bucket("data", numBuckets).build(); + } + }, + IDENTITY_AND_BUCKET { + @Override + public int bucketPartitionColumnPosition() { + return 1; + } + + @Override + public PartitionSpec getPartitionSpec(int numBuckets) { + return PartitionSpec.builderFor(SimpleDataUtil.SCHEMA) + .identity("id") + .bucket("data", numBuckets) + .build(); + } + }, + TWO_BUCKETS { + @Override + public int bucketPartitionColumnPosition() { + return 1; + } + + @Override + public PartitionSpec getPartitionSpec(int numBuckets) { + return PartitionSpec.builderFor(SimpleDataUtil.SCHEMA) + .bucket("id", numBuckets) + .bucket("data", numBuckets) + .build(); + } + }; + + public abstract int bucketPartitionColumnPosition(); + + public abstract PartitionSpec getPartitionSpec(int numBuckets); + } + + private TestBucketPartitionerUtil() {} + + /** + * Utility method to generate rows whose values will "hash" to a range of bucketIds (from 0 to + * numBuckets - 1) + * + * @param numRowsPerBucket how many different rows should be generated per bucket + * @param numBuckets max number of buckets to consider + * @return the list of rows whose data "hashes" to the desired bucketId + */ + static List generateRowsForBucketIdRange(int numRowsPerBucket, int numBuckets) { + List rows = Lists.newArrayListWithCapacity(numBuckets * numRowsPerBucket); + // For some of our tests, this order of the generated rows matters + for (int i = 0; i < numRowsPerBucket; i++) { + for (int bucketId = 0; bucketId < numBuckets; bucketId++) { + String value = generateValueForBucketId(bucketId, numBuckets); + rows.add(GenericRowData.of(1, StringData.fromString(value))); + } + } + return rows; + } + + /** + * Utility method to generate a UUID string that will "hash" to a desired bucketId + * + * @param bucketId the desired bucketId + * @return the string data that "hashes" to the desired bucketId + */ + private static String generateValueForBucketId(int bucketId, int numBuckets) { + while (true) { + String uuid = UUID.randomUUID().toString(); + if (computeBucketId(numBuckets, uuid) == bucketId) { + return uuid; + } + } + } + + /** + * Utility that performs the same hashing/bucketing mechanism used by Bucket.java + * + * @param numBuckets max number of buckets to consider + * @param value the string to compute the bucketId from + * @return the computed bucketId + */ + static int computeBucketId(int numBuckets, String value) { + 
return (BucketUtil.hash(value) & Integer.MAX_VALUE) % numBuckets; + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestCachingTableSupplier.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestCachingTableSupplier.java new file mode 100644 index 000000000000..360db658cd2f --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestCachingTableSupplier.java @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +import java.time.Duration; +import java.util.concurrent.TimeUnit; +import org.apache.iceberg.SerializableTable; +import org.apache.iceberg.Table; +import org.apache.iceberg.flink.TableLoader; +import org.awaitility.Awaitility; +import org.junit.jupiter.api.Test; + +public class TestCachingTableSupplier { + + @Test + public void testCheckArguments() { + SerializableTable initialTable = mock(SerializableTable.class); + + Table loadedTable = mock(Table.class); + TableLoader tableLoader = mock(TableLoader.class); + when(tableLoader.loadTable()).thenReturn(loadedTable); + + new CachingTableSupplier(initialTable, tableLoader, Duration.ofMillis(100)); + + assertThatThrownBy(() -> new CachingTableSupplier(initialTable, tableLoader, null)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("tableRefreshInterval cannot be null"); + assertThatThrownBy(() -> new CachingTableSupplier(null, tableLoader, Duration.ofMillis(100))) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("initialTable cannot be null"); + assertThatThrownBy(() -> new CachingTableSupplier(initialTable, null, Duration.ofMillis(100))) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("tableLoader cannot be null"); + } + + @Test + public void testTableReload() { + SerializableTable initialTable = mock(SerializableTable.class); + + Table loadedTable = mock(Table.class); + TableLoader tableLoader = mock(TableLoader.class); + when(tableLoader.loadTable()).thenReturn(loadedTable); + + CachingTableSupplier cachingTableSupplier = + new CachingTableSupplier(initialTable, tableLoader, Duration.ofMillis(100)); + + // refresh shouldn't do anything as the min reload interval hasn't passed + cachingTableSupplier.refreshTable(); + assertThat(cachingTableSupplier.get()).isEqualTo(initialTable); + + // refresh after waiting past the min reload interval + Awaitility.await() + .atLeast(100, TimeUnit.MILLISECONDS) + .untilAsserted( + () -> { + cachingTableSupplier.refreshTable(); + 
assertThat(cachingTableSupplier.get()).isEqualTo(loadedTable); + }); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestCompressionSettings.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestCompressionSettings.java new file mode 100644 index 000000000000..8faae1b05a4e --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestCompressionSettings.java @@ -0,0 +1,257 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Map; +import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness; +import org.apache.flink.table.api.TableSchema; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.Parameter; +import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.common.DynFields; +import org.apache.iceberg.flink.FlinkWriteConf; +import org.apache.iceberg.flink.FlinkWriteOptions; +import org.apache.iceberg.flink.SimpleDataUtil; +import org.apache.iceberg.io.BaseTaskWriter; +import org.apache.iceberg.io.TaskWriter; +import org.apache.iceberg.io.WriteResult; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.extension.ExtendWith; +import org.junit.jupiter.api.io.TempDir; + +@ExtendWith(ParameterizedTestExtension.class) +public class TestCompressionSettings { + @TempDir protected Path temporaryFolder; + + private Table table; + + @Parameter(index = 0) + private Map initProperties; + + @Parameters(name = "tableProperties = {0}") + public static Object[][] parameters() { + return new Object[][] { + new Object[] {ImmutableMap.of()}, + new Object[] { + ImmutableMap.of( + TableProperties.AVRO_COMPRESSION, + "zstd", + TableProperties.AVRO_COMPRESSION_LEVEL, + "3", + TableProperties.PARQUET_COMPRESSION, + "zstd", + TableProperties.PARQUET_COMPRESSION_LEVEL, + "3", + TableProperties.ORC_COMPRESSION, + "zstd", + TableProperties.ORC_COMPRESSION_STRATEGY, + "compression") + } + }; + } + + @BeforeEach + public void before() throws IOException { + File folder = Files.createTempDirectory(temporaryFolder, "junit").toFile(); + table = SimpleDataUtil.createTable(folder.getAbsolutePath(), initProperties, false); + } + + @TestTemplate + public void testCompressionAvro() throws Exception { + // No 
override provided + Map resultProperties = + appenderProperties( + table, + SimpleDataUtil.FLINK_SCHEMA, + ImmutableMap.of(FlinkWriteOptions.WRITE_FORMAT.key(), "AVRO")); + + if (initProperties.get(TableProperties.AVRO_COMPRESSION) == null) { + assertThat(resultProperties) + .containsEntry(TableProperties.AVRO_COMPRESSION, TableProperties.AVRO_COMPRESSION_DEFAULT) + .doesNotContainKey(TableProperties.AVRO_COMPRESSION_LEVEL); + } else { + assertThat(resultProperties) + .containsEntry( + TableProperties.AVRO_COMPRESSION, + initProperties.get(TableProperties.AVRO_COMPRESSION)) + .containsEntry( + TableProperties.AVRO_COMPRESSION_LEVEL, + initProperties.get(TableProperties.AVRO_COMPRESSION_LEVEL)); + } + + // Override compression to snappy and some random level + resultProperties = + appenderProperties( + table, + SimpleDataUtil.FLINK_SCHEMA, + ImmutableMap.of( + FlinkWriteOptions.WRITE_FORMAT.key(), + "AVRO", + FlinkWriteOptions.COMPRESSION_CODEC.key(), + "snappy", + FlinkWriteOptions.COMPRESSION_LEVEL.key(), + "6")); + + assertThat(resultProperties) + .containsEntry(TableProperties.AVRO_COMPRESSION, "snappy") + .containsEntry(TableProperties.AVRO_COMPRESSION_LEVEL, "6"); + } + + @TestTemplate + public void testCompressionParquet() throws Exception { + // No override provided + Map resultProperties = + appenderProperties( + table, + SimpleDataUtil.FLINK_SCHEMA, + ImmutableMap.of(FlinkWriteOptions.WRITE_FORMAT.key(), "PARQUET")); + + if (initProperties.get(TableProperties.PARQUET_COMPRESSION) == null) { + assertThat(resultProperties) + .containsEntry( + TableProperties.PARQUET_COMPRESSION, + TableProperties.PARQUET_COMPRESSION_DEFAULT_SINCE_1_4_0) + .doesNotContainKey(TableProperties.PARQUET_COMPRESSION_LEVEL); + } else { + assertThat(resultProperties) + .containsEntry( + TableProperties.PARQUET_COMPRESSION, + initProperties.get(TableProperties.PARQUET_COMPRESSION)) + .containsEntry( + TableProperties.PARQUET_COMPRESSION_LEVEL, + initProperties.get(TableProperties.PARQUET_COMPRESSION_LEVEL)); + } + + // Override compression to snappy and some random level + resultProperties = + appenderProperties( + table, + SimpleDataUtil.FLINK_SCHEMA, + ImmutableMap.of( + FlinkWriteOptions.WRITE_FORMAT.key(), + "PARQUET", + FlinkWriteOptions.COMPRESSION_CODEC.key(), + "snappy", + FlinkWriteOptions.COMPRESSION_LEVEL.key(), + "6")); + + assertThat(resultProperties) + .containsEntry(TableProperties.PARQUET_COMPRESSION, "snappy") + .containsEntry(TableProperties.PARQUET_COMPRESSION_LEVEL, "6"); + } + + @TestTemplate + public void testCompressionOrc() throws Exception { + // No override provided + Map resultProperties = + appenderProperties( + table, + SimpleDataUtil.FLINK_SCHEMA, + ImmutableMap.of(FlinkWriteOptions.WRITE_FORMAT.key(), "ORC")); + + if (initProperties.get(TableProperties.ORC_COMPRESSION) == null) { + assertThat(resultProperties) + .containsEntry(TableProperties.ORC_COMPRESSION, TableProperties.ORC_COMPRESSION_DEFAULT) + .containsEntry( + TableProperties.ORC_COMPRESSION_STRATEGY, + TableProperties.ORC_COMPRESSION_STRATEGY_DEFAULT); + } else { + assertThat(resultProperties) + .containsEntry( + TableProperties.ORC_COMPRESSION, initProperties.get(TableProperties.ORC_COMPRESSION)) + .containsEntry( + TableProperties.ORC_COMPRESSION_STRATEGY, + initProperties.get(TableProperties.ORC_COMPRESSION_STRATEGY)); + } + + // Override compression to snappy and a different strategy + resultProperties = + appenderProperties( + table, + SimpleDataUtil.FLINK_SCHEMA, + ImmutableMap.of( + 
FlinkWriteOptions.WRITE_FORMAT.key(), + "ORC", + FlinkWriteOptions.COMPRESSION_CODEC.key(), + "snappy", + FlinkWriteOptions.COMPRESSION_STRATEGY.key(), + "speed")); + + assertThat(resultProperties) + .containsEntry(TableProperties.ORC_COMPRESSION, "snappy") + .containsEntry(TableProperties.ORC_COMPRESSION_STRATEGY, "speed"); + } + + private static OneInputStreamOperatorTestHarness<RowData, WriteResult> createIcebergStreamWriter( + Table icebergTable, TableSchema flinkSchema, Map<String, String> override) throws Exception { + RowType flinkRowType = FlinkSink.toFlinkRowType(icebergTable.schema(), flinkSchema); + FlinkWriteConf flinkWriteConfig = + new FlinkWriteConf( + icebergTable, override, new org.apache.flink.configuration.Configuration()); + + IcebergStreamWriter<RowData> streamWriter = + FlinkSink.createStreamWriter(() -> icebergTable, flinkWriteConfig, flinkRowType, null); + OneInputStreamOperatorTestHarness<RowData, WriteResult> harness = + new OneInputStreamOperatorTestHarness<>(streamWriter, 1, 1, 0); + + harness.setup(); + harness.open(); + + return harness; + } + + private static Map<String, String> appenderProperties( + Table table, TableSchema schema, Map<String, String> override) throws Exception { + try (OneInputStreamOperatorTestHarness<RowData, WriteResult> testHarness = + createIcebergStreamWriter(table, schema, override)) { + testHarness.processElement(SimpleDataUtil.createRowData(1, "hello"), 1); + + testHarness.prepareSnapshotPreBarrier(1L); + DynFields.BoundField operatorField = + DynFields.builder() + .hiddenImpl(testHarness.getOperatorFactory().getClass(), "operator") + .build(testHarness.getOperatorFactory()); + DynFields.BoundField writerField = + DynFields.builder() + .hiddenImpl(IcebergStreamWriter.class, "writer") + .build(operatorField.get()); + DynFields.BoundField appenderField = + DynFields.builder() + .hiddenImpl(BaseTaskWriter.class, "appenderFactory") + .build(writerField.get()); + DynFields.BoundField<Map<String, String>> propsField = + DynFields.builder() + .hiddenImpl(FlinkAppenderFactory.class, "props") + .build(appenderField.get()); + return propsField.get(); + } + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestDeltaTaskWriter.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestDeltaTaskWriter.java new file mode 100644 index 000000000000..21f3ee2c655a --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestDeltaTaskWriter.java @@ -0,0 +1,429 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License.
+ */ +package org.apache.iceberg.flink.sink; + +import static org.apache.iceberg.flink.SimpleDataUtil.createDelete; +import static org.apache.iceberg.flink.SimpleDataUtil.createInsert; +import static org.apache.iceberg.flink.SimpleDataUtil.createRecord; +import static org.apache.iceberg.flink.SimpleDataUtil.createUpdateAfter; +import static org.apache.iceberg.flink.SimpleDataUtil.createUpdateBefore; +import static org.apache.iceberg.types.Types.NestedField.required; +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.time.OffsetDateTime; +import java.util.Arrays; +import java.util.List; +import java.util.stream.Collectors; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.TimestampData; +import org.apache.flink.table.runtime.typeutils.RowDataSerializer; +import org.apache.flink.table.types.logical.IntType; +import org.apache.flink.table.types.logical.LocalZonedTimestampType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.types.RowKind; +import org.apache.iceberg.ContentFile; +import org.apache.iceberg.FileContent; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Parameter; +import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.RowDelta; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SerializableTable; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.TestBase; +import org.apache.iceberg.TestTables; +import org.apache.iceberg.data.GenericRecord; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.SimpleDataUtil; +import org.apache.iceberg.io.TaskWriter; +import org.apache.iceberg.io.WriteResult; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Sets; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.util.StructLikeSet; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.extension.ExtendWith; + +@ExtendWith(ParameterizedTestExtension.class) +public class TestDeltaTaskWriter extends TestBase { + + @Parameter(index = 1) + private FileFormat format; + + @Parameters(name = "formatVersion = {0}, fileFormat = {1}") + protected static List parameters() { + return Arrays.asList( + new Object[] {2, FileFormat.AVRO}, + new Object[] {2, FileFormat.ORC}, + new Object[] {2, FileFormat.PARQUET}); + } + + @Override + @BeforeEach + public void setupTable() throws IOException { + this.tableDir = Files.createTempDirectory(temp, "junit").toFile(); + assertThat(tableDir.delete()).isTrue(); // created by table create + + this.metadataDir = new File(tableDir, "metadata"); + } + + private int idFieldId() { + return table.schema().findField("id").fieldId(); + } + + private int dataFieldId() { + return table.schema().findField("data").fieldId(); + } + + private void testCdcEvents(boolean partitioned) throws IOException { + List equalityFieldIds = Lists.newArrayList(idFieldId()); + TaskWriterFactory taskWriterFactory = createTaskWriterFactory(equalityFieldIds); + 
taskWriterFactory.initialize(1, 1); + + // Start the 1st transaction. + TaskWriter<RowData> writer = taskWriterFactory.create(); + + writer.write(createInsert(1, "aaa")); + writer.write(createInsert(2, "bbb")); + writer.write(createInsert(3, "ccc")); + + // Update <2, 'bbb'> to <2, 'ddd'> + writer.write(createUpdateBefore(2, "bbb")); // 1 pos-delete and 1 eq-delete. + writer.write(createUpdateAfter(2, "ddd")); + + // Update <1, 'aaa'> to <1, 'eee'> + writer.write(createUpdateBefore(1, "aaa")); // 1 pos-delete and 1 eq-delete. + writer.write(createUpdateAfter(1, "eee")); + + // Insert <4, 'fff'> + writer.write(createInsert(4, "fff")); + // Insert <5, 'ggg'> + writer.write(createInsert(5, "ggg")); + + // Delete <3, 'ccc'> + writer.write(createDelete(3, "ccc")); // 1 pos-delete and 1 eq-delete. + + WriteResult result = writer.complete(); + assertThat(result.dataFiles()).hasSize(partitioned ? 7 : 1); + assertThat(result.deleteFiles()).hasSize(partitioned ? 3 : 1); + commitTransaction(result); + + assertThat(actualRowSet("*")) + .isEqualTo( + expectedRowSet( + createRecord(1, "eee"), + createRecord(2, "ddd"), + createRecord(4, "fff"), + createRecord(5, "ggg"))); + + // Start the 2nd transaction. + writer = taskWriterFactory.create(); + + // Update <2, 'ddd'> to <6, 'hhh'> - (Update both key and value) + writer.write(createUpdateBefore(2, "ddd")); // 1 eq-delete + writer.write(createUpdateAfter(6, "hhh")); + + // Update <5, 'ggg'> to <5, 'iii'> + writer.write(createUpdateBefore(5, "ggg")); // 1 eq-delete + writer.write(createUpdateAfter(5, "iii")); + + // Delete <4, 'fff'> + writer.write(createDelete(4, "fff")); // 1 eq-delete. + + result = writer.complete(); + assertThat(result.dataFiles()).hasSize(partitioned ? 2 : 1); + assertThat(result.deleteFiles()).hasSize(partitioned ? 3 : 1); + commitTransaction(result); + + assertThat(actualRowSet("*")) + .isEqualTo( + expectedRowSet(createRecord(1, "eee"), createRecord(5, "iii"), createRecord(6, "hhh"))); + } + + @TestTemplate + public void testUnpartitioned() throws IOException { + createAndInitTable(false); + testCdcEvents(false); + } + + @TestTemplate + public void testPartitioned() throws IOException { + createAndInitTable(true); + testCdcEvents(true); + } + + private void testWritePureEqDeletes(boolean partitioned) throws IOException { + createAndInitTable(partitioned); + List<Integer> equalityFieldIds = Lists.newArrayList(idFieldId()); + TaskWriterFactory<RowData> taskWriterFactory = createTaskWriterFactory(equalityFieldIds); + taskWriterFactory.initialize(1, 1); + + TaskWriter<RowData> writer = taskWriterFactory.create(); + writer.write(createDelete(1, "aaa")); + writer.write(createDelete(2, "bbb")); + writer.write(createDelete(3, "ccc")); + + WriteResult result = writer.complete(); + assertThat(result.dataFiles()).isEmpty(); + assertThat(result.deleteFiles()).hasSize(partitioned ? 3 : 1); + commitTransaction(result); + + assertThat(actualRowSet("*")).isEqualTo(expectedRowSet()); + } + + @TestTemplate + public void testUnpartitionedPureEqDeletes() throws IOException { + testWritePureEqDeletes(false); + } + + @TestTemplate + public void testPartitionedPureEqDeletes() throws IOException { + testWritePureEqDeletes(true); + } + + private void testAbort(boolean partitioned) throws IOException { + createAndInitTable(partitioned); + List<Integer> equalityFieldIds = Lists.newArrayList(idFieldId()); + TaskWriterFactory<RowData> taskWriterFactory = createTaskWriterFactory(equalityFieldIds); + taskWriterFactory.initialize(1, 1); + + TaskWriter<RowData> writer = taskWriterFactory.create(); + for (int i = 0; i < 8_000; i += 2) { + writer.write(createUpdateBefore(i + 1, "aaa")); + writer.write(createUpdateAfter(i + 1, "aaa")); + + writer.write(createUpdateBefore(i + 2, "bbb")); + writer.write(createUpdateAfter(i + 2, "bbb")); + } + + // Assert the current data/delete file count. + List<Path> files = + Files.walk(Paths.get(tableDir.getPath(), "data")) + .filter(p -> p.toFile().isFile()) + .filter(p -> !p.toString().endsWith(".crc")) + .collect(Collectors.toList()); + assertThat(files).hasSize(partitioned ? 4 : 2); + + writer.abort(); + for (Path file : files) { + assertThat(file).doesNotExist(); + } + } + + @TestTemplate + public void testUnpartitionedAbort() throws IOException { + testAbort(false); + } + + @TestTemplate + public void testPartitionedAbort() throws IOException { + testAbort(true); + } + + @TestTemplate + public void testPartitionedTableWithDataAsKey() throws IOException { + createAndInitTable(true); + List<Integer> equalityFieldIds = Lists.newArrayList(dataFieldId()); + TaskWriterFactory<RowData> taskWriterFactory = createTaskWriterFactory(equalityFieldIds); + taskWriterFactory.initialize(1, 1); + + // Start the 1st transaction. + TaskWriter<RowData> writer = taskWriterFactory.create(); + writer.write(createInsert(1, "aaa")); + writer.write(createInsert(2, "aaa")); + writer.write(createInsert(3, "bbb")); + writer.write(createInsert(4, "ccc")); + + WriteResult result = writer.complete(); + assertThat(result.dataFiles()).hasSize(3); + assertThat(result.deleteFiles()).hasSize(1); + commitTransaction(result); + + assertThat(actualRowSet("*")) + .isEqualTo( + expectedRowSet(createRecord(2, "aaa"), createRecord(3, "bbb"), createRecord(4, "ccc"))); + + // Start the 2nd transaction. + writer = taskWriterFactory.create(); + writer.write(createInsert(5, "aaa")); + writer.write(createInsert(6, "bbb")); + writer.write(createDelete(7, "ccc")); // 1 eq-delete. + + result = writer.complete(); + assertThat(result.dataFiles()).hasSize(2); + assertThat(result.deleteFiles()).hasSize(1); + commitTransaction(result); + + assertThat(actualRowSet("*")) + .isEqualTo( + expectedRowSet( + createRecord(2, "aaa"), + createRecord(5, "aaa"), + createRecord(3, "bbb"), + createRecord(6, "bbb"))); + } + + @TestTemplate + public void testPartitionedTableWithDataAndIdAsKey() throws IOException { + createAndInitTable(true); + List<Integer> equalityFieldIds = Lists.newArrayList(dataFieldId(), idFieldId()); + TaskWriterFactory<RowData> taskWriterFactory = createTaskWriterFactory(equalityFieldIds); + taskWriterFactory.initialize(1, 1); + + TaskWriter<RowData> writer = taskWriterFactory.create(); + writer.write(createInsert(1, "aaa")); + writer.write(createInsert(2, "aaa")); + + writer.write(createDelete(2, "aaa")); // 1 pos-delete.
+ + WriteResult result = writer.complete(); + assertThat(result.dataFiles()).hasSize(1); + assertThat(result.deleteFiles()).hasSize(1); + assertThat(result.deleteFiles()[0].content()).isEqualTo(FileContent.POSITION_DELETES); + commitTransaction(result); + + assertThat(actualRowSet("*")).isEqualTo(expectedRowSet(createRecord(1, "aaa"))); + } + + @TestTemplate + public void testEqualityColumnOnCustomPrecisionTSColumn() throws IOException { + Schema tableSchema = + new Schema( + required(3, "id", Types.IntegerType.get()), + required(4, "ts", Types.TimestampType.withZone())); + RowType flinkType = + new RowType( + false, + ImmutableList.of( + new RowType.RowField("id", new IntType()), + new RowType.RowField("ts", new LocalZonedTimestampType(3)))); + + this.table = create(tableSchema, PartitionSpec.unpartitioned()); + initTable(table); + + List equalityIds = ImmutableList.of(table.schema().findField("ts").fieldId()); + TaskWriterFactory taskWriterFactory = createTaskWriterFactory(flinkType, equalityIds); + taskWriterFactory.initialize(1, 1); + + TaskWriter writer = taskWriterFactory.create(); + RowDataSerializer serializer = new RowDataSerializer(flinkType); + OffsetDateTime start = OffsetDateTime.now(); + writer.write( + serializer.toBinaryRow( + GenericRowData.ofKind( + RowKind.INSERT, 1, TimestampData.fromInstant(start.toInstant())))); + writer.write( + serializer.toBinaryRow( + GenericRowData.ofKind( + RowKind.INSERT, 2, TimestampData.fromInstant(start.plusSeconds(1).toInstant())))); + writer.write( + serializer.toBinaryRow( + GenericRowData.ofKind( + RowKind.DELETE, 2, TimestampData.fromInstant(start.plusSeconds(1).toInstant())))); + + WriteResult result = writer.complete(); + // One data file + assertThat(result.dataFiles()).hasSize(1); + // One eq delete file + one pos delete file + assertThat(result.deleteFiles()).hasSize(2); + assertThat( + Arrays.stream(result.deleteFiles()) + .map(ContentFile::content) + .collect(Collectors.toSet())) + .isEqualTo(Sets.newHashSet(FileContent.POSITION_DELETES, FileContent.EQUALITY_DELETES)); + commitTransaction(result); + + Record expectedRecord = GenericRecord.create(tableSchema); + expectedRecord.setField("id", 1); + int cutPrecisionNano = start.getNano() / 1000000 * 1000000; + expectedRecord.setField("ts", start.withNano(cutPrecisionNano)); + + assertThat(actualRowSet("*")).isEqualTo(expectedRowSet(expectedRecord)); + } + + private void commitTransaction(WriteResult result) { + RowDelta rowDelta = table.newRowDelta(); + Arrays.stream(result.dataFiles()).forEach(rowDelta::addRows); + Arrays.stream(result.deleteFiles()).forEach(rowDelta::addDeletes); + rowDelta + .validateDeletedFiles() + .validateDataFilesExist(Lists.newArrayList(result.referencedDataFiles())) + .commit(); + } + + private StructLikeSet expectedRowSet(Record... records) { + return SimpleDataUtil.expectedRowSet(table, records); + } + + private StructLikeSet actualRowSet(String... 
columns) throws IOException { + return SimpleDataUtil.actualRowSet(table, columns); + } + + private TaskWriterFactory createTaskWriterFactory(List equalityFieldIds) { + return new RowDataTaskWriterFactory( + SerializableTable.copyOf(table), + FlinkSchemaUtil.convert(table.schema()), + 128 * 1024 * 1024, + format, + table.properties(), + equalityFieldIds, + false); + } + + private TaskWriterFactory createTaskWriterFactory( + RowType flinkType, List equalityFieldIds) { + return new RowDataTaskWriterFactory( + SerializableTable.copyOf(table), + flinkType, + 128 * 1024 * 1024, + format, + table.properties(), + equalityFieldIds, + true); + } + + private void createAndInitTable(boolean partitioned) { + if (partitioned) { + this.table = create(SCHEMA, PartitionSpec.builderFor(SCHEMA).identity("data").build()); + } else { + this.table = create(SCHEMA, PartitionSpec.unpartitioned()); + } + + initTable(table); + } + + private void initTable(TestTables.TestTable testTable) { + testTable + .updateProperties() + .set(TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES, String.valueOf(8 * 1024)) + .defaultFormat(format) + .commit(); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkAppenderFactory.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkAppenderFactory.java new file mode 100644 index 000000000000..dd89f43483b0 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkAppenderFactory.java @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink; + +import java.util.List; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.Schema; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.RowDataWrapper; +import org.apache.iceberg.flink.SimpleDataUtil; +import org.apache.iceberg.io.FileAppenderFactory; +import org.apache.iceberg.io.TestAppenderFactory; +import org.apache.iceberg.util.ArrayUtil; +import org.apache.iceberg.util.StructLikeSet; + +public class TestFlinkAppenderFactory extends TestAppenderFactory { + + private final RowType rowType = FlinkSchemaUtil.convert(SCHEMA); + + @Override + protected FileAppenderFactory createAppenderFactory( + List equalityFieldIds, Schema eqDeleteSchema, Schema posDeleteRowSchema) { + return new FlinkAppenderFactory( + table, + table.schema(), + rowType, + table.properties(), + table.spec(), + ArrayUtil.toIntArray(equalityFieldIds), + eqDeleteSchema, + posDeleteRowSchema); + } + + @Override + protected RowData createRow(Integer id, String data) { + return SimpleDataUtil.createRowData(id, data); + } + + @Override + protected StructLikeSet expectedRowSet(Iterable rows) { + StructLikeSet set = StructLikeSet.create(table.schema().asStruct()); + for (RowData row : rows) { + RowDataWrapper wrapper = new RowDataWrapper(rowType, table.schema().asStruct()); + set.add(wrapper.wrap(row)); + } + return set; + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkFileWriterFactory.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkFileWriterFactory.java new file mode 100644 index 000000000000..414ee40d1357 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkFileWriterFactory.java @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink; + +import java.util.List; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.Schema; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.RowDataWrapper; +import org.apache.iceberg.flink.SimpleDataUtil; +import org.apache.iceberg.io.FileWriterFactory; +import org.apache.iceberg.io.TestFileWriterFactory; +import org.apache.iceberg.util.ArrayUtil; +import org.apache.iceberg.util.StructLikeSet; + +public class TestFlinkFileWriterFactory extends TestFileWriterFactory { + + @Override + protected FileWriterFactory newWriterFactory( + Schema dataSchema, + List equalityFieldIds, + Schema equalityDeleteRowSchema, + Schema positionDeleteRowSchema) { + return FlinkFileWriterFactory.builderFor(table) + .dataSchema(table.schema()) + .dataFileFormat(format()) + .deleteFileFormat(format()) + .equalityFieldIds(ArrayUtil.toIntArray(equalityFieldIds)) + .equalityDeleteRowSchema(equalityDeleteRowSchema) + .positionDeleteRowSchema(positionDeleteRowSchema) + .build(); + } + + @Override + protected RowData toRow(Integer id, String data) { + return SimpleDataUtil.createRowData(id, data); + } + + @Override + protected StructLikeSet toSet(Iterable rows) { + StructLikeSet set = StructLikeSet.create(table.schema().asStruct()); + RowType flinkType = FlinkSchemaUtil.convert(table.schema()); + for (RowData row : rows) { + RowDataWrapper wrapper = new RowDataWrapper(flinkType, table.schema().asStruct()); + set.add(wrapper.wrap(row)); + } + return set; + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSink.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSink.java new file mode 100644 index 000000000000..b778037c559c --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSink.java @@ -0,0 +1,125 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink; + +import java.io.IOException; +import java.util.List; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.data.RowData; +import org.apache.flink.types.Row; +import org.apache.iceberg.DistributionMode; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Parameter; +import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.flink.MiniFlinkClusterExtension; +import org.apache.iceberg.flink.SimpleDataUtil; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.flink.util.FlinkCompatibilityUtil; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.extension.ExtendWith; + +@ExtendWith(ParameterizedTestExtension.class) +public class TestFlinkIcebergSink extends TestFlinkIcebergSinkBase { + @Parameter(index = 0) + private FileFormat format; + + @Parameter(index = 1) + private int parallelism; + + @Parameter(index = 2) + private boolean partitioned; + + @Parameters(name = "format={0}, parallelism = {1}, partitioned = {2}") + public static Object[][] parameters() { + return new Object[][] { + {FileFormat.AVRO, 1, true}, + {FileFormat.AVRO, 1, false}, + {FileFormat.AVRO, 2, true}, + {FileFormat.AVRO, 2, false}, + {FileFormat.ORC, 1, true}, + {FileFormat.ORC, 1, false}, + {FileFormat.ORC, 2, true}, + {FileFormat.ORC, 2, false}, + {FileFormat.PARQUET, 1, true}, + {FileFormat.PARQUET, 1, false}, + {FileFormat.PARQUET, 2, true}, + {FileFormat.PARQUET, 2, false} + }; + } + + @BeforeEach + public void before() throws IOException { + this.table = + CATALOG_EXTENSION + .catalog() + .createTable( + TestFixtures.TABLE_IDENTIFIER, + SimpleDataUtil.SCHEMA, + partitioned + ? PartitionSpec.builderFor(SimpleDataUtil.SCHEMA).identity("data").build() + : PartitionSpec.unpartitioned(), + ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name())); + + this.env = + StreamExecutionEnvironment.getExecutionEnvironment( + MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG) + .enableCheckpointing(100) + .setParallelism(parallelism) + .setMaxParallelism(parallelism); + + this.tableLoader = CATALOG_EXTENSION.tableLoader(); + } + + @TestTemplate + public void testWriteRowData() throws Exception { + List rows = Lists.newArrayList(Row.of(1, "hello"), Row.of(2, "world"), Row.of(3, "foo")); + DataStream dataStream = + env.addSource(createBoundedSource(rows), ROW_TYPE_INFO) + .map(CONVERTER::toInternal, FlinkCompatibilityUtil.toTypeInfo(SimpleDataUtil.ROW_TYPE)); + + FlinkSink.forRowData(dataStream) + .table(table) + .tableLoader(tableLoader) + .writeParallelism(parallelism) + .append(); + + // Execute the program. + env.execute("Test Iceberg DataStream"); + + // Assert the iceberg table's records. 
+ SimpleDataUtil.assertTableRows(table, convertToRowData(rows)); + } + + @TestTemplate + public void testWriteRow() throws Exception { + testWriteRow(parallelism, null, DistributionMode.NONE); + } + + @TestTemplate + public void testWriteRowWithTableSchema() throws Exception { + testWriteRow(parallelism, SimpleDataUtil.FLINK_SCHEMA, DistributionMode.NONE); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkBase.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkBase.java new file mode 100644 index 000000000000..9ce36cc1e8d0 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkBase.java @@ -0,0 +1,111 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink; + +import static org.apache.iceberg.flink.TestFixtures.DATABASE; + +import java.io.IOException; +import java.util.List; +import java.util.stream.Collectors; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.java.typeutils.RowTypeInfo; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.TableSchema; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.util.DataFormatConverters; +import org.apache.flink.test.junit5.MiniClusterExtension; +import org.apache.flink.types.Row; +import org.apache.iceberg.DistributionMode; +import org.apache.iceberg.Table; +import org.apache.iceberg.flink.HadoopCatalogExtension; +import org.apache.iceberg.flink.MiniFlinkClusterExtension; +import org.apache.iceberg.flink.SimpleDataUtil; +import org.apache.iceberg.flink.TableLoader; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.flink.source.BoundedTestSource; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.junit.jupiter.api.extension.RegisterExtension; + +public class TestFlinkIcebergSinkBase { + + @RegisterExtension + public static MiniClusterExtension miniClusterResource = + MiniFlinkClusterExtension.createWithClassloaderCheckDisabled(); + + @RegisterExtension + protected static final HadoopCatalogExtension CATALOG_EXTENSION = + new HadoopCatalogExtension(DATABASE, TestFixtures.TABLE); + + protected static final TypeInformation ROW_TYPE_INFO = + new RowTypeInfo(SimpleDataUtil.FLINK_SCHEMA.getFieldTypes()); + + protected static final DataFormatConverters.RowConverter CONVERTER = + new DataFormatConverters.RowConverter(SimpleDataUtil.FLINK_SCHEMA.getFieldDataTypes()); + + protected TableLoader tableLoader; + 
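+  // Shared fixtures for the sink tests: subclasses create the Iceberg table, the table loader,
+  // and the streaming environment in their @BeforeEach before building a FlinkSink pipeline.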
protected Table table; + protected StreamExecutionEnvironment env; + + protected BoundedTestSource createBoundedSource(List rows) { + return new BoundedTestSource<>(rows.toArray(new Row[0])); + } + + protected List createRows(String prefix) { + return Lists.newArrayList( + Row.of(1, prefix + "aaa"), + Row.of(1, prefix + "bbb"), + Row.of(1, prefix + "ccc"), + Row.of(2, prefix + "aaa"), + Row.of(2, prefix + "bbb"), + Row.of(2, prefix + "ccc"), + Row.of(3, prefix + "aaa"), + Row.of(3, prefix + "bbb"), + Row.of(3, prefix + "ccc")); + } + + protected List convertToRowData(List rows) { + return rows.stream().map(CONVERTER::toInternal).collect(Collectors.toList()); + } + + protected void testWriteRow( + int writerParallelism, TableSchema tableSchema, DistributionMode distributionMode) + throws Exception { + List rows = createRows(""); + DataStream dataStream = env.addSource(createBoundedSource(rows), ROW_TYPE_INFO); + + FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) + .table(table) + .tableLoader(tableLoader) + .tableSchema(tableSchema) + .writeParallelism(writerParallelism) + .distributionMode(distributionMode) + .append(); + + // Execute the program. + env.execute("Test Iceberg DataStream."); + + SimpleDataUtil.assertTableRows(table, convertToRowData(rows)); + } + + protected int partitionFiles(String partition) throws IOException { + return SimpleDataUtil.partitionDataFiles(table, ImmutableMap.of("data", partition)).size(); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkBranch.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkBranch.java new file mode 100644 index 000000000000..441b5ed2a4ae --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkBranch.java @@ -0,0 +1,137 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink; + +import static org.apache.iceberg.flink.TestFixtures.DATABASE; +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.IOException; +import java.util.List; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.TableSchema; +import org.apache.flink.types.Row; +import org.apache.iceberg.DistributionMode; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Parameter; +import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.SnapshotRef; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.flink.HadoopCatalogExtension; +import org.apache.iceberg.flink.MiniFlinkClusterExtension; +import org.apache.iceberg.flink.SimpleDataUtil; +import org.apache.iceberg.flink.TableLoader; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.extension.ExtendWith; +import org.junit.jupiter.api.extension.RegisterExtension; + +@ExtendWith(ParameterizedTestExtension.class) +public class TestFlinkIcebergSinkBranch extends TestFlinkIcebergSinkBase { + @RegisterExtension + public static final HadoopCatalogExtension CATALOG_EXTENSION = + new HadoopCatalogExtension(DATABASE, TestFixtures.TABLE); + + @Parameter(index = 0) + private String formatVersion; + + @Parameter(index = 1) + private String branch; + + private TableLoader tableLoader; + + @Parameters(name = "formatVersion = {0}, branch = {1}") + public static Object[][] parameters() { + return new Object[][] { + {"1", "main"}, + {"1", "testBranch"}, + {"2", "main"}, + {"2", "testBranch"} + }; + } + + @BeforeEach + public void before() throws IOException { + table = + CATALOG_EXTENSION + .catalog() + .createTable( + TestFixtures.TABLE_IDENTIFIER, + SimpleDataUtil.SCHEMA, + PartitionSpec.unpartitioned(), + ImmutableMap.of( + TableProperties.DEFAULT_FILE_FORMAT, + FileFormat.AVRO.name(), + TableProperties.FORMAT_VERSION, + formatVersion)); + + env = + StreamExecutionEnvironment.getExecutionEnvironment( + MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG) + .enableCheckpointing(100); + + tableLoader = CATALOG_EXTENSION.tableLoader(); + } + + @TestTemplate + public void testWriteRowWithTableSchema() throws Exception { + testWriteRow(SimpleDataUtil.FLINK_SCHEMA, DistributionMode.NONE); + verifyOtherBranchUnmodified(); + } + + private void testWriteRow(TableSchema tableSchema, DistributionMode distributionMode) + throws Exception { + List rows = createRows(""); + DataStream dataStream = env.addSource(createBoundedSource(rows), ROW_TYPE_INFO); + + FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) + .table(table) + .tableLoader(tableLoader) + .tableSchema(tableSchema) + .toBranch(branch) + .distributionMode(distributionMode) + .append(); + + // Execute the program. + env.execute("Test Iceberg DataStream."); + + SimpleDataUtil.assertTableRows(table, convertToRowData(rows), branch); + SimpleDataUtil.assertTableRows( + table, + ImmutableList.of(), + branch.equals(SnapshotRef.MAIN_BRANCH) ? 
"test-branch" : SnapshotRef.MAIN_BRANCH); + + verifyOtherBranchUnmodified(); + } + + private void verifyOtherBranchUnmodified() { + String otherBranch = + branch.equals(SnapshotRef.MAIN_BRANCH) ? "test-branch" : SnapshotRef.MAIN_BRANCH; + if (otherBranch.equals(SnapshotRef.MAIN_BRANCH)) { + assertThat(table.currentSnapshot()).isNull(); + } + + assertThat(table.snapshot(otherBranch)).isNull(); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkDistributionMode.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkDistributionMode.java new file mode 100644 index 000000000000..75e397d3f203 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkDistributionMode.java @@ -0,0 +1,180 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.io.IOException; +import java.util.List; +import java.util.Map; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.test.junit5.MiniClusterExtension; +import org.apache.flink.types.Row; +import org.apache.iceberg.DistributionMode; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Parameter; +import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.flink.FlinkWriteOptions; +import org.apache.iceberg.flink.HadoopCatalogExtension; +import org.apache.iceberg.flink.MiniFlinkClusterExtension; +import org.apache.iceberg.flink.SimpleDataUtil; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.extension.ExtendWith; +import org.junit.jupiter.api.extension.RegisterExtension; + +/** + * This tests the distribution mode of Flink sink. Extract them separately since it is unnecessary + * to test different file formats (Avro, Orc, Parquet) like in {@link TestFlinkIcebergSink}. + * Removing the file format dimension reduces the number of combinations from 12 to 4, which helps + * reduce test run time. 
+ */ +@ExtendWith(ParameterizedTestExtension.class) +public class TestFlinkIcebergSinkDistributionMode extends TestFlinkIcebergSinkBase { + + @RegisterExtension + public static MiniClusterExtension miniClusterResource = + MiniFlinkClusterExtension.createWithClassloaderCheckDisabled(); + + @RegisterExtension + private static final HadoopCatalogExtension CATALOG_EXTENSION = + new HadoopCatalogExtension(TestFixtures.DATABASE, TestFixtures.TABLE); + + private final FileFormat format = FileFormat.PARQUET; + + @Parameter(index = 0) + private int parallelism; + + @Parameter(index = 1) + private boolean partitioned; + + @Parameters(name = "parallelism = {0}, partitioned = {1}") + public static Object[][] parameters() { + return new Object[][] { + {1, true}, + {1, false}, + {2, true}, + {2, false} + }; + } + + @BeforeEach + public void before() throws IOException { + this.table = + CATALOG_EXTENSION + .catalog() + .createTable( + TestFixtures.TABLE_IDENTIFIER, + SimpleDataUtil.SCHEMA, + partitioned + ? PartitionSpec.builderFor(SimpleDataUtil.SCHEMA).identity("data").build() + : PartitionSpec.unpartitioned(), + ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name())); + + this.env = + StreamExecutionEnvironment.getExecutionEnvironment( + MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG) + .enableCheckpointing(100) + .setParallelism(parallelism) + .setMaxParallelism(parallelism); + + this.tableLoader = CATALOG_EXTENSION.tableLoader(); + } + + @TestTemplate + public void testShuffleByPartitionWithSchema() throws Exception { + testWriteRow(parallelism, SimpleDataUtil.FLINK_SCHEMA, DistributionMode.HASH); + if (partitioned) { + assertThat(partitionFiles("aaa")).isEqualTo(1); + assertThat(partitionFiles("bbb")).isEqualTo(1); + assertThat(partitionFiles("ccc")).isEqualTo(1); + } + } + + @TestTemplate + public void testJobNoneDistributeMode() throws Exception { + table + .updateProperties() + .set(TableProperties.WRITE_DISTRIBUTION_MODE, DistributionMode.HASH.modeName()) + .commit(); + + testWriteRow(parallelism, null, DistributionMode.NONE); + + if (parallelism > 1) { + if (partitioned) { + int files = partitionFiles("aaa") + partitionFiles("bbb") + partitionFiles("ccc"); + assertThat(files).isGreaterThan(3); + } + } + } + + @TestTemplate + public void testJobNullDistributionMode() throws Exception { + table + .updateProperties() + .set(TableProperties.WRITE_DISTRIBUTION_MODE, DistributionMode.HASH.modeName()) + .commit(); + + testWriteRow(parallelism, null, null); + + if (partitioned) { + assertThat(partitionFiles("aaa")).isEqualTo(1); + assertThat(partitionFiles("bbb")).isEqualTo(1); + assertThat(partitionFiles("ccc")).isEqualTo(1); + } + } + + @TestTemplate + public void testPartitionWriteMode() throws Exception { + testWriteRow(parallelism, null, DistributionMode.HASH); + if (partitioned) { + assertThat(partitionFiles("aaa")).isEqualTo(1); + assertThat(partitionFiles("bbb")).isEqualTo(1); + assertThat(partitionFiles("ccc")).isEqualTo(1); + } + } + + @TestTemplate + public void testOverrideWriteConfigWithUnknownDistributionMode() { + Map newProps = Maps.newHashMap(); + newProps.put(FlinkWriteOptions.DISTRIBUTION_MODE.key(), "UNRECOGNIZED"); + + List rows = createRows(""); + DataStream dataStream = env.addSource(createBoundedSource(rows), ROW_TYPE_INFO); + + FlinkSink.Builder builder = + FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) + .table(table) + .tableLoader(tableLoader) + .writeParallelism(parallelism) + .setAll(newProps); + + assertThatThrownBy(builder::append) + 
.isInstanceOf(IllegalArgumentException.class) + .hasMessage("Invalid distribution mode: UNRECOGNIZED"); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkExtended.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkExtended.java new file mode 100644 index 000000000000..36a59b20431c --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkExtended.java @@ -0,0 +1,208 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.io.IOException; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.data.RowData; +import org.apache.flink.types.Row; +import org.apache.iceberg.DistributionMode; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.flink.FlinkWriteOptions; +import org.apache.iceberg.flink.MiniFlinkClusterExtension; +import org.apache.iceberg.flink.SimpleDataUtil; +import org.apache.iceberg.flink.TableLoader; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.flink.util.FlinkCompatibilityUtil; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +/** + * This class tests the more extended features of Flink sink. Extract them separately since it is + * unnecessary to test all the parameters combinations in {@link TestFlinkIcebergSink}. Each test + * method in {@link TestFlinkIcebergSink} runs 12 combinations, which are expensive and slow. + */ +public class TestFlinkIcebergSinkExtended extends TestFlinkIcebergSinkBase { + private final boolean partitioned = true; + private final int parallelism = 2; + private final FileFormat format = FileFormat.PARQUET; + + @BeforeEach + public void before() throws IOException { + this.table = + CATALOG_EXTENSION + .catalog() + .createTable( + TestFixtures.TABLE_IDENTIFIER, + SimpleDataUtil.SCHEMA, + partitioned + ? 
PartitionSpec.builderFor(SimpleDataUtil.SCHEMA).identity("data").build() + : PartitionSpec.unpartitioned(), + ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name())); + + this.env = + StreamExecutionEnvironment.getExecutionEnvironment( + MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG) + .enableCheckpointing(100) + .setParallelism(parallelism) + .setMaxParallelism(parallelism); + + this.tableLoader = CATALOG_EXTENSION.tableLoader(); + } + + @Test + public void testTwoSinksInDisjointedDAG() throws Exception { + Map props = ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()); + + Table leftTable = + CATALOG_EXTENSION + .catalog() + .createTable( + TableIdentifier.of("left"), + SimpleDataUtil.SCHEMA, + partitioned + ? PartitionSpec.builderFor(SimpleDataUtil.SCHEMA).identity("data").build() + : PartitionSpec.unpartitioned(), + props); + TableLoader leftTableLoader = + TableLoader.fromCatalog(CATALOG_EXTENSION.catalogLoader(), TableIdentifier.of("left")); + + Table rightTable = + CATALOG_EXTENSION + .catalog() + .createTable( + TableIdentifier.of("right"), + SimpleDataUtil.SCHEMA, + partitioned + ? PartitionSpec.builderFor(SimpleDataUtil.SCHEMA).identity("data").build() + : PartitionSpec.unpartitioned(), + props); + TableLoader rightTableLoader = + TableLoader.fromCatalog(CATALOG_EXTENSION.catalogLoader(), TableIdentifier.of("right")); + + env = + StreamExecutionEnvironment.getExecutionEnvironment( + MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG) + .enableCheckpointing(100) + .setParallelism(parallelism) + .setMaxParallelism(parallelism); + env.getConfig().disableAutoGeneratedUIDs(); + + List leftRows = createRows("left-"); + DataStream leftStream = + env.fromCollection(leftRows, ROW_TYPE_INFO) + .name("leftCustomSource") + .uid("leftCustomSource"); + FlinkSink.forRow(leftStream, SimpleDataUtil.FLINK_SCHEMA) + .table(leftTable) + .tableLoader(leftTableLoader) + .tableSchema(SimpleDataUtil.FLINK_SCHEMA) + .distributionMode(DistributionMode.NONE) + .uidPrefix("leftIcebergSink") + .append(); + + List rightRows = createRows("right-"); + DataStream rightStream = + env.fromCollection(rightRows, ROW_TYPE_INFO) + .name("rightCustomSource") + .uid("rightCustomSource"); + FlinkSink.forRow(rightStream, SimpleDataUtil.FLINK_SCHEMA) + .table(rightTable) + .tableLoader(rightTableLoader) + .tableSchema(SimpleDataUtil.FLINK_SCHEMA) + .writeParallelism(parallelism) + .distributionMode(DistributionMode.HASH) + .uidPrefix("rightIcebergSink") + .setSnapshotProperty("flink.test", TestFlinkIcebergSink.class.getName()) + .setSnapshotProperties(Collections.singletonMap("direction", "rightTable")) + .append(); + + // Execute the program. 
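+    // A single execute() submits both disjoint sink pipelines registered on the shared environment.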
+ env.execute("Test Iceberg DataStream."); + + SimpleDataUtil.assertTableRows(leftTable, convertToRowData(leftRows)); + SimpleDataUtil.assertTableRows(rightTable, convertToRowData(rightRows)); + + leftTable.refresh(); + assertThat(leftTable.currentSnapshot().summary()).doesNotContainKeys("flink.test", "direction"); + rightTable.refresh(); + assertThat(rightTable.currentSnapshot().summary()) + .containsEntry("flink.test", TestFlinkIcebergSink.class.getName()) + .containsEntry("direction", "rightTable"); + } + + @Test + public void testOverrideWriteConfigWithUnknownFileFormat() { + Map newProps = Maps.newHashMap(); + newProps.put(FlinkWriteOptions.WRITE_FORMAT.key(), "UNRECOGNIZED"); + + List rows = createRows(""); + DataStream dataStream = env.addSource(createBoundedSource(rows), ROW_TYPE_INFO); + + FlinkSink.Builder builder = + FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) + .table(table) + .tableLoader(tableLoader) + .writeParallelism(parallelism) + .setAll(newProps); + + assertThatThrownBy(builder::append) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Invalid file format: UNRECOGNIZED"); + } + + @Test + public void testWriteRowWithTableRefreshInterval() throws Exception { + List rows = Lists.newArrayList(Row.of(1, "hello"), Row.of(2, "world"), Row.of(3, "foo")); + DataStream dataStream = + env.addSource(createBoundedSource(rows), ROW_TYPE_INFO) + .map(CONVERTER::toInternal, FlinkCompatibilityUtil.toTypeInfo(SimpleDataUtil.ROW_TYPE)); + + Configuration flinkConf = new Configuration(); + flinkConf.setString(FlinkWriteOptions.TABLE_REFRESH_INTERVAL.key(), "100ms"); + + FlinkSink.forRowData(dataStream) + .table(table) + .tableLoader(tableLoader) + .flinkConf(flinkConf) + .writeParallelism(parallelism) + .append(); + + // Execute the program. + env.execute("Test Iceberg DataStream"); + + // Assert the iceberg table's records. + SimpleDataUtil.assertTableRows(table, convertToRowData(rows)); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2.java new file mode 100644 index 000000000000..577c54976b9a --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2.java @@ -0,0 +1,235 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink; + +import static org.apache.iceberg.flink.TestFixtures.DATABASE; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; +import static org.assertj.core.api.Assumptions.assumeThat; + +import java.util.List; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.test.junit5.MiniClusterExtension; +import org.apache.flink.types.Row; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.DeleteFile; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.MetadataColumns; +import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.SnapshotRef; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.HadoopCatalogExtension; +import org.apache.iceberg.flink.MiniFlinkClusterExtension; +import org.apache.iceberg.flink.SimpleDataUtil; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.flink.source.BoundedTestSource; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.Timeout; +import org.junit.jupiter.api.extension.ExtendWith; +import org.junit.jupiter.api.extension.RegisterExtension; + +@ExtendWith(ParameterizedTestExtension.class) +@Timeout(value = 60) +public class TestFlinkIcebergSinkV2 extends TestFlinkIcebergSinkV2Base { + @RegisterExtension + public static final MiniClusterExtension MINI_CLUSTER_EXTENSION = + MiniFlinkClusterExtension.createWithClassloaderCheckDisabled(); + + @RegisterExtension + private static final HadoopCatalogExtension CATALOG_EXTENSION = + new HadoopCatalogExtension(DATABASE, TestFixtures.TABLE); + + @BeforeEach + public void setupTable() { + table = + CATALOG_EXTENSION + .catalog() + .createTable( + TestFixtures.TABLE_IDENTIFIER, + SimpleDataUtil.SCHEMA, + partitioned + ? 
PartitionSpec.builderFor(SimpleDataUtil.SCHEMA).identity("data").build() + : PartitionSpec.unpartitioned(), + ImmutableMap.of( + TableProperties.DEFAULT_FILE_FORMAT, + format.name(), + TableProperties.FORMAT_VERSION, + String.valueOf(FORMAT_V2))); + + table + .updateProperties() + .set(TableProperties.DEFAULT_FILE_FORMAT, format.name()) + .set(TableProperties.WRITE_DISTRIBUTION_MODE, writeDistributionMode) + .commit(); + + env = + StreamExecutionEnvironment.getExecutionEnvironment( + MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG) + .enableCheckpointing(100L) + .setParallelism(parallelism) + .setMaxParallelism(parallelism); + + tableLoader = CATALOG_EXTENSION.tableLoader(); + } + + @TestTemplate + public void testCheckAndGetEqualityFieldIds() { + table + .updateSchema() + .allowIncompatibleChanges() + .addRequiredColumn("type", Types.StringType.get()) + .setIdentifierFields("type") + .commit(); + + DataStream dataStream = + env.addSource(new BoundedTestSource<>(ImmutableList.of()), ROW_TYPE_INFO); + FlinkSink.Builder builder = + FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA).table(table); + + // Use schema identifier field IDs as equality field id list by default + assertThat(builder.checkAndGetEqualityFieldIds()) + .containsExactlyInAnyOrderElementsOf(table.schema().identifierFieldIds()); + + // Use user-provided equality field column as equality field id list + builder.equalityFieldColumns(Lists.newArrayList("id")); + assertThat(builder.checkAndGetEqualityFieldIds()) + .containsExactlyInAnyOrder(table.schema().findField("id").fieldId()); + + builder.equalityFieldColumns(Lists.newArrayList("type")); + assertThat(builder.checkAndGetEqualityFieldIds()) + .containsExactlyInAnyOrder(table.schema().findField("type").fieldId()); + } + + @TestTemplate + public void testChangeLogOnIdKey() throws Exception { + testChangeLogOnIdKey(SnapshotRef.MAIN_BRANCH); + } + + @TestTemplate + public void testUpsertOnlyDeletesOnDataKey() throws Exception { + List> elementsPerCheckpoint = + ImmutableList.of( + ImmutableList.of(row("+I", 1, "aaa")), + ImmutableList.of(row("-D", 1, "aaa"), row("-D", 2, "bbb"))); + + List> expectedRecords = + ImmutableList.of(ImmutableList.of(record(1, "aaa")), ImmutableList.of()); + + testChangeLogs( + ImmutableList.of("data"), + row -> row.getField(ROW_DATA_POS), + true, + elementsPerCheckpoint, + expectedRecords, + SnapshotRef.MAIN_BRANCH); + } + + @TestTemplate + public void testChangeLogOnDataKey() throws Exception { + testChangeLogOnDataKey(SnapshotRef.MAIN_BRANCH); + } + + @TestTemplate + public void testChangeLogOnIdDataKey() throws Exception { + testChangeLogOnIdDataKey(SnapshotRef.MAIN_BRANCH); + } + + @TestTemplate + public void testChangeLogOnSameKey() throws Exception { + testChangeLogOnSameKey(SnapshotRef.MAIN_BRANCH); + } + + @TestTemplate + public void testUpsertModeCheck() throws Exception { + DataStream dataStream = + env.addSource(new BoundedTestSource<>(ImmutableList.of()), ROW_TYPE_INFO); + FlinkSink.Builder builder = + FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) + .tableLoader(tableLoader) + .tableSchema(SimpleDataUtil.FLINK_SCHEMA) + .writeParallelism(parallelism) + .upsert(true); + + assertThatThrownBy( + () -> + builder + .equalityFieldColumns(ImmutableList.of("id", "data")) + .overwrite(true) + .append()) + .isInstanceOf(IllegalStateException.class) + .hasMessage( + "OVERWRITE mode shouldn't be enable when configuring to use UPSERT data stream."); + + assertThatThrownBy( + () -> 
builder.equalityFieldColumns(ImmutableList.of()).overwrite(false).append()) + .isInstanceOf(IllegalStateException.class) + .hasMessage( + "Equality field columns shouldn't be empty when configuring to use UPSERT data stream."); + } + + @TestTemplate + public void testUpsertOnIdKey() throws Exception { + testUpsertOnIdKey(SnapshotRef.MAIN_BRANCH); + } + + @TestTemplate + public void testUpsertOnDataKey() throws Exception { + testUpsertOnDataKey(SnapshotRef.MAIN_BRANCH); + } + + @TestTemplate + public void testUpsertOnIdDataKey() throws Exception { + testUpsertOnIdDataKey(SnapshotRef.MAIN_BRANCH); + } + + @TestTemplate + public void testDeleteStats() throws Exception { + assumeThat(format).isNotEqualTo(FileFormat.AVRO); + + List> elementsPerCheckpoint = + ImmutableList.of( + // Checkpoint #1 + ImmutableList.of(row("+I", 1, "aaa"), row("-D", 1, "aaa"), row("+I", 1, "aaa"))); + + List> expectedRecords = ImmutableList.of(ImmutableList.of(record(1, "aaa"))); + + testChangeLogs( + ImmutableList.of("id", "data"), + row -> Row.of(row.getField(ROW_ID_POS), row.getField(ROW_DATA_POS)), + false, + elementsPerCheckpoint, + expectedRecords, + "main"); + + DeleteFile deleteFile = table.currentSnapshot().addedDeleteFiles(table.io()).iterator().next(); + String fromStat = + new String( + deleteFile.lowerBounds().get(MetadataColumns.DELETE_FILE_PATH.fieldId()).array()); + DataFile dataFile = table.currentSnapshot().addedDataFiles(table.io()).iterator().next(); + assumeThat(fromStat).isEqualTo(dataFile.path().toString()); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2Base.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2Base.java new file mode 100644 index 000000000000..fc33c2fea5e6 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2Base.java @@ -0,0 +1,389 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.io.IOException; +import java.util.List; +import java.util.Map; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.java.functions.KeySelector; +import org.apache.flink.api.java.typeutils.RowTypeInfo; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.types.Row; +import org.apache.flink.types.RowKind; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Parameter; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.Snapshot; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.data.IcebergGenerics; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.SimpleDataUtil; +import org.apache.iceberg.flink.TableLoader; +import org.apache.iceberg.flink.source.BoundedTestSource; +import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.util.StructLikeSet; + +public class TestFlinkIcebergSinkV2Base { + + protected static final int FORMAT_V2 = 2; + protected static final TypeInformation ROW_TYPE_INFO = + new RowTypeInfo(SimpleDataUtil.FLINK_SCHEMA.getFieldTypes()); + + protected static final int ROW_ID_POS = 0; + protected static final int ROW_DATA_POS = 1; + + protected TableLoader tableLoader; + protected Table table; + protected StreamExecutionEnvironment env; + + @Parameter(index = 0) + protected FileFormat format; + + @Parameter(index = 1) + protected int parallelism = 1; + + @Parameter(index = 2) + protected boolean partitioned; + + @Parameter(index = 3) + protected String writeDistributionMode; + + @Parameters(name = "FileFormat={0}, Parallelism={1}, Partitioned={2}, WriteDistributionMode={3}") + public static Object[][] parameters() { + return new Object[][] { + new Object[] {FileFormat.AVRO, 1, true, TableProperties.WRITE_DISTRIBUTION_MODE_NONE}, + new Object[] {FileFormat.AVRO, 1, false, TableProperties.WRITE_DISTRIBUTION_MODE_NONE}, + new Object[] {FileFormat.AVRO, 4, true, TableProperties.WRITE_DISTRIBUTION_MODE_NONE}, + new Object[] {FileFormat.AVRO, 4, false, TableProperties.WRITE_DISTRIBUTION_MODE_NONE}, + new Object[] {FileFormat.ORC, 1, true, TableProperties.WRITE_DISTRIBUTION_MODE_HASH}, + new Object[] {FileFormat.ORC, 1, false, TableProperties.WRITE_DISTRIBUTION_MODE_HASH}, + new Object[] {FileFormat.ORC, 4, true, TableProperties.WRITE_DISTRIBUTION_MODE_HASH}, + new Object[] {FileFormat.ORC, 4, false, TableProperties.WRITE_DISTRIBUTION_MODE_HASH}, + new Object[] {FileFormat.PARQUET, 1, true, TableProperties.WRITE_DISTRIBUTION_MODE_RANGE}, + new Object[] {FileFormat.PARQUET, 1, false, TableProperties.WRITE_DISTRIBUTION_MODE_RANGE}, + new Object[] {FileFormat.PARQUET, 4, true, TableProperties.WRITE_DISTRIBUTION_MODE_RANGE}, + new Object[] {FileFormat.PARQUET, 4, false, TableProperties.WRITE_DISTRIBUTION_MODE_RANGE} + }; + } + + protected static final Map ROW_KIND_MAP = + ImmutableMap.of( + "+I", RowKind.INSERT, + "-D", RowKind.DELETE, + "-U", RowKind.UPDATE_BEFORE, + "+U", RowKind.UPDATE_AFTER); + + protected Row 
row(String rowKind, int id, String data) { + RowKind kind = ROW_KIND_MAP.get(rowKind); + if (kind == null) { + throw new IllegalArgumentException("Unknown row kind: " + rowKind); + } + + return Row.ofKind(kind, id, data); + } + + protected void testUpsertOnIdDataKey(String branch) throws Exception { + List> elementsPerCheckpoint = + ImmutableList.of( + ImmutableList.of(row("+I", 1, "aaa"), row("+U", 1, "aaa"), row("+I", 2, "bbb")), + ImmutableList.of(row("+I", 1, "aaa"), row("-D", 2, "bbb"), row("+I", 2, "ccc")), + ImmutableList.of(row("+U", 1, "bbb"), row("-U", 1, "ccc"), row("-D", 1, "aaa"))); + + List> expectedRecords = + ImmutableList.of( + ImmutableList.of(record(1, "aaa"), record(2, "bbb")), + ImmutableList.of(record(1, "aaa"), record(2, "ccc")), + ImmutableList.of(record(1, "bbb"), record(2, "ccc"))); + testChangeLogs( + ImmutableList.of("id", "data"), + row -> Row.of(row.getField(ROW_ID_POS), row.getField(ROW_DATA_POS)), + true, + elementsPerCheckpoint, + expectedRecords, + branch); + } + + protected void testChangeLogOnIdDataKey(String branch) throws Exception { + List> elementsPerCheckpoint = + ImmutableList.of( + ImmutableList.of( + row("+I", 1, "aaa"), + row("-D", 1, "aaa"), + row("+I", 2, "bbb"), + row("+I", 1, "bbb"), + row("+I", 2, "aaa")), + ImmutableList.of(row("-U", 2, "aaa"), row("+U", 1, "ccc"), row("+I", 1, "aaa")), + ImmutableList.of(row("-D", 1, "bbb"), row("+I", 2, "aaa"))); + + List> expectedRecords = + ImmutableList.of( + ImmutableList.of(record(1, "bbb"), record(2, "aaa"), record(2, "bbb")), + ImmutableList.of( + record(1, "aaa"), record(1, "bbb"), record(1, "ccc"), record(2, "bbb")), + ImmutableList.of( + record(1, "aaa"), record(1, "ccc"), record(2, "aaa"), record(2, "bbb"))); + + testChangeLogs( + ImmutableList.of("data", "id"), + row -> Row.of(row.getField(ROW_ID_POS), row.getField(ROW_DATA_POS)), + false, + elementsPerCheckpoint, + expectedRecords, + branch); + } + + protected void testChangeLogOnSameKey(String branch) throws Exception { + List> elementsPerCheckpoint = + ImmutableList.of( + // Checkpoint #1 + ImmutableList.of(row("+I", 1, "aaa"), row("-D", 1, "aaa"), row("+I", 1, "aaa")), + // Checkpoint #2 + ImmutableList.of(row("-U", 1, "aaa"), row("+U", 1, "aaa")), + // Checkpoint #3 + ImmutableList.of(row("-D", 1, "aaa"), row("+I", 1, "aaa")), + // Checkpoint #4 + ImmutableList.of(row("-U", 1, "aaa"), row("+U", 1, "aaa"), row("+I", 1, "aaa"))); + + List> expectedRecords = + ImmutableList.of( + ImmutableList.of(record(1, "aaa")), + ImmutableList.of(record(1, "aaa")), + ImmutableList.of(record(1, "aaa")), + ImmutableList.of(record(1, "aaa"), record(1, "aaa"))); + + testChangeLogs( + ImmutableList.of("id", "data"), + row -> Row.of(row.getField(ROW_ID_POS), row.getField(ROW_DATA_POS)), + false, + elementsPerCheckpoint, + expectedRecords, + branch); + } + + protected void testChangeLogOnDataKey(String branch) throws Exception { + List> elementsPerCheckpoint = + ImmutableList.of( + ImmutableList.of( + row("+I", 1, "aaa"), + row("-D", 1, "aaa"), + row("+I", 2, "bbb"), + row("+I", 1, "bbb"), + row("+I", 2, "aaa")), + ImmutableList.of(row("-U", 2, "aaa"), row("+U", 1, "ccc"), row("+I", 1, "aaa")), + ImmutableList.of(row("-D", 1, "bbb"), row("+I", 2, "aaa"), row("+I", 2, "ccc"))); + + List> expectedRecords = + ImmutableList.of( + ImmutableList.of(record(1, "bbb"), record(2, "aaa")), + ImmutableList.of(record(1, "aaa"), record(1, "bbb"), record(1, "ccc")), + ImmutableList.of( + record(1, "aaa"), record(1, "ccc"), record(2, "aaa"), record(2, "ccc"))); + + 
testChangeLogs( + ImmutableList.of("data"), + row -> row.getField(ROW_DATA_POS), + false, + elementsPerCheckpoint, + expectedRecords, + branch); + } + + protected void testUpsertOnDataKey(String branch) throws Exception { + List> elementsPerCheckpoint = + ImmutableList.of( + ImmutableList.of(row("+I", 1, "aaa"), row("+I", 2, "aaa"), row("+I", 3, "bbb")), + ImmutableList.of(row("+U", 4, "aaa"), row("-U", 3, "bbb"), row("+U", 5, "bbb")), + ImmutableList.of(row("+I", 6, "aaa"), row("+U", 7, "bbb"))); + + List> expectedRecords = + ImmutableList.of( + ImmutableList.of(record(2, "aaa"), record(3, "bbb")), + ImmutableList.of(record(4, "aaa"), record(5, "bbb")), + ImmutableList.of(record(6, "aaa"), record(7, "bbb"))); + + testChangeLogs( + ImmutableList.of("data"), + row -> row.getField(ROW_DATA_POS), + true, + elementsPerCheckpoint, + expectedRecords, + branch); + } + + protected void testChangeLogOnIdKey(String branch) throws Exception { + List> elementsPerCheckpoint = + ImmutableList.of( + ImmutableList.of( + row("+I", 1, "aaa"), + row("-D", 1, "aaa"), + row("+I", 1, "bbb"), + row("+I", 2, "aaa"), + row("-D", 2, "aaa"), + row("+I", 2, "bbb")), + ImmutableList.of( + row("-U", 2, "bbb"), row("+U", 2, "ccc"), row("-D", 2, "ccc"), row("+I", 2, "ddd")), + ImmutableList.of( + row("-D", 1, "bbb"), + row("+I", 1, "ccc"), + row("-D", 1, "ccc"), + row("+I", 1, "ddd"))); + + List> expectedRecords = + ImmutableList.of( + ImmutableList.of(record(1, "bbb"), record(2, "bbb")), + ImmutableList.of(record(1, "bbb"), record(2, "ddd")), + ImmutableList.of(record(1, "ddd"), record(2, "ddd"))); + + if (partitioned && writeDistributionMode.equals(TableProperties.WRITE_DISTRIBUTION_MODE_HASH)) { + assertThatThrownBy( + () -> + testChangeLogs( + ImmutableList.of("id"), + row -> row.getField(ROW_ID_POS), + false, + elementsPerCheckpoint, + expectedRecords, + branch)) + .isInstanceOf(IllegalStateException.class) + .hasMessageStartingWith( + "In 'hash' distribution mode with equality fields set, partition field") + .hasMessageContaining("should be included in equality fields:"); + + } else { + testChangeLogs( + ImmutableList.of("id"), + row -> row.getField(ROW_ID_POS), + false, + elementsPerCheckpoint, + expectedRecords, + branch); + } + } + + protected void testUpsertOnIdKey(String branch) throws Exception { + List> elementsPerCheckpoint = + ImmutableList.of( + ImmutableList.of(row("+I", 1, "aaa"), row("+U", 1, "bbb")), + ImmutableList.of(row("+I", 1, "ccc")), + ImmutableList.of(row("+U", 1, "ddd"), row("+I", 1, "eee"))); + + List> expectedRecords = + ImmutableList.of( + ImmutableList.of(record(1, "bbb")), + ImmutableList.of(record(1, "ccc")), + ImmutableList.of(record(1, "eee"))); + + if (!partitioned) { + testChangeLogs( + ImmutableList.of("id"), + row -> row.getField(ROW_ID_POS), + true, + elementsPerCheckpoint, + expectedRecords, + branch); + } else { + assertThatThrownBy( + () -> + testChangeLogs( + ImmutableList.of("id"), + row -> row.getField(ROW_ID_POS), + true, + elementsPerCheckpoint, + expectedRecords, + branch)) + .isInstanceOf(IllegalStateException.class) + .hasMessageContaining("should be included in equality fields:"); + } + } + + protected void testChangeLogs( + List equalityFieldColumns, + KeySelector keySelector, + boolean insertAsUpsert, + List> elementsPerCheckpoint, + List> expectedRecordsPerCheckpoint, + String branch) + throws Exception { + DataStream dataStream = + env.addSource(new BoundedTestSource<>(elementsPerCheckpoint), ROW_TYPE_INFO); + + FlinkSink.forRow(dataStream, 
SimpleDataUtil.FLINK_SCHEMA) + .tableLoader(tableLoader) + .tableSchema(SimpleDataUtil.FLINK_SCHEMA) + .writeParallelism(parallelism) + .equalityFieldColumns(equalityFieldColumns) + .upsert(insertAsUpsert) + .toBranch(branch) + .append(); + + // Execute the program. + env.execute("Test Iceberg Change-Log DataStream."); + + table.refresh(); + List snapshots = findValidSnapshots(); + int expectedSnapshotNum = expectedRecordsPerCheckpoint.size(); + assertThat(snapshots).hasSize(expectedSnapshotNum); + + for (int i = 0; i < expectedSnapshotNum; i++) { + long snapshotId = snapshots.get(i).snapshotId(); + List expectedRecords = expectedRecordsPerCheckpoint.get(i); + assertThat(actualRowSet(snapshotId, "*")) + .as("Should have the expected records for the checkpoint#" + i) + .isEqualTo(expectedRowSet(expectedRecords.toArray(new Record[0]))); + } + } + + protected Record record(int id, String data) { + return SimpleDataUtil.createRecord(id, data); + } + + private List findValidSnapshots() { + List validSnapshots = Lists.newArrayList(); + for (Snapshot snapshot : table.snapshots()) { + if (snapshot.allManifests(table.io()).stream() + .anyMatch(m -> snapshot.snapshotId() == m.snapshotId())) { + validSnapshots.add(snapshot); + } + } + return validSnapshots; + } + + private StructLikeSet expectedRowSet(Record... records) { + return SimpleDataUtil.expectedRowSet(table, records); + } + + private StructLikeSet actualRowSet(long snapshotId, String... columns) throws IOException { + table.refresh(); + StructLikeSet set = StructLikeSet.create(table.schema().asStruct()); + try (CloseableIterable reader = + IcebergGenerics.read(table).useSnapshot(snapshotId).select(columns).build()) { + reader.forEach(set::add); + } + return set; + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2Branch.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2Branch.java new file mode 100644 index 000000000000..0b0c55f51c32 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2Branch.java @@ -0,0 +1,125 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink; + +import static org.apache.iceberg.flink.TestFixtures.DATABASE; +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.IOException; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Parameter; +import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.SnapshotRef; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.flink.HadoopCatalogExtension; +import org.apache.iceberg.flink.MiniFlinkClusterExtension; +import org.apache.iceberg.flink.SimpleDataUtil; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.extension.ExtendWith; +import org.junit.jupiter.api.extension.RegisterExtension; + +@ExtendWith(ParameterizedTestExtension.class) +public class TestFlinkIcebergSinkV2Branch extends TestFlinkIcebergSinkV2Base { + @RegisterExtension + private static final HadoopCatalogExtension CATALOG_EXTENSION = + new HadoopCatalogExtension(DATABASE, TestFixtures.TABLE); + + @Parameter(index = 0) + private String branch; + + @Parameters(name = "branch = {0}") + public static Object[][] parameters() { + return new Object[][] {new Object[] {"main"}, new Object[] {"testBranch"}}; + } + + @BeforeEach + public void before() throws IOException { + table = + CATALOG_EXTENSION + .catalog() + .createTable( + TestFixtures.TABLE_IDENTIFIER, + SimpleDataUtil.SCHEMA, + PartitionSpec.unpartitioned(), + ImmutableMap.of( + TableProperties.DEFAULT_FILE_FORMAT, + FileFormat.AVRO.name(), + TableProperties.FORMAT_VERSION, + "2")); + + env = + StreamExecutionEnvironment.getExecutionEnvironment( + MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG) + .enableCheckpointing(100); + + tableLoader = CATALOG_EXTENSION.tableLoader(); + } + + @TestTemplate + public void testChangeLogOnIdKey() throws Exception { + testChangeLogOnIdKey(branch); + verifyOtherBranchUnmodified(); + } + + @TestTemplate + public void testChangeLogOnDataKey() throws Exception { + testChangeLogOnDataKey(branch); + verifyOtherBranchUnmodified(); + } + + @TestTemplate + public void testChangeLogOnIdDataKey() throws Exception { + testChangeLogOnIdDataKey(branch); + verifyOtherBranchUnmodified(); + } + + @TestTemplate + public void testUpsertOnIdKey() throws Exception { + testUpsertOnIdKey(branch); + verifyOtherBranchUnmodified(); + } + + @TestTemplate + public void testUpsertOnDataKey() throws Exception { + testUpsertOnDataKey(branch); + verifyOtherBranchUnmodified(); + } + + @TestTemplate + public void testUpsertOnIdDataKey() throws Exception { + testUpsertOnIdDataKey(branch); + verifyOtherBranchUnmodified(); + } + + private void verifyOtherBranchUnmodified() { + String otherBranch = + branch.equals(SnapshotRef.MAIN_BRANCH) ? 
"test-branch" : SnapshotRef.MAIN_BRANCH; + if (otherBranch.equals(SnapshotRef.MAIN_BRANCH)) { + assertThat(table.currentSnapshot()).isNull(); + } + + assertThat(table.snapshot(otherBranch)).isNull(); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkManifest.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkManifest.java new file mode 100644 index 000000000000..53b7c4c0cc91 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkManifest.java @@ -0,0 +1,312 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink; + +import static org.apache.iceberg.flink.sink.ManifestOutputFileFactory.FLINK_MANIFEST_LOCATION; +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.List; +import java.util.Map; +import java.util.UUID; +import java.util.concurrent.atomic.AtomicInteger; +import org.apache.flink.core.io.SimpleVersionedSerialization; +import org.apache.flink.core.io.SimpleVersionedSerializer; +import org.apache.flink.table.data.RowData; +import org.apache.hadoop.conf.Configuration; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.DeleteFile; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.ManifestFile; +import org.apache.iceberg.ManifestFiles; +import org.apache.iceberg.Table; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.SimpleDataUtil; +import org.apache.iceberg.flink.TestHelpers; +import org.apache.iceberg.io.FileAppenderFactory; +import org.apache.iceberg.io.WriteResult; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.util.Pair; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +public class TestFlinkManifest { + private static final Configuration CONF = new Configuration(); + + @TempDir protected Path temporaryFolder; + + private Table table; + private FileAppenderFactory appenderFactory; + private final AtomicInteger fileCount = new AtomicInteger(0); + + @BeforeEach + public void before() throws IOException { + File folder = Files.createTempDirectory(temporaryFolder, "junit").toFile(); + String warehouse = folder.getAbsolutePath(); + + String tablePath = warehouse.concat("/test"); + assertThat(new File(tablePath).mkdir()).isTrue(); + + // Construct the iceberg table. 
+ table = SimpleDataUtil.createTable(tablePath, ImmutableMap.of(), false); + + int[] equalityFieldIds = + new int[] { + table.schema().findField("id").fieldId(), table.schema().findField("data").fieldId() + }; + this.appenderFactory = + new FlinkAppenderFactory( + table, + table.schema(), + FlinkSchemaUtil.convert(table.schema()), + table.properties(), + table.spec(), + equalityFieldIds, + table.schema(), + null); + } + + @Test + public void testIO() throws IOException { + String flinkJobId = newFlinkJobId(); + String operatorId = newOperatorUniqueId(); + for (long checkpointId = 1; checkpointId <= 3; checkpointId++) { + ManifestOutputFileFactory factory = + FlinkManifestUtil.createOutputFileFactory( + () -> table, table.properties(), flinkJobId, operatorId, 1, 1); + final long curCkpId = checkpointId; + + List dataFiles = generateDataFiles(10); + List eqDeleteFiles = generateEqDeleteFiles(5); + List posDeleteFiles = generatePosDeleteFiles(5); + DeltaManifests deltaManifests = + FlinkManifestUtil.writeCompletedFiles( + WriteResult.builder() + .addDataFiles(dataFiles) + .addDeleteFiles(eqDeleteFiles) + .addDeleteFiles(posDeleteFiles) + .build(), + () -> factory.create(curCkpId), + table.spec()); + + WriteResult result = + FlinkManifestUtil.readCompletedFiles(deltaManifests, table.io(), table.specs()); + assertThat(result.deleteFiles()).hasSize(10); + for (int i = 0; i < dataFiles.size(); i++) { + TestHelpers.assertEquals(dataFiles.get(i), result.dataFiles()[i]); + } + assertThat(result.deleteFiles()).hasSize(10); + for (int i = 0; i < 5; i++) { + TestHelpers.assertEquals(eqDeleteFiles.get(i), result.deleteFiles()[i]); + } + for (int i = 0; i < 5; i++) { + TestHelpers.assertEquals(posDeleteFiles.get(i), result.deleteFiles()[5 + i]); + } + } + } + + @Test + public void testUserProvidedManifestLocation() throws IOException { + long checkpointId = 1; + String flinkJobId = newFlinkJobId(); + String operatorId = newOperatorUniqueId(); + File userProvidedFolder = Files.createTempDirectory(temporaryFolder, "junit").toFile(); + Map props = + ImmutableMap.of(FLINK_MANIFEST_LOCATION, userProvidedFolder.getAbsolutePath() + "///"); + ManifestOutputFileFactory factory = + new ManifestOutputFileFactory(() -> table, props, flinkJobId, operatorId, 1, 1); + + List dataFiles = generateDataFiles(5); + DeltaManifests deltaManifests = + FlinkManifestUtil.writeCompletedFiles( + WriteResult.builder().addDataFiles(dataFiles).build(), + () -> factory.create(checkpointId), + table.spec()); + + assertThat(deltaManifests.dataManifest()).isNotNull(); + assertThat(deltaManifests.deleteManifest()).isNull(); + assertThat(Paths.get(deltaManifests.dataManifest().path())) + .hasParent(userProvidedFolder.toPath()); + + WriteResult result = + FlinkManifestUtil.readCompletedFiles(deltaManifests, table.io(), table.specs()); + + assertThat(result.deleteFiles()).isEmpty(); + assertThat(result.dataFiles()).hasSize(5); + + assertThat(result.dataFiles()).hasSameSizeAs(dataFiles); + for (int i = 0; i < dataFiles.size(); i++) { + TestHelpers.assertEquals(dataFiles.get(i), result.dataFiles()[i]); + } + } + + @Test + public void testVersionedSerializer() throws IOException { + long checkpointId = 1; + String flinkJobId = newFlinkJobId(); + String operatorId = newOperatorUniqueId(); + ManifestOutputFileFactory factory = + FlinkManifestUtil.createOutputFileFactory( + () -> table, table.properties(), flinkJobId, operatorId, 1, 1); + + List dataFiles = generateDataFiles(10); + List eqDeleteFiles = generateEqDeleteFiles(10); + List 
posDeleteFiles = generatePosDeleteFiles(10); + DeltaManifests expected = + FlinkManifestUtil.writeCompletedFiles( + WriteResult.builder() + .addDataFiles(dataFiles) + .addDeleteFiles(eqDeleteFiles) + .addDeleteFiles(posDeleteFiles) + .build(), + () -> factory.create(checkpointId), + table.spec()); + + byte[] versionedSerializeData = + SimpleVersionedSerialization.writeVersionAndSerialize( + DeltaManifestsSerializer.INSTANCE, expected); + DeltaManifests actual = + SimpleVersionedSerialization.readVersionAndDeSerialize( + DeltaManifestsSerializer.INSTANCE, versionedSerializeData); + TestHelpers.assertEquals(expected.dataManifest(), actual.dataManifest()); + TestHelpers.assertEquals(expected.deleteManifest(), actual.deleteManifest()); + + byte[] versionedSerializeData2 = + SimpleVersionedSerialization.writeVersionAndSerialize( + DeltaManifestsSerializer.INSTANCE, actual); + assertThat(versionedSerializeData2).containsExactly(versionedSerializeData); + } + + @Test + public void testCompatibility() throws IOException { + // The v2 deserializer should be able to deserialize the v1 binary. + long checkpointId = 1; + String flinkJobId = newFlinkJobId(); + String operatorId = newOperatorUniqueId(); + ManifestOutputFileFactory factory = + FlinkManifestUtil.createOutputFileFactory( + () -> table, table.properties(), flinkJobId, operatorId, 1, 1); + + List dataFiles = generateDataFiles(10); + ManifestFile manifest = + FlinkManifestUtil.writeDataFiles(factory.create(checkpointId), table.spec(), dataFiles); + byte[] dataV1 = + SimpleVersionedSerialization.writeVersionAndSerialize(new V1Serializer(), manifest); + + DeltaManifests delta = + SimpleVersionedSerialization.readVersionAndDeSerialize( + DeltaManifestsSerializer.INSTANCE, dataV1); + assertThat(delta.deleteManifest()).isNull(); + assertThat(delta.dataManifest()).isNotNull(); + TestHelpers.assertEquals(manifest, delta.dataManifest()); + + List actualFiles = + FlinkManifestUtil.readDataFiles(delta.dataManifest(), table.io(), table.specs()); + assertThat(actualFiles).hasSize(10); + for (int i = 0; i < 10; i++) { + TestHelpers.assertEquals(dataFiles.get(i), actualFiles.get(i)); + } + } + + private static class V1Serializer implements SimpleVersionedSerializer { + + @Override + public int getVersion() { + return 1; + } + + @Override + public byte[] serialize(ManifestFile m) throws IOException { + return ManifestFiles.encode(m); + } + + @Override + public ManifestFile deserialize(int version, byte[] serialized) throws IOException { + return ManifestFiles.decode(serialized); + } + } + + private DataFile writeDataFile(String filename, List rows) throws IOException { + return SimpleDataUtil.writeFile( + table, + table.schema(), + table.spec(), + CONF, + table.location(), + FileFormat.PARQUET.addExtension(filename), + rows); + } + + private DeleteFile writeEqDeleteFile(String filename, List deletes) throws IOException { + return SimpleDataUtil.writeEqDeleteFile( + table, FileFormat.PARQUET, filename, appenderFactory, deletes); + } + + private DeleteFile writePosDeleteFile(String filename, List> positions) + throws IOException { + return SimpleDataUtil.writePosDeleteFile( + table, FileFormat.PARQUET, filename, appenderFactory, positions); + } + + private List generateDataFiles(int fileNum) throws IOException { + List rowDataList = Lists.newArrayList(); + List dataFiles = Lists.newArrayList(); + for (int i = 0; i < fileNum; i++) { + rowDataList.add(SimpleDataUtil.createRowData(i, "a" + i)); + dataFiles.add(writeDataFile("data-file-" + 
fileCount.incrementAndGet(), rowDataList)); + } + return dataFiles; + } + + private List generateEqDeleteFiles(int fileNum) throws IOException { + List rowDataList = Lists.newArrayList(); + List deleteFiles = Lists.newArrayList(); + for (int i = 0; i < fileNum; i++) { + rowDataList.add(SimpleDataUtil.createDelete(i, "a" + i)); + deleteFiles.add( + writeEqDeleteFile("eq-delete-file-" + fileCount.incrementAndGet(), rowDataList)); + } + return deleteFiles; + } + + private List generatePosDeleteFiles(int fileNum) throws IOException { + List> positions = Lists.newArrayList(); + List deleteFiles = Lists.newArrayList(); + for (int i = 0; i < fileNum; i++) { + positions.add(Pair.of("data-file-1", (long) i)); + deleteFiles.add( + writePosDeleteFile("pos-delete-file-" + fileCount.incrementAndGet(), positions)); + } + return deleteFiles; + } + + private static String newFlinkJobId() { + return UUID.randomUUID().toString(); + } + + private static String newOperatorUniqueId() { + return UUID.randomUUID().toString(); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPartitioningWriters.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPartitioningWriters.java new file mode 100644 index 000000000000..939ed2be7dbc --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPartitioningWriters.java @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink; + +import java.util.Arrays; +import java.util.List; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.Schema; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.RowDataWrapper; +import org.apache.iceberg.flink.SimpleDataUtil; +import org.apache.iceberg.io.FileWriterFactory; +import org.apache.iceberg.io.TestPartitioningWriters; +import org.apache.iceberg.util.ArrayUtil; +import org.apache.iceberg.util.StructLikeSet; + +public class TestFlinkPartitioningWriters extends TestPartitioningWriters { + + @Parameters(name = "formatVersion = {0}, fileFormat = {1}") + protected static List parameters() { + return Arrays.asList( + new Object[] {2, FileFormat.AVRO}, + new Object[] {2, FileFormat.PARQUET}, + new Object[] {2, FileFormat.ORC}); + } + + @Override + protected FileWriterFactory newWriterFactory( + Schema dataSchema, + List equalityFieldIds, + Schema equalityDeleteRowSchema, + Schema positionDeleteRowSchema) { + return FlinkFileWriterFactory.builderFor(table) + .dataSchema(table.schema()) + .dataFileFormat(format()) + .deleteFileFormat(format()) + .equalityFieldIds(ArrayUtil.toIntArray(equalityFieldIds)) + .equalityDeleteRowSchema(equalityDeleteRowSchema) + .positionDeleteRowSchema(positionDeleteRowSchema) + .build(); + } + + @Override + protected RowData toRow(Integer id, String data) { + return SimpleDataUtil.createRowData(id, data); + } + + @Override + protected StructLikeSet toSet(Iterable rows) { + StructLikeSet set = StructLikeSet.create(table.schema().asStruct()); + RowType flinkType = FlinkSchemaUtil.convert(table.schema()); + for (RowData row : rows) { + RowDataWrapper wrapper = new RowDataWrapper(flinkType, table.schema().asStruct()); + set.add(wrapper.wrap(row)); + } + return set; + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPositionDeltaWriters.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPositionDeltaWriters.java new file mode 100644 index 000000000000..3050752d1c24 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPositionDeltaWriters.java @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink; + +import java.util.List; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.Schema; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.RowDataWrapper; +import org.apache.iceberg.flink.SimpleDataUtil; +import org.apache.iceberg.io.FileWriterFactory; +import org.apache.iceberg.io.TestPositionDeltaWriters; +import org.apache.iceberg.util.ArrayUtil; +import org.apache.iceberg.util.StructLikeSet; + +public class TestFlinkPositionDeltaWriters extends TestPositionDeltaWriters { + + @Override + protected FileWriterFactory newWriterFactory( + Schema dataSchema, + List equalityFieldIds, + Schema equalityDeleteRowSchema, + Schema positionDeleteRowSchema) { + return FlinkFileWriterFactory.builderFor(table) + .dataSchema(table.schema()) + .dataFileFormat(format()) + .deleteFileFormat(format()) + .equalityFieldIds(ArrayUtil.toIntArray(equalityFieldIds)) + .equalityDeleteRowSchema(equalityDeleteRowSchema) + .positionDeleteRowSchema(positionDeleteRowSchema) + .build(); + } + + @Override + protected RowData toRow(Integer id, String data) { + return SimpleDataUtil.createRowData(id, data); + } + + @Override + protected StructLikeSet toSet(Iterable rows) { + StructLikeSet set = StructLikeSet.create(table.schema().asStruct()); + RowType flinkType = FlinkSchemaUtil.convert(table.schema()); + for (RowData row : rows) { + RowDataWrapper wrapper = new RowDataWrapper(flinkType, table.schema().asStruct()); + set.add(wrapper.wrap(row)); + } + return set; + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkRollingFileWriters.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkRollingFileWriters.java new file mode 100644 index 000000000000..03051b69cf87 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkRollingFileWriters.java @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink; + +import java.util.List; +import org.apache.flink.table.data.RowData; +import org.apache.iceberg.Schema; +import org.apache.iceberg.flink.SimpleDataUtil; +import org.apache.iceberg.io.FileWriterFactory; +import org.apache.iceberg.io.TestRollingFileWriters; +import org.apache.iceberg.util.ArrayUtil; + +public class TestFlinkRollingFileWriters extends TestRollingFileWriters { + + @Override + protected FileWriterFactory newWriterFactory( + Schema dataSchema, + List equalityFieldIds, + Schema equalityDeleteRowSchema, + Schema positionDeleteRowSchema) { + return FlinkFileWriterFactory.builderFor(table) + .dataSchema(table.schema()) + .dataFileFormat(format()) + .deleteFileFormat(format()) + .equalityFieldIds(ArrayUtil.toIntArray(equalityFieldIds)) + .equalityDeleteRowSchema(equalityDeleteRowSchema) + .positionDeleteRowSchema(positionDeleteRowSchema) + .build(); + } + + @Override + protected RowData toRow(Integer id, String data) { + return SimpleDataUtil.createRowData(id, data); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkWriterMetrics.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkWriterMetrics.java new file mode 100644 index 000000000000..e6d64ef2c720 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkWriterMetrics.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink; + +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Table; +import org.apache.iceberg.io.FileWriterFactory; +import org.apache.iceberg.io.TestWriterMetrics; + +public class TestFlinkWriterMetrics extends TestWriterMetrics { + + public TestFlinkWriterMetrics(FileFormat fileFormat) { + super(fileFormat); + } + + @Override + protected FileWriterFactory newWriterFactory(Table sourceTable) { + return FlinkFileWriterFactory.builderFor(sourceTable) + .dataSchema(sourceTable.schema()) + .dataFileFormat(fileFormat) + .deleteFileFormat(fileFormat) + .positionDeleteRowSchema(sourceTable.schema()) + .build(); + } + + @Override + protected RowData toRow(Integer id, String data, boolean boolValue, Long longValue) { + GenericRowData nested = GenericRowData.of(boolValue, longValue); + GenericRowData row = GenericRowData.of(id, StringData.fromString(data), nested); + return row; + } + + @Override + public RowData toGenericRow(int value, int repeated) { + GenericRowData row = new GenericRowData(repeated); + for (int i = 0; i < repeated; i++) { + row.setField(i, value); + } + return row; + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergFilesCommitter.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergFilesCommitter.java new file mode 100644 index 000000000000..948c7b31430c --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergFilesCommitter.java @@ -0,0 +1,1148 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink; + +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; +import static org.apache.iceberg.flink.sink.IcebergFilesCommitter.MAX_CONTINUOUS_EMPTY_COMMITS; +import static org.apache.iceberg.flink.sink.ManifestOutputFileFactory.FLINK_MANIFEST_LOCATION; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assumptions.assumeThat; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.NavigableMap; +import java.util.SortedMap; +import java.util.stream.Collectors; +import org.apache.flink.api.common.ExecutionConfig; +import org.apache.flink.api.common.JobID; +import org.apache.flink.api.common.state.ListState; +import org.apache.flink.api.common.state.OperatorStateStore; +import org.apache.flink.core.io.SimpleVersionedSerialization; +import org.apache.flink.runtime.checkpoint.OperatorSubtaskState; +import org.apache.flink.runtime.jobgraph.OperatorID; +import org.apache.flink.runtime.operators.testutils.MockEnvironment; +import org.apache.flink.runtime.operators.testutils.MockEnvironmentBuilder; +import org.apache.flink.runtime.operators.testutils.MockInputSplitProvider; +import org.apache.flink.streaming.api.operators.AbstractStreamOperatorFactory; +import org.apache.flink.streaming.api.operators.BoundedOneInput; +import org.apache.flink.streaming.api.operators.OneInputStreamOperatorFactory; +import org.apache.flink.streaming.api.operators.StreamOperator; +import org.apache.flink.streaming.api.operators.StreamOperatorParameters; +import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness; +import org.apache.flink.table.data.RowData; +import org.apache.hadoop.conf.Configuration; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.DeleteFile; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.GenericManifestFile; +import org.apache.iceberg.ManifestContent; +import org.apache.iceberg.ManifestFile; +import org.apache.iceberg.Parameter; +import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.PartitionData; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.StructLike; +import org.apache.iceberg.TestBase; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.SimpleDataUtil; +import org.apache.iceberg.flink.TestHelpers; +import org.apache.iceberg.flink.TestTableLoader; +import org.apache.iceberg.io.FileAppenderFactory; +import org.apache.iceberg.io.WriteResult; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.util.Pair; +import org.apache.iceberg.util.ThreadPools; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.extension.ExtendWith; + +@ExtendWith(ParameterizedTestExtension.class) +public class TestIcebergFilesCommitter extends TestBase { + private static final Configuration CONF = new Configuration(); + + private File flinkManifestFolder; + + @Parameter(index = 1) + private FileFormat format; + + @Parameter(index = 2) + private String branch; + + @Parameters(name = "formatVersion = {0}, fileFormat = {1}, branch = {2}") + protected static List 
parameters() { + return Arrays.asList( + new Object[] {1, FileFormat.AVRO, "main"}, + new Object[] {2, FileFormat.AVRO, "test-branch"}, + new Object[] {1, FileFormat.PARQUET, "main"}, + new Object[] {2, FileFormat.PARQUET, "test-branch"}, + new Object[] {1, FileFormat.ORC, "main"}, + new Object[] {2, FileFormat.ORC, "test-branch"}); + } + + @Override + @BeforeEach + public void setupTable() throws IOException { + flinkManifestFolder = Files.createTempDirectory(temp, "flink").toFile(); + + this.tableDir = Files.createTempDirectory(temp, "junit").toFile(); + this.metadataDir = new File(tableDir, "metadata"); + assertThat(tableDir.delete()).isTrue(); + + // Construct the iceberg table. + table = create(SimpleDataUtil.SCHEMA, PartitionSpec.unpartitioned()); + + table + .updateProperties() + .set(DEFAULT_FILE_FORMAT, format.name()) + .set(FLINK_MANIFEST_LOCATION, flinkManifestFolder.getAbsolutePath()) + .set(MAX_CONTINUOUS_EMPTY_COMMITS, "1") + .commit(); + } + + @TestTemplate + public void testCommitTxnWithoutDataFiles() throws Exception { + long checkpointId = 0; + long timestamp = 0; + JobID jobId = new JobID(); + OperatorID operatorId; + try (OneInputStreamOperatorTestHarness harness = createStreamSink(jobId)) { + harness.setup(); + harness.open(); + operatorId = harness.getOperator().getOperatorID(); + + SimpleDataUtil.assertTableRows(table, Lists.newArrayList(), branch); + assertSnapshotSize(0); + assertMaxCommittedCheckpointId(jobId, operatorId, -1L); + + // It's better to advance the max-committed-checkpoint-id in iceberg snapshot, so that the + // future flink job + // failover won't fail. + for (int i = 1; i <= 3; i++) { + harness.snapshot(++checkpointId, ++timestamp); + assertFlinkManifests(0); + + harness.notifyOfCompletedCheckpoint(checkpointId); + assertFlinkManifests(0); + + assertSnapshotSize(i); + assertMaxCommittedCheckpointId(jobId, operatorId, checkpointId); + } + } + } + + @TestTemplate + public void testMaxContinuousEmptyCommits() throws Exception { + table.updateProperties().set(MAX_CONTINUOUS_EMPTY_COMMITS, "3").commit(); + + JobID jobId = new JobID(); + long checkpointId = 0; + long timestamp = 0; + try (OneInputStreamOperatorTestHarness harness = createStreamSink(jobId)) { + harness.setup(); + harness.open(); + + assertSnapshotSize(0); + + for (int i = 1; i <= 9; i++) { + harness.snapshot(++checkpointId, ++timestamp); + harness.notifyOfCompletedCheckpoint(checkpointId); + + assertSnapshotSize(i / 3); + } + } + } + + private WriteResult of(DataFile dataFile) { + return WriteResult.builder().addDataFiles(dataFile).build(); + } + + @TestTemplate + public void testCommitTxn() throws Exception { + // Test with 3 continues checkpoints: + // 1. snapshotState for checkpoint#1 + // 2. notifyCheckpointComplete for checkpoint#1 + // 3. snapshotState for checkpoint#2 + // 4. notifyCheckpointComplete for checkpoint#2 + // 5. snapshotState for checkpoint#3 + // 6. 
notifyCheckpointComplete for checkpoint#3 + long timestamp = 0; + + JobID jobID = new JobID(); + OperatorID operatorId; + try (OneInputStreamOperatorTestHarness harness = createStreamSink(jobID)) { + harness.setup(); + harness.open(); + operatorId = harness.getOperator().getOperatorID(); + + assertSnapshotSize(0); + + List rows = Lists.newArrayListWithExpectedSize(3); + for (int i = 1; i <= 3; i++) { + RowData rowData = SimpleDataUtil.createRowData(i, "hello" + i); + DataFile dataFile = writeDataFile("data-" + i, ImmutableList.of(rowData)); + harness.processElement(of(dataFile), ++timestamp); + rows.add(rowData); + + harness.snapshot(i, ++timestamp); + assertFlinkManifests(1); + + harness.notifyOfCompletedCheckpoint(i); + assertFlinkManifests(0); + + SimpleDataUtil.assertTableRows(table, ImmutableList.copyOf(rows), branch); + assertSnapshotSize(i); + assertMaxCommittedCheckpointId(jobID, operatorId, i); + assertThat(SimpleDataUtil.latestSnapshot(table, branch).summary()) + .containsEntry("flink.test", TestIcebergFilesCommitter.class.getName()); + } + } + } + + @TestTemplate + public void testOrderedEventsBetweenCheckpoints() throws Exception { + // It's possible that two checkpoints happen in the following orders: + // 1. snapshotState for checkpoint#1; + // 2. snapshotState for checkpoint#2; + // 3. notifyCheckpointComplete for checkpoint#1; + // 4. notifyCheckpointComplete for checkpoint#2; + long timestamp = 0; + + JobID jobId = new JobID(); + OperatorID operatorId; + try (OneInputStreamOperatorTestHarness harness = createStreamSink(jobId)) { + harness.setup(); + harness.open(); + operatorId = harness.getOperator().getOperatorID(); + + assertMaxCommittedCheckpointId(jobId, operatorId, -1L); + + RowData row1 = SimpleDataUtil.createRowData(1, "hello"); + DataFile dataFile1 = writeDataFile("data-1", ImmutableList.of(row1)); + + harness.processElement(of(dataFile1), ++timestamp); + assertMaxCommittedCheckpointId(jobId, operatorId, -1L); + + // 1. snapshotState for checkpoint#1 + long firstCheckpointId = 1; + harness.snapshot(firstCheckpointId, ++timestamp); + assertFlinkManifests(1); + + RowData row2 = SimpleDataUtil.createRowData(2, "world"); + DataFile dataFile2 = writeDataFile("data-2", ImmutableList.of(row2)); + harness.processElement(of(dataFile2), ++timestamp); + assertMaxCommittedCheckpointId(jobId, operatorId, -1L); + + // 2. snapshotState for checkpoint#2 + long secondCheckpointId = 2; + harness.snapshot(secondCheckpointId, ++timestamp); + assertFlinkManifests(2); + + // 3. notifyCheckpointComplete for checkpoint#1 + harness.notifyOfCompletedCheckpoint(firstCheckpointId); + SimpleDataUtil.assertTableRows(table, ImmutableList.of(row1), branch); + assertMaxCommittedCheckpointId(jobId, operatorId, firstCheckpointId); + assertFlinkManifests(1); + + // 4. notifyCheckpointComplete for checkpoint#2 + harness.notifyOfCompletedCheckpoint(secondCheckpointId); + SimpleDataUtil.assertTableRows(table, ImmutableList.of(row1, row2), branch); + assertMaxCommittedCheckpointId(jobId, operatorId, secondCheckpointId); + assertFlinkManifests(0); + } + } + + @TestTemplate + public void testDisorderedEventsBetweenCheckpoints() throws Exception { + // It's possible that the two checkpoints happen in the following orders: + // 1. snapshotState for checkpoint#1; + // 2. snapshotState for checkpoint#2; + // 3. notifyCheckpointComplete for checkpoint#2; + // 4. 
notifyCheckpointComplete for checkpoint#1; + long timestamp = 0; + + JobID jobId = new JobID(); + OperatorID operatorId; + try (OneInputStreamOperatorTestHarness harness = createStreamSink(jobId)) { + harness.setup(); + harness.open(); + operatorId = harness.getOperator().getOperatorID(); + + assertMaxCommittedCheckpointId(jobId, operatorId, -1L); + + RowData row1 = SimpleDataUtil.createRowData(1, "hello"); + DataFile dataFile1 = writeDataFile("data-1", ImmutableList.of(row1)); + + harness.processElement(of(dataFile1), ++timestamp); + assertMaxCommittedCheckpointId(jobId, operatorId, -1L); + + // 1. snapshotState for checkpoint#1 + long firstCheckpointId = 1; + harness.snapshot(firstCheckpointId, ++timestamp); + assertFlinkManifests(1); + + RowData row2 = SimpleDataUtil.createRowData(2, "world"); + DataFile dataFile2 = writeDataFile("data-2", ImmutableList.of(row2)); + harness.processElement(of(dataFile2), ++timestamp); + assertMaxCommittedCheckpointId(jobId, operatorId, -1L); + + // 2. snapshotState for checkpoint#2 + long secondCheckpointId = 2; + harness.snapshot(secondCheckpointId, ++timestamp); + assertFlinkManifests(2); + + // 3. notifyCheckpointComplete for checkpoint#2 + harness.notifyOfCompletedCheckpoint(secondCheckpointId); + SimpleDataUtil.assertTableRows(table, ImmutableList.of(row1, row2), branch); + assertMaxCommittedCheckpointId(jobId, operatorId, secondCheckpointId); + assertFlinkManifests(0); + + // 4. notifyCheckpointComplete for checkpoint#1 + harness.notifyOfCompletedCheckpoint(firstCheckpointId); + SimpleDataUtil.assertTableRows(table, ImmutableList.of(row1, row2), branch); + assertMaxCommittedCheckpointId(jobId, operatorId, secondCheckpointId); + assertFlinkManifests(0); + } + } + + @TestTemplate + public void testRecoveryFromValidSnapshot() throws Exception { + long checkpointId = 0; + long timestamp = 0; + List expectedRows = Lists.newArrayList(); + OperatorSubtaskState snapshot; + + JobID jobId = new JobID(); + OperatorID operatorId; + try (OneInputStreamOperatorTestHarness harness = createStreamSink(jobId)) { + harness.setup(); + harness.open(); + operatorId = harness.getOperator().getOperatorID(); + + assertSnapshotSize(0); + assertMaxCommittedCheckpointId(jobId, operatorId, -1L); + + RowData row = SimpleDataUtil.createRowData(1, "hello"); + expectedRows.add(row); + DataFile dataFile1 = writeDataFile("data-1", ImmutableList.of(row)); + + harness.processElement(of(dataFile1), ++timestamp); + snapshot = harness.snapshot(++checkpointId, ++timestamp); + assertFlinkManifests(1); + + harness.notifyOfCompletedCheckpoint(checkpointId); + assertFlinkManifests(0); + + SimpleDataUtil.assertTableRows(table, ImmutableList.of(row), branch); + assertSnapshotSize(1); + assertMaxCommittedCheckpointId(jobId, operatorId, checkpointId); + } + + // Restore from the given snapshot + try (OneInputStreamOperatorTestHarness harness = createStreamSink(jobId)) { + harness.getStreamConfig().setOperatorID(operatorId); + harness.setup(); + harness.initializeState(snapshot); + harness.open(); + + SimpleDataUtil.assertTableRows(table, expectedRows, branch); + assertSnapshotSize(1); + assertMaxCommittedCheckpointId(jobId, operatorId, checkpointId); + + RowData row = SimpleDataUtil.createRowData(2, "world"); + expectedRows.add(row); + DataFile dataFile = writeDataFile("data-2", ImmutableList.of(row)); + harness.processElement(of(dataFile), ++timestamp); + + harness.snapshot(++checkpointId, ++timestamp); + assertFlinkManifests(1); + + harness.notifyOfCompletedCheckpoint(checkpointId); + 
assertFlinkManifests(0); + + SimpleDataUtil.assertTableRows(table, expectedRows, branch); + assertSnapshotSize(2); + assertMaxCommittedCheckpointId(jobId, operatorId, checkpointId); + } + } + + @TestTemplate + public void testRecoveryFromSnapshotWithoutCompletedNotification() throws Exception { + // We've two steps in checkpoint: 1. snapshotState(ckp); 2. notifyCheckpointComplete(ckp). It's + // possible that we + // flink job will restore from a checkpoint with only step#1 finished. + long checkpointId = 0; + long timestamp = 0; + OperatorSubtaskState snapshot; + List expectedRows = Lists.newArrayList(); + JobID jobId = new JobID(); + OperatorID operatorId; + try (OneInputStreamOperatorTestHarness harness = createStreamSink(jobId)) { + harness.setup(); + harness.open(); + operatorId = harness.getOperator().getOperatorID(); + + assertSnapshotSize(0); + assertMaxCommittedCheckpointId(jobId, operatorId, -1L); + + RowData row = SimpleDataUtil.createRowData(1, "hello"); + expectedRows.add(row); + DataFile dataFile = writeDataFile("data-1", ImmutableList.of(row)); + harness.processElement(of(dataFile), ++timestamp); + + snapshot = harness.snapshot(++checkpointId, ++timestamp); + SimpleDataUtil.assertTableRows(table, ImmutableList.of(), branch); + assertMaxCommittedCheckpointId(jobId, operatorId, -1L); + assertFlinkManifests(1); + } + + try (OneInputStreamOperatorTestHarness harness = createStreamSink(jobId)) { + harness.getStreamConfig().setOperatorID(operatorId); + harness.setup(); + harness.initializeState(snapshot); + harness.open(); + + // All flink manifests should be cleaned because it has committed the unfinished iceberg + // transaction. + assertFlinkManifests(0); + + SimpleDataUtil.assertTableRows(table, expectedRows, branch); + assertMaxCommittedCheckpointId(jobId, operatorId, checkpointId); + + harness.snapshot(++checkpointId, ++timestamp); + // Did not write any new record, so it won't generate new manifest. + assertFlinkManifests(0); + + harness.notifyOfCompletedCheckpoint(checkpointId); + assertFlinkManifests(0); + + SimpleDataUtil.assertTableRows(table, expectedRows, branch); + assertSnapshotSize(2); + assertMaxCommittedCheckpointId(jobId, operatorId, checkpointId); + + RowData row = SimpleDataUtil.createRowData(2, "world"); + expectedRows.add(row); + DataFile dataFile = writeDataFile("data-2", ImmutableList.of(row)); + harness.processElement(of(dataFile), ++timestamp); + + snapshot = harness.snapshot(++checkpointId, ++timestamp); + assertFlinkManifests(1); + } + + // Redeploying flink job from external checkpoint. + JobID newJobId = new JobID(); + try (OneInputStreamOperatorTestHarness harness = + createStreamSink(newJobId)) { + harness.setup(); + harness.initializeState(snapshot); + harness.open(); + operatorId = harness.getOperator().getOperatorID(); + + // All flink manifests should be cleaned because it has committed the unfinished iceberg + // transaction. 
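Put differently, restoring state that still holds staged manifests makes the committer finish the unacknowledged checkpoint during initialization. Roughly, and only as a sketch of the idea rather than the committer's actual code path (the calls are borrowed from elsewhere in these tests):

    // Re-read the completed files recorded in the staged DeltaManifests, commit them to the
    // table, then drop the staging manifest files.
    WriteResult pending =
        FlinkManifestUtil.readCompletedFiles(deltaManifests, table.io(), table.specs());
    AppendFiles append = table.newAppend();
    Arrays.stream(pending.dataFiles()).forEach(append::appendFile);
    append.commit();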
+ assertFlinkManifests(0); + + assertMaxCommittedCheckpointId(newJobId, operatorId, -1); + assertMaxCommittedCheckpointId(jobId, operatorId, checkpointId); + SimpleDataUtil.assertTableRows(table, expectedRows, branch); + assertSnapshotSize(3); + + RowData row = SimpleDataUtil.createRowData(3, "foo"); + expectedRows.add(row); + DataFile dataFile = writeDataFile("data-3", ImmutableList.of(row)); + harness.processElement(of(dataFile), ++timestamp); + + harness.snapshot(++checkpointId, ++timestamp); + assertFlinkManifests(1); + + harness.notifyOfCompletedCheckpoint(checkpointId); + assertFlinkManifests(0); + + SimpleDataUtil.assertTableRows(table, expectedRows, branch); + assertSnapshotSize(4); + assertMaxCommittedCheckpointId(newJobId, operatorId, checkpointId); + } + } + + @TestTemplate + public void testStartAnotherJobToWriteSameTable() throws Exception { + long checkpointId = 0; + long timestamp = 0; + List rows = Lists.newArrayList(); + List tableRows = Lists.newArrayList(); + + JobID oldJobId = new JobID(); + OperatorID oldOperatorId; + try (OneInputStreamOperatorTestHarness harness = + createStreamSink(oldJobId)) { + harness.setup(); + harness.open(); + oldOperatorId = harness.getOperator().getOperatorID(); + + assertSnapshotSize(0); + assertMaxCommittedCheckpointId(oldJobId, oldOperatorId, -1L); + + for (int i = 1; i <= 3; i++) { + rows.add(SimpleDataUtil.createRowData(i, "hello" + i)); + tableRows.addAll(rows); + + DataFile dataFile = writeDataFile(String.format("data-%d", i), rows); + harness.processElement(of(dataFile), ++timestamp); + harness.snapshot(++checkpointId, ++timestamp); + assertFlinkManifests(1); + + harness.notifyOfCompletedCheckpoint(checkpointId); + assertFlinkManifests(0); + + SimpleDataUtil.assertTableRows(table, tableRows, branch); + assertSnapshotSize(i); + assertMaxCommittedCheckpointId(oldJobId, oldOperatorId, checkpointId); + } + } + + // The new started job will start with checkpoint = 1 again. 
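Because checkpoint IDs restart from 1 for every new Flink job, the committer cannot rely on the checkpoint ID alone; the assertions below therefore track the last committed checkpoint per job and operator, via the same lookup that assertMaxCommittedCheckpointId uses further down:

    long lastCommitted =
        IcebergFilesCommitter.getMaxCommittedCheckpointId(
            table, newJobId.toString(), newOperatorId.toHexString(), branch);
    // -1 means this job/operator pair has not committed anything to the branch yet.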
+ checkpointId = 0; + timestamp = 0; + JobID newJobId = new JobID(); + OperatorID newOperatorId; + try (OneInputStreamOperatorTestHarness harness = + createStreamSink(newJobId)) { + harness.setup(); + harness.open(); + newOperatorId = harness.getOperator().getOperatorID(); + + assertSnapshotSize(3); + assertMaxCommittedCheckpointId(oldJobId, oldOperatorId, 3); + assertMaxCommittedCheckpointId(newJobId, newOperatorId, -1); + + rows.add(SimpleDataUtil.createRowData(2, "world")); + tableRows.addAll(rows); + + DataFile dataFile = writeDataFile("data-new-1", rows); + harness.processElement(of(dataFile), ++timestamp); + harness.snapshot(++checkpointId, ++timestamp); + assertFlinkManifests(1); + + harness.notifyOfCompletedCheckpoint(checkpointId); + assertFlinkManifests(0); + SimpleDataUtil.assertTableRows(table, tableRows, branch); + assertSnapshotSize(4); + assertMaxCommittedCheckpointId(newJobId, newOperatorId, checkpointId); + } + } + + @TestTemplate + public void testMultipleJobsWriteSameTable() throws Exception { + long timestamp = 0; + List tableRows = Lists.newArrayList(); + + JobID[] jobs = new JobID[] {new JobID(), new JobID(), new JobID()}; + OperatorID[] operatorIds = + new OperatorID[] {new OperatorID(), new OperatorID(), new OperatorID()}; + for (int i = 0; i < 20; i++) { + int jobIndex = i % 3; + int checkpointId = i / 3; + JobID jobId = jobs[jobIndex]; + OperatorID operatorId = operatorIds[jobIndex]; + try (OneInputStreamOperatorTestHarness harness = createStreamSink(jobId)) { + harness.getStreamConfig().setOperatorID(operatorId); + harness.setup(); + harness.open(); + + assertSnapshotSize(i); + assertMaxCommittedCheckpointId(jobId, operatorId, checkpointId == 0 ? -1 : checkpointId); + + List rows = Lists.newArrayList(SimpleDataUtil.createRowData(i, "word-" + i)); + tableRows.addAll(rows); + + DataFile dataFile = writeDataFile(String.format("data-%d", i), rows); + harness.processElement(of(dataFile), ++timestamp); + harness.snapshot(checkpointId + 1, ++timestamp); + assertFlinkManifests(1); + + harness.notifyOfCompletedCheckpoint(checkpointId + 1); + assertFlinkManifests(0); + SimpleDataUtil.assertTableRows(table, tableRows, branch); + assertSnapshotSize(i + 1); + assertMaxCommittedCheckpointId(jobId, operatorId, checkpointId + 1); + } + } + } + + @TestTemplate + public void testMultipleSinksRecoveryFromValidSnapshot() throws Exception { + long checkpointId = 0; + long timestamp = 0; + List expectedRows = Lists.newArrayList(); + OperatorSubtaskState snapshot1; + OperatorSubtaskState snapshot2; + + JobID jobId = new JobID(); + OperatorID operatorId1 = new OperatorID(); + OperatorID operatorId2 = new OperatorID(); + try (OneInputStreamOperatorTestHarness harness1 = createStreamSink(jobId); + OneInputStreamOperatorTestHarness harness2 = createStreamSink(jobId)) { + harness1.getStreamConfig().setOperatorID(operatorId1); + harness1.setup(); + harness1.open(); + harness2.getStreamConfig().setOperatorID(operatorId2); + harness2.setup(); + harness2.open(); + + assertSnapshotSize(0); + assertMaxCommittedCheckpointId(jobId, operatorId1, -1L); + assertMaxCommittedCheckpointId(jobId, operatorId2, -1L); + + RowData row1 = SimpleDataUtil.createRowData(1, "hello1"); + expectedRows.add(row1); + DataFile dataFile1 = writeDataFile("data-1-1", ImmutableList.of(row1)); + + harness1.processElement(of(dataFile1), ++timestamp); + snapshot1 = harness1.snapshot(++checkpointId, ++timestamp); + + RowData row2 = SimpleDataUtil.createRowData(1, "hello2"); + expectedRows.add(row2); + DataFile dataFile2 = 
writeDataFile("data-1-2", ImmutableList.of(row2)); + + harness2.processElement(of(dataFile2), ++timestamp); + snapshot2 = harness2.snapshot(checkpointId, ++timestamp); + assertFlinkManifests(2); + + // Only notify one of the committers + harness1.notifyOfCompletedCheckpoint(checkpointId); + assertFlinkManifests(1); + + // Only the first row is committed at this point + SimpleDataUtil.assertTableRows(table, ImmutableList.of(row1), branch); + assertSnapshotSize(1); + assertMaxCommittedCheckpointId(jobId, operatorId1, checkpointId); + assertMaxCommittedCheckpointId(jobId, operatorId2, -1); + } + + // Restore from the given snapshot + try (OneInputStreamOperatorTestHarness harness1 = createStreamSink(jobId); + OneInputStreamOperatorTestHarness harness2 = createStreamSink(jobId)) { + harness1.getStreamConfig().setOperatorID(operatorId1); + harness1.setup(); + harness1.initializeState(snapshot1); + harness1.open(); + + harness2.getStreamConfig().setOperatorID(operatorId2); + harness2.setup(); + harness2.initializeState(snapshot2); + harness2.open(); + + // All flink manifests should be cleaned because it has committed the unfinished iceberg + // transaction. + assertFlinkManifests(0); + + SimpleDataUtil.assertTableRows(table, expectedRows, branch); + assertSnapshotSize(2); + assertMaxCommittedCheckpointId(jobId, operatorId1, checkpointId); + assertMaxCommittedCheckpointId(jobId, operatorId2, checkpointId); + + RowData row1 = SimpleDataUtil.createRowData(2, "world1"); + expectedRows.add(row1); + DataFile dataFile1 = writeDataFile("data-2-1", ImmutableList.of(row1)); + + harness1.processElement(of(dataFile1), ++timestamp); + harness1.snapshot(++checkpointId, ++timestamp); + + RowData row2 = SimpleDataUtil.createRowData(2, "world2"); + expectedRows.add(row2); + DataFile dataFile2 = writeDataFile("data-2-2", ImmutableList.of(row2)); + harness2.processElement(of(dataFile2), ++timestamp); + harness2.snapshot(checkpointId, ++timestamp); + + assertFlinkManifests(2); + + harness1.notifyOfCompletedCheckpoint(checkpointId); + harness2.notifyOfCompletedCheckpoint(checkpointId); + assertFlinkManifests(0); + + SimpleDataUtil.assertTableRows(table, expectedRows, branch); + assertSnapshotSize(4); + assertMaxCommittedCheckpointId(jobId, operatorId1, checkpointId); + assertMaxCommittedCheckpointId(jobId, operatorId2, checkpointId); + } + } + + @TestTemplate + public void testBoundedStream() throws Exception { + JobID jobId = new JobID(); + OperatorID operatorId; + try (OneInputStreamOperatorTestHarness harness = createStreamSink(jobId)) { + harness.setup(); + harness.open(); + operatorId = harness.getOperator().getOperatorID(); + + assertFlinkManifests(0); + assertSnapshotSize(0); + assertMaxCommittedCheckpointId(jobId, operatorId, -1L); + + List tableRows = Lists.newArrayList(SimpleDataUtil.createRowData(1, "word-1")); + + DataFile dataFile = writeDataFile("data-1", tableRows); + harness.processElement(of(dataFile), 1); + ((BoundedOneInput) harness.getOneInputOperator()).endInput(); + + assertFlinkManifests(0); + SimpleDataUtil.assertTableRows(table, tableRows, branch); + assertSnapshotSize(1); + assertMaxCommittedCheckpointId(jobId, operatorId, Long.MAX_VALUE); + assertThat(SimpleDataUtil.latestSnapshot(table, branch).summary()) + .containsEntry("flink.test", TestIcebergFilesCommitter.class.getName()); + } + } + + @TestTemplate + public void testFlinkManifests() throws Exception { + long timestamp = 0; + final long checkpoint = 10; + + JobID jobId = new JobID(); + OperatorID operatorId; + try 
(OneInputStreamOperatorTestHarness harness = createStreamSink(jobId)) { + harness.setup(); + harness.open(); + operatorId = harness.getOperator().getOperatorID(); + + assertMaxCommittedCheckpointId(jobId, operatorId, -1L); + + RowData row1 = SimpleDataUtil.createRowData(1, "hello"); + DataFile dataFile1 = writeDataFile("data-1", ImmutableList.of(row1)); + + harness.processElement(of(dataFile1), ++timestamp); + assertMaxCommittedCheckpointId(jobId, operatorId, -1L); + + // 1. snapshotState for checkpoint#1 + harness.snapshot(checkpoint, ++timestamp); + List manifestPaths = assertFlinkManifests(1); + Path manifestPath = manifestPaths.get(0); + assertThat(manifestPath.getFileName()) + .asString() + .isEqualTo( + String.format("%s-%s-%05d-%d-%d-%05d.avro", jobId, operatorId, 0, 0, checkpoint, 1)); + + // 2. Read the data files from manifests and assert. + List dataFiles = + FlinkManifestUtil.readDataFiles( + createTestingManifestFile(manifestPath), table.io(), table.specs()); + assertThat(dataFiles).hasSize(1); + TestHelpers.assertEquals(dataFile1, dataFiles.get(0)); + + // 3. notifyCheckpointComplete for checkpoint#1 + harness.notifyOfCompletedCheckpoint(checkpoint); + SimpleDataUtil.assertTableRows(table, ImmutableList.of(row1), branch); + assertMaxCommittedCheckpointId(jobId, operatorId, checkpoint); + assertFlinkManifests(0); + } + } + + @TestTemplate + public void testDeleteFiles() throws Exception { + assumeThat(formatVersion) + .as("Only support equality-delete in format v2 or later.") + .isGreaterThan(1); + + long timestamp = 0; + long checkpoint = 10; + + JobID jobId = new JobID(); + OperatorID operatorId; + FileAppenderFactory appenderFactory = createDeletableAppenderFactory(); + + try (OneInputStreamOperatorTestHarness harness = createStreamSink(jobId)) { + harness.setup(); + harness.open(); + operatorId = harness.getOperator().getOperatorID(); + + assertMaxCommittedCheckpointId(jobId, operatorId, -1L); + + RowData row1 = SimpleDataUtil.createInsert(1, "aaa"); + DataFile dataFile1 = writeDataFile("data-file-1", ImmutableList.of(row1)); + harness.processElement(of(dataFile1), ++timestamp); + assertMaxCommittedCheckpointId(jobId, operatorId, -1L); + + // 1. snapshotState for checkpoint#1 + harness.snapshot(checkpoint, ++timestamp); + List manifestPaths = assertFlinkManifests(1); + Path manifestPath = manifestPaths.get(0); + assertThat(manifestPath.getFileName()) + .asString() + .isEqualTo( + String.format("%s-%s-%05d-%d-%d-%05d.avro", jobId, operatorId, 0, 0, checkpoint, 1)); + + // 2. Read the data files from manifests and assert. + List dataFiles = + FlinkManifestUtil.readDataFiles( + createTestingManifestFile(manifestPath), table.io(), table.specs()); + assertThat(dataFiles).hasSize(1); + TestHelpers.assertEquals(dataFile1, dataFiles.get(0)); + + // 3. notifyCheckpointComplete for checkpoint#1 + harness.notifyOfCompletedCheckpoint(checkpoint); + SimpleDataUtil.assertTableRows(table, ImmutableList.of(row1), branch); + assertMaxCommittedCheckpointId(jobId, operatorId, checkpoint); + assertFlinkManifests(0); + + // 4. process both data files and delete files. 
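A note on step 4: when a single WriteResult carries both data files and delete files, it is staged as a pair of manifests, one per content type, which is why step 5 below expects two staged files for one checkpoint. In terms of the DeltaManifests accessors exercised in TestFlinkManifest above (factory and checkpoint as in that test, dataFile2 and deleteFile1 as built just below):

    DeltaManifests deltaManifests =
        FlinkManifestUtil.writeCompletedFiles(
            WriteResult.builder().addDataFiles(dataFile2).addDeleteFiles(deleteFile1).build(),
            () -> factory.create(checkpoint),
            table.spec());
    // Data files and delete files land in separate manifests.
    assertThat(deltaManifests.dataManifest()).isNotNull();
    assertThat(deltaManifests.deleteManifest()).isNotNull();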
+ RowData row2 = SimpleDataUtil.createInsert(2, "bbb"); + DataFile dataFile2 = writeDataFile("data-file-2", ImmutableList.of(row2)); + + RowData delete1 = SimpleDataUtil.createDelete(1, "aaa"); + DeleteFile deleteFile1 = + writeEqDeleteFile(appenderFactory, "delete-file-1", ImmutableList.of(delete1)); + harness.processElement( + WriteResult.builder().addDataFiles(dataFile2).addDeleteFiles(deleteFile1).build(), + ++timestamp); + assertMaxCommittedCheckpointId(jobId, operatorId, checkpoint); + + // 5. snapshotState for checkpoint#2 + harness.snapshot(++checkpoint, ++timestamp); + assertFlinkManifests(2); + + // 6. notifyCheckpointComplete for checkpoint#2 + harness.notifyOfCompletedCheckpoint(checkpoint); + SimpleDataUtil.assertTableRows(table, ImmutableList.of(row2), branch); + assertMaxCommittedCheckpointId(jobId, operatorId, checkpoint); + assertFlinkManifests(0); + } + } + + @TestTemplate + public void testCommitTwoCheckpointsInSingleTxn() throws Exception { + assumeThat(formatVersion) + .as("Only support equality-delete in format v2 or later.") + .isGreaterThan(1); + + long timestamp = 0; + long checkpoint = 10; + + JobID jobId = new JobID(); + OperatorID operatorId; + FileAppenderFactory appenderFactory = createDeletableAppenderFactory(); + + try (OneInputStreamOperatorTestHarness harness = createStreamSink(jobId)) { + harness.setup(); + harness.open(); + operatorId = harness.getOperator().getOperatorID(); + + assertMaxCommittedCheckpointId(jobId, operatorId, -1L); + + RowData insert1 = SimpleDataUtil.createInsert(1, "aaa"); + RowData insert2 = SimpleDataUtil.createInsert(2, "bbb"); + RowData delete3 = SimpleDataUtil.createDelete(3, "ccc"); + DataFile dataFile1 = writeDataFile("data-file-1", ImmutableList.of(insert1, insert2)); + DeleteFile deleteFile1 = + writeEqDeleteFile(appenderFactory, "delete-file-1", ImmutableList.of(delete3)); + harness.processElement( + WriteResult.builder().addDataFiles(dataFile1).addDeleteFiles(deleteFile1).build(), + ++timestamp); + + // The 1th snapshotState. + harness.snapshot(checkpoint, ++timestamp); + + RowData insert4 = SimpleDataUtil.createInsert(4, "ddd"); + RowData delete2 = SimpleDataUtil.createDelete(2, "bbb"); + DataFile dataFile2 = writeDataFile("data-file-2", ImmutableList.of(insert4)); + DeleteFile deleteFile2 = + writeEqDeleteFile(appenderFactory, "delete-file-2", ImmutableList.of(delete2)); + harness.processElement( + WriteResult.builder().addDataFiles(dataFile2).addDeleteFiles(deleteFile2).build(), + ++timestamp); + + // The 2nd snapshotState. + harness.snapshot(++checkpoint, ++timestamp); + + // Notify the 2nd snapshot to complete. 
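Only the second checkpoint is acknowledged here, yet the assertions that follow expect both staged checkpoints to be committed by that single notification: one table snapshot per checkpoint, two in total, with the equality deletes applied in order (delete3 matches nothing that was ever written, delete2 removes the row added by dataFile1, so insert1 and insert4 remain). Annotated, the checks below amount to:

    harness.notifyOfCompletedCheckpoint(checkpoint);   // acknowledge only checkpoint#2
    assertThat(table.snapshots()).hasSize(2);          // both pending checkpoints were committed
    SimpleDataUtil.assertTableRows(table, ImmutableList.of(insert1, insert4), branch);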
+ harness.notifyOfCompletedCheckpoint(checkpoint); + SimpleDataUtil.assertTableRows(table, ImmutableList.of(insert1, insert4), branch); + assertMaxCommittedCheckpointId(jobId, operatorId, checkpoint); + assertFlinkManifests(0); + assertThat(table.snapshots()).hasSize(2); + } + } + + @TestTemplate + public void testSpecEvolution() throws Exception { + long timestamp = 0; + int checkpointId = 0; + List rows = Lists.newArrayList(); + JobID jobId = new JobID(); + + OperatorID operatorId; + OperatorSubtaskState snapshot; + DataFile dataFile; + int specId; + + try (OneInputStreamOperatorTestHarness harness = createStreamSink(jobId)) { + harness.setup(); + harness.open(); + operatorId = harness.getOperator().getOperatorID(); + + assertSnapshotSize(0); + + checkpointId++; + RowData rowData = SimpleDataUtil.createRowData(checkpointId, "hello" + checkpointId); + // table unpartitioned + dataFile = writeDataFile("data-" + checkpointId, ImmutableList.of(rowData)); + harness.processElement(of(dataFile), ++timestamp); + rows.add(rowData); + harness.snapshot(checkpointId, ++timestamp); + + specId = + getStagingManifestSpecId(harness.getOperator().getOperatorStateBackend(), checkpointId); + assertThat(specId).isEqualTo(table.spec().specId()); + + harness.notifyOfCompletedCheckpoint(checkpointId); + + // Change partition spec + table.refresh(); + PartitionSpec oldSpec = table.spec(); + table.updateSpec().addField("id").commit(); + + checkpointId++; + rowData = SimpleDataUtil.createRowData(checkpointId, "hello" + checkpointId); + // write data with old partition spec + dataFile = writeDataFile("data-" + checkpointId, ImmutableList.of(rowData), oldSpec, null); + harness.processElement(of(dataFile), ++timestamp); + rows.add(rowData); + snapshot = harness.snapshot(checkpointId, ++timestamp); + + specId = + getStagingManifestSpecId(harness.getOperator().getOperatorStateBackend(), checkpointId); + assertThat(specId).isEqualTo(oldSpec.specId()); + + harness.notifyOfCompletedCheckpoint(checkpointId); + + assertFlinkManifests(0); + + SimpleDataUtil.assertTableRows(table, ImmutableList.copyOf(rows), branch); + assertSnapshotSize(checkpointId); + assertMaxCommittedCheckpointId(jobId, operatorId, checkpointId); + } + + // Restore from the given snapshot + try (OneInputStreamOperatorTestHarness harness = createStreamSink(jobId)) { + harness.getStreamConfig().setOperatorID(operatorId); + harness.setup(); + harness.initializeState(snapshot); + harness.open(); + + SimpleDataUtil.assertTableRows(table, rows, branch); + assertSnapshotSize(checkpointId); + assertMaxCommittedCheckpointId(jobId, operatorId, checkpointId); + + checkpointId++; + RowData row = SimpleDataUtil.createRowData(checkpointId, "world" + checkpointId); + StructLike partition = new PartitionData(table.spec().partitionType()); + partition.set(0, checkpointId); + dataFile = + writeDataFile("data-" + checkpointId, ImmutableList.of(row), table.spec(), partition); + harness.processElement(of(dataFile), ++timestamp); + rows.add(row); + harness.snapshot(checkpointId, ++timestamp); + assertFlinkManifests(1); + + specId = + getStagingManifestSpecId(harness.getOperator().getOperatorStateBackend(), checkpointId); + assertThat(specId).isEqualTo(table.spec().specId()); + + harness.notifyOfCompletedCheckpoint(checkpointId); + assertFlinkManifests(0); + + SimpleDataUtil.assertTableRows(table, rows, branch); + assertSnapshotSize(checkpointId); + assertMaxCommittedCheckpointId(jobId, operatorId, checkpointId); + } + } + + private int 
getStagingManifestSpecId(OperatorStateStore operatorStateStore, long checkPointId) + throws Exception { + ListState> checkpointsState = + operatorStateStore.getListState(IcebergFilesCommitter.buildStateDescriptor()); + NavigableMap statedDataFiles = + Maps.newTreeMap(checkpointsState.get().iterator().next()); + DeltaManifests deltaManifests = + SimpleVersionedSerialization.readVersionAndDeSerialize( + DeltaManifestsSerializer.INSTANCE, statedDataFiles.get(checkPointId)); + return deltaManifests.dataManifest().partitionSpecId(); + } + + private DeleteFile writeEqDeleteFile( + FileAppenderFactory appenderFactory, String filename, List deletes) + throws IOException { + return SimpleDataUtil.writeEqDeleteFile(table, format, filename, appenderFactory, deletes); + } + + private DeleteFile writePosDeleteFile( + FileAppenderFactory appenderFactory, + String filename, + List> positions) + throws IOException { + return SimpleDataUtil.writePosDeleteFile(table, format, filename, appenderFactory, positions); + } + + private FileAppenderFactory createDeletableAppenderFactory() { + int[] equalityFieldIds = + new int[] { + table.schema().findField("id").fieldId(), table.schema().findField("data").fieldId() + }; + return new FlinkAppenderFactory( + table, + table.schema(), + FlinkSchemaUtil.convert(table.schema()), + table.properties(), + table.spec(), + equalityFieldIds, + table.schema(), + null); + } + + private ManifestFile createTestingManifestFile(Path manifestPath) { + return new GenericManifestFile( + manifestPath.toAbsolutePath().toString(), + manifestPath.toFile().length(), + 0, + ManifestContent.DATA, + 0, + 0, + 0L, + 0, + 0, + 0, + 0, + 0, + 0, + null, + null); + } + + private List assertFlinkManifests(int expectedCount) throws IOException { + List manifests = + Files.list(flinkManifestFolder.toPath()) + .filter(p -> !p.toString().endsWith(".crc")) + .collect(Collectors.toList()); + assertThat(manifests).hasSize(expectedCount); + return manifests; + } + + private DataFile writeDataFile(String filename, List rows) throws IOException { + return SimpleDataUtil.writeFile( + table, + table.schema(), + table.spec(), + CONF, + table.location(), + format.addExtension(filename), + rows); + } + + private DataFile writeDataFile( + String filename, List rows, PartitionSpec spec, StructLike partition) + throws IOException { + return SimpleDataUtil.writeFile( + table, + table.schema(), + spec, + CONF, + table.location(), + format.addExtension(filename), + rows, + partition); + } + + private void assertMaxCommittedCheckpointId(JobID jobID, OperatorID operatorID, long expectedId) { + table.refresh(); + long actualId = + IcebergFilesCommitter.getMaxCommittedCheckpointId( + table, jobID.toString(), operatorID.toHexString(), branch); + assertThat(actualId).isEqualTo(expectedId); + } + + private void assertSnapshotSize(int expectedSnapshotSize) { + table.refresh(); + assertThat(table.snapshots()).hasSize(expectedSnapshotSize); + } + + private OneInputStreamOperatorTestHarness createStreamSink(JobID jobID) + throws Exception { + TestOperatorFactory factory = TestOperatorFactory.of(table.location(), branch, table.spec()); + return new OneInputStreamOperatorTestHarness<>(factory, createEnvironment(jobID)); + } + + private static MockEnvironment createEnvironment(JobID jobID) { + return new MockEnvironmentBuilder() + .setTaskName("test task") + .setManagedMemorySize(32 * 1024) + .setInputSplitProvider(new MockInputSplitProvider()) + .setBufferSize(256) + .setTaskConfiguration(new 
org.apache.flink.configuration.Configuration()) + .setExecutionConfig(new ExecutionConfig()) + .setMaxParallelism(16) + .setJobID(jobID) + .build(); + } + + private static class TestOperatorFactory extends AbstractStreamOperatorFactory + implements OneInputStreamOperatorFactory { + private final String tablePath; + private final String branch; + private final PartitionSpec spec; + + private TestOperatorFactory(String tablePath, String branch, PartitionSpec spec) { + this.tablePath = tablePath; + this.branch = branch; + this.spec = spec; + } + + private static TestOperatorFactory of(String tablePath, String branch, PartitionSpec spec) { + return new TestOperatorFactory(tablePath, branch, spec); + } + + @Override + @SuppressWarnings("unchecked") + public > T createStreamOperator( + StreamOperatorParameters param) { + IcebergFilesCommitter committer = + new IcebergFilesCommitter( + new TestTableLoader(tablePath), + false, + Collections.singletonMap("flink.test", TestIcebergFilesCommitter.class.getName()), + ThreadPools.WORKER_THREAD_POOL_SIZE, + branch, + spec); + committer.setup(param.getContainingTask(), param.getStreamConfig(), param.getOutput()); + return (T) committer; + } + + @Override + public Class getStreamOperatorClass(ClassLoader classLoader) { + return IcebergFilesCommitter.class; + } + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergStreamWriter.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergStreamWriter.java new file mode 100644 index 000000000000..50283f7ad215 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergStreamWriter.java @@ -0,0 +1,390 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.util.Arrays; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Set; +import org.apache.flink.streaming.api.operators.BoundedOneInput; +import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness; +import org.apache.flink.table.api.DataTypes; +import org.apache.flink.table.api.TableSchema; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.LocatedFileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.RemoteIterator; +import org.apache.iceberg.AppendFiles; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Parameter; +import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.data.GenericRecord; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.FlinkWriteConf; +import org.apache.iceberg.flink.SimpleDataUtil; +import org.apache.iceberg.hadoop.HadoopTables; +import org.apache.iceberg.io.WriteResult; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.relocated.com.google.common.collect.Sets; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.extension.ExtendWith; +import org.junit.jupiter.api.io.TempDir; + +@ExtendWith(ParameterizedTestExtension.class) +public class TestIcebergStreamWriter { + @TempDir protected java.nio.file.Path temporaryFolder; + + private Table table; + + @Parameter(index = 0) + private FileFormat format; + + @Parameter(index = 1) + private boolean partitioned; + + @Parameters(name = "format = {0}, partitioned = {1}") + public static Object[][] parameters() { + return new Object[][] { + {FileFormat.AVRO, true}, + {FileFormat.AVRO, false}, + {FileFormat.ORC, true}, + {FileFormat.ORC, false}, + {FileFormat.PARQUET, true}, + {FileFormat.PARQUET, false} + }; + } + + @BeforeEach + public void before() throws IOException { + File folder = Files.createTempDirectory(temporaryFolder, "junit").toFile(); + // Construct the iceberg table. 
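+    // Note: the parameterized file format is pinned through TableProperties.DEFAULT_FILE_FORMAT,
+    // and SimpleDataUtil.createTable() builds either a partitioned or an unpartitioned table based
+    // on the {1} parameter, so each @TestTemplate below runs once per (format, partitioned) pair.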
+ Map props = ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()); + table = SimpleDataUtil.createTable(folder.getAbsolutePath(), props, partitioned); + } + + @TestTemplate + public void testWritingTable() throws Exception { + long checkpointId = 1L; + try (OneInputStreamOperatorTestHarness testHarness = + createIcebergStreamWriter()) { + // The first checkpoint + testHarness.processElement(SimpleDataUtil.createRowData(1, "hello"), 1); + testHarness.processElement(SimpleDataUtil.createRowData(2, "world"), 1); + testHarness.processElement(SimpleDataUtil.createRowData(3, "hello"), 1); + + testHarness.prepareSnapshotPreBarrier(checkpointId); + int expectedDataFiles = partitioned ? 2 : 1; + WriteResult result = WriteResult.builder().addAll(testHarness.extractOutputValues()).build(); + assertThat(result.deleteFiles()).isEmpty(); + assertThat(result.dataFiles()).hasSize(expectedDataFiles); + + checkpointId = checkpointId + 1; + + // The second checkpoint + testHarness.processElement(SimpleDataUtil.createRowData(4, "foo"), 1); + testHarness.processElement(SimpleDataUtil.createRowData(5, "bar"), 2); + + testHarness.prepareSnapshotPreBarrier(checkpointId); + expectedDataFiles = partitioned ? 4 : 2; + result = WriteResult.builder().addAll(testHarness.extractOutputValues()).build(); + assertThat(result.deleteFiles()).isEmpty(); + assertThat(result.dataFiles()).hasSize(expectedDataFiles); + + // Commit the iceberg transaction. + AppendFiles appendFiles = table.newAppend(); + Arrays.stream(result.dataFiles()).forEach(appendFiles::appendFile); + appendFiles.commit(); + + // Assert the table records. + SimpleDataUtil.assertTableRecords( + table, + Lists.newArrayList( + SimpleDataUtil.createRecord(1, "hello"), + SimpleDataUtil.createRecord(2, "world"), + SimpleDataUtil.createRecord(3, "hello"), + SimpleDataUtil.createRecord(4, "foo"), + SimpleDataUtil.createRecord(5, "bar"))); + } + } + + @TestTemplate + public void testSnapshotTwice() throws Exception { + long checkpointId = 1; + long timestamp = 1; + try (OneInputStreamOperatorTestHarness testHarness = + createIcebergStreamWriter()) { + testHarness.processElement(SimpleDataUtil.createRowData(1, "hello"), timestamp++); + testHarness.processElement(SimpleDataUtil.createRowData(2, "world"), timestamp); + + testHarness.prepareSnapshotPreBarrier(checkpointId++); + int expectedDataFiles = partitioned ? 2 : 1; + WriteResult result = WriteResult.builder().addAll(testHarness.extractOutputValues()).build(); + assertThat(result.deleteFiles()).isEmpty(); + assertThat(result.dataFiles()).hasSize(expectedDataFiles); + + // snapshot again immediately. + for (int i = 0; i < 5; i++) { + testHarness.prepareSnapshotPreBarrier(checkpointId++); + + result = WriteResult.builder().addAll(testHarness.extractOutputValues()).build(); + assertThat(result.deleteFiles()).isEmpty(); + assertThat(result.dataFiles()).hasSize(expectedDataFiles); + } + } + } + + @TestTemplate + public void testTableWithoutSnapshot() throws Exception { + try (OneInputStreamOperatorTestHarness testHarness = + createIcebergStreamWriter()) { + assertThat(testHarness.extractOutputValues()).isEmpty(); + } + // Even if we closed the iceberg stream writer, there's no orphan data file. + assertThat(scanDataFiles()).isEmpty(); + + try (OneInputStreamOperatorTestHarness testHarness = + createIcebergStreamWriter()) { + testHarness.processElement(SimpleDataUtil.createRowData(1, "hello"), 1); + // Still not emit the data file yet, because there is no checkpoint. 
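+      // Note: IcebergStreamWriter only emits a WriteResult when prepareSnapshotPreBarrier() or
+      // endInput() is called, so a record written without any checkpoint stays buffered in the
+      // underlying task writer. Closing the harness at this point therefore leaves the already
+      // written file on disk without a committed reference, which the orphan-file check below
+      // verifies.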
+ assertThat(testHarness.extractOutputValues()).isEmpty(); + } + // Once we closed the iceberg stream writer, there will left an orphan data file. + assertThat(scanDataFiles()).hasSize(1); + } + + private Set scanDataFiles() throws IOException { + Path dataDir = new Path(table.location(), "data"); + FileSystem fs = FileSystem.get(new Configuration()); + if (!fs.exists(dataDir)) { + return ImmutableSet.of(); + } else { + Set paths = Sets.newHashSet(); + RemoteIterator iterators = fs.listFiles(dataDir, true); + while (iterators.hasNext()) { + LocatedFileStatus status = iterators.next(); + if (status.isFile()) { + Path path = status.getPath(); + if (path.getName().endsWith("." + format.toString().toLowerCase(Locale.ROOT))) { + paths.add(path.toString()); + } + } + } + return paths; + } + } + + @TestTemplate + public void testBoundedStreamCloseWithEmittingDataFiles() throws Exception { + try (OneInputStreamOperatorTestHarness testHarness = + createIcebergStreamWriter()) { + testHarness.processElement(SimpleDataUtil.createRowData(1, "hello"), 1); + testHarness.processElement(SimpleDataUtil.createRowData(2, "world"), 2); + + assertThat(testHarness.getOneInputOperator()).isInstanceOf(BoundedOneInput.class); + ((BoundedOneInput) testHarness.getOneInputOperator()).endInput(); + + int expectedDataFiles = partitioned ? 2 : 1; + WriteResult result = WriteResult.builder().addAll(testHarness.extractOutputValues()).build(); + assertThat(result.deleteFiles()).isEmpty(); + assertThat(result.dataFiles()).hasSize(expectedDataFiles); + + ((BoundedOneInput) testHarness.getOneInputOperator()).endInput(); + + result = WriteResult.builder().addAll(testHarness.extractOutputValues()).build(); + assertThat(result.deleteFiles()).isEmpty(); + // Datafiles should not be sent again + assertThat(result.dataFiles()).hasSize(expectedDataFiles); + } + } + + @TestTemplate + public void testBoundedStreamTriggeredEndInputBeforeTriggeringCheckpoint() throws Exception { + try (OneInputStreamOperatorTestHarness testHarness = + createIcebergStreamWriter()) { + testHarness.processElement(SimpleDataUtil.createRowData(1, "hello"), 1); + testHarness.processElement(SimpleDataUtil.createRowData(2, "world"), 2); + + testHarness.endInput(); + + int expectedDataFiles = partitioned ? 2 : 1; + WriteResult result = WriteResult.builder().addAll(testHarness.extractOutputValues()).build(); + assertThat(result.deleteFiles()).isEmpty(); + assertThat(result.dataFiles()).hasSize(expectedDataFiles); + + testHarness.prepareSnapshotPreBarrier(1L); + + result = WriteResult.builder().addAll(testHarness.extractOutputValues()).build(); + assertThat(result.deleteFiles()).isEmpty(); + // It should be ensured that after endInput is triggered, when prepareSnapshotPreBarrier + // is triggered, write should only send WriteResult once + assertThat(result.dataFiles()).hasSize(expectedDataFiles); + } + } + + @TestTemplate + public void testTableWithTargetFileSize() throws Exception { + // Adjust the target-file-size in table properties. 
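+    // Note: 2,000 ids x 4 values = 8,000 rows are written below against a tiny 4-byte target
+    // size. Iceberg's rolling task writer only re-checks the current file size every 1,000 rows,
+    // so each size check rolls over to a new file, which is why the test expects 8 data files
+    // with exactly 1,000 records each.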
+ table + .updateProperties() + .set(TableProperties.WRITE_TARGET_FILE_SIZE_BYTES, "4") // ~4 bytes; low enough to trigger + .commit(); + + List rows = Lists.newArrayListWithCapacity(8000); + List records = Lists.newArrayListWithCapacity(8000); + for (int i = 0; i < 2000; i++) { + for (String data : new String[] {"a", "b", "c", "d"}) { + rows.add(SimpleDataUtil.createRowData(i, data)); + records.add(SimpleDataUtil.createRecord(i, data)); + } + } + + try (OneInputStreamOperatorTestHarness testHarness = + createIcebergStreamWriter()) { + for (RowData row : rows) { + testHarness.processElement(row, 1); + } + + // snapshot the operator. + testHarness.prepareSnapshotPreBarrier(1); + WriteResult result = WriteResult.builder().addAll(testHarness.extractOutputValues()).build(); + assertThat(result.deleteFiles()).isEmpty(); + assertThat(result.dataFiles()).hasSize(8); + + // Assert that the data file have the expected records. + for (DataFile dataFile : result.dataFiles()) { + assertThat(dataFile.recordCount()).isEqualTo(1000); + } + + // Commit the iceberg transaction. + AppendFiles appendFiles = table.newAppend(); + Arrays.stream(result.dataFiles()).forEach(appendFiles::appendFile); + appendFiles.commit(); + } + + // Assert the table records. + SimpleDataUtil.assertTableRecords(table, records); + } + + @TestTemplate + public void testPromotedFlinkDataType() throws Exception { + Schema iSchema = + new Schema( + Types.NestedField.required(1, "tinyint", Types.IntegerType.get()), + Types.NestedField.required(2, "smallint", Types.IntegerType.get()), + Types.NestedField.optional(3, "int", Types.IntegerType.get())); + TableSchema flinkSchema = + TableSchema.builder() + .field("tinyint", DataTypes.TINYINT().notNull()) + .field("smallint", DataTypes.SMALLINT().notNull()) + .field("int", DataTypes.INT().nullable()) + .build(); + + PartitionSpec spec; + if (partitioned) { + spec = + PartitionSpec.builderFor(iSchema) + .identity("smallint") + .identity("tinyint") + .identity("int") + .build(); + } else { + spec = PartitionSpec.unpartitioned(); + } + + String location = + Files.createTempDirectory(temporaryFolder, "junit").toFile().getAbsolutePath(); + Map props = ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()); + Table icebergTable = new HadoopTables().create(iSchema, spec, props, location); + + List rows = + Lists.newArrayList( + GenericRowData.of((byte) 0x01, (short) -32768, 101), + GenericRowData.of((byte) 0x02, (short) 0, 102), + GenericRowData.of((byte) 0x03, (short) 32767, 103)); + + Record record = GenericRecord.create(iSchema); + List expected = + Lists.newArrayList( + record.copy(ImmutableMap.of("tinyint", 1, "smallint", -32768, "int", 101)), + record.copy(ImmutableMap.of("tinyint", 2, "smallint", 0, "int", 102)), + record.copy(ImmutableMap.of("tinyint", 3, "smallint", 32767, "int", 103))); + + try (OneInputStreamOperatorTestHarness testHarness = + createIcebergStreamWriter(icebergTable, flinkSchema)) { + for (RowData row : rows) { + testHarness.processElement(row, 1); + } + testHarness.prepareSnapshotPreBarrier(1); + WriteResult result = WriteResult.builder().addAll(testHarness.extractOutputValues()).build(); + assertThat(result.deleteFiles()).isEmpty(); + assertThat(result.dataFiles()).hasSize(partitioned ? 3 : 1); + + // Commit the iceberg transaction. 
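+      // Note: committing makes the appended files visible so the assertion after this block can
+      // read the rows back as generic Records. The TINYINT/SMALLINT inputs are expected to come
+      // back as plain Iceberg ints, which is the promoted-type behaviour this test exercises.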
+ AppendFiles appendFiles = icebergTable.newAppend(); + Arrays.stream(result.dataFiles()).forEach(appendFiles::appendFile); + appendFiles.commit(); + } + + SimpleDataUtil.assertTableRecords(location, expected); + } + + private OneInputStreamOperatorTestHarness createIcebergStreamWriter() + throws Exception { + return createIcebergStreamWriter(table, SimpleDataUtil.FLINK_SCHEMA); + } + + private OneInputStreamOperatorTestHarness createIcebergStreamWriter( + Table icebergTable, TableSchema flinkSchema) throws Exception { + RowType flinkRowType = FlinkSink.toFlinkRowType(icebergTable.schema(), flinkSchema); + FlinkWriteConf flinkWriteConfig = + new FlinkWriteConf( + icebergTable, Maps.newHashMap(), new org.apache.flink.configuration.Configuration()); + + IcebergStreamWriter streamWriter = + FlinkSink.createStreamWriter(() -> icebergTable, flinkWriteConfig, flinkRowType, null); + OneInputStreamOperatorTestHarness harness = + new OneInputStreamOperatorTestHarness<>(streamWriter, 1, 1, 0); + + harness.setup(); + harness.open(); + + return harness; + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestRowDataPartitionKey.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestRowDataPartitionKey.java new file mode 100644 index 000000000000..919fef579ab0 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestRowDataPartitionKey.java @@ -0,0 +1,251 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.util.List; +import java.util.stream.Collectors; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.PartitionKey; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.data.InternalRecordWrapper; +import org.apache.iceberg.data.RandomGenericData; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.RowDataWrapper; +import org.apache.iceberg.flink.data.RandomRowData; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.Test; + +public class TestRowDataPartitionKey { + private static final Schema SCHEMA = + new Schema( + Types.NestedField.required(0, "boolType", Types.BooleanType.get()), + Types.NestedField.required(1, "id", Types.IntegerType.get()), + Types.NestedField.required(2, "longType", Types.LongType.get()), + Types.NestedField.required(3, "dateType", Types.DateType.get()), + Types.NestedField.required(4, "timeType", Types.TimeType.get()), + Types.NestedField.required(5, "stringType", Types.StringType.get()), + Types.NestedField.required(6, "timestampWithoutZone", Types.TimestampType.withoutZone()), + Types.NestedField.required(7, "timestampWithZone", Types.TimestampType.withZone()), + Types.NestedField.required(8, "fixedType", Types.FixedType.ofLength(5)), + Types.NestedField.required(9, "uuidType", Types.UUIDType.get()), + Types.NestedField.required(10, "binaryType", Types.BinaryType.get()), + Types.NestedField.required(11, "decimalType1", Types.DecimalType.of(18, 3)), + Types.NestedField.required(12, "decimalType2", Types.DecimalType.of(10, 5)), + Types.NestedField.required(13, "decimalType3", Types.DecimalType.of(38, 19)), + Types.NestedField.required(14, "floatType", Types.FloatType.get()), + Types.NestedField.required(15, "doubleType", Types.DoubleType.get())); + + private static final List SUPPORTED_PRIMITIVES = + SCHEMA.asStruct().fields().stream().map(Types.NestedField::name).collect(Collectors.toList()); + + private static final Schema NESTED_SCHEMA = + new Schema( + Types.NestedField.required( + 1, + "structType", + Types.StructType.of( + Types.NestedField.optional(2, "innerStringType", Types.StringType.get()), + Types.NestedField.optional(3, "innerIntegerType", Types.IntegerType.get())))); + + @Test + public void testNullPartitionValue() { + Schema schema = + new Schema( + Types.NestedField.optional(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get())); + + PartitionSpec spec = PartitionSpec.builderFor(schema).identity("data").build(); + + List rows = + Lists.newArrayList( + GenericRowData.of(1, StringData.fromString("a")), + GenericRowData.of(2, StringData.fromString("b")), + GenericRowData.of(3, null)); + + RowDataWrapper rowWrapper = + new RowDataWrapper(FlinkSchemaUtil.convert(schema), schema.asStruct()); + + for (RowData row : rows) { + PartitionKey partitionKey = new PartitionKey(spec, schema); + partitionKey.partition(rowWrapper.wrap(row)); + assertThat(partitionKey.size()).isEqualTo(1); + + String expectedStr = row.isNullAt(1) ? 
null : row.getString(1).toString(); + assertThat(partitionKey.get(0, String.class)).isEqualTo(expectedStr); + } + } + + @Test + public void testPartitionWithOneNestedField() { + RowDataWrapper rowWrapper = + new RowDataWrapper(FlinkSchemaUtil.convert(NESTED_SCHEMA), NESTED_SCHEMA.asStruct()); + List records = RandomGenericData.generate(NESTED_SCHEMA, 10, 1991); + List rows = Lists.newArrayList(RandomRowData.convert(NESTED_SCHEMA, records)); + + PartitionSpec spec1 = + PartitionSpec.builderFor(NESTED_SCHEMA).identity("structType.innerStringType").build(); + PartitionSpec spec2 = + PartitionSpec.builderFor(NESTED_SCHEMA).identity("structType.innerIntegerType").build(); + + for (int i = 0; i < rows.size(); i++) { + RowData row = rows.get(i); + Record record = (Record) records.get(i).get(0); + + PartitionKey partitionKey1 = new PartitionKey(spec1, NESTED_SCHEMA); + partitionKey1.partition(rowWrapper.wrap(row)); + assertThat(partitionKey1.size()).isEqualTo(1); + + assertThat(partitionKey1.get(0, String.class)).isEqualTo(record.get(0)); + + PartitionKey partitionKey2 = new PartitionKey(spec2, NESTED_SCHEMA); + partitionKey2.partition(rowWrapper.wrap(row)); + assertThat(partitionKey2.size()).isEqualTo(1); + + assertThat(partitionKey2.get(0, Integer.class)).isEqualTo(record.get(1)); + } + } + + @Test + public void testPartitionMultipleNestedField() { + RowDataWrapper rowWrapper = + new RowDataWrapper(FlinkSchemaUtil.convert(NESTED_SCHEMA), NESTED_SCHEMA.asStruct()); + List records = RandomGenericData.generate(NESTED_SCHEMA, 10, 1992); + List rows = Lists.newArrayList(RandomRowData.convert(NESTED_SCHEMA, records)); + + PartitionSpec spec1 = + PartitionSpec.builderFor(NESTED_SCHEMA) + .identity("structType.innerIntegerType") + .identity("structType.innerStringType") + .build(); + PartitionSpec spec2 = + PartitionSpec.builderFor(NESTED_SCHEMA) + .identity("structType.innerStringType") + .identity("structType.innerIntegerType") + .build(); + + PartitionKey pk1 = new PartitionKey(spec1, NESTED_SCHEMA); + PartitionKey pk2 = new PartitionKey(spec2, NESTED_SCHEMA); + + for (int i = 0; i < rows.size(); i++) { + RowData row = rows.get(i); + Record record = (Record) records.get(i).get(0); + + pk1.partition(rowWrapper.wrap(row)); + assertThat(pk1.size()).isEqualTo(2); + + assertThat(pk1.get(0, Integer.class)).isEqualTo(record.get(1)); + assertThat(pk1.get(1, String.class)).isEqualTo(record.get(0)); + + pk2.partition(rowWrapper.wrap(row)); + assertThat(pk2.size()).isEqualTo(2); + + assertThat(pk2.get(0, String.class)).isEqualTo(record.get(0)); + assertThat(pk2.get(1, Integer.class)).isEqualTo(record.get(1)); + } + } + + @Test + public void testPartitionValueTypes() { + RowType rowType = FlinkSchemaUtil.convert(SCHEMA); + RowDataWrapper rowWrapper = new RowDataWrapper(rowType, SCHEMA.asStruct()); + InternalRecordWrapper recordWrapper = new InternalRecordWrapper(SCHEMA.asStruct()); + + List records = RandomGenericData.generate(SCHEMA, 10, 1993); + List rows = Lists.newArrayList(RandomRowData.convert(SCHEMA, records)); + + for (String column : SUPPORTED_PRIMITIVES) { + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity(column).build(); + Class[] javaClasses = spec.javaClasses(); + + PartitionKey pk = new PartitionKey(spec, SCHEMA); + PartitionKey expectedPK = new PartitionKey(spec, SCHEMA); + + for (int j = 0; j < rows.size(); j++) { + RowData row = rows.get(j); + Record record = records.get(j); + + pk.partition(rowWrapper.wrap(row)); + expectedPK.partition(recordWrapper.wrap(record)); + + 
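+        // Note: the RowData-based key and the Record-based key are built from the same generated
+        // data, so they should agree field by field. timeType is compared at millisecond
+        // precision below, presumably because Flink's TIME type only carries milliseconds while
+        // Iceberg stores time values as microseconds.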
assertThat(pk.size()) + .as("Partition with column " + column + " should have one field.") + .isEqualTo(1); + + if (column.equals("timeType")) { + assertThat(pk.get(0, Long.class) / 1000) + .as("Partition with column " + column + " should have the expected values") + .isEqualTo(expectedPK.get(0, Long.class) / 1000); + } else { + assertThat(pk.get(0, javaClasses[0])) + .as("Partition with column " + column + " should have the expected values") + .isEqualTo(expectedPK.get(0, javaClasses[0])); + } + } + } + } + + @Test + public void testNestedPartitionValues() { + Schema nestedSchema = new Schema(Types.NestedField.optional(1001, "nested", SCHEMA.asStruct())); + RowType rowType = FlinkSchemaUtil.convert(nestedSchema); + + RowDataWrapper rowWrapper = new RowDataWrapper(rowType, nestedSchema.asStruct()); + InternalRecordWrapper recordWrapper = new InternalRecordWrapper(nestedSchema.asStruct()); + + List records = RandomGenericData.generate(nestedSchema, 10, 1994); + List rows = Lists.newArrayList(RandomRowData.convert(nestedSchema, records)); + + for (String supportedPrimitive : SUPPORTED_PRIMITIVES) { + String column = String.format("nested.%s", supportedPrimitive); + + PartitionSpec spec = PartitionSpec.builderFor(nestedSchema).identity(column).build(); + Class[] javaClasses = spec.javaClasses(); + + PartitionKey pk = new PartitionKey(spec, nestedSchema); + PartitionKey expectedPK = new PartitionKey(spec, nestedSchema); + + for (int j = 0; j < rows.size(); j++) { + pk.partition(rowWrapper.wrap(rows.get(j))); + expectedPK.partition(recordWrapper.wrap(records.get(j))); + + assertThat(pk.size()) + .as("Partition with nested column " + column + " should have one field.") + .isEqualTo(1); + + if (column.equals("nested.timeType")) { + assertThat(pk.get(0, Long.class) / 1000) + .as("Partition with nested column " + column + " should have the expected values.") + .isEqualTo(expectedPK.get(0, Long.class) / 1000); + } else { + assertThat(pk.get(0, javaClasses[0])) + .as("Partition with nested column " + column + " should have the expected values.") + .isEqualTo(expectedPK.get(0, javaClasses[0])); + } + } + } + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestTaskWriters.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestTaskWriters.java new file mode 100644 index 000000000000..8bfd6cb3d043 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/TestTaskWriters.java @@ -0,0 +1,242 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.util.List; +import java.util.Map; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.iceberg.AppendFiles; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Parameter; +import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.SerializableTable; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.SimpleDataUtil; +import org.apache.iceberg.flink.data.RandomRowData; +import org.apache.iceberg.io.TaskWriter; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.extension.ExtendWith; +import org.junit.jupiter.api.io.TempDir; + +@ExtendWith(ParameterizedTestExtension.class) +public class TestTaskWriters { + private static final Configuration CONF = new Configuration(); + private static final long TARGET_FILE_SIZE = 128 * 1024 * 1024; + + @TempDir protected java.nio.file.Path temporaryFolder; + + @Parameters(name = "format = {0}, partitioned = {1}") + public static Object[][] parameters() { + return new Object[][] { + {FileFormat.AVRO, true}, + {FileFormat.AVRO, false}, + {FileFormat.ORC, true}, + {FileFormat.ORC, false}, + {FileFormat.PARQUET, true}, + {FileFormat.PARQUET, false} + }; + } + + @Parameter(index = 0) + private FileFormat format; + + @Parameter(index = 1) + private boolean partitioned; + + private Table table; + + @BeforeEach + public void before() throws IOException { + File folder = Files.createTempDirectory(temporaryFolder, "junit").toFile(); + // Construct the iceberg table with the specified file format. + Map props = ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()); + table = SimpleDataUtil.createTable(folder.getAbsolutePath(), props, partitioned); + } + + @TestTemplate + public void testWriteZeroRecord() throws IOException { + try (TaskWriter taskWriter = createTaskWriter(TARGET_FILE_SIZE)) { + taskWriter.close(); + + DataFile[] dataFiles = taskWriter.dataFiles(); + assertThat(dataFiles).isNotNull().isEmpty(); + + // Close again. + taskWriter.close(); + dataFiles = taskWriter.dataFiles(); + assertThat(dataFiles).isNotNull().isEmpty(); + } + } + + @TestTemplate + public void testCloseTwice() throws IOException { + try (TaskWriter taskWriter = createTaskWriter(TARGET_FILE_SIZE)) { + taskWriter.write(SimpleDataUtil.createRowData(1, "hello")); + taskWriter.write(SimpleDataUtil.createRowData(2, "world")); + taskWriter.close(); // The first close + taskWriter.close(); // The second close + + int expectedFiles = partitioned ? 
2 : 1; + DataFile[] dataFiles = taskWriter.dataFiles(); + assertThat(dataFiles).hasSize(expectedFiles); + + FileSystem fs = FileSystem.get(CONF); + for (DataFile dataFile : dataFiles) { + assertThat(fs.exists(new Path(dataFile.path().toString()))).isTrue(); + } + } + } + + @TestTemplate + public void testAbort() throws IOException { + try (TaskWriter taskWriter = createTaskWriter(TARGET_FILE_SIZE)) { + taskWriter.write(SimpleDataUtil.createRowData(1, "hello")); + taskWriter.write(SimpleDataUtil.createRowData(2, "world")); + + taskWriter.abort(); + DataFile[] dataFiles = taskWriter.dataFiles(); + + int expectedFiles = partitioned ? 2 : 1; + assertThat(dataFiles).hasSize(expectedFiles); + + FileSystem fs = FileSystem.get(CONF); + for (DataFile dataFile : dataFiles) { + assertThat(fs.exists(new Path(dataFile.path().toString()))).isFalse(); + } + } + } + + @TestTemplate + public void testCompleteFiles() throws IOException { + try (TaskWriter taskWriter = createTaskWriter(TARGET_FILE_SIZE)) { + taskWriter.write(SimpleDataUtil.createRowData(1, "a")); + taskWriter.write(SimpleDataUtil.createRowData(2, "b")); + taskWriter.write(SimpleDataUtil.createRowData(3, "c")); + taskWriter.write(SimpleDataUtil.createRowData(4, "d")); + + DataFile[] dataFiles = taskWriter.dataFiles(); + int expectedFiles = partitioned ? 4 : 1; + assertThat(dataFiles).hasSize(expectedFiles); + + dataFiles = taskWriter.dataFiles(); + assertThat(dataFiles).hasSize(expectedFiles); + + FileSystem fs = FileSystem.get(CONF); + for (DataFile dataFile : dataFiles) { + assertThat(fs.exists(new Path(dataFile.path().toString()))).isTrue(); + } + + AppendFiles appendFiles = table.newAppend(); + for (DataFile dataFile : dataFiles) { + appendFiles.appendFile(dataFile); + } + appendFiles.commit(); + + // Assert the data rows. + SimpleDataUtil.assertTableRecords( + table, + Lists.newArrayList( + SimpleDataUtil.createRecord(1, "a"), + SimpleDataUtil.createRecord(2, "b"), + SimpleDataUtil.createRecord(3, "c"), + SimpleDataUtil.createRecord(4, "d"))); + } + } + + @TestTemplate + public void testRollingWithTargetFileSize() throws IOException { + try (TaskWriter taskWriter = createTaskWriter(4)) { + List rows = Lists.newArrayListWithCapacity(8000); + List records = Lists.newArrayListWithCapacity(8000); + for (int i = 0; i < 2000; i++) { + for (String data : new String[] {"a", "b", "c", "d"}) { + rows.add(SimpleDataUtil.createRowData(i, data)); + records.add(SimpleDataUtil.createRecord(i, data)); + } + } + + for (RowData row : rows) { + taskWriter.write(row); + } + + DataFile[] dataFiles = taskWriter.dataFiles(); + assertThat(dataFiles).hasSize(8); + + AppendFiles appendFiles = table.newAppend(); + for (DataFile dataFile : dataFiles) { + appendFiles.appendFile(dataFile); + } + appendFiles.commit(); + + // Assert the data rows. + SimpleDataUtil.assertTableRecords(table, records); + } + } + + @TestTemplate + public void testRandomData() throws IOException { + try (TaskWriter taskWriter = createTaskWriter(TARGET_FILE_SIZE)) { + Iterable rows = RandomRowData.generate(SimpleDataUtil.SCHEMA, 100, 1996); + for (RowData row : rows) { + taskWriter.write(row); + } + + taskWriter.close(); + DataFile[] dataFiles = taskWriter.dataFiles(); + AppendFiles appendFiles = table.newAppend(); + for (DataFile dataFile : dataFiles) { + appendFiles.appendFile(dataFile); + } + appendFiles.commit(); + + // Assert the data rows. 
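+      // Note: unlike the other tests, the generated rows are compared directly with
+      // assertTableRows() rather than via expected Records, since RandomRowData already produces
+      // the Flink-side RowData representation.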
+ SimpleDataUtil.assertTableRows(table, Lists.newArrayList(rows)); + } + } + + private TaskWriter createTaskWriter(long targetFileSize) { + TaskWriterFactory taskWriterFactory = + new RowDataTaskWriterFactory( + SerializableTable.copyOf(table), + (RowType) SimpleDataUtil.FLINK_SCHEMA.toRowDataType().getLogicalType(), + targetFileSize, + format, + table.properties(), + null, + false); + taskWriterFactory.initialize(1, 1); + return taskWriterFactory.create(); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/Fixtures.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/Fixtures.java new file mode 100644 index 000000000000..5910bd685510 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/Fixtures.java @@ -0,0 +1,100 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.shuffle; + +import java.util.Comparator; +import java.util.Map; +import org.apache.flink.api.common.typeutils.TypeSerializer; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.runtime.typeutils.RowDataSerializer; +import org.apache.flink.table.types.logical.IntType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.table.types.logical.VarCharType; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SortKey; +import org.apache.iceberg.SortOrder; +import org.apache.iceberg.SortOrderComparators; +import org.apache.iceberg.StructLike; +import org.apache.iceberg.flink.RowDataWrapper; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.types.Types; + +class Fixtures { + private Fixtures() {} + + public static final int NUM_SUBTASKS = 2; + public static final Schema SCHEMA = + new Schema( + Types.NestedField.optional(1, "id", Types.StringType.get()), + Types.NestedField.optional(2, "number", Types.IntegerType.get())); + public static final RowType ROW_TYPE = RowType.of(new VarCharType(), new IntType()); + public static final TypeSerializer ROW_SERIALIZER = new RowDataSerializer(ROW_TYPE); + public static final RowDataWrapper ROW_WRAPPER = new RowDataWrapper(ROW_TYPE, SCHEMA.asStruct()); + public static final SortOrder SORT_ORDER = SortOrder.builderFor(SCHEMA).asc("id").build(); + public static final Comparator SORT_ORDER_COMPARTOR = + SortOrderComparators.forSchema(SCHEMA, SORT_ORDER); + public static final SortKeySerializer SORT_KEY_SERIALIZER = + new SortKeySerializer(SCHEMA, SORT_ORDER); + public static final DataStatisticsSerializer TASK_STATISTICS_SERIALIZER = + new DataStatisticsSerializer(SORT_KEY_SERIALIZER); + public static final GlobalStatisticsSerializer GLOBAL_STATISTICS_SERIALIZER = + new 
GlobalStatisticsSerializer(SORT_KEY_SERIALIZER); + public static final CompletedStatisticsSerializer COMPLETED_STATISTICS_SERIALIZER = + new CompletedStatisticsSerializer(SORT_KEY_SERIALIZER); + + public static final SortKey SORT_KEY = new SortKey(SCHEMA, SORT_ORDER); + public static final Map CHAR_KEYS = createCharKeys(); + + public static StatisticsEvent createStatisticsEvent( + StatisticsType type, + TypeSerializer statisticsSerializer, + long checkpointId, + SortKey... keys) { + DataStatistics statistics = createTaskStatistics(type, keys); + return StatisticsEvent.createTaskStatisticsEvent( + checkpointId, statistics, statisticsSerializer); + } + + public static DataStatistics createTaskStatistics(StatisticsType type, SortKey... keys) { + DataStatistics statistics; + if (type == StatisticsType.Sketch) { + statistics = new SketchDataStatistics(128); + } else { + statistics = new MapDataStatistics(); + } + + for (SortKey key : keys) { + statistics.add(key); + } + + return statistics; + } + + private static Map createCharKeys() { + Map keys = Maps.newHashMap(); + for (char c = 'a'; c <= 'z'; ++c) { + String key = Character.toString(c); + SortKey sortKey = SORT_KEY.copy(); + sortKey.set(0, key); + keys.put(key, sortKey); + } + + return keys; + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestAggregatedStatisticsTracker.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestAggregatedStatisticsTracker.java new file mode 100644 index 000000000000..8322ce683768 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestAggregatedStatisticsTracker.java @@ -0,0 +1,465 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink.shuffle; + +import static org.apache.iceberg.flink.sink.shuffle.Fixtures.CHAR_KEYS; +import static org.apache.iceberg.flink.sink.shuffle.Fixtures.TASK_STATISTICS_SERIALIZER; +import static org.apache.iceberg.flink.sink.shuffle.Fixtures.createStatisticsEvent; +import static org.assertj.core.api.Assertions.assertThat; + +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.EnumSource; + +public class TestAggregatedStatisticsTracker { + @ParameterizedTest + @EnumSource(StatisticsType.class) + public void receiveNewerStatisticsEvent(StatisticsType type) { + AggregatedStatisticsTracker tracker = createTracker(type); + + StatisticsEvent checkpoint1Subtask0StatisticsEvent = + createStatisticsEvent(type, TASK_STATISTICS_SERIALIZER, 1L, CHAR_KEYS.get("a")); + CompletedStatistics completedStatistics = + tracker.updateAndCheckCompletion(0, checkpoint1Subtask0StatisticsEvent); + assertThat(completedStatistics).isNull(); + assertThat(tracker.aggregationsPerCheckpoint().keySet()).containsExactlyInAnyOrder(1L); + AggregatedStatisticsTracker.Aggregation aggregation = + tracker.aggregationsPerCheckpoint().get(1L); + assertThat(aggregation.currentType()).isEqualTo(StatisticsUtil.collectType(type)); + if (StatisticsUtil.collectType(type) == StatisticsType.Map) { + assertThat(aggregation.mapStatistics()).isEqualTo(ImmutableMap.of(CHAR_KEYS.get("a"), 1L)); + } else { + assertThat(aggregation.sketchStatistics().getResult().getSamples()) + .containsExactlyInAnyOrder(CHAR_KEYS.get("a")); + } + + StatisticsEvent checkpoint2Subtask0StatisticsEvent = + createStatisticsEvent( + type, + TASK_STATISTICS_SERIALIZER, + 2L, + CHAR_KEYS.get("a"), + CHAR_KEYS.get("b"), + CHAR_KEYS.get("b")); + completedStatistics = tracker.updateAndCheckCompletion(0, checkpoint2Subtask0StatisticsEvent); + assertThat(completedStatistics).isNull(); + // both checkpoints are tracked + assertThat(tracker.aggregationsPerCheckpoint().keySet()).containsExactlyInAnyOrder(1L, 2L); + aggregation = tracker.aggregationsPerCheckpoint().get(2L); + assertThat(aggregation.currentType()).isEqualTo(StatisticsUtil.collectType(type)); + if (StatisticsUtil.collectType(type) == StatisticsType.Map) { + assertThat(aggregation.mapStatistics()) + .isEqualTo(ImmutableMap.of(CHAR_KEYS.get("a"), 1L, CHAR_KEYS.get("b"), 2L)); + } else { + assertThat(aggregation.sketchStatistics().getResult().getSamples()) + .containsExactlyInAnyOrder(CHAR_KEYS.get("a"), CHAR_KEYS.get("b"), CHAR_KEYS.get("b")); + } + + StatisticsEvent checkpoint1Subtask1StatisticsEvent = + createStatisticsEvent(type, TASK_STATISTICS_SERIALIZER, 1L, CHAR_KEYS.get("b")); + completedStatistics = tracker.updateAndCheckCompletion(1, checkpoint1Subtask1StatisticsEvent); + // checkpoint 1 is completed + assertThat(completedStatistics).isNotNull(); + assertThat(completedStatistics.type()).isEqualTo(StatisticsUtil.collectType(type)); + assertThat(completedStatistics.checkpointId()).isEqualTo(1L); + if (StatisticsUtil.collectType(type) == StatisticsType.Map) { + assertThat(completedStatistics.keyFrequency()) + .isEqualTo( + ImmutableMap.of( + CHAR_KEYS.get("a"), 1L, + CHAR_KEYS.get("b"), 1L)); + } else { + assertThat(completedStatistics.keySamples()) + .containsExactly(CHAR_KEYS.get("a"), CHAR_KEYS.get("b")); + } + + // checkpoint 2 remains + 
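+    // Note: completing checkpoint 1 only removes that checkpoint's in-flight aggregation;
+    // checkpoint 2 keeps accumulating per-subtask statistics until its last subtask reports.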
assertThat(tracker.aggregationsPerCheckpoint().keySet()).containsExactlyInAnyOrder(2L); + aggregation = tracker.aggregationsPerCheckpoint().get(2L); + assertThat(aggregation.currentType()).isEqualTo(StatisticsUtil.collectType(type)); + if (StatisticsUtil.collectType(type) == StatisticsType.Map) { + assertThat(aggregation.mapStatistics()) + .isEqualTo(ImmutableMap.of(CHAR_KEYS.get("a"), 1L, CHAR_KEYS.get("b"), 2L)); + } else { + assertThat(aggregation.sketchStatistics().getResult().getSamples()) + .containsExactlyInAnyOrder(CHAR_KEYS.get("a"), CHAR_KEYS.get("b"), CHAR_KEYS.get("b")); + } + } + + @ParameterizedTest + @EnumSource(StatisticsType.class) + public void receiveOlderStatisticsEventTest(StatisticsType type) { + AggregatedStatisticsTracker tracker = createTracker(type); + + StatisticsEvent checkpoint2Subtask0StatisticsEvent = + createStatisticsEvent( + type, + TASK_STATISTICS_SERIALIZER, + 2L, + CHAR_KEYS.get("a"), + CHAR_KEYS.get("b"), + CHAR_KEYS.get("b")); + CompletedStatistics completedStatistics = + tracker.updateAndCheckCompletion(0, checkpoint2Subtask0StatisticsEvent); + assertThat(completedStatistics).isNull(); + assertThat(tracker.aggregationsPerCheckpoint().keySet()).containsExactlyInAnyOrder(2L); + AggregatedStatisticsTracker.Aggregation aggregation = + tracker.aggregationsPerCheckpoint().get(2L); + assertThat(aggregation.currentType()).isEqualTo(StatisticsUtil.collectType(type)); + if (StatisticsUtil.collectType(type) == StatisticsType.Map) { + assertThat(aggregation.mapStatistics()) + .isEqualTo(ImmutableMap.of(CHAR_KEYS.get("a"), 1L, CHAR_KEYS.get("b"), 2L)); + } else { + assertThat(aggregation.sketchStatistics().getResult().getSamples()) + .containsExactlyInAnyOrder(CHAR_KEYS.get("a"), CHAR_KEYS.get("b"), CHAR_KEYS.get("b")); + } + + StatisticsEvent checkpoint1Subtask1StatisticsEvent = + createStatisticsEvent(type, TASK_STATISTICS_SERIALIZER, 1L, CHAR_KEYS.get("b")); + completedStatistics = tracker.updateAndCheckCompletion(1, checkpoint1Subtask1StatisticsEvent); + assertThat(completedStatistics).isNull(); + // both checkpoints are tracked + assertThat(tracker.aggregationsPerCheckpoint().keySet()).containsExactlyInAnyOrder(1L, 2L); + aggregation = tracker.aggregationsPerCheckpoint().get(1L); + assertThat(aggregation.currentType()).isEqualTo(StatisticsUtil.collectType(type)); + if (StatisticsUtil.collectType(type) == StatisticsType.Map) { + assertThat(aggregation.mapStatistics()).isEqualTo(ImmutableMap.of(CHAR_KEYS.get("b"), 1L)); + } else { + assertThat(aggregation.sketchStatistics().getResult().getSamples()) + .containsExactlyInAnyOrder(CHAR_KEYS.get("b")); + } + + StatisticsEvent checkpoint3Subtask0StatisticsEvent = + createStatisticsEvent(type, TASK_STATISTICS_SERIALIZER, 3L, CHAR_KEYS.get("x")); + completedStatistics = tracker.updateAndCheckCompletion(1, checkpoint3Subtask0StatisticsEvent); + assertThat(completedStatistics).isNull(); + assertThat(tracker.aggregationsPerCheckpoint().keySet()).containsExactlyInAnyOrder(1L, 2L, 3L); + aggregation = tracker.aggregationsPerCheckpoint().get(3L); + assertThat(aggregation.currentType()).isEqualTo(StatisticsUtil.collectType(type)); + if (StatisticsUtil.collectType(type) == StatisticsType.Map) { + assertThat(aggregation.mapStatistics()).isEqualTo(ImmutableMap.of(CHAR_KEYS.get("x"), 1L)); + } else { + assertThat(aggregation.sketchStatistics().getResult().getSamples()) + .containsExactlyInAnyOrder(CHAR_KEYS.get("x")); + } + + StatisticsEvent checkpoint2Subtask1StatisticsEvent = + createStatisticsEvent( + type, + 
TASK_STATISTICS_SERIALIZER, + 2L, + CHAR_KEYS.get("a"), + CHAR_KEYS.get("b"), + CHAR_KEYS.get("b")); + completedStatistics = tracker.updateAndCheckCompletion(1, checkpoint2Subtask1StatisticsEvent); + // checkpoint 1 is cleared along with checkpoint 2. checkpoint 3 remains + assertThat(tracker.aggregationsPerCheckpoint().keySet()).containsExactlyInAnyOrder(3L); + aggregation = tracker.aggregationsPerCheckpoint().get(3L); + assertThat(aggregation.currentType()).isEqualTo(StatisticsUtil.collectType(type)); + if (StatisticsUtil.collectType(type) == StatisticsType.Map) { + assertThat(aggregation.mapStatistics()).isEqualTo(ImmutableMap.of(CHAR_KEYS.get("x"), 1L)); + } else { + assertThat(aggregation.sketchStatistics().getResult().getSamples()) + .containsExactlyInAnyOrder(CHAR_KEYS.get("x")); + } + + assertThat(completedStatistics).isNotNull(); + assertThat(completedStatistics.type()).isEqualTo(StatisticsUtil.collectType(type)); + assertThat(completedStatistics.checkpointId()).isEqualTo(2L); + if (StatisticsUtil.collectType(type) == StatisticsType.Map) { + assertThat(completedStatistics.keyFrequency()) + .isEqualTo( + ImmutableMap.of( + CHAR_KEYS.get("a"), 2L, + CHAR_KEYS.get("b"), 4L)); + } else { + assertThat(completedStatistics.keySamples()) + .containsExactly( + CHAR_KEYS.get("a"), + CHAR_KEYS.get("b"), + CHAR_KEYS.get("b"), + CHAR_KEYS.get("a"), + CHAR_KEYS.get("b"), + CHAR_KEYS.get("b")); + } + } + + @ParameterizedTest + @EnumSource(StatisticsType.class) + public void receiveCompletedStatisticsEvent(StatisticsType type) { + AggregatedStatisticsTracker tracker = createTracker(type); + + StatisticsEvent checkpoint1Subtask0DataStatisticEvent = + createStatisticsEvent( + type, + TASK_STATISTICS_SERIALIZER, + 1L, + CHAR_KEYS.get("a"), + CHAR_KEYS.get("b"), + CHAR_KEYS.get("b")); + + CompletedStatistics completedStatistics = + tracker.updateAndCheckCompletion(0, checkpoint1Subtask0DataStatisticEvent); + assertThat(completedStatistics).isNull(); + assertThat(tracker.aggregationsPerCheckpoint().keySet()).containsExactlyInAnyOrder(1L); + AggregatedStatisticsTracker.Aggregation aggregation = + tracker.aggregationsPerCheckpoint().get(1L); + assertThat(aggregation.subtaskSet()).containsExactlyInAnyOrder(0); + if (StatisticsUtil.collectType(type) == StatisticsType.Map) { + assertThat(aggregation.mapStatistics()) + .isEqualTo(ImmutableMap.of(CHAR_KEYS.get("a"), 1L, CHAR_KEYS.get("b"), 2L)); + } else { + assertThat(aggregation.sketchStatistics().getResult().getSamples()) + .containsExactlyInAnyOrder(CHAR_KEYS.get("a"), CHAR_KEYS.get("b"), CHAR_KEYS.get("b")); + } + + StatisticsEvent checkpoint1Subtask1DataStatisticEvent = + createStatisticsEvent( + type, + TASK_STATISTICS_SERIALIZER, + 1L, + CHAR_KEYS.get("a"), + CHAR_KEYS.get("a"), + CHAR_KEYS.get("b")); + + // Receive data statistics from all subtasks at checkpoint 1 + completedStatistics = + tracker.updateAndCheckCompletion(1, checkpoint1Subtask1DataStatisticEvent); + assertThat(tracker.aggregationsPerCheckpoint()).isEmpty(); + + assertThat(completedStatistics).isNotNull(); + assertThat(completedStatistics.type()).isEqualTo(StatisticsUtil.collectType(type)); + assertThat(completedStatistics.checkpointId()).isEqualTo(1L); + if (StatisticsUtil.collectType(type) == StatisticsType.Map) { + assertThat(completedStatistics.keyFrequency()) + .isEqualTo( + ImmutableMap.of( + CHAR_KEYS.get("a"), 3L, + CHAR_KEYS.get("b"), 3L)); + } else { + assertThat(completedStatistics.keySamples()) + .containsExactly( + CHAR_KEYS.get("a"), + CHAR_KEYS.get("b"), + 
CHAR_KEYS.get("b"), + CHAR_KEYS.get("a"), + CHAR_KEYS.get("a"), + CHAR_KEYS.get("b")); + } + + StatisticsEvent checkpoint2Subtask0DataStatisticEvent = + createStatisticsEvent(type, TASK_STATISTICS_SERIALIZER, 2L, CHAR_KEYS.get("a")); + completedStatistics = + tracker.updateAndCheckCompletion(0, checkpoint2Subtask0DataStatisticEvent); + assertThat(completedStatistics).isNull(); + assertThat(tracker.aggregationsPerCheckpoint().keySet()).containsExactlyInAnyOrder(2L); + aggregation = tracker.aggregationsPerCheckpoint().get(2L); + assertThat(aggregation.subtaskSet()).containsExactlyInAnyOrder(0); + if (StatisticsUtil.collectType(type) == StatisticsType.Map) { + assertThat(aggregation.mapStatistics()).isEqualTo(ImmutableMap.of(CHAR_KEYS.get("a"), 1L)); + } else { + assertThat(aggregation.sketchStatistics().getResult().getSamples()) + .containsExactlyInAnyOrder(CHAR_KEYS.get("a")); + } + + StatisticsEvent checkpoint2Subtask1DataStatisticEvent = + createStatisticsEvent(type, TASK_STATISTICS_SERIALIZER, 2L, CHAR_KEYS.get("b")); + // Receive data statistics from all subtasks at checkpoint 2 + completedStatistics = + tracker.updateAndCheckCompletion(1, checkpoint2Subtask1DataStatisticEvent); + assertThat(tracker.aggregationsPerCheckpoint()).isEmpty(); + + assertThat(completedStatistics).isNotNull(); + assertThat(completedStatistics.checkpointId()).isEqualTo(2L); + if (StatisticsUtil.collectType(type) == StatisticsType.Map) { + assertThat(completedStatistics.keyFrequency()) + .isEqualTo( + ImmutableMap.of( + CHAR_KEYS.get("a"), 1L, + CHAR_KEYS.get("b"), 1L)); + } else { + assertThat(completedStatistics.keySamples()) + .containsExactly(CHAR_KEYS.get("a"), CHAR_KEYS.get("b")); + } + } + + @Test + public void coordinatorSwitchToSketchOverThreshold() { + int parallelism = 3; + int downstreamParallelism = 3; + int switchToSketchThreshold = 3; + AggregatedStatisticsTracker tracker = + new AggregatedStatisticsTracker( + "testOperator", + parallelism, + Fixtures.SCHEMA, + Fixtures.SORT_ORDER, + downstreamParallelism, + StatisticsType.Auto, + switchToSketchThreshold, + null); + + StatisticsEvent checkpoint1Subtask0StatisticsEvent = + createStatisticsEvent( + StatisticsType.Map, + TASK_STATISTICS_SERIALIZER, + 1L, + CHAR_KEYS.get("a"), + CHAR_KEYS.get("b")); + CompletedStatistics completedStatistics = + tracker.updateAndCheckCompletion(0, checkpoint1Subtask0StatisticsEvent); + assertThat(completedStatistics).isNull(); + assertThat(tracker.aggregationsPerCheckpoint().keySet()).containsExactlyInAnyOrder(1L); + AggregatedStatisticsTracker.Aggregation aggregation = + tracker.aggregationsPerCheckpoint().get(1L); + assertThat(aggregation.subtaskSet()).containsExactlyInAnyOrder(0); + assertThat(aggregation.currentType()).isEqualTo(StatisticsType.Map); + assertThat(aggregation.sketchStatistics()).isNull(); + assertThat(aggregation.mapStatistics()) + .isEqualTo(ImmutableMap.of(CHAR_KEYS.get("a"), 1L, CHAR_KEYS.get("b"), 1L)); + + StatisticsEvent checkpoint1Subtask1StatisticsEvent = + createStatisticsEvent( + StatisticsType.Map, + TASK_STATISTICS_SERIALIZER, + 1L, + CHAR_KEYS.get("c"), + CHAR_KEYS.get("d")); + completedStatistics = tracker.updateAndCheckCompletion(1, checkpoint1Subtask1StatisticsEvent); + assertThat(completedStatistics).isNull(); + assertThat(tracker.aggregationsPerCheckpoint().keySet()).containsExactlyInAnyOrder(1L); + aggregation = tracker.aggregationsPerCheckpoint().get(1L); + assertThat(aggregation.subtaskSet()).containsExactlyInAnyOrder(0, 1); + // converted to sketch statistics as map size is 4 
(over the switch threshold of 3) + assertThat(aggregation.currentType()).isEqualTo(StatisticsType.Sketch); + assertThat(aggregation.mapStatistics()).isNull(); + assertThat(aggregation.sketchStatistics().getResult().getSamples()) + .containsExactlyInAnyOrder( + CHAR_KEYS.get("a"), CHAR_KEYS.get("b"), CHAR_KEYS.get("c"), CHAR_KEYS.get("d")); + + StatisticsEvent checkpoint1Subtask2StatisticsEvent = + createStatisticsEvent( + StatisticsType.Map, + TASK_STATISTICS_SERIALIZER, + 1L, + CHAR_KEYS.get("e"), + CHAR_KEYS.get("f")); + completedStatistics = tracker.updateAndCheckCompletion(2, checkpoint1Subtask2StatisticsEvent); + assertThat(tracker.aggregationsPerCheckpoint()).isEmpty(); + assertThat(completedStatistics).isNotNull(); + assertThat(completedStatistics.type()).isEqualTo(StatisticsType.Sketch); + assertThat(completedStatistics.keySamples()) + .containsExactly( + CHAR_KEYS.get("a"), + CHAR_KEYS.get("b"), + CHAR_KEYS.get("c"), + CHAR_KEYS.get("d"), + CHAR_KEYS.get("e"), + CHAR_KEYS.get("f")); + } + + @Test + public void coordinatorMapOperatorSketch() { + int parallelism = 3; + int downstreamParallelism = 3; + AggregatedStatisticsTracker tracker = + new AggregatedStatisticsTracker( + "testOperator", + parallelism, + Fixtures.SCHEMA, + Fixtures.SORT_ORDER, + downstreamParallelism, + StatisticsType.Auto, + SketchUtil.COORDINATOR_SKETCH_SWITCH_THRESHOLD, + null); + + // first operator event has map statistics + StatisticsEvent checkpoint1Subtask0StatisticsEvent = + createStatisticsEvent( + StatisticsType.Map, + TASK_STATISTICS_SERIALIZER, + 1L, + CHAR_KEYS.get("a"), + CHAR_KEYS.get("b")); + CompletedStatistics completedStatistics = + tracker.updateAndCheckCompletion(0, checkpoint1Subtask0StatisticsEvent); + assertThat(completedStatistics).isNull(); + assertThat(tracker.aggregationsPerCheckpoint().keySet()).containsExactlyInAnyOrder(1L); + AggregatedStatisticsTracker.Aggregation aggregation = + tracker.aggregationsPerCheckpoint().get(1L); + assertThat(aggregation.subtaskSet()).containsExactlyInAnyOrder(0); + assertThat(aggregation.currentType()).isEqualTo(StatisticsType.Map); + assertThat(aggregation.sketchStatistics()).isNull(); + assertThat(aggregation.mapStatistics()) + .isEqualTo(ImmutableMap.of(CHAR_KEYS.get("a"), 1L, CHAR_KEYS.get("b"), 1L)); + + // second operator event contains sketch statistics + StatisticsEvent checkpoint1Subtask1StatisticsEvent = + createStatisticsEvent( + StatisticsType.Sketch, + TASK_STATISTICS_SERIALIZER, + 1L, + CHAR_KEYS.get("c"), + CHAR_KEYS.get("d")); + completedStatistics = tracker.updateAndCheckCompletion(1, checkpoint1Subtask1StatisticsEvent); + assertThat(completedStatistics).isNull(); + assertThat(tracker.aggregationsPerCheckpoint().keySet()).containsExactlyInAnyOrder(1L); + aggregation = tracker.aggregationsPerCheckpoint().get(1L); + assertThat(aggregation.subtaskSet()).containsExactlyInAnyOrder(0, 1); + assertThat(aggregation.currentType()).isEqualTo(StatisticsType.Sketch); + assertThat(aggregation.mapStatistics()).isNull(); + assertThat(aggregation.sketchStatistics().getResult().getSamples()) + .containsExactlyInAnyOrder( + CHAR_KEYS.get("a"), CHAR_KEYS.get("b"), CHAR_KEYS.get("c"), CHAR_KEYS.get("d")); + + // third operator event has Map statistics + StatisticsEvent checkpoint1Subtask2StatisticsEvent = + createStatisticsEvent( + StatisticsType.Map, + TASK_STATISTICS_SERIALIZER, + 1L, + CHAR_KEYS.get("e"), + CHAR_KEYS.get("f")); + completedStatistics = tracker.updateAndCheckCompletion(2, checkpoint1Subtask2StatisticsEvent); + 
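+    // Note: the aggregation already switched to Sketch when subtask 1 reported sketch
+    // statistics, so subtask 2's Map statistics are folded into the sketch and the completed
+    // result below is Sketch-typed even though two of the three operators reported maps.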
assertThat(tracker.aggregationsPerCheckpoint()).isEmpty(); + assertThat(completedStatistics).isNotNull(); + assertThat(completedStatistics.type()).isEqualTo(StatisticsType.Sketch); + assertThat(completedStatistics.keySamples()) + .containsExactly( + CHAR_KEYS.get("a"), + CHAR_KEYS.get("b"), + CHAR_KEYS.get("c"), + CHAR_KEYS.get("d"), + CHAR_KEYS.get("e"), + CHAR_KEYS.get("f")); + } + + private AggregatedStatisticsTracker createTracker(StatisticsType type) { + return new AggregatedStatisticsTracker( + "testOperator", + Fixtures.NUM_SUBTASKS, + Fixtures.SCHEMA, + Fixtures.SORT_ORDER, + Fixtures.NUM_SUBTASKS, + type, + SketchUtil.COORDINATOR_SKETCH_SWITCH_THRESHOLD, + null); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestCompletedStatisticsSerializer.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestCompletedStatisticsSerializer.java new file mode 100644 index 000000000000..4ee9888934a8 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestCompletedStatisticsSerializer.java @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.shuffle; + +import static org.apache.iceberg.flink.sink.shuffle.Fixtures.CHAR_KEYS; + +import org.apache.flink.api.common.typeutils.SerializerTestBase; +import org.apache.flink.api.common.typeutils.TypeSerializer; +import org.apache.iceberg.SortKey; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; + +public class TestCompletedStatisticsSerializer extends SerializerTestBase { + + @Override + protected TypeSerializer createSerializer() { + return Fixtures.COMPLETED_STATISTICS_SERIALIZER; + } + + @Override + protected int getLength() { + return -1; + } + + @Override + protected Class getTypeClass() { + return CompletedStatistics.class; + } + + @Override + protected CompletedStatistics[] getTestData() { + + return new CompletedStatistics[] { + CompletedStatistics.fromKeyFrequency( + 1L, ImmutableMap.of(CHAR_KEYS.get("a"), 1L, CHAR_KEYS.get("b"), 2L)), + CompletedStatistics.fromKeySamples(2L, new SortKey[] {CHAR_KEYS.get("a"), CHAR_KEYS.get("b")}) + }; + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsCoordinator.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsCoordinator.java new file mode 100644 index 000000000000..a08a8a73e80c --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsCoordinator.java @@ -0,0 +1,246 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.shuffle; + +import static org.apache.iceberg.flink.sink.shuffle.Fixtures.CHAR_KEYS; +import static org.apache.iceberg.flink.sink.shuffle.Fixtures.NUM_SUBTASKS; +import static org.apache.iceberg.flink.sink.shuffle.Fixtures.SORT_ORDER_COMPARTOR; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.time.Duration; +import java.util.Map; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ExecutionException; +import org.apache.flink.runtime.jobgraph.OperatorID; +import org.apache.flink.runtime.operators.coordination.EventReceivingTasks; +import org.apache.flink.runtime.operators.coordination.MockOperatorCoordinatorContext; +import org.apache.flink.util.ExceptionUtils; +import org.apache.iceberg.SortKey; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.awaitility.Awaitility; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.EnumSource; + +public class TestDataStatisticsCoordinator { + private static final String OPERATOR_NAME = "TestCoordinator"; + private static final OperatorID TEST_OPERATOR_ID = new OperatorID(1234L, 5678L); + + private EventReceivingTasks receivingTasks; + + @BeforeEach + public void before() throws Exception { + receivingTasks = EventReceivingTasks.createForRunningTasks(); + } + + private void tasksReady(DataStatisticsCoordinator coordinator) { + setAllTasksReady(NUM_SUBTASKS, coordinator, receivingTasks); + } + + @ParameterizedTest + @EnumSource(StatisticsType.class) + public void testThrowExceptionWhenNotStarted(StatisticsType type) throws Exception { + try (DataStatisticsCoordinator dataStatisticsCoordinator = createCoordinator(type)) { + String failureMessage = "The coordinator of TestCoordinator has not started yet."; + assertThatThrownBy( + () -> + dataStatisticsCoordinator.handleEventFromOperator( + 0, + 0, + StatisticsEvent.createTaskStatisticsEvent( + 0, new MapDataStatistics(), Fixtures.TASK_STATISTICS_SERIALIZER))) + .isInstanceOf(IllegalStateException.class) + .hasMessage(failureMessage); + assertThatThrownBy(() -> dataStatisticsCoordinator.executionAttemptFailed(0, 0, null)) + .isInstanceOf(IllegalStateException.class) + .hasMessage(failureMessage); + assertThatThrownBy(() -> dataStatisticsCoordinator.checkpointCoordinator(0, null)) + .isInstanceOf(IllegalStateException.class) + .hasMessage(failureMessage); + } + } + + @ParameterizedTest + @EnumSource(StatisticsType.class) + public void testDataStatisticsEventHandling(StatisticsType type) throws Exception { + try (DataStatisticsCoordinator dataStatisticsCoordinator = createCoordinator(type)) { + 
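+      // Start the coordinator and mark every subtask gateway as ready before sending statistics events.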
dataStatisticsCoordinator.start(); + tasksReady(dataStatisticsCoordinator); + + StatisticsEvent checkpoint1Subtask0DataStatisticEvent = + Fixtures.createStatisticsEvent( + type, + Fixtures.TASK_STATISTICS_SERIALIZER, + 1L, + CHAR_KEYS.get("a"), + CHAR_KEYS.get("b"), + CHAR_KEYS.get("b"), + CHAR_KEYS.get("c"), + CHAR_KEYS.get("c"), + CHAR_KEYS.get("c")); + StatisticsEvent checkpoint1Subtask1DataStatisticEvent = + Fixtures.createStatisticsEvent( + type, + Fixtures.TASK_STATISTICS_SERIALIZER, + 1L, + CHAR_KEYS.get("a"), + CHAR_KEYS.get("b"), + CHAR_KEYS.get("c"), + CHAR_KEYS.get("c")); + // Handle events from operators for checkpoint 1 + dataStatisticsCoordinator.handleEventFromOperator( + 0, 0, checkpoint1Subtask0DataStatisticEvent); + dataStatisticsCoordinator.handleEventFromOperator( + 1, 0, checkpoint1Subtask1DataStatisticEvent); + + waitForCoordinatorToProcessActions(dataStatisticsCoordinator); + + Map keyFrequency = + ImmutableMap.of( + CHAR_KEYS.get("a"), 2L, + CHAR_KEYS.get("b"), 3L, + CHAR_KEYS.get("c"), 5L); + MapAssignment mapAssignment = + MapAssignment.fromKeyFrequency(NUM_SUBTASKS, keyFrequency, 0.0d, SORT_ORDER_COMPARTOR); + + CompletedStatistics completedStatistics = dataStatisticsCoordinator.completedStatistics(); + assertThat(completedStatistics.checkpointId()).isEqualTo(1L); + assertThat(completedStatistics.type()).isEqualTo(StatisticsUtil.collectType(type)); + if (StatisticsUtil.collectType(type) == StatisticsType.Map) { + assertThat(completedStatistics.keyFrequency()).isEqualTo(keyFrequency); + } else { + assertThat(completedStatistics.keySamples()) + .containsExactly( + CHAR_KEYS.get("a"), + CHAR_KEYS.get("a"), + CHAR_KEYS.get("b"), + CHAR_KEYS.get("b"), + CHAR_KEYS.get("b"), + CHAR_KEYS.get("c"), + CHAR_KEYS.get("c"), + CHAR_KEYS.get("c"), + CHAR_KEYS.get("c"), + CHAR_KEYS.get("c")); + } + + GlobalStatistics globalStatistics = dataStatisticsCoordinator.globalStatistics(); + assertThat(globalStatistics.checkpointId()).isEqualTo(1L); + assertThat(globalStatistics.type()).isEqualTo(StatisticsUtil.collectType(type)); + if (StatisticsUtil.collectType(type) == StatisticsType.Map) { + assertThat(globalStatistics.mapAssignment()).isEqualTo(mapAssignment); + } else { + assertThat(globalStatistics.rangeBounds()).containsExactly(CHAR_KEYS.get("b")); + } + } + } + + @Test + public void testRequestGlobalStatisticsEventHandling() throws Exception { + try (DataStatisticsCoordinator dataStatisticsCoordinator = + createCoordinator(StatisticsType.Sketch)) { + dataStatisticsCoordinator.start(); + tasksReady(dataStatisticsCoordinator); + + // receive request before global statistics is ready + dataStatisticsCoordinator.handleEventFromOperator(0, 0, new RequestGlobalStatisticsEvent()); + assertThat(receivingTasks.getSentEventsForSubtask(0)).isEmpty(); + assertThat(receivingTasks.getSentEventsForSubtask(1)).isEmpty(); + + StatisticsEvent checkpoint1Subtask0DataStatisticEvent = + Fixtures.createStatisticsEvent( + StatisticsType.Sketch, Fixtures.TASK_STATISTICS_SERIALIZER, 1L, CHAR_KEYS.get("a")); + StatisticsEvent checkpoint1Subtask1DataStatisticEvent = + Fixtures.createStatisticsEvent( + StatisticsType.Sketch, Fixtures.TASK_STATISTICS_SERIALIZER, 1L, CHAR_KEYS.get("b")); + // Handle events from operators for checkpoint 1 + dataStatisticsCoordinator.handleEventFromOperator( + 0, 0, checkpoint1Subtask0DataStatisticEvent); + dataStatisticsCoordinator.handleEventFromOperator( + 1, 0, checkpoint1Subtask1DataStatisticEvent); + + waitForCoordinatorToProcessActions(dataStatisticsCoordinator); + 
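+      // After both subtasks report for checkpoint 1, the coordinator aggregates the global
+      // statistics and sends a StatisticsEvent to each ready subtask; the waits below observe that delivery.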
Awaitility.await("wait for statistics event") + .pollInterval(Duration.ofMillis(10)) + .atMost(Duration.ofSeconds(10)) + .until(() -> receivingTasks.getSentEventsForSubtask(0).size() == 1); + assertThat(receivingTasks.getSentEventsForSubtask(0).get(0)) + .isInstanceOf(StatisticsEvent.class); + + Awaitility.await("wait for statistics event") + .pollInterval(Duration.ofMillis(10)) + .atMost(Duration.ofSeconds(10)) + .until(() -> receivingTasks.getSentEventsForSubtask(1).size() == 1); + assertThat(receivingTasks.getSentEventsForSubtask(1).get(0)) + .isInstanceOf(StatisticsEvent.class); + + dataStatisticsCoordinator.handleEventFromOperator(1, 0, new RequestGlobalStatisticsEvent()); + + // coordinator should send a response to subtask 1 + Awaitility.await("wait for statistics event") + .pollInterval(Duration.ofMillis(10)) + .atMost(Duration.ofSeconds(10)) + .until(() -> receivingTasks.getSentEventsForSubtask(1).size() == 2); + assertThat(receivingTasks.getSentEventsForSubtask(1).get(0)) + .isInstanceOf(StatisticsEvent.class); + assertThat(receivingTasks.getSentEventsForSubtask(1).get(1)) + .isInstanceOf(StatisticsEvent.class); + } + } + + static void setAllTasksReady( + int subtasks, + DataStatisticsCoordinator dataStatisticsCoordinator, + EventReceivingTasks receivingTasks) { + for (int i = 0; i < subtasks; i++) { + dataStatisticsCoordinator.executionAttemptReady( + i, 0, receivingTasks.createGatewayForSubtask(i, 0)); + } + } + + static void waitForCoordinatorToProcessActions(DataStatisticsCoordinator coordinator) { + CompletableFuture future = new CompletableFuture<>(); + coordinator.callInCoordinatorThread( + () -> { + future.complete(null); + return null; + }, + "Coordinator fails to process action"); + + try { + future.get(); + } catch (InterruptedException e) { + throw new AssertionError("test interrupted"); + } catch (ExecutionException e) { + ExceptionUtils.rethrow(ExceptionUtils.stripExecutionException(e)); + } + } + + private static DataStatisticsCoordinator createCoordinator(StatisticsType type) { + return new DataStatisticsCoordinator( + OPERATOR_NAME, + new MockOperatorCoordinatorContext(TEST_OPERATOR_ID, NUM_SUBTASKS), + Fixtures.SCHEMA, + Fixtures.SORT_ORDER, + NUM_SUBTASKS, + type, + 0.0d); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsCoordinatorProvider.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsCoordinatorProvider.java new file mode 100644 index 000000000000..6317f2bfde18 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsCoordinatorProvider.java @@ -0,0 +1,187 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink.shuffle; + +import static org.apache.iceberg.flink.sink.shuffle.Fixtures.CHAR_KEYS; +import static org.apache.iceberg.flink.sink.shuffle.Fixtures.SORT_ORDER_COMPARTOR; +import static org.apache.iceberg.flink.sink.shuffle.Fixtures.TASK_STATISTICS_SERIALIZER; +import static org.apache.iceberg.flink.sink.shuffle.Fixtures.createStatisticsEvent; +import static org.assertj.core.api.Assertions.assertThat; + +import java.util.Map; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ExecutionException; +import org.apache.flink.runtime.jobgraph.OperatorID; +import org.apache.flink.runtime.operators.coordination.EventReceivingTasks; +import org.apache.flink.runtime.operators.coordination.MockOperatorCoordinatorContext; +import org.apache.flink.runtime.operators.coordination.RecreateOnResetOperatorCoordinator; +import org.apache.iceberg.SortKey; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.EnumSource; + +public class TestDataStatisticsCoordinatorProvider { + private static final OperatorID OPERATOR_ID = new OperatorID(); + + private EventReceivingTasks receivingTasks; + + @BeforeEach + public void before() { + receivingTasks = EventReceivingTasks.createForRunningTasks(); + } + + @ParameterizedTest + @EnumSource(StatisticsType.class) + public void testCheckpointAndReset(StatisticsType type) throws Exception { + DataStatisticsCoordinatorProvider provider = createProvider(type, Fixtures.NUM_SUBTASKS); + try (RecreateOnResetOperatorCoordinator coordinator = + (RecreateOnResetOperatorCoordinator) + provider.create( + new MockOperatorCoordinatorContext(OPERATOR_ID, Fixtures.NUM_SUBTASKS))) { + DataStatisticsCoordinator dataStatisticsCoordinator = + (DataStatisticsCoordinator) coordinator.getInternalCoordinator(); + + // Start the coordinator + coordinator.start(); + TestDataStatisticsCoordinator.setAllTasksReady( + Fixtures.NUM_SUBTASKS, dataStatisticsCoordinator, receivingTasks); + + // Handle events from operators for checkpoint 1 + StatisticsEvent checkpoint1Subtask0StatisticsEvent = + createStatisticsEvent(type, TASK_STATISTICS_SERIALIZER, 1L, CHAR_KEYS.get("a")); + coordinator.handleEventFromOperator(0, 0, checkpoint1Subtask0StatisticsEvent); + TestDataStatisticsCoordinator.waitForCoordinatorToProcessActions(dataStatisticsCoordinator); + + StatisticsEvent checkpoint1Subtask1StatisticsEvent = + createStatisticsEvent(type, TASK_STATISTICS_SERIALIZER, 1L, CHAR_KEYS.get("b")); + coordinator.handleEventFromOperator(1, 0, checkpoint1Subtask1StatisticsEvent); + TestDataStatisticsCoordinator.waitForCoordinatorToProcessActions(dataStatisticsCoordinator); + + // Verify checkpoint 1 global data statistics + Map checkpoint1KeyFrequency = + ImmutableMap.of(CHAR_KEYS.get("a"), 1L, CHAR_KEYS.get("b"), 1L); + MapAssignment checkpoint1MapAssignment = + MapAssignment.fromKeyFrequency( + Fixtures.NUM_SUBTASKS, checkpoint1KeyFrequency, 0.0d, SORT_ORDER_COMPARTOR); + + CompletedStatistics completedStatistics = dataStatisticsCoordinator.completedStatistics(); + assertThat(completedStatistics).isNotNull(); + assertThat(completedStatistics.type()).isEqualTo(StatisticsUtil.collectType(type)); + if (StatisticsUtil.collectType(type) == StatisticsType.Map) { + assertThat(completedStatistics.keyFrequency()).isEqualTo(checkpoint1KeyFrequency); + } else { + 
assertThat(completedStatistics.keySamples()) + .containsExactly(CHAR_KEYS.get("a"), CHAR_KEYS.get("b")); + } + + GlobalStatistics globalStatistics = dataStatisticsCoordinator.globalStatistics(); + assertThat(globalStatistics).isNotNull(); + assertThat(globalStatistics.type()).isEqualTo(StatisticsUtil.collectType(type)); + if (StatisticsUtil.collectType(type) == StatisticsType.Map) { + assertThat(globalStatistics.mapAssignment()).isEqualTo(checkpoint1MapAssignment); + } else { + assertThat(globalStatistics.rangeBounds()).containsExactly(CHAR_KEYS.get("a")); + } + + byte[] checkpoint1Bytes = waitForCheckpoint(1L, dataStatisticsCoordinator); + + StatisticsEvent checkpoint2Subtask0StatisticsEvent = + createStatisticsEvent( + type, TASK_STATISTICS_SERIALIZER, 2L, CHAR_KEYS.get("d"), CHAR_KEYS.get("e")); + coordinator.handleEventFromOperator(0, 0, checkpoint2Subtask0StatisticsEvent); + TestDataStatisticsCoordinator.waitForCoordinatorToProcessActions(dataStatisticsCoordinator); + + StatisticsEvent checkpoint2Subtask1StatisticsEvent = + createStatisticsEvent(type, TASK_STATISTICS_SERIALIZER, 2L, CHAR_KEYS.get("f")); + coordinator.handleEventFromOperator(1, 0, checkpoint2Subtask1StatisticsEvent); + TestDataStatisticsCoordinator.waitForCoordinatorToProcessActions(dataStatisticsCoordinator); + + // Verify checkpoint 2 global data statistics + Map checkpoint2KeyFrequency = + ImmutableMap.of(CHAR_KEYS.get("d"), 1L, CHAR_KEYS.get("e"), 1L, CHAR_KEYS.get("f"), 1L); + MapAssignment checkpoint2MapAssignment = + MapAssignment.fromKeyFrequency( + Fixtures.NUM_SUBTASKS, checkpoint2KeyFrequency, 0.0d, SORT_ORDER_COMPARTOR); + completedStatistics = dataStatisticsCoordinator.completedStatistics(); + assertThat(completedStatistics.type()).isEqualTo(StatisticsUtil.collectType(type)); + if (StatisticsUtil.collectType(type) == StatisticsType.Map) { + assertThat(completedStatistics.keyFrequency()).isEqualTo(checkpoint2KeyFrequency); + } else { + assertThat(completedStatistics.keySamples()) + .containsExactly(CHAR_KEYS.get("d"), CHAR_KEYS.get("e"), CHAR_KEYS.get("f")); + } + + globalStatistics = dataStatisticsCoordinator.globalStatistics(); + assertThat(globalStatistics.type()).isEqualTo(StatisticsUtil.collectType(type)); + if (StatisticsUtil.collectType(type) == StatisticsType.Map) { + assertThat(globalStatistics.mapAssignment()).isEqualTo(checkpoint2MapAssignment); + } else { + assertThat(globalStatistics.rangeBounds()).containsExactly(CHAR_KEYS.get("e")); + } + + waitForCheckpoint(2L, dataStatisticsCoordinator); + + // Reset coordinator to checkpoint 1 + coordinator.resetToCheckpoint(1L, checkpoint1Bytes); + DataStatisticsCoordinator restoredDataStatisticsCoordinator = + (DataStatisticsCoordinator) coordinator.getInternalCoordinator(); + assertThat(dataStatisticsCoordinator).isNotSameAs(restoredDataStatisticsCoordinator); + + completedStatistics = restoredDataStatisticsCoordinator.completedStatistics(); + assertThat(completedStatistics.type()).isEqualTo(StatisticsUtil.collectType(type)); + // Verify restored data statistics + if (StatisticsUtil.collectType(type) == StatisticsType.Map) { + assertThat(completedStatistics.keyFrequency()) + .isEqualTo(ImmutableMap.of(CHAR_KEYS.get("a"), 1L, CHAR_KEYS.get("b"), 1L)); + } else { + assertThat(completedStatistics.keySamples()) + .containsExactly(CHAR_KEYS.get("a"), CHAR_KEYS.get("b")); + } + + globalStatistics = restoredDataStatisticsCoordinator.globalStatistics(); + assertThat(globalStatistics).isNotNull(); + 
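+      // The restored global statistics should reflect checkpoint 1, not the later checkpoint 2 state.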
assertThat(globalStatistics.type()).isEqualTo(StatisticsUtil.collectType(type)); + if (StatisticsUtil.collectType(type) == StatisticsType.Map) { + assertThat(globalStatistics.mapAssignment()).isEqualTo(checkpoint1MapAssignment); + } else { + assertThat(globalStatistics.rangeBounds()).containsExactly(CHAR_KEYS.get("a")); + } + } + } + + private byte[] waitForCheckpoint(long checkpointId, DataStatisticsCoordinator coordinator) + throws InterruptedException, ExecutionException { + CompletableFuture future = new CompletableFuture<>(); + coordinator.checkpointCoordinator(checkpointId, future); + return future.get(); + } + + private static DataStatisticsCoordinatorProvider createProvider( + StatisticsType type, int downstreamParallelism) { + return new DataStatisticsCoordinatorProvider( + "DataStatisticsCoordinatorProvider", + OPERATOR_ID, + Fixtures.SCHEMA, + Fixtures.SORT_ORDER, + downstreamParallelism, + type, + 0.0); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsOperator.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsOperator.java new file mode 100644 index 000000000000..bc248b778184 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsOperator.java @@ -0,0 +1,352 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink.shuffle; + +import static org.apache.iceberg.flink.sink.shuffle.Fixtures.CHAR_KEYS; +import static org.apache.iceberg.flink.sink.shuffle.Fixtures.SORT_ORDER_COMPARTOR; +import static org.assertj.core.api.Assertions.assertThat; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.Mockito.verify; + +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; +import java.util.stream.Stream; +import org.apache.datasketches.sampling.ReservoirItemsSketch; +import org.apache.flink.api.common.ExecutionConfig; +import org.apache.flink.api.common.state.OperatorStateStore; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.core.fs.CloseableRegistry; +import org.apache.flink.runtime.checkpoint.OperatorSubtaskState; +import org.apache.flink.runtime.execution.Environment; +import org.apache.flink.runtime.operators.coordination.MockOperatorEventGateway; +import org.apache.flink.runtime.operators.testutils.MockInputSplitProvider; +import org.apache.flink.runtime.state.AbstractStateBackend; +import org.apache.flink.runtime.state.OperatorStateBackendParametersImpl; +import org.apache.flink.runtime.state.StateInitializationContext; +import org.apache.flink.runtime.state.StateInitializationContextImpl; +import org.apache.flink.runtime.state.TestTaskStateManager; +import org.apache.flink.runtime.state.hashmap.HashMapStateBackend; +import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; +import org.apache.flink.streaming.runtime.tasks.OneInputStreamTask; +import org.apache.flink.streaming.runtime.tasks.StreamMockEnvironment; +import org.apache.flink.streaming.util.MockOutput; +import org.apache.flink.streaming.util.MockStreamConfig; +import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.iceberg.SortKey; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.EnumSource; +import org.junit.jupiter.params.provider.MethodSource; +import org.mockito.Mockito; + +public class TestDataStatisticsOperator { + + private Environment env; + + @BeforeEach + public void before() throws Exception { + this.env = + new StreamMockEnvironment( + new Configuration(), + new Configuration(), + new ExecutionConfig(), + 1L, + new MockInputSplitProvider(), + 1, + new TestTaskStateManager()); + } + + private DataStatisticsOperator createOperator(StatisticsType type, int downstreamParallelism) + throws Exception { + MockOperatorEventGateway mockGateway = new MockOperatorEventGateway(); + return createOperator(type, downstreamParallelism, mockGateway); + } + + private DataStatisticsOperator createOperator( + StatisticsType type, int downstreamParallelism, MockOperatorEventGateway mockGateway) + throws Exception { + DataStatisticsOperator operator = + new DataStatisticsOperator( + "testOperator", + Fixtures.SCHEMA, + Fixtures.SORT_ORDER, + mockGateway, + downstreamParallelism, + type); + operator.setup( + new 
OneInputStreamTask(env), + new MockStreamConfig(new Configuration(), 1), + new MockOutput<>(Lists.newArrayList())); + return operator; + } + + @SuppressWarnings("unchecked") + @ParameterizedTest + @EnumSource(StatisticsType.class) + public void testProcessElement(StatisticsType type) throws Exception { + DataStatisticsOperator operator = createOperator(type, Fixtures.NUM_SUBTASKS); + try (OneInputStreamOperatorTestHarness testHarness = + createHarness(operator)) { + StateInitializationContext stateContext = getStateContext(); + operator.initializeState(stateContext); + operator.processElement(new StreamRecord<>(GenericRowData.of(StringData.fromString("a"), 5))); + operator.processElement(new StreamRecord<>(GenericRowData.of(StringData.fromString("a"), 3))); + operator.processElement(new StreamRecord<>(GenericRowData.of(StringData.fromString("b"), 1))); + + DataStatistics localStatistics = operator.localStatistics(); + assertThat(localStatistics.type()).isEqualTo(StatisticsUtil.collectType(type)); + if (StatisticsUtil.collectType(type) == StatisticsType.Map) { + Map keyFrequency = (Map) localStatistics.result(); + assertThat(keyFrequency) + .isEqualTo(ImmutableMap.of(CHAR_KEYS.get("a"), 2L, CHAR_KEYS.get("b"), 1L)); + } else { + ReservoirItemsSketch sketch = + (ReservoirItemsSketch) localStatistics.result(); + assertThat(sketch.getSamples()) + .containsExactly(CHAR_KEYS.get("a"), CHAR_KEYS.get("a"), CHAR_KEYS.get("b")); + } + + testHarness.endInput(); + } + } + + @ParameterizedTest + @EnumSource(StatisticsType.class) + public void testOperatorOutput(StatisticsType type) throws Exception { + DataStatisticsOperator operator = createOperator(type, Fixtures.NUM_SUBTASKS); + try (OneInputStreamOperatorTestHarness testHarness = + createHarness(operator)) { + testHarness.processElement( + new StreamRecord<>(GenericRowData.of(StringData.fromString("a"), 2))); + testHarness.processElement( + new StreamRecord<>(GenericRowData.of(StringData.fromString("b"), 3))); + testHarness.processElement( + new StreamRecord<>(GenericRowData.of(StringData.fromString("b"), 1))); + + List recordsOutput = + testHarness.extractOutputValues().stream() + .filter(StatisticsOrRecord::hasRecord) + .map(StatisticsOrRecord::record) + .collect(Collectors.toList()); + assertThat(recordsOutput) + .containsExactlyInAnyOrderElementsOf( + ImmutableList.of( + GenericRowData.of(StringData.fromString("a"), 2), + GenericRowData.of(StringData.fromString("b"), 3), + GenericRowData.of(StringData.fromString("b"), 1))); + } + } + + private static Stream provideRestoreStateParameters() { + return Stream.of( + Arguments.of(StatisticsType.Map, -1), + Arguments.of(StatisticsType.Map, 0), + Arguments.of(StatisticsType.Map, 1), + Arguments.of(StatisticsType.Sketch, -1), + Arguments.of(StatisticsType.Sketch, 0), + Arguments.of(StatisticsType.Sketch, 1)); + } + + @ParameterizedTest + @MethodSource("provideRestoreStateParameters") + public void testRestoreState(StatisticsType type, int parallelismAdjustment) throws Exception { + Map keyFrequency = + ImmutableMap.of(CHAR_KEYS.get("a"), 2L, CHAR_KEYS.get("b"), 1L, CHAR_KEYS.get("c"), 1L); + SortKey[] rangeBounds = new SortKey[] {CHAR_KEYS.get("a")}; + MapAssignment mapAssignment = + MapAssignment.fromKeyFrequency(2, keyFrequency, 0.0d, SORT_ORDER_COMPARTOR); + DataStatisticsOperator operator = createOperator(type, Fixtures.NUM_SUBTASKS); + OperatorSubtaskState snapshot; + try (OneInputStreamOperatorTestHarness testHarness1 = + createHarness(operator)) { + GlobalStatistics statistics; + if 
(StatisticsUtil.collectType(type) == StatisticsType.Map) { + statistics = GlobalStatistics.fromMapAssignment(1L, mapAssignment); + } else { + statistics = GlobalStatistics.fromRangeBounds(1L, rangeBounds); + } + + StatisticsEvent event = + StatisticsEvent.createGlobalStatisticsEvent( + statistics, Fixtures.GLOBAL_STATISTICS_SERIALIZER, false); + operator.handleOperatorEvent(event); + + GlobalStatistics globalStatistics = operator.globalStatistics(); + assertThat(globalStatistics.type()).isEqualTo(StatisticsUtil.collectType(type)); + if (StatisticsUtil.collectType(type) == StatisticsType.Map) { + assertThat(globalStatistics.mapAssignment()).isEqualTo(mapAssignment); + assertThat(globalStatistics.rangeBounds()).isNull(); + } else { + assertThat(globalStatistics.mapAssignment()).isNull(); + assertThat(globalStatistics.rangeBounds()).isEqualTo(rangeBounds); + } + + snapshot = testHarness1.snapshot(1L, 0); + } + + // Use the snapshot to initialize state for another new operator and then verify that the global + // statistics for the new operator is same as before + MockOperatorEventGateway spyGateway = Mockito.spy(new MockOperatorEventGateway()); + DataStatisticsOperator restoredOperator = + createOperator(type, Fixtures.NUM_SUBTASKS + parallelismAdjustment, spyGateway); + try (OneInputStreamOperatorTestHarness testHarness2 = + new OneInputStreamOperatorTestHarness<>(restoredOperator, 2, 2, 1)) { + testHarness2.setup(); + testHarness2.initializeState(snapshot); + + GlobalStatistics globalStatistics = restoredOperator.globalStatistics(); + // global statistics is always restored and used initially even if + // downstream parallelism changed. + assertThat(globalStatistics).isNotNull(); + // request is always sent to coordinator during initialization. + // coordinator would respond with a new global statistics that + // has range bound recomputed with new parallelism. 
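+      // The spied gateway confirms that exactly one RequestGlobalStatisticsEvent was sent to the
+      // coordinator while restoring state.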
+ verify(spyGateway).sendEventToCoordinator(any(RequestGlobalStatisticsEvent.class)); + assertThat(globalStatistics.type()).isEqualTo(StatisticsUtil.collectType(type)); + if (StatisticsUtil.collectType(type) == StatisticsType.Map) { + assertThat(globalStatistics.mapAssignment()).isEqualTo(mapAssignment); + assertThat(globalStatistics.rangeBounds()).isNull(); + } else { + assertThat(globalStatistics.mapAssignment()).isNull(); + assertThat(globalStatistics.rangeBounds()).isEqualTo(rangeBounds); + } + } + } + + @SuppressWarnings("unchecked") + @Test + public void testMigrationWithLocalStatsOverThreshold() throws Exception { + DataStatisticsOperator operator = createOperator(StatisticsType.Auto, Fixtures.NUM_SUBTASKS); + try (OneInputStreamOperatorTestHarness testHarness = + createHarness(operator)) { + StateInitializationContext stateContext = getStateContext(); + operator.initializeState(stateContext); + + // add rows with unique keys + for (int i = 0; i < SketchUtil.OPERATOR_SKETCH_SWITCH_THRESHOLD; ++i) { + operator.processElement( + new StreamRecord<>(GenericRowData.of(StringData.fromString(String.valueOf(i)), i))); + assertThat(operator.localStatistics().type()).isEqualTo(StatisticsType.Map); + assertThat((Map) operator.localStatistics().result()).hasSize(i + 1); + } + + // one more item should trigger the migration to sketch stats + operator.processElement( + new StreamRecord<>(GenericRowData.of(StringData.fromString("key-trigger-migration"), 1))); + + int reservoirSize = + SketchUtil.determineOperatorReservoirSize(Fixtures.NUM_SUBTASKS, Fixtures.NUM_SUBTASKS); + + assertThat(operator.localStatistics().type()).isEqualTo(StatisticsType.Sketch); + ReservoirItemsSketch sketch = + (ReservoirItemsSketch) operator.localStatistics().result(); + assertThat(sketch.getK()).isEqualTo(reservoirSize); + assertThat(sketch.getN()).isEqualTo(SketchUtil.OPERATOR_SKETCH_SWITCH_THRESHOLD + 1); + // reservoir not full yet + assertThat(sketch.getN()).isLessThan(reservoirSize); + assertThat(sketch.getSamples()).hasSize((int) sketch.getN()); + + // add more items to saturate the reservoir + for (int i = 0; i < reservoirSize; ++i) { + operator.processElement( + new StreamRecord<>(GenericRowData.of(StringData.fromString(String.valueOf(i)), i))); + } + + assertThat(operator.localStatistics().type()).isEqualTo(StatisticsType.Sketch); + sketch = (ReservoirItemsSketch) operator.localStatistics().result(); + assertThat(sketch.getK()).isEqualTo(reservoirSize); + assertThat(sketch.getN()) + .isEqualTo(SketchUtil.OPERATOR_SKETCH_SWITCH_THRESHOLD + 1 + reservoirSize); + // reservoir is full now + assertThat(sketch.getN()).isGreaterThan(reservoirSize); + assertThat(sketch.getSamples()).hasSize(reservoirSize); + + testHarness.endInput(); + } + } + + @SuppressWarnings("unchecked") + @Test + public void testMigrationWithGlobalSketchStatistics() throws Exception { + DataStatisticsOperator operator = createOperator(StatisticsType.Auto, Fixtures.NUM_SUBTASKS); + try (OneInputStreamOperatorTestHarness testHarness = + createHarness(operator)) { + StateInitializationContext stateContext = getStateContext(); + operator.initializeState(stateContext); + + // started with Map stype + operator.processElement(new StreamRecord<>(GenericRowData.of(StringData.fromString("a"), 1))); + assertThat(operator.localStatistics().type()).isEqualTo(StatisticsType.Map); + assertThat((Map) operator.localStatistics().result()) + .isEqualTo(ImmutableMap.of(CHAR_KEYS.get("a"), 1L)); + + // received global statistics with sketch type + GlobalStatistics 
globalStatistics = + GlobalStatistics.fromRangeBounds( + 1L, new SortKey[] {CHAR_KEYS.get("c"), CHAR_KEYS.get("f")}); + operator.handleOperatorEvent( + StatisticsEvent.createGlobalStatisticsEvent( + globalStatistics, Fixtures.GLOBAL_STATISTICS_SERIALIZER, false)); + + int reservoirSize = + SketchUtil.determineOperatorReservoirSize(Fixtures.NUM_SUBTASKS, Fixtures.NUM_SUBTASKS); + + assertThat(operator.localStatistics().type()).isEqualTo(StatisticsType.Sketch); + ReservoirItemsSketch sketch = + (ReservoirItemsSketch) operator.localStatistics().result(); + assertThat(sketch.getK()).isEqualTo(reservoirSize); + assertThat(sketch.getN()).isEqualTo(1); + assertThat(sketch.getSamples()).isEqualTo(new SortKey[] {CHAR_KEYS.get("a")}); + + testHarness.endInput(); + } + } + + private StateInitializationContext getStateContext() throws Exception { + AbstractStateBackend abstractStateBackend = new HashMapStateBackend(); + CloseableRegistry cancelStreamRegistry = new CloseableRegistry(); + OperatorStateStore operatorStateStore = + abstractStateBackend.createOperatorStateBackend( + new OperatorStateBackendParametersImpl( + env, "test-operator", Collections.emptyList(), cancelStreamRegistry)); + return new StateInitializationContextImpl(null, operatorStateStore, null, null, null); + } + + private OneInputStreamOperatorTestHarness createHarness( + DataStatisticsOperator dataStatisticsOperator) throws Exception { + OneInputStreamOperatorTestHarness harness = + new OneInputStreamOperatorTestHarness<>( + dataStatisticsOperator, Fixtures.NUM_SUBTASKS, Fixtures.NUM_SUBTASKS, 0); + harness.setup( + new StatisticsOrRecordSerializer( + Fixtures.GLOBAL_STATISTICS_SERIALIZER, Fixtures.ROW_SERIALIZER)); + harness.open(); + return harness; + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsSerializer.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsSerializer.java new file mode 100644 index 000000000000..59ce6df05d9d --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsSerializer.java @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink.shuffle; + +import static org.apache.iceberg.flink.sink.shuffle.Fixtures.CHAR_KEYS; + +import org.apache.flink.api.common.typeutils.SerializerTestBase; +import org.apache.flink.api.common.typeutils.TypeSerializer; + +public class TestDataStatisticsSerializer extends SerializerTestBase { + @Override + protected TypeSerializer createSerializer() { + return Fixtures.TASK_STATISTICS_SERIALIZER; + } + + @Override + protected int getLength() { + return -1; + } + + @Override + protected Class getTypeClass() { + return DataStatistics.class; + } + + @Override + protected DataStatistics[] getTestData() { + return new DataStatistics[] { + new MapDataStatistics(), + Fixtures.createTaskStatistics( + StatisticsType.Map, CHAR_KEYS.get("a"), CHAR_KEYS.get("a"), CHAR_KEYS.get("b")), + new SketchDataStatistics(128), + Fixtures.createTaskStatistics( + StatisticsType.Sketch, CHAR_KEYS.get("a"), CHAR_KEYS.get("a"), CHAR_KEYS.get("b")) + }; + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestGlobalStatisticsSerializer.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestGlobalStatisticsSerializer.java new file mode 100644 index 000000000000..7afaf239c668 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestGlobalStatisticsSerializer.java @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink.shuffle; + +import static org.apache.iceberg.flink.sink.shuffle.Fixtures.CHAR_KEYS; +import static org.apache.iceberg.flink.sink.shuffle.Fixtures.SORT_ORDER_COMPARTOR; + +import org.apache.flink.api.common.typeutils.SerializerTestBase; +import org.apache.flink.api.common.typeutils.TypeSerializer; +import org.apache.iceberg.SortKey; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; + +public class TestGlobalStatisticsSerializer extends SerializerTestBase { + + @Override + protected TypeSerializer createSerializer() { + return Fixtures.GLOBAL_STATISTICS_SERIALIZER; + } + + @Override + protected int getLength() { + return -1; + } + + @Override + protected Class getTypeClass() { + return GlobalStatistics.class; + } + + @Override + protected GlobalStatistics[] getTestData() { + return new GlobalStatistics[] { + GlobalStatistics.fromMapAssignment( + 1L, + MapAssignment.fromKeyFrequency( + Fixtures.NUM_SUBTASKS, + ImmutableMap.of(CHAR_KEYS.get("a"), 1L, CHAR_KEYS.get("b"), 2L), + 0.0d, + SORT_ORDER_COMPARTOR)), + GlobalStatistics.fromRangeBounds(2L, new SortKey[] {CHAR_KEYS.get("a"), CHAR_KEYS.get("b")}) + }; + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestMapDataStatistics.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestMapDataStatistics.java new file mode 100644 index 000000000000..8a25c7ad9898 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestMapDataStatistics.java @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink.shuffle; + +import static org.apache.iceberg.flink.sink.shuffle.Fixtures.CHAR_KEYS; +import static org.apache.iceberg.flink.sink.shuffle.Fixtures.ROW_WRAPPER; +import static org.assertj.core.api.Assertions.assertThat; + +import java.util.Map; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.StringData; +import org.apache.iceberg.SortKey; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.junit.jupiter.api.Test; + +public class TestMapDataStatistics { + @SuppressWarnings("unchecked") + @Test + public void testAddsAndGet() { + MapDataStatistics dataStatistics = new MapDataStatistics(); + + GenericRowData reusedRow = GenericRowData.of(StringData.fromString("a"), 1); + Fixtures.SORT_KEY.wrap(ROW_WRAPPER.wrap(reusedRow)); + dataStatistics.add(Fixtures.SORT_KEY); + + reusedRow.setField(0, StringData.fromString("b")); + Fixtures.SORT_KEY.wrap(ROW_WRAPPER.wrap(reusedRow)); + dataStatistics.add(Fixtures.SORT_KEY); + + reusedRow.setField(0, StringData.fromString("c")); + Fixtures.SORT_KEY.wrap(ROW_WRAPPER.wrap(reusedRow)); + dataStatistics.add(Fixtures.SORT_KEY); + + reusedRow.setField(0, StringData.fromString("b")); + Fixtures.SORT_KEY.wrap(ROW_WRAPPER.wrap(reusedRow)); + dataStatistics.add(Fixtures.SORT_KEY); + + reusedRow.setField(0, StringData.fromString("a")); + Fixtures.SORT_KEY.wrap(ROW_WRAPPER.wrap(reusedRow)); + dataStatistics.add(Fixtures.SORT_KEY); + + reusedRow.setField(0, StringData.fromString("b")); + Fixtures.SORT_KEY.wrap(ROW_WRAPPER.wrap(reusedRow)); + dataStatistics.add(Fixtures.SORT_KEY); + + Map actual = (Map) dataStatistics.result(); + Map expected = + ImmutableMap.of(CHAR_KEYS.get("a"), 2L, CHAR_KEYS.get("b"), 3L, CHAR_KEYS.get("c"), 1L); + assertThat(actual).isEqualTo(expected); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestMapRangePartitioner.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestMapRangePartitioner.java new file mode 100644 index 000000000000..d5a0bebc74e7 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestMapRangePartitioner.java @@ -0,0 +1,434 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink.shuffle; + +import static org.apache.iceberg.flink.sink.shuffle.Fixtures.SORT_ORDER_COMPARTOR; +import static org.assertj.core.api.Assertions.assertThat; + +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.atomic.AtomicLong; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.SortKey; +import org.apache.iceberg.SortOrder; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.RowDataWrapper; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.relocated.com.google.common.collect.Sets; +import org.apache.iceberg.util.Pair; +import org.junit.jupiter.api.Test; + +public class TestMapRangePartitioner { + private static final SortOrder SORT_ORDER = + SortOrder.builderFor(TestFixtures.SCHEMA).asc("data").build(); + + private static final SortKey SORT_KEY = new SortKey(TestFixtures.SCHEMA, SORT_ORDER); + private static final RowType ROW_TYPE = FlinkSchemaUtil.convert(TestFixtures.SCHEMA); + private static final SortKey[] SORT_KEYS = initSortKeys(); + + private static SortKey[] initSortKeys() { + SortKey[] sortKeys = new SortKey[10]; + for (int i = 0; i < 10; ++i) { + RowData rowData = + GenericRowData.of(StringData.fromString("k" + i), i, StringData.fromString("2023-06-20")); + RowDataWrapper keyWrapper = new RowDataWrapper(ROW_TYPE, TestFixtures.SCHEMA.asStruct()); + keyWrapper.wrap(rowData); + SortKey sortKey = SORT_KEY.copy(); + sortKey.wrap(keyWrapper); + sortKeys[i] = sortKey; + } + return sortKeys; + } + + // Total weight is 800 + private final Map mapStatistics = + ImmutableMap.of( + SORT_KEYS[0], + 350L, + SORT_KEYS[1], + 230L, + SORT_KEYS[2], + 120L, + SORT_KEYS[3], + 40L, + SORT_KEYS[4], + 10L, + SORT_KEYS[5], + 10L, + SORT_KEYS[6], + 10L, + SORT_KEYS[7], + 10L, + SORT_KEYS[8], + 10L, + SORT_KEYS[9], + 10L); + + @Test + public void testEvenlyDividableNoClosingFileCost() { + int numPartitions = 8; + MapAssignment mapAssignment = + MapAssignment.fromKeyFrequency(numPartitions, mapStatistics, 0.0, SORT_ORDER_COMPARTOR); + + // each task should get targeted weight of 100 (=800/8) + Map expectedAssignment = + ImmutableMap.of( + SORT_KEYS[0], + new KeyAssignment( + ImmutableList.of(0, 1, 2, 3), ImmutableList.of(100L, 100L, 100L, 50L), 0L), + SORT_KEYS[1], + new KeyAssignment(ImmutableList.of(3, 4, 5), ImmutableList.of(50L, 100L, 80L), 0L), + SORT_KEYS[2], + new KeyAssignment(ImmutableList.of(5, 6), ImmutableList.of(20L, 100L), 0L), + SORT_KEYS[3], + new KeyAssignment(ImmutableList.of(7), ImmutableList.of(40L), 0L), + SORT_KEYS[4], + new KeyAssignment(ImmutableList.of(7), ImmutableList.of(10L), 0L), + SORT_KEYS[5], + new KeyAssignment(ImmutableList.of(7), ImmutableList.of(10L), 0L), + SORT_KEYS[6], + new KeyAssignment(ImmutableList.of(7), ImmutableList.of(10L), 0L), + SORT_KEYS[7], + new KeyAssignment(ImmutableList.of(7), ImmutableList.of(10L), 0L), + SORT_KEYS[8], + new KeyAssignment(ImmutableList.of(7), ImmutableList.of(10L), 0L), + SORT_KEYS[9], + new KeyAssignment(ImmutableList.of(7), 
ImmutableList.of(10L), 0L)); + assertThat(mapAssignment).isEqualTo(new MapAssignment(numPartitions, expectedAssignment)); + + // key: subtask id + // value pair: first is the assigned weight, second is the number of assigned keys + Map> expectedAssignmentInfo = + ImmutableMap.of( + 0, + Pair.of(100L, 1), + 1, + Pair.of(100L, 1), + 2, + Pair.of(100L, 1), + 3, + Pair.of(100L, 2), + 4, + Pair.of(100L, 1), + 5, + Pair.of(100L, 2), + 6, + Pair.of(100L, 1), + 7, + Pair.of(100L, 7)); + assertThat(mapAssignment.assignmentInfo()).isEqualTo(expectedAssignmentInfo); + + MapRangePartitioner partitioner = + new MapRangePartitioner(TestFixtures.SCHEMA, SORT_ORDER, mapAssignment); + Map>> partitionResults = + runPartitioner(partitioner, numPartitions, mapStatistics); + validatePartitionResults(expectedAssignmentInfo, partitionResults, 5.0); + } + + @Test + public void testEvenlyDividableWithClosingFileCost() { + int numPartitions = 8; + MapAssignment mapAssignment = + MapAssignment.fromKeyFrequency(numPartitions, mapStatistics, 5.0, SORT_ORDER_COMPARTOR); + + // target subtask weight is 100 before close file cost factored in. + // close file cost is 5 = 5% * 100. + // key weights before and after close file cost factored in + // before: 350, 230, 120, 40, 10, 10, 10, 10, 10, 10 + // close-cost: 20, 15, 10, 5, 5, 5, 5, 5, 5, 5 + // after: 370, 245, 130, 45, 15, 15, 15, 15, 15, 15 + // target subtask weight with close cost per subtask is 110 (880/8) + Map expectedAssignment = + ImmutableMap.of( + SORT_KEYS[0], + new KeyAssignment( + ImmutableList.of(0, 1, 2, 3), ImmutableList.of(110L, 110L, 110L, 40L), 5L), + SORT_KEYS[1], + new KeyAssignment(ImmutableList.of(3, 4, 5), ImmutableList.of(70L, 110L, 65L), 5L), + SORT_KEYS[2], + new KeyAssignment(ImmutableList.of(5, 6), ImmutableList.of(45L, 85L), 5L), + SORT_KEYS[3], + new KeyAssignment(ImmutableList.of(6, 7), ImmutableList.of(25L, 20L), 5L), + SORT_KEYS[4], + new KeyAssignment(ImmutableList.of(7), ImmutableList.of(15L), 5L), + SORT_KEYS[5], + new KeyAssignment(ImmutableList.of(7), ImmutableList.of(15L), 5L), + SORT_KEYS[6], + new KeyAssignment(ImmutableList.of(7), ImmutableList.of(15L), 5L), + SORT_KEYS[7], + new KeyAssignment(ImmutableList.of(7), ImmutableList.of(15L), 5L), + SORT_KEYS[8], + new KeyAssignment(ImmutableList.of(7), ImmutableList.of(15L), 5L), + SORT_KEYS[9], + new KeyAssignment(ImmutableList.of(7), ImmutableList.of(15L), 5L)); + assertThat(mapAssignment.keyAssignments()).isEqualTo(expectedAssignment); + + // key: subtask id + // value pair: first is the assigned weight (excluding close file cost) for the subtask, + // second is the number of keys assigned to the subtask + Map> expectedAssignmentInfo = + ImmutableMap.of( + 0, + Pair.of(105L, 1), + 1, + Pair.of(105L, 1), + 2, + Pair.of(105L, 1), + 3, + Pair.of(100L, 2), + 4, + Pair.of(105L, 1), + 5, + Pair.of(100L, 2), + 6, + Pair.of(100L, 2), + 7, + Pair.of(75L, 7)); + assertThat(mapAssignment.assignmentInfo()).isEqualTo(expectedAssignmentInfo); + + MapRangePartitioner partitioner = + new MapRangePartitioner(TestFixtures.SCHEMA, SORT_ORDER, mapAssignment); + Map>> partitionResults = + runPartitioner(partitioner, numPartitions, mapStatistics); + validatePartitionResults(expectedAssignmentInfo, partitionResults, 5.0); + } + + @Test + public void testNonDividableNoClosingFileCost() { + int numPartitions = 9; + MapAssignment mapAssignment = + MapAssignment.fromKeyFrequency(numPartitions, mapStatistics, 0.0, SORT_ORDER_COMPARTOR); + + // before: 350, 230, 120, 40, 10, 10, 10, 10, 10, 10 + // each 
task should get targeted weight of 89 = ceiling(800/9) + Map expectedAssignment = + ImmutableMap.of( + SORT_KEYS[0], + new KeyAssignment( + ImmutableList.of(0, 1, 2, 3), ImmutableList.of(89L, 89L, 89L, 83L), 0L), + SORT_KEYS[1], + new KeyAssignment( + ImmutableList.of(3, 4, 5, 6), ImmutableList.of(6L, 89L, 89L, 46L), 0L), + SORT_KEYS[2], + new KeyAssignment(ImmutableList.of(6, 7), ImmutableList.of(43L, 77L), 0L), + SORT_KEYS[3], + new KeyAssignment(ImmutableList.of(7, 8), ImmutableList.of(12L, 28L), 0L), + SORT_KEYS[4], + new KeyAssignment(ImmutableList.of(8), ImmutableList.of(10L), 0L), + SORT_KEYS[5], + new KeyAssignment(ImmutableList.of(8), ImmutableList.of(10L), 0L), + SORT_KEYS[6], + new KeyAssignment(ImmutableList.of(8), ImmutableList.of(10L), 0L), + SORT_KEYS[7], + new KeyAssignment(ImmutableList.of(8), ImmutableList.of(10L), 0L), + SORT_KEYS[8], + new KeyAssignment(ImmutableList.of(8), ImmutableList.of(10L), 0L), + SORT_KEYS[9], + new KeyAssignment(ImmutableList.of(8), ImmutableList.of(10L), 0L)); + assertThat(mapAssignment.keyAssignments()).isEqualTo(expectedAssignment); + + // key: subtask id + // value pair: first is the assigned weight, second is the number of assigned keys + Map> expectedAssignmentInfo = + ImmutableMap.of( + 0, + Pair.of(89L, 1), + 1, + Pair.of(89L, 1), + 2, + Pair.of(89L, 1), + 3, + Pair.of(89L, 2), + 4, + Pair.of(89L, 1), + 5, + Pair.of(89L, 1), + 6, + Pair.of(89L, 2), + 7, + Pair.of(89L, 2), + 8, + Pair.of(88L, 7)); + assertThat(mapAssignment.assignmentInfo()).isEqualTo(expectedAssignmentInfo); + + MapRangePartitioner partitioner = + new MapRangePartitioner(TestFixtures.SCHEMA, SORT_ORDER, mapAssignment); + Map>> partitionResults = + runPartitioner(partitioner, numPartitions, mapStatistics); + validatePartitionResults(expectedAssignmentInfo, partitionResults, 5.0); + } + + @Test + public void testNonDividableWithClosingFileCost() { + int numPartitions = 9; + MapAssignment mapAssignment = + MapAssignment.fromKeyFrequency(numPartitions, mapStatistics, 5.0, SORT_ORDER_COMPARTOR); + + // target subtask weight is 89 before close file cost factored in. + // close file cost is 5 (= 5% * 89) per file. 
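+    // (derived from the 5.0 close-file-cost percentage passed to MapAssignment.fromKeyFrequency above)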
+ // key weights before and after close file cost factored in + // before: 350, 230, 120, 40, 10, 10, 10, 10, 10, 10 + // close-cost: 20, 15, 10, 5, 5, 5, 5, 5, 5, 5 + // after: 370, 245, 130, 45, 15, 15, 15, 15, 15, 15 + // target subtask weight per subtask is 98 ceiling(880/9) + Map expectedAssignment = + ImmutableMap.of( + SORT_KEYS[0], + new KeyAssignment( + ImmutableList.of(0, 1, 2, 3), ImmutableList.of(98L, 98L, 98L, 76L), 5L), + SORT_KEYS[1], + new KeyAssignment( + ImmutableList.of(3, 4, 5, 6), ImmutableList.of(22L, 98L, 98L, 27L), 5L), + SORT_KEYS[2], + new KeyAssignment(ImmutableList.of(6, 7), ImmutableList.of(71L, 59L), 5L), + SORT_KEYS[3], + new KeyAssignment(ImmutableList.of(7, 8), ImmutableList.of(39L, 6L), 5L), + SORT_KEYS[4], + new KeyAssignment(ImmutableList.of(8), ImmutableList.of(15L), 5L), + SORT_KEYS[5], + new KeyAssignment(ImmutableList.of(8), ImmutableList.of(15L), 5L), + SORT_KEYS[6], + new KeyAssignment(ImmutableList.of(8), ImmutableList.of(15L), 5L), + SORT_KEYS[7], + new KeyAssignment(ImmutableList.of(8), ImmutableList.of(15L), 5L), + SORT_KEYS[8], + new KeyAssignment(ImmutableList.of(8), ImmutableList.of(15L), 5L), + SORT_KEYS[9], + new KeyAssignment(ImmutableList.of(8), ImmutableList.of(15L), 5L)); + assertThat(mapAssignment.keyAssignments()).isEqualTo(expectedAssignment); + + // key: subtask id + // value pair: first is the assigned weight for the subtask, second is the number of keys + // assigned to the subtask + Map> expectedAssignmentInfo = + ImmutableMap.of( + 0, + Pair.of(93L, 1), + 1, + Pair.of(93L, 1), + 2, + Pair.of(93L, 1), + 3, + Pair.of(88L, 2), + 4, + Pair.of(93L, 1), + 5, + Pair.of(93L, 1), + 6, + Pair.of(88L, 2), + 7, + Pair.of(88L, 2), + 8, + Pair.of(61L, 7)); + assertThat(mapAssignment.assignmentInfo()).isEqualTo(expectedAssignmentInfo); + + MapRangePartitioner partitioner = + new MapRangePartitioner(TestFixtures.SCHEMA, SORT_ORDER, mapAssignment); + Map>> partitionResults = + runPartitioner(partitioner, numPartitions, mapStatistics); + // drift threshold is high for non-dividable scenario with close cost + validatePartitionResults(expectedAssignmentInfo, partitionResults, 10.0); + } + + private static Map>> runPartitioner( + MapRangePartitioner partitioner, int numPartitions, Map mapStatistics) { + // The Map key is the subtaskId. + // For the map value pair, the first element is the count of assigned and + // the second element of Set is for the set of assigned keys. 
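+    // (the Set holds the distinct RowData records routed to that subtask)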
+ Map>> partitionResults = Maps.newHashMap(); + mapStatistics.forEach( + (sortKey, weight) -> { + String key = sortKey.get(0, String.class); + // run 100x times of the weight + long iterations = weight * 100; + for (int i = 0; i < iterations; ++i) { + RowData rowData = + GenericRowData.of( + StringData.fromString(key), 1, StringData.fromString("2023-06-20")); + int subtaskId = partitioner.partition(rowData, numPartitions); + partitionResults.computeIfAbsent( + subtaskId, k -> Pair.of(new AtomicLong(0), Sets.newHashSet())); + Pair> pair = partitionResults.get(subtaskId); + pair.first().incrementAndGet(); + pair.second().add(rowData); + } + }); + return partitionResults; + } + + /** @param expectedAssignmentInfo excluding closing cost */ + private void validatePartitionResults( + Map> expectedAssignmentInfo, + Map>> partitionResults, + double maxDriftPercentage) { + + assertThat(partitionResults.size()).isEqualTo(expectedAssignmentInfo.size()); + + List expectedAssignedKeyCounts = + Lists.newArrayListWithExpectedSize(expectedAssignmentInfo.size()); + List actualAssignedKeyCounts = + Lists.newArrayListWithExpectedSize(partitionResults.size()); + List expectedNormalizedWeights = + Lists.newArrayListWithExpectedSize(expectedAssignmentInfo.size()); + List actualNormalizedWeights = + Lists.newArrayListWithExpectedSize(partitionResults.size()); + + long expectedTotalWeight = + expectedAssignmentInfo.values().stream().mapToLong(Pair::first).sum(); + expectedAssignmentInfo.forEach( + (subtaskId, pair) -> { + expectedAssignedKeyCounts.add(pair.second()); + expectedNormalizedWeights.add(pair.first().doubleValue() / expectedTotalWeight); + }); + + long actualTotalWeight = + partitionResults.values().stream().mapToLong(pair -> pair.first().longValue()).sum(); + partitionResults.forEach( + (subtaskId, pair) -> { + actualAssignedKeyCounts.add(pair.second().size()); + actualNormalizedWeights.add(pair.first().doubleValue() / actualTotalWeight); + }); + + // number of assigned keys should match exactly + assertThat(actualAssignedKeyCounts) + .as("the number of assigned keys should match for every subtask") + .isEqualTo(expectedAssignedKeyCounts); + + // weight for every subtask shouldn't differ for more than some threshold relative to the + // expected weight + for (int subtaskId = 0; subtaskId < expectedNormalizedWeights.size(); ++subtaskId) { + double expectedWeight = expectedNormalizedWeights.get(subtaskId); + double min = expectedWeight * (1 - maxDriftPercentage / 100); + double max = expectedWeight * (1 + maxDriftPercentage / 100); + assertThat(actualNormalizedWeights.get(subtaskId)) + .as( + "Subtask %d weight should within %.1f percent of the expected range %s", + subtaskId, maxDriftPercentage, expectedWeight) + .isBetween(min, max); + } + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestRangePartitioner.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestRangePartitioner.java new file mode 100644 index 000000000000..0485fdb7fa04 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestRangePartitioner.java @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.shuffle; + +import static org.apache.iceberg.flink.sink.shuffle.Fixtures.CHAR_KEYS; +import static org.apache.iceberg.flink.sink.shuffle.Fixtures.SCHEMA; +import static org.apache.iceberg.flink.sink.shuffle.Fixtures.SORT_ORDER; +import static org.assertj.core.api.Assertions.assertThat; + +import java.util.Set; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.StringData; +import org.apache.iceberg.SortKey; +import org.apache.iceberg.relocated.com.google.common.collect.Sets; +import org.junit.jupiter.api.Test; + +public class TestRangePartitioner { + private final int numPartitions = 4; + + @Test + public void testRoundRobinRecordsBeforeStatisticsAvailable() { + RangePartitioner partitioner = new RangePartitioner(SCHEMA, SORT_ORDER); + Set results = Sets.newHashSetWithExpectedSize(numPartitions); + for (int i = 0; i < numPartitions; ++i) { + results.add( + partitioner.partition( + StatisticsOrRecord.fromRecord(GenericRowData.of(StringData.fromString("a"), 1)), + numPartitions)); + } + + // round-robin. every partition should get an assignment + assertThat(results).containsExactlyInAnyOrder(0, 1, 2, 3); + } + + @Test + public void testRoundRobinStatisticsWrapper() { + RangePartitioner partitioner = new RangePartitioner(SCHEMA, SORT_ORDER); + Set results = Sets.newHashSetWithExpectedSize(numPartitions); + for (int i = 0; i < numPartitions; ++i) { + GlobalStatistics statistics = + GlobalStatistics.fromRangeBounds(1L, new SortKey[] {CHAR_KEYS.get("a")}); + results.add( + partitioner.partition(StatisticsOrRecord.fromStatistics(statistics), numPartitions)); + } + + // round-robin. every partition should get an assignment + assertThat(results).containsExactlyInAnyOrder(0, 1, 2, 3); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchDataStatistics.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchDataStatistics.java new file mode 100644 index 000000000000..396bfae2f13c --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchDataStatistics.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.shuffle; + +import static org.apache.iceberg.flink.sink.shuffle.Fixtures.CHAR_KEYS; +import static org.apache.iceberg.flink.sink.shuffle.Fixtures.ROW_WRAPPER; +import static org.assertj.core.api.Assertions.assertThat; + +import org.apache.datasketches.sampling.ReservoirItemsSketch; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.StringData; +import org.apache.iceberg.SortKey; +import org.junit.jupiter.api.Test; + +public class TestSketchDataStatistics { + @SuppressWarnings("unchecked") + @Test + public void testAddsAndGet() { + SketchDataStatistics dataStatistics = new SketchDataStatistics(128); + + GenericRowData reusedRow = GenericRowData.of(StringData.fromString("a"), 1); + Fixtures.SORT_KEY.wrap(ROW_WRAPPER.wrap(reusedRow)); + dataStatistics.add(Fixtures.SORT_KEY); + + reusedRow.setField(0, StringData.fromString("b")); + Fixtures.SORT_KEY.wrap(ROW_WRAPPER.wrap(reusedRow)); + dataStatistics.add(Fixtures.SORT_KEY); + + reusedRow.setField(0, StringData.fromString("c")); + Fixtures.SORT_KEY.wrap(ROW_WRAPPER.wrap(reusedRow)); + dataStatistics.add(Fixtures.SORT_KEY); + + reusedRow.setField(0, StringData.fromString("b")); + Fixtures.SORT_KEY.wrap(ROW_WRAPPER.wrap(reusedRow)); + dataStatistics.add(Fixtures.SORT_KEY); + + ReservoirItemsSketch actual = (ReservoirItemsSketch) dataStatistics.result(); + assertThat(actual.getSamples()) + .isEqualTo( + new SortKey[] { + CHAR_KEYS.get("a"), CHAR_KEYS.get("b"), CHAR_KEYS.get("c"), CHAR_KEYS.get("b") + }); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchRangePartitioner.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchRangePartitioner.java new file mode 100644 index 000000000000..378c6afff077 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchRangePartitioner.java @@ -0,0 +1,88 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink.shuffle; + +import static org.assertj.core.api.Assertions.assertThat; + +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.SortKey; +import org.apache.iceberg.SortOrder; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.RowDataWrapper; +import org.apache.iceberg.flink.TestFixtures; +import org.junit.jupiter.api.Test; + +public class TestSketchRangePartitioner { + // sort on the long id field + private static final SortOrder SORT_ORDER = + SortOrder.builderFor(TestFixtures.SCHEMA).asc("id").build(); + private static final SortKey SORT_KEY = new SortKey(TestFixtures.SCHEMA, SORT_ORDER); + private static final RowType ROW_TYPE = FlinkSchemaUtil.convert(TestFixtures.SCHEMA); + private static final int NUM_PARTITIONS = 16; + private static final long RANGE_STEP = 1_000; + private static final long MAX_ID = RANGE_STEP * NUM_PARTITIONS; + private static final SortKey[] RANGE_BOUNDS = createRangeBounds(); + + /** + * To understand how range bounds are used in range partitioning, here is an example for human + * ages with 4 partitions: [15, 32, 60]. The 4 ranges would be + * + *
      + *
    • age <= 15 + *
    • age > 15 && age <= 32 + *
    • age > 32 && age <= 60 + *
    • age > 60 + *
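+ * <p>As an illustration only (using these hypothetical age bounds, which are not part of the
+ * test fixtures), the lookup against such bounds can be sketched as:
+ *
+ * <pre>{@code
+ * int[] bounds = {15, 32, 60};
+ * int age = 40;
+ * int partition = 0;
+ * while (partition < bounds.length && age > bounds[partition]) {
+ *   partition++;
+ * }
+ * // partition == 2, i.e. the range 32 < age <= 60
+ * }</pre>
+ *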
    + */ + private static SortKey[] createRangeBounds() { + SortKey[] rangeBounds = new SortKey[NUM_PARTITIONS - 1]; + for (int i = 0; i < NUM_PARTITIONS - 1; ++i) { + RowData rowData = + GenericRowData.of( + StringData.fromString("data"), + RANGE_STEP * (i + 1), + StringData.fromString("2023-06-20")); + RowDataWrapper keyWrapper = new RowDataWrapper(ROW_TYPE, TestFixtures.SCHEMA.asStruct()); + keyWrapper.wrap(rowData); + SortKey sortKey = new SortKey(TestFixtures.SCHEMA, SORT_ORDER); + sortKey.wrap(keyWrapper); + rangeBounds[i] = sortKey; + } + + return rangeBounds; + } + + @Test + public void testRangePartitioningWithRangeBounds() { + SketchRangePartitioner partitioner = + new SketchRangePartitioner(TestFixtures.SCHEMA, SORT_ORDER, RANGE_BOUNDS); + GenericRowData row = + GenericRowData.of(StringData.fromString("data"), 0L, StringData.fromString("2023-06-20")); + for (long id = 0; id < MAX_ID; ++id) { + row.setField(1, id); + int partition = partitioner.partition(row, NUM_PARTITIONS); + assertThat(partition).isGreaterThanOrEqualTo(0).isLessThan(NUM_PARTITIONS); + int expectedPartition = id == 0L ? 0 : (int) ((id - 1) / RANGE_STEP); + assertThat(partition).isEqualTo(expectedPartition); + } + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchUtil.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchUtil.java new file mode 100644 index 000000000000..16202c075ea0 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchUtil.java @@ -0,0 +1,189 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink.shuffle; + +import static org.apache.iceberg.flink.sink.shuffle.Fixtures.CHAR_KEYS; +import static org.apache.iceberg.flink.sink.shuffle.Fixtures.SORT_ORDER_COMPARTOR; +import static org.assertj.core.api.Assertions.assertThat; + +import org.apache.iceberg.SortKey; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; + +public class TestSketchUtil { + @Test + public void testCoordinatorReservoirSize() { + // adjusted to over min threshold of 10_000 and is divisible by number of partitions (3) + assertThat(SketchUtil.determineCoordinatorReservoirSize(3)).isEqualTo(10_002); + // adjust to multiplier of 100 + assertThat(SketchUtil.determineCoordinatorReservoirSize(123)).isEqualTo(123_00); + // adjusted to below max threshold of 1_000_000 and is divisible by number of partitions (3) + assertThat(SketchUtil.determineCoordinatorReservoirSize(10_123)) + .isEqualTo(1_000_000 - (1_000_000 % 10_123)); + } + + @Test + public void testOperatorReservoirSize() { + assertThat(SketchUtil.determineOperatorReservoirSize(5, 3)) + .isEqualTo((10_002 * SketchUtil.OPERATOR_OVER_SAMPLE_RATIO) / 5); + assertThat(SketchUtil.determineOperatorReservoirSize(123, 123)) + .isEqualTo((123_00 * SketchUtil.OPERATOR_OVER_SAMPLE_RATIO) / 123); + assertThat(SketchUtil.determineOperatorReservoirSize(256, 123)) + .isEqualTo( + (int) Math.ceil((double) (123_00 * SketchUtil.OPERATOR_OVER_SAMPLE_RATIO) / 256)); + assertThat(SketchUtil.determineOperatorReservoirSize(5_120, 10_123)) + .isEqualTo( + (int) Math.ceil((double) (992_054 * SketchUtil.OPERATOR_OVER_SAMPLE_RATIO) / 5_120)); + } + + @Test + public void testRangeBoundsOneChannel() { + assertThat( + SketchUtil.rangeBounds( + 1, + SORT_ORDER_COMPARTOR, + new SortKey[] { + CHAR_KEYS.get("a"), + CHAR_KEYS.get("b"), + CHAR_KEYS.get("c"), + CHAR_KEYS.get("d"), + CHAR_KEYS.get("e"), + CHAR_KEYS.get("f") + })) + .isEmpty(); + } + + @Test + public void testRangeBoundsDivisible() { + assertThat( + SketchUtil.rangeBounds( + 3, + SORT_ORDER_COMPARTOR, + new SortKey[] { + CHAR_KEYS.get("a"), + CHAR_KEYS.get("b"), + CHAR_KEYS.get("c"), + CHAR_KEYS.get("d"), + CHAR_KEYS.get("e"), + CHAR_KEYS.get("f") + })) + .containsExactly(CHAR_KEYS.get("b"), CHAR_KEYS.get("d")); + } + + @Test + public void testRangeBoundsNonDivisible() { + // step is 3 = ceiling(11/4) + assertThat( + SketchUtil.rangeBounds( + 4, + SORT_ORDER_COMPARTOR, + new SortKey[] { + CHAR_KEYS.get("a"), + CHAR_KEYS.get("b"), + CHAR_KEYS.get("c"), + CHAR_KEYS.get("d"), + CHAR_KEYS.get("e"), + CHAR_KEYS.get("f"), + CHAR_KEYS.get("g"), + CHAR_KEYS.get("h"), + CHAR_KEYS.get("i"), + CHAR_KEYS.get("j"), + CHAR_KEYS.get("k"), + })) + .containsExactly(CHAR_KEYS.get("c"), CHAR_KEYS.get("f"), CHAR_KEYS.get("i")); + } + + @Test + public void testRangeBoundsSkipDuplicates() { + // step is 3 = ceiling(11/4) + assertThat( + SketchUtil.rangeBounds( + 4, + SORT_ORDER_COMPARTOR, + new SortKey[] { + CHAR_KEYS.get("a"), + CHAR_KEYS.get("b"), + CHAR_KEYS.get("c"), + CHAR_KEYS.get("c"), + CHAR_KEYS.get("c"), + CHAR_KEYS.get("c"), + CHAR_KEYS.get("g"), + CHAR_KEYS.get("h"), + CHAR_KEYS.get("i"), + CHAR_KEYS.get("j"), + CHAR_KEYS.get("k"), + })) + // skipped duplicate c's + .containsExactly(CHAR_KEYS.get("c"), CHAR_KEYS.get("g"), CHAR_KEYS.get("j")); + } + + @ParameterizedTest + @ValueSource(ints = {4, 6}) + public void testPartitioningAndScaleUp(int numPartitions) { + // Range bounds are calculated based on 4 partitions + 
SortKey[] rangeBounds = + new SortKey[] {CHAR_KEYS.get("c"), CHAR_KEYS.get("j"), CHAR_KEYS.get("m")}; + + // <= c + assertPartition(0, CHAR_KEYS.get("a"), numPartitions, rangeBounds); + assertPartition(0, CHAR_KEYS.get("c"), numPartitions, rangeBounds); + // > c && <= j + assertPartition(1, CHAR_KEYS.get("d"), numPartitions, rangeBounds); + assertPartition(1, CHAR_KEYS.get("i"), numPartitions, rangeBounds); + assertPartition(1, CHAR_KEYS.get("j"), numPartitions, rangeBounds); + // > j && <= m + assertPartition(2, CHAR_KEYS.get("k"), numPartitions, rangeBounds); + assertPartition(2, CHAR_KEYS.get("l"), numPartitions, rangeBounds); + assertPartition(2, CHAR_KEYS.get("m"), numPartitions, rangeBounds); + // > m + assertPartition(3, CHAR_KEYS.get("n"), numPartitions, rangeBounds); + assertPartition(3, CHAR_KEYS.get("z"), numPartitions, rangeBounds); + } + + @Test + public void testPartitionScaleDown() { + // Range bounds are calculated based on 4 partitions + SortKey[] rangeBounds = + new SortKey[] {CHAR_KEYS.get("c"), CHAR_KEYS.get("j"), CHAR_KEYS.get("m")}; + int numPartitions = 3; + + // <= c + assertPartition(0, CHAR_KEYS.get("a"), numPartitions, rangeBounds); + assertPartition(0, CHAR_KEYS.get("c"), numPartitions, rangeBounds); + // > c && <= j + assertPartition(1, CHAR_KEYS.get("d"), numPartitions, rangeBounds); + assertPartition(1, CHAR_KEYS.get("i"), numPartitions, rangeBounds); + assertPartition(1, CHAR_KEYS.get("j"), numPartitions, rangeBounds); + // > j && <= m + assertPartition(2, CHAR_KEYS.get("k"), numPartitions, rangeBounds); + assertPartition(2, CHAR_KEYS.get("l"), numPartitions, rangeBounds); + assertPartition(2, CHAR_KEYS.get("m"), numPartitions, rangeBounds); + // > m + // reassigns out-of-range partitions via mod (% 3 in this case) + assertPartition(0, CHAR_KEYS.get("n"), numPartitions, rangeBounds); + assertPartition(0, CHAR_KEYS.get("z"), numPartitions, rangeBounds); + } + + private static void assertPartition( + int expectedPartition, SortKey key, int numPartitions, SortKey[] rangeBounds) { + assertThat(SketchUtil.partition(key, numPartitions, rangeBounds, SORT_ORDER_COMPARTOR)) + .isEqualTo(expectedPartition); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerBase.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerBase.java new file mode 100644 index 000000000000..c7fea015142c --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerBase.java @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink.shuffle; + +import org.apache.flink.api.common.typeutils.SerializerTestBase; +import org.apache.flink.api.common.typeutils.TypeSerializer; +import org.apache.flink.table.data.GenericRowData; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SortKey; +import org.apache.iceberg.SortOrder; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.RowDataWrapper; + +public abstract class TestSortKeySerializerBase extends SerializerTestBase { + + protected abstract Schema schema(); + + protected abstract SortOrder sortOrder(); + + protected abstract GenericRowData rowData(); + + @Override + protected TypeSerializer createSerializer() { + return new SortKeySerializer(schema(), sortOrder()); + } + + @Override + protected int getLength() { + return -1; + } + + @Override + protected Class getTypeClass() { + return SortKey.class; + } + + @Override + protected SortKey[] getTestData() { + return new SortKey[] {sortKey()}; + } + + private SortKey sortKey() { + RowDataWrapper rowDataWrapper = + new RowDataWrapper(FlinkSchemaUtil.convert(schema()), schema().asStruct()); + SortKey sortKey = new SortKey(schema(), sortOrder()); + sortKey.wrap(rowDataWrapper.wrap(rowData())); + return sortKey; + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerNestedStruct.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerNestedStruct.java new file mode 100644 index 000000000000..0000688a8b55 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerNestedStruct.java @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink.shuffle; + +import org.apache.flink.table.data.GenericRowData; +import org.apache.iceberg.NullOrder; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SortDirection; +import org.apache.iceberg.SortOrder; +import org.apache.iceberg.expressions.Expressions; +import org.apache.iceberg.flink.DataGenerator; +import org.apache.iceberg.flink.DataGenerators; + +public class TestSortKeySerializerNestedStruct extends TestSortKeySerializerBase { + private final DataGenerator generator = new DataGenerators.StructOfStruct(); + + @Override + protected Schema schema() { + return generator.icebergSchema(); + } + + @Override + protected SortOrder sortOrder() { + return SortOrder.builderFor(schema()) + .asc("row_id") + .sortBy( + Expressions.bucket("struct_of_struct.id", 4), SortDirection.DESC, NullOrder.NULLS_LAST) + .sortBy( + Expressions.truncate("struct_of_struct.person_struct.name", 16), + SortDirection.ASC, + NullOrder.NULLS_FIRST) + .build(); + } + + @Override + protected GenericRowData rowData() { + return generator.generateFlinkRowData(); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerPrimitives.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerPrimitives.java new file mode 100644 index 000000000000..54cceae6e55b --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerPrimitives.java @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink.shuffle; + +import static org.assertj.core.api.AssertionsForClassTypes.assertThat; + +import org.apache.flink.core.memory.DataInputDeserializer; +import org.apache.flink.core.memory.DataOutputSerializer; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.iceberg.NullOrder; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SortDirection; +import org.apache.iceberg.SortKey; +import org.apache.iceberg.SortOrder; +import org.apache.iceberg.StructLike; +import org.apache.iceberg.expressions.Expressions; +import org.apache.iceberg.flink.DataGenerator; +import org.apache.iceberg.flink.DataGenerators; +import org.apache.iceberg.flink.RowDataWrapper; +import org.junit.jupiter.api.Test; + +public class TestSortKeySerializerPrimitives extends TestSortKeySerializerBase { + private final DataGenerator generator = new DataGenerators.Primitives(); + + @Override + protected Schema schema() { + return generator.icebergSchema(); + } + + @Override + protected SortOrder sortOrder() { + return SortOrder.builderFor(schema()) + .asc("boolean_field") + .sortBy(Expressions.bucket("int_field", 4), SortDirection.DESC, NullOrder.NULLS_LAST) + .sortBy(Expressions.truncate("string_field", 2), SortDirection.ASC, NullOrder.NULLS_FIRST) + .sortBy(Expressions.bucket("uuid_field", 16), SortDirection.ASC, NullOrder.NULLS_FIRST) + .sortBy(Expressions.hour("ts_with_zone_field"), SortDirection.ASC, NullOrder.NULLS_FIRST) + .sortBy(Expressions.day("ts_without_zone_field"), SortDirection.ASC, NullOrder.NULLS_FIRST) + // can not test HeapByteBuffer due to equality test inside SerializerTestBase + // .sortBy(Expressions.truncate("binary_field", 2), SortDirection.ASC, + // NullOrder.NULLS_FIRST) + .build(); + } + + @Override + protected GenericRowData rowData() { + return generator.generateFlinkRowData(); + } + + @Test + public void testSerializationSize() throws Exception { + RowData rowData = + GenericRowData.of(StringData.fromString("550e8400-e29b-41d4-a716-446655440000"), 1L); + RowDataWrapper rowDataWrapper = + new RowDataWrapper(Fixtures.ROW_TYPE, Fixtures.SCHEMA.asStruct()); + StructLike struct = rowDataWrapper.wrap(rowData); + SortKey sortKey = Fixtures.SORT_KEY.copy(); + sortKey.wrap(struct); + SortKeySerializer serializer = new SortKeySerializer(Fixtures.SCHEMA, Fixtures.SORT_ORDER); + DataOutputSerializer output = new DataOutputSerializer(1024); + serializer.serialize(sortKey, output); + byte[] serializedBytes = output.getCopyOfBuffer(); + assertThat(serializedBytes.length) + .as( + "Serialized bytes for sort key should be 38 bytes (34 UUID text + 4 byte integer of string length") + .isEqualTo(38); + + DataInputDeserializer input = new DataInputDeserializer(serializedBytes); + SortKey deserialized = serializer.deserialize(input); + assertThat(deserialized).isEqualTo(sortKey); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerSnapshot.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerSnapshot.java new file mode 100644 index 000000000000..c0f688f2589e --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerSnapshot.java @@ -0,0 +1,213 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.shuffle; + +import static org.apache.iceberg.flink.sink.shuffle.Fixtures.ROW_TYPE; +import static org.apache.iceberg.flink.sink.shuffle.Fixtures.SCHEMA; +import static org.apache.iceberg.flink.sink.shuffle.Fixtures.SORT_KEY; +import static org.apache.iceberg.flink.sink.shuffle.Fixtures.SORT_ORDER; +import static org.assertj.core.api.AssertionsForClassTypes.assertThat; + +import java.io.IOException; +import org.apache.flink.api.common.typeutils.TypeSerializer; +import org.apache.flink.api.common.typeutils.TypeSerializerSchemaCompatibility; +import org.apache.flink.api.common.typeutils.TypeSerializerSnapshot; +import org.apache.flink.core.memory.DataInputDeserializer; +import org.apache.flink.core.memory.DataInputView; +import org.apache.flink.core.memory.DataOutputSerializer; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SortKey; +import org.apache.iceberg.SortOrder; +import org.apache.iceberg.StructLike; +import org.apache.iceberg.flink.RowDataWrapper; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.Test; + +public class TestSortKeySerializerSnapshot { + private final Schema schema = + new Schema( + Types.NestedField.optional(1, "ts", Types.TimestampType.withoutZone()), + Types.NestedField.optional(2, "str", Types.StringType.get()), + Types.NestedField.optional(3, "int", Types.IntegerType.get()), + Types.NestedField.optional(4, "boolean", Types.BooleanType.get())); + private final SortOrder sortOrder = SortOrder.builderFor(schema).asc("str").asc("int").build(); + + @Test + public void testRestoredSerializer() throws Exception { + RowData rowData = GenericRowData.of(StringData.fromString("str"), 1); + RowDataWrapper rowDataWrapper = new RowDataWrapper(ROW_TYPE, SCHEMA.asStruct()); + StructLike struct = rowDataWrapper.wrap(rowData); + SortKey sortKey = SORT_KEY.copy(); + sortKey.wrap(struct); + + SortKeySerializer originalSerializer = new SortKeySerializer(SCHEMA, SORT_ORDER); + TypeSerializerSnapshot snapshot = + roundTrip(originalSerializer.snapshotConfiguration()); + TypeSerializer restoredSerializer = snapshot.restoreSerializer(); + + DataOutputSerializer output = new DataOutputSerializer(1024); + originalSerializer.serialize(sortKey, output); + byte[] serializedBytes = output.getCopyOfBuffer(); + + DataInputDeserializer input = new DataInputDeserializer(serializedBytes); + SortKey deserialized = restoredSerializer.deserialize(input); + assertThat(deserialized).isEqualTo(sortKey); + } + + @Test + public void testSnapshotIsCompatibleWithSameSortOrder() throws Exception { + SortKeySerializer.SortKeySerializerSnapshot oldSnapshot = + new 
SortKeySerializer.SortKeySerializerSnapshot(schema, sortOrder); + + SortKeySerializer.SortKeySerializerSnapshot newSnapshot = + roundTrip(new SortKeySerializer.SortKeySerializerSnapshot(schema, sortOrder)); + + TypeSerializerSchemaCompatibility resultCompatibility = + newSnapshot.resolveSchemaCompatibility(oldSnapshot); + assertThat(resultCompatibility.isCompatibleAsIs()).isTrue(); + } + + @Test + public void testSnapshotIsCompatibleWithRemoveNonSortField() throws Exception { + SortKeySerializer.SortKeySerializerSnapshot oldSnapshot = + new SortKeySerializer.SortKeySerializerSnapshot(schema, sortOrder); + + // removed non-sort boolean field + Schema newSchema = + new Schema( + Types.NestedField.optional(1, "ts", Types.TimestampType.withoutZone()), + Types.NestedField.optional(2, "str", Types.StringType.get()), + Types.NestedField.optional(3, "int", Types.IntegerType.get())); + SortOrder newSortOrder = SortOrder.builderFor(newSchema).asc("str").asc("int").build(); + SortKeySerializer.SortKeySerializerSnapshot newSnapshot = + roundTrip(new SortKeySerializer.SortKeySerializerSnapshot(newSchema, newSortOrder)); + + TypeSerializerSchemaCompatibility resultCompatibility = + newSnapshot.resolveSchemaCompatibility(oldSnapshot); + assertThat(resultCompatibility.isCompatibleAsIs()).isTrue(); + } + + @Test + public void testSnapshotIsCompatibleWithAddNonSortField() throws Exception { + SortKeySerializer.SortKeySerializerSnapshot oldSnapshot = + new SortKeySerializer.SortKeySerializerSnapshot(schema, sortOrder); + + // add a new non-sort float field + Schema newSchema = + new Schema( + Types.NestedField.optional(1, "ts", Types.TimestampType.withoutZone()), + Types.NestedField.optional(2, "str", Types.StringType.get()), + Types.NestedField.optional(3, "int", Types.IntegerType.get()), + Types.NestedField.optional(4, "boolean", Types.BooleanType.get()), + Types.NestedField.required(5, "float", Types.FloatType.get())); + SortOrder newSortOrder = SortOrder.builderFor(newSchema).asc("str").asc("int").build(); + SortKeySerializer.SortKeySerializerSnapshot newSnapshot = + roundTrip(new SortKeySerializer.SortKeySerializerSnapshot(newSchema, newSortOrder)); + + TypeSerializerSchemaCompatibility resultCompatibility = + newSnapshot.resolveSchemaCompatibility(oldSnapshot); + assertThat(resultCompatibility.isCompatibleAsIs()).isTrue(); + } + + @Test + public void testSnapshotIsIncompatibleWithIncompatibleSchema() throws Exception { + SortKeySerializer.SortKeySerializerSnapshot oldSnapshot = + new SortKeySerializer.SortKeySerializerSnapshot(schema, sortOrder); + + // change str field to a long type + Schema newSchema = + new Schema( + Types.NestedField.optional(1, "ts", Types.TimestampType.withoutZone()), + Types.NestedField.optional(2, "str", Types.LongType.get()), + Types.NestedField.optional(3, "int", Types.IntegerType.get()), + Types.NestedField.optional(4, "boolean", Types.BooleanType.get())); + SortOrder newSortOrder = SortOrder.builderFor(newSchema).asc("str").asc("int").build(); + // switch sort field order + SortKeySerializer.SortKeySerializerSnapshot newSnapshot = + roundTrip(new SortKeySerializer.SortKeySerializerSnapshot(newSchema, newSortOrder)); + + TypeSerializerSchemaCompatibility resultCompatibility = + newSnapshot.resolveSchemaCompatibility(oldSnapshot); + assertThat(resultCompatibility.isIncompatible()).isTrue(); + } + + @Test + public void testSnapshotIsIncompatibleWithAddSortField() throws Exception { + SortKeySerializer.SortKeySerializerSnapshot oldSnapshot = + new 
SortKeySerializer.SortKeySerializerSnapshot(schema, sortOrder); + + // removed str field from sort order + SortOrder newSortOrder = + SortOrder.builderFor(schema).asc("str").asc("int").desc("boolean").build(); + SortKeySerializer.SortKeySerializerSnapshot newSnapshot = + roundTrip(new SortKeySerializer.SortKeySerializerSnapshot(schema, newSortOrder)); + + TypeSerializerSchemaCompatibility resultCompatibility = + newSnapshot.resolveSchemaCompatibility(oldSnapshot); + assertThat(resultCompatibility.isIncompatible()).isTrue(); + } + + @Test + public void testSnapshotIsIncompatibleWithRemoveSortField() throws Exception { + SortKeySerializer.SortKeySerializerSnapshot oldSnapshot = + new SortKeySerializer.SortKeySerializerSnapshot(schema, sortOrder); + + // remove str field from sort order + SortOrder newSortOrder = SortOrder.builderFor(schema).asc("int").build(); + SortKeySerializer.SortKeySerializerSnapshot newSnapshot = + roundTrip(new SortKeySerializer.SortKeySerializerSnapshot(schema, newSortOrder)); + + TypeSerializerSchemaCompatibility resultCompatibility = + newSnapshot.resolveSchemaCompatibility(oldSnapshot); + assertThat(resultCompatibility.isIncompatible()).isTrue(); + } + + @Test + public void testSnapshotIsIncompatibleWithSortFieldsOrderChange() throws Exception { + SortKeySerializer.SortKeySerializerSnapshot oldSnapshot = + new SortKeySerializer.SortKeySerializerSnapshot(schema, sortOrder); + + // switch sort field order + SortOrder newSortOrder = SortOrder.builderFor(schema).asc("int").asc("str").build(); + SortKeySerializer.SortKeySerializerSnapshot newSnapshot = + roundTrip(new SortKeySerializer.SortKeySerializerSnapshot(schema, newSortOrder)); + + TypeSerializerSchemaCompatibility resultCompatibility = + newSnapshot.resolveSchemaCompatibility(oldSnapshot); + assertThat(resultCompatibility.isIncompatible()).isTrue(); + } + + /** Copied from Flink {@code AvroSerializerSnapshotTest} */ + private static SortKeySerializer.SortKeySerializerSnapshot roundTrip( + TypeSerializerSnapshot original) throws IOException { + // writeSnapshot(); + DataOutputSerializer out = new DataOutputSerializer(1024); + original.writeSnapshot(out); + // init + SortKeySerializer.SortKeySerializerSnapshot restored = + new SortKeySerializer.SortKeySerializerSnapshot(); + // readSnapshot(); + DataInputView in = new DataInputDeserializer(out.wrapAsByteBuffer()); + restored.readSnapshot(restored.getCurrentVersion(), in, original.getClass().getClassLoader()); + return restored; + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeyUtil.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeyUtil.java new file mode 100644 index 000000000000..1be7e27f2c01 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeyUtil.java @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.shuffle; + +import static org.assertj.core.api.Assertions.assertThat; + +import org.apache.iceberg.NullOrder; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SortDirection; +import org.apache.iceberg.SortOrder; +import org.apache.iceberg.expressions.Expressions; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.Test; + +public class TestSortKeyUtil { + @Test + public void testResultSchema() { + Schema schema = + new Schema( + Types.NestedField.required(1, "id", Types.StringType.get()), + Types.NestedField.required(2, "ratio", Types.DoubleType.get()), + Types.NestedField.optional( + 3, + "user", + Types.StructType.of( + Types.NestedField.required(11, "name", Types.StringType.get()), + Types.NestedField.required(12, "ts", Types.TimestampType.withoutZone()), + Types.NestedField.optional(13, "device_id", Types.UUIDType.get()), + Types.NestedField.optional( + 14, + "location", + Types.StructType.of( + Types.NestedField.required(101, "lat", Types.FloatType.get()), + Types.NestedField.required(102, "long", Types.FloatType.get()), + Types.NestedField.required(103, "blob", Types.BinaryType.get())))))); + + SortOrder sortOrder = + SortOrder.builderFor(schema) + .asc("ratio") + .sortBy(Expressions.hour("user.ts"), SortDirection.ASC, NullOrder.NULLS_FIRST) + .sortBy( + Expressions.bucket("user.device_id", 16), SortDirection.ASC, NullOrder.NULLS_FIRST) + .sortBy( + Expressions.truncate("user.location.blob", 16), + SortDirection.ASC, + NullOrder.NULLS_FIRST) + .build(); + + assertThat(SortKeyUtil.sortKeySchema(schema, sortOrder).asStruct()) + .isEqualTo( + Types.StructType.of( + Types.NestedField.required(0, "ratio_0", Types.DoubleType.get()), + Types.NestedField.required(1, "ts_1", Types.IntegerType.get()), + Types.NestedField.optional(2, "device_id_2", Types.IntegerType.get()), + Types.NestedField.required(3, "blob_3", Types.BinaryType.get()))); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTableFactory.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTableFactory.java new file mode 100644 index 000000000000..a08578a4c106 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTableFactory.java @@ -0,0 +1,170 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source; + +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.function.Supplier; +import java.util.stream.Stream; +import org.apache.flink.api.java.typeutils.RowTypeInfo; +import org.apache.flink.configuration.ConfigOption; +import org.apache.flink.configuration.ConfigOptions; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.streaming.api.functions.source.SourceFunction; +import org.apache.flink.table.api.TableSchema; +import org.apache.flink.table.connector.ChangelogMode; +import org.apache.flink.table.connector.ProviderContext; +import org.apache.flink.table.connector.source.DataStreamScanProvider; +import org.apache.flink.table.connector.source.DynamicTableSource; +import org.apache.flink.table.connector.source.ScanTableSource; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.util.DataFormatConverters; +import org.apache.flink.table.factories.DynamicTableSourceFactory; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.table.utils.TableSchemaUtils; +import org.apache.flink.types.Row; +import org.apache.flink.types.RowKind; +import org.apache.iceberg.flink.util.FlinkCompatibilityUtil; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; + +public class BoundedTableFactory implements DynamicTableSourceFactory { + private static final AtomicInteger DATA_SET_ID = new AtomicInteger(0); + private static final Map>> DATA_SETS = Maps.newHashMap(); + + private static final ConfigOption DATA_ID = + ConfigOptions.key("data-id").stringType().noDefaultValue(); + + public static String registerDataSet(List> dataSet) { + String dataSetId = String.valueOf(DATA_SET_ID.incrementAndGet()); + DATA_SETS.put(dataSetId, dataSet); + return dataSetId; + } + + public static void clearDataSets() { + DATA_SETS.clear(); + } + + @Override + public DynamicTableSource createDynamicTableSource(Context context) { + TableSchema tableSchema = + TableSchemaUtils.getPhysicalSchema(context.getCatalogTable().getSchema()); + + Configuration configuration = Configuration.fromMap(context.getCatalogTable().getOptions()); + String dataId = configuration.getString(DATA_ID); + Preconditions.checkArgument( + DATA_SETS.containsKey(dataId), "data-id %s does not found in registered data set.", dataId); + + return new BoundedTableSource(DATA_SETS.get(dataId), tableSchema); + } + + @Override + public String factoryIdentifier() { + return "BoundedSource"; + } + + @Override + public Set> requiredOptions() { + return ImmutableSet.of(); + } + + @Override + public Set> optionalOptions() { + return ImmutableSet.of(DATA_ID); + } + + private static class BoundedTableSource implements ScanTableSource { + + private final List> elementsPerCheckpoint; + private final TableSchema tableSchema; + + private BoundedTableSource(List> elementsPerCheckpoint, TableSchema tableSchema) { + this.elementsPerCheckpoint = elementsPerCheckpoint; + this.tableSchema = tableSchema; + } + + private BoundedTableSource(BoundedTableSource toCopy) { + 
this.elementsPerCheckpoint = toCopy.elementsPerCheckpoint; + this.tableSchema = toCopy.tableSchema; + } + + @Override + public ChangelogMode getChangelogMode() { + Supplier> supplier = () -> elementsPerCheckpoint.stream().flatMap(List::stream); + + // Add the INSERT row kind by default. + ChangelogMode.Builder builder = ChangelogMode.newBuilder().addContainedKind(RowKind.INSERT); + + if (supplier.get().anyMatch(r -> r.getKind() == RowKind.DELETE)) { + builder.addContainedKind(RowKind.DELETE); + } + + if (supplier.get().anyMatch(r -> r.getKind() == RowKind.UPDATE_BEFORE)) { + builder.addContainedKind(RowKind.UPDATE_BEFORE); + } + + if (supplier.get().anyMatch(r -> r.getKind() == RowKind.UPDATE_AFTER)) { + builder.addContainedKind(RowKind.UPDATE_AFTER); + } + + return builder.build(); + } + + @Override + public ScanRuntimeProvider getScanRuntimeProvider(ScanContext runtimeProviderContext) { + return new DataStreamScanProvider() { + @Override + public DataStream produceDataStream( + ProviderContext providerContext, StreamExecutionEnvironment env) { + boolean checkpointEnabled = env.getCheckpointConfig().isCheckpointingEnabled(); + SourceFunction source = + new BoundedTestSource<>(elementsPerCheckpoint, checkpointEnabled); + + RowType rowType = (RowType) tableSchema.toRowDataType().getLogicalType(); + // Converter to convert the Row to RowData. + DataFormatConverters.RowConverter rowConverter = + new DataFormatConverters.RowConverter(tableSchema.getFieldDataTypes()); + + return env.addSource(source, new RowTypeInfo(tableSchema.getFieldTypes())) + .map(rowConverter::toInternal, FlinkCompatibilityUtil.toTypeInfo(rowType)); + } + + @Override + public boolean isBounded() { + return true; + } + }; + } + + @Override + public DynamicTableSource copy() { + return new BoundedTableSource(this); + } + + @Override + public String asSummaryString() { + return "Bounded test table source"; + } + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTestSource.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTestSource.java new file mode 100644 index 000000000000..7b435d059845 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTestSource.java @@ -0,0 +1,108 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source; + +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.concurrent.atomic.AtomicInteger; +import org.apache.flink.api.common.state.CheckpointListener; +import org.apache.flink.streaming.api.functions.source.SourceFunction; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; + +/** + * A stream source that: 1) emits the elements from elementsPerCheckpoint.get(0) without allowing + * checkpoints. 2) then waits for the checkpoint to complete. 3) emits the elements from + * elementsPerCheckpoint.get(1) without allowing checkpoints. 4) then waits for the checkpoint to + * complete. 5) ... + * + *

    Util all the list from elementsPerCheckpoint are exhausted. + */ +public final class BoundedTestSource implements SourceFunction, CheckpointListener { + + private final List> elementsPerCheckpoint; + private final boolean checkpointEnabled; + private volatile boolean running = true; + + private final AtomicInteger numCheckpointsComplete = new AtomicInteger(0); + + /** Emits all those elements in several checkpoints. */ + public BoundedTestSource(List> elementsPerCheckpoint, boolean checkpointEnabled) { + this.elementsPerCheckpoint = elementsPerCheckpoint; + this.checkpointEnabled = checkpointEnabled; + } + + public BoundedTestSource(List> elementsPerCheckpoint) { + this(elementsPerCheckpoint, true); + } + + /** Emits all those elements in a single checkpoint. */ + public BoundedTestSource(T... elements) { + this(Collections.singletonList(Arrays.asList(elements))); + } + + @Override + public void run(SourceContext ctx) throws Exception { + if (!checkpointEnabled) { + Preconditions.checkArgument( + elementsPerCheckpoint.size() <= 1, + "There should be at most one list in the elementsPerCheckpoint when checkpoint is disabled."); + elementsPerCheckpoint.stream().flatMap(List::stream).forEach(ctx::collect); + return; + } + + for (List elements : elementsPerCheckpoint) { + + final int checkpointToAwait; + synchronized (ctx.getCheckpointLock()) { + // Let's say checkpointToAwait = numCheckpointsComplete.get() + delta, in fact the value of + // delta should not + // affect the final table records because we only need to make sure that there will be + // exactly + // elementsPerCheckpoint.size() checkpoints to emit each records buffer from the original + // elementsPerCheckpoint. + // Even if the checkpoints that emitted results are not continuous, the correctness of the + // data should not be + // affected in the end. Setting the delta to be 2 is introducing the variable that produce + // un-continuous + // checkpoints that emit the records buffer from elementsPerCheckpoints. + checkpointToAwait = numCheckpointsComplete.get() + 2; + for (T element : elements) { + ctx.collect(element); + } + } + + synchronized (ctx.getCheckpointLock()) { + while (running && numCheckpointsComplete.get() < checkpointToAwait) { + ctx.getCheckpointLock().wait(1); + } + } + } + } + + @Override + public void notifyCheckpointComplete(long checkpointId) throws Exception { + numCheckpointsComplete.incrementAndGet(); + } + + @Override + public void cancel() { + running = false; + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/ChangeLogTableTestBase.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/ChangeLogTableTestBase.java new file mode 100644 index 000000000000..5dfbbe3abe73 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/ChangeLogTableTestBase.java @@ -0,0 +1,95 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.util.List; +import java.util.stream.Collectors; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.EnvironmentSettings; +import org.apache.flink.table.api.TableEnvironment; +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; +import org.apache.flink.types.Row; +import org.apache.flink.types.RowKind; +import org.apache.iceberg.flink.MiniFlinkClusterExtension; +import org.apache.iceberg.flink.TestBase; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.TestInfo; + +public class ChangeLogTableTestBase extends TestBase { + private volatile TableEnvironment tEnv = null; + + protected String tableName; + + @BeforeEach + public void setup(TestInfo testInfo) { + assertThat(testInfo.getTestMethod()).isPresent(); + this.tableName = testInfo.getTestMethod().get().getName(); + } + + @AfterEach + public void clean() { + sql("DROP TABLE IF EXISTS %s", tableName); + BoundedTableFactory.clearDataSets(); + } + + @Override + protected TableEnvironment getTableEnv() { + if (tEnv == null) { + synchronized (this) { + if (tEnv == null) { + EnvironmentSettings settings = + EnvironmentSettings.newInstance().inStreamingMode().build(); + + StreamExecutionEnvironment env = + StreamExecutionEnvironment.getExecutionEnvironment( + MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG) + .enableCheckpointing(400) + .setMaxParallelism(1) + .setParallelism(1); + + tEnv = StreamTableEnvironment.create(env, settings); + } + } + } + return tEnv; + } + + protected static Row insertRow(Object... values) { + return Row.ofKind(RowKind.INSERT, values); + } + + protected static Row deleteRow(Object... values) { + return Row.ofKind(RowKind.DELETE, values); + } + + protected static Row updateBeforeRow(Object... values) { + return Row.ofKind(RowKind.UPDATE_BEFORE, values); + } + + protected static Row updateAfterRow(Object... values) { + return Row.ofKind(RowKind.UPDATE_AFTER, values); + } + + protected static List listJoin(List> lists) { + return lists.stream().flatMap(List::stream).collect(Collectors.toList()); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/SplitHelpers.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/SplitHelpers.java new file mode 100644 index 000000000000..540902f3cea5 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/SplitHelpers.java @@ -0,0 +1,200 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.mockito.Mockito.doReturn; +import static org.mockito.Mockito.spy; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Path; +import java.util.List; +import java.util.stream.Collectors; +import org.apache.hadoop.conf.Configuration; +import org.apache.iceberg.BaseCombinedScanTask; +import org.apache.iceberg.BaseFileScanTask; +import org.apache.iceberg.CombinedScanTask; +import org.apache.iceberg.DeleteFile; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.FileMetadata; +import org.apache.iceberg.FileScanTask; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.PartitionSpecParser; +import org.apache.iceberg.SchemaParser; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.data.GenericAppenderHelper; +import org.apache.iceberg.data.RandomGenericData; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.expressions.ResidualEvaluator; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; +import org.apache.iceberg.hadoop.HadoopCatalog; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.util.ThreadPools; + +public class SplitHelpers { + + private SplitHelpers() {} + + /** + * This create a list of IcebergSourceSplit from real files + *

  • Create a new Hadoop table under the {@code temporaryFolder} + *
  • Write {@code fileCount} data files to the new Iceberg table + *
  • Discover the splits from the table and partition them by the {@code filesPerSplit} + * limit + *
  • Delete the Hadoop table + * + *

    Since the table and data files are deleted before this method returns, callers shouldn't + * attempt to read the data files. + * + *
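+ * <p>A minimal usage sketch (assuming the calling test exposes a JUnit {@code @TempDir Path
+ * temporaryFolder}):
+ *
+ * <pre>{@code
+ * // write 4 small data files and pack at most 2 files into each resulting split
+ * List<IcebergSourceSplit> splits =
+ *     SplitHelpers.createSplitsFromTransientHadoopTable(temporaryFolder, 4, 2);
+ * }</pre>
+ *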

    By default, v1 Iceberg table is created. For v2 table use {@link + * SplitHelpers#createSplitsFromTransientHadoopTable(Path, int, int, String)} + * + * @param temporaryFolder Folder to place the data to + * @param fileCount The number of files to create and add to the table + * @param filesPerSplit The number of files used for a split + */ + public static List createSplitsFromTransientHadoopTable( + Path temporaryFolder, int fileCount, int filesPerSplit) throws Exception { + return createSplitsFromTransientHadoopTable(temporaryFolder, fileCount, filesPerSplit, "1"); + } + + /** + * This create a list of IcebergSourceSplit from real files + *

  • Create a new Hadoop table under the {@code temporaryFolder} + *
  • Write {@code fileCount} data files to the new Iceberg table + *
  • Discover the splits from the table and partition them by the {@code filesPerSplit} + * limit + *
  • Delete the Hadoop table + * + *

    Since the table and data files are deleted before this method return, caller shouldn't + * attempt to read the data files. + * + * @param temporaryFolder Folder to place the data to + * @param fileCount The number of files to create and add to the table + * @param filesPerSplit The number of files used for a split + * @param version The table version to create + */ + public static List createSplitsFromTransientHadoopTable( + Path temporaryFolder, int fileCount, int filesPerSplit, String version) throws Exception { + final File warehouseFile = File.createTempFile("junit", null, temporaryFolder.toFile()); + assertThat(warehouseFile.delete()).isTrue(); + final String warehouse = "file:" + warehouseFile; + Configuration hadoopConf = new Configuration(); + final HadoopCatalog catalog = new HadoopCatalog(hadoopConf, warehouse); + ImmutableMap properties = + ImmutableMap.of(TableProperties.FORMAT_VERSION, version); + try { + final Table table = + catalog.createTable( + TestFixtures.TABLE_IDENTIFIER, + TestFixtures.SCHEMA, + PartitionSpec.unpartitioned(), + null, + properties); + final GenericAppenderHelper dataAppender = + new GenericAppenderHelper(table, FileFormat.PARQUET, temporaryFolder); + for (int i = 0; i < fileCount; ++i) { + List records = RandomGenericData.generate(TestFixtures.SCHEMA, 2, i); + dataAppender.appendToTable(records); + } + + final ScanContext scanContext = ScanContext.builder().build(); + final List splits = + FlinkSplitPlanner.planIcebergSourceSplits( + table, scanContext, ThreadPools.getWorkerPool()); + return splits.stream() + .flatMap( + split -> { + List> filesList = + Lists.partition(Lists.newArrayList(split.task().files()), filesPerSplit); + return filesList.stream() + .map(files -> new BaseCombinedScanTask(files)) + .map( + combinedScanTask -> + IcebergSourceSplit.fromCombinedScanTask(combinedScanTask)); + }) + .collect(Collectors.toList()); + } finally { + catalog.dropTable(TestFixtures.TABLE_IDENTIFIER); + catalog.close(); + } + } + + /** + * This method will equip the {@code icebergSourceSplits} with mock delete files. + *
  • For each split, create {@code deleteFilesPerSplit} delete files + *
  • Replace the original {@code FileScanTask} with a new {@code FileScanTask} that includes the mock delete files + *
  • Caller should not attempt to read the deleted files since they are created as mock, and + * they are not real files + * + * @param icebergSourceSplits The real splits to equip with mock delete files + * @param temporaryFolder The temporary folder to create the mock delete files with + * @param deleteFilesPerSplit The number of delete files to create for each split + * @return The list of re-created splits with mock delete files + * @throws IOException If there is any error creating the mock delete files + */ + public static List equipSplitsWithMockDeleteFiles( + List icebergSourceSplits, Path temporaryFolder, int deleteFilesPerSplit) + throws IOException { + List icebergSourceSplitsWithMockDeleteFiles = Lists.newArrayList(); + for (IcebergSourceSplit split : icebergSourceSplits) { + final CombinedScanTask combinedScanTask = spy(split.task()); + + final List deleteFiles = Lists.newArrayList(); + final PartitionSpec spec = + PartitionSpec.builderFor(TestFixtures.SCHEMA).withSpecId(0).build(); + + for (int i = 0; i < deleteFilesPerSplit; ++i) { + final DeleteFile deleteFile = + FileMetadata.deleteFileBuilder(spec) + .withFormat(FileFormat.PARQUET) + .withPath(File.createTempFile("junit", null, temporaryFolder.toFile()).getPath()) + .ofPositionDeletes() + .withFileSizeInBytes(1000) + .withRecordCount(1000) + .build(); + deleteFiles.add(deleteFile); + } + + List newFileScanTasks = Lists.newArrayList(); + for (FileScanTask task : combinedScanTask.tasks()) { + String schemaString = SchemaParser.toJson(task.schema()); + String specString = PartitionSpecParser.toJson(task.spec()); + + BaseFileScanTask baseFileScanTask = + new BaseFileScanTask( + task.file(), + deleteFiles.toArray(new DeleteFile[] {}), + schemaString, + specString, + ResidualEvaluator.unpartitioned(task.residual())); + newFileScanTasks.add(baseFileScanTask); + } + doReturn(newFileScanTasks).when(combinedScanTask).tasks(); + icebergSourceSplitsWithMockDeleteFiles.add( + IcebergSourceSplit.fromCombinedScanTask( + combinedScanTask, split.fileOffset(), split.recordOffset())); + } + return icebergSourceSplitsWithMockDeleteFiles; + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/SqlHelpers.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/SqlHelpers.java new file mode 100644 index 000000000000..e4e48ca67f66 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/SqlHelpers.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source; + +import java.util.List; +import java.util.Map; +import org.apache.flink.table.api.TableEnvironment; +import org.apache.flink.table.api.TableResult; +import org.apache.flink.types.Row; +import org.apache.flink.util.CloseableIterator; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; + +public class SqlHelpers { + private SqlHelpers() {} + + public static List sql(TableEnvironment tableEnv, String query, Object... args) { + TableResult tableResult = tableEnv.executeSql(String.format(query, args)); + try (CloseableIterator iter = tableResult.collect()) { + List results = Lists.newArrayList(iter); + return results; + } catch (Exception e) { + throw new RuntimeException("Failed to collect table result", e); + } + } + + public static String sqlOptionsToString(Map sqlOptions) { + StringBuilder builder = new StringBuilder(); + sqlOptions.forEach((key, value) -> builder.append(optionToKv(key, value)).append(",")); + String optionStr = builder.toString(); + if (optionStr.endsWith(",")) { + optionStr = optionStr.substring(0, optionStr.length() - 1); + } + + if (!optionStr.isEmpty()) { + optionStr = String.format("/*+ OPTIONS(%s)*/", optionStr); + } + + return optionStr; + } + + private static String optionToKv(String key, Object value) { + return "'" + key + "'='" + value + "'"; + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TableSourceTestBase.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TableSourceTestBase.java new file mode 100644 index 000000000000..f89d63ac73e3 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TableSourceTestBase.java @@ -0,0 +1,105 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.File; +import java.io.IOException; +import org.apache.flink.configuration.CoreOptions; +import org.apache.flink.table.api.TableEnvironment; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Parameter; +import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.events.Listeners; +import org.apache.iceberg.events.ScanEvent; +import org.apache.iceberg.flink.FlinkConfigOptions; +import org.apache.iceberg.flink.TestBase; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.extension.ExtendWith; + +@ExtendWith(ParameterizedTestExtension.class) +public abstract class TableSourceTestBase extends TestBase { + @Parameters(name = "useFlip27Source = {0}") + protected static Object[][] parameters() { + return new Object[][] { + {false}, {true}, + }; + } + + @Parameter(index = 0) + protected boolean useFlip27Source; + + protected static final String CATALOG_NAME = "test_catalog"; + protected static final String DATABASE_NAME = "test_db"; + protected static final String TABLE_NAME = "test_table"; + protected final FileFormat format = FileFormat.AVRO; + protected int scanEventCount = 0; + protected ScanEvent lastScanEvent = null; + + @Override + protected TableEnvironment getTableEnv() { + super.getTableEnv().getConfig().getConfiguration().set(CoreOptions.DEFAULT_PARALLELISM, 1); + super.getTableEnv() + .getConfig() + .getConfiguration() + .setBoolean(FlinkConfigOptions.TABLE_EXEC_ICEBERG_USE_FLIP27_SOURCE.key(), useFlip27Source); + return super.getTableEnv(); + } + + @BeforeEach + public void before() throws IOException { + // register a scan event listener to validate pushdown + Listeners.register( + event -> { + scanEventCount += 1; + lastScanEvent = event; + }, + ScanEvent.class); + + File warehouseFile = File.createTempFile("junit", null, temporaryDirectory.toFile()); + assertThat(warehouseFile.delete()).isTrue(); + String warehouse = String.format("file:%s", warehouseFile); + + sql( + "CREATE CATALOG %s WITH ('type'='iceberg', 'catalog-type'='hadoop', 'warehouse'='%s')", + CATALOG_NAME, warehouse); + sql("USE CATALOG %s", CATALOG_NAME); + sql("CREATE DATABASE %s", DATABASE_NAME); + sql("USE %s", DATABASE_NAME); + sql( + "CREATE TABLE %s (id INT, data VARCHAR,d DOUBLE) WITH ('write.format.default'='%s')", + TABLE_NAME, format.name()); + sql( + "INSERT INTO %s VALUES (1,'iceberg',10),(2,'b',20),(3,CAST(NULL AS VARCHAR),30)", + TABLE_NAME); + + this.scanEventCount = 0; + this.lastScanEvent = null; + } + + @AfterEach + public void clean() { + sql("DROP TABLE IF EXISTS %s.%s", DATABASE_NAME, TABLE_NAME); + dropDatabase(DATABASE_NAME, true); + dropCatalog(CATALOG_NAME, true); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestBoundedTableFactory.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestBoundedTableFactory.java new file mode 100644 index 000000000000..bde751e1f87f --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestBoundedTableFactory.java @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.util.List; +import java.util.Objects; +import java.util.stream.Collectors; +import org.apache.flink.types.Row; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.Streams; +import org.junit.jupiter.api.Test; + +public class TestBoundedTableFactory extends ChangeLogTableTestBase { + + @Test + public void testEmptyDataSet() { + List> emptyDataSet = ImmutableList.of(); + + String dataId = BoundedTableFactory.registerDataSet(emptyDataSet); + sql( + "CREATE TABLE %s(id INT, data STRING) WITH ('connector'='BoundedSource', 'data-id'='%s')", + tableName, dataId); + + assertThat(sql("SELECT * FROM %s", tableName)).isEmpty(); + } + + @Test + public void testBoundedTableFactory() { + List> dataSet = + ImmutableList.of( + ImmutableList.of( + insertRow(1, "aaa"), + deleteRow(1, "aaa"), + insertRow(1, "bbb"), + insertRow(2, "aaa"), + deleteRow(2, "aaa"), + insertRow(2, "bbb")), + ImmutableList.of( + updateBeforeRow(2, "bbb"), + updateAfterRow(2, "ccc"), + deleteRow(2, "ccc"), + insertRow(2, "ddd")), + ImmutableList.of( + deleteRow(1, "bbb"), + insertRow(1, "ccc"), + deleteRow(1, "ccc"), + insertRow(1, "ddd"))); + + String dataId = BoundedTableFactory.registerDataSet(dataSet); + sql( + "CREATE TABLE %s(id INT, data STRING) WITH ('connector'='BoundedSource', 'data-id'='%s')", + tableName, dataId); + + List rowSet = dataSet.stream().flatMap(Streams::stream).collect(Collectors.toList()); + assertThat(sql("SELECT * FROM %s", tableName)).isEqualTo(rowSet); + + assertThat(sql("SELECT * FROM %s WHERE data='aaa'", tableName)) + .isEqualTo( + rowSet.stream() + .filter(r -> Objects.equals(r.getField(1), "aaa")) + .collect(Collectors.toList())); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormat.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormat.java new file mode 100644 index 000000000000..c8b65e131c33 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormat.java @@ -0,0 +1,211 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source; + +import static org.apache.iceberg.flink.SimpleDataUtil.SCHEMA; +import static org.apache.iceberg.types.Types.NestedField.required; +import static org.assertj.core.api.Assertions.assertThatThrownBy; +import static org.assertj.core.api.Assumptions.assumeThat; + +import java.io.IOException; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.DataTypes; +import org.apache.flink.table.api.TableSchema; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.types.Row; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Table; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.data.GenericAppenderHelper; +import org.apache.iceberg.data.RandomGenericData; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.flink.TestHelpers; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.TestTemplate; + +/** Test {@link FlinkInputFormat}. */ +public class TestFlinkInputFormat extends TestFlinkSource { + + @Override + protected List run( + FlinkSource.Builder formatBuilder, + Map sqlOptions, + String sqlFilter, + String... sqlSelectedFields) + throws Exception { + return runFormat(formatBuilder.tableLoader(tableLoader()).buildFormat()); + } + + @TestTemplate + public void testNestedProjection() throws Exception { + Schema schema = + new Schema( + required(1, "data", Types.StringType.get()), + required( + 2, + "nested", + Types.StructType.of( + Types.NestedField.required(3, "f1", Types.StringType.get()), + Types.NestedField.required(4, "f2", Types.StringType.get()), + Types.NestedField.required(5, "f3", Types.LongType.get()))), + required(6, "id", Types.LongType.get())); + + Table table = + CATALOG_EXTENSION.catalog().createTable(TableIdentifier.of("default", "t"), schema); + + List writeRecords = RandomGenericData.generate(schema, 2, 0L); + new GenericAppenderHelper(table, fileFormat, temporaryDirectory).appendToTable(writeRecords); + + // Schema: [data, nested[f1, f2, f3], id] + // Projection: [nested.f2, data] + // The Flink SQL output: [f2, data] + // The FlinkInputFormat output: [nested[f2], data] + + TableSchema projectedSchema = + TableSchema.builder() + .field("nested", DataTypes.ROW(DataTypes.FIELD("f2", DataTypes.STRING()))) + .field("data", DataTypes.STRING()) + .build(); + List result = + runFormat( + FlinkSource.forRowData() + .tableLoader(tableLoader()) + .project(projectedSchema) + .buildFormat()); + + List expected = Lists.newArrayList(); + for (Record record : writeRecords) { + Row nested = Row.of(((Record) record.get(1)).get(1)); + expected.add(Row.of(nested, record.get(0))); + } + + TestHelpers.assertRows(result, expected); + } + + @TestTemplate + public void testBasicProjection() throws IOException { + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get()), + Types.NestedField.optional(2, "time", Types.TimestampType.withZone())); + + Table table = + 
CATALOG_EXTENSION.catalog().createTable(TableIdentifier.of("default", "t"), writeSchema); + + List writeRecords = RandomGenericData.generate(writeSchema, 2, 0L); + new GenericAppenderHelper(table, fileFormat, temporaryDirectory).appendToTable(writeRecords); + + TableSchema projectedSchema = + TableSchema.builder() + .field("id", DataTypes.BIGINT()) + .field("data", DataTypes.STRING()) + .build(); + List result = + runFormat( + FlinkSource.forRowData() + .tableLoader(tableLoader()) + .project(projectedSchema) + .buildFormat()); + + List expected = Lists.newArrayList(); + for (Record record : writeRecords) { + expected.add(Row.of(record.get(0), record.get(1))); + } + + TestHelpers.assertRows(result, expected); + } + + @TestTemplate + public void testReadPartitionColumn() throws Exception { + assumeThat(fileFormat).as("Temporary skip ORC").isNotEqualTo(FileFormat.ORC); + + Schema nestedSchema = + new Schema( + Types.NestedField.optional(1, "id", Types.LongType.get()), + Types.NestedField.optional( + 2, + "struct", + Types.StructType.of( + Types.NestedField.optional(3, "innerId", Types.LongType.get()), + Types.NestedField.optional(4, "innerName", Types.StringType.get())))); + PartitionSpec spec = + PartitionSpec.builderFor(nestedSchema).identity("struct.innerName").build(); + + Table table = + CATALOG_EXTENSION.catalog().createTable(TestFixtures.TABLE_IDENTIFIER, nestedSchema, spec); + List records = RandomGenericData.generate(nestedSchema, 10, 0L); + GenericAppenderHelper appender = + new GenericAppenderHelper(table, fileFormat, temporaryDirectory); + for (Record record : records) { + org.apache.iceberg.TestHelpers.Row partition = + org.apache.iceberg.TestHelpers.Row.of(record.get(1, Record.class).get(1)); + appender.appendToTable(partition, Collections.singletonList(record)); + } + + TableSchema projectedSchema = + TableSchema.builder() + .field("struct", DataTypes.ROW(DataTypes.FIELD("innerName", DataTypes.STRING()))) + .build(); + List result = + runFormat( + FlinkSource.forRowData() + .tableLoader(tableLoader()) + .project(projectedSchema) + .buildFormat()); + + List expected = Lists.newArrayList(); + for (Record record : records) { + Row nested = Row.of(((Record) record.get(1)).get(1)); + expected.add(Row.of(nested)); + } + + TestHelpers.assertRows(result, expected); + } + + @TestTemplate + public void testValidation() { + CATALOG_EXTENSION.catalog().createTable(TestFixtures.TABLE_IDENTIFIER, SCHEMA); + + assertThatThrownBy( + () -> + FlinkSource.forRowData() + .env(StreamExecutionEnvironment.getExecutionEnvironment()) + .tableLoader(tableLoader()) + .streaming(false) + .endTag("tag") + .endSnapshotId(1L) + .build()) + .hasMessage("END_SNAPSHOT_ID and END_TAG cannot both be set.") + .isInstanceOf(IllegalArgumentException.class); + } + + private List runFormat(FlinkInputFormat inputFormat) throws IOException { + RowType rowType = FlinkSchemaUtil.convert(inputFormat.projectedSchema()); + return TestHelpers.readRows(inputFormat, rowType); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormatReaderDeletes.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormatReaderDeletes.java new file mode 100644 index 000000000000..1b4fc863631f --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormatReaderDeletes.java @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source; + +import java.io.IOException; +import java.util.Map; +import org.apache.flink.table.types.logical.RowType; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.iceberg.CatalogProperties; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Table; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.flink.CatalogLoader; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.RowDataWrapper; +import org.apache.iceberg.flink.TableLoader; +import org.apache.iceberg.flink.TestHelpers; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.util.StructLikeSet; + +public class TestFlinkInputFormatReaderDeletes extends TestFlinkReaderDeletesBase { + + @Override + protected StructLikeSet rowSet(String tableName, Table testTable, String... columns) + throws IOException { + Schema projected = testTable.schema().select(columns); + RowType rowType = FlinkSchemaUtil.convert(projected); + Map properties = Maps.newHashMap(); + properties.put( + CatalogProperties.WAREHOUSE_LOCATION, + hiveConf.get(HiveConf.ConfVars.METASTOREWAREHOUSE.varname)); + properties.put(CatalogProperties.URI, hiveConf.get(HiveConf.ConfVars.METASTOREURIS.varname)); + properties.put( + CatalogProperties.CLIENT_POOL_SIZE, + Integer.toString(hiveConf.getInt("iceberg.hive.client-pool-size", 5))); + CatalogLoader hiveCatalogLoader = CatalogLoader.hive(catalog.name(), hiveConf, properties); + FlinkInputFormat inputFormat = + FlinkSource.forRowData() + .tableLoader( + TableLoader.fromCatalog( + hiveCatalogLoader, TableIdentifier.of("default", tableName))) + .project(FlinkSchemaUtil.toSchema(rowType)) + .buildFormat(); + + StructLikeSet set = StructLikeSet.create(projected.asStruct()); + TestHelpers.readRowData(inputFormat, rowType) + .forEach( + rowData -> { + RowDataWrapper wrapper = new RowDataWrapper(rowType, projected.asStruct()); + set.add(wrapper.wrap(rowData)); + }); + + return set; + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkMergingMetrics.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkMergingMetrics.java new file mode 100644 index 000000000000..59a4c3118cdf --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkMergingMetrics.java @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source; + +import java.io.File; +import java.io.IOException; +import java.util.List; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.Files; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Table; +import org.apache.iceberg.TestMergingMetrics; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.HadoopCatalogExtension; +import org.apache.iceberg.flink.RowDataConverter; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.flink.sink.FlinkAppenderFactory; +import org.apache.iceberg.io.FileAppender; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.junit.jupiter.api.extension.RegisterExtension; + +public class TestFlinkMergingMetrics extends TestMergingMetrics { + + @RegisterExtension + private static final HadoopCatalogExtension CATALOG_EXTENSION = + new HadoopCatalogExtension("test_db", "test_table"); + + @Override + protected FileAppender writeAndGetAppender(List records) throws IOException { + Table table = CATALOG_EXTENSION.catalog().createTable(TestFixtures.TABLE_IDENTIFIER, SCHEMA); + RowType flinkSchema = FlinkSchemaUtil.convert(SCHEMA); + FileAppender appender = + new FlinkAppenderFactory( + table, + SCHEMA, + flinkSchema, + ImmutableMap.of(), + PartitionSpec.unpartitioned(), + null, + null, + null) + .newAppender( + Files.localOutput(File.createTempFile("junit", null, tempDir)), fileFormat); + try (FileAppender fileAppender = appender) { + records.stream().map(r -> RowDataConverter.convert(SCHEMA, r)).forEach(fileAppender::add); + } + return appender; + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkMetaDataTable.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkMetaDataTable.java new file mode 100644 index 000000000000..8352924d042a --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkMetaDataTable.java @@ -0,0 +1,813 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assumptions.assumeThat; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Path; +import java.time.Instant; +import java.util.Comparator; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; +import java.util.stream.Stream; +import org.apache.avro.generic.GenericData; +import org.apache.commons.collections.ListUtils; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.configuration.CoreOptions; +import org.apache.flink.table.api.TableEnvironment; +import org.apache.flink.types.Row; +import org.apache.iceberg.DeleteFile; +import org.apache.iceberg.FileContent; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Files; +import org.apache.iceberg.HasTableOperations; +import org.apache.iceberg.ManifestFile; +import org.apache.iceberg.MetadataTableType; +import org.apache.iceberg.MetadataTableUtils; +import org.apache.iceberg.MetricsUtil; +import org.apache.iceberg.Parameter; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Snapshot; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableMetadata; +import org.apache.iceberg.avro.Avro; +import org.apache.iceberg.catalog.Namespace; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.data.FileHelpers; +import org.apache.iceberg.data.GenericRecord; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.expressions.Expressions; +import org.apache.iceberg.flink.CatalogTestBase; +import org.apache.iceberg.flink.TestHelpers; +import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.io.InputFile; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.util.SnapshotUtil; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.io.TempDir; + +public class TestFlinkMetaDataTable extends CatalogTestBase { + private static final String TABLE_NAME = "test_table"; + private final FileFormat format = FileFormat.AVRO; + private @TempDir Path temp; + + @Parameter(index = 2) + private Boolean isPartition; + + @Parameters(name = "catalogName={0}, baseNamespace={1}, isPartition={2}") + protected static List parameters() { + List parameters = Lists.newArrayList(); + + for (Boolean isPartition : new Boolean[] {true, false}) { + String catalogName = "testhadoop"; + Namespace baseNamespace = Namespace.of("default"); + parameters.add(new Object[] {catalogName, baseNamespace, isPartition}); + } + return parameters; + } + + @Override + protected TableEnvironment getTableEnv() { + Configuration configuration = super.getTableEnv().getConfig().getConfiguration(); + configuration.set(CoreOptions.DEFAULT_PARALLELISM, 1); + return super.getTableEnv(); + } + + @BeforeEach + public void before() { + super.before(); + sql("USE CATALOG %s", catalogName); + sql("CREATE DATABASE %s", flinkDatabase); + sql("USE %s", DATABASE); + if (isPartition) { + sql( + "CREATE TABLE %s (id INT, data VARCHAR,d DOUBLE) PARTITIONED BY (data) WITH ('format-version'='2', 'write.format.default'='%s')", + TABLE_NAME, format.name()); + sql("INSERT INTO %s VALUES 
(1,'a',10),(2,'a',20)", TABLE_NAME); + sql("INSERT INTO %s VALUES (1,'b',10),(2,'b',20)", TABLE_NAME); + } else { + sql( + "CREATE TABLE %s (id INT, data VARCHAR,d DOUBLE) WITH ('format-version'='2', 'write.format.default'='%s')", + TABLE_NAME, format.name()); + sql( + "INSERT INTO %s VALUES (1,'iceberg',10),(2,'b',20),(3,CAST(NULL AS VARCHAR),30)", + TABLE_NAME); + sql("INSERT INTO %s VALUES (4,'iceberg',10)", TABLE_NAME); + } + } + + @Override + @AfterEach + public void clean() { + sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, TABLE_NAME); + dropDatabase(flinkDatabase, true); + super.clean(); + } + + @TestTemplate + public void testSnapshots() { + String sql = String.format("SELECT * FROM %s$snapshots ", TABLE_NAME); + List result = sql(sql); + + Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME)); + Iterator snapshots = table.snapshots().iterator(); + for (Row row : result) { + Snapshot next = snapshots.next(); + assertThat(((Instant) row.getField(0)).toEpochMilli()) + .as("Should have expected timestamp") + .isEqualTo(next.timestampMillis()); + assertThat(next.snapshotId()) + .as("Should have expected snapshot id") + .isEqualTo(next.snapshotId()); + assertThat(row.getField(2)).as("Should have expected parent id").isEqualTo(next.parentId()); + assertThat(row.getField(3)).as("Should have expected operation").isEqualTo(next.operation()); + assertThat(row.getField(4)) + .as("Should have expected manifest list location") + .isEqualTo(next.manifestListLocation()); + assertThat(row.getField(5)).as("Should have expected summary").isEqualTo(next.summary()); + } + } + + @TestTemplate + public void testHistory() { + String sql = String.format("SELECT * FROM %s$history ", TABLE_NAME); + List result = sql(sql); + + Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME)); + Iterator snapshots = table.snapshots().iterator(); + for (Row row : result) { + Snapshot next = snapshots.next(); + assertThat(((Instant) row.getField(0)).toEpochMilli()) + .as("Should have expected made_current_at") + .isEqualTo(next.timestampMillis()); + assertThat(row.getField(1)) + .as("Should have expected snapshot id") + .isEqualTo(next.snapshotId()); + assertThat(row.getField(2)).as("Should have expected parent id").isEqualTo(next.parentId()); + assertThat(row.getField(3)) + .as("Should have expected is current ancestor") + .isEqualTo( + SnapshotUtil.isAncestorOf( + table, table.currentSnapshot().snapshotId(), next.snapshotId())); + } + } + + @TestTemplate + public void testManifests() { + String sql = String.format("SELECT * FROM %s$manifests ", TABLE_NAME); + List result = sql(sql); + + Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME)); + List expectedDataManifests = dataManifests(table); + + for (int i = 0; i < result.size(); i++) { + Row row = result.get(i); + ManifestFile manifestFile = expectedDataManifests.get(i); + assertThat(row.getField(0)) + .as("Should have expected content") + .isEqualTo(manifestFile.content().id()); + assertThat(row.getField(1)).as("Should have expected path").isEqualTo(manifestFile.path()); + assertThat(row.getField(2)) + .as("Should have expected length") + .isEqualTo(manifestFile.length()); + assertThat(row.getField(3)) + .as("Should have expected partition_spec_id") + .isEqualTo(manifestFile.partitionSpecId()); + assertThat(row.getField(4)) + .as("Should have expected added_snapshot_id") + .isEqualTo(manifestFile.snapshotId()); + assertThat(row.getField(5)) + 
.as("Should have expected added_data_files_count") + .isEqualTo(manifestFile.addedFilesCount()); + assertThat(row.getField(6)) + .as("Should have expected existing_data_files_count") + .isEqualTo(manifestFile.existingFilesCount()); + assertThat(row.getField(7)) + .as("Should have expected deleted_data_files_count") + .isEqualTo(manifestFile.deletedFilesCount()); + } + } + + @TestTemplate + public void testAllManifests() { + Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME)); + + String sql = String.format("SELECT * FROM %s$all_manifests ", TABLE_NAME); + List result = sql(sql); + + List expectedDataManifests = allDataManifests(table); + + assertThat(expectedDataManifests).hasSize(result.size()); + for (int i = 0; i < result.size(); i++) { + Row row = result.get(i); + ManifestFile manifestFile = expectedDataManifests.get(i); + assertThat(row.getField(0)) + .as("Should have expected content") + .isEqualTo(manifestFile.content().id()); + assertThat(row.getField(1)).as("Should have expected path").isEqualTo(manifestFile.path()); + assertThat(row.getField(2)) + .as("Should have expected length") + .isEqualTo(manifestFile.length()); + assertThat(row.getField(3)) + .as("Should have expected partition_spec_id") + .isEqualTo(manifestFile.partitionSpecId()); + assertThat(row.getField(4)) + .as("Should have expected added_snapshot_id") + .isEqualTo(manifestFile.snapshotId()); + assertThat(row.getField(5)) + .as("Should have expected added_data_files_count") + .isEqualTo(manifestFile.addedFilesCount()); + assertThat(row.getField(6)) + .as("Should have expected existing_data_files_count") + .isEqualTo(manifestFile.existingFilesCount()); + assertThat(row.getField(7)) + .as("Should have expected deleted_data_files_count") + .isEqualTo(manifestFile.deletedFilesCount()); + } + } + + @TestTemplate + public void testUnPartitionedTable() throws IOException { + assumeThat(isPartition).isFalse(); + Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME)); + + Schema deleteRowSchema = table.schema().select("id"); + Record dataDelete = GenericRecord.create(deleteRowSchema); + List dataDeletes = Lists.newArrayList(dataDelete.copy("id", 1)); + File testFile = File.createTempFile("junit", null, temp.toFile()); + DeleteFile eqDeletes = + FileHelpers.writeDeleteFile( + table, Files.localOutput(testFile), dataDeletes, deleteRowSchema); + table.newRowDelta().addDeletes(eqDeletes).commit(); + + List expectedDataManifests = dataManifests(table); + List expectedDeleteManifests = deleteManifests(table); + + assertThat(expectedDataManifests).hasSize(2); + assertThat(expectedDeleteManifests).hasSize(1); + + Schema entriesTableSchema = + MetadataTableUtils.createMetadataTableInstance(table, MetadataTableType.from("entries")) + .schema(); + + // check delete files table + Schema deleteFilesTableSchema = + MetadataTableUtils.createMetadataTableInstance( + table, MetadataTableType.from("delete_files")) + .schema(); + + List deleteColumns = + deleteFilesTableSchema.columns().stream() + .map(Types.NestedField::name) + .filter(c -> !c.equals(MetricsUtil.READABLE_METRICS)) + .collect(Collectors.toList()); + String deleteNames = + deleteColumns.stream().map(n -> "`" + n + "`").collect(Collectors.joining(",")); + + deleteFilesTableSchema = deleteFilesTableSchema.select(deleteColumns); + + List actualDeleteFiles = sql("SELECT %s FROM %s$delete_files", deleteNames, TABLE_NAME); + assertThat(actualDeleteFiles).hasSize(1); + 
assertThat(expectedDeleteManifests).as("Should have 1 delete manifest").hasSize(1); + + List expectedDeleteFiles = + expectedEntries( + table, FileContent.EQUALITY_DELETES, entriesTableSchema, expectedDeleteManifests, null); + assertThat(expectedDeleteFiles).as("Should be 1 delete file manifest entry").hasSize(1); + TestHelpers.assertEquals( + deleteFilesTableSchema, expectedDeleteFiles.get(0), actualDeleteFiles.get(0)); + + // Check data files table + Schema filesTableSchema = + MetadataTableUtils.createMetadataTableInstance(table, MetadataTableType.from("files")) + .schema(); + + List columns = + filesTableSchema.columns().stream() + .map(Types.NestedField::name) + .filter(c -> !c.equals(MetricsUtil.READABLE_METRICS)) + .collect(Collectors.toList()); + String names = columns.stream().map(n -> "`" + n + "`").collect(Collectors.joining(",")); + + filesTableSchema = filesTableSchema.select(columns); + + List actualDataFiles = sql("SELECT %s FROM %s$data_files", names, TABLE_NAME); + assertThat(actualDataFiles).as("Metadata table should return 2 data file").hasSize(2); + List expectedDataFiles = + expectedEntries(table, FileContent.DATA, entriesTableSchema, expectedDataManifests, null); + assertThat(expectedDataFiles).as("Should be 2 data file manifest entry").hasSize(2); + TestHelpers.assertEquals(filesTableSchema, expectedDataFiles.get(0), actualDataFiles.get(0)); + + // check all files table + List actualFiles = sql("SELECT %s FROM %s$files ORDER BY content", names, TABLE_NAME); + assertThat(actualFiles).as("Metadata table should return 3 files").hasSize(3); + List expectedFiles = + Stream.concat(expectedDataFiles.stream(), expectedDeleteFiles.stream()) + .collect(Collectors.toList()); + assertThat(expectedFiles).as("Should have 3 files manifest entriess").hasSize(3); + TestHelpers.assertEquals(filesTableSchema, expectedFiles.get(0), actualFiles.get(0)); + TestHelpers.assertEquals(filesTableSchema, expectedFiles.get(1), actualFiles.get(1)); + } + + @TestTemplate + public void testPartitionedTable() throws Exception { + assumeThat(isPartition).isTrue(); + Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME)); + + Schema deleteRowSchema = table.schema().select("id", "data"); + Record dataDelete = GenericRecord.create(deleteRowSchema); + + Map deleteRow = Maps.newHashMap(); + deleteRow.put("id", 1); + deleteRow.put("data", "a"); + File testFile = File.createTempFile("junit", null, temp.toFile()); + DeleteFile eqDeletes = + FileHelpers.writeDeleteFile( + table, + Files.localOutput(testFile), + org.apache.iceberg.TestHelpers.Row.of("a"), + Lists.newArrayList(dataDelete.copy(deleteRow)), + deleteRowSchema); + table.newRowDelta().addDeletes(eqDeletes).commit(); + + deleteRow.put("data", "b"); + File testFile2 = File.createTempFile("junit", null, temp.toFile()); + DeleteFile eqDeletes2 = + FileHelpers.writeDeleteFile( + table, + Files.localOutput(testFile2), + org.apache.iceberg.TestHelpers.Row.of("b"), + Lists.newArrayList(dataDelete.copy(deleteRow)), + deleteRowSchema); + table.newRowDelta().addDeletes(eqDeletes2).commit(); + + Schema entriesTableSchema = + MetadataTableUtils.createMetadataTableInstance(table, MetadataTableType.from("entries")) + .schema(); + + List expectedDataManifests = dataManifests(table); + List expectedDeleteManifests = deleteManifests(table); + + assertThat(expectedDataManifests).hasSize(2); + assertThat(expectedDeleteManifests).hasSize(2); + Table deleteFilesTable = + MetadataTableUtils.createMetadataTableInstance( + table, 
MetadataTableType.from("delete_files")); + Schema filesTableSchema = deleteFilesTable.schema(); + + List columns = + filesTableSchema.columns().stream() + .map(Types.NestedField::name) + .filter(c -> !c.equals(MetricsUtil.READABLE_METRICS)) + .collect(Collectors.toList()); + String names = columns.stream().map(n -> "`" + n + "`").collect(Collectors.joining(",")); + + filesTableSchema = filesTableSchema.select(columns); + + // Check delete files table + List expectedDeleteFiles = + expectedEntries( + table, FileContent.EQUALITY_DELETES, entriesTableSchema, expectedDeleteManifests, "a"); + assertThat(expectedDeleteFiles).hasSize(1); + List actualDeleteFiles = + sql("SELECT %s FROM %s$delete_files WHERE `partition`.`data`='a'", names, TABLE_NAME); + + assertThat(actualDeleteFiles).hasSize(1); + TestHelpers.assertEquals( + filesTableSchema, expectedDeleteFiles.get(0), actualDeleteFiles.get(0)); + + // Check data files table + List expectedDataFiles = + expectedEntries(table, FileContent.DATA, entriesTableSchema, expectedDataManifests, "a"); + assertThat(expectedDataFiles).hasSize(1); + List actualDataFiles = + sql("SELECT %s FROM %s$data_files WHERE `partition`.`data`='a'", names, TABLE_NAME); + assertThat(actualDataFiles).hasSize(1); + TestHelpers.assertEquals(filesTableSchema, expectedDataFiles.get(0), actualDataFiles.get(0)); + + List actualPartitionsWithProjection = + sql("SELECT file_count FROM %s$partitions ", TABLE_NAME); + assertThat(actualPartitionsWithProjection).hasSize(2); + for (int i = 0; i < 2; ++i) { + assertThat(actualPartitionsWithProjection.get(i).getField(0)).isEqualTo(1); + } + + // Check files table + List expectedFiles = + Stream.concat(expectedDataFiles.stream(), expectedDeleteFiles.stream()) + .collect(Collectors.toList()); + assertThat(expectedFiles).hasSize(2); + List actualFiles = + sql( + "SELECT %s FROM %s$files WHERE `partition`.`data`='a' ORDER BY content", + names, TABLE_NAME); + assertThat(actualFiles).hasSize(2); + TestHelpers.assertEquals(filesTableSchema, expectedFiles.get(0), actualFiles.get(0)); + TestHelpers.assertEquals(filesTableSchema, expectedFiles.get(1), actualFiles.get(1)); + } + + @TestTemplate + public void testAllFilesUnpartitioned() throws Exception { + assumeThat(isPartition).isFalse(); + Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME)); + + Schema deleteRowSchema = table.schema().select("id", "data"); + Record dataDelete = GenericRecord.create(deleteRowSchema); + + Map deleteRow = Maps.newHashMap(); + deleteRow.put("id", 1); + File testFile = File.createTempFile("junit", null, temp.toFile()); + DeleteFile eqDeletes = + FileHelpers.writeDeleteFile( + table, + Files.localOutput(testFile), + Lists.newArrayList(dataDelete.copy(deleteRow)), + deleteRowSchema); + table.newRowDelta().addDeletes(eqDeletes).commit(); + + List expectedDataManifests = dataManifests(table); + assertThat(expectedDataManifests).hasSize(2); + List expectedDeleteManifests = deleteManifests(table); + assertThat(expectedDeleteManifests).hasSize(1); + + // Clear table to test whether 'all_files' can read past files + table.newDelete().deleteFromRowFilter(Expressions.alwaysTrue()).commit(); + + Schema entriesTableSchema = + MetadataTableUtils.createMetadataTableInstance(table, MetadataTableType.from("entries")) + .schema(); + Schema filesTableSchema = + MetadataTableUtils.createMetadataTableInstance( + table, MetadataTableType.from("all_data_files")) + .schema(); + + List columns = + filesTableSchema.columns().stream() + 
.map(Types.NestedField::name) + .filter(c -> !c.equals(MetricsUtil.READABLE_METRICS)) + .collect(Collectors.toList()); + String names = columns.stream().map(n -> "`" + n + "`").collect(Collectors.joining(",")); + + filesTableSchema = filesTableSchema.select(columns); + + // Check all data files table + List actualDataFiles = + sql("SELECT %s FROM %s$all_data_files order by record_count ", names, TABLE_NAME); + + List expectedDataFiles = + expectedEntries(table, FileContent.DATA, entriesTableSchema, expectedDataManifests, null); + assertThat(expectedDataFiles).hasSize(2); + assertThat(actualDataFiles).hasSize(2); + TestHelpers.assertEquals(filesTableSchema, expectedDataFiles, actualDataFiles); + + // Check all delete files table + List actualDeleteFiles = sql("SELECT %s FROM %s$all_delete_files", names, TABLE_NAME); + List expectedDeleteFiles = + expectedEntries( + table, FileContent.EQUALITY_DELETES, entriesTableSchema, expectedDeleteManifests, null); + assertThat(expectedDeleteFiles).hasSize(1); + assertThat(actualDeleteFiles).hasSize(1); + TestHelpers.assertEquals( + filesTableSchema, expectedDeleteFiles.get(0), actualDeleteFiles.get(0)); + + // Check all files table + List actualFiles = + sql("SELECT %s FROM %s$all_files ORDER BY content, record_count asc", names, TABLE_NAME); + List expectedFiles = + ListUtils.union(expectedDataFiles, expectedDeleteFiles); + expectedFiles.sort(Comparator.comparing(r -> ((Integer) r.get("content")))); + assertThat(actualFiles).hasSize(3); + TestHelpers.assertEquals(filesTableSchema, expectedFiles, actualFiles); + } + + @TestTemplate + public void testAllFilesPartitioned() throws Exception { + assumeThat(!isPartition).isFalse(); + Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME)); + + // Create delete file + Schema deleteRowSchema = table.schema().select("id"); + Record dataDelete = GenericRecord.create(deleteRowSchema); + + Map deleteRow = Maps.newHashMap(); + deleteRow.put("id", 1); + File testFile = File.createTempFile("junit", null, temp.toFile()); + DeleteFile eqDeletes = + FileHelpers.writeDeleteFile( + table, + Files.localOutput(testFile), + org.apache.iceberg.TestHelpers.Row.of("a"), + Lists.newArrayList(dataDelete.copy(deleteRow)), + deleteRowSchema); + File testFile2 = File.createTempFile("junit", null, temp.toFile()); + DeleteFile eqDeletes2 = + FileHelpers.writeDeleteFile( + table, + Files.localOutput(testFile2), + org.apache.iceberg.TestHelpers.Row.of("b"), + Lists.newArrayList(dataDelete.copy(deleteRow)), + deleteRowSchema); + table.newRowDelta().addDeletes(eqDeletes).addDeletes(eqDeletes2).commit(); + + List expectedDataManifests = dataManifests(table); + assertThat(expectedDataManifests).hasSize(2); + List expectedDeleteManifests = deleteManifests(table); + assertThat(expectedDeleteManifests).hasSize(1); + // Clear table to test whether 'all_files' can read past files + table.newDelete().deleteFromRowFilter(Expressions.alwaysTrue()).commit(); + + Schema entriesTableSchema = + MetadataTableUtils.createMetadataTableInstance(table, MetadataTableType.from("entries")) + .schema(); + Schema filesTableSchema = + MetadataTableUtils.createMetadataTableInstance( + table, MetadataTableType.from("all_data_files")) + .schema(); + + List columns = + filesTableSchema.columns().stream() + .map(Types.NestedField::name) + .filter(c -> !c.equals(MetricsUtil.READABLE_METRICS)) + .collect(Collectors.toList()); + String names = columns.stream().map(n -> "`" + n + "`").collect(Collectors.joining(",")); + + 
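    // Select only the filtered columns (readable_metrics is derived and excluded) so the query
    // results line up with the raw manifest entries used as expected values.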
filesTableSchema = filesTableSchema.select(columns); + + // Check all data files table + List actualDataFiles = + sql("SELECT %s FROM %s$all_data_files WHERE `partition`.`data`='a'", names, TABLE_NAME); + List expectedDataFiles = + expectedEntries(table, FileContent.DATA, entriesTableSchema, expectedDataManifests, "a"); + assertThat(expectedDataFiles).hasSize(1); + assertThat(actualDataFiles).hasSize(1); + TestHelpers.assertEquals(filesTableSchema, expectedDataFiles.get(0), actualDataFiles.get(0)); + + // Check all delete files table + List actualDeleteFiles = + sql("SELECT %s FROM %s$all_delete_files WHERE `partition`.`data`='a'", names, TABLE_NAME); + List expectedDeleteFiles = + expectedEntries( + table, FileContent.EQUALITY_DELETES, entriesTableSchema, expectedDeleteManifests, "a"); + assertThat(expectedDeleteFiles).hasSize(1); + assertThat(actualDeleteFiles).hasSize(1); + TestHelpers.assertEquals( + filesTableSchema, expectedDeleteFiles.get(0), actualDeleteFiles.get(0)); + + // Check all files table + List actualFiles = + sql( + "SELECT %s FROM %s$all_files WHERE `partition`.`data`='a' ORDER BY content", + names, TABLE_NAME); + List expectedFiles = + ListUtils.union(expectedDataFiles, expectedDeleteFiles); + expectedFiles.sort(Comparator.comparing(r -> ((Integer) r.get("content")))); + assertThat(actualFiles).hasSize(2); + TestHelpers.assertEquals(filesTableSchema, expectedFiles, actualFiles); + } + + @TestTemplate + public void testMetadataLogEntries() { + Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME)); + + Long currentSnapshotId = table.currentSnapshot().snapshotId(); + TableMetadata tableMetadata = ((HasTableOperations) table).operations().current(); + Snapshot currentSnapshot = tableMetadata.currentSnapshot(); + Snapshot parentSnapshot = table.snapshot(currentSnapshot.parentId()); + List metadataLogEntries = + Lists.newArrayList(tableMetadata.previousFiles()); + + // Check metadataLog table + List metadataLogs = sql("SELECT * FROM %s$metadata_log_entries", TABLE_NAME); + + assertThat(metadataLogs).hasSize(3); + Row metadataLog = metadataLogs.get(0); + assertThat(metadataLog.getField("timestamp")) + .isEqualTo(Instant.ofEpochMilli(metadataLogEntries.get(0).timestampMillis())); + assertThat(metadataLog.getField("file")).isEqualTo(metadataLogEntries.get(0).file()); + assertThat(metadataLog.getField("latest_snapshot_id")).isNull(); + assertThat(metadataLog.getField("latest_schema_id")).isNull(); + assertThat(metadataLog.getField("latest_sequence_number")).isNull(); + + metadataLog = metadataLogs.get(1); + assertThat(metadataLog.getField("timestamp")) + .isEqualTo(Instant.ofEpochMilli(metadataLogEntries.get(1).timestampMillis())); + assertThat(metadataLog.getField("file")).isEqualTo(metadataLogEntries.get(1).file()); + assertThat(metadataLog.getField("latest_snapshot_id")).isEqualTo(parentSnapshot.snapshotId()); + assertThat(metadataLog.getField("latest_schema_id")).isEqualTo(parentSnapshot.schemaId()); + assertThat(metadataLog.getField("latest_sequence_number")) + .isEqualTo(parentSnapshot.sequenceNumber()); + assertThat(metadataLog.getField("latest_snapshot_id")).isEqualTo(parentSnapshot.snapshotId()); + + metadataLog = metadataLogs.get(2); + assertThat(metadataLog.getField("timestamp")) + .isEqualTo(Instant.ofEpochMilli(currentSnapshot.timestampMillis())); + assertThat(metadataLog.getField("file")).isEqualTo(tableMetadata.metadataFileLocation()); + 
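    // The newest metadata log entry should carry the current snapshot's id, schema id, and
    // sequence number.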
assertThat(metadataLog.getField("latest_snapshot_id")).isEqualTo(currentSnapshot.snapshotId()); + assertThat(metadataLog.getField("latest_schema_id")).isEqualTo(currentSnapshot.schemaId()); + assertThat(metadataLog.getField("latest_sequence_number")) + .isEqualTo(currentSnapshot.sequenceNumber()); + + // test filtering + List metadataLogWithFilters = + sql( + "SELECT * FROM %s$metadata_log_entries WHERE latest_snapshot_id = %s", + TABLE_NAME, currentSnapshotId); + assertThat(metadataLogWithFilters).hasSize(1); + metadataLog = metadataLogWithFilters.get(0); + assertThat(Instant.ofEpochMilli(tableMetadata.currentSnapshot().timestampMillis())) + .isEqualTo(metadataLog.getField("timestamp")); + + assertThat(metadataLog.getField("file")).isEqualTo(tableMetadata.metadataFileLocation()); + assertThat(metadataLog.getField("latest_snapshot_id")) + .isEqualTo(tableMetadata.currentSnapshot().snapshotId()); + assertThat(metadataLog.getField("latest_schema_id")) + .isEqualTo(tableMetadata.currentSnapshot().schemaId()); + assertThat(metadataLog.getField("latest_sequence_number")) + .isEqualTo(tableMetadata.currentSnapshot().sequenceNumber()); + + // test projection + List metadataFiles = + metadataLogEntries.stream() + .map(TableMetadata.MetadataLogEntry::file) + .collect(Collectors.toList()); + metadataFiles.add(tableMetadata.metadataFileLocation()); + List metadataLogWithProjection = + sql("SELECT file FROM %s$metadata_log_entries", TABLE_NAME); + assertThat(metadataLogWithProjection).hasSize(3); + for (int i = 0; i < metadataFiles.size(); i++) { + assertThat(metadataLogWithProjection.get(i).getField("file")).isEqualTo(metadataFiles.get(i)); + } + } + + @TestTemplate + public void testSnapshotReferencesMetatable() { + Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME)); + + Long currentSnapshotId = table.currentSnapshot().snapshotId(); + + // Create branch + table + .manageSnapshots() + .createBranch("testBranch", currentSnapshotId) + .setMaxRefAgeMs("testBranch", 10) + .setMinSnapshotsToKeep("testBranch", 20) + .setMaxSnapshotAgeMs("testBranch", 30) + .commit(); + // Create Tag + table + .manageSnapshots() + .createTag("testTag", currentSnapshotId) + .setMaxRefAgeMs("testTag", 50) + .commit(); + // Check refs table + List references = sql("SELECT * FROM %s$refs", TABLE_NAME); + List branches = sql("SELECT * FROM %s$refs WHERE type='BRANCH'", TABLE_NAME); + assertThat(references).hasSize(3); + assertThat(branches).hasSize(2); + List tags = sql("SELECT * FROM %s$refs WHERE type='TAG'", TABLE_NAME); + assertThat(tags).hasSize(1); + // Check branch entries in refs table + List mainBranch = + sql("SELECT * FROM %s$refs WHERE name='main' AND type='BRANCH'", TABLE_NAME); + assertThat((String) mainBranch.get(0).getFieldAs("name")).isEqualTo("main"); + assertThat((String) mainBranch.get(0).getFieldAs("type")).isEqualTo("BRANCH"); + assertThat((Long) mainBranch.get(0).getFieldAs("snapshot_id")).isEqualTo(currentSnapshotId); + List testBranch = + sql("SELECT * FROM %s$refs WHERE name='testBranch' AND type='BRANCH'", TABLE_NAME); + assertThat((String) testBranch.get(0).getFieldAs("name")).isEqualTo("testBranch"); + assertThat((String) testBranch.get(0).getFieldAs("type")).isEqualTo("BRANCH"); + assertThat((Long) testBranch.get(0).getFieldAs("snapshot_id")).isEqualTo(currentSnapshotId); + assertThat((Long) testBranch.get(0).getFieldAs("max_reference_age_in_ms")) + .isEqualTo(Long.valueOf(10)); + assertThat((Integer) testBranch.get(0).getFieldAs("min_snapshots_to_keep")) + 
.isEqualTo(Integer.valueOf(20)); + assertThat((Long) testBranch.get(0).getFieldAs("max_snapshot_age_in_ms")) + .isEqualTo(Long.valueOf(30)); + + // Check tag entries in refs table + List testTag = + sql("SELECT * FROM %s$refs WHERE name='testTag' AND type='TAG'", TABLE_NAME); + assertThat((String) testTag.get(0).getFieldAs("name")).isEqualTo("testTag"); + assertThat((String) testTag.get(0).getFieldAs("type")).isEqualTo("TAG"); + assertThat((Long) testTag.get(0).getFieldAs("snapshot_id")).isEqualTo(currentSnapshotId); + assertThat((Long) testTag.get(0).getFieldAs("max_reference_age_in_ms")) + .isEqualTo(Long.valueOf(50)); + // Check projection in refs table + List testTagProjection = + sql( + "SELECT name,type,snapshot_id,max_reference_age_in_ms,min_snapshots_to_keep FROM %s$refs where type='TAG'", + TABLE_NAME); + assertThat((String) testTagProjection.get(0).getFieldAs("name")).isEqualTo("testTag"); + assertThat((String) testTagProjection.get(0).getFieldAs("type")).isEqualTo("TAG"); + assertThat((Long) testTagProjection.get(0).getFieldAs("snapshot_id")) + .isEqualTo(currentSnapshotId); + assertThat((Long) testTagProjection.get(0).getFieldAs("max_reference_age_in_ms")) + .isEqualTo(Long.valueOf(50)); + assertThat((String) testTagProjection.get(0).getFieldAs("min_snapshots_to_keep")).isNull(); + List mainBranchProjection = + sql("SELECT name, type FROM %s$refs WHERE name='main' AND type = 'BRANCH'", TABLE_NAME); + assertThat((String) mainBranchProjection.get(0).getFieldAs("name")).isEqualTo("main"); + assertThat((String) mainBranchProjection.get(0).getFieldAs("type")).isEqualTo("BRANCH"); + List testBranchProjection = + sql( + "SELECT type, name, max_reference_age_in_ms, snapshot_id FROM %s$refs WHERE name='testBranch' AND type = 'BRANCH'", + TABLE_NAME); + assertThat((String) testBranchProjection.get(0).getFieldAs("name")).isEqualTo("testBranch"); + assertThat((String) testBranchProjection.get(0).getFieldAs("type")).isEqualTo("BRANCH"); + assertThat((Long) testBranchProjection.get(0).getFieldAs("snapshot_id")) + .isEqualTo(currentSnapshotId); + assertThat((Long) testBranchProjection.get(0).getFieldAs("max_reference_age_in_ms")) + .isEqualTo(Long.valueOf(10)); + } + + /** + * Find matching manifest entries of an Iceberg table + * + * @param table iceberg table + * @param expectedContent file content to populate on entries + * @param entriesTableSchema schema of Manifest entries + * @param manifestsToExplore manifests to explore of the table + * @param partValue partition value that manifest entries must match, or null to skip filtering + */ + private List expectedEntries( + Table table, + FileContent expectedContent, + Schema entriesTableSchema, + List manifestsToExplore, + String partValue) + throws IOException { + List expected = Lists.newArrayList(); + for (ManifestFile manifest : manifestsToExplore) { + InputFile in = table.io().newInputFile(manifest.path()); + try (CloseableIterable rows = + Avro.read(in).project(entriesTableSchema).build()) { + for (GenericData.Record record : rows) { + if ((Integer) record.get("status") < 2 /* added or existing */) { + GenericData.Record file = (GenericData.Record) record.get("data_file"); + if (partitionMatch(file, partValue)) { + asMetadataRecord(file, expectedContent); + expected.add(file); + } + } + } + } + } + return expected; + } + + // Populate certain fields derived in the metadata tables + private void asMetadataRecord(GenericData.Record file, FileContent content) { + file.put(0, content.id()); + file.put(3, 0); // specId + } + + private 
boolean partitionMatch(GenericData.Record file, String partValue) { + if (partValue == null) { + return true; + } + GenericData.Record partition = (GenericData.Record) file.get(4); + return partValue.equals(partition.get(0).toString()); + } + + private List dataManifests(Table table) { + return table.currentSnapshot().dataManifests(table.io()); + } + + private List allDataManifests(Table table) { + List manifests = Lists.newArrayList(); + for (Snapshot snapshot : table.snapshots()) { + manifests.addAll(snapshot.dataManifests(table.io())); + } + return manifests; + } + + private List deleteManifests(Table table) { + return table.currentSnapshot().deleteManifests(table.io()); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkReaderDeletesBase.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkReaderDeletesBase.java new file mode 100644 index 000000000000..0b5a8011ad3f --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkReaderDeletesBase.java @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source; + +import java.util.Map; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.iceberg.BaseTable; +import org.apache.iceberg.CatalogUtil; +import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableMetadata; +import org.apache.iceberg.TableOperations; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.data.DeleteReadTests; +import org.apache.iceberg.hive.HiveCatalog; +import org.apache.iceberg.hive.TestHiveMetastore; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.extension.ExtendWith; + +@ExtendWith(ParameterizedTestExtension.class) +public abstract class TestFlinkReaderDeletesBase extends DeleteReadTests { + + protected static String databaseName = "default"; + + protected static HiveConf hiveConf = null; + protected static HiveCatalog catalog = null; + private static TestHiveMetastore metastore = null; + + @BeforeAll + public static void startMetastore() { + metastore = new TestHiveMetastore(); + metastore.start(); + hiveConf = metastore.hiveConf(); + catalog = + (HiveCatalog) + CatalogUtil.loadCatalog( + HiveCatalog.class.getName(), "hive", ImmutableMap.of(), hiveConf); + } + + @AfterAll + public static void stopMetastore() throws Exception { + metastore.stop(); + catalog = null; + } + + @Override + protected Table createTable(String name, Schema schema, PartitionSpec spec) { + Map props = Maps.newHashMap(); + props.put(TableProperties.DEFAULT_FILE_FORMAT, format.name()); + + Table table = catalog.createTable(TableIdentifier.of(databaseName, name), schema, spec, props); + TableOperations ops = ((BaseTable) table).operations(); + TableMetadata meta = ops.current(); + ops.commit(meta, meta.upgradeToFormatVersion(2)); + + return table; + } + + @Override + protected void dropTable(String name) { + catalog.dropTable(TableIdentifier.of(databaseName, name)); + } + + @Override + protected boolean expectPruned() { + return false; + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScan.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScan.java new file mode 100644 index 000000000000..cf6b233dcec6 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScan.java @@ -0,0 +1,540 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.nio.file.Path; +import java.time.LocalDate; +import java.time.LocalDateTime; +import java.time.LocalTime; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import org.apache.flink.test.junit5.MiniClusterExtension; +import org.apache.flink.types.Row; +import org.apache.iceberg.AppendFiles; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Parameter; +import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Table; +import org.apache.iceberg.data.GenericAppenderHelper; +import org.apache.iceberg.data.RandomGenericData; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.expressions.Expressions; +import org.apache.iceberg.flink.HadoopCatalogExtension; +import org.apache.iceberg.flink.MiniFlinkClusterExtension; +import org.apache.iceberg.flink.TableLoader; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.flink.TestHelpers; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.util.DateTimeUtil; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.extension.ExtendWith; +import org.junit.jupiter.api.extension.RegisterExtension; +import org.junit.jupiter.api.io.TempDir; + +@ExtendWith(ParameterizedTestExtension.class) +public abstract class TestFlinkScan { + @RegisterExtension + protected static MiniClusterExtension miniClusterExtension = + MiniFlinkClusterExtension.createWithClassloaderCheckDisabled(); + + @TempDir protected Path temporaryDirectory; + + @RegisterExtension + protected static final HadoopCatalogExtension CATALOG_EXTENSION = + new HadoopCatalogExtension(TestFixtures.DATABASE, TestFixtures.TABLE); + + @Parameter protected FileFormat fileFormat; + + @Parameters(name = "format={0}") + public static Collection fileFormat() { + return Arrays.asList(FileFormat.AVRO, FileFormat.PARQUET, FileFormat.ORC); + } + + protected TableLoader tableLoader() { + return CATALOG_EXTENSION.tableLoader(); + } + + protected abstract List runWithProjection(String... 
projected) throws Exception; + + protected abstract List runWithFilter( + Expression filter, String sqlFilter, boolean caseSensitive) throws Exception; + + protected List runWithFilter(Expression filter, String sqlFilter) throws Exception { + return runWithFilter(filter, sqlFilter, true); + } + + protected abstract List runWithOptions(Map options) throws Exception; + + protected abstract List run() throws Exception; + + @TestTemplate + public void testUnpartitionedTable() throws Exception { + Table table = + CATALOG_EXTENSION.catalog().createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA); + List expectedRecords = RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L); + new GenericAppenderHelper(table, fileFormat, temporaryDirectory).appendToTable(expectedRecords); + TestHelpers.assertRecords(run(), expectedRecords, TestFixtures.SCHEMA); + } + + @TestTemplate + public void testPartitionedTable() throws Exception { + Table table = + CATALOG_EXTENSION + .catalog() + .createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA, TestFixtures.SPEC); + List expectedRecords = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 0L); + expectedRecords.get(0).set(2, "2020-03-20"); + new GenericAppenderHelper(table, fileFormat, temporaryDirectory) + .appendToTable(org.apache.iceberg.TestHelpers.Row.of("2020-03-20", 0), expectedRecords); + TestHelpers.assertRecords(run(), expectedRecords, TestFixtures.SCHEMA); + } + + @TestTemplate + public void testProjection() throws Exception { + Table table = + CATALOG_EXTENSION + .catalog() + .createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA, TestFixtures.SPEC); + List inputRecords = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 0L); + new GenericAppenderHelper(table, fileFormat, temporaryDirectory) + .appendToTable(org.apache.iceberg.TestHelpers.Row.of("2020-03-20", 0), inputRecords); + assertRows(runWithProjection("data"), Row.of(inputRecords.get(0).get(0))); + } + + @TestTemplate + public void testIdentityPartitionProjections() throws Exception { + Schema logSchema = + new Schema( + Types.NestedField.optional(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "dt", Types.StringType.get()), + Types.NestedField.optional(3, "level", Types.StringType.get()), + Types.NestedField.optional(4, "message", Types.StringType.get())); + PartitionSpec spec = + PartitionSpec.builderFor(logSchema).identity("dt").identity("level").build(); + + Table table = + CATALOG_EXTENSION.catalog().createTable(TestFixtures.TABLE_IDENTIFIER, logSchema, spec); + List inputRecords = RandomGenericData.generate(logSchema, 10, 0L); + + int idx = 0; + AppendFiles append = table.newAppend(); + for (Record record : inputRecords) { + record.set(1, "2020-03-2" + idx); + record.set(2, Integer.toString(idx)); + append.appendFile( + new GenericAppenderHelper(table, fileFormat, temporaryDirectory) + .writeFile( + org.apache.iceberg.TestHelpers.Row.of("2020-03-2" + idx, Integer.toString(idx)), + ImmutableList.of(record))); + idx += 1; + } + append.commit(); + + // individual fields + validateIdentityPartitionProjections(table, Collections.singletonList("dt"), inputRecords); + validateIdentityPartitionProjections(table, Collections.singletonList("level"), inputRecords); + validateIdentityPartitionProjections(table, Collections.singletonList("message"), inputRecords); + validateIdentityPartitionProjections(table, Collections.singletonList("id"), inputRecords); + // field pairs + validateIdentityPartitionProjections(table, Arrays.asList("dt", "message"), 
inputRecords); + validateIdentityPartitionProjections(table, Arrays.asList("level", "message"), inputRecords); + validateIdentityPartitionProjections(table, Arrays.asList("dt", "level"), inputRecords); + // out-of-order pairs + validateIdentityPartitionProjections(table, Arrays.asList("message", "dt"), inputRecords); + validateIdentityPartitionProjections(table, Arrays.asList("message", "level"), inputRecords); + validateIdentityPartitionProjections(table, Arrays.asList("level", "dt"), inputRecords); + // out-of-order triplets + validateIdentityPartitionProjections( + table, Arrays.asList("dt", "level", "message"), inputRecords); + validateIdentityPartitionProjections( + table, Arrays.asList("level", "dt", "message"), inputRecords); + validateIdentityPartitionProjections( + table, Arrays.asList("dt", "message", "level"), inputRecords); + validateIdentityPartitionProjections( + table, Arrays.asList("level", "message", "dt"), inputRecords); + validateIdentityPartitionProjections( + table, Arrays.asList("message", "dt", "level"), inputRecords); + validateIdentityPartitionProjections( + table, Arrays.asList("message", "level", "dt"), inputRecords); + } + + private void validateIdentityPartitionProjections( + Table table, List projectedFields, List inputRecords) throws Exception { + List rows = runWithProjection(projectedFields.toArray(new String[0])); + + for (int pos = 0; pos < inputRecords.size(); pos++) { + Record inputRecord = inputRecords.get(pos); + Row actualRecord = rows.get(pos); + + for (int i = 0; i < projectedFields.size(); i++) { + String name = projectedFields.get(i); + assertThat(inputRecord.getField(name)) + .as("Projected field " + name + " should match") + .isEqualTo(actualRecord.getField(i)); + } + } + } + + @TestTemplate + public void testSnapshotReads() throws Exception { + Table table = + CATALOG_EXTENSION.catalog().createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA); + + GenericAppenderHelper helper = new GenericAppenderHelper(table, fileFormat, temporaryDirectory); + + List expectedRecords = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 0L); + helper.appendToTable(expectedRecords); + long snapshotId = table.currentSnapshot().snapshotId(); + + long timestampMillis = table.currentSnapshot().timestampMillis(); + + // produce another timestamp + waitUntilAfter(timestampMillis); + helper.appendToTable(RandomGenericData.generate(TestFixtures.SCHEMA, 1, 0L)); + + TestHelpers.assertRecords( + runWithOptions(ImmutableMap.of("snapshot-id", Long.toString(snapshotId))), + expectedRecords, + TestFixtures.SCHEMA); + TestHelpers.assertRecords( + runWithOptions(ImmutableMap.of("as-of-timestamp", Long.toString(timestampMillis))), + expectedRecords, + TestFixtures.SCHEMA); + } + + @TestTemplate + public void testTagReads() throws Exception { + Table table = + CATALOG_EXTENSION.catalog().createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA); + + GenericAppenderHelper helper = new GenericAppenderHelper(table, fileFormat, temporaryDirectory); + + List expectedRecords1 = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 0L); + helper.appendToTable(expectedRecords1); + long snapshotId = table.currentSnapshot().snapshotId(); + + table.manageSnapshots().createTag("t1", snapshotId).commit(); + + TestHelpers.assertRecords( + runWithOptions(ImmutableMap.of("tag", "t1")), expectedRecords1, TestFixtures.SCHEMA); + + List expectedRecords2 = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 0L); + helper.appendToTable(expectedRecords2); + snapshotId = 
table.currentSnapshot().snapshotId(); + + table.manageSnapshots().replaceTag("t1", snapshotId).commit(); + + List expectedRecords = Lists.newArrayList(); + expectedRecords.addAll(expectedRecords1); + expectedRecords.addAll(expectedRecords2); + TestHelpers.assertRecords( + runWithOptions(ImmutableMap.of("tag", "t1")), expectedRecords, TestFixtures.SCHEMA); + } + + @TestTemplate + public void testBranchReads() throws Exception { + Table table = + CATALOG_EXTENSION.catalog().createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA); + + GenericAppenderHelper helper = new GenericAppenderHelper(table, fileFormat, temporaryDirectory); + + List expectedRecordsBase = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 0L); + helper.appendToTable(expectedRecordsBase); + long snapshotId = table.currentSnapshot().snapshotId(); + + String branchName = "b1"; + table.manageSnapshots().createBranch(branchName, snapshotId).commit(); + + List expectedRecordsForBranch = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 0L); + helper.appendToTable(branchName, expectedRecordsForBranch); + + List expectedRecordsForMain = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 0L); + helper.appendToTable(expectedRecordsForMain); + + List branchExpectedRecords = Lists.newArrayList(); + branchExpectedRecords.addAll(expectedRecordsBase); + branchExpectedRecords.addAll(expectedRecordsForBranch); + + TestHelpers.assertRecords( + runWithOptions(ImmutableMap.of("branch", branchName)), + branchExpectedRecords, + TestFixtures.SCHEMA); + + List mainExpectedRecords = Lists.newArrayList(); + mainExpectedRecords.addAll(expectedRecordsBase); + mainExpectedRecords.addAll(expectedRecordsForMain); + + TestHelpers.assertRecords(run(), mainExpectedRecords, TestFixtures.SCHEMA); + } + + @TestTemplate + public void testIncrementalReadViaTag() throws Exception { + Table table = + CATALOG_EXTENSION.catalog().createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA); + + GenericAppenderHelper helper = new GenericAppenderHelper(table, fileFormat, temporaryDirectory); + + List records1 = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 0L); + helper.appendToTable(records1); + long snapshotId1 = table.currentSnapshot().snapshotId(); + String startTag = "t1"; + table.manageSnapshots().createTag(startTag, snapshotId1).commit(); + + List records2 = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 1L); + helper.appendToTable(records2); + + List records3 = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 2L); + helper.appendToTable(records3); + long snapshotId3 = table.currentSnapshot().snapshotId(); + String endTag = "t2"; + table.manageSnapshots().createTag(endTag, snapshotId3).commit(); + + helper.appendToTable(RandomGenericData.generate(TestFixtures.SCHEMA, 1, 3L)); + + List expected = Lists.newArrayList(); + expected.addAll(records2); + expected.addAll(records3); + + TestHelpers.assertRecords( + runWithOptions( + ImmutableMap.builder() + .put("start-tag", startTag) + .put("end-tag", endTag) + .buildOrThrow()), + expected, + TestFixtures.SCHEMA); + + TestHelpers.assertRecords( + runWithOptions( + ImmutableMap.builder() + .put("start-snapshot-id", Long.toString(snapshotId1)) + .put("end-tag", endTag) + .buildOrThrow()), + expected, + TestFixtures.SCHEMA); + + TestHelpers.assertRecords( + runWithOptions( + ImmutableMap.builder() + .put("start-tag", startTag) + .put("end-snapshot-id", Long.toString(snapshotId3)) + .buildOrThrow()), + expected, + TestFixtures.SCHEMA); + + assertThatThrownBy( + () -> + runWithOptions( + 
ImmutableMap.builder() + .put("start-tag", startTag) + .put("end-tag", endTag) + .put("start-snapshot-id", Long.toString(snapshotId1)) + .buildOrThrow())) + .isInstanceOf(Exception.class) + .hasMessage("START_SNAPSHOT_ID and START_TAG cannot both be set."); + + assertThatThrownBy( + () -> + runWithOptions( + ImmutableMap.builder() + .put("start-tag", startTag) + .put("end-tag", endTag) + .put("end-snapshot-id", Long.toString(snapshotId3)) + .buildOrThrow())) + .isInstanceOf(Exception.class) + .hasMessage("END_SNAPSHOT_ID and END_TAG cannot both be set."); + } + + @TestTemplate + public void testIncrementalRead() throws Exception { + Table table = + CATALOG_EXTENSION.catalog().createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA); + + GenericAppenderHelper helper = new GenericAppenderHelper(table, fileFormat, temporaryDirectory); + + List records1 = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 0L); + helper.appendToTable(records1); + long snapshotId1 = table.currentSnapshot().snapshotId(); + + // snapshot 2 + List records2 = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 1L); + helper.appendToTable(records2); + + List records3 = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 2L); + helper.appendToTable(records3); + long snapshotId3 = table.currentSnapshot().snapshotId(); + + // snapshot 4 + helper.appendToTable(RandomGenericData.generate(TestFixtures.SCHEMA, 1, 3L)); + + List expected2 = Lists.newArrayList(); + expected2.addAll(records2); + expected2.addAll(records3); + TestHelpers.assertRecords( + runWithOptions( + ImmutableMap.builder() + .put("start-snapshot-id", Long.toString(snapshotId1)) + .put("end-snapshot-id", Long.toString(snapshotId3)) + .buildOrThrow()), + expected2, + TestFixtures.SCHEMA); + } + + @TestTemplate + public void testFilterExpPartition() throws Exception { + Table table = + CATALOG_EXTENSION + .catalog() + .createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA, TestFixtures.SPEC); + + List expectedRecords = RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L); + expectedRecords.get(0).set(2, "2020-03-20"); + expectedRecords.get(1).set(2, "2020-03-20"); + + GenericAppenderHelper helper = new GenericAppenderHelper(table, fileFormat, temporaryDirectory); + DataFile dataFile1 = + helper.writeFile(org.apache.iceberg.TestHelpers.Row.of("2020-03-20", 0), expectedRecords); + DataFile dataFile2 = + helper.writeFile( + org.apache.iceberg.TestHelpers.Row.of("2020-03-21", 0), + RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L)); + helper.appendToTable(dataFile1, dataFile2); + TestHelpers.assertRecords( + runWithFilter(Expressions.equal("dt", "2020-03-20"), "where dt='2020-03-20'", true), + expectedRecords, + TestFixtures.SCHEMA); + } + + private void testFilterExp(Expression filter, String sqlFilter, boolean caseSensitive) + throws Exception { + Table table = + CATALOG_EXTENSION.catalog().createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA); + + List expectedRecords = RandomGenericData.generate(TestFixtures.SCHEMA, 3, 0L); + expectedRecords.get(0).set(0, "a"); + expectedRecords.get(1).set(0, "b"); + expectedRecords.get(2).set(0, "c"); + + GenericAppenderHelper helper = new GenericAppenderHelper(table, fileFormat, temporaryDirectory); + DataFile dataFile = helper.writeFile(expectedRecords); + helper.appendToTable(dataFile); + + List actual = + runWithFilter(Expressions.greaterThanOrEqual("data", "b"), "where data>='b'", true); + + TestHelpers.assertRecords(actual, expectedRecords.subList(1, 3), TestFixtures.SCHEMA); + } + + 
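  // Illustrative sketch (hypothetical helper, shown only for reference, not part of this
  // patch): the abstract runWithFilter(...) exercised by the filter tests above is typically
  // realized with the IcebergSource builder calls that appear later in this patch (see
  // TestIcebergSourceBounded); the sketch assumes the same imports as that class.
  //
  //   private List<Row> boundedReadWithFilter(Table table, Expression filter) throws Exception {
  //     StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
  //     DataStream<Row> rows =
  //         env.fromSource(
  //                 IcebergSource.forRowData()
  //                     .tableLoader(tableLoader())
  //                     .assignerFactory(new SimpleSplitAssignerFactory())
  //                     .filters(Collections.singletonList(filter))
  //                     .build(),
  //                 WatermarkStrategy.noWatermarks(),
  //                 "boundedReadWithFilter",
  //                 TypeInformation.of(RowData.class))
  //             .map(new RowDataToRowMapper(FlinkSchemaUtil.convert(table.schema())));
  //     try (CloseableIterator<Row> iter = rows.executeAndCollect()) {
  //       return Lists.newArrayList(iter);
  //     }
  //   }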
@TestTemplate + public void testFilterExp() throws Exception { + testFilterExp(Expressions.greaterThanOrEqual("data", "b"), "where data>='b'", true); + } + + @TestTemplate + public void testFilterExpCaseInsensitive() throws Exception { + // sqlFilter does not support case-insensitive filtering: + // https://issues.apache.org/jira/browse/FLINK-16175 + testFilterExp(Expressions.greaterThanOrEqual("DATA", "b"), "where data>='b'", false); + } + + @TestTemplate + public void testPartitionTypes() throws Exception { + Schema typesSchema = + new Schema( + Types.NestedField.optional(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "decimal", Types.DecimalType.of(38, 18)), + Types.NestedField.optional(3, "str", Types.StringType.get()), + Types.NestedField.optional(4, "binary", Types.BinaryType.get()), + Types.NestedField.optional(5, "date", Types.DateType.get()), + Types.NestedField.optional(6, "time", Types.TimeType.get()), + Types.NestedField.optional(7, "timestamp", Types.TimestampType.withoutZone())); + PartitionSpec spec = + PartitionSpec.builderFor(typesSchema) + .identity("decimal") + .identity("str") + .identity("binary") + .identity("date") + .identity("time") + .identity("timestamp") + .build(); + + Table table = + CATALOG_EXTENSION.catalog().createTable(TestFixtures.TABLE_IDENTIFIER, typesSchema, spec); + List records = RandomGenericData.generate(typesSchema, 10, 0L); + GenericAppenderHelper appender = + new GenericAppenderHelper(table, fileFormat, temporaryDirectory); + for (Record record : records) { + org.apache.iceberg.TestHelpers.Row partition = + org.apache.iceberg.TestHelpers.Row.of( + record.get(1), + record.get(2), + record.get(3), + record.get(4) == null ? null : DateTimeUtil.daysFromDate((LocalDate) record.get(4)), + record.get(5) == null ? null : DateTimeUtil.microsFromTime((LocalTime) record.get(5)), + record.get(6) == null + ? null + : DateTimeUtil.microsFromTimestamp((LocalDateTime) record.get(6))); + appender.appendToTable(partition, Collections.singletonList(record)); + } + + TestHelpers.assertRecords(run(), records, typesSchema); + } + + @TestTemplate + public void testCustomizedFlinkDataTypes() throws Exception { + Schema schema = + new Schema( + Types.NestedField.required( + 1, + "map", + Types.MapType.ofRequired(2, 3, Types.StringType.get(), Types.StringType.get())), + Types.NestedField.required( + 4, "arr", Types.ListType.ofRequired(5, Types.StringType.get()))); + Table table = CATALOG_EXTENSION.catalog().createTable(TestFixtures.TABLE_IDENTIFIER, schema); + List records = RandomGenericData.generate(schema, 10, 0L); + GenericAppenderHelper helper = new GenericAppenderHelper(table, fileFormat, temporaryDirectory); + helper.appendToTable(records); + TestHelpers.assertRecords(run(), records, schema); + } + + private static void assertRows(List results, Row... 
expected) { + TestHelpers.assertRows(results, Arrays.asList(expected)); + } + + private static void waitUntilAfter(long timestampMillis) { + long current = System.currentTimeMillis(); + while (current <= timestampMillis) { + current = System.currentTimeMillis(); + } + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScanSql.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScanSql.java new file mode 100644 index 000000000000..1493c0932044 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScanSql.java @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source; + +import java.io.IOException; +import java.util.List; +import java.util.Map; +import org.apache.flink.table.api.EnvironmentSettings; +import org.apache.flink.table.api.TableEnvironment; +import org.apache.flink.table.api.config.TableConfigOptions; +import org.apache.flink.types.Row; +import org.junit.jupiter.api.BeforeEach; + +/** Test Flink SELECT SQLs. */ +public class TestFlinkScanSql extends TestFlinkSource { + private volatile TableEnvironment tEnv; + + @BeforeEach + public void before() throws IOException { + SqlHelpers.sql( + getTableEnv(), + "create catalog iceberg_catalog with ('type'='iceberg', 'catalog-type'='hadoop', 'warehouse'='%s')", + CATALOG_EXTENSION.warehouse()); + SqlHelpers.sql(getTableEnv(), "use catalog iceberg_catalog"); + getTableEnv() + .getConfig() + .getConfiguration() + .set(TableConfigOptions.TABLE_DYNAMIC_TABLE_OPTIONS_ENABLED, true); + } + + private TableEnvironment getTableEnv() { + if (tEnv == null) { + synchronized (this) { + if (tEnv == null) { + this.tEnv = + TableEnvironment.create(EnvironmentSettings.newInstance().inBatchMode().build()); + } + } + } + return tEnv; + } + + @Override + protected List run( + FlinkSource.Builder formatBuilder, + Map sqlOptions, + String sqlFilter, + String... sqlSelectedFields) { + String select = String.join(",", sqlSelectedFields); + String optionStr = SqlHelpers.sqlOptionsToString(sqlOptions); + return SqlHelpers.sql(getTableEnv(), "select %s from t %s %s", select, optionStr, sqlFilter); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSource.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSource.java new file mode 100644 index 000000000000..dd50170f0fd7 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSource.java @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source; + +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import org.apache.flink.table.api.TableColumn; +import org.apache.flink.table.api.TableSchema; +import org.apache.flink.types.Row; +import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; + +public abstract class TestFlinkSource extends TestFlinkScan { + + @Override + protected List runWithProjection(String... projected) throws Exception { + TableSchema.Builder builder = TableSchema.builder(); + TableSchema schema = + FlinkSchemaUtil.toSchema( + FlinkSchemaUtil.convert( + CATALOG_EXTENSION.catalog().loadTable(TestFixtures.TABLE_IDENTIFIER).schema())); + for (String field : projected) { + TableColumn column = schema.getTableColumn(field).get(); + builder.field(column.getName(), column.getType()); + } + return run(FlinkSource.forRowData().project(builder.build()), Maps.newHashMap(), "", projected); + } + + @Override + protected List runWithFilter(Expression filter, String sqlFilter, boolean caseSensitive) + throws Exception { + FlinkSource.Builder builder = + FlinkSource.forRowData().filters(Collections.singletonList(filter)); + Map options = Maps.newHashMap(); + options.put("case-sensitive", Boolean.toString(caseSensitive)); + return run(builder, options, sqlFilter, "*"); + } + + @Override + protected List runWithOptions(Map options) throws Exception { + FlinkSource.Builder builder = FlinkSource.forRowData(); + Optional.ofNullable(options.get("case-sensitive")) + .ifPresent(value -> builder.caseSensitive(Boolean.parseBoolean(value))); + Optional.ofNullable(options.get("snapshot-id")) + .ifPresent(value -> builder.snapshotId(Long.parseLong(value))); + Optional.ofNullable(options.get("tag")).ifPresent(value -> builder.tag(value)); + Optional.ofNullable(options.get("branch")).ifPresent(value -> builder.branch(value)); + Optional.ofNullable(options.get("start-tag")).ifPresent(value -> builder.startTag(value)); + Optional.ofNullable(options.get("end-tag")).ifPresent(value -> builder.endTag(value)); + Optional.ofNullable(options.get("start-snapshot-id")) + .ifPresent(value -> builder.startSnapshotId(Long.parseLong(value))); + Optional.ofNullable(options.get("end-snapshot-id")) + .ifPresent(value -> builder.endSnapshotId(Long.parseLong(value))); + Optional.ofNullable(options.get("as-of-timestamp")) + .ifPresent(value -> builder.asOfTimestamp(Long.parseLong(value))); + return run(builder, options, "", "*"); + } + + @Override + protected List run() throws Exception { + return run(FlinkSource.forRowData(), Maps.newHashMap(), "", "*"); + } + + protected abstract List run( + FlinkSource.Builder 
formatBuilder,
+      Map<String, String> sqlOptions,
+      String sqlFilter,
+      String... sqlSelectedFields)
+      throws Exception;
+}
diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSourceConfig.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSourceConfig.java
new file mode 100644
index 000000000000..14131d9e96d5
--- /dev/null
+++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSourceConfig.java
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.flink.source;
+
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.assertj.core.api.Assertions.assertThatThrownBy;
+
+import java.util.List;
+import org.apache.flink.types.Row;
+import org.apache.iceberg.flink.FlinkReadOptions;
+import org.junit.jupiter.api.TestTemplate;
+
+public class TestFlinkSourceConfig extends TableSourceTestBase {
+  private static final String TABLE = "test_table";
+
+  @TestTemplate
+  public void testFlinkSessionConfig() {
+    getTableEnv().getConfig().set(FlinkReadOptions.STREAMING_OPTION, true);
+    assertThatThrownBy(() -> sql("SELECT * FROM %s /*+ OPTIONS('as-of-timestamp'='1')*/", TABLE))
+        .isInstanceOf(IllegalArgumentException.class)
+        .hasMessage("Cannot set as-of-timestamp option for streaming reader");
+  }
+
+  @TestTemplate
+  public void testFlinkHintConfig() {
+    List<Row> result =
+        sql(
+            "SELECT * FROM %s /*+ OPTIONS('as-of-timestamp'='%d','streaming'='false')*/",
+            TABLE, System.currentTimeMillis());
+    assertThat(result).hasSize(3);
+  }
+
+  @TestTemplate
+  public void testReadOptionHierarchy() {
+    getTableEnv().getConfig().set(FlinkReadOptions.LIMIT_OPTION, 1L);
+    List<Row> result = sql("SELECT * FROM %s", TABLE);
+    // Note that this query doesn't have the limit clause in the SQL.
+    // This assertion works because the limit is pushed down to the reader and
+    // reader parallelism is 1.
+    assertThat(result).hasSize(1);
+
+    result = sql("SELECT * FROM %s /*+ OPTIONS('limit'='3')*/", TABLE);
+    assertThat(result).hasSize(3);
+  }
+}
diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSourceSql.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSourceSql.java
new file mode 100644
index 000000000000..e1162c3225b1
--- /dev/null
+++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSourceSql.java
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source; + +import java.io.IOException; +import java.util.List; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.configuration.PipelineOptions; +import org.apache.flink.table.api.config.TableConfigOptions; +import org.apache.flink.types.Row; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Table; +import org.apache.iceberg.data.GenericAppenderHelper; +import org.apache.iceberg.data.RandomGenericData; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.junit.jupiter.api.Test; + +/** Use the FlinkSource */ +public class TestFlinkSourceSql extends TestSqlBase { + @Override + public void before() throws IOException { + SqlHelpers.sql( + getTableEnv(), + "create catalog iceberg_catalog with ('type'='iceberg', 'catalog-type'='hadoop', 'warehouse'='%s')", + CATALOG_EXTENSION.warehouse()); + SqlHelpers.sql(getTableEnv(), "use catalog iceberg_catalog"); + getTableEnv() + .getConfig() + .getConfiguration() + .set(TableConfigOptions.TABLE_DYNAMIC_TABLE_OPTIONS_ENABLED, true); + } + + @Test + public void testInferParallelismWithGlobalSetting() throws IOException { + Configuration cfg = getTableEnv().getConfig().getConfiguration(); + cfg.set(PipelineOptions.MAX_PARALLELISM, 1); + + Table table = + CATALOG_EXTENSION + .catalog() + .createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA, null); + + GenericAppenderHelper helper = + new GenericAppenderHelper(table, FileFormat.PARQUET, temporaryFolder); + List expectedRecords = Lists.newArrayList(); + long maxFileLen = 0; + for (int i = 0; i < 5; i++) { + List records = RandomGenericData.generate(TestFixtures.SCHEMA, 2, i); + DataFile dataFile = helper.writeFile(null, records); + helper.appendToTable(dataFile); + expectedRecords.addAll(records); + maxFileLen = Math.max(dataFile.fileSizeInBytes(), maxFileLen); + } + + // Make sure to generate multiple CombinedScanTasks + SqlHelpers.sql( + getTableEnv(), + "ALTER TABLE t SET ('read.split.open-file-cost'='1', 'read.split.target-size'='%s')", + maxFileLen); + + List results = run(Maps.newHashMap(), "", "*"); + org.apache.iceberg.flink.TestHelpers.assertRecords( + results, expectedRecords, TestFixtures.SCHEMA); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkTableSource.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkTableSource.java new file mode 100644 index 000000000000..18528c789114 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkTableSource.java @@ -0,0 +1,561 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.util.List; +import org.apache.flink.table.api.SqlParserException; +import org.apache.flink.types.Row; +import org.apache.iceberg.expressions.Expressions; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.junit.jupiter.api.TestTemplate; + +public class TestFlinkTableSource extends TableSourceTestBase { + + @TestTemplate + public void testLimitPushDown() { + + assertThatThrownBy(() -> sql("SELECT * FROM %s LIMIT -1", TABLE_NAME)) + .isInstanceOf(SqlParserException.class) + .hasMessageStartingWith("SQL parse failed."); + + assertThat(sql("SELECT * FROM %s LIMIT 0", TABLE_NAME)).isEmpty(); + + String sqlLimitExceed = String.format("SELECT * FROM %s LIMIT 4", TABLE_NAME); + List resultExceed = sql(sqlLimitExceed); + assertThat(resultExceed).hasSize(3); + List expectedList = + Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0), Row.of(3, null, 30.0)); + assertSameElements(expectedList, resultExceed); + + String querySql = String.format("SELECT * FROM %s LIMIT 1", TABLE_NAME); + String explain = getTableEnv().explainSql(querySql); + String expectedExplain = "limit=[1]"; + assertThat(explain).as("Explain should contain LimitPushDown").contains(expectedExplain); + List result = sql(querySql); + assertThat(result).hasSize(1); + assertThat(result).containsAnyElementsOf(expectedList); + + String sqlMixed = String.format("SELECT * FROM %s WHERE id = 1 LIMIT 2", TABLE_NAME); + List mixedResult = sql(sqlMixed); + assertThat(mixedResult).hasSize(1).first().isEqualTo(Row.of(1, "iceberg", 10.0)); + } + + @TestTemplate + public void testNoFilterPushDown() { + String sql = String.format("SELECT * FROM %s ", TABLE_NAME); + List result = sql(sql); + List expectedRecords = + Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0), Row.of(3, null, 30.0)); + assertSameElements(expectedRecords, result); + assertThat(lastScanEvent.filter()) + .as("Should not push down a filter") + .isEqualTo(Expressions.alwaysTrue()); + } + + @TestTemplate + public void testFilterPushDownEqual() { + String sqlLiteralRight = String.format("SELECT * FROM %s WHERE id = 1 ", TABLE_NAME); + String expectedFilter = "ref(name=\"id\") == 1"; + + List result = sql(sqlLiteralRight); + assertThat(result).hasSize(1).first().isEqualTo(Row.of(1, "iceberg", 10.0)); + assertThat(scanEventCount).isEqualTo(1); + assertThat(lastScanEvent.filter()) + .as("Should contain the push down filter") + .asString() + .isEqualTo(expectedFilter); + } + + @TestTemplate + public void testFilterPushDownEqualNull() { + String sqlEqualNull = String.format("SELECT * FROM %s WHERE data = NULL ", TABLE_NAME); + + List result = 
sql(sqlEqualNull); + assertThat(result).isEmpty(); + assertThat(lastScanEvent).as("Should not push down a filter").isNull(); + } + + @TestTemplate + public void testFilterPushDownEqualLiteralOnLeft() { + String sqlLiteralLeft = String.format("SELECT * FROM %s WHERE 1 = id ", TABLE_NAME); + String expectedFilter = "ref(name=\"id\") == 1"; + + List resultLeft = sql(sqlLiteralLeft); + assertThat(resultLeft).hasSize(1).first().isEqualTo(Row.of(1, "iceberg", 10.0)); + assertThat(scanEventCount).isEqualTo(1); + assertThat(lastScanEvent.filter()) + .as("Should contain the push down filter") + .asString() + .isEqualTo(expectedFilter); + } + + @TestTemplate + public void testFilterPushDownNoEqual() { + String sqlNE = String.format("SELECT * FROM %s WHERE id <> 1 ", TABLE_NAME); + String expectedFilter = "ref(name=\"id\") != 1"; + + List resultNE = sql(sqlNE); + assertThat(resultNE).hasSize(2); + + List expectedNE = Lists.newArrayList(Row.of(2, "b", 20.0), Row.of(3, null, 30.0)); + assertSameElements(expectedNE, resultNE); + assertThat(scanEventCount).isEqualTo(1); + assertThat(lastScanEvent.filter()) + .as("Should contain the push down filter") + .asString() + .isEqualTo(expectedFilter); + } + + @TestTemplate + public void testFilterPushDownNoEqualNull() { + String sqlNotEqualNull = String.format("SELECT * FROM %s WHERE data <> NULL ", TABLE_NAME); + + List resultNE = sql(sqlNotEqualNull); + assertThat(resultNE).isEmpty(); + assertThat(lastScanEvent).as("Should not push down a filter").isNull(); + } + + @TestTemplate + public void testFilterPushDownAnd() { + String sqlAnd = + String.format("SELECT * FROM %s WHERE id = 1 AND data = 'iceberg' ", TABLE_NAME); + + List resultAnd = sql(sqlAnd); + assertThat(resultAnd).hasSize(1).first().isEqualTo(Row.of(1, "iceberg", 10.0)); + assertThat(scanEventCount).isEqualTo(1); + String expected = "(ref(name=\"id\") == 1 and ref(name=\"data\") == \"iceberg\")"; + assertThat(lastScanEvent.filter()) + .as("Should contain the push down filter") + .asString() + .isEqualTo(expected); + } + + @TestTemplate + public void testFilterPushDownOr() { + String sqlOr = String.format("SELECT * FROM %s WHERE id = 1 OR data = 'b' ", TABLE_NAME); + String expectedFilter = "(ref(name=\"id\") == 1 or ref(name=\"data\") == \"b\")"; + + List resultOr = sql(sqlOr); + assertThat(resultOr).hasSize(2); + + List expectedOR = Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0)); + assertSameElements(expectedOR, resultOr); + + assertThat(scanEventCount).isEqualTo(1); + assertThat(lastScanEvent.filter()) + .as("Should contain the push down filter") + .asString() + .isEqualTo(expectedFilter); + } + + @TestTemplate + public void testFilterPushDownGreaterThan() { + String sqlGT = String.format("SELECT * FROM %s WHERE id > 1 ", TABLE_NAME); + String expectedFilter = "ref(name=\"id\") > 1"; + + List resultGT = sql(sqlGT); + assertThat(resultGT).hasSize(2); + + List expectedGT = Lists.newArrayList(Row.of(2, "b", 20.0), Row.of(3, null, 30.0)); + assertSameElements(expectedGT, resultGT); + + assertThat(scanEventCount).isEqualTo(1); + assertThat(lastScanEvent.filter()) + .as("Should contain the push down filter") + .asString() + .isEqualTo(expectedFilter); + } + + @TestTemplate + public void testFilterPushDownGreaterThanNull() { + String sqlGT = String.format("SELECT * FROM %s WHERE data > null ", TABLE_NAME); + + List resultGT = sql(sqlGT); + assertThat(resultGT).isEmpty(); + assertThat(lastScanEvent).as("Should not push down a filter").isNull(); + } + + @TestTemplate + public void 
testFilterPushDownGreaterThanLiteralOnLeft() { + String sqlGT = String.format("SELECT * FROM %s WHERE 3 > id ", TABLE_NAME); + String expectedFilter = "ref(name=\"id\") < 3"; + + List resultGT = sql(sqlGT); + assertThat(resultGT).hasSize(2); + + List expectedGT = Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0)); + assertSameElements(expectedGT, resultGT); + + assertThat(scanEventCount).isEqualTo(1); + assertThat(lastScanEvent.filter()) + .as("Should contain the push down filter") + .asString() + .isEqualTo(expectedFilter); + } + + @TestTemplate + public void testFilterPushDownGreaterThanEqual() { + String sqlGTE = String.format("SELECT * FROM %s WHERE id >= 2 ", TABLE_NAME); + String expectedFilter = "ref(name=\"id\") >= 2"; + + List resultGTE = sql(sqlGTE); + assertThat(resultGTE).hasSize(2); + + List expectedGTE = Lists.newArrayList(Row.of(2, "b", 20.0), Row.of(3, null, 30.0)); + assertSameElements(expectedGTE, resultGTE); + + assertThat(scanEventCount).isEqualTo(1); + assertThat(lastScanEvent.filter()) + .as("Should contain the push down filter") + .asString() + .isEqualTo(expectedFilter); + } + + @TestTemplate + public void testFilterPushDownGreaterThanEqualNull() { + String sqlGTE = String.format("SELECT * FROM %s WHERE data >= null ", TABLE_NAME); + + List resultGT = sql(sqlGTE); + assertThat(resultGT).isEmpty(); + assertThat(lastScanEvent).as("Should not push down a filter").isNull(); + } + + @TestTemplate + public void testFilterPushDownGreaterThanEqualLiteralOnLeft() { + String sqlGTE = String.format("SELECT * FROM %s WHERE 2 >= id ", TABLE_NAME); + String expectedFilter = "ref(name=\"id\") <= 2"; + + List resultGTE = sql(sqlGTE); + assertThat(resultGTE).hasSize(2); + + List expectedGTE = Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0)); + assertSameElements(expectedGTE, resultGTE); + + assertThat(scanEventCount).isEqualTo(1); + assertThat(lastScanEvent.filter()) + .as("Should contain the push down filter") + .asString() + .isEqualTo(expectedFilter); + } + + @TestTemplate + public void testFilterPushDownLessThan() { + String sqlLT = String.format("SELECT * FROM %s WHERE id < 2 ", TABLE_NAME); + String expectedFilter = "ref(name=\"id\") < 2"; + + List resultLT = sql(sqlLT); + assertThat(resultLT).hasSize(1).first().isEqualTo(Row.of(1, "iceberg", 10.0)); + + assertThat(scanEventCount).isEqualTo(1); + assertThat(lastScanEvent.filter()) + .as("Should contain the push down filter") + .asString() + .isEqualTo(expectedFilter); + } + + @TestTemplate + public void testFilterPushDownLessThanNull() { + String sqlLT = String.format("SELECT * FROM %s WHERE data < null ", TABLE_NAME); + + List resultGT = sql(sqlLT); + assertThat(resultGT).isEmpty(); + assertThat(lastScanEvent).as("Should not push down a filter").isNull(); + } + + @TestTemplate + public void testFilterPushDownLessThanLiteralOnLeft() { + String sqlLT = String.format("SELECT * FROM %s WHERE 2 < id ", TABLE_NAME); + String expectedFilter = "ref(name=\"id\") > 2"; + + List resultLT = sql(sqlLT); + assertThat(resultLT).hasSize(1).first().isEqualTo(Row.of(3, null, 30.0)); + + assertThat(scanEventCount).isEqualTo(1); + assertThat(lastScanEvent.filter()) + .as("Should contain the push down filter") + .asString() + .isEqualTo(expectedFilter); + } + + @TestTemplate + public void testFilterPushDownLessThanEqual() { + String sqlLTE = String.format("SELECT * FROM %s WHERE id <= 1 ", TABLE_NAME); + String expectedFilter = "ref(name=\"id\") <= 1"; + + List resultLTE = sql(sqlLTE); + 
assertThat(resultLTE).hasSize(1).first().isEqualTo(Row.of(1, "iceberg", 10.0)); + + assertThat(scanEventCount).isEqualTo(1); + assertThat(lastScanEvent.filter()) + .as("Should contain the push down filter") + .asString() + .isEqualTo(expectedFilter); + } + + @TestTemplate + public void testFilterPushDownLessThanEqualNull() { + String sqlLTE = String.format("SELECT * FROM %s WHERE data <= null ", TABLE_NAME); + + List resultGT = sql(sqlLTE); + assertThat(resultGT).isEmpty(); + assertThat(lastScanEvent).as("Should not push down a filter").isNull(); + } + + @TestTemplate + public void testFilterPushDownLessThanEqualLiteralOnLeft() { + String sqlLTE = String.format("SELECT * FROM %s WHERE 3 <= id ", TABLE_NAME); + String expectedFilter = "ref(name=\"id\") >= 3"; + + List resultLTE = sql(sqlLTE); + assertThat(resultLTE).hasSize(1).first().isEqualTo(Row.of(3, null, 30.0)); + + assertThat(scanEventCount).isEqualTo(1); + assertThat(lastScanEvent.filter()) + .as("Should contain the push down filter") + .asString() + .isEqualTo(expectedFilter); + } + + @TestTemplate + public void testFilterPushDownIn() { + String sqlIN = String.format("SELECT * FROM %s WHERE id IN (1,2) ", TABLE_NAME); + String expectedFilter = "(ref(name=\"id\") == 1 or ref(name=\"id\") == 2)"; + List resultIN = sql(sqlIN); + assertThat(resultIN).hasSize(2); + + List expectedIN = Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0)); + assertSameElements(expectedIN, resultIN); + assertThat(scanEventCount).isEqualTo(1); + assertThat(lastScanEvent.filter()) + .as("Should contain the push down filter") + .asString() + .isEqualTo(expectedFilter); + } + + @TestTemplate + public void testFilterPushDownInNull() { + String sqlInNull = + String.format("SELECT * FROM %s WHERE data IN ('iceberg',NULL) ", TABLE_NAME); + + List result = sql(sqlInNull); + assertThat(result).hasSize(1).first().isEqualTo(Row.of(1, "iceberg", 10.0)); + + // In SQL, null check can only be done as IS NULL or IS NOT NULL, so it's correct to ignore it + // and push the rest down. 
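  // For reference, the pushed-down residual corresponds roughly to the Iceberg
  // expression below (the NULL literal is dropped because IN can never match NULL):
  //
  //   Expression residual = Expressions.equal("data", "iceberg");
  //
  // Its string form is what the assertion below compares against the recorded ScanEvent.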
+ String expectedScan = "ref(name=\"data\") == \"iceberg\""; + assertThat(lastScanEvent.filter()) + .as("Should contain the push down filter") + .asString() + .isEqualTo(expectedScan); + } + + @TestTemplate + public void testFilterPushDownNotIn() { + String sqlNotIn = String.format("SELECT * FROM %s WHERE id NOT IN (3,2) ", TABLE_NAME); + + List resultNotIn = sql(sqlNotIn); + assertThat(resultNotIn).hasSize(1).first().isEqualTo(Row.of(1, "iceberg", 10.0)); + assertThat(scanEventCount).isEqualTo(1); + String expectedScan = "(ref(name=\"id\") != 2 and ref(name=\"id\") != 3)"; + assertThat(lastScanEvent.filter()) + .as("Should contain the push down filter") + .asString() + .isEqualTo(expectedScan); + } + + @TestTemplate + public void testFilterPushDownNotInNull() { + String sqlNotInNull = String.format("SELECT * FROM %s WHERE id NOT IN (1,2,NULL) ", TABLE_NAME); + List resultGT = sql(sqlNotInNull); + assertThat(resultGT).isEmpty(); + assertThat(lastScanEvent) + .as( + "As the predicate pushdown filter out all rows, Flink did not create scan plan, so it doesn't publish any ScanEvent.") + .isNull(); + } + + @TestTemplate + public void testFilterPushDownIsNotNull() { + String sqlNotNull = String.format("SELECT * FROM %s WHERE data IS NOT NULL", TABLE_NAME); + String expectedFilter = "not_null(ref(name=\"data\"))"; + + List resultNotNull = sql(sqlNotNull); + assertThat(resultNotNull).hasSize(2); + + List expected = Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0)); + assertSameElements(expected, resultNotNull); + + assertThat(scanEventCount).isEqualTo(1); + assertThat(lastScanEvent.filter()) + .as("Should contain the push down filter") + .asString() + .isEqualTo(expectedFilter); + } + + @TestTemplate + public void testFilterPushDownIsNull() { + String sqlNull = String.format("SELECT * FROM %s WHERE data IS NULL", TABLE_NAME); + String expectedFilter = "is_null(ref(name=\"data\"))"; + + List resultNull = sql(sqlNull); + assertThat(resultNull).hasSize(1).first().isEqualTo(Row.of(3, null, 30.0)); + + assertThat(scanEventCount).isEqualTo(1); + assertThat(lastScanEvent.filter()) + .as("Should contain the push down filter") + .asString() + .isEqualTo(expectedFilter); + } + + @TestTemplate + public void testFilterPushDownNot() { + String sqlNot = String.format("SELECT * FROM %s WHERE NOT (id = 1 OR id = 2 ) ", TABLE_NAME); + + List resultNot = sql(sqlNot); + assertThat(resultNot).hasSize(1).first().isEqualTo(Row.of(3, null, 30.0)); + + assertThat(scanEventCount).isEqualTo(1); + String expectedFilter = "(ref(name=\"id\") != 1 and ref(name=\"id\") != 2)"; + assertThat(lastScanEvent.filter()) + .as("Should contain the push down filter") + .asString() + .isEqualTo(expectedFilter); + } + + @TestTemplate + public void testFilterPushDownBetween() { + String sqlBetween = String.format("SELECT * FROM %s WHERE id BETWEEN 1 AND 2 ", TABLE_NAME); + + List resultBetween = sql(sqlBetween); + assertThat(resultBetween).hasSize(2); + + List expectedBetween = + Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0)); + assertSameElements(expectedBetween, resultBetween); + + assertThat(scanEventCount).isEqualTo(1); + String expected = "(ref(name=\"id\") >= 1 and ref(name=\"id\") <= 2)"; + assertThat(lastScanEvent.filter()) + .as("Should contain the push down filter") + .asString() + .isEqualTo(expected); + } + + @TestTemplate + public void testFilterPushDownNotBetween() { + String sqlNotBetween = + String.format("SELECT * FROM %s WHERE id NOT BETWEEN 2 AND 3 ", TABLE_NAME); + String 
expectedFilter = "(ref(name=\"id\") < 2 or ref(name=\"id\") > 3)"; + + List resultNotBetween = sql(sqlNotBetween); + assertThat(resultNotBetween).hasSize(1).first().isEqualTo(Row.of(1, "iceberg", 10.0)); + + assertThat(scanEventCount).isEqualTo(1); + assertThat(lastScanEvent.filter()) + .as("Should contain the push down filter") + .asString() + .isEqualTo(expectedFilter); + } + + @TestTemplate + public void testFilterPushDownLike() { + String expectedFilter = "ref(name=\"data\") startsWith \"\"ice\"\""; + + String sqlLike = "SELECT * FROM " + TABLE_NAME + " WHERE data LIKE 'ice%%' "; + List resultLike = sql(sqlLike); + assertThat(resultLike).hasSize(1).first().isEqualTo(Row.of(1, "iceberg", 10.0)); + assertThat(scanEventCount).isEqualTo(1); + assertThat(lastScanEvent.filter()) + .as("Should contain the push down filter") + .asString() + .isEqualTo(expectedFilter); + + // %% won't match the row with null value + sqlLike = "SELECT * FROM " + TABLE_NAME + " WHERE data LIKE '%%' "; + resultLike = sql(sqlLike); + assertThat(resultLike).hasSize(2); + List expectedRecords = + Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0)); + assertSameElements(expectedRecords, resultLike); + String expectedScan = "not_null(ref(name=\"data\"))"; + assertThat(lastScanEvent.filter()) + .as("Should contain the push down filter") + .asString() + .isEqualTo(expectedScan); + } + + @TestTemplate + public void testFilterNotPushDownLike() { + Row expectRecord = Row.of(1, "iceberg", 10.0); + String sqlNoPushDown = "SELECT * FROM " + TABLE_NAME + " WHERE data LIKE '%%i' "; + List resultLike = sql(sqlNoPushDown); + assertThat(resultLike).isEmpty(); + assertThat(lastScanEvent.filter()) + .as("Should not push down a filter") + .isEqualTo(Expressions.alwaysTrue()); + + sqlNoPushDown = "SELECT * FROM " + TABLE_NAME + " WHERE data LIKE '%%i%%' "; + resultLike = sql(sqlNoPushDown); + assertThat(resultLike).hasSize(1).first().isEqualTo(expectRecord); + assertThat(lastScanEvent.filter()) + .as("Should not push down a filter") + .isEqualTo(Expressions.alwaysTrue()); + + sqlNoPushDown = "SELECT * FROM " + TABLE_NAME + " WHERE data LIKE '%%ice%%g' "; + resultLike = sql(sqlNoPushDown); + assertThat(resultLike).hasSize(1).first().isEqualTo(expectRecord); + assertThat(lastScanEvent.filter()) + .as("Should not push down a filter") + .isEqualTo(Expressions.alwaysTrue()); + + sqlNoPushDown = "SELECT * FROM " + TABLE_NAME + " WHERE data LIKE 'iceber_' "; + resultLike = sql(sqlNoPushDown); + assertThat(resultLike).hasSize(1).first().isEqualTo(expectRecord); + assertThat(lastScanEvent.filter()) + .as("Should not push down a filter") + .isEqualTo(Expressions.alwaysTrue()); + + sqlNoPushDown = "SELECT * FROM " + TABLE_NAME + " WHERE data LIKE 'i%%g' "; + resultLike = sql(sqlNoPushDown); + assertThat(resultLike).hasSize(1).first().isEqualTo(expectRecord); + assertThat(lastScanEvent.filter()) + .as("Should not push down a filter") + .isEqualTo(Expressions.alwaysTrue()); + } + + @TestTemplate + public void testFilterPushDown2Literal() { + String sql2Literal = String.format("SELECT * FROM %s WHERE 1 > 0 ", TABLE_NAME); + List result = sql(sql2Literal); + List expectedRecords = + Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0), Row.of(3, null, 30.0)); + assertSameElements(expectedRecords, result); + assertThat(lastScanEvent.filter()) + .as("Should not push down a filter") + .isEqualTo(Expressions.alwaysTrue()); + } + + @TestTemplate + public void testSqlParseNaN() { + // todo add some test case to test NaN + } 
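  // The expected filter strings asserted throughout this class are the string forms of
  // Iceberg's unbound expressions; a rough, illustrative correspondence:
  //
  //   data LIKE 'ice%'        ->  Expressions.startsWith("data", "ice")
  //   id BETWEEN 1 AND 2      ->  Expressions.and(
  //                                   Expressions.greaterThanOrEqual("id", 1),
  //                                   Expressions.lessThanOrEqual("id", 2))
  //   data IS NULL            ->  Expressions.isNull("data")
  //   NOT (id = 1 OR id = 2)  ->  Expressions.and(
  //                                   Expressions.notEqual("id", 1),
  //                                   Expressions.notEqual("id", 2))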
+} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBounded.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBounded.java new file mode 100644 index 000000000000..b7447d15c05a --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBounded.java @@ -0,0 +1,147 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source; + +import static org.apache.iceberg.flink.SimpleDataUtil.SCHEMA; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.util.Collections; +import java.util.List; +import java.util.Map; +import org.apache.flink.api.common.eventtime.WatermarkStrategy; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.TableColumn; +import org.apache.flink.table.api.TableSchema; +import org.apache.flink.table.data.RowData; +import org.apache.flink.types.Row; +import org.apache.flink.util.CloseableIterator; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Table; +import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.flink.FlinkConfigOptions; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.TableLoader; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.flink.data.RowDataToRowMapper; +import org.apache.iceberg.flink.source.assigner.SimpleSplitAssignerFactory; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.junit.jupiter.api.TestTemplate; + +public class TestIcebergSourceBounded extends TestFlinkScan { + @TestTemplate + public void testValidation() { + CATALOG_EXTENSION.catalog().createTable(TestFixtures.TABLE_IDENTIFIER, SCHEMA); + + assertThatThrownBy( + () -> + IcebergSource.forRowData() + .tableLoader(tableLoader()) + .assignerFactory(new SimpleSplitAssignerFactory()) + .streaming(false) + .endTag("tag") + .endSnapshotId(1L) + .build()) + .hasMessage("END_SNAPSHOT_ID and END_TAG cannot both be set.") + .isInstanceOf(IllegalArgumentException.class); + } + + @Override + protected List runWithProjection(String... 
projected) throws Exception { + Schema icebergTableSchema = + CATALOG_EXTENSION.catalog().loadTable(TestFixtures.TABLE_IDENTIFIER).schema(); + TableSchema.Builder builder = TableSchema.builder(); + TableSchema schema = FlinkSchemaUtil.toSchema(FlinkSchemaUtil.convert(icebergTableSchema)); + for (String field : projected) { + TableColumn column = schema.getTableColumn(field).get(); + builder.field(column.getName(), column.getType()); + } + TableSchema flinkSchema = builder.build(); + Schema projectedSchema = FlinkSchemaUtil.convert(icebergTableSchema, flinkSchema); + return run(projectedSchema, Lists.newArrayList(), Maps.newHashMap(), "", projected); + } + + @Override + protected List runWithFilter(Expression filter, String sqlFilter, boolean caseSensitive) + throws Exception { + Map options = Maps.newHashMap(); + options.put("case-sensitive", Boolean.toString(caseSensitive)); + return run(null, Collections.singletonList(filter), options, sqlFilter, "*"); + } + + @Override + protected List runWithOptions(Map options) throws Exception { + return run(null, Lists.newArrayList(), options, "", "*"); + } + + @Override + protected List run() throws Exception { + return run(null, Lists.newArrayList(), Maps.newHashMap(), "", "*"); + } + + protected List run( + Schema projectedSchema, + List filters, + Map options, + String sqlFilter, + String... sqlSelectedFields) + throws Exception { + + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + env.setParallelism(1); + Configuration config = new Configuration(); + config.setInteger(FlinkConfigOptions.SOURCE_READER_FETCH_BATCH_RECORD_COUNT, 128); + Table table; + try (TableLoader tableLoader = tableLoader()) { + tableLoader.open(); + table = tableLoader.loadTable(); + } + + IcebergSource.Builder sourceBuilder = + IcebergSource.forRowData() + .tableLoader(tableLoader()) + .table(table) + .assignerFactory(new SimpleSplitAssignerFactory()) + .flinkConfig(config); + if (projectedSchema != null) { + sourceBuilder.project(projectedSchema); + } + + sourceBuilder.filters(filters); + sourceBuilder.properties(options); + + DataStream stream = + env.fromSource( + sourceBuilder.build(), + WatermarkStrategy.noWatermarks(), + "testBasicRead", + TypeInformation.of(RowData.class)) + .map( + new RowDataToRowMapper( + FlinkSchemaUtil.convert( + projectedSchema == null ? table.schema() : projectedSchema))); + + try (CloseableIterator iter = stream.executeAndCollect()) { + return Lists.newArrayList(iter); + } + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedGenericRecord.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedGenericRecord.java new file mode 100644 index 000000000000..7bfed00a9eb4 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedGenericRecord.java @@ -0,0 +1,196 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source; + +import java.nio.file.Path; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import org.apache.avro.generic.GenericRecord; +import org.apache.flink.api.common.eventtime.WatermarkStrategy; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.formats.avro.typeutils.GenericRecordAvroTypeInfo; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.types.Row; +import org.apache.flink.util.CloseableIterator; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Parameter; +import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Table; +import org.apache.iceberg.avro.AvroSchemaUtil; +import org.apache.iceberg.data.GenericAppenderHelper; +import org.apache.iceberg.data.RandomGenericData; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.flink.FlinkConfigOptions; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.HadoopCatalogExtension; +import org.apache.iceberg.flink.TableLoader; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.flink.TestHelpers; +import org.apache.iceberg.flink.data.RowDataToRowMapper; +import org.apache.iceberg.flink.sink.AvroGenericRecordToRowDataMapper; +import org.apache.iceberg.flink.source.assigner.SimpleSplitAssignerFactory; +import org.apache.iceberg.flink.source.reader.AvroGenericRecordReaderFunction; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Sets; +import org.apache.iceberg.types.TypeUtil; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.extension.ExtendWith; +import org.junit.jupiter.api.extension.RegisterExtension; +import org.junit.jupiter.api.io.TempDir; + +@ExtendWith(ParameterizedTestExtension.class) +public class TestIcebergSourceBoundedGenericRecord { + @TempDir protected Path temporaryFolder; + + @RegisterExtension + private static final HadoopCatalogExtension CATALOG_EXTENSION = + new HadoopCatalogExtension(TestFixtures.DATABASE, TestFixtures.TABLE); + + @Parameters(name = "format={0}, parallelism = {1}") + public static Object[][] parameters() { + return new Object[][] { + {FileFormat.AVRO, 2}, + {FileFormat.PARQUET, 2}, + {FileFormat.ORC, 2} + }; + } + + @Parameter(index = 0) + private FileFormat fileFormat; + + @Parameter(index = 1) + private int parallelism; + + @TestTemplate + public void testUnpartitionedTable() throws Exception { + Table table = + CATALOG_EXTENSION.catalog().createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA); + List expectedRecords = RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L); + new GenericAppenderHelper(table, fileFormat, 
temporaryFolder).appendToTable(expectedRecords); + TestHelpers.assertRecords(run(), expectedRecords, TestFixtures.SCHEMA); + } + + @TestTemplate + public void testPartitionedTable() throws Exception { + String dateStr = "2020-03-20"; + Table table = + CATALOG_EXTENSION + .catalog() + .createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA, TestFixtures.SPEC); + List expectedRecords = RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L); + for (int i = 0; i < expectedRecords.size(); ++i) { + expectedRecords.get(i).setField("dt", dateStr); + } + + new GenericAppenderHelper(table, fileFormat, temporaryFolder) + .appendToTable(org.apache.iceberg.TestHelpers.Row.of(dateStr, 0), expectedRecords); + TestHelpers.assertRecords(run(), expectedRecords, TestFixtures.SCHEMA); + } + + @TestTemplate + public void testProjection() throws Exception { + Table table = + CATALOG_EXTENSION + .catalog() + .createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA, TestFixtures.SPEC); + List expectedRecords = RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L); + new GenericAppenderHelper(table, fileFormat, temporaryFolder) + .appendToTable(org.apache.iceberg.TestHelpers.Row.of("2020-03-20", 0), expectedRecords); + // select the "data" field (fieldId == 1) + Schema projectedSchema = TypeUtil.select(TestFixtures.SCHEMA, Sets.newHashSet(1)); + List expectedRows = + Arrays.asList(Row.of(expectedRecords.get(0).get(0)), Row.of(expectedRecords.get(1).get(0))); + TestHelpers.assertRows( + run(projectedSchema, Collections.emptyList(), Collections.emptyMap()), expectedRows); + } + + private List run() throws Exception { + return run(null, Collections.emptyList(), Collections.emptyMap()); + } + + private List run( + Schema projectedSchema, List filters, Map options) + throws Exception { + + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + env.setParallelism(parallelism); + env.getConfig().enableObjectReuse(); + + Configuration config = new Configuration(); + config.setInteger(FlinkConfigOptions.SOURCE_READER_FETCH_BATCH_RECORD_COUNT, 128); + Table table; + try (TableLoader tableLoader = CATALOG_EXTENSION.tableLoader()) { + tableLoader.open(); + table = tableLoader.loadTable(); + } + + AvroGenericRecordReaderFunction readerFunction = + new AvroGenericRecordReaderFunction( + TestFixtures.TABLE_IDENTIFIER.name(), + new Configuration(), + table.schema(), + null, + null, + false, + table.io(), + table.encryption(), + filters); + + IcebergSource.Builder sourceBuilder = + IcebergSource.builder() + .tableLoader(CATALOG_EXTENSION.tableLoader()) + .readerFunction(readerFunction) + .assignerFactory(new SimpleSplitAssignerFactory()) + .flinkConfig(config); + if (projectedSchema != null) { + sourceBuilder.project(projectedSchema); + } + + sourceBuilder.filters(filters); + sourceBuilder.setAll(options); + + Schema readSchema = projectedSchema != null ? projectedSchema : table.schema(); + RowType rowType = FlinkSchemaUtil.convert(readSchema); + org.apache.avro.Schema avroSchema = + AvroSchemaUtil.convert(readSchema, TestFixtures.TABLE_IDENTIFIER.name()); + + DataStream stream = + env.fromSource( + sourceBuilder.build(), + WatermarkStrategy.noWatermarks(), + "testBasicRead", + new GenericRecordAvroTypeInfo(avroSchema)) + // There are two reasons for converting GenericRecord back to Row. + // 1. Avro GenericRecord/Schema is not serializable. + // 2. leverage the TestHelpers.assertRecords for validation. 
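+            // The two map() calls below first convert GenericRecord to RowData
+            // (AvroGenericRecordToRowDataMapper) and then RowData to Row (RowDataToRowMapper),
+            // using the row type of the read schema.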
+ .map(AvroGenericRecordToRowDataMapper.forAvroSchema(avroSchema)) + .map(new RowDataToRowMapper(rowType)); + + try (CloseableIterator iter = stream.executeAndCollect()) { + return Lists.newArrayList(iter); + } + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedSql.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedSql.java new file mode 100644 index 000000000000..0f41c5af4c95 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedSql.java @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source; + +import java.io.IOException; +import java.util.List; +import java.util.Map; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.table.api.EnvironmentSettings; +import org.apache.flink.table.api.TableEnvironment; +import org.apache.flink.table.api.config.TableConfigOptions; +import org.apache.flink.types.Row; +import org.apache.iceberg.Schema; +import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.flink.FlinkConfigOptions; +import org.junit.jupiter.api.BeforeEach; + +public class TestIcebergSourceBoundedSql extends TestIcebergSourceBounded { + private volatile TableEnvironment tEnv; + + @BeforeEach + public void before() throws IOException { + Configuration tableConf = getTableEnv().getConfig().getConfiguration(); + tableConf.setBoolean(FlinkConfigOptions.TABLE_EXEC_ICEBERG_USE_FLIP27_SOURCE.key(), true); + SqlHelpers.sql( + getTableEnv(), + "create catalog iceberg_catalog with ('type'='iceberg', 'catalog-type'='hadoop', 'warehouse'='%s')", + CATALOG_EXTENSION.warehouse()); + SqlHelpers.sql(getTableEnv(), "use catalog iceberg_catalog"); + getTableEnv() + .getConfig() + .getConfiguration() + .set(TableConfigOptions.TABLE_DYNAMIC_TABLE_OPTIONS_ENABLED, true); + } + + private TableEnvironment getTableEnv() { + if (tEnv == null) { + synchronized (this) { + if (tEnv == null) { + this.tEnv = + TableEnvironment.create(EnvironmentSettings.newInstance().inBatchMode().build()); + } + } + } + return tEnv; + } + + @Override + protected List run( + Schema projectedSchema, + List filters, + Map options, + String sqlFilter, + String... 
sqlSelectedFields) + throws Exception { + String select = String.join(",", sqlSelectedFields); + String optionStr = SqlHelpers.sqlOptionsToString(options); + return SqlHelpers.sql(getTableEnv(), "select %s from t %s %s", select, optionStr, sqlFilter); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceContinuous.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceContinuous.java new file mode 100644 index 000000000000..749cbf89338a --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceContinuous.java @@ -0,0 +1,573 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.nio.file.Path; +import java.time.Duration; +import java.util.Collection; +import java.util.List; +import java.util.Optional; +import java.util.concurrent.atomic.AtomicLong; +import java.util.stream.Collectors; +import org.apache.flink.api.common.JobID; +import org.apache.flink.api.common.JobStatus; +import org.apache.flink.api.common.eventtime.WatermarkStrategy; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.client.program.ClusterClient; +import org.apache.flink.metrics.MetricGroup; +import org.apache.flink.runtime.client.JobStatusMessage; +import org.apache.flink.runtime.testutils.InMemoryReporter; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.data.RowData; +import org.apache.flink.test.junit5.InjectClusterClient; +import org.apache.flink.test.junit5.MiniClusterExtension; +import org.apache.flink.types.Row; +import org.apache.flink.util.CloseableIterator; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.data.GenericAppenderHelper; +import org.apache.iceberg.data.RandomGenericData; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.HadoopTableExtension; +import org.apache.iceberg.flink.MiniFlinkClusterExtension; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.flink.TestHelpers; +import org.apache.iceberg.flink.data.RowDataToRowMapper; +import org.apache.iceberg.flink.source.assigner.SimpleSplitAssignerFactory; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.awaitility.Awaitility; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.RegisterExtension; +import org.junit.jupiter.api.io.TempDir; + +public class 
TestIcebergSourceContinuous { + + public static final InMemoryReporter METRIC_REPORTER = InMemoryReporter.create(); + + @TempDir protected Path temporaryFolder; + + @RegisterExtension + public static MiniClusterExtension miniClusterExtension = + MiniFlinkClusterExtension.createWithClassloaderCheckDisabled(METRIC_REPORTER); + + @RegisterExtension + private static final HadoopTableExtension TABLE_EXTENSION = + new HadoopTableExtension(TestFixtures.DATABASE, TestFixtures.TABLE, TestFixtures.SCHEMA); + + private final AtomicLong randomSeed = new AtomicLong(0L); + + @Test + public void testTableScanThenIncremental() throws Exception { + GenericAppenderHelper dataAppender = + new GenericAppenderHelper(TABLE_EXTENSION.table(), FileFormat.PARQUET, temporaryFolder); + + // snapshot1 + List batch1 = + RandomGenericData.generate( + TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); + dataAppender.appendToTable(batch1); + + ScanContext scanContext = + ScanContext.builder() + .streaming(true) + .monitorInterval(Duration.ofMillis(10L)) + .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) + .build(); + + try (CloseableIterator iter = + createStream(scanContext).executeAndCollect(getClass().getSimpleName())) { + List result1 = waitForResult(iter, 2); + TestHelpers.assertRecords(result1, batch1, TABLE_EXTENSION.table().schema()); + + // snapshot2 + List batch2 = + RandomGenericData.generate( + TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); + dataAppender.appendToTable(batch2); + TABLE_EXTENSION.table().currentSnapshot().snapshotId(); + + List result2 = waitForResult(iter, 2); + TestHelpers.assertRecords(result2, batch2, TABLE_EXTENSION.table().schema()); + + // snapshot3 + List batch3 = + RandomGenericData.generate( + TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); + dataAppender.appendToTable(batch3); + TABLE_EXTENSION.table().currentSnapshot().snapshotId(); + + List result3 = waitForResult(iter, 2); + TestHelpers.assertRecords(result3, batch3, TABLE_EXTENSION.table().schema()); + + assertThatIcebergEnumeratorMetricsExist(); + } + } + + @Test + public void testTableScanThenIncrementalAfterExpiration() throws Exception { + GenericAppenderHelper dataAppender = + new GenericAppenderHelper(TABLE_EXTENSION.table(), FileFormat.PARQUET, temporaryFolder); + + // snapshot1 + List batch1 = + RandomGenericData.generate( + TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); + dataAppender.appendToTable(batch1); + long snapshotId = TABLE_EXTENSION.table().currentSnapshot().snapshotId(); + + // snapshot2 + List batch2 = + RandomGenericData.generate( + TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); + dataAppender.appendToTable(batch2); + + TABLE_EXTENSION.table().expireSnapshots().expireSnapshotId(snapshotId).commit(); + + assertThat(TABLE_EXTENSION.table().history()).hasSize(1); + + ScanContext scanContext = + ScanContext.builder() + .streaming(true) + .monitorInterval(Duration.ofMillis(10L)) + .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) + .build(); + + assertThat(FlinkSplitPlanner.checkScanMode(scanContext)) + .isEqualTo(FlinkSplitPlanner.ScanMode.BATCH); + + try (CloseableIterator iter = + createStream(scanContext).executeAndCollect(getClass().getSimpleName())) { + List result1 = waitForResult(iter, 4); + List initialRecords = Lists.newArrayList(); + initialRecords.addAll(batch1); + initialRecords.addAll(batch2); + TestHelpers.assertRecords(result1, initialRecords, 
TABLE_EXTENSION.table().schema()); + + // snapshot3 + List batch3 = + RandomGenericData.generate( + TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); + dataAppender.appendToTable(batch3); + TABLE_EXTENSION.table().currentSnapshot().snapshotId(); + + List result3 = waitForResult(iter, 2); + TestHelpers.assertRecords(result3, batch3, TABLE_EXTENSION.table().schema()); + + assertThatIcebergEnumeratorMetricsExist(); + } + } + + @Test + public void testEarliestSnapshot() throws Exception { + GenericAppenderHelper dataAppender = + new GenericAppenderHelper(TABLE_EXTENSION.table(), FileFormat.PARQUET, temporaryFolder); + + // snapshot0 + List batch0 = + RandomGenericData.generate( + TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); + dataAppender.appendToTable(batch0); + + // snapshot1 + List batch1 = + RandomGenericData.generate( + TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); + dataAppender.appendToTable(batch1); + + ScanContext scanContext = + ScanContext.builder() + .streaming(true) + .monitorInterval(Duration.ofMillis(10L)) + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_EARLIEST_SNAPSHOT) + .build(); + + try (CloseableIterator iter = + createStream(scanContext).executeAndCollect(getClass().getSimpleName())) { + List result1 = waitForResult(iter, 4); + List combinedBatch0AndBatch1 = Lists.newArrayList(batch0); + combinedBatch0AndBatch1.addAll(batch1); + TestHelpers.assertRecords(result1, combinedBatch0AndBatch1, TABLE_EXTENSION.table().schema()); + + // snapshot2 + List batch2 = + RandomGenericData.generate( + TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); + dataAppender.appendToTable(batch2); + + List result2 = waitForResult(iter, 2); + TestHelpers.assertRecords(result2, batch2, TABLE_EXTENSION.table().schema()); + + // snapshot3 + List batch3 = + RandomGenericData.generate( + TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); + dataAppender.appendToTable(batch3); + + List result3 = waitForResult(iter, 2); + TestHelpers.assertRecords(result3, batch3, TABLE_EXTENSION.table().schema()); + + assertThatIcebergEnumeratorMetricsExist(); + } + } + + @Test + public void testLatestSnapshot(@InjectClusterClient ClusterClient clusterClient) + throws Exception { + GenericAppenderHelper dataAppender = + new GenericAppenderHelper(TABLE_EXTENSION.table(), FileFormat.PARQUET, temporaryFolder); + + // snapshot0 + List batch0 = + RandomGenericData.generate( + TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); + dataAppender.appendToTable(batch0); + + // snapshot1 + List batch1 = + RandomGenericData.generate( + TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); + dataAppender.appendToTable(batch1); + + ScanContext scanContext = + ScanContext.builder() + .streaming(true) + .monitorInterval(Duration.ofMillis(10L)) + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_LATEST_SNAPSHOT) + .build(); + + try (CloseableIterator iter = + createStream(scanContext).executeAndCollect(getClass().getSimpleName())) { + // we want to make sure job is running first so that enumerator can + // start from the latest snapshot before inserting the next batch2 below. 
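+      // waitUntilJobIsRunning (defined later in this class) polls the cluster client with
+      // Awaitility until the job reports RUNNING, so appending batch2 below cannot race with
+      // the enumerator picking its starting snapshot.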
+ waitUntilJobIsRunning(clusterClient); + + // inclusive behavior for starting snapshot + List result1 = waitForResult(iter, 2); + TestHelpers.assertRecords(result1, batch1, TABLE_EXTENSION.table().schema()); + + // snapshot2 + List batch2 = + RandomGenericData.generate( + TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); + dataAppender.appendToTable(batch2); + + List result2 = waitForResult(iter, 2); + TestHelpers.assertRecords(result2, batch2, TABLE_EXTENSION.table().schema()); + + // snapshot3 + List batch3 = + RandomGenericData.generate( + TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); + dataAppender.appendToTable(batch3); + + List result3 = waitForResult(iter, 2); + TestHelpers.assertRecords(result3, batch3, TABLE_EXTENSION.table().schema()); + + assertThatIcebergEnumeratorMetricsExist(); + } + } + + @Test + public void testSpecificSnapshotId() throws Exception { + GenericAppenderHelper dataAppender = + new GenericAppenderHelper(TABLE_EXTENSION.table(), FileFormat.PARQUET, temporaryFolder); + + // snapshot0 + List batch0 = + RandomGenericData.generate( + TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); + dataAppender.appendToTable(batch0); + long snapshot0 = TABLE_EXTENSION.table().currentSnapshot().snapshotId(); + + // snapshot1 + List batch1 = + RandomGenericData.generate( + TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); + dataAppender.appendToTable(batch1); + long snapshot1 = TABLE_EXTENSION.table().currentSnapshot().snapshotId(); + + ScanContext scanContext = + ScanContext.builder() + .streaming(true) + .monitorInterval(Duration.ofMillis(10L)) + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) + .startSnapshotId(snapshot1) + .build(); + + try (CloseableIterator iter = + createStream(scanContext).executeAndCollect(getClass().getSimpleName())) { + List result1 = waitForResult(iter, 2); + TestHelpers.assertRecords(result1, batch1, TABLE_EXTENSION.table().schema()); + + // snapshot2 + List batch2 = + RandomGenericData.generate( + TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); + dataAppender.appendToTable(batch2); + + List result2 = waitForResult(iter, 2); + TestHelpers.assertRecords(result2, batch2, TABLE_EXTENSION.table().schema()); + + // snapshot3 + List batch3 = + RandomGenericData.generate( + TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); + dataAppender.appendToTable(batch3); + + List result3 = waitForResult(iter, 2); + TestHelpers.assertRecords(result3, batch3, TABLE_EXTENSION.table().schema()); + + assertThatIcebergEnumeratorMetricsExist(); + } + } + + @Test + public void testSpecificSnapshotTimestamp() throws Exception { + GenericAppenderHelper dataAppender = + new GenericAppenderHelper(TABLE_EXTENSION.table(), FileFormat.PARQUET, temporaryFolder); + + // snapshot0 + List batch0 = + RandomGenericData.generate( + TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); + dataAppender.appendToTable(batch0); + long snapshot0Timestamp = TABLE_EXTENSION.table().currentSnapshot().timestampMillis(); + + // sleep for 2 ms to make sure snapshot1 has a higher timestamp value + Thread.sleep(2); + + // snapshot1 + List batch1 = + RandomGenericData.generate( + TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); + dataAppender.appendToTable(batch1); + long snapshot1Timestamp = TABLE_EXTENSION.table().currentSnapshot().timestampMillis(); + + ScanContext scanContext = + ScanContext.builder() + .streaming(true) + 
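+            // The starting timestamp is treated as inclusive here: per the assertions below,
+            // snapshot1 (committed at exactly snapshot1Timestamp) is the first snapshot read.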
.monitorInterval(Duration.ofMillis(10L)) + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) + .startSnapshotTimestamp(snapshot1Timestamp) + .build(); + + try (CloseableIterator iter = + createStream(scanContext).executeAndCollect(getClass().getSimpleName())) { + // consume data from snapshot1 + List result1 = waitForResult(iter, 2); + TestHelpers.assertRecords(result1, batch1, TABLE_EXTENSION.table().schema()); + + // snapshot2 + List batch2 = + RandomGenericData.generate( + TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); + dataAppender.appendToTable(batch2); + + List result2 = waitForResult(iter, 2); + TestHelpers.assertRecords(result2, batch2, TABLE_EXTENSION.table().schema()); + + // snapshot3 + List batch3 = + RandomGenericData.generate( + TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); + dataAppender.appendToTable(batch3); + + List result3 = waitForResult(iter, 2); + TestHelpers.assertRecords(result3, batch3, TABLE_EXTENSION.table().schema()); + + assertThatIcebergEnumeratorMetricsExist(); + } + } + + @Test + public void testReadingFromBranch() throws Exception { + String branch = "b1"; + GenericAppenderHelper dataAppender = + new GenericAppenderHelper(TABLE_EXTENSION.table(), FileFormat.PARQUET, temporaryFolder); + + List batchBase = + RandomGenericData.generate( + TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); + dataAppender.appendToTable(batchBase); + + // create branch + TABLE_EXTENSION + .table() + .manageSnapshots() + .createBranch(branch, TABLE_EXTENSION.table().currentSnapshot().snapshotId()) + .commit(); + + // snapshot1 to branch + List batch1 = + RandomGenericData.generate( + TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); + dataAppender.appendToTable(branch, batch1); + + // snapshot2 to branch + List batch2 = + RandomGenericData.generate( + TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); + dataAppender.appendToTable(branch, batch2); + + List branchExpectedRecords = Lists.newArrayList(); + branchExpectedRecords.addAll(batchBase); + branchExpectedRecords.addAll(batch1); + branchExpectedRecords.addAll(batch2); + // reads from branch: it should contain the first snapshot (before the branch creation) followed + // by the next 2 snapshots added + ScanContext scanContext = + ScanContext.builder() + .streaming(true) + .monitorInterval(Duration.ofMillis(10L)) + .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) + .useBranch(branch) + .build(); + + try (CloseableIterator iter = + createStream(scanContext).executeAndCollect(getClass().getSimpleName())) { + List resultMain = waitForResult(iter, 6); + TestHelpers.assertRecords( + resultMain, branchExpectedRecords, TABLE_EXTENSION.table().schema()); + + // snapshot3 to branch + List batch3 = + RandomGenericData.generate( + TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); + dataAppender.appendToTable(branch, batch3); + + List result3 = waitForResult(iter, 2); + TestHelpers.assertRecords(result3, batch3, TABLE_EXTENSION.table().schema()); + + // snapshot4 to branch + List batch4 = + RandomGenericData.generate( + TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); + dataAppender.appendToTable(branch, batch4); + + List result4 = waitForResult(iter, 2); + TestHelpers.assertRecords(result4, batch4, TABLE_EXTENSION.table().schema()); + } + + // read only from main branch. 
Should contain only the first snapshot + scanContext = + ScanContext.builder() + .streaming(true) + .monitorInterval(Duration.ofMillis(10L)) + .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) + .build(); + try (CloseableIterator iter = + createStream(scanContext).executeAndCollect(getClass().getSimpleName())) { + List resultMain = waitForResult(iter, 2); + TestHelpers.assertRecords(resultMain, batchBase, TABLE_EXTENSION.table().schema()); + + List batchMain2 = + RandomGenericData.generate( + TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); + dataAppender.appendToTable(batchMain2); + resultMain = waitForResult(iter, 2); + TestHelpers.assertRecords(resultMain, batchMain2, TABLE_EXTENSION.table().schema()); + } + } + + @Test + public void testValidation() { + assertThatThrownBy( + () -> + IcebergSource.forRowData() + .tableLoader(TABLE_EXTENSION.tableLoader()) + .assignerFactory(new SimpleSplitAssignerFactory()) + .streaming(true) + .endTag("tag") + .build()) + .hasMessage("Cannot set end-tag option for streaming reader") + .isInstanceOf(IllegalArgumentException.class); + } + + private DataStream createStream(ScanContext scanContext) throws Exception { + // start the source and collect output + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + env.setParallelism(1); + DataStream stream = + env.fromSource( + IcebergSource.forRowData() + .tableLoader(TABLE_EXTENSION.tableLoader()) + .assignerFactory(new SimpleSplitAssignerFactory()) + .streaming(scanContext.isStreaming()) + .streamingStartingStrategy(scanContext.streamingStartingStrategy()) + .startSnapshotTimestamp(scanContext.startSnapshotTimestamp()) + .startSnapshotId(scanContext.startSnapshotId()) + .monitorInterval(Duration.ofMillis(10L)) + .branch(scanContext.branch()) + .build(), + WatermarkStrategy.noWatermarks(), + "icebergSource", + TypeInformation.of(RowData.class)) + .map(new RowDataToRowMapper(FlinkSchemaUtil.convert(TABLE_EXTENSION.table().schema()))); + return stream; + } + + public static List waitForResult(CloseableIterator iter, int limit) { + List results = Lists.newArrayListWithCapacity(limit); + while (results.size() < limit) { + if (iter.hasNext()) { + results.add(iter.next()); + } else { + break; + } + } + return results; + } + + public static void waitUntilJobIsRunning(ClusterClient client) { + Awaitility.await("job should be running") + .atMost(Duration.ofSeconds(30)) + .pollInterval(Duration.ofMillis(10)) + .untilAsserted(() -> assertThat(getRunningJobs(client)).isNotEmpty()); + } + + public static List getRunningJobs(ClusterClient client) throws Exception { + Collection statusMessages = client.listJobs().get(); + return statusMessages.stream() + .filter(status -> status.getJobState() == JobStatus.RUNNING) + .map(JobStatusMessage::getJobId) + .collect(Collectors.toList()); + } + + private static void assertThatIcebergEnumeratorMetricsExist() { + assertThatIcebergSourceMetricExists( + "enumerator", "coordinator.enumerator.elapsedSecondsSinceLastSplitDiscovery"); + assertThatIcebergSourceMetricExists("enumerator", "coordinator.enumerator.unassignedSplits"); + assertThatIcebergSourceMetricExists("enumerator", "coordinator.enumerator.pendingRecords"); + } + + private static void assertThatIcebergSourceMetricExists( + String metricGroupPattern, String metricName) { + Optional groups = METRIC_REPORTER.findGroup(metricGroupPattern); + assertThat(groups).isPresent(); + assertThat( + 
METRIC_REPORTER.getMetricsByGroup(groups.get()).keySet().stream() + .map(name -> groups.get().getMetricIdentifier(name))) + .satisfiesOnlyOnce( + fullMetricName -> assertThat(fullMetricName).containsSubsequence(metricName)); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceFailover.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceFailover.java new file mode 100644 index 000000000000..938ae4d9bb0a --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceFailover.java @@ -0,0 +1,394 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source; + +import static org.apache.iceberg.flink.SimpleDataUtil.tableRecords; +import static org.assertj.core.api.Assertions.assertThat; + +import java.nio.file.Path; +import java.time.Duration; +import java.util.List; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.atomic.AtomicInteger; +import org.apache.flink.api.common.JobID; +import org.apache.flink.api.common.eventtime.WatermarkStrategy; +import org.apache.flink.api.common.restartstrategy.RestartStrategies; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.client.program.ClusterClient; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.core.execution.JobClient; +import org.apache.flink.core.execution.SavepointFormatType; +import org.apache.flink.runtime.highavailability.nonha.embedded.HaLeadershipControl; +import org.apache.flink.runtime.jobgraph.SavepointConfigOptions; +import org.apache.flink.runtime.minicluster.MiniCluster; +import org.apache.flink.runtime.minicluster.RpcServiceSharing; +import org.apache.flink.runtime.testutils.MiniClusterResourceConfiguration; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.data.RowData; +import org.apache.flink.test.junit5.InjectClusterClient; +import org.apache.flink.test.junit5.MiniClusterExtension; +import org.apache.flink.test.util.MiniClusterWithClientResource; +import org.apache.flink.util.function.ThrowingConsumer; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.data.GenericAppenderHelper; +import org.apache.iceberg.data.RandomGenericData; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.FlinkConfigOptions; +import org.apache.iceberg.flink.FlinkReadOptions; +import org.apache.iceberg.flink.HadoopCatalogExtension; +import 
org.apache.iceberg.flink.SimpleDataUtil; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.flink.sink.FlinkSink; +import org.apache.iceberg.flink.source.assigner.SimpleSplitAssignerFactory; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.Timeout; +import org.junit.jupiter.api.extension.RegisterExtension; +import org.junit.jupiter.api.io.TempDir; + +@Timeout(value = 120) +public class TestIcebergSourceFailover { + + // Parallelism higher than 1, but lower than the number of splits used by some of our tests + // The goal is to allow some splits to remain in the enumerator when restoring the state + private static final int PARALLELISM = 2; + private static final int DO_NOT_FAIL = Integer.MAX_VALUE; + protected static final MiniClusterResourceConfiguration MINI_CLUSTER_RESOURCE_CONFIG = + new MiniClusterResourceConfiguration.Builder() + .setNumberTaskManagers(1) + .setNumberSlotsPerTaskManager(PARALLELISM) + .setRpcServiceSharing(RpcServiceSharing.DEDICATED) + .withHaLeadershipControl() + .build(); + + @RegisterExtension + public static final MiniClusterExtension MINI_CLUSTER_EXTENSION = + new MiniClusterExtension(MINI_CLUSTER_RESOURCE_CONFIG); + + @TempDir protected Path temporaryFolder; + + @RegisterExtension + protected static final HadoopCatalogExtension SOURCE_CATALOG_EXTENSION = + new HadoopCatalogExtension(TestFixtures.DATABASE, TestFixtures.TABLE); + + @RegisterExtension + protected static final HadoopCatalogExtension SINK_CATALOG_EXTENSION = + new HadoopCatalogExtension(TestFixtures.DATABASE, TestFixtures.SINK_TABLE); + + protected Table sourceTable; + protected Table sinkTable; + + @BeforeEach + protected void setupTable() { + this.sourceTable = + SOURCE_CATALOG_EXTENSION + .catalog() + .createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA); + this.sinkTable = + SINK_CATALOG_EXTENSION + .catalog() + .createTable(TestFixtures.SINK_TABLE_IDENTIFIER, TestFixtures.SCHEMA); + } + + protected IcebergSource.Builder sourceBuilder() { + Configuration config = new Configuration(); + return IcebergSource.forRowData() + .tableLoader(SOURCE_CATALOG_EXTENSION.tableLoader()) + .assignerFactory(new SimpleSplitAssignerFactory()) + // Prevent combining splits + .set( + FlinkReadOptions.SPLIT_FILE_OPEN_COST, + Long.toString(TableProperties.SPLIT_SIZE_DEFAULT)) + .flinkConfig(config); + } + + protected Schema schema() { + return TestFixtures.SCHEMA; + } + + protected List generateRecords(int numRecords, long seed) { + return RandomGenericData.generate(schema(), numRecords, seed); + } + + protected void assertRecords(Table table, List expectedRecords, Duration timeout) + throws Exception { + SimpleDataUtil.assertTableRecords(table, expectedRecords, timeout); + } + + @Disabled("Disabled for now as it is flaky on CI") + @Test + public void testBoundedWithSavepoint(@InjectClusterClient ClusterClient clusterClient) + throws Exception { + List expectedRecords = Lists.newArrayList(); + GenericAppenderHelper dataAppender = + new GenericAppenderHelper(sourceTable, FileFormat.PARQUET, temporaryFolder); + for (int i = 0; i < 4; ++i) { + List records = generateRecords(2, i); + expectedRecords.addAll(records); + dataAppender.appendToTable(records); + } + + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + createBoundedStreams(env, 2); + + JobClient jobClient = 
env.executeAsync("Bounded Iceberg Source Savepoint Test"); + JobID jobId = jobClient.getJobID(); + + // Write something, but do not finish before checkpoint is created + RecordCounterToWait.waitForCondition(); + CompletableFuture savepoint = + clusterClient.stopWithSavepoint( + jobId, false, temporaryFolder.toString(), SavepointFormatType.CANONICAL); + RecordCounterToWait.continueProcessing(); + + // Wait for the job to stop with the savepoint + String savepointPath = savepoint.get(); + + // We expect that at least a few records has written + assertThat(tableRecords(sinkTable)).hasSizeGreaterThan(0); + + // New env from the savepoint + Configuration conf = new Configuration(); + conf.set(SavepointConfigOptions.SAVEPOINT_PATH, savepointPath); + env = StreamExecutionEnvironment.getExecutionEnvironment(conf); + createBoundedStreams(env, DO_NOT_FAIL); + + env.execute("Bounded Iceberg Source Savepoint Test"); + + // We expect no duplications + assertRecords(sinkTable, expectedRecords, Duration.ofSeconds(120)); + } + + @Test + public void testBoundedWithTaskManagerFailover() throws Exception { + runTestWithNewMiniCluster( + miniCluster -> testBoundedIcebergSource(FailoverType.TM, miniCluster)); + } + + @Test + public void testBoundedWithJobManagerFailover() throws Exception { + runTestWithNewMiniCluster( + miniCluster -> testBoundedIcebergSource(FailoverType.JM, miniCluster)); + } + + private void testBoundedIcebergSource(FailoverType failoverType, MiniCluster miniCluster) + throws Exception { + List expectedRecords = Lists.newArrayList(); + GenericAppenderHelper dataAppender = + new GenericAppenderHelper(sourceTable, FileFormat.PARQUET, temporaryFolder); + for (int i = 0; i < 4; ++i) { + List records = generateRecords(2, i); + expectedRecords.addAll(records); + dataAppender.appendToTable(records); + } + + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + env.setRestartStrategy(RestartStrategies.fixedDelayRestart(1, 0)); + createBoundedStreams(env, 2); + + JobClient jobClient = env.executeAsync("Bounded Iceberg Source Failover Test"); + JobID jobId = jobClient.getJobID(); + + RecordCounterToWait.waitForCondition(); + triggerFailover(failoverType, jobId, RecordCounterToWait::continueProcessing, miniCluster); + + assertRecords(sinkTable, expectedRecords, Duration.ofSeconds(120)); + } + + @Test + public void testContinuousWithTaskManagerFailover() throws Exception { + runTestWithNewMiniCluster( + miniCluster -> testContinuousIcebergSource(FailoverType.TM, miniCluster)); + } + + @Test + public void testContinuousWithJobManagerFailover() throws Exception { + runTestWithNewMiniCluster( + miniCluster -> testContinuousIcebergSource(FailoverType.JM, miniCluster)); + } + + private void testContinuousIcebergSource(FailoverType failoverType, MiniCluster miniCluster) + throws Exception { + GenericAppenderHelper dataAppender = + new GenericAppenderHelper(sourceTable, FileFormat.PARQUET, temporaryFolder); + List expectedRecords = Lists.newArrayList(); + + List batch = generateRecords(2, 0); + expectedRecords.addAll(batch); + dataAppender.appendToTable(batch); + + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + env.setParallelism(PARALLELISM); + env.enableCheckpointing(10L); + Configuration config = new Configuration(); + config.setInteger(FlinkConfigOptions.SOURCE_READER_FETCH_BATCH_RECORD_COUNT, 128); + + DataStream stream = + env.fromSource( + sourceBuilder() + .streaming(true) + .monitorInterval(Duration.ofMillis(10)) + 
.streamingStartingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) + .build(), + WatermarkStrategy.noWatermarks(), + "IcebergSource", + TypeInformation.of(RowData.class)); + + // CollectStreamSink from DataStream#executeAndCollect() doesn't guarantee + // exactly-once behavior. When Iceberg sink, we can verify end-to-end + // exactly-once. Here we mainly about source exactly-once behavior. + FlinkSink.forRowData(stream) + .table(sinkTable) + .tableLoader(SINK_CATALOG_EXTENSION.tableLoader()) + .append(); + + JobClient jobClient = env.executeAsync("Continuous Iceberg Source Failover Test"); + JobID jobId = jobClient.getJobID(); + + for (int i = 1; i < 5; i++) { + Thread.sleep(10); + List records = generateRecords(2, i); + expectedRecords.addAll(records); + dataAppender.appendToTable(records); + if (i == 2) { + triggerFailover(failoverType, jobId, () -> {}, miniCluster); + } + } + + // wait longer for continuous source to reduce flakiness + // because CI servers tend to be overloaded. + assertRecords(sinkTable, expectedRecords, Duration.ofSeconds(120)); + } + + private void createBoundedStreams(StreamExecutionEnvironment env, int failAfter) { + env.setParallelism(PARALLELISM); + + DataStream stream = + env.fromSource( + sourceBuilder().build(), + WatermarkStrategy.noWatermarks(), + "IcebergSource", + TypeInformation.of(RowData.class)); + + DataStream streamFailingInTheMiddleOfReading = + RecordCounterToWait.wrapWithFailureAfter(stream, failAfter); + + // CollectStreamSink from DataStream#executeAndCollect() doesn't guarantee + // exactly-once behavior. When Iceberg sink, we can verify end-to-end + // exactly-once. Here we mainly about source exactly-once behavior. + FlinkSink.forRowData(streamFailingInTheMiddleOfReading) + .table(sinkTable) + .tableLoader(SINK_CATALOG_EXTENSION.tableLoader()) + .append(); + } + + // ------------------------------------------------------------------------ + // test utilities copied from Flink's FileSourceTextLinesITCase + // ------------------------------------------------------------------------ + + private static void runTestWithNewMiniCluster(ThrowingConsumer testMethod) + throws Exception { + MiniClusterWithClientResource miniCluster = null; + try { + miniCluster = new MiniClusterWithClientResource(MINI_CLUSTER_RESOURCE_CONFIG); + miniCluster.before(); + testMethod.accept(miniCluster.getMiniCluster()); + } finally { + if (miniCluster != null) { + miniCluster.after(); + } + } + } + + private enum FailoverType { + NONE, + TM, + JM + } + + private static void triggerFailover( + FailoverType type, JobID jobId, Runnable afterFailAction, MiniCluster miniCluster) + throws Exception { + switch (type) { + case NONE: + afterFailAction.run(); + break; + case TM: + restartTaskManager(afterFailAction, miniCluster); + break; + case JM: + triggerJobManagerFailover(jobId, afterFailAction, miniCluster); + break; + } + } + + private static void triggerJobManagerFailover( + JobID jobId, Runnable afterFailAction, MiniCluster miniCluster) throws Exception { + HaLeadershipControl haLeadershipControl = miniCluster.getHaLeadershipControl().get(); + haLeadershipControl.revokeJobMasterLeadership(jobId).get(); + afterFailAction.run(); + haLeadershipControl.grantJobMasterLeadership(jobId).get(); + } + + private static void restartTaskManager(Runnable afterFailAction, MiniCluster miniCluster) + throws Exception { + miniCluster.terminateTaskManager(0).get(); + afterFailAction.run(); + miniCluster.startTaskManager(); + } + + private static class RecordCounterToWait { + + 
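+    // Wraps a stream in a map() that counts records: once the count exceeds the given
+    // threshold, the latch is counted down (releasing waitForCondition() in the test) and the
+    // mapper blocks on the future until continueProcessing() is called, giving the test a
+    // window in which to trigger the failover or savepoint.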
private static AtomicInteger records; + private static CountDownLatch countDownLatch; + private static CompletableFuture continueProcessing; + + private static DataStream wrapWithFailureAfter(DataStream stream, int condition) { + + records = new AtomicInteger(); + continueProcessing = new CompletableFuture<>(); + countDownLatch = new CountDownLatch(stream.getParallelism()); + return stream.map( + record -> { + boolean reachedFailPoint = records.incrementAndGet() > condition; + boolean notFailedYet = countDownLatch.getCount() != 0; + if (notFailedYet && reachedFailPoint) { + countDownLatch.countDown(); + continueProcessing.get(); + } + return record; + }); + } + + private static void waitForCondition() throws InterruptedException { + countDownLatch.await(); + } + + private static void continueProcessing() { + continueProcessing.complete(null); + } + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceFailoverWithWatermarkExtractor.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceFailoverWithWatermarkExtractor.java new file mode 100644 index 000000000000..4f61d2f7308a --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceFailoverWithWatermarkExtractor.java @@ -0,0 +1,130 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source; + +import java.time.Duration; +import java.time.Instant; +import java.time.LocalDateTime; +import java.time.ZoneId; +import java.time.ZoneOffset; +import java.util.List; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicLong; +import java.util.stream.Collectors; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.table.data.RowData; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.data.RandomGenericData; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.FlinkReadOptions; +import org.apache.iceberg.flink.SimpleDataUtil; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.types.Comparators; +import org.apache.iceberg.util.StructLikeWrapper; +import org.awaitility.Awaitility; +import org.junit.jupiter.api.BeforeEach; + +public class TestIcebergSourceFailoverWithWatermarkExtractor extends TestIcebergSourceFailover { + // Increment ts by 15 minutes for each generateRecords batch + private static final long RECORD_BATCH_TS_INCREMENT_MILLI = TimeUnit.MINUTES.toMillis(15); + // Within a batch, increment ts by 1 second + private static final long RECORD_TS_INCREMENT_MILLI = TimeUnit.SECONDS.toMillis(1); + + private final AtomicLong tsMilli = new AtomicLong(System.currentTimeMillis()); + + @Override + @BeforeEach + protected void setupTable() { + this.sourceTable = + SOURCE_CATALOG_EXTENSION + .catalog() + .createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.TS_SCHEMA); + this.sinkTable = + SINK_CATALOG_EXTENSION + .catalog() + .createTable(TestFixtures.SINK_TABLE_IDENTIFIER, TestFixtures.TS_SCHEMA); + } + + @Override + protected IcebergSource.Builder sourceBuilder() { + Configuration config = new Configuration(); + return IcebergSource.forRowData() + .tableLoader(SOURCE_CATALOG_EXTENSION.tableLoader()) + .watermarkColumn("ts") + .project(TestFixtures.TS_SCHEMA) + // Prevent combining splits + .set( + FlinkReadOptions.SPLIT_FILE_OPEN_COST, + Long.toString(TableProperties.SPLIT_SIZE_DEFAULT)) + .flinkConfig(config); + } + + @Override + protected Schema schema() { + return TestFixtures.TS_SCHEMA; + } + + @Override + protected List generateRecords(int numRecords, long seed) { + // Override the ts field to create a more realistic situation for event time alignment + tsMilli.addAndGet(RECORD_BATCH_TS_INCREMENT_MILLI); + return RandomGenericData.generate(schema(), numRecords, seed).stream() + .peek( + record -> { + LocalDateTime ts = + LocalDateTime.ofInstant( + Instant.ofEpochMilli(tsMilli.addAndGet(RECORD_TS_INCREMENT_MILLI)), + ZoneId.of("Z")); + record.setField("ts", ts); + }) + .collect(Collectors.toList()); + } + + /** + * This override is needed because {@link Comparators} used by {@link StructLikeWrapper} retrieves + * Timestamp type using Long type as inner class, while the {@link RandomGenericData} generates + * {@link LocalDateTime} for {@code TimestampType.withoutZone()}. This method normalizes the + * {@link LocalDateTime} to a Long type so that Comparators can continue to work. 
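+   * <p>The {@code convertLocalDateTimeToMilli} helper below applies this normalization to both
+   * the expected and the actual records before comparing them.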
+ */ + @Override + protected void assertRecords(Table table, List expectedRecords, Duration timeout) + throws Exception { + List expectedNormalized = convertLocalDateTimeToMilli(expectedRecords); + Awaitility.await("expected list of records should be produced") + .atMost(timeout) + .untilAsserted( + () -> + SimpleDataUtil.assertRecordsEqual( + expectedNormalized, + convertLocalDateTimeToMilli(SimpleDataUtil.tableRecords(table)), + table.schema())); + } + + private List convertLocalDateTimeToMilli(List records) { + return records.stream() + .peek( + r -> { + LocalDateTime localDateTime = ((LocalDateTime) r.getField("ts")); + r.setField("ts", localDateTime.atZone(ZoneOffset.UTC).toInstant().toEpochMilli()); + }) + .collect(Collectors.toList()); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceReaderDeletes.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceReaderDeletes.java new file mode 100644 index 000000000000..df148c212ebd --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceReaderDeletes.java @@ -0,0 +1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source; + +import java.io.IOException; +import java.util.List; +import java.util.Map; +import org.apache.flink.api.common.eventtime.WatermarkStrategy; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.test.junit5.MiniClusterExtension; +import org.apache.flink.util.CloseableIterator; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.iceberg.CatalogProperties; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Table; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.flink.CatalogLoader; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.MiniFlinkClusterExtension; +import org.apache.iceberg.flink.RowDataWrapper; +import org.apache.iceberg.flink.TableLoader; +import org.apache.iceberg.flink.source.assigner.SimpleSplitAssignerFactory; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.util.StructLikeSet; +import org.junit.jupiter.api.extension.RegisterExtension; + +public class TestIcebergSourceReaderDeletes extends TestFlinkReaderDeletesBase { + + private static final int PARALLELISM = 4; + + @RegisterExtension + private static final MiniClusterExtension MINI_CLUSTER = + MiniFlinkClusterExtension.createWithClassloaderCheckDisabled(); + + @Override + protected StructLikeSet rowSet(String tableName, Table testTable, String... columns) + throws IOException { + Schema projected = testTable.schema().select(columns); + RowType rowType = FlinkSchemaUtil.convert(projected); + + Map properties = Maps.newHashMap(); + properties.put( + CatalogProperties.WAREHOUSE_LOCATION, + hiveConf.get(HiveConf.ConfVars.METASTOREWAREHOUSE.varname)); + properties.put(CatalogProperties.URI, hiveConf.get(HiveConf.ConfVars.METASTOREURIS.varname)); + properties.put( + CatalogProperties.CLIENT_POOL_SIZE, + Integer.toString(hiveConf.getInt("iceberg.hive.client-pool-size", 5))); + CatalogLoader hiveCatalogLoader = CatalogLoader.hive(catalog.name(), hiveConf, properties); + TableLoader hiveTableLoader = + TableLoader.fromCatalog(hiveCatalogLoader, TableIdentifier.of("default", tableName)); + hiveTableLoader.open(); + try (TableLoader tableLoader = hiveTableLoader) { + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + env.setParallelism(1); + DataStream stream = + env.fromSource( + IcebergSource.builder() + .tableLoader(tableLoader) + .assignerFactory(new SimpleSplitAssignerFactory()) + .project(projected) + .build(), + WatermarkStrategy.noWatermarks(), + "testBasicRead", + TypeInformation.of(RowData.class)); + + try (CloseableIterator iter = stream.executeAndCollect()) { + List rowDataList = Lists.newArrayList(iter); + StructLikeSet set = StructLikeSet.create(projected.asStruct()); + rowDataList.forEach( + rowData -> { + RowDataWrapper wrapper = new RowDataWrapper(rowType, projected.asStruct()); + set.add(wrapper.wrap(rowData)); + }); + return set; + } catch (Exception e) { + throw new IOException("Failed to collect result", e); + } + } + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceSql.java 
b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceSql.java new file mode 100644 index 000000000000..75f0a785a8c5 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceSql.java @@ -0,0 +1,158 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source; + +import static org.apache.iceberg.types.Types.NestedField.required; + +import java.io.IOException; +import java.time.Instant; +import java.time.ZoneId; +import java.util.List; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.table.api.TableEnvironment; +import org.apache.flink.table.api.config.TableConfigOptions; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Table; +import org.apache.iceberg.data.GenericAppenderHelper; +import org.apache.iceberg.data.GenericRecord; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.FlinkConfigOptions; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.flink.TestHelpers; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.Test; + +/** Use the IcebergSource (FLIP-27) */ +public class TestIcebergSourceSql extends TestSqlBase { + private static final Schema SCHEMA_TS = + new Schema( + required(1, "t1", Types.TimestampType.withoutZone()), + required(2, "t2", Types.LongType.get())); + + @Override + public void before() throws IOException { + TableEnvironment tableEnvironment = getTableEnv(); + Configuration tableConf = tableEnvironment.getConfig().getConfiguration(); + tableConf.setBoolean(FlinkConfigOptions.TABLE_EXEC_ICEBERG_USE_FLIP27_SOURCE.key(), true); + + tableEnvironment.getConfig().set("table.exec.resource.default-parallelism", "1"); + SqlHelpers.sql( + tableEnvironment, + "create catalog iceberg_catalog with ('type'='iceberg', 'catalog-type'='hadoop', 'warehouse'='%s')", + CATALOG_EXTENSION.warehouse()); + SqlHelpers.sql(tableEnvironment, "use catalog iceberg_catalog"); + + tableConf.set(TableConfigOptions.TABLE_DYNAMIC_TABLE_OPTIONS_ENABLED, true); + } + + private Record generateRecord(Instant t1, long t2) { + Record record = GenericRecord.create(SCHEMA_TS); + record.setField("t1", t1.atZone(ZoneId.systemDefault()).toLocalDateTime()); + record.setField("t2", t2); + return record; + } + + /** Generates the records in the expected order, with respect to their datafile */ + private List generateExpectedRecords(boolean ascending) throws Exception { + Table table = 
CATALOG_EXTENSION.catalog().createTable(TestFixtures.TABLE_IDENTIFIER, SCHEMA_TS); + long baseTime = 1702382109000L; + + GenericAppenderHelper helper = + new GenericAppenderHelper(table, FileFormat.PARQUET, temporaryFolder); + + Record file1Record1 = + generateRecord(Instant.ofEpochMilli(baseTime), baseTime + (1000 * 60 * 60 * 24 * 30L)); + Record file1Record2 = + generateRecord( + Instant.ofEpochMilli(baseTime - 10 * 1000L), baseTime + (1000 * 60 * 60 * 24 * 35L)); + + List recordsDataFile1 = Lists.newArrayList(); + recordsDataFile1.add(file1Record1); + recordsDataFile1.add(file1Record2); + DataFile dataFile1 = helper.writeFile(recordsDataFile1); + + Record file2Record1 = + generateRecord( + Instant.ofEpochMilli(baseTime + 14 * 1000L), baseTime - (1000 * 60 * 60 * 24 * 30L)); + Record file2Record2 = + generateRecord( + Instant.ofEpochMilli(baseTime + 12 * 1000L), baseTime - (1000 * 60 * 61 * 24 * 35L)); + + List recordsDataFile2 = Lists.newArrayList(); + recordsDataFile2.add(file2Record1); + recordsDataFile2.add(file2Record2); + + DataFile dataFile2 = helper.writeFile(recordsDataFile2); + helper.appendToTable(dataFile1, dataFile2); + + // Expected records if the splits are ordered + // - ascending (watermark from t1) - records from the split with early timestamps, then + // records from the split with late timestamps + // - descending (watermark from t2) - records from the split with old longs, then records + // from the split with new longs + List expected = Lists.newArrayList(); + if (ascending) { + expected.addAll(recordsDataFile1); + expected.addAll(recordsDataFile2); + } else { + expected.addAll(recordsDataFile2); + expected.addAll(recordsDataFile1); + } + return expected; + } + + /** Tests the order of splits returned when setting the watermark-column options */ + @Test + public void testWatermarkOptionsAscending() throws Exception { + List expected = generateExpectedRecords(true); + TestHelpers.assertRecordsWithOrder( + run( + ImmutableMap.of("watermark-column", "t1", "split-file-open-cost", "128000000"), + "", + "*"), + expected, + SCHEMA_TS); + } + + /** + * Tests the order of splits returned when setting the watermark-column and + * watermark-column-time-unit" options + */ + @Test + public void testWatermarkOptionsDescending() throws Exception { + List expected = generateExpectedRecords(false); + TestHelpers.assertRecordsWithOrder( + run( + ImmutableMap.of( + "watermark-column", + "t2", + "watermark-column-time-unit", + "MILLISECONDS", + "split-file-open-cost", + "128000000"), + "", + "*"), + expected, + SCHEMA_TS); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceWithWatermarkExtractor.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceWithWatermarkExtractor.java new file mode 100644 index 000000000000..70889f4f76aa --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceWithWatermarkExtractor.java @@ -0,0 +1,408 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source; + +import static org.apache.iceberg.flink.MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG; + +import java.io.Serializable; +import java.nio.file.Path; +import java.time.Duration; +import java.time.Instant; +import java.time.LocalDateTime; +import java.time.ZoneId; +import java.util.List; +import java.util.Optional; +import java.util.concurrent.ConcurrentMap; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; +import org.apache.flink.api.common.JobID; +import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner; +import org.apache.flink.api.common.eventtime.WatermarkStrategy; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.core.execution.JobClient; +import org.apache.flink.metrics.Gauge; +import org.apache.flink.runtime.metrics.MetricNames; +import org.apache.flink.runtime.minicluster.MiniCluster; +import org.apache.flink.runtime.minicluster.RpcServiceSharing; +import org.apache.flink.runtime.testutils.CommonTestUtils; +import org.apache.flink.runtime.testutils.InMemoryReporter; +import org.apache.flink.runtime.testutils.MiniClusterResourceConfiguration; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.streaming.api.functions.windowing.AllWindowFunction; +import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows; +import org.apache.flink.streaming.api.windowing.time.Time; +import org.apache.flink.streaming.api.windowing.windows.TimeWindow; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.test.junit5.InjectMiniCluster; +import org.apache.flink.test.junit5.MiniClusterExtension; +import org.apache.flink.util.CloseableIterator; +import org.apache.flink.util.Collector; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.data.GenericAppenderHelper; +import org.apache.iceberg.data.GenericRecord; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.HadoopTableExtension; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.awaitility.Awaitility; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.RegisterExtension; +import org.junit.jupiter.api.io.TempDir; + +public class TestIcebergSourceWithWatermarkExtractor implements Serializable { + private static final int PARALLELISM = 4; + private static final String SOURCE_NAME = "IcebergSource"; + private static final int RECORD_NUM_FOR_2_SPLITS = 200; + private static final ConcurrentMap WINDOWS = Maps.newConcurrentMap(); + + @TempDir protected Path temporaryFolder; + + private static final InMemoryReporter REPORTER = 
InMemoryReporter.createWithRetainedMetrics(); + + @RegisterExtension + public static final MiniClusterExtension MINI_CLUSTER_EXTENSION = + new MiniClusterExtension( + new MiniClusterResourceConfiguration.Builder() + .setNumberTaskManagers(1) + .setNumberSlotsPerTaskManager(PARALLELISM) + .setRpcServiceSharing(RpcServiceSharing.DEDICATED) + .setConfiguration(REPORTER.addToConfiguration(DISABLE_CLASSLOADER_CHECK_CONFIG)) + .withHaLeadershipControl() + .build()); + + @RegisterExtension + private static final HadoopTableExtension TABLE_EXTENSION = + new HadoopTableExtension(TestFixtures.DATABASE, TestFixtures.TABLE, TestFixtures.TS_SCHEMA); + + /** + * This is an integration test for watermark handling and windowing. Integration testing the + * following features: + * + *
+   * <ul>
+   *   <li>Ordering of the splits
+   *   <li>Emitting of watermarks
+   *   <li>Firing windows based on watermarks
+   * </ul>
+   *
+   * <p>The test generates 4 splits:
+   *
+   * <ul>
+   *   <li>Split 1 - Watermark 100 min
+   *   <li>Split 2, 3 - Watermark 0 min
+   *   <li>Split 4 - Watermark 6 min
+   * </ul>
+   *
+   * <p>Creates a source with a 5 minute tumbling window and parallelism 1 (to prevent concurrency
+   * issues).
+   *
+   * <p>Checks that windows are handled correctly based on the emitted watermarks, and that splits
+   * are read in the following order:
+   *
+   * <ul>
+   *   <li>Split 2, 3
+   *   <li>Split 4
+   *   <li>Split 1
+   * </ul>
+   *
+   * <p>As a result, the window aggregator emits records based on the Split 2-3 and Split 4 data.
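+   *
+   * <p>Concretely, the assertions below expect the closed windows to contain: the 0 minute window
+   * with the {@code RECORD_NUM_FOR_2_SPLITS} (200) records from File 2, the 5 minute window with
+   * the 2 records from File 3, and the 100 minute window with the 3 records from File 1.
+   *
+   * <p>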

    Add 2 more splits, so the task manager close the windows for the original 4 splits and emit + * the appropriate aggregated records. + */ + @Test + public void testWindowing() throws Exception { + GenericAppenderHelper dataAppender = appender(); + List expectedRecords = Lists.newArrayList(); + + // Generate records with the following pattern: + // - File 1 - Later records (Watermark 6000000) + // - Split 1 - 2 records (100, "file_1-recordTs_100"), (103, "file_1-recordTs_103") + // - File 2 - First records (Watermark 0) + // - Split 1 - 100 records (0, "file_2-recordTs_0"), (1, "file_2-recordTs_1"),... + // - Split 2 - 100 records (0, "file_2-recordTs_0"), (1, "file_2-recordTs_1"),... + // - File 3 - Parallel write for the first records (Watermark 360000) + // - Split 1 - 2 records (6, "file_3-recordTs_6"), (7, "file_3-recordTs_7") + List batch = + ImmutableList.of( + generateRecord(100, "file_1-recordTs_100"), + generateRecord(101, "file_1-recordTs_101"), + generateRecord(103, "file_1-recordTs_103")); + expectedRecords.addAll(batch); + dataAppender.appendToTable(batch); + + batch = Lists.newArrayListWithCapacity(100); + for (int i = 0; i < RECORD_NUM_FOR_2_SPLITS; ++i) { + // Generate records where the timestamps are out of order, but still between 0-5 minutes + batch.add(generateRecord(4 - i % 5, "file_2-recordTs_" + i)); + } + expectedRecords.addAll(batch); + dataAppender.appendToTable(batch); + + batch = + ImmutableList.of( + generateRecord(6, "file_3-recordTs_6"), generateRecord(7, "file_3-recordTs_7")); + expectedRecords.addAll(batch); + dataAppender.appendToTable(batch); + + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + env.setParallelism(1); + + DataStream stream = + env.fromSource( + source(), + WatermarkStrategy.noWatermarks() + .withTimestampAssigner(new RowDataTimestampAssigner()), + SOURCE_NAME, + TypeInformation.of(RowData.class)); + + stream + .windowAll(TumblingEventTimeWindows.of(Time.minutes(5))) + .apply( + new AllWindowFunction() { + @Override + public void apply( + TimeWindow window, Iterable values, Collector out) { + // Emit RowData which contains the window start time, and the record count in + // that window + AtomicInteger count = new AtomicInteger(0); + values.forEach(a -> count.incrementAndGet()); + out.collect(row(window.getStart(), count.get())); + WINDOWS.put(window.getStart(), count.get()); + } + }); + + // Use static variable to collect the windows, since other solutions were flaky + WINDOWS.clear(); + env.executeAsync("Iceberg Source Windowing Test"); + + // Wait for the 2 first windows from File 2 and File 3 + Awaitility.await() + .pollInterval(Duration.ofMillis(10)) + .atMost(30, TimeUnit.SECONDS) + .until( + () -> + WINDOWS.equals( + ImmutableMap.of(0L, RECORD_NUM_FOR_2_SPLITS, TimeUnit.MINUTES.toMillis(5), 2))); + + // Write data so the windows containing test data are closed + dataAppender.appendToTable( + dataAppender.writeFile(ImmutableList.of(generateRecord(1500, "last-record")))); + + // Wait for last test record window from File 1 + Awaitility.await() + .pollInterval(Duration.ofMillis(10)) + .atMost(30, TimeUnit.SECONDS) + .until( + () -> + WINDOWS.equals( + ImmutableMap.of( + 0L, + RECORD_NUM_FOR_2_SPLITS, + TimeUnit.MINUTES.toMillis(5), + 2, + TimeUnit.MINUTES.toMillis(100), + 3))); + } + + /** + * This is an integration test for watermark handling and throttling. Integration testing the + * following: + * + *

+   * <ul>
+   *   <li>Emitting of watermarks
+   *   <li>Watermark alignment
+   * </ul>
+   *
+   * <p>The test generates 3 splits:
+   *
+   * <ul>
+   *   <li>Split 1 - Watermark 100 min
+   *   <li>Split 2, 3 - Watermark 0 min
+   * </ul>
+   *
+   * <p>The splits are read in the following order:
+   *
+   * <ul>
+   *   <li>Split 2, 3 (Task Manager 1, Task Manager 2)
+   *   <li>Split 1 (Task Manager 1 or Task Manager 2, depending on scheduling)
+   * </ul>
+   *
+   * <p>Reading Split 1 causes the watermark alignment to pause reading for that task manager.
+   *
+   * <p>The status of the watermark alignment is checked via the alignment-related metrics.
+   *
+   * <p>Adding new records with old timestamps to the table allows the running reader to continue
+   * reading its files, but the watermark alignment still prevents the paused reader from
+   * continuing.
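+   *
+   * <p>As a rough sketch of the expected drift metric (the same arithmetic appears as inline
+   * comments in the test body): with a 20 minute allowed drift, the reader that picked up Split 1
+   * first reports about 100 min - 20 min - 0 min = 80 min of drift, then
+   * 100 min - 20 min - 15 min = 65 min once the 15 minute records are read, and finally drops
+   * below 20 min after the 90 minute records arrive.
+   *
+   * <p>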

    After adding some records with new timestamps the blocked reader is un-paused, and both ot + * the readers continue reading. + */ + @Test + public void testThrottling(@InjectMiniCluster MiniCluster miniCluster) throws Exception { + GenericAppenderHelper dataAppender = appender(); + + // Generate records in advance + + // File 1 - Later records (Watermark 6.000.000 - 100 min) + // - Split 1 - 2 records (100, "file_1-recordTs_100"), (103, "file_1-recordTs_103") + List batch1 = + ImmutableList.of( + generateRecord(100, "file_1-recordTs_100"), generateRecord(103, "file_1-recordTs_103")); + + // File 2 - First records (Watermark 0 - 0 min) + // - Split 1 - 100 records (0, "file_2-recordTs_0"), (1, "file_2-recordTs_1"),... + // - Split 2 - 100 records (0, "file_2-recordTs_0"), (1, "file_2-recordTs_1"),... + List batch2 = Lists.newArrayListWithCapacity(100); + for (int i = 0; i < RECORD_NUM_FOR_2_SPLITS; ++i) { + batch2.add(generateRecord(4 - i % 5, "file_2-recordTs_" + i)); + } + + // File 3 - Some records will be blocked (Watermark 900.000 - 15 min) + List batch3 = + ImmutableList.of( + generateRecord(15, "file_3-recordTs_15"), + generateRecord(16, "file_3-recordTs_16"), + generateRecord(17, "file_3-recordTs_17")); + + // File 4 - Some records will be blocked (Watermark 900.000 - 15 min) + List batch4 = + ImmutableList.of( + generateRecord(15, "file_4-recordTs_15"), + generateRecord(16, "file_4-recordTs_16"), + generateRecord(17, "file_4-recordTs_17")); + + // File 5 - Records which will remove the block (Watermark 5.400.000 - 90 min) + List batch5 = + ImmutableList.of( + generateRecord(90, "file_5-recordTs_90"), generateRecord(91, "file_5-recordTs_91")); + + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + env.setParallelism(2); + + DataStream stream = + env.fromSource( + source(), + WatermarkStrategy.noWatermarks() + .withWatermarkAlignment("iceberg", Duration.ofMinutes(20), Duration.ofMillis(10)), + SOURCE_NAME, + TypeInformation.of(RowData.class)); + + try (CloseableIterator resultIterator = stream.collectAsync()) { + JobClient jobClient = env.executeAsync("Iceberg Source Throttling Test"); + CommonTestUtils.waitForAllTaskRunning(miniCluster, jobClient.getJobID(), false); + + // Insert the first data into the table + dataAppender.appendToTable(dataAppender.writeFile(batch1), dataAppender.writeFile(batch2)); + + // Get the drift metric, wait for it to be created and reach the expected state + // (100 min - 20 min - 0 min) + // Also this validates that the WatermarkAlignment is working + Awaitility.await() + .pollInterval(Duration.ofMillis(10)) + .atMost(30, TimeUnit.SECONDS) + .until( + () -> + findAlignmentDriftMetric(jobClient.getJobID(), TimeUnit.MINUTES.toMillis(80)) + .isPresent()); + Gauge drift = + findAlignmentDriftMetric(jobClient.getJobID(), TimeUnit.MINUTES.toMillis(80)).get(); + + // Add some old records with 2 splits, so even if the blocked gets one split, the other reader + // one gets one as well + dataAppender.appendToTable(dataAppender.writeFile(batch3), dataAppender.writeFile(batch4)); + + // Get the drift metric, wait for it to be created and reach the expected state (100 min - 20 + // min - 15 min) + Awaitility.await() + .pollInterval(Duration.ofMillis(10)) + .atMost(30, TimeUnit.SECONDS) + .until(() -> drift.getValue() == TimeUnit.MINUTES.toMillis(65)); + + // Add some new records which should unblock the throttled reader + dataAppender.appendToTable(batch5); + + // Wait for the new drift to decrease below the allowed drift to 
signal the normal state + Awaitility.await() + .pollInterval(Duration.ofMillis(10)) + .atMost(30, TimeUnit.SECONDS) + .until(() -> drift.getValue() < TimeUnit.MINUTES.toMillis(20)); + } + } + + protected IcebergSource source() { + return IcebergSource.builder() + .tableLoader(TABLE_EXTENSION.tableLoader()) + .watermarkColumn("ts") + .project(TestFixtures.TS_SCHEMA) + .splitSize(100L) + .streaming(true) + .monitorInterval(Duration.ofMillis(10)) + .streamingStartingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) + .build(); + } + + protected Record generateRecord(int minutes, String str) { + // Override the ts field to create a more realistic situation for event time alignment + Record record = GenericRecord.create(TestFixtures.TS_SCHEMA); + LocalDateTime ts = + LocalDateTime.ofInstant( + Instant.ofEpochMilli(Time.of(minutes, TimeUnit.MINUTES).toMilliseconds()), + ZoneId.of("Z")); + record.setField("ts", ts); + record.setField("str", str); + return record; + } + + private Optional> findAlignmentDriftMetric(JobID jobID, long withValue) { + String metricsName = SOURCE_NAME + ".*" + MetricNames.WATERMARK_ALIGNMENT_DRIFT; + return REPORTER.findMetrics(jobID, metricsName).values().stream() + .map(m -> (Gauge) m) + .filter(m -> m.getValue() == withValue) + .findFirst(); + } + + private GenericAppenderHelper appender() { + // We need to create multiple splits, so we need to generate parquet files with multiple offsets + org.apache.hadoop.conf.Configuration hadoopConf = new org.apache.hadoop.conf.Configuration(); + hadoopConf.set("write.parquet.page-size-bytes", "64"); + hadoopConf.set("write.parquet.row-group-size-bytes", "64"); + return new GenericAppenderHelper( + TABLE_EXTENSION.table(), FileFormat.PARQUET, temporaryFolder, hadoopConf); + } + + private static RowData row(long time, long count) { + GenericRowData result = new GenericRowData(2); + result.setField(0, time); + result.setField(1, String.valueOf(count)); + return result; + } + + private static class RowDataTimestampAssigner implements SerializableTimestampAssigner { + @Override + public long extractTimestamp(RowData element, long recordTimestamp) { + return element.getTimestamp(0, 0).getMillisecond(); + } + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSpeculativeExecutionSupport.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSpeculativeExecutionSupport.java new file mode 100644 index 000000000000..41b023b93617 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSpeculativeExecutionSupport.java @@ -0,0 +1,184 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.IOException; +import java.nio.file.Files; +import java.time.Duration; +import java.util.Arrays; +import java.util.List; +import java.util.stream.Collectors; +import org.apache.flink.api.common.RuntimeExecutionMode; +import org.apache.flink.api.common.functions.RichMapFunction; +import org.apache.flink.api.common.typeinfo.Types; +import org.apache.flink.configuration.BatchExecutionOptions; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.configuration.CoreOptions; +import org.apache.flink.configuration.JobManagerOptions; +import org.apache.flink.configuration.RestOptions; +import org.apache.flink.configuration.SlowTaskDetectorOptions; +import org.apache.flink.runtime.testutils.MiniClusterResourceConfiguration; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.Table; +import org.apache.flink.table.api.TableEnvironment; +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; +import org.apache.flink.test.junit5.MiniClusterExtension; +import org.apache.flink.types.Row; +import org.apache.iceberg.flink.FlinkConfigOptions; +import org.apache.iceberg.flink.TestBase; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.RegisterExtension; + +public class TestIcebergSpeculativeExecutionSupport extends TestBase { + private static final int NUM_TASK_MANAGERS = 1; + private static final int NUM_TASK_SLOTS = 3; + + @RegisterExtension + public static final MiniClusterExtension MINI_CLUSTER_EXTENSION = + new MiniClusterExtension( + new MiniClusterResourceConfiguration.Builder() + .setNumberTaskManagers(NUM_TASK_MANAGERS) + .setNumberSlotsPerTaskManager(NUM_TASK_SLOTS) + .setConfiguration(configure()) + .build()); + + private StreamTableEnvironment tEnv; + private static final String CATALOG_NAME = "test_catalog"; + private static final String DATABASE_NAME = "test_db"; + private static final String INPUT_TABLE_NAME = "test_table"; + private static final String OUTPUT_TABLE_NAME = "sink_table"; + + @Override + protected TableEnvironment getTableEnv() { + if (tEnv == null) { + synchronized (this) { + StreamExecutionEnvironment env = + StreamExecutionEnvironment.getExecutionEnvironment(configure()); + env.setRuntimeMode(RuntimeExecutionMode.BATCH); + tEnv = StreamTableEnvironment.create(env); + } + } + + return tEnv; + } + + @BeforeEach + public void before() throws IOException { + String warehouse = + String.format("file:%s", Files.createTempDirectory(temporaryDirectory, "junit").toString()); + sql( + "CREATE CATALOG %s WITH ('type'='iceberg', 'catalog-type'='hadoop', 'warehouse'='%s')", + CATALOG_NAME, warehouse); + sql("USE CATALOG %s", CATALOG_NAME); + sql("CREATE DATABASE %s", DATABASE_NAME); + sql("USE %s", DATABASE_NAME); + + sql("CREATE TABLE %s (i INT, j INT)", INPUT_TABLE_NAME); + sql("INSERT INTO %s VALUES (1, -1),(2, -1),(3, -1)", INPUT_TABLE_NAME); + sql("CREATE TABLE %s (i INT, j INT, subTask INT, attempt INT)", OUTPUT_TABLE_NAME); + } + + @AfterEach + public void after() { + sql("DROP TABLE IF EXISTS %s.%s", DATABASE_NAME, INPUT_TABLE_NAME); + sql("DROP TABLE IF EXISTS %s.%s", DATABASE_NAME, OUTPUT_TABLE_NAME); + dropDatabase(DATABASE_NAME, true); + dropCatalog(CATALOG_NAME, true); + } + + @Test + 
public void testSpeculativeExecution() throws Exception { + Table table = + tEnv.sqlQuery(String.format("SELECT * FROM %s.%s", DATABASE_NAME, INPUT_TABLE_NAME)); + DataStream slowStream = + tEnv.toDataStream(table, Row.class) + .map(new TestingMap()) + .name("test_map") + .returns( + Types.ROW_NAMED( + new String[] {"i", "j", "subTask", "attempt"}, + Types.INT, + Types.INT, + Types.INT, + Types.INT)) + .setParallelism(NUM_TASK_SLOTS); + + tEnv.fromDataStream(slowStream) + .executeInsert(String.format("%s.%s", DATABASE_NAME, OUTPUT_TABLE_NAME)) + .await(); + + List output = sql(String.format("SELECT * FROM %s.%s", DATABASE_NAME, OUTPUT_TABLE_NAME)); + + // Ensure that all subTasks has attemptNum > 0 + assertThat(output.stream().map(x -> x.getField(3)).collect(Collectors.toSet())).contains(1); + + // Ensure the test_table rows are returned exactly the same after the slow map task from the + // sink_table + assertSameElements( + output.stream().map(x -> Row.of(x.getField(0), x.getField(1))).collect(Collectors.toList()), + Arrays.asList(Row.of(1, -1), Row.of(2, -1), Row.of(3, -1))); + } + + /** A testing map function that simulates the slow task. */ + private static class TestingMap extends RichMapFunction { + @Override + public Row map(Row row) throws Exception { + // Put the subtasks with the first attempt to sleep to trigger speculative + // execution + if (getRuntimeContext().getTaskInfo().getAttemptNumber() <= 0) { + Thread.sleep(Integer.MAX_VALUE); + } + + Row output = + Row.of( + row.getField(0), + row.getField(1), + getRuntimeContext().getTaskInfo().getIndexOfThisSubtask(), + getRuntimeContext().getTaskInfo().getAttemptNumber()); + + return output; + } + } + + private static Configuration configure() { + Configuration configuration = new Configuration(); + configuration.set(CoreOptions.CHECK_LEAKED_CLASSLOADER, false); + configuration.set(RestOptions.BIND_PORT, "0"); + configuration.set(JobManagerOptions.SLOT_REQUEST_TIMEOUT, 5000L); + + // Use FLIP-27 source + configuration.set(FlinkConfigOptions.TABLE_EXEC_ICEBERG_USE_FLIP27_SOURCE, true); + + // for speculative execution + configuration.set(BatchExecutionOptions.SPECULATIVE_ENABLED, true); + + configuration.set(SlowTaskDetectorOptions.EXECUTION_TIME_BASELINE_MULTIPLIER, 1.0); + configuration.set(SlowTaskDetectorOptions.EXECUTION_TIME_BASELINE_RATIO, 0.2); + configuration.set( + SlowTaskDetectorOptions.EXECUTION_TIME_BASELINE_LOWER_BOUND, Duration.ofMillis(0)); + configuration.set(BatchExecutionOptions.BLOCK_SLOW_NODE_DURATION, Duration.ofMillis(0)); + + return configuration; + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestMetadataTableReadableMetrics.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestMetadataTableReadableMetrics.java new file mode 100644 index 000000000000..9cf953342a18 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestMetadataTableReadableMetrics.java @@ -0,0 +1,299 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source; + +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + +import java.io.File; +import java.io.IOException; +import java.math.BigDecimal; +import java.nio.ByteBuffer; +import java.nio.file.Path; +import java.util.Base64; +import java.util.List; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.configuration.CoreOptions; +import org.apache.flink.table.api.TableEnvironment; +import org.apache.flink.types.Row; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.Files; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Table; +import org.apache.iceberg.catalog.Namespace; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.data.FileHelpers; +import org.apache.iceberg.data.GenericRecord; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.CatalogTestBase; +import org.apache.iceberg.flink.TestHelpers; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.io.TempDir; + +public class TestMetadataTableReadableMetrics extends CatalogTestBase { + private static final String TABLE_NAME = "test_table"; + + @Parameters(name = "catalogName={0}, baseNamespace={1}") + protected static List parameters() { + List parameters = Lists.newArrayList(); + String catalogName = "testhive"; + Namespace baseNamespace = Namespace.empty(); + parameters.add(new Object[] {catalogName, baseNamespace}); + return parameters; + } + + @Override + protected TableEnvironment getTableEnv() { + Configuration configuration = super.getTableEnv().getConfig().getConfiguration(); + configuration.set(CoreOptions.DEFAULT_PARALLELISM, 1); + return super.getTableEnv(); + } + + private @TempDir Path temp; + + private static final Types.StructType LEAF_STRUCT_TYPE = + Types.StructType.of( + optional(1, "leafLongCol", Types.LongType.get()), + optional(2, "leafDoubleCol", Types.DoubleType.get())); + + private static final Types.StructType NESTED_STRUCT_TYPE = + Types.StructType.of(required(3, "leafStructCol", LEAF_STRUCT_TYPE)); + + private static final Schema NESTED_SCHEMA = + new Schema(required(4, "nestedStructCol", NESTED_STRUCT_TYPE)); + + private static final Schema PRIMITIVE_SCHEMA = + new Schema( + required(1, "booleanCol", Types.BooleanType.get()), + required(2, "intCol", Types.IntegerType.get()), + required(3, "longCol", Types.LongType.get()), + required(4, "floatCol", Types.FloatType.get()), + required(5, "doubleCol", Types.DoubleType.get()), + optional(6, "decimalCol", Types.DecimalType.of(10, 2)), + optional(7, "stringCol", Types.StringType.get()), 
+ optional(8, "fixedCol", Types.FixedType.ofLength(3)), + optional(9, "binaryCol", Types.BinaryType.get())); + + private Table createPrimitiveTable() throws IOException { + Table table = + catalog.createTable( + TableIdentifier.of(DATABASE, TABLE_NAME), + PRIMITIVE_SCHEMA, + PartitionSpec.unpartitioned(), + ImmutableMap.of()); + List records = + Lists.newArrayList( + createPrimitiveRecord( + false, + 1, + 1L, + 0, + 1.0D, + new BigDecimal("1.00"), + "1", + Base64.getDecoder().decode("1111"), + ByteBuffer.wrap(Base64.getDecoder().decode("1111"))), + createPrimitiveRecord( + true, + 2, + 2L, + 0, + 2.0D, + new BigDecimal("2.00"), + "2", + Base64.getDecoder().decode("2222"), + ByteBuffer.wrap(Base64.getDecoder().decode("2222"))), + createPrimitiveRecord(false, 1, 1, Float.NaN, Double.NaN, null, "1", null, null), + createPrimitiveRecord( + false, 2, 2L, Float.NaN, 2.0D, new BigDecimal("2.00"), "2", null, null)); + + File testFile = File.createTempFile("junit", null, temp.toFile()); + DataFile dataFile = FileHelpers.writeDataFile(table, Files.localOutput(testFile), records); + table.newAppend().appendFile(dataFile).commit(); + return table; + } + + private void createNestedTable() throws IOException { + Table table = + validationCatalog.createTable( + TableIdentifier.of(DATABASE, TABLE_NAME), + NESTED_SCHEMA, + PartitionSpec.unpartitioned(), + ImmutableMap.of()); + + List records = + Lists.newArrayList( + createNestedRecord(0L, 0.0), + createNestedRecord(1L, Double.NaN), + createNestedRecord(null, null)); + + File testFile = File.createTempFile("junit", null, temp.toFile()); + DataFile dataFile = FileHelpers.writeDataFile(table, Files.localOutput(testFile), records); + table.newAppend().appendFile(dataFile).commit(); + } + + @BeforeEach + public void before() { + super.before(); + sql("USE CATALOG %s", catalogName); + sql("CREATE DATABASE %s", DATABASE); + sql("USE %s", DATABASE); + } + + @Override + @AfterEach + public void clean() { + sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, TABLE_NAME); + dropDatabase(flinkDatabase, true); + super.clean(); + } + + protected GenericRecord createPrimitiveRecord( + boolean booleanCol, + int intCol, + long longCol, + float floatCol, + double doubleCol, + BigDecimal decimalCol, + String stringCol, + byte[] fixedCol, + ByteBuffer binaryCol) { + GenericRecord record = GenericRecord.create(PRIMITIVE_SCHEMA); + record.set(0, booleanCol); + record.set(1, intCol); + record.set(2, longCol); + record.set(3, floatCol); + record.set(4, doubleCol); + record.set(5, decimalCol); + record.set(6, stringCol); + record.set(7, fixedCol); + record.set(8, binaryCol); + return record; + } + + private GenericRecord createNestedRecord(Long longCol, Double doubleCol) { + GenericRecord record = GenericRecord.create(NESTED_SCHEMA); + GenericRecord nested = GenericRecord.create(NESTED_STRUCT_TYPE); + GenericRecord leaf = GenericRecord.create(LEAF_STRUCT_TYPE); + leaf.set(0, longCol); + leaf.set(1, doubleCol); + nested.set(0, leaf); + record.set(0, nested); + return record; + } + + protected Object[] row(Object... 
values) { + return values; + } + + @TestTemplate + public void testPrimitiveColumns() throws Exception { + createPrimitiveTable(); + List result = sql("SELECT readable_metrics FROM %s$files", TABLE_NAME); + + Row binaryCol = + Row.of( + 52L, + 4L, + 2L, + null, + Base64.getDecoder().decode("1111"), + Base64.getDecoder().decode("2222")); + Row booleanCol = Row.of(32L, 4L, 0L, null, false, true); + Row decimalCol = Row.of(85L, 4L, 1L, null, new BigDecimal("1.00"), new BigDecimal("2.00")); + Row doubleCol = Row.of(85L, 4L, 0L, 1L, 1.0D, 2.0D); + Row fixedCol = + Row.of( + 44L, + 4L, + 2L, + null, + Base64.getDecoder().decode("1111"), + Base64.getDecoder().decode("2222")); + Row floatCol = Row.of(71L, 4L, 0L, 2L, 0f, 0f); + Row intCol = Row.of(71L, 4L, 0L, null, 1, 2); + Row longCol = Row.of(79L, 4L, 0L, null, 1L, 2L); + Row stringCol = Row.of(79L, 4L, 0L, null, "1", "2"); + + List expected = + Lists.newArrayList( + Row.of( + Row.of( + binaryCol, + booleanCol, + decimalCol, + doubleCol, + fixedCol, + floatCol, + intCol, + longCol, + stringCol))); + TestHelpers.assertRows(result, expected); + } + + @TestTemplate + public void testSelectPrimitiveValues() throws Exception { + createPrimitiveTable(); + + TestHelpers.assertRows( + sql( + "SELECT readable_metrics.intCol.lower_bound, readable_metrics.booleanCol.upper_bound FROM %s$files", + TABLE_NAME), + ImmutableList.of(Row.of(1, true))); + + TestHelpers.assertRows( + sql("SELECT content, readable_metrics.longCol.value_count FROM %s$files", TABLE_NAME), + ImmutableList.of(Row.of(0, 4L))); + + TestHelpers.assertRows( + sql("SELECT readable_metrics.longCol.value_count, content FROM %s$files", TABLE_NAME), + ImmutableList.of(Row.of(4L, 0))); + } + + @TestTemplate + public void testSelectNestedValues() throws Exception { + createNestedTable(); + TestHelpers.assertRows( + sql( + "SELECT readable_metrics.`nestedStructCol.leafStructCol.leafLongCol`.lower_bound, " + + "readable_metrics.`nestedStructCol.leafStructCol.leafDoubleCol`.value_count FROM %s$files", + TABLE_NAME), + ImmutableList.of(Row.of(0L, 3L))); + } + + @TestTemplate + public void testNestedValues() throws Exception { + createNestedTable(); + + Row leafDoubleCol = Row.of(46L, 3L, 1L, 1L, 0.0D, 0.0D); + Row leafLongCol = Row.of(54L, 3L, 1L, null, 0L, 1L); + Row metrics = Row.of(Row.of(leafDoubleCol, leafLongCol)); + + TestHelpers.assertRows( + sql("SELECT readable_metrics FROM %s$files", TABLE_NAME), ImmutableList.of(metrics)); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestProjectMetaColumn.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestProjectMetaColumn.java new file mode 100644 index 000000000000..ce9054ad49b6 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestProjectMetaColumn.java @@ -0,0 +1,188 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.List; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.DeleteFile; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Parameter; +import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.RowDelta; +import org.apache.iceberg.SerializableTable; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.flink.SimpleDataUtil; +import org.apache.iceberg.flink.TableLoader; +import org.apache.iceberg.flink.TestHelpers; +import org.apache.iceberg.flink.data.RowDataProjection; +import org.apache.iceberg.flink.sink.RowDataTaskWriterFactory; +import org.apache.iceberg.flink.sink.TaskWriterFactory; +import org.apache.iceberg.io.TaskWriter; +import org.apache.iceberg.io.WriteResult; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.extension.ExtendWith; +import org.junit.jupiter.api.io.TempDir; + +@ExtendWith(ParameterizedTestExtension.class) +public class TestProjectMetaColumn { + + @TempDir protected Path temporaryFolder; + + @Parameter(index = 0) + private FileFormat format; + + @Parameters(name = "fileFormat={0}") + public static Iterable parameters() { + return Lists.newArrayList( + new Object[] {FileFormat.PARQUET}, + new Object[] {FileFormat.ORC}, + new Object[] {FileFormat.AVRO}); + } + + private void testSkipToRemoveMetaColumn(int formatVersion) throws IOException { + // Create the table with given format version. + String location = Files.createTempDirectory(temporaryFolder, "junit").toFile().toString(); + Table table = + SimpleDataUtil.createTable( + location, + ImmutableMap.of(TableProperties.FORMAT_VERSION, String.valueOf(formatVersion)), + false); + + List rows = + Lists.newArrayList( + SimpleDataUtil.createInsert(1, "AAA"), + SimpleDataUtil.createInsert(2, "BBB"), + SimpleDataUtil.createInsert(3, "CCC")); + writeAndCommit(table, ImmutableList.of(), false, rows); + + FlinkInputFormat input = + FlinkSource.forRowData().tableLoader(TableLoader.fromHadoopTable(location)).buildFormat(); + + List results = Lists.newArrayList(); + TestHelpers.readRowData( + input, + rowData -> { + // If project to remove the meta columns, it will get a RowDataProjection. + assertThat(rowData).isInstanceOf(GenericRowData.class); + results.add(TestHelpers.copyRowData(rowData, SimpleDataUtil.ROW_TYPE)); + }); + + // Assert the results. 
+ TestHelpers.assertRows(rows, results, SimpleDataUtil.ROW_TYPE); + } + + @TestTemplate + public void testV1SkipToRemoveMetaColumn() throws IOException { + testSkipToRemoveMetaColumn(1); + } + + @TestTemplate + public void testV2SkipToRemoveMetaColumn() throws IOException { + testSkipToRemoveMetaColumn(2); + } + + @TestTemplate + public void testV2RemoveMetaColumn() throws Exception { + // Create the v2 table. + String location = Files.createTempDirectory(temporaryFolder, "junit").toFile().toString(); + Table table = + SimpleDataUtil.createTable( + location, ImmutableMap.of(TableProperties.FORMAT_VERSION, "2"), false); + + List rows = + Lists.newArrayList( + SimpleDataUtil.createInsert(1, "AAA"), + SimpleDataUtil.createDelete(1, "AAA"), + SimpleDataUtil.createInsert(2, "AAA"), + SimpleDataUtil.createInsert(2, "BBB")); + int eqFieldId = table.schema().findField("data").fieldId(); + writeAndCommit(table, ImmutableList.of(eqFieldId), true, rows); + + FlinkInputFormat input = + FlinkSource.forRowData().tableLoader(TableLoader.fromHadoopTable(location)).buildFormat(); + + List results = Lists.newArrayList(); + TestHelpers.readRowData( + input, + rowData -> { + // If project to remove the meta columns, it will get a RowDataProjection. + assertThat(rowData).isInstanceOf(RowDataProjection.class); + results.add(TestHelpers.copyRowData(rowData, SimpleDataUtil.ROW_TYPE)); + }); + + // Assert the results. + TestHelpers.assertRows( + ImmutableList.of( + SimpleDataUtil.createInsert(2, "AAA"), SimpleDataUtil.createInsert(2, "BBB")), + results, + SimpleDataUtil.ROW_TYPE); + } + + private void writeAndCommit( + Table table, List eqFieldIds, boolean upsert, List rows) + throws IOException { + TaskWriter writer = createTaskWriter(table, eqFieldIds, upsert); + try (TaskWriter io = writer) { + for (RowData row : rows) { + io.write(row); + } + } + + RowDelta delta = table.newRowDelta(); + WriteResult result = writer.complete(); + + for (DataFile dataFile : result.dataFiles()) { + delta.addRows(dataFile); + } + + for (DeleteFile deleteFile : result.deleteFiles()) { + delta.addDeletes(deleteFile); + } + + delta.commit(); + } + + private TaskWriter createTaskWriter( + Table table, List equalityFieldIds, boolean upsert) { + TaskWriterFactory taskWriterFactory = + new RowDataTaskWriterFactory( + SerializableTable.copyOf(table), + SimpleDataUtil.ROW_TYPE, + TableProperties.WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT, + format, + table.properties(), + equalityFieldIds, + upsert); + + taskWriterFactory.initialize(1, 1); + return taskWriterFactory.create(); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestRowDataToAvroGenericRecordConverter.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestRowDataToAvroGenericRecordConverter.java new file mode 100644 index 000000000000..6ef40693827e --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestRowDataToAvroGenericRecordConverter.java @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source; + +import static org.assertj.core.api.Assertions.assertThat; + +import org.apache.avro.generic.GenericRecord; +import org.apache.iceberg.flink.AvroGenericRecordConverterBase; +import org.apache.iceberg.flink.DataGenerator; + +public class TestRowDataToAvroGenericRecordConverter extends AvroGenericRecordConverterBase { + @Override + protected void testConverter(DataGenerator dataGenerator) { + RowDataToAvroGenericRecordConverter converter = + RowDataToAvroGenericRecordConverter.fromAvroSchema(dataGenerator.avroSchema()); + GenericRecord expected = dataGenerator.generateAvroGenericRecord(); + GenericRecord actual = converter.apply(dataGenerator.generateFlinkRowData()); + assertThat(actual).isEqualTo(expected); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestScanContext.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestScanContext.java new file mode 100644 index 000000000000..5dd7de545e11 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestScanContext.java @@ -0,0 +1,112 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source; + +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import org.junit.jupiter.api.Test; + +class TestScanContext { + @Test + void testIncrementalFromSnapshotId() { + ScanContext context = + ScanContext.builder() + .streaming(true) + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) + .build(); + assertException( + context, "Invalid starting snapshot id for SPECIFIC_START_SNAPSHOT_ID strategy: null"); + + context = + ScanContext.builder() + .streaming(true) + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) + .startSnapshotId(1L) + .startSnapshotTimestamp(1L) + .build(); + assertException( + context, + "Invalid starting snapshot timestamp for SPECIFIC_START_SNAPSHOT_ID strategy: not null"); + } + + @Test + void testIncrementalFromSnapshotTimestamp() { + ScanContext context = + ScanContext.builder() + .streaming(true) + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) + .build(); + assertException( + context, + "Invalid starting snapshot timestamp for SPECIFIC_START_SNAPSHOT_TIMESTAMP strategy: null"); + + context = + ScanContext.builder() + .streaming(true) + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) + .startSnapshotId(1L) + .startSnapshotTimestamp(1L) + .build(); + assertException( + context, "Invalid starting snapshot id for SPECIFIC_START_SNAPSHOT_ID strategy: not null"); + } + + @Test + void testStreaming() { + ScanContext context = ScanContext.builder().streaming(true).useTag("tag").build(); + assertException(context, "Cannot scan table using ref tag configured for streaming reader"); + + context = ScanContext.builder().streaming(true).useSnapshotId(1L).build(); + assertException(context, "Cannot set snapshot-id option for streaming reader"); + + context = ScanContext.builder().streaming(true).asOfTimestamp(1L).build(); + assertException(context, "Cannot set as-of-timestamp option for streaming reader"); + + context = ScanContext.builder().streaming(true).endSnapshotId(1L).build(); + assertException(context, "Cannot set end-snapshot-id option for streaming reader"); + + context = ScanContext.builder().streaming(true).endTag("tag").build(); + assertException(context, "Cannot set end-tag option for streaming reader"); + } + + @Test + void testStartConflict() { + ScanContext context = ScanContext.builder().startTag("tag").startSnapshotId(1L).build(); + assertException(context, "START_SNAPSHOT_ID and START_TAG cannot both be set."); + } + + @Test + void testEndConflict() { + ScanContext context = ScanContext.builder().endTag("tag").endSnapshotId(1L).build(); + assertException(context, "END_SNAPSHOT_ID and END_TAG cannot both be set."); + } + + @Test + void testMaxAllowedPlanningFailures() { + ScanContext context = ScanContext.builder().maxAllowedPlanningFailures(-2).build(); + assertException( + context, "Cannot set maxAllowedPlanningFailures to a negative number other than -1."); + } + + private void assertException(ScanContext context, String message) { + assertThatThrownBy(() -> context.validate()) + .hasMessage(message) + .isInstanceOf(IllegalArgumentException.class); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestSourceUtil.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestSourceUtil.java new file mode 100644 index 000000000000..b701419a7499 --- /dev/null +++ 
b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestSourceUtil.java
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.flink.source;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+import java.io.IOException;
+import org.apache.flink.configuration.Configuration;
+import org.apache.iceberg.flink.FlinkConfigOptions;
+import org.junit.jupiter.api.Test;
+
+public class TestSourceUtil {
+  @Test
+  public void testInferredParallelism() throws IOException {
+    Configuration configuration = new Configuration();
+    // Empty table, inferred parallelism should be at least 1
+    int parallelism = SourceUtil.inferParallelism(configuration, -1L, () -> 0);
+    assertThat(parallelism).isEqualTo(1);
+
+    // 2 splits, max infer parallelism is the default 100 (greater than the split count), so the
+    // parallelism is the split count: 2
+    parallelism = SourceUtil.inferParallelism(configuration, -1L, () -> 2);
+    assertThat(parallelism).isEqualTo(2);
+
+    // 2 splits and limit is 1, max infer parallelism is the default 100, which is greater than
+    // both the split count and the limit, so the parallelism is the limit value: 1
+    parallelism = SourceUtil.inferParallelism(configuration, 1, () -> 2);
+    assertThat(parallelism).isEqualTo(1);
+
+    // 2 splits and max infer parallelism is 1 (less than the split count), so the parallelism is 1
+    configuration.setInteger(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM_MAX, 1);
+    parallelism = SourceUtil.inferParallelism(configuration, -1L, () -> 2);
+    assertThat(parallelism).isEqualTo(1);
+
+    // 2 splits, max infer parallelism is 1, limit is 3, so the parallelism is the max infer
+    // parallelism: 1
+    parallelism = SourceUtil.inferParallelism(configuration, 3, () -> 2);
+    assertThat(parallelism).isEqualTo(1);
+
+    // 2 splits, infer parallelism is disabled, so the parallelism is the Flink default parallelism 1
+    configuration.setBoolean(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM, false);
+    parallelism = SourceUtil.inferParallelism(configuration, 3, () -> 2);
+    assertThat(parallelism).isEqualTo(1);
+  }
+}
diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestSqlBase.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestSqlBase.java
new file mode 100644
index 000000000000..f9b776397cfc
--- /dev/null
+++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestSqlBase.java
@@ -0,0 +1,160 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source; + +import static org.apache.iceberg.flink.TestFixtures.DATABASE; +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.IOException; +import java.nio.file.Path; +import java.util.List; +import java.util.Map; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.table.api.EnvironmentSettings; +import org.apache.flink.table.api.TableEnvironment; +import org.apache.flink.test.junit5.MiniClusterExtension; +import org.apache.flink.types.Row; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Table; +import org.apache.iceberg.TestHelpers; +import org.apache.iceberg.data.GenericAppenderHelper; +import org.apache.iceberg.data.RandomGenericData; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.FlinkConfigOptions; +import org.apache.iceberg.flink.HadoopCatalogExtension; +import org.apache.iceberg.flink.MiniFlinkClusterExtension; +import org.apache.iceberg.flink.TableLoader; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.RegisterExtension; +import org.junit.jupiter.api.io.TempDir; + +/** Test other more advanced usage of SQL. They don't need to run for every file format. 
*/ +public abstract class TestSqlBase { + @RegisterExtension + public static MiniClusterExtension miniClusterExtension = + MiniFlinkClusterExtension.createWithClassloaderCheckDisabled(); + + @RegisterExtension + public static final HadoopCatalogExtension CATALOG_EXTENSION = + new HadoopCatalogExtension(DATABASE, TestFixtures.TABLE); + + @TempDir protected Path temporaryFolder; + + private volatile TableEnvironment tEnv; + + protected TableEnvironment getTableEnv() { + if (tEnv == null) { + synchronized (this) { + if (tEnv == null) { + this.tEnv = + TableEnvironment.create(EnvironmentSettings.newInstance().inBatchMode().build()); + } + } + } + return tEnv; + } + + @BeforeEach + public abstract void before() throws IOException; + + @Test + public void testResiduals() throws Exception { + Table table = + CATALOG_EXTENSION + .catalog() + .createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA, TestFixtures.SPEC); + + List writeRecords = RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L); + writeRecords.get(0).set(1, 123L); + writeRecords.get(0).set(2, "2020-03-20"); + writeRecords.get(1).set(1, 456L); + writeRecords.get(1).set(2, "2020-03-20"); + + GenericAppenderHelper helper = + new GenericAppenderHelper(table, FileFormat.PARQUET, temporaryFolder); + + List expectedRecords = Lists.newArrayList(); + expectedRecords.add(writeRecords.get(0)); + + DataFile dataFile1 = helper.writeFile(TestHelpers.Row.of("2020-03-20", 0), writeRecords); + DataFile dataFile2 = + helper.writeFile( + TestHelpers.Row.of("2020-03-21", 0), + RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L)); + helper.appendToTable(dataFile1, dataFile2); + + org.apache.iceberg.flink.TestHelpers.assertRecords( + run(Maps.newHashMap(), "where dt='2020-03-20' and id=123", "*"), + expectedRecords, + TestFixtures.SCHEMA); + } + + @Test + public void testExposeLocality() throws Exception { + Table table = + CATALOG_EXTENSION + .catalog() + .createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA, TestFixtures.SPEC); + + TableLoader tableLoader = TableLoader.fromHadoopTable(table.location()); + List expectedRecords = RandomGenericData.generate(TestFixtures.SCHEMA, 10, 0L); + expectedRecords.forEach(expectedRecord -> expectedRecord.set(2, "2020-03-20")); + + GenericAppenderHelper helper = + new GenericAppenderHelper(table, FileFormat.PARQUET, temporaryFolder); + DataFile dataFile = helper.writeFile(TestHelpers.Row.of("2020-03-20", 0), expectedRecords); + helper.appendToTable(dataFile); + + // test sql api + Configuration tableConf = getTableEnv().getConfig().getConfiguration(); + tableConf.setBoolean( + FlinkConfigOptions.TABLE_EXEC_ICEBERG_EXPOSE_SPLIT_LOCALITY_INFO.key(), false); + + List results = SqlHelpers.sql(getTableEnv(), "select * from t"); + org.apache.iceberg.flink.TestHelpers.assertRecords( + results, expectedRecords, TestFixtures.SCHEMA); + + // test table api + tableConf.setBoolean( + FlinkConfigOptions.TABLE_EXEC_ICEBERG_EXPOSE_SPLIT_LOCALITY_INFO.key(), true); + FlinkSource.Builder builder = FlinkSource.forRowData().tableLoader(tableLoader).table(table); + + // When running with CI or local, `localityEnabled` will be false even if this configuration is + // enabled + assertThat(SourceUtil.isLocalityEnabled(table, tableConf, true)) + .as("Expose split locality info should be false.") + .isFalse(); + + results = run(Maps.newHashMap(), "where dt='2020-03-20'", "*"); + org.apache.iceberg.flink.TestHelpers.assertRecords( + results, expectedRecords, TestFixtures.SCHEMA); + } + + protected List run( + Map 
options, String sqlFilter, String... sqlSelectedFields) { + String select = String.join(",", sqlSelectedFields); + String optionStr = SqlHelpers.sqlOptionsToString(options); + return SqlHelpers.sql(getTableEnv(), "select %s from t %s %s", select, optionStr, sqlFilter); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamScanSql.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamScanSql.java new file mode 100644 index 000000000000..97ed4ca1e93f --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamScanSql.java @@ -0,0 +1,434 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.io.IOException; +import java.util.Iterator; +import java.util.List; +import org.apache.flink.core.execution.JobClient; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.EnvironmentSettings; +import org.apache.flink.table.api.TableEnvironment; +import org.apache.flink.table.api.TableResult; +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; +import org.apache.flink.table.api.config.TableConfigOptions; +import org.apache.flink.types.Row; +import org.apache.flink.util.CloseableIterator; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.SnapshotRef; +import org.apache.iceberg.Table; +import org.apache.iceberg.TestHelpers; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.data.GenericAppenderHelper; +import org.apache.iceberg.data.GenericRecord; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.CatalogTestBase; +import org.apache.iceberg.flink.MiniFlinkClusterExtension; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.Timeout; + +@Timeout(60) +public class TestStreamScanSql extends CatalogTestBase { + private static final String TABLE = "test_table"; + private static final FileFormat FORMAT = FileFormat.PARQUET; + + private volatile TableEnvironment tEnv; + + @Override + protected TableEnvironment getTableEnv() { + TableEnvironment tableEnv = tEnv; + if (tableEnv != null) { + return tableEnv; + } + synchronized (this) { + if (tEnv == null) { + EnvironmentSettings.Builder settingsBuilder = + EnvironmentSettings.newInstance().inStreamingMode(); + + StreamExecutionEnvironment env 
= + StreamExecutionEnvironment.getExecutionEnvironment( + MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG); + env.enableCheckpointing(400); + + StreamTableEnvironment streamTableEnv = + StreamTableEnvironment.create(env, settingsBuilder.build()); + streamTableEnv + .getConfig() + .getConfiguration() + .set(TableConfigOptions.TABLE_DYNAMIC_TABLE_OPTIONS_ENABLED, true); + tEnv = streamTableEnv; + } + } + return tEnv; + } + + @Override + @BeforeEach + public void before() { + super.before(); + sql("CREATE DATABASE %s", flinkDatabase); + sql("USE CATALOG %s", catalogName); + sql("USE %s", DATABASE); + } + + @Override + @AfterEach + public void clean() { + sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, TABLE); + dropDatabase(flinkDatabase, true); + super.clean(); + } + + private void insertRows(String partition, Table table, Row... rows) throws IOException { + insertRows(partition, SnapshotRef.MAIN_BRANCH, table, rows); + } + + private void insertRows(String partition, String branch, Table table, Row... rows) + throws IOException { + GenericAppenderHelper appender = new GenericAppenderHelper(table, FORMAT, temporaryDirectory); + + GenericRecord gRecord = GenericRecord.create(table.schema()); + List records = Lists.newArrayList(); + for (Row row : rows) { + records.add( + gRecord.copy( + "id", row.getField(0), + "data", row.getField(1), + "dt", row.getField(2))); + } + + if (partition != null) { + appender.appendToTable(TestHelpers.Row.of(partition, 0), branch, records); + } else { + appender.appendToTable(branch, records); + } + } + + private void insertRowsInBranch(String branch, Table table, Row... rows) throws IOException { + insertRows(null, branch, table, rows); + } + + private void insertRows(Table table, Row... rows) throws IOException { + insertRows(null, table, rows); + } + + private void assertRows(List expectedRows, Iterator iterator) { + for (Row expectedRow : expectedRows) { + assertThat(iterator).hasNext(); + Row actualRow = iterator.next(); + assertThat(actualRow.getArity()).isEqualTo(3); + assertThat(actualRow.getField(0)).isEqualTo(expectedRow.getField(0)); + assertThat(actualRow.getField(1)).isEqualTo(expectedRow.getField(1)); + assertThat(actualRow.getField(2)).isEqualTo(expectedRow.getField(2)); + } + } + + @TestTemplate + public void testUnPartitionedTable() throws Exception { + sql("CREATE TABLE %s (id INT, data VARCHAR, dt VARCHAR)", TABLE); + Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE)); + + TableResult result = + exec("SELECT * FROM %s /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s')*/", TABLE); + try (CloseableIterator iterator = result.collect()) { + + Row row1 = Row.of(1, "aaa", "2021-01-01"); + insertRows(table, row1); + assertRows(ImmutableList.of(row1), iterator); + + Row row2 = Row.of(2, "bbb", "2021-01-01"); + insertRows(table, row2); + assertRows(ImmutableList.of(row2), iterator); + } + result.getJobClient().ifPresent(JobClient::cancel); + } + + @TestTemplate + public void testPartitionedTable() throws Exception { + sql("CREATE TABLE %s (id INT, data VARCHAR, dt VARCHAR) PARTITIONED BY (dt)", TABLE); + Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE)); + + TableResult result = + exec("SELECT * FROM %s /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s')*/", TABLE); + try (CloseableIterator iterator = result.collect()) { + Row row1 = Row.of(1, "aaa", "2021-01-01"); + insertRows("2021-01-01", table, row1); + assertRows(ImmutableList.of(row1), iterator); + + Row 
row2 = Row.of(2, "bbb", "2021-01-02");
+      insertRows("2021-01-02", table, row2);
+      assertRows(ImmutableList.of(row2), iterator);
+
+      Row row3 = Row.of(1, "aaa", "2021-01-02");
+      insertRows("2021-01-02", table, row3);
+      assertRows(ImmutableList.of(row3), iterator);
+
+      Row row4 = Row.of(2, "bbb", "2021-01-01");
+      insertRows("2021-01-01", table, row4);
+      assertRows(ImmutableList.of(row4), iterator);
+    }
+    result.getJobClient().ifPresent(JobClient::cancel);
+  }
+
+  @TestTemplate
+  public void testConsumeFromBeginning() throws Exception {
+    sql("CREATE TABLE %s (id INT, data VARCHAR, dt VARCHAR)", TABLE);
+    Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE));
+
+    Row row1 = Row.of(1, "aaa", "2021-01-01");
+    Row row2 = Row.of(2, "bbb", "2021-01-01");
+    insertRows(table, row1, row2);
+
+    TableResult result =
+        exec("SELECT * FROM %s /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s')*/", TABLE);
+    try (CloseableIterator<Row> iterator = result.collect()) {
+      assertRows(ImmutableList.of(row1, row2), iterator);
+
+      Row row3 = Row.of(3, "ccc", "2021-01-01");
+      insertRows(table, row3);
+      assertRows(ImmutableList.of(row3), iterator);
+
+      Row row4 = Row.of(4, "ddd", "2021-01-01");
+      insertRows(table, row4);
+      assertRows(ImmutableList.of(row4), iterator);
+    }
+    result.getJobClient().ifPresent(JobClient::cancel);
+  }
+
+  @TestTemplate
+  /**
+   * Insert records on the main branch. Then insert records on a named branch. Read from the main
+   * branch and assert that only the records from main are returned.
+   */
+  public void testConsumeFilesFromMainBranch() throws Exception {
+    sql("CREATE TABLE %s (id INT, data VARCHAR, dt VARCHAR)", TABLE);
+    Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE));
+
+    // Produce two snapshots on main branch
+    Row row1 = Row.of(1, "aaa", "2021-01-01");
+    Row row2 = Row.of(2, "bbb", "2021-01-01");
+
+    insertRows(table, row1, row2);
+    String branchName = "b1";
+    table.manageSnapshots().createBranch(branchName).commit();
+
+    // insert on the 'b1' branch
+    Row row3 = Row.of(3, "ccc", "2021-01-01");
+    Row row4 = Row.of(4, "ddd", "2021-01-01");
+
+    insertRowsInBranch(branchName, table, row3, row4);
+
+    // read from main
+    TableResult result =
+        exec("SELECT * FROM %s /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s')*/", TABLE);
+
+    try (CloseableIterator<Row> iterator = result.collect()) {
+      // only the rows committed to the main branch are read
+      assertRows(ImmutableList.of(row1, row2), iterator);
+
+      Row row5 = Row.of(5, "eee", "2021-01-01");
+      Row row6 = Row.of(6, "fff", "2021-01-01");
+      insertRows(table, row5, row6);
+      assertRows(ImmutableList.of(row5, row6), iterator);
+
+      Row row7 = Row.of(7, "ggg", "2021-01-01");
+      insertRows(table, row7);
+      assertRows(ImmutableList.of(row7), iterator);
+    }
+    result.getJobClient().ifPresent(JobClient::cancel);
+  }
+
+  @TestTemplate
+  /**
+   * Insert records on the main branch. Create a named branch. Insert records on the named branch.
+   * Then select from the named branch and assert that all the records are returned.
+   */
+  public void testConsumeFilesFromBranch() throws Exception {
+    sql("CREATE TABLE %s (id INT, data VARCHAR, dt VARCHAR)", TABLE);
+    Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE));
+
+    // Produce two snapshots on main branch
+    Row row1 = Row.of(1, "aaa", "2021-01-01");
+    Row row2 = Row.of(2, "bbb", "2021-01-01");
+
+    insertRows(table, row1, row2);
+    String branchName = "b1";
+    table.manageSnapshots().createBranch(branchName).commit();
+
+    TableResult result =
+        exec(
+            "SELECT * FROM %s /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s', 'branch'='%s')*/ ",
+            TABLE, branchName);
+
+    try (CloseableIterator<Row> iterator = result.collect()) {
+      assertRows(ImmutableList.of(row1, row2), iterator);
+      // insert on the 'b1' branch
+      Row row3 = Row.of(3, "ccc", "2021-01-01");
+      Row row4 = Row.of(4, "ddd", "2021-01-01");
+      insertRowsInBranch(branchName, table, row3, row4);
+      assertRows(ImmutableList.of(row3, row4), iterator);
+    }
+    result.getJobClient().ifPresent(JobClient::cancel);
+  }
+
+  @TestTemplate
+  /**
+   * Insert records on branch b1. Then insert records on branch b2. Then select from each branch
+   * and assert that the correct records are returned.
+   */
+  public void testConsumeFilesFromTwoBranches() throws Exception {
+    sql("CREATE TABLE %s (id INT, data VARCHAR, dt VARCHAR)", TABLE);
+    Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE));
+
+    String branch1 = "b1";
+    String branch2 = "b2";
+    table.manageSnapshots().createBranch(branch1).commit();
+    table.manageSnapshots().createBranch(branch2).commit();
+
+    // Rows to write to branch b1 and branch b2
+    Row row1Branch1 = Row.of(1, "b1", "2021-01-01");
+    Row row2Branch1 = Row.of(2, "b1", "2021-01-01");
+
+    Row row1Branch2 = Row.of(2, "b2", "2021-01-01");
+    Row row2Branch2 = Row.of(3, "b3", "2021-01-01");
+
+    insertRowsInBranch(branch1, table, row1Branch1, row2Branch1);
+    insertRowsInBranch(branch2, table, row1Branch2, row2Branch2);
+
+    TableResult resultBranch1 =
+        exec(
+            "SELECT * FROM %s /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s', 'branch'='%s')*/ ",
+            TABLE, branch1);
+
+    try (CloseableIterator<Row> iterator = resultBranch1.collect()) {
+      assertRows(ImmutableList.of(row1Branch1, row2Branch1), iterator);
+      Row another = Row.of(4, "ccc", "2021-01-01");
+      insertRowsInBranch(branch1, table, another);
+      assertRows(ImmutableList.of(another), iterator);
+    }
+
+    TableResult resultBranch2 =
+        exec(
            "SELECT * FROM %s /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s', 'branch'='%s')*/ ",
+            TABLE, branch2);
+    try (CloseableIterator<Row> iterator = resultBranch2.collect()) {
+      assertRows(ImmutableList.of(row1Branch2, row2Branch2), iterator);
+      Row another = Row.of(4, "ccc", "2021-01-01");
+      insertRowsInBranch(branch2, table, another);
+      assertRows(ImmutableList.of(another), iterator);
+    }
+
+    resultBranch1.getJobClient().ifPresent(JobClient::cancel);
+    resultBranch2.getJobClient().ifPresent(JobClient::cancel);
+  }
+
+  @TestTemplate
+  public void testConsumeFromStartSnapshotId() throws Exception {
+    sql("CREATE TABLE %s (id INT, data VARCHAR, dt VARCHAR)", TABLE);
+    Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE));
+
+    // Produce two snapshots.
+ Row row1 = Row.of(1, "aaa", "2021-01-01"); + Row row2 = Row.of(2, "bbb", "2021-01-01"); + insertRows(table, row1); + insertRows(table, row2); + + long startSnapshotId = table.currentSnapshot().snapshotId(); + + Row row3 = Row.of(3, "ccc", "2021-01-01"); + Row row4 = Row.of(4, "ddd", "2021-01-01"); + insertRows(table, row3, row4); + + TableResult result = + exec( + "SELECT * FROM %s /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s', " + + "'start-snapshot-id'='%d')*/", + TABLE, startSnapshotId); + try (CloseableIterator iterator = result.collect()) { + // the start snapshot(row2) is exclusive. + assertRows(ImmutableList.of(row3, row4), iterator); + + Row row5 = Row.of(5, "eee", "2021-01-01"); + Row row6 = Row.of(6, "fff", "2021-01-01"); + insertRows(table, row5, row6); + assertRows(ImmutableList.of(row5, row6), iterator); + + Row row7 = Row.of(7, "ggg", "2021-01-01"); + insertRows(table, row7); + assertRows(ImmutableList.of(row7), iterator); + } + result.getJobClient().ifPresent(JobClient::cancel); + } + + @TestTemplate + public void testConsumeFromStartTag() throws Exception { + sql("CREATE TABLE %s (id INT, data VARCHAR, dt VARCHAR)", TABLE); + Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE)); + + // Produce two snapshots. + Row row1 = Row.of(1, "aaa", "2021-01-01"); + Row row2 = Row.of(2, "bbb", "2021-01-01"); + insertRows(table, row1); + insertRows(table, row2); + + String tagName = "t1"; + long startSnapshotId = table.currentSnapshot().snapshotId(); + table.manageSnapshots().createTag(tagName, startSnapshotId).commit(); + + Row row3 = Row.of(3, "ccc", "2021-01-01"); + Row row4 = Row.of(4, "ddd", "2021-01-01"); + insertRows(table, row3, row4); + + TableResult result = + exec( + "SELECT * FROM %s /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s', " + + "'start-tag'='%s')*/", + TABLE, tagName); + try (CloseableIterator iterator = result.collect()) { + // the start snapshot(row2) is exclusive. + assertRows(ImmutableList.of(row3, row4), iterator); + + Row row5 = Row.of(5, "eee", "2021-01-01"); + Row row6 = Row.of(6, "fff", "2021-01-01"); + insertRows(table, row5, row6); + assertRows(ImmutableList.of(row5, row6), iterator); + + Row row7 = Row.of(7, "ggg", "2021-01-01"); + insertRows(table, row7); + assertRows(ImmutableList.of(row7), iterator); + } + result.getJobClient().ifPresent(JobClient::cancel); + + assertThatThrownBy( + () -> + exec( + "SELECT * FROM %s /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s', 'start-tag'='%s', " + + "'start-snapshot-id'='%d' )*/", + TABLE, tagName, startSnapshotId)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("START_SNAPSHOT_ID and START_TAG cannot both be set."); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingMonitorFunction.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingMonitorFunction.java new file mode 100644 index 000000000000..9c4f476b02b4 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingMonitorFunction.java @@ -0,0 +1,402 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.time.Duration; +import java.util.Arrays; +import java.util.List; +import java.util.concurrent.CountDownLatch; +import org.apache.flink.runtime.checkpoint.OperatorSubtaskState; +import org.apache.flink.streaming.api.functions.source.SourceFunction; +import org.apache.flink.streaming.api.operators.StreamSource; +import org.apache.flink.streaming.api.watermark.Watermark; +import org.apache.flink.streaming.util.AbstractStreamOperatorTestHarness; +import org.apache.flink.table.data.RowData; +import org.apache.flink.types.Row; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.TestBase; +import org.apache.iceberg.data.GenericAppenderHelper; +import org.apache.iceberg.data.RandomGenericData; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.TestHelpers; +import org.apache.iceberg.flink.TestTableLoader; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.Iterables; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.util.SnapshotUtil; +import org.apache.iceberg.util.ThreadPools; +import org.awaitility.Awaitility; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.extension.ExtendWith; + +@ExtendWith(ParameterizedTestExtension.class) +public class TestStreamingMonitorFunction extends TestBase { + + private static final Schema SCHEMA = + new Schema( + Types.NestedField.required(1, "id", Types.IntegerType.get()), + Types.NestedField.required(2, "data", Types.StringType.get())); + private static final FileFormat DEFAULT_FORMAT = FileFormat.PARQUET; + private static final long WAIT_TIME_MILLIS = 10 * 1000L; + + @Parameters(name = "formatVersion = {0}") + protected static List parameters() { + return Arrays.asList(1, 2); + } + + @BeforeEach + @Override + public void setupTable() throws IOException { + this.tableDir = Files.createTempDirectory(temp, "junit").toFile(); + this.metadataDir = new File(tableDir, "metadata"); + assertThat(tableDir.delete()).isTrue(); + + // Construct the iceberg table. 
+ table = create(SCHEMA, PartitionSpec.unpartitioned()); + } + + private void runSourceFunctionInTask( + TestSourceContext sourceContext, StreamingMonitorFunction function) { + Thread task = + new Thread( + () -> { + try { + function.run(sourceContext); + } catch (Exception e) { + throw new RuntimeException(e); + } + }); + task.start(); + } + + @TestTemplate + public void testConsumeWithoutStartSnapshotId() throws Exception { + List> recordsList = generateRecordsAndCommitTxn(10); + ScanContext scanContext = ScanContext.builder().monitorInterval(Duration.ofMillis(100)).build(); + + StreamingMonitorFunction function = createFunction(scanContext); + try (AbstractStreamOperatorTestHarness harness = createHarness(function)) { + harness.setup(); + harness.open(); + + TestSourceContext sourceContext = new TestSourceContext(new CountDownLatch(1)); + runSourceFunctionInTask(sourceContext, function); + + awaitExpectedSplits(sourceContext); + + // Stop the stream task. + function.close(); + + TestHelpers.assertRecords( + sourceContext.toRows(), Lists.newArrayList(Iterables.concat(recordsList)), SCHEMA); + } + } + + @TestTemplate + public void testConsumeFromStartSnapshotId() throws Exception { + // Commit the first five transactions. + generateRecordsAndCommitTxn(5); + long startSnapshotId = table.currentSnapshot().snapshotId(); + + // Commit the next five transactions. + List> recordsList = generateRecordsAndCommitTxn(5); + + ScanContext scanContext = + ScanContext.builder() + .monitorInterval(Duration.ofMillis(100)) + .startSnapshotId(startSnapshotId) + .build(); + + StreamingMonitorFunction function = createFunction(scanContext); + try (AbstractStreamOperatorTestHarness harness = createHarness(function)) { + harness.setup(); + harness.open(); + + TestSourceContext sourceContext = new TestSourceContext(new CountDownLatch(1)); + runSourceFunctionInTask(sourceContext, function); + + awaitExpectedSplits(sourceContext); + + // Stop the stream task. + function.close(); + + TestHelpers.assertRecords( + sourceContext.toRows(), Lists.newArrayList(Iterables.concat(recordsList)), SCHEMA); + } + } + + @TestTemplate + public void testConsumeFromStartTag() throws Exception { + // Commit the first five transactions. + generateRecordsAndCommitTxn(5); + long startSnapshotId = table.currentSnapshot().snapshotId(); + String tagName = "t1"; + table.manageSnapshots().createTag(tagName, startSnapshotId).commit(); + + // Commit the next five transactions. + List> recordsList = generateRecordsAndCommitTxn(5); + + ScanContext scanContext = + ScanContext.builder().monitorInterval(Duration.ofMillis(100)).startTag(tagName).build(); + + StreamingMonitorFunction function = createFunction(scanContext); + try (AbstractStreamOperatorTestHarness harness = createHarness(function)) { + harness.setup(); + harness.open(); + + TestSourceContext sourceContext = new TestSourceContext(new CountDownLatch(1)); + runSourceFunctionInTask(sourceContext, function); + + awaitExpectedSplits(sourceContext); + + // Stop the stream task. 
+ function.close(); + + TestHelpers.assertRecords( + sourceContext.toRows(), Lists.newArrayList(Iterables.concat(recordsList)), SCHEMA); + } + } + + @TestTemplate + public void testCheckpointRestore() throws Exception { + List> recordsList = generateRecordsAndCommitTxn(10); + ScanContext scanContext = ScanContext.builder().monitorInterval(Duration.ofMillis(100)).build(); + + StreamingMonitorFunction func = createFunction(scanContext); + OperatorSubtaskState state; + try (AbstractStreamOperatorTestHarness harness = createHarness(func)) { + harness.setup(); + harness.open(); + + TestSourceContext sourceContext = new TestSourceContext(new CountDownLatch(1)); + runSourceFunctionInTask(sourceContext, func); + + awaitExpectedSplits(sourceContext); + + state = harness.snapshot(1, 1); + + // Stop the stream task. + func.close(); + + TestHelpers.assertRecords( + sourceContext.toRows(), Lists.newArrayList(Iterables.concat(recordsList)), SCHEMA); + } + + List> newRecordsList = generateRecordsAndCommitTxn(10); + StreamingMonitorFunction newFunc = createFunction(scanContext); + try (AbstractStreamOperatorTestHarness harness = createHarness(newFunc)) { + harness.setup(); + // Recover to process the remaining snapshots. + harness.initializeState(state); + harness.open(); + + TestSourceContext sourceContext = new TestSourceContext(new CountDownLatch(1)); + runSourceFunctionInTask(sourceContext, newFunc); + + awaitExpectedSplits(sourceContext); + + // Stop the stream task. + newFunc.close(); + + TestHelpers.assertRecords( + sourceContext.toRows(), Lists.newArrayList(Iterables.concat(newRecordsList)), SCHEMA); + } + } + + private void awaitExpectedSplits(TestSourceContext sourceContext) { + Awaitility.await("expected splits should be produced") + .atMost(Duration.ofMillis(WAIT_TIME_MILLIS)) + .untilAsserted( + () -> { + assertThat(sourceContext.latch.getCount()).isEqualTo(0); + assertThat(sourceContext.splits).as("Should produce the expected splits").hasSize(1); + }); + } + + @TestTemplate + public void testInvalidMaxPlanningSnapshotCount() { + ScanContext scanContext1 = + ScanContext.builder() + .monitorInterval(Duration.ofMillis(100)) + .maxPlanningSnapshotCount(0) + .build(); + + assertThatThrownBy(() -> createFunction(scanContext1)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("The max-planning-snapshot-count must be greater than zero"); + + ScanContext scanContext2 = + ScanContext.builder() + .monitorInterval(Duration.ofMillis(100)) + .maxPlanningSnapshotCount(-10) + .build(); + + assertThatThrownBy(() -> createFunction(scanContext2)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("The max-planning-snapshot-count must be greater than zero"); + } + + @TestTemplate + public void testConsumeWithMaxPlanningSnapshotCount() throws Exception { + generateRecordsAndCommitTxn(10); + + // Use the oldest snapshot as starting to avoid the initial case. 
+ long oldestSnapshotId = SnapshotUtil.oldestAncestor(table).snapshotId(); + + ScanContext scanContext = + ScanContext.builder() + .monitorInterval(Duration.ofMillis(100)) + .splitSize(1000L) + .startSnapshotId(oldestSnapshotId) + .maxPlanningSnapshotCount(Integer.MAX_VALUE) + .build(); + + FlinkInputSplit[] expectedSplits = + FlinkSplitPlanner.planInputSplits(table, scanContext, ThreadPools.getWorkerPool()); + + assertThat(expectedSplits).hasSize(9); + + // This covers three cases that maxPlanningSnapshotCount is less than, equal or greater than the + // total splits number + for (int maxPlanningSnapshotCount : ImmutableList.of(1, 9, 15)) { + scanContext = + ScanContext.builder() + .monitorInterval(Duration.ofMillis(500)) + .startSnapshotId(oldestSnapshotId) + .splitSize(1000L) + .maxPlanningSnapshotCount(maxPlanningSnapshotCount) + .build(); + + StreamingMonitorFunction function = createFunction(scanContext); + try (AbstractStreamOperatorTestHarness harness = createHarness(function)) { + harness.setup(); + harness.open(); + + CountDownLatch latch = new CountDownLatch(1); + TestSourceContext sourceContext = new TestSourceContext(latch); + function.sourceContext(sourceContext); + function.monitorAndForwardSplits(); + + if (maxPlanningSnapshotCount < 10) { + assertThat(sourceContext.splits).hasSize(maxPlanningSnapshotCount); + } + } + } + } + + private List> generateRecordsAndCommitTxn(int commitTimes) throws IOException { + List> expectedRecords = Lists.newArrayList(); + for (int i = 0; i < commitTimes; i++) { + List records = RandomGenericData.generate(SCHEMA, 100, 0L); + expectedRecords.add(records); + + // Commit those records to iceberg table. + writeRecords(records); + } + return expectedRecords; + } + + private void writeRecords(List records) throws IOException { + GenericAppenderHelper appender = new GenericAppenderHelper(table, DEFAULT_FORMAT, temp); + appender.appendToTable(records); + } + + private StreamingMonitorFunction createFunction(ScanContext scanContext) { + return new StreamingMonitorFunction( + TestTableLoader.of(tableDir.getAbsolutePath()), scanContext); + } + + private AbstractStreamOperatorTestHarness createHarness( + StreamingMonitorFunction function) throws Exception { + StreamSource streamSource = + new StreamSource<>(function); + return new AbstractStreamOperatorTestHarness<>(streamSource, 1, 1, 0); + } + + private class TestSourceContext implements SourceFunction.SourceContext { + private final List splits = Lists.newArrayList(); + private final Object checkpointLock = new Object(); + private final CountDownLatch latch; + + TestSourceContext(CountDownLatch latch) { + this.latch = latch; + } + + @Override + public void collect(FlinkInputSplit element) { + splits.add(element); + latch.countDown(); + } + + @Override + public void collectWithTimestamp(FlinkInputSplit element, long timestamp) { + collect(element); + } + + @Override + public void emitWatermark(Watermark mark) {} + + @Override + public void markAsTemporarilyIdle() {} + + @Override + public Object getCheckpointLock() { + return checkpointLock; + } + + @Override + public void close() {} + + private List toRows() throws IOException { + FlinkInputFormat format = + FlinkSource.forRowData() + .tableLoader(TestTableLoader.of(tableDir.getAbsolutePath())) + .buildFormat(); + + List rows = Lists.newArrayList(); + for (FlinkInputSplit split : splits) { + format.open(split); + + RowData element = null; + try { + while (!format.reachedEnd()) { + element = format.nextRecord(element); + 
rows.add(Row.of(element.getInt(0), element.getString(1).toString())); + } + } finally { + format.close(); + } + } + + return rows; + } + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingReaderOperator.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingReaderOperator.java new file mode 100644 index 000000000000..1606ee9f9648 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingReaderOperator.java @@ -0,0 +1,293 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import org.apache.flink.runtime.checkpoint.OperatorSubtaskState; +import org.apache.flink.streaming.api.TimeCharacteristic; +import org.apache.flink.streaming.api.operators.OneInputStreamOperatorFactory; +import org.apache.flink.streaming.runtime.tasks.StreamTaskActionExecutor; +import org.apache.flink.streaming.runtime.tasks.mailbox.MailboxDefaultAction; +import org.apache.flink.streaming.runtime.tasks.mailbox.SteppingMailboxProcessor; +import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness; +import org.apache.flink.table.data.RowData; +import org.apache.flink.types.Row; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.TestBase; +import org.apache.iceberg.data.GenericAppenderHelper; +import org.apache.iceberg.data.RandomGenericData; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.TestHelpers; +import org.apache.iceberg.flink.TestTableLoader; +import org.apache.iceberg.relocated.com.google.common.collect.Iterables; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.util.SnapshotUtil; +import org.apache.iceberg.util.ThreadPools; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.extension.ExtendWith; + +@ExtendWith(ParameterizedTestExtension.class) +public class TestStreamingReaderOperator extends TestBase { + + private static final Schema SCHEMA = + new Schema( + Types.NestedField.required(1, "id", Types.IntegerType.get()), + Types.NestedField.required(2, "data", Types.StringType.get())); + private static final FileFormat DEFAULT_FORMAT = FileFormat.PARQUET; + + @Parameters(name = "formatVersion = {0}") + protected 
static List parameters() { + return Arrays.asList(1, 2); + } + + @BeforeEach + @Override + public void setupTable() throws IOException { + this.tableDir = Files.createTempDirectory(temp, "junit").toFile(); + this.metadataDir = new File(tableDir, "metadata"); + assertThat(tableDir.delete()).isTrue(); + + // Construct the iceberg table. + table = create(SCHEMA, PartitionSpec.unpartitioned()); + } + + @TestTemplate + public void testProcessAllRecords() throws Exception { + List> expectedRecords = generateRecordsAndCommitTxn(10); + + List splits = generateSplits(); + assertThat(splits).hasSize(10); + + try (OneInputStreamOperatorTestHarness harness = createReader()) { + harness.setup(); + harness.open(); + + SteppingMailboxProcessor processor = createLocalMailbox(harness); + + List expected = Lists.newArrayList(); + for (int i = 0; i < splits.size(); i++) { + // Process this element to enqueue to mail-box. + harness.processElement(splits.get(i), -1); + + // Run the mail-box once to read all records from the given split. + assertThat(processor.runMailboxStep()).as("Should processed 1 split").isTrue(); + + // Assert the output has expected elements. + expected.addAll(expectedRecords.get(i)); + TestHelpers.assertRecords(readOutputValues(harness), expected, SCHEMA); + } + } + } + + @TestTemplate + public void testTriggerCheckpoint() throws Exception { + // Received emitted splits: split1, split2, split3, checkpoint request is triggered when reading + // records from + // split1. + List> expectedRecords = generateRecordsAndCommitTxn(3); + + List splits = generateSplits(); + assertThat(splits).hasSize(3); + + long timestamp = 0; + try (OneInputStreamOperatorTestHarness harness = createReader()) { + harness.setup(); + harness.open(); + + SteppingMailboxProcessor processor = createLocalMailbox(harness); + + harness.processElement(splits.get(0), ++timestamp); + harness.processElement(splits.get(1), ++timestamp); + harness.processElement(splits.get(2), ++timestamp); + + // Trigger snapshot state, it will start to work once all records from split0 are read. + processor.getMainMailboxExecutor().execute(() -> harness.snapshot(1, 3), "Trigger snapshot"); + + assertThat(processor.runMailboxStep()).as("Should have processed the split0").isTrue(); + assertThat(processor.runMailboxStep()) + .as("Should have processed the snapshot state action") + .isTrue(); + + TestHelpers.assertRecords(readOutputValues(harness), expectedRecords.get(0), SCHEMA); + + // Read records from split1. + assertThat(processor.runMailboxStep()).as("Should have processed the split1").isTrue(); + + // Read records from split2. + assertThat(processor.runMailboxStep()).as("Should have processed the split2").isTrue(); + + TestHelpers.assertRecords( + readOutputValues(harness), Lists.newArrayList(Iterables.concat(expectedRecords)), SCHEMA); + } + } + + @TestTemplate + public void testCheckpointRestore() throws Exception { + List> expectedRecords = generateRecordsAndCommitTxn(15); + + List splits = generateSplits(); + assertThat(splits).hasSize(15); + + OperatorSubtaskState state; + List expected = Lists.newArrayList(); + try (OneInputStreamOperatorTestHarness harness = createReader()) { + harness.setup(); + harness.open(); + + // Enqueue all the splits. + for (FlinkInputSplit split : splits) { + harness.processElement(split, -1); + } + + // Read all records from the first five splits. 
+ SteppingMailboxProcessor localMailbox = createLocalMailbox(harness); + for (int i = 0; i < 5; i++) { + expected.addAll(expectedRecords.get(i)); + assertThat(localMailbox.runMailboxStep()) + .as("Should have processed the split#" + i) + .isTrue(); + + TestHelpers.assertRecords(readOutputValues(harness), expected, SCHEMA); + } + + // Snapshot state now, there're 10 splits left in the state. + state = harness.snapshot(1, 1); + } + + expected.clear(); + try (OneInputStreamOperatorTestHarness harness = createReader()) { + harness.setup(); + // Recover to process the remaining splits. + harness.initializeState(state); + harness.open(); + + SteppingMailboxProcessor localMailbox = createLocalMailbox(harness); + + for (int i = 5; i < 10; i++) { + expected.addAll(expectedRecords.get(i)); + assertThat(localMailbox.runMailboxStep()) + .as("Should have processed the split#" + i) + .isTrue(); + + TestHelpers.assertRecords(readOutputValues(harness), expected, SCHEMA); + } + + // Let's process the final 5 splits now. + for (int i = 10; i < 15; i++) { + expected.addAll(expectedRecords.get(i)); + harness.processElement(splits.get(i), 1); + + assertThat(localMailbox.runMailboxStep()) + .as("Should have processed the split#" + i) + .isTrue(); + TestHelpers.assertRecords(readOutputValues(harness), expected, SCHEMA); + } + } + } + + private List readOutputValues( + OneInputStreamOperatorTestHarness harness) { + List results = Lists.newArrayList(); + for (RowData rowData : harness.extractOutputValues()) { + results.add(Row.of(rowData.getInt(0), rowData.getString(1).toString())); + } + return results; + } + + private List> generateRecordsAndCommitTxn(int commitTimes) throws IOException { + List> expectedRecords = Lists.newArrayList(); + for (int i = 0; i < commitTimes; i++) { + List records = RandomGenericData.generate(SCHEMA, 100, 0L); + expectedRecords.add(records); + + // Commit those records to iceberg table. + writeRecords(records); + } + return expectedRecords; + } + + private void writeRecords(List records) throws IOException { + GenericAppenderHelper appender = new GenericAppenderHelper(table, DEFAULT_FORMAT, temp); + appender.appendToTable(records); + } + + private List generateSplits() { + List inputSplits = Lists.newArrayList(); + + List snapshotIds = SnapshotUtil.currentAncestorIds(table); + for (int i = snapshotIds.size() - 1; i >= 0; i--) { + ScanContext scanContext; + if (i == snapshotIds.size() - 1) { + // Generate the splits from the first snapshot. + scanContext = ScanContext.builder().useSnapshotId(snapshotIds.get(i)).build(); + } else { + // Generate the splits between the previous snapshot and current snapshot. + scanContext = + ScanContext.builder() + .startSnapshotId(snapshotIds.get(i + 1)) + .endSnapshotId(snapshotIds.get(i)) + .build(); + } + + Collections.addAll( + inputSplits, + FlinkSplitPlanner.planInputSplits(table, scanContext, ThreadPools.getWorkerPool())); + } + + return inputSplits; + } + + private OneInputStreamOperatorTestHarness createReader() + throws Exception { + // This input format is used to opening the emitted split. 
+ FlinkInputFormat inputFormat = + FlinkSource.forRowData() + .tableLoader(TestTableLoader.of(tableDir.getAbsolutePath())) + .buildFormat(); + + OneInputStreamOperatorFactory factory = + StreamingReaderOperator.factory(inputFormat); + OneInputStreamOperatorTestHarness harness = + new OneInputStreamOperatorTestHarness<>(factory, 1, 1, 0); + harness.getStreamConfig().setTimeCharacteristic(TimeCharacteristic.ProcessingTime); + + return harness; + } + + private SteppingMailboxProcessor createLocalMailbox( + OneInputStreamOperatorTestHarness harness) { + return new SteppingMailboxProcessor( + MailboxDefaultAction.Controller::suspendDefaultAction, + harness.getTaskMailbox(), + StreamTaskActionExecutor.IMMEDIATE); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/assigner/SplitAssignerTestBase.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/assigner/SplitAssignerTestBase.java new file mode 100644 index 000000000000..1e612b0a2b2a --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/assigner/SplitAssignerTestBase.java @@ -0,0 +1,132 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source.assigner; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.fail; + +import java.nio.file.Path; +import java.util.Collection; +import java.util.List; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.atomic.AtomicBoolean; +import org.apache.iceberg.flink.source.SplitHelpers; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; +import org.apache.iceberg.flink.source.split.IcebergSourceSplitState; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +public abstract class SplitAssignerTestBase { + @TempDir protected Path temporaryFolder; + + @Test + public void testEmptyInitialization() { + SplitAssigner assigner = splitAssigner(); + assertGetNext(assigner, GetSplitResult.Status.UNAVAILABLE); + } + + /** Test a sequence of interactions for StaticEnumerator */ + @Test + public void testStaticEnumeratorSequence() throws Exception { + SplitAssigner assigner = splitAssigner(); + assigner.onDiscoveredSplits(createSplits(4, 1, "1")); + + assertGetNext(assigner, GetSplitResult.Status.AVAILABLE); + assertGetNext(assigner, GetSplitResult.Status.AVAILABLE); + assertGetNext(assigner, GetSplitResult.Status.AVAILABLE); + assertSnapshot(assigner, 1); + assigner.onUnassignedSplits(createSplits(1, 1, "1")); + assertSnapshot(assigner, 2); + + assertGetNext(assigner, GetSplitResult.Status.AVAILABLE); + assertGetNext(assigner, GetSplitResult.Status.AVAILABLE); + assertGetNext(assigner, GetSplitResult.Status.UNAVAILABLE); + assertSnapshot(assigner, 0); + } + + /** Test a sequence of interactions for ContinuousEnumerator */ + @Test + public void testContinuousEnumeratorSequence() throws Exception { + SplitAssigner assigner = splitAssigner(); + assertGetNext(assigner, GetSplitResult.Status.UNAVAILABLE); + + List splits1 = createSplits(1, 1, "1"); + assertAvailableFuture(assigner, 1, () -> assigner.onDiscoveredSplits(splits1)); + List splits2 = createSplits(1, 1, "1"); + assertAvailableFuture(assigner, 1, () -> assigner.onUnassignedSplits(splits2)); + + assigner.onDiscoveredSplits(createSplits(2, 1, "1")); + assertSnapshot(assigner, 2); + assertGetNext(assigner, GetSplitResult.Status.AVAILABLE); + assertGetNext(assigner, GetSplitResult.Status.AVAILABLE); + assertGetNext(assigner, GetSplitResult.Status.UNAVAILABLE); + assertSnapshot(assigner, 0); + } + + private void assertAvailableFuture( + SplitAssigner assigner, int splitCount, Runnable addSplitsRunnable) { + // register callback + AtomicBoolean futureCompleted = new AtomicBoolean(); + CompletableFuture future = assigner.isAvailable(); + future.thenAccept(ignored -> futureCompleted.set(true)); + // calling isAvailable again should return the same object reference + // note that thenAccept will return a new future. 
+ // we want to assert the same instance on the assigner returned future + assertThat(assigner.isAvailable()).isSameAs(future); + + // now add some splits + addSplitsRunnable.run(); + assertThat(futureCompleted.get()).isTrue(); + + for (int i = 0; i < splitCount; ++i) { + assertGetNext(assigner, GetSplitResult.Status.AVAILABLE); + } + assertGetNext(assigner, GetSplitResult.Status.UNAVAILABLE); + assertSnapshot(assigner, 0); + } + + protected void assertGetNext(SplitAssigner assigner, GetSplitResult.Status expectedStatus) { + GetSplitResult result = assigner.getNext(null); + assertThat(result.status()).isEqualTo(expectedStatus); + switch (expectedStatus) { + case AVAILABLE: + assertThat(result.split()).isNotNull(); + break; + case CONSTRAINED: + case UNAVAILABLE: + assertThat(result.split()).isNull(); + break; + default: + fail("Unknown status: %s", expectedStatus); + } + } + + protected void assertSnapshot(SplitAssigner assigner, int splitCount) { + Collection stateBeforeGet = assigner.state(); + assertThat(stateBeforeGet).hasSize(splitCount); + } + + protected List createSplits(int fileCount, int filesPerSplit, String version) + throws Exception { + return SplitHelpers.createSplitsFromTransientHadoopTable( + temporaryFolder, fileCount, filesPerSplit, version); + } + + protected abstract SplitAssigner splitAssigner(); +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestDefaultSplitAssigner.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestDefaultSplitAssigner.java new file mode 100644 index 000000000000..17e64bbf0594 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestDefaultSplitAssigner.java @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source.assigner; + +import org.apache.iceberg.flink.source.SplitHelpers; +import org.junit.jupiter.api.Test; + +public class TestDefaultSplitAssigner extends SplitAssignerTestBase { + @Override + protected SplitAssigner splitAssigner() { + return new DefaultSplitAssigner(null); + } + + /** Test the assigner when multiple files are in a single split */ + @Test + public void testMultipleFilesInASplit() throws Exception { + SplitAssigner assigner = splitAssigner(); + assigner.onDiscoveredSplits( + SplitHelpers.createSplitsFromTransientHadoopTable(temporaryFolder, 4, 2)); + + assertGetNext(assigner, GetSplitResult.Status.AVAILABLE); + assertSnapshot(assigner, 1); + assertGetNext(assigner, GetSplitResult.Status.AVAILABLE); + assertGetNext(assigner, GetSplitResult.Status.UNAVAILABLE); + assertSnapshot(assigner, 0); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestFileSequenceNumberBasedSplitAssigner.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestFileSequenceNumberBasedSplitAssigner.java new file mode 100644 index 000000000000..ff63ba8e58a0 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestFileSequenceNumberBasedSplitAssigner.java @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */
+package org.apache.iceberg.flink.source.assigner;
+
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.assertj.core.api.Assertions.assertThatThrownBy;
+
+import java.util.List;
+import org.apache.iceberg.ContentFile;
+import org.apache.iceberg.flink.source.split.IcebergSourceSplit;
+import org.apache.iceberg.flink.source.split.SerializableComparator;
+import org.apache.iceberg.flink.source.split.SplitComparators;
+import org.apache.iceberg.util.SerializationUtil;
+import org.junit.jupiter.api.Test;
+
+public class TestFileSequenceNumberBasedSplitAssigner extends SplitAssignerTestBase {
+  @Override
+  protected SplitAssigner splitAssigner() {
+    return new OrderedSplitAssignerFactory(SplitComparators.fileSequenceNumber()).createAssigner();
+  }
+
+  /** Test the assigner when multiple files are in a single split */
+  @Test
+  public void testMultipleFilesInAnIcebergSplit() {
+    SplitAssigner assigner = splitAssigner();
+    assertThatThrownBy(
+            () -> assigner.onDiscoveredSplits(createSplits(4, 2, "2")),
+            "Multiple files in a split is not allowed")
+        .isInstanceOf(IllegalArgumentException.class)
+        .hasMessageContaining("Please use 'split-open-file-cost'");
+  }
+
+  /** Test sorted splits */
+  @Test
+  public void testSplitSort() throws Exception {
+    SplitAssigner assigner = splitAssigner();
+    List<IcebergSourceSplit> splits = createSplits(5, 1, "2");
+
+    assigner.onDiscoveredSplits(splits.subList(3, 5));
+    assigner.onDiscoveredSplits(splits.subList(0, 1));
+    assigner.onDiscoveredSplits(splits.subList(1, 3));
+
+    assertGetNext(assigner, 1L);
+    assertGetNext(assigner, 2L);
+    assertGetNext(assigner, 3L);
+    assertGetNext(assigner, 4L);
+    assertGetNext(assigner, 5L);
+
+    assertGetNext(assigner, GetSplitResult.Status.UNAVAILABLE);
+  }
+
+  @Test
+  public void testSerializable() {
+    byte[] bytes = SerializationUtil.serializeToBytes(SplitComparators.fileSequenceNumber());
+    SerializableComparator<IcebergSourceSplit> comparator =
+        SerializationUtil.deserializeFromBytes(bytes);
+    assertThat(comparator).isNotNull();
+  }
+
+  private void assertGetNext(SplitAssigner assigner, Long expectedSequenceNumber) {
+    GetSplitResult result = assigner.getNext(null);
+    ContentFile<?> file = result.split().task().files().iterator().next().file();
+    assertThat(file.fileSequenceNumber()).isEqualTo(expectedSequenceNumber);
+  }
+}
diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestWatermarkBasedSplitAssigner.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestWatermarkBasedSplitAssigner.java
new file mode 100644
index 000000000000..84f04d5a530a
--- /dev/null
+++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestWatermarkBasedSplitAssigner.java
@@ -0,0 +1,146 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.assigner; + +import static org.apache.iceberg.types.Types.NestedField.required; +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.IOException; +import java.time.Instant; +import java.time.LocalDateTime; +import java.time.ZoneOffset; +import java.time.temporal.ChronoUnit; +import java.util.List; +import java.util.concurrent.TimeUnit; +import java.util.stream.Collectors; +import java.util.stream.IntStream; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Schema; +import org.apache.iceberg.data.GenericAppenderFactory; +import org.apache.iceberg.data.GenericRecord; +import org.apache.iceberg.data.RandomGenericData; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.flink.source.reader.ColumnStatsWatermarkExtractor; +import org.apache.iceberg.flink.source.reader.ReaderUtil; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; +import org.apache.iceberg.flink.source.split.SerializableComparator; +import org.apache.iceberg.flink.source.split.SplitComparators; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.util.SerializationUtil; +import org.junit.jupiter.api.Test; + +public class TestWatermarkBasedSplitAssigner extends SplitAssignerTestBase { + public static final Schema SCHEMA = + new Schema(required(1, "timestamp_column", Types.TimestampType.withoutZone())); + private static final GenericAppenderFactory APPENDER_FACTORY = new GenericAppenderFactory(SCHEMA); + + @Override + protected SplitAssigner splitAssigner() { + return new OrderedSplitAssignerFactory( + SplitComparators.watermark( + new ColumnStatsWatermarkExtractor(SCHEMA, "timestamp_column", null))) + .createAssigner(); + } + + /** Test the assigner when multiple files are in a single split */ + @Test + public void testMultipleFilesInAnIcebergSplit() { + SplitAssigner assigner = splitAssigner(); + assigner.onDiscoveredSplits(createSplits(4, 2, "2")); + + assertGetNext(assigner, GetSplitResult.Status.AVAILABLE); + assertGetNext(assigner, GetSplitResult.Status.AVAILABLE); + assertGetNext(assigner, GetSplitResult.Status.UNAVAILABLE); + } + + /** Test sorted splits */ + @Test + public void testSplitSort() { + SplitAssigner assigner = splitAssigner(); + + Instant now = Instant.now(); + List splits = + IntStream.range(0, 5) + .mapToObj(i -> splitFromInstant(now.plus(i, ChronoUnit.MINUTES))) + .collect(Collectors.toList()); + + assigner.onDiscoveredSplits(splits.subList(3, 5)); + assigner.onDiscoveredSplits(splits.subList(0, 1)); + assigner.onDiscoveredSplits(splits.subList(1, 3)); + + assertGetNext(assigner, splits.get(0)); + assertGetNext(assigner, splits.get(1)); + assertGetNext(assigner, splits.get(2)); + assertGetNext(assigner, splits.get(3)); + assertGetNext(assigner, splits.get(4)); + + assertGetNext(assigner, GetSplitResult.Status.UNAVAILABLE); + } + + @Test + public void testSerializable() { + byte[] bytes = + SerializationUtil.serializeToBytes( + SplitComparators.watermark( + new ColumnStatsWatermarkExtractor( + TestFixtures.SCHEMA, "id", TimeUnit.MILLISECONDS))); + SerializableComparator comparator = + SerializationUtil.deserializeFromBytes(bytes); + assertThat(comparator).isNotNull(); + } + + private void assertGetNext(SplitAssigner assigner, IcebergSourceSplit split) { 
+    GetSplitResult result = assigner.getNext(null);
+    assertThat(split).isEqualTo(result.split());
+  }
+
+  @Override
+  protected List<IcebergSourceSplit> createSplits(
+      int fileCount, int filesPerSplit, String version) {
+    return IntStream.range(0, fileCount / filesPerSplit)
+        .mapToObj(
+            splitNum ->
+                splitFromRecords(
+                    IntStream.range(0, filesPerSplit)
+                        .mapToObj(
+                            fileNum ->
+                                RandomGenericData.generate(
+                                    SCHEMA, 2, (long) splitNum * filesPerSplit + fileNum))
+                        .collect(Collectors.toList())))
+        .collect(Collectors.toList());
+  }
+
+  private IcebergSourceSplit splitFromInstant(Instant instant) {
+    Record record = GenericRecord.create(SCHEMA);
+    record.set(0, LocalDateTime.ofInstant(instant, ZoneOffset.UTC));
+    return splitFromRecords(ImmutableList.of(ImmutableList.of(record)));
+  }
+
+  private IcebergSourceSplit splitFromRecords(List<List<Record>> records) {
+    try {
+      return IcebergSourceSplit.fromCombinedScanTask(
+          ReaderUtil.createCombinedScanTask(
+              records, temporaryFolder, FileFormat.PARQUET, APPENDER_FACTORY));
+    } catch (IOException e) {
+      throw new RuntimeException("Split creation exception", e);
+    }
+  }
+}
diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/ManualContinuousSplitPlanner.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/ManualContinuousSplitPlanner.java
new file mode 100644
index 000000000000..ebc92df02360
--- /dev/null
+++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/ManualContinuousSplitPlanner.java
@@ -0,0 +1,97 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.flink.source.enumerator;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.NavigableMap;
+import java.util.TreeMap;
+import org.apache.iceberg.flink.source.ScanContext;
+import org.apache.iceberg.flink.source.split.IcebergSourceSplit;
+import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
+import org.apache.iceberg.relocated.com.google.common.collect.Lists;
+
+class ManualContinuousSplitPlanner implements ContinuousSplitPlanner {
+  private final int maxPlanningSnapshotCount;
+  // track splits per snapshot
+  private final NavigableMap<Long, List<IcebergSourceSplit>> splits;
+  private long latestSnapshotId;
+  private int remainingFailures;
+
+  ManualContinuousSplitPlanner(ScanContext scanContext, int expectedFailures) {
+    this.maxPlanningSnapshotCount = scanContext.maxPlanningSnapshotCount();
+    this.splits = new TreeMap<>();
+    this.latestSnapshotId = 0L;
+    this.remainingFailures = expectedFailures;
+  }
+
+  @Override
+  public synchronized ContinuousEnumerationResult planSplits(
+      IcebergEnumeratorPosition lastPosition) {
+    if (remainingFailures > 0) {
+      remainingFailures--;
+      throw new RuntimeException("Expected failure at planning");
+    }
+
+    long fromSnapshotIdExclusive = 0;
+    if (lastPosition != null && lastPosition.snapshotId() != null) {
+      fromSnapshotIdExclusive = lastPosition.snapshotId();
+    }
+
+    Preconditions.checkArgument(
+        fromSnapshotIdExclusive <= latestSnapshotId,
+        "last enumerated snapshotId is greater than the latestSnapshotId");
+    if (fromSnapshotIdExclusive == latestSnapshotId) {
+      // already discovered everything.
+      return new ContinuousEnumerationResult(Lists.newArrayList(), lastPosition, lastPosition);
+    }
+
+    // find the subset of snapshots to return discovered splits
+    long toSnapshotIdInclusive;
+    if (latestSnapshotId - fromSnapshotIdExclusive > maxPlanningSnapshotCount) {
+      toSnapshotIdInclusive = fromSnapshotIdExclusive + maxPlanningSnapshotCount;
+    } else {
+      toSnapshotIdInclusive = latestSnapshotId;
+    }
+
+    List<IcebergSourceSplit> discoveredSplits = Lists.newArrayList();
+    NavigableMap<Long, List<IcebergSourceSplit>> discoveredView =
+        splits.subMap(fromSnapshotIdExclusive, false, toSnapshotIdInclusive, true);
+    discoveredView.forEach((snapshotId, snapshotSplits) -> discoveredSplits.addAll(snapshotSplits));
+    ContinuousEnumerationResult result =
+        new ContinuousEnumerationResult(
+            discoveredSplits,
+            lastPosition,
+            // use the snapshot Id as snapshot timestamp.
+            IcebergEnumeratorPosition.of(toSnapshotIdInclusive, toSnapshotIdInclusive));
+    return result;
+  }
+
+  /**
+   * Add a collection of new splits. A monotonically increased snapshotId is assigned to each batch
+   * of splits added by this method.
+   */
+  public synchronized void addSplits(List<IcebergSourceSplit> newSplits) {
+    latestSnapshotId += 1;
+    splits.put(latestSnapshotId, newSplits);
+  }
+
+  @Override
+  public void close() throws IOException {}
+}
diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousIcebergEnumerator.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousIcebergEnumerator.java
new file mode 100644
index 000000000000..41a787762fda
--- /dev/null
+++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousIcebergEnumerator.java
@@ -0,0 +1,352 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.enumerator; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.nio.file.Path; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.List; +import java.util.stream.Collectors; +import org.apache.flink.api.connector.source.SplitEnumeratorContext; +import org.apache.flink.connector.testutils.source.reader.TestingSplitEnumeratorContext; +import org.apache.iceberg.flink.source.ScanContext; +import org.apache.iceberg.flink.source.SplitHelpers; +import org.apache.iceberg.flink.source.StreamingStartingStrategy; +import org.apache.iceberg.flink.source.assigner.DefaultSplitAssigner; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; +import org.apache.iceberg.flink.source.split.IcebergSourceSplitState; +import org.apache.iceberg.flink.source.split.IcebergSourceSplitStatus; +import org.apache.iceberg.flink.source.split.SplitRequestEvent; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +public class TestContinuousIcebergEnumerator { + @TempDir protected Path temporaryFolder; + + @Test + public void testDiscoverSplitWhenNoReaderRegistered() throws Exception { + TestingSplitEnumeratorContext enumeratorContext = + new TestingSplitEnumeratorContext<>(4); + ScanContext scanContext = + ScanContext.builder() + .streaming(true) + .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) + .build(); + ManualContinuousSplitPlanner splitPlanner = new ManualContinuousSplitPlanner(scanContext, 0); + ContinuousIcebergEnumerator enumerator = + createEnumerator(enumeratorContext, scanContext, splitPlanner); + + Collection pendingSplitsEmpty = + enumerator.snapshotState(1).pendingSplits(); + assertThat(pendingSplitsEmpty).isEmpty(); + + // make one split available and trigger the periodic discovery + List splits = + SplitHelpers.createSplitsFromTransientHadoopTable(temporaryFolder, 1, 1); + splitPlanner.addSplits(splits); + enumeratorContext.triggerAllActions(); + + Collection pendingSplits = enumerator.snapshotState(2).pendingSplits(); + assertThat(pendingSplits).hasSize(1); + IcebergSourceSplitState pendingSplit = pendingSplits.iterator().next(); + assertThat(pendingSplit.split().splitId()).isEqualTo(splits.get(0).splitId()); + assertThat(pendingSplit.status()).isEqualTo(IcebergSourceSplitStatus.UNASSIGNED); + } + + @Test + public void testDiscoverWhenReaderRegistered() throws Exception { + TestingSplitEnumeratorContext enumeratorContext = + new TestingSplitEnumeratorContext<>(4); + ScanContext scanContext = + ScanContext.builder() + .streaming(true) + .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) + .build(); + ManualContinuousSplitPlanner splitPlanner = new ManualContinuousSplitPlanner(scanContext, 0); + ContinuousIcebergEnumerator enumerator = + 
createEnumerator(enumeratorContext, scanContext, splitPlanner); + + // register one reader, and let it request a split + enumeratorContext.registerReader(2, "localhost"); + enumerator.addReader(2); + enumerator.handleSourceEvent(2, new SplitRequestEvent()); + + // make one split available and trigger the periodic discovery + List splits = + SplitHelpers.createSplitsFromTransientHadoopTable(temporaryFolder, 1, 1); + splitPlanner.addSplits(splits); + enumeratorContext.triggerAllActions(); + + assertThat(enumerator.snapshotState(1).pendingSplits()).isEmpty(); + assertThat(enumeratorContext.getSplitAssignments().get(2).getAssignedSplits()) + .contains(splits.get(0)); + } + + @Test + public void testRequestingReaderUnavailableWhenSplitDiscovered() throws Exception { + TestingSplitEnumeratorContext enumeratorContext = + new TestingSplitEnumeratorContext<>(4); + ScanContext scanContext = + ScanContext.builder() + .streaming(true) + .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) + .build(); + ManualContinuousSplitPlanner splitPlanner = new ManualContinuousSplitPlanner(scanContext, 0); + ContinuousIcebergEnumerator enumerator = + createEnumerator(enumeratorContext, scanContext, splitPlanner); + + // register one reader, and let it request a split + enumeratorContext.registerReader(2, "localhost"); + enumerator.addReader(2); + enumerator.handleSourceEvent(2, new SplitRequestEvent()); + + // remove the reader (like in a failure) + enumeratorContext.registeredReaders().remove(2); + + // make one split available and trigger the periodic discovery + List splits = + SplitHelpers.createSplitsFromTransientHadoopTable(temporaryFolder, 1, 1); + assertThat(splits).hasSize(1); + splitPlanner.addSplits(splits); + enumeratorContext.triggerAllActions(); + + assertThat(enumeratorContext.getSplitAssignments()).doesNotContainKey(2); + List pendingSplitIds = + enumerator.snapshotState(1).pendingSplits().stream() + .map(IcebergSourceSplitState::split) + .map(IcebergSourceSplit::splitId) + .collect(Collectors.toList()); + assertThat(pendingSplitIds).hasSameSizeAs(splits).first().isEqualTo(splits.get(0).splitId()); + + // register the reader again, and let it request a split + enumeratorContext.registerReader(2, "localhost"); + enumerator.addReader(2); + enumerator.handleSourceEvent(2, new SplitRequestEvent()); + + assertThat(enumerator.snapshotState(2).pendingSplits()).isEmpty(); + assertThat(enumeratorContext.getSplitAssignments().get(2).getAssignedSplits()) + .contains(splits.get(0)); + } + + @Test + public void testThrottlingDiscovery() throws Exception { + // create 10 splits + List splits = + SplitHelpers.createSplitsFromTransientHadoopTable(temporaryFolder, 10, 1); + + TestingSplitEnumeratorContext enumeratorContext = + new TestingSplitEnumeratorContext<>(4); + ScanContext scanContext = + ScanContext.builder() + .streaming(true) + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_EARLIEST_SNAPSHOT) + // discover one snapshot at a time + .maxPlanningSnapshotCount(1) + .build(); + ManualContinuousSplitPlanner splitPlanner = new ManualContinuousSplitPlanner(scanContext, 0); + ContinuousIcebergEnumerator enumerator = + createEnumerator(enumeratorContext, scanContext, splitPlanner); + + // register reader-2, and let it request a split + enumeratorContext.registerReader(2, "localhost"); + enumerator.addReader(2); + enumerator.handleSourceEvent(2, new SplitRequestEvent()); + + // add splits[0] to the planner for next discovery + splitPlanner.addSplits(Arrays.asList(splits.get(0))); + 
enumeratorContext.triggerAllActions(); + + // because discovered split was assigned to reader, pending splits should be empty + assertThat(enumerator.snapshotState(1).pendingSplits()).isEmpty(); + // split assignment to reader-2 should contain splits[0, 1) + assertThat(enumeratorContext.getSplitAssignments().get(2).getAssignedSplits()) + .containsExactlyElementsOf(splits.subList(0, 1)); + + // add the remaining 9 splits (one for every snapshot) + // run discovery cycles while reader-2 still processing the splits[0] + for (int i = 1; i < 10; ++i) { + splitPlanner.addSplits(Arrays.asList(splits.get(i))); + enumeratorContext.triggerAllActions(); + } + + // can only discover up to 3 snapshots/splits + assertThat(enumerator.snapshotState(2).pendingSplits()).hasSize(3); + // split assignment to reader-2 should be splits[0, 1) + assertThat(enumeratorContext.getSplitAssignments().get(2).getAssignedSplits()) + .containsExactlyElementsOf(splits.subList(0, 1)); + + // now reader-2 finished splits[0] + enumerator.handleSourceEvent(2, new SplitRequestEvent(Arrays.asList(splits.get(0).splitId()))); + enumeratorContext.triggerAllActions(); + // still have 3 pending splits. After assigned splits[1] to reader-2, one more split was + // discovered and added. + assertThat(enumerator.snapshotState(3).pendingSplits()).hasSize(3); + // split assignment to reader-2 should be splits[0, 2) + assertThat(enumeratorContext.getSplitAssignments().get(2).getAssignedSplits()) + .containsExactlyElementsOf(splits.subList(0, 2)); + + // run 3 more split discovery cycles + for (int i = 0; i < 3; ++i) { + enumeratorContext.triggerAllActions(); + } + + // no more splits are discovered due to throttling + assertThat(enumerator.snapshotState(4).pendingSplits()).hasSize(3); + // split assignment to reader-2 should still be splits[0, 2) + assertThat(enumeratorContext.getSplitAssignments().get(2).getAssignedSplits()) + .containsExactlyElementsOf(splits.subList(0, 2)); + + // now reader-2 finished splits[1] + enumerator.handleSourceEvent(2, new SplitRequestEvent(Arrays.asList(splits.get(1).splitId()))); + enumeratorContext.triggerAllActions(); + // still have 3 pending splits. After assigned new splits[2] to reader-2, one more split was + // discovered and added. 
+ assertThat(enumerator.snapshotState(5).pendingSplits()).hasSize(3); + // split assignment to reader-2 should be splits[0, 3) + assertThat(enumeratorContext.getSplitAssignments().get(2).getAssignedSplits()) + .containsExactlyElementsOf(splits.subList(0, 3)); + } + + @Test + public void testTransientPlanningErrorsWithSuccessfulRetry() throws Exception { + TestingSplitEnumeratorContext enumeratorContext = + new TestingSplitEnumeratorContext<>(4); + ScanContext scanContext = + ScanContext.builder() + .streaming(true) + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_EARLIEST_SNAPSHOT) + .maxPlanningSnapshotCount(1) + .maxAllowedPlanningFailures(2) + .build(); + ManualContinuousSplitPlanner splitPlanner = new ManualContinuousSplitPlanner(scanContext, 1); + ContinuousIcebergEnumerator enumerator = + createEnumerator(enumeratorContext, scanContext, splitPlanner); + + // Make one split available and trigger the periodic discovery + List splits = + SplitHelpers.createSplitsFromTransientHadoopTable(temporaryFolder, 1, 1); + splitPlanner.addSplits(splits); + + // Trigger a planning and check that no splits returned due to the planning error + enumeratorContext.triggerAllActions(); + assertThat(enumerator.snapshotState(2).pendingSplits()).isEmpty(); + + // Second scan planning should succeed and discover the expected splits + enumeratorContext.triggerAllActions(); + Collection pendingSplits = enumerator.snapshotState(3).pendingSplits(); + assertThat(pendingSplits).hasSize(1); + IcebergSourceSplitState pendingSplit = pendingSplits.iterator().next(); + assertThat(pendingSplit.split().splitId()).isEqualTo(splits.get(0).splitId()); + assertThat(pendingSplit.status()).isEqualTo(IcebergSourceSplitStatus.UNASSIGNED); + } + + @Test + public void testOverMaxAllowedPlanningErrors() throws Exception { + TestingSplitEnumeratorContext enumeratorContext = + new TestingSplitEnumeratorContext<>(4); + ScanContext scanContext = + ScanContext.builder() + .streaming(true) + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_EARLIEST_SNAPSHOT) + .maxPlanningSnapshotCount(1) + .maxAllowedPlanningFailures(1) + .build(); + ManualContinuousSplitPlanner splitPlanner = new ManualContinuousSplitPlanner(scanContext, 2); + createEnumerator(enumeratorContext, scanContext, splitPlanner); + + // Make one split available and trigger the periodic discovery + List splits = + SplitHelpers.createSplitsFromTransientHadoopTable(temporaryFolder, 1, 1); + splitPlanner.addSplits(splits); + + // Check that the scheduler response ignores the current error and continues to run until the + // failure limit is reached + enumeratorContext.triggerAllActions(); + assertThat(enumeratorContext.getExecutorService().getAllScheduledTasks().get(0).isDone()) + .isFalse(); + + // Check that the task has failed with the expected exception after the failure limit is reached + enumeratorContext.triggerAllActions(); + assertThat(enumeratorContext.getExecutorService().getAllScheduledTasks().get(0).isDone()) + .isTrue(); + assertThatThrownBy( + () -> enumeratorContext.getExecutorService().getAllScheduledTasks().get(0).get()) + .hasCauseInstanceOf(RuntimeException.class) + .hasMessageContaining("Failed to discover new split"); + } + + @Test + public void testPlanningIgnoringErrors() throws Exception { + int expectedFailures = 3; + TestingSplitEnumeratorContext enumeratorContext = + new TestingSplitEnumeratorContext<>(4); + ScanContext scanContext = + ScanContext.builder() + .streaming(true) + 
.startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_EARLIEST_SNAPSHOT) + .maxPlanningSnapshotCount(1) + .maxAllowedPlanningFailures(-1) + .build(); + ManualContinuousSplitPlanner splitPlanner = + new ManualContinuousSplitPlanner(scanContext, expectedFailures); + ContinuousIcebergEnumerator enumerator = + createEnumerator(enumeratorContext, scanContext, splitPlanner); + + // Make one split available and trigger the periodic discovery + List splits = + SplitHelpers.createSplitsFromTransientHadoopTable(temporaryFolder, 1, 1); + splitPlanner.addSplits(splits); + + Collection pendingSplits; + // Can not discover the new split with planning failures + for (int i = 0; i < expectedFailures; ++i) { + enumeratorContext.triggerAllActions(); + pendingSplits = enumerator.snapshotState(i).pendingSplits(); + assertThat(pendingSplits).isEmpty(); + } + + // Discovered the new split after a successful scan planning + enumeratorContext.triggerAllActions(); + pendingSplits = enumerator.snapshotState(expectedFailures + 1).pendingSplits(); + assertThat(pendingSplits).hasSize(1); + IcebergSourceSplitState pendingSplit = pendingSplits.iterator().next(); + assertThat(pendingSplit.split().splitId()).isEqualTo(splits.get(0).splitId()); + assertThat(pendingSplit.status()).isEqualTo(IcebergSourceSplitStatus.UNASSIGNED); + } + + private static ContinuousIcebergEnumerator createEnumerator( + SplitEnumeratorContext context, + ScanContext scanContext, + ContinuousSplitPlanner splitPlanner) { + + ContinuousIcebergEnumerator enumerator = + new ContinuousIcebergEnumerator( + context, + new DefaultSplitAssigner(null, Collections.emptyList()), + scanContext, + splitPlanner, + null); + enumerator.start(); + return enumerator; + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousSplitPlannerImpl.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousSplitPlannerImpl.java new file mode 100644 index 000000000000..0690b456e033 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousSplitPlannerImpl.java @@ -0,0 +1,692 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source.enumerator; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.io.IOException; +import java.nio.file.Path; +import java.util.List; +import java.util.Set; +import java.util.concurrent.atomic.AtomicLong; +import java.util.stream.Collectors; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Snapshot; +import org.apache.iceberg.data.GenericAppenderHelper; +import org.apache.iceberg.data.RandomGenericData; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.HadoopTableExtension; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.flink.source.ScanContext; +import org.apache.iceberg.flink.source.StreamingStartingStrategy; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; +import org.apache.iceberg.relocated.com.google.common.collect.Iterables; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.RegisterExtension; +import org.junit.jupiter.api.io.TempDir; + +public class TestContinuousSplitPlannerImpl { + @TempDir protected Path temporaryFolder; + + private static final FileFormat FILE_FORMAT = FileFormat.PARQUET; + private static final AtomicLong RANDOM_SEED = new AtomicLong(); + + @RegisterExtension + private static final HadoopTableExtension TABLE_RESOURCE = + new HadoopTableExtension(TestFixtures.DATABASE, TestFixtures.TABLE, TestFixtures.SCHEMA); + + private GenericAppenderHelper dataAppender; + private DataFile dataFile1; + private Snapshot snapshot1; + private DataFile dataFile2; + private Snapshot snapshot2; + + @BeforeEach + public void before() throws IOException { + dataAppender = new GenericAppenderHelper(TABLE_RESOURCE.table(), FILE_FORMAT, temporaryFolder); + } + + private void appendTwoSnapshots() throws IOException { + // snapshot1 + List batch1 = RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L); + dataFile1 = dataAppender.writeFile(null, batch1); + dataAppender.appendToTable(dataFile1); + snapshot1 = TABLE_RESOURCE.table().currentSnapshot(); + + // snapshot2 + List batch2 = RandomGenericData.generate(TestFixtures.SCHEMA, 2, 1L); + dataFile2 = dataAppender.writeFile(null, batch2); + dataAppender.appendToTable(dataFile2); + snapshot2 = TABLE_RESOURCE.table().currentSnapshot(); + } + + /** @return the last enumerated snapshot id */ + private CycleResult verifyOneCycle( + ContinuousSplitPlannerImpl splitPlanner, IcebergEnumeratorPosition lastPosition) + throws Exception { + List batch = + RandomGenericData.generate(TestFixtures.SCHEMA, 2, RANDOM_SEED.incrementAndGet()); + DataFile dataFile = dataAppender.writeFile(null, batch); + dataAppender.appendToTable(dataFile); + Snapshot snapshot = TABLE_RESOURCE.table().currentSnapshot(); + + ContinuousEnumerationResult result = splitPlanner.planSplits(lastPosition); + assertThat(result.fromPosition().snapshotId()).isEqualTo(lastPosition.snapshotId()); + assertThat(result.fromPosition().snapshotTimestampMs()) + .isEqualTo(lastPosition.snapshotTimestampMs()); + assertThat(result.toPosition().snapshotId().longValue()).isEqualTo(snapshot.snapshotId()); + assertThat(result.toPosition().snapshotTimestampMs().longValue()) + .isEqualTo(snapshot.timestampMillis()); + assertThat(result.splits()).hasSize(1); + IcebergSourceSplit split = 
Iterables.getOnlyElement(result.splits()); + assertThat(split.task().files()) + .hasSize(1) + .first() + .satisfies( + fileScanTask -> assertThat(fileScanTask.file().path()).isEqualTo(dataFile.path())); + return new CycleResult(result.toPosition(), split); + } + + @Test + public void testTableScanThenIncrementalWithEmptyTable() throws Exception { + ScanContext scanContext = + ScanContext.builder() + .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) + .build(); + ContinuousSplitPlannerImpl splitPlanner = + new ContinuousSplitPlannerImpl(TABLE_RESOURCE.tableLoader().clone(), scanContext, null); + + ContinuousEnumerationResult emptyTableInitialDiscoveryResult = splitPlanner.planSplits(null); + assertThat(emptyTableInitialDiscoveryResult.splits()).isEmpty(); + assertThat(emptyTableInitialDiscoveryResult.fromPosition()).isNull(); + assertThat(emptyTableInitialDiscoveryResult.toPosition().isEmpty()).isTrue(); + assertThat(emptyTableInitialDiscoveryResult.toPosition().snapshotTimestampMs()).isNull(); + + ContinuousEnumerationResult emptyTableSecondDiscoveryResult = + splitPlanner.planSplits(emptyTableInitialDiscoveryResult.toPosition()); + assertThat(emptyTableSecondDiscoveryResult.splits()).isEmpty(); + assertThat(emptyTableSecondDiscoveryResult.fromPosition().isEmpty()).isTrue(); + assertThat(emptyTableSecondDiscoveryResult.fromPosition().snapshotTimestampMs()).isNull(); + assertThat(emptyTableSecondDiscoveryResult.toPosition().isEmpty()).isTrue(); + assertThat(emptyTableSecondDiscoveryResult.toPosition().snapshotTimestampMs()).isNull(); + + // next 3 snapshots + IcebergEnumeratorPosition lastPosition = emptyTableSecondDiscoveryResult.toPosition(); + for (int i = 0; i < 3; ++i) { + lastPosition = verifyOneCycle(splitPlanner, lastPosition).lastPosition; + } + } + + @Test + public void testTableScanThenIncrementalWithNonEmptyTable() throws Exception { + appendTwoSnapshots(); + + ScanContext scanContext = + ScanContext.builder() + .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) + .build(); + ContinuousSplitPlannerImpl splitPlanner = + new ContinuousSplitPlannerImpl(TABLE_RESOURCE.tableLoader().clone(), scanContext, null); + + ContinuousEnumerationResult initialResult = splitPlanner.planSplits(null); + assertThat(initialResult.fromPosition()).isNull(); + assertThat(initialResult.toPosition().snapshotId().longValue()) + .isEqualTo(snapshot2.snapshotId()); + assertThat(initialResult.toPosition().snapshotTimestampMs().longValue()) + .isEqualTo(snapshot2.timestampMillis()); + assertThat(initialResult.splits()).hasSize(1); + IcebergSourceSplit split = Iterables.getOnlyElement(initialResult.splits()); + assertThat(split.task().files()).hasSize(2); + Set discoveredFiles = + split.task().files().stream() + .map(fileScanTask -> fileScanTask.file().path().toString()) + .collect(Collectors.toSet()); + Set expectedFiles = + ImmutableSet.of(dataFile1.path().toString(), dataFile2.path().toString()); + assertThat(discoveredFiles).containsExactlyInAnyOrderElementsOf(expectedFiles); + + IcebergEnumeratorPosition lastPosition = initialResult.toPosition(); + for (int i = 0; i < 3; ++i) { + lastPosition = verifyOneCycle(splitPlanner, lastPosition).lastPosition; + } + } + + @Test + public void testIncrementalFromLatestSnapshotWithEmptyTable() throws Exception { + ScanContext scanContext = + ScanContext.builder() + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_LATEST_SNAPSHOT) + .splitSize(1L) + .build(); + ContinuousSplitPlannerImpl splitPlanner = + new 
ContinuousSplitPlannerImpl(TABLE_RESOURCE.tableLoader().clone(), scanContext, null); + + ContinuousEnumerationResult emptyTableInitialDiscoveryResult = splitPlanner.planSplits(null); + assertThat(emptyTableInitialDiscoveryResult.splits()).isEmpty(); + assertThat(emptyTableInitialDiscoveryResult.fromPosition()).isNull(); + assertThat(emptyTableInitialDiscoveryResult.toPosition().isEmpty()).isTrue(); + assertThat(emptyTableInitialDiscoveryResult.toPosition().snapshotTimestampMs()).isNull(); + + ContinuousEnumerationResult emptyTableSecondDiscoveryResult = + splitPlanner.planSplits(emptyTableInitialDiscoveryResult.toPosition()); + assertThat(emptyTableSecondDiscoveryResult.splits()).isEmpty(); + assertThat(emptyTableSecondDiscoveryResult.fromPosition().isEmpty()).isTrue(); + assertThat(emptyTableSecondDiscoveryResult.fromPosition().snapshotTimestampMs()).isNull(); + assertThat(emptyTableSecondDiscoveryResult.toPosition().isEmpty()).isTrue(); + assertThat(emptyTableSecondDiscoveryResult.toPosition().snapshotTimestampMs()).isNull(); + + // latest mode should discover both snapshots, as latest position is marked by when job starts + appendTwoSnapshots(); + ContinuousEnumerationResult afterTwoSnapshotsAppended = + splitPlanner.planSplits(emptyTableSecondDiscoveryResult.toPosition()); + assertThat(afterTwoSnapshotsAppended.splits()).hasSize(2); + + // next 3 snapshots + IcebergEnumeratorPosition lastPosition = afterTwoSnapshotsAppended.toPosition(); + for (int i = 0; i < 3; ++i) { + lastPosition = verifyOneCycle(splitPlanner, lastPosition).lastPosition; + } + } + + @Test + public void testIncrementalFromLatestSnapshotWithNonEmptyTable() throws Exception { + appendTwoSnapshots(); + + ScanContext scanContext = + ScanContext.builder() + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_LATEST_SNAPSHOT) + .build(); + ContinuousSplitPlannerImpl splitPlanner = + new ContinuousSplitPlannerImpl(TABLE_RESOURCE.tableLoader().clone(), scanContext, null); + + ContinuousEnumerationResult initialResult = splitPlanner.planSplits(null); + assertThat(initialResult.fromPosition()).isNull(); + // For inclusive behavior, the initial result should point to snapshot1 + // Then the next incremental scan shall discover files from latest snapshot2 (inclusive) + assertThat(initialResult.toPosition().snapshotId().longValue()) + .isEqualTo(snapshot1.snapshotId()); + assertThat(initialResult.toPosition().snapshotTimestampMs().longValue()) + .isEqualTo(snapshot1.timestampMillis()); + assertThat(initialResult.splits()).isEmpty(); + + ContinuousEnumerationResult secondResult = splitPlanner.planSplits(initialResult.toPosition()); + assertThat(secondResult.fromPosition().snapshotId().longValue()) + .isEqualTo(snapshot1.snapshotId()); + assertThat(secondResult.fromPosition().snapshotTimestampMs().longValue()) + .isEqualTo(snapshot1.timestampMillis()); + assertThat(secondResult.toPosition().snapshotId().longValue()) + .isEqualTo(snapshot2.snapshotId()); + assertThat(secondResult.toPosition().snapshotTimestampMs().longValue()) + .isEqualTo(snapshot2.timestampMillis()); + IcebergSourceSplit split = Iterables.getOnlyElement(secondResult.splits()); + assertThat(split.task().files()).hasSize(1); + Set discoveredFiles = + split.task().files().stream() + .map(fileScanTask -> fileScanTask.file().path().toString()) + .collect(Collectors.toSet()); + // should discover dataFile2 appended in snapshot2 + Set expectedFiles = ImmutableSet.of(dataFile2.path().toString()); + 
assertThat(discoveredFiles).containsExactlyElementsOf(expectedFiles); + + IcebergEnumeratorPosition lastPosition = secondResult.toPosition(); + for (int i = 0; i < 3; ++i) { + lastPosition = verifyOneCycle(splitPlanner, lastPosition).lastPosition; + } + } + + @Test + public void testIncrementalFromEarliestSnapshotWithEmptyTable() throws Exception { + ScanContext scanContext = + ScanContext.builder() + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_EARLIEST_SNAPSHOT) + .build(); + ContinuousSplitPlannerImpl splitPlanner = + new ContinuousSplitPlannerImpl(TABLE_RESOURCE.tableLoader().clone(), scanContext, null); + + ContinuousEnumerationResult emptyTableInitialDiscoveryResult = splitPlanner.planSplits(null); + assertThat(emptyTableInitialDiscoveryResult.splits()).isEmpty(); + assertThat(emptyTableInitialDiscoveryResult.fromPosition()).isNull(); + assertThat(emptyTableInitialDiscoveryResult.toPosition().snapshotId()).isNull(); + assertThat(emptyTableInitialDiscoveryResult.toPosition().snapshotTimestampMs()).isNull(); + + ContinuousEnumerationResult emptyTableSecondDiscoveryResult = + splitPlanner.planSplits(emptyTableInitialDiscoveryResult.toPosition()); + assertThat(emptyTableSecondDiscoveryResult.splits()).isEmpty(); + assertThat(emptyTableSecondDiscoveryResult.fromPosition().snapshotId()).isNull(); + assertThat(emptyTableSecondDiscoveryResult.fromPosition().snapshotTimestampMs()).isNull(); + assertThat(emptyTableSecondDiscoveryResult.toPosition().snapshotId()).isNull(); + assertThat(emptyTableSecondDiscoveryResult.toPosition().snapshotTimestampMs()).isNull(); + + // next 3 snapshots + IcebergEnumeratorPosition lastPosition = emptyTableSecondDiscoveryResult.toPosition(); + for (int i = 0; i < 3; ++i) { + lastPosition = verifyOneCycle(splitPlanner, lastPosition).lastPosition; + } + } + + @Test + public void testIncrementalFromEarliestSnapshotWithNonEmptyTable() throws Exception { + appendTwoSnapshots(); + + ScanContext scanContext = + ScanContext.builder() + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_EARLIEST_SNAPSHOT) + .build(); + ContinuousSplitPlannerImpl splitPlanner = + new ContinuousSplitPlannerImpl(TABLE_RESOURCE.tableLoader().clone(), scanContext, null); + + ContinuousEnumerationResult initialResult = splitPlanner.planSplits(null); + assertThat(initialResult.fromPosition()).isNull(); + // For inclusive behavior, the initial result should point to snapshot1's parent, + // which leads to null snapshotId and snapshotTimestampMs. 
+ assertThat(initialResult.toPosition().snapshotId()).isNull(); + assertThat(initialResult.toPosition().snapshotTimestampMs()).isNull(); + assertThat(initialResult.splits()).isEmpty(); + + ContinuousEnumerationResult secondResult = splitPlanner.planSplits(initialResult.toPosition()); + assertThat(secondResult.fromPosition().snapshotId()).isNull(); + assertThat(secondResult.fromPosition().snapshotTimestampMs()).isNull(); + assertThat(secondResult.toPosition().snapshotId().longValue()) + .isEqualTo(snapshot2.snapshotId()); + assertThat(secondResult.toPosition().snapshotTimestampMs().longValue()) + .isEqualTo(snapshot2.timestampMillis()); + IcebergSourceSplit split = Iterables.getOnlyElement(secondResult.splits()); + assertThat(split.task().files()).hasSize(2); + Set discoveredFiles = + split.task().files().stream() + .map(fileScanTask -> fileScanTask.file().path().toString()) + .collect(Collectors.toSet()); + // should discover files appended in both snapshot1 and snapshot2 + Set expectedFiles = + ImmutableSet.of(dataFile1.path().toString(), dataFile2.path().toString()); + assertThat(discoveredFiles).containsExactlyInAnyOrderElementsOf(expectedFiles); + + IcebergEnumeratorPosition lastPosition = secondResult.toPosition(); + for (int i = 0; i < 3; ++i) { + lastPosition = verifyOneCycle(splitPlanner, lastPosition).lastPosition; + } + } + + @Test + public void testIncrementalFromSnapshotIdWithEmptyTable() { + ScanContext scanContextWithInvalidSnapshotId = + ScanContext.builder() + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) + .startSnapshotId(1L) + .build(); + ContinuousSplitPlannerImpl splitPlanner = + new ContinuousSplitPlannerImpl( + TABLE_RESOURCE.tableLoader().clone(), scanContextWithInvalidSnapshotId, null); + + assertThatThrownBy(() -> splitPlanner.planSplits(null)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Start snapshot id not found in history: 1"); + } + + @Test + public void testIncrementalFromSnapshotIdWithInvalidIds() throws Exception { + appendTwoSnapshots(); + + // find an invalid snapshotId + long invalidSnapshotId = 0L; + while (invalidSnapshotId == snapshot1.snapshotId() + || invalidSnapshotId == snapshot2.snapshotId()) { + invalidSnapshotId++; + } + + ScanContext scanContextWithInvalidSnapshotId = + ScanContext.builder() + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) + .startSnapshotId(invalidSnapshotId) + .build(); + + ContinuousSplitPlannerImpl splitPlanner = + new ContinuousSplitPlannerImpl( + TABLE_RESOURCE.tableLoader().clone(), scanContextWithInvalidSnapshotId, null); + + assertThatThrownBy(() -> splitPlanner.planSplits(null)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Start snapshot id not found in history: " + invalidSnapshotId); + } + + @Test + public void testIncrementalFromSnapshotId() throws Exception { + appendTwoSnapshots(); + + ScanContext scanContext = + ScanContext.builder() + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) + .startSnapshotId(snapshot2.snapshotId()) + .build(); + ContinuousSplitPlannerImpl splitPlanner = + new ContinuousSplitPlannerImpl(TABLE_RESOURCE.tableLoader().clone(), scanContext, null); + + ContinuousEnumerationResult initialResult = splitPlanner.planSplits(null); + assertThat(initialResult.fromPosition()).isNull(); + // For inclusive behavior of snapshot2, the initial result should point to snapshot1 (as + // snapshot2's parent) + assertThat(initialResult.toPosition().snapshotId().longValue()) + 
.isEqualTo(snapshot1.snapshotId()); + assertThat(initialResult.toPosition().snapshotTimestampMs().longValue()) + .isEqualTo(snapshot1.timestampMillis()); + assertThat(initialResult.splits()).isEmpty(); + + ContinuousEnumerationResult secondResult = splitPlanner.planSplits(initialResult.toPosition()); + assertThat(secondResult.fromPosition().snapshotId().longValue()) + .isEqualTo(snapshot1.snapshotId()); + assertThat(secondResult.fromPosition().snapshotTimestampMs().longValue()) + .isEqualTo(snapshot1.timestampMillis()); + assertThat(secondResult.toPosition().snapshotId().longValue()) + .isEqualTo(snapshot2.snapshotId()); + assertThat(secondResult.toPosition().snapshotTimestampMs().longValue()) + .isEqualTo(snapshot2.timestampMillis()); + IcebergSourceSplit split = Iterables.getOnlyElement(secondResult.splits()); + assertThat(split.task().files()).hasSize(1); + Set discoveredFiles = + split.task().files().stream() + .map(fileScanTask -> fileScanTask.file().path().toString()) + .collect(Collectors.toSet()); + // should discover dataFile2 appended in snapshot2 + Set expectedFiles = ImmutableSet.of(dataFile2.path().toString()); + assertThat(discoveredFiles).containsExactlyElementsOf(expectedFiles); + + IcebergEnumeratorPosition lastPosition = secondResult.toPosition(); + for (int i = 0; i < 3; ++i) { + lastPosition = verifyOneCycle(splitPlanner, lastPosition).lastPosition; + } + } + + @Test + public void testIncrementalFromSnapshotTimestampWithEmptyTable() { + ScanContext scanContextWithInvalidSnapshotId = + ScanContext.builder() + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) + .startSnapshotTimestamp(1L) + .build(); + ContinuousSplitPlannerImpl splitPlanner = + new ContinuousSplitPlannerImpl( + TABLE_RESOURCE.tableLoader().clone(), scanContextWithInvalidSnapshotId, null); + + assertThatThrownBy(() -> splitPlanner.planSplits(null)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Cannot find a snapshot after: 1"); + } + + @Test + public void testIncrementalFromSnapshotTimestampWithInvalidIds() throws Exception { + appendTwoSnapshots(); + + long invalidSnapshotTimestampMs = snapshot2.timestampMillis() + 1000L; + + ScanContext scanContextWithInvalidSnapshotId = + ScanContext.builder() + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) + .startSnapshotTimestamp(invalidSnapshotTimestampMs) + .build(); + + ContinuousSplitPlannerImpl splitPlanner = + new ContinuousSplitPlannerImpl( + TABLE_RESOURCE.tableLoader().clone(), scanContextWithInvalidSnapshotId, null); + + assertThatThrownBy(() -> splitPlanner.planSplits(null)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageStartingWith("Cannot find a snapshot after:"); + } + + @Test + public void testIncrementalFromSnapshotTimestamp() throws Exception { + appendTwoSnapshots(); + + ScanContext scanContext = + ScanContext.builder() + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) + .startSnapshotTimestamp(snapshot2.timestampMillis()) + .build(); + ContinuousSplitPlannerImpl splitPlanner = + new ContinuousSplitPlannerImpl(TABLE_RESOURCE.tableLoader().clone(), scanContext, null); + + ContinuousEnumerationResult initialResult = splitPlanner.planSplits(null); + assertThat(initialResult.fromPosition()).isNull(); + // For inclusive behavior, the initial result should point to snapshot1 (as snapshot2's parent). 
+ assertThat(initialResult.toPosition().snapshotId().longValue()) + .isEqualTo(snapshot1.snapshotId()); + assertThat(initialResult.toPosition().snapshotTimestampMs().longValue()) + .isEqualTo(snapshot1.timestampMillis()); + assertThat(initialResult.splits()).isEmpty(); + + ContinuousEnumerationResult secondResult = splitPlanner.planSplits(initialResult.toPosition()); + assertThat(secondResult.fromPosition().snapshotId().longValue()) + .isEqualTo(snapshot1.snapshotId()); + assertThat(secondResult.fromPosition().snapshotTimestampMs().longValue()) + .isEqualTo(snapshot1.timestampMillis()); + assertThat(secondResult.toPosition().snapshotId().longValue()) + .isEqualTo(snapshot2.snapshotId()); + assertThat(secondResult.toPosition().snapshotTimestampMs().longValue()) + .isEqualTo(snapshot2.timestampMillis()); + IcebergSourceSplit split = Iterables.getOnlyElement(secondResult.splits()); + assertThat(split.task().files()).hasSize(1); + Set discoveredFiles = + split.task().files().stream() + .map(fileScanTask -> fileScanTask.file().path().toString()) + .collect(Collectors.toSet()); + // should discover dataFile2 appended in snapshot2 + Set expectedFiles = ImmutableSet.of(dataFile2.path().toString()); + assertThat(discoveredFiles).containsExactlyElementsOf(expectedFiles); + + IcebergEnumeratorPosition lastPosition = secondResult.toPosition(); + for (int i = 0; i < 3; ++i) { + lastPosition = verifyOneCycle(splitPlanner, lastPosition).lastPosition; + } + } + + @Test + public void testMaxPlanningSnapshotCount() throws Exception { + appendTwoSnapshots(); + // append 3 more snapshots + for (int i = 2; i < 5; ++i) { + appendSnapshot(i, 2); + } + + ScanContext scanContext = + ScanContext.builder() + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_EARLIEST_SNAPSHOT) + // limit to 1 snapshot per discovery + .maxPlanningSnapshotCount(1) + .build(); + ContinuousSplitPlannerImpl splitPlanner = + new ContinuousSplitPlannerImpl(TABLE_RESOURCE.tableLoader().clone(), scanContext, null); + + ContinuousEnumerationResult initialResult = splitPlanner.planSplits(null); + assertThat(initialResult.fromPosition()).isNull(); + // For inclusive behavior, the initial result should point to snapshot1's parent, + // which leads to null snapshotId and snapshotTimestampMs. 
+ assertThat(initialResult.toPosition().snapshotId()).isNull(); + assertThat(initialResult.toPosition().snapshotTimestampMs()).isNull(); + assertThat(initialResult.splits()).isEmpty(); + + ContinuousEnumerationResult secondResult = splitPlanner.planSplits(initialResult.toPosition()); + // should discover dataFile1 appended in snapshot1 + verifyMaxPlanningSnapshotCountResult( + secondResult, null, snapshot1, ImmutableSet.of(dataFile1.path().toString())); + + ContinuousEnumerationResult thirdResult = splitPlanner.planSplits(secondResult.toPosition()); + // should discover dataFile2 appended in snapshot2 + verifyMaxPlanningSnapshotCountResult( + thirdResult, snapshot1, snapshot2, ImmutableSet.of(dataFile2.path().toString())); + } + + @Test + public void testTableScanNoStats() throws Exception { + appendTwoSnapshots(); + + ScanContext scanContext = + ScanContext.builder() + .includeColumnStats(false) + .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) + .build(); + ContinuousSplitPlannerImpl splitPlanner = + new ContinuousSplitPlannerImpl(TABLE_RESOURCE.tableLoader().clone(), scanContext, null); + + ContinuousEnumerationResult initialResult = splitPlanner.planSplits(null); + assertThat(initialResult.splits()).hasSize(1); + IcebergSourceSplit split = Iterables.getOnlyElement(initialResult.splits()); + assertThat(split.task().files()).hasSize(2); + verifyStatCount(split, 0); + + IcebergEnumeratorPosition lastPosition = initialResult.toPosition(); + for (int i = 0; i < 3; ++i) { + CycleResult result = verifyOneCycle(splitPlanner, lastPosition); + verifyStatCount(result.split, 0); + lastPosition = result.lastPosition; + } + } + + @Test + public void testTableScanAllStats() throws Exception { + appendTwoSnapshots(); + + ScanContext scanContext = + ScanContext.builder() + .includeColumnStats(true) + .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) + .build(); + ContinuousSplitPlannerImpl splitPlanner = + new ContinuousSplitPlannerImpl(TABLE_RESOURCE.tableLoader().clone(), scanContext, null); + + ContinuousEnumerationResult initialResult = splitPlanner.planSplits(null); + assertThat(initialResult.splits()).hasSize(1); + IcebergSourceSplit split = Iterables.getOnlyElement(initialResult.splits()); + assertThat(split.task().files()).hasSize(2); + verifyStatCount(split, 3); + + IcebergEnumeratorPosition lastPosition = initialResult.toPosition(); + for (int i = 0; i < 3; ++i) { + CycleResult result = verifyOneCycle(splitPlanner, lastPosition); + verifyStatCount(result.split, 3); + lastPosition = result.lastPosition; + } + } + + @Test + public void testTableScanSingleStat() throws Exception { + appendTwoSnapshots(); + + ScanContext scanContext = + ScanContext.builder() + .includeColumnStats(ImmutableSet.of("data")) + .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) + .build(); + ContinuousSplitPlannerImpl splitPlanner = + new ContinuousSplitPlannerImpl(TABLE_RESOURCE.tableLoader().clone(), scanContext, null); + + ContinuousEnumerationResult initialResult = splitPlanner.planSplits(null); + assertThat(initialResult.splits()).hasSize(1); + IcebergSourceSplit split = Iterables.getOnlyElement(initialResult.splits()); + assertThat(split.task().files()).hasSize(2); + verifyStatCount(split, 1); + + IcebergEnumeratorPosition lastPosition = initialResult.toPosition(); + for (int i = 0; i < 3; ++i) { + CycleResult result = verifyOneCycle(splitPlanner, lastPosition); + verifyStatCount(result.split, 1); + lastPosition = result.lastPosition; + } + } 
+ + private void verifyStatCount(IcebergSourceSplit split, int expected) { + if (expected == 0) { + split + .task() + .files() + .forEach( + f -> { + assertThat(f.file().valueCounts()).isNull(); + assertThat(f.file().columnSizes()).isNull(); + assertThat(f.file().lowerBounds()).isNull(); + assertThat(f.file().upperBounds()).isNull(); + assertThat(f.file().nanValueCounts()).isNull(); + assertThat(f.file().nullValueCounts()).isNull(); + }); + } else { + split + .task() + .files() + .forEach( + f -> { + assertThat(f.file().valueCounts()).hasSize(expected); + assertThat(f.file().columnSizes()).hasSize(expected); + assertThat(f.file().lowerBounds()).hasSize(expected); + assertThat(f.file().upperBounds()).hasSize(expected); + assertThat(f.file().nullValueCounts()).hasSize(expected); + // The nanValue is not counted for long and string fields + assertThat(f.file().nanValueCounts()).isEmpty(); + }); + } + } + + private void verifyMaxPlanningSnapshotCountResult( + ContinuousEnumerationResult result, + Snapshot fromSnapshotExclusive, + Snapshot toSnapshotInclusive, + Set expectedFiles) { + if (fromSnapshotExclusive == null) { + assertThat(result.fromPosition().snapshotId()).isNull(); + assertThat(result.fromPosition().snapshotTimestampMs()).isNull(); + } else { + assertThat(result.fromPosition().snapshotId().longValue()) + .isEqualTo(fromSnapshotExclusive.snapshotId()); + assertThat(result.fromPosition().snapshotTimestampMs().longValue()) + .isEqualTo(fromSnapshotExclusive.timestampMillis()); + } + assertThat(result.toPosition().snapshotId().longValue()) + .isEqualTo(toSnapshotInclusive.snapshotId()); + assertThat(result.toPosition().snapshotTimestampMs().longValue()) + .isEqualTo(toSnapshotInclusive.timestampMillis()); + // should only have one split with one data file, because split discover is limited to + // one snapshot and each snapshot has only one data file appended. + IcebergSourceSplit split = Iterables.getOnlyElement(result.splits()); + assertThat(split.task().files()).hasSize(1); + Set discoveredFiles = + split.task().files().stream() + .map(fileScanTask -> fileScanTask.file().path().toString()) + .collect(Collectors.toSet()); + assertThat(discoveredFiles).containsExactlyElementsOf(expectedFiles); + } + + private Snapshot appendSnapshot(long seed, int numRecords) throws Exception { + List batch = RandomGenericData.generate(TestFixtures.SCHEMA, numRecords, seed); + DataFile dataFile = dataAppender.writeFile(null, batch); + dataAppender.appendToTable(dataFile); + return TABLE_RESOURCE.table().currentSnapshot(); + } + + private static class CycleResult { + IcebergEnumeratorPosition lastPosition; + IcebergSourceSplit split; + + CycleResult(IcebergEnumeratorPosition lastPosition, IcebergSourceSplit split) { + this.lastPosition = lastPosition; + this.split = split; + } + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousSplitPlannerImplStartStrategy.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousSplitPlannerImplStartStrategy.java new file mode 100644 index 000000000000..b2185675340f --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousSplitPlannerImplStartStrategy.java @@ -0,0 +1,200 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.enumerator; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.io.IOException; +import java.nio.file.Path; +import java.util.List; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Snapshot; +import org.apache.iceberg.data.GenericAppenderHelper; +import org.apache.iceberg.data.RandomGenericData; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.HadoopTableExtension; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.flink.source.ScanContext; +import org.apache.iceberg.flink.source.StreamingStartingStrategy; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.RegisterExtension; +import org.junit.jupiter.api.io.TempDir; + +public class TestContinuousSplitPlannerImplStartStrategy { + private static final FileFormat FILE_FORMAT = FileFormat.PARQUET; + + @TempDir protected Path temporaryFolder; + + @RegisterExtension + private static final HadoopTableExtension TABLE_RESOURCE = + new HadoopTableExtension(TestFixtures.DATABASE, TestFixtures.TABLE, TestFixtures.SCHEMA); + + private GenericAppenderHelper dataAppender; + private Snapshot snapshot1; + private Snapshot snapshot2; + private Snapshot snapshot3; + + @BeforeEach + public void before() throws IOException { + dataAppender = new GenericAppenderHelper(TABLE_RESOURCE.table(), FILE_FORMAT, temporaryFolder); + } + + private void appendThreeSnapshots() throws IOException { + List batch1 = RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L); + dataAppender.appendToTable(batch1); + snapshot1 = TABLE_RESOURCE.table().currentSnapshot(); + + List batch2 = RandomGenericData.generate(TestFixtures.SCHEMA, 2, 1L); + dataAppender.appendToTable(batch2); + snapshot2 = TABLE_RESOURCE.table().currentSnapshot(); + + List batch3 = RandomGenericData.generate(TestFixtures.SCHEMA, 2, 2L); + dataAppender.appendToTable(batch3); + snapshot3 = TABLE_RESOURCE.table().currentSnapshot(); + } + + @Test + public void testTableScanThenIncrementalStrategy() throws IOException { + ScanContext scanContext = + ScanContext.builder() + .streaming(true) + .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) + .build(); + + assertThat(ContinuousSplitPlannerImpl.startSnapshot(TABLE_RESOURCE.table(), scanContext)) + .isNotPresent(); + + appendThreeSnapshots(); + Snapshot startSnapshot = + ContinuousSplitPlannerImpl.startSnapshot(TABLE_RESOURCE.table(), scanContext).get(); + assertThat(startSnapshot.snapshotId()).isEqualTo(snapshot3.snapshotId()); + } + + @Test + public void testForLatestSnapshotStrategy() throws IOException { + ScanContext scanContext = + ScanContext.builder() + .streaming(true) + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_LATEST_SNAPSHOT) + .build(); + + 
assertThat(ContinuousSplitPlannerImpl.startSnapshot(TABLE_RESOURCE.table(), scanContext)) + .isNotPresent(); + + appendThreeSnapshots(); + Snapshot startSnapshot = + ContinuousSplitPlannerImpl.startSnapshot(TABLE_RESOURCE.table(), scanContext).get(); + assertThat(startSnapshot.snapshotId()).isEqualTo(snapshot3.snapshotId()); + } + + @Test + public void testForEarliestSnapshotStrategy() throws IOException { + ScanContext scanContext = + ScanContext.builder() + .streaming(true) + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_EARLIEST_SNAPSHOT) + .build(); + + assertThat(ContinuousSplitPlannerImpl.startSnapshot(TABLE_RESOURCE.table(), scanContext)) + .isNotPresent(); + + appendThreeSnapshots(); + Snapshot startSnapshot = + ContinuousSplitPlannerImpl.startSnapshot(TABLE_RESOURCE.table(), scanContext).get(); + assertThat(startSnapshot.snapshotId()).isEqualTo(snapshot1.snapshotId()); + } + + @Test + public void testForSpecificSnapshotIdStrategy() throws IOException { + ScanContext scanContextInvalidSnapshotId = + ScanContext.builder() + .streaming(true) + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) + .startSnapshotId(1L) + .build(); + + assertThatThrownBy( + () -> + ContinuousSplitPlannerImpl.startSnapshot( + TABLE_RESOURCE.table(), scanContextInvalidSnapshotId)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Start snapshot id not found in history: 1"); + + appendThreeSnapshots(); + + ScanContext scanContext = + ScanContext.builder() + .streaming(true) + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) + .startSnapshotId(snapshot2.snapshotId()) + .build(); + + Snapshot startSnapshot = + ContinuousSplitPlannerImpl.startSnapshot(TABLE_RESOURCE.table(), scanContext).get(); + assertThat(startSnapshot.snapshotId()).isEqualTo(snapshot2.snapshotId()); + } + + @Test + public void testForSpecificSnapshotTimestampStrategySnapshot2() throws IOException { + ScanContext scanContextInvalidSnapshotTimestamp = + ScanContext.builder() + .streaming(true) + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) + .startSnapshotTimestamp(1L) + .build(); + + assertThatThrownBy( + () -> + ContinuousSplitPlannerImpl.startSnapshot( + TABLE_RESOURCE.table(), scanContextInvalidSnapshotTimestamp)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageStartingWith("Cannot find a snapshot after: "); + + appendThreeSnapshots(); + + ScanContext scanContext = + ScanContext.builder() + .streaming(true) + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) + .startSnapshotTimestamp(snapshot2.timestampMillis()) + .build(); + + Snapshot startSnapshot = + ContinuousSplitPlannerImpl.startSnapshot(TABLE_RESOURCE.table(), scanContext).get(); + assertThat(startSnapshot.snapshotId()).isEqualTo(snapshot2.snapshotId()); + } + + @Test + public void testForSpecificSnapshotTimestampStrategySnapshot2Minus1() throws IOException { + appendThreeSnapshots(); + + ScanContext config = + ScanContext.builder() + .streaming(true) + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) + .startSnapshotTimestamp(snapshot2.timestampMillis() - 1L) + .build(); + + Snapshot startSnapshot = + ContinuousSplitPlannerImpl.startSnapshot(TABLE_RESOURCE.table(), config).get(); + assertThat(startSnapshot.snapshotId()).isEqualTo(snapshot2.snapshotId()); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestEnumerationHistory.java 
b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestEnumerationHistory.java new file mode 100644 index 000000000000..feefcb98646b --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestEnumerationHistory.java @@ -0,0 +1,135 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.enumerator; + +import static org.assertj.core.api.Assertions.assertThat; + +import org.junit.jupiter.api.Test; + +public class TestEnumerationHistory { + private static final int MAX_HISTORY_SIZE = 3; + private static final int FEW_PENDING_SPLITS = 2; + private static final int TOO_MANY_PENDING_SPLITS = 100; + + @Test + public void testEmptyHistory() { + EnumerationHistory history = new EnumerationHistory(MAX_HISTORY_SIZE); + int[] expectedHistorySnapshot = new int[0]; + testHistory(history, expectedHistorySnapshot); + } + + @Test + public void testNotFullHistory() { + EnumerationHistory history = new EnumerationHistory(3); + history.add(1); + history.add(2); + int[] expectedHistorySnapshot = {1, 2}; + testHistory(history, expectedHistorySnapshot); + } + + @Test + public void testExactFullHistory() { + EnumerationHistory history = new EnumerationHistory(3); + history.add(1); + history.add(2); + history.add(3); + int[] expectedHistorySnapshot = {1, 2, 3}; + testHistory(history, expectedHistorySnapshot); + } + + @Test + public void testOneMoreThanFullHistory() { + EnumerationHistory history = new EnumerationHistory(3); + history.add(1); + history.add(2); + history.add(3); + history.add(4); + int[] expectedHistorySnapshot = {2, 3, 4}; + testHistory(history, expectedHistorySnapshot); + } + + @Test + public void testTwoMoreThanFullHistory() { + EnumerationHistory history = new EnumerationHistory(3); + history.add(1); + history.add(2); + history.add(3); + history.add(4); + history.add(5); + int[] expectedHistorySnapshot = {3, 4, 5}; + testHistory(history, expectedHistorySnapshot); + } + + @Test + public void testThreeMoreThanFullHistory() { + EnumerationHistory history = new EnumerationHistory(3); + history.add(1); + history.add(2); + history.add(3); + history.add(4); + history.add(5); + history.add(6); + int[] expectedHistorySnapshot = {4, 5, 6}; + testHistory(history, expectedHistorySnapshot); + } + + private void testHistory(EnumerationHistory history, int[] expectedHistorySnapshot) { + assertThat(history.shouldPauseSplitDiscovery(FEW_PENDING_SPLITS)).isFalse(); + if (history.hasFullHistory()) { + // throttle because pending split count is more than the sum of enumeration history + assertThat(history.shouldPauseSplitDiscovery(TOO_MANY_PENDING_SPLITS)).isTrue(); + } else { + // skipped throttling check because there is not enough history + 
assertThat(history.shouldPauseSplitDiscovery(TOO_MANY_PENDING_SPLITS)).isFalse(); + } + + int[] historySnapshot = history.snapshot(); + assertThat(historySnapshot).containsExactly(expectedHistorySnapshot); + + EnumerationHistory restoredHistory = new EnumerationHistory(MAX_HISTORY_SIZE); + restoredHistory.restore(historySnapshot); + + assertThat(history.shouldPauseSplitDiscovery(FEW_PENDING_SPLITS)).isFalse(); + if (history.hasFullHistory()) { + // throttle because pending split count is more than the sum of enumeration history + assertThat(history.shouldPauseSplitDiscovery(TOO_MANY_PENDING_SPLITS)).isTrue(); + } else { + // skipped throttling check because there is not enough history + assertThat(history.shouldPauseSplitDiscovery(30)).isFalse(); + } + } + + @Test + public void testRestoreDifferentSize() { + EnumerationHistory history = new EnumerationHistory(3); + history.add(1); + history.add(2); + history.add(3); + int[] historySnapshot = history.snapshot(); + + EnumerationHistory smallerHistory = new EnumerationHistory(2); + smallerHistory.restore(historySnapshot); + int[] expectedRestoredHistorySnapshot = {2, 3}; + assertThat(smallerHistory.snapshot()).containsExactly(expectedRestoredHistorySnapshot); + + EnumerationHistory largerHisotry = new EnumerationHistory(4); + largerHisotry.restore(historySnapshot); + assertThat(largerHisotry.snapshot()).containsExactly(historySnapshot); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestIcebergEnumeratorStateSerializer.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestIcebergEnumeratorStateSerializer.java new file mode 100644 index 000000000000..2520a6b763e4 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestIcebergEnumeratorStateSerializer.java @@ -0,0 +1,146 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source.enumerator; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.IOException; +import java.nio.file.Path; +import java.util.Collection; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; +import org.apache.iceberg.Parameter; +import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.flink.source.SplitHelpers; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; +import org.apache.iceberg.flink.source.split.IcebergSourceSplitState; +import org.apache.iceberg.flink.source.split.IcebergSourceSplitStatus; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.extension.ExtendWith; +import org.junit.jupiter.api.io.TempDir; + +@ExtendWith(ParameterizedTestExtension.class) +public class TestIcebergEnumeratorStateSerializer { + @TempDir protected Path temporaryFolder; + + private final IcebergEnumeratorStateSerializer serializer = + new IcebergEnumeratorStateSerializer(true); + + @Parameter(index = 0) + protected int version; + + @Parameters(name = "version={0}") + public static Object[][] parameters() { + return new Object[][] {new Object[] {1}, new Object[] {2}}; + } + + @TestTemplate + public void testEmptySnapshotIdAndPendingSplits() throws Exception { + IcebergEnumeratorState enumeratorState = new IcebergEnumeratorState(Collections.emptyList()); + testSerializer(enumeratorState); + } + + @TestTemplate + public void testSomeSnapshotIdAndEmptyPendingSplits() throws Exception { + IcebergEnumeratorPosition position = + IcebergEnumeratorPosition.of(1L, System.currentTimeMillis()); + + IcebergEnumeratorState enumeratorState = + new IcebergEnumeratorState(position, Collections.emptyList()); + testSerializer(enumeratorState); + } + + @TestTemplate + public void testSomeSnapshotIdAndPendingSplits() throws Exception { + IcebergEnumeratorPosition position = + IcebergEnumeratorPosition.of(2L, System.currentTimeMillis()); + List splits = + SplitHelpers.createSplitsFromTransientHadoopTable(temporaryFolder, 3, 1); + Collection pendingSplits = Lists.newArrayList(); + pendingSplits.add( + new IcebergSourceSplitState(splits.get(0), IcebergSourceSplitStatus.UNASSIGNED)); + pendingSplits.add( + new IcebergSourceSplitState(splits.get(1), IcebergSourceSplitStatus.ASSIGNED)); + pendingSplits.add( + new IcebergSourceSplitState(splits.get(2), IcebergSourceSplitStatus.COMPLETED)); + + IcebergEnumeratorState enumeratorState = new IcebergEnumeratorState(position, pendingSplits); + testSerializer(enumeratorState); + } + + @TestTemplate + public void testEnumerationSplitCountHistory() throws Exception { + if (version == 2) { + IcebergEnumeratorPosition position = + IcebergEnumeratorPosition.of(2L, System.currentTimeMillis()); + List splits = + SplitHelpers.createSplitsFromTransientHadoopTable(temporaryFolder, 3, 1); + Collection pendingSplits = Lists.newArrayList(); + pendingSplits.add( + new IcebergSourceSplitState(splits.get(0), IcebergSourceSplitStatus.UNASSIGNED)); + pendingSplits.add( + new IcebergSourceSplitState(splits.get(1), IcebergSourceSplitStatus.ASSIGNED)); + pendingSplits.add( + new IcebergSourceSplitState(splits.get(2), IcebergSourceSplitStatus.COMPLETED)); + int[] enumerationSplitCountHistory = {1, 2, 3}; + + IcebergEnumeratorState enumeratorState = + new IcebergEnumeratorState(position, pendingSplits, enumerationSplitCountHistory); + 
testSerializer(enumeratorState); + } + } + + private void testSerializer(IcebergEnumeratorState enumeratorState) throws IOException { + byte[] result; + if (version == 1) { + result = serializer.serializeV1(enumeratorState); + } else { + result = serializer.serialize(enumeratorState); + } + + IcebergEnumeratorState deserialized = serializer.deserialize(version, result); + assertEnumeratorStateEquals(enumeratorState, deserialized); + } + + private void assertEnumeratorStateEquals( + IcebergEnumeratorState expected, IcebergEnumeratorState actual) { + assertThat(actual.lastEnumeratedPosition()).isEqualTo(expected.lastEnumeratedPosition()); + + assertThat(actual.pendingSplits()).hasSameSizeAs(expected.pendingSplits()); + Iterator expectedIterator = expected.pendingSplits().iterator(); + Iterator actualIterator = actual.pendingSplits().iterator(); + for (int i = 0; i < expected.pendingSplits().size(); ++i) { + IcebergSourceSplitState expectedSplitState = expectedIterator.next(); + IcebergSourceSplitState actualSplitState = actualIterator.next(); + assertThat(actualSplitState.split().splitId()) + .isEqualTo(expectedSplitState.split().splitId()); + assertThat(actualSplitState.split().fileOffset()) + .isEqualTo(expectedSplitState.split().fileOffset()); + assertThat(actualSplitState.split().recordOffset()) + .isEqualTo(expectedSplitState.split().recordOffset()); + assertThat(actualSplitState.status()).isEqualTo(expectedSplitState.status()); + } + + assertThat(actual.enumerationSplitCountHistory()) + .containsExactly(expected.enumerationSplitCountHistory()); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderFunctionTestBase.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderFunctionTestBase.java new file mode 100644 index 000000000000..0d1d0ce3217c --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderFunctionTestBase.java @@ -0,0 +1,218 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source.reader; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.IOException; +import java.nio.file.Path; +import java.util.List; +import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; +import org.apache.iceberg.CombinedScanTask; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Parameter; +import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.Schema; +import org.apache.iceberg.data.GenericAppenderFactory; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; +import org.apache.iceberg.io.CloseableIterator; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.extension.ExtendWith; +import org.junit.jupiter.api.io.TempDir; + +@ExtendWith(ParameterizedTestExtension.class) +public abstract class ReaderFunctionTestBase { + + @Parameters(name = "fileFormat={0}") + public static Object[][] parameters() { + return new Object[][] { + new Object[] {FileFormat.AVRO}, + new Object[] {FileFormat.ORC}, + new Object[] {FileFormat.PARQUET} + }; + } + + @TempDir protected Path temporaryFolder; + + protected abstract ReaderFunction readerFunction(); + + protected abstract void assertRecords(List expected, List actual, Schema schema); + + @Parameter(index = 0) + private FileFormat fileFormat; + + private final GenericAppenderFactory appenderFactory = + new GenericAppenderFactory(TestFixtures.SCHEMA); + + private void assertRecordsAndPosition( + List expectedRecords, + int expectedFileOffset, + long startRecordOffset, + RecordsWithSplitIds> batch) { + batch.nextSplit(); + List actualRecords = Lists.newArrayList(); + long recordOffset = startRecordOffset; + RecordAndPosition recordAndPosition; + while ((recordAndPosition = batch.nextRecordFromSplit()) != null) { + actualRecords.add(recordAndPosition.record()); + assertThat(recordAndPosition.fileOffset()).isEqualTo(expectedFileOffset); + assertThat(recordAndPosition.recordOffset() - 1).isEqualTo(recordOffset); + recordOffset++; + } + + assertThat(actualRecords).hasSameSizeAs(expectedRecords); + assertRecords(expectedRecords, actualRecords, TestFixtures.SCHEMA); + } + + @TestTemplate + public void testNoCheckpointedPosition() throws IOException { + List> recordBatchList = + ReaderUtil.createRecordBatchList(TestFixtures.SCHEMA, 3, 2); + CombinedScanTask combinedScanTask = + ReaderUtil.createCombinedScanTask( + recordBatchList, temporaryFolder, fileFormat, appenderFactory); + IcebergSourceSplit split = IcebergSourceSplit.fromCombinedScanTask(combinedScanTask); + CloseableIterator>> reader = + readerFunction().apply(split); + + RecordsWithSplitIds> batch0 = reader.next(); + assertRecordsAndPosition(recordBatchList.get(0), 0, 0L, batch0); + batch0.recycle(); + + RecordsWithSplitIds> batch1 = reader.next(); + assertRecordsAndPosition(recordBatchList.get(1), 1, 0L, batch1); + batch1.recycle(); + + RecordsWithSplitIds> batch2 = reader.next(); + assertRecordsAndPosition(recordBatchList.get(2), 2, 0L, batch2); + batch2.recycle(); + } + + @TestTemplate + public void testCheckpointedPositionBeforeFirstFile() throws IOException { + List> recordBatchList = + ReaderUtil.createRecordBatchList(TestFixtures.SCHEMA, 3, 2); + CombinedScanTask combinedScanTask = + ReaderUtil.createCombinedScanTask( + recordBatchList, 
temporaryFolder, fileFormat, appenderFactory); + IcebergSourceSplit split = IcebergSourceSplit.fromCombinedScanTask(combinedScanTask, 0, 0L); + CloseableIterator>> reader = + readerFunction().apply(split); + + RecordsWithSplitIds> batch0 = reader.next(); + assertRecordsAndPosition(recordBatchList.get(0), 0, 0L, batch0); + batch0.recycle(); + + RecordsWithSplitIds> batch1 = reader.next(); + assertRecordsAndPosition(recordBatchList.get(1), 1, 0L, batch1); + batch1.recycle(); + + RecordsWithSplitIds> batch2 = reader.next(); + assertRecordsAndPosition(recordBatchList.get(2), 2, 0L, batch2); + batch2.recycle(); + } + + @TestTemplate + public void testCheckpointedPositionMiddleFirstFile() throws IOException { + List> recordBatchList = + ReaderUtil.createRecordBatchList(TestFixtures.SCHEMA, 3, 2); + CombinedScanTask combinedScanTask = + ReaderUtil.createCombinedScanTask( + recordBatchList, temporaryFolder, fileFormat, appenderFactory); + IcebergSourceSplit split = IcebergSourceSplit.fromCombinedScanTask(combinedScanTask, 0, 1L); + CloseableIterator>> reader = + readerFunction().apply(split); + + RecordsWithSplitIds> batch0 = reader.next(); + assertRecordsAndPosition(recordBatchList.get(0).subList(1, 2), 0, 1L, batch0); + batch0.recycle(); + + RecordsWithSplitIds> batch1 = reader.next(); + assertRecordsAndPosition(recordBatchList.get(1), 1, 0L, batch1); + batch1.recycle(); + + RecordsWithSplitIds> batch2 = reader.next(); + assertRecordsAndPosition(recordBatchList.get(2), 2, 0L, batch2); + batch2.recycle(); + } + + @TestTemplate + public void testCheckpointedPositionAfterFirstFile() throws IOException { + List> recordBatchList = + ReaderUtil.createRecordBatchList(TestFixtures.SCHEMA, 3, 2); + CombinedScanTask combinedScanTask = + ReaderUtil.createCombinedScanTask( + recordBatchList, temporaryFolder, fileFormat, appenderFactory); + IcebergSourceSplit split = IcebergSourceSplit.fromCombinedScanTask(combinedScanTask, 0, 2L); + CloseableIterator>> reader = + readerFunction().apply(split); + + RecordsWithSplitIds> batch1 = reader.next(); + assertRecordsAndPosition(recordBatchList.get(1), 1, 0L, batch1); + batch1.recycle(); + + RecordsWithSplitIds> batch2 = reader.next(); + assertRecordsAndPosition(recordBatchList.get(2), 2, 0L, batch2); + batch2.recycle(); + } + + @TestTemplate + public void testCheckpointedPositionBeforeSecondFile() throws IOException { + List> recordBatchList = + ReaderUtil.createRecordBatchList(TestFixtures.SCHEMA, 3, 2); + CombinedScanTask combinedScanTask = + ReaderUtil.createCombinedScanTask( + recordBatchList, temporaryFolder, fileFormat, appenderFactory); + IcebergSourceSplit split = IcebergSourceSplit.fromCombinedScanTask(combinedScanTask, 1, 0L); + CloseableIterator>> reader = + readerFunction().apply(split); + + RecordsWithSplitIds> batch1 = reader.next(); + assertRecordsAndPosition(recordBatchList.get(1), 1, 0L, batch1); + batch1.recycle(); + + RecordsWithSplitIds> batch2 = reader.next(); + assertRecordsAndPosition(recordBatchList.get(2), 2, 0L, batch2); + batch2.recycle(); + } + + @TestTemplate + public void testCheckpointedPositionMidSecondFile() throws IOException { + List> recordBatchList = + ReaderUtil.createRecordBatchList(TestFixtures.SCHEMA, 3, 2); + CombinedScanTask combinedScanTask = + ReaderUtil.createCombinedScanTask( + recordBatchList, temporaryFolder, fileFormat, appenderFactory); + IcebergSourceSplit split = IcebergSourceSplit.fromCombinedScanTask(combinedScanTask, 1, 1L); + CloseableIterator>> reader = + readerFunction().apply(split); + + RecordsWithSplitIds> 
batch1 = reader.next(); + assertRecordsAndPosition(recordBatchList.get(1).subList(1, 2), 1, 1L, batch1); + batch1.recycle(); + + RecordsWithSplitIds> batch2 = reader.next(); + assertRecordsAndPosition(recordBatchList.get(2), 2, 0L, batch2); + batch2.recycle(); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderUtil.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderUtil.java new file mode 100644 index 000000000000..0edf8ae009fe --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderUtil.java @@ -0,0 +1,128 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.reader; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Path; +import java.util.Collections; +import java.util.List; +import org.apache.flink.table.data.RowData; +import org.apache.iceberg.BaseCombinedScanTask; +import org.apache.iceberg.BaseFileScanTask; +import org.apache.iceberg.CombinedScanTask; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.DataFiles; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.FileScanTask; +import org.apache.iceberg.Files; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.PartitionSpecParser; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SchemaParser; +import org.apache.iceberg.data.GenericAppenderFactory; +import org.apache.iceberg.data.RandomGenericData; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.encryption.PlaintextEncryptionManager; +import org.apache.iceberg.expressions.Expressions; +import org.apache.iceberg.expressions.ResidualEvaluator; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.flink.source.DataIterator; +import org.apache.iceberg.flink.source.RowDataFileScanTaskReader; +import org.apache.iceberg.hadoop.HadoopFileIO; +import org.apache.iceberg.io.FileAppender; +import org.apache.iceberg.io.FileAppenderFactory; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; + +public class ReaderUtil { + + private ReaderUtil() {} + + public static FileScanTask createFileTask( + List records, + File file, + FileFormat fileFormat, + FileAppenderFactory appenderFactory) + throws IOException { + FileAppender appender = + appenderFactory.newAppender(Files.localOutput(file), fileFormat); + try { + appender.addAll(records); + } finally { + appender.close(); + } + + DataFile dataFile = + DataFiles.builder(PartitionSpec.unpartitioned()) + .withRecordCount(records.size()) + .withFileSizeInBytes(file.length()) + .withPath(file.toString()) + .withFormat(fileFormat) + .withMetrics(appender.metrics()) + .build(); + + ResidualEvaluator residuals 
= ResidualEvaluator.unpartitioned(Expressions.alwaysTrue()); + return new BaseFileScanTask( + dataFile, + null, + SchemaParser.toJson(TestFixtures.SCHEMA), + PartitionSpecParser.toJson(PartitionSpec.unpartitioned()), + residuals); + } + + public static DataIterator createDataIterator(CombinedScanTask combinedTask) { + return new DataIterator<>( + new RowDataFileScanTaskReader( + TestFixtures.SCHEMA, TestFixtures.SCHEMA, null, true, Collections.emptyList()), + combinedTask, + new HadoopFileIO(new org.apache.hadoop.conf.Configuration()), + PlaintextEncryptionManager.instance()); + } + + public static List> createRecordBatchList( + Schema schema, int listSize, int batchCount) { + return createRecordBatchList(0L, schema, listSize, batchCount); + } + + public static List> createRecordBatchList( + long seed, Schema schema, int listSize, int batchCount) { + List records = RandomGenericData.generate(schema, listSize * batchCount, seed); + return Lists.partition(records, batchCount); + } + + public static CombinedScanTask createCombinedScanTask( + List> recordBatchList, + Path temporaryFolder, + FileFormat fileFormat, + GenericAppenderFactory appenderFactory) + throws IOException { + List fileTasks = Lists.newArrayListWithCapacity(recordBatchList.size()); + for (List recordBatch : recordBatchList) { + FileScanTask fileTask = + ReaderUtil.createFileTask( + recordBatch, + File.createTempFile("junit", null, temporaryFolder.toFile()), + fileFormat, + appenderFactory); + fileTasks.add(fileTask); + } + + return new BaseCombinedScanTask(fileTasks); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestArrayBatchRecords.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestArrayBatchRecords.java new file mode 100644 index 000000000000..6f09bd9a56d6 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestArrayBatchRecords.java @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source.reader; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.util.concurrent.atomic.AtomicBoolean; +import org.junit.jupiter.api.Test; + +public class TestArrayBatchRecords { + + @Test + public void testFullRange() { + String[] elements = new String[] {"0", "1", "2", "3"}; + testArray(elements, elements.length, 2, 119); + } + + @Test + public void testSubRange() { + String[] elements = new String[] {"0", "1", "2", "3"}; + testArray(elements, 2, 0, 0); + } + + private void testArray( + String[] elements, int numberOfRecords, int fileOffset, long startingRecordOffset) { + String splitId = "iceberg_split_1"; + AtomicBoolean recycled = new AtomicBoolean(); + + ArrayBatchRecords recordsWithSplitIds = + ArrayBatchRecords.forRecords( + splitId, + ignored -> recycled.set(true), + elements, + numberOfRecords, + fileOffset, + startingRecordOffset); + + assertThat(recordsWithSplitIds.nextSplit()).isEqualTo(splitId); + + for (int i = 0; i < numberOfRecords; i++) { + RecordAndPosition recAndPos = recordsWithSplitIds.nextRecordFromSplit(); + assertThat(recAndPos.record()).isEqualTo(elements[i]); + assertThat(recAndPos.fileOffset()).isEqualTo(fileOffset); + // recordOffset points to the position after this one + assertThat(recAndPos.recordOffset()).isEqualTo(startingRecordOffset + i + 1); + } + + assertThat(recordsWithSplitIds.nextRecordFromSplit()).isNull(); + assertThat(recordsWithSplitIds.nextSplit()).isNull(); + recordsWithSplitIds.recycle(); + assertThat(recycled.get()).isTrue(); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestArrayPoolDataIteratorBatcherRowData.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestArrayPoolDataIteratorBatcherRowData.java new file mode 100644 index 000000000000..1a78bb1b0010 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestArrayPoolDataIteratorBatcherRowData.java @@ -0,0 +1,360 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source.reader; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.File; +import java.nio.file.Path; +import java.util.Arrays; +import java.util.List; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; +import org.apache.flink.connector.base.source.reader.SourceReaderOptions; +import org.apache.flink.table.data.RowData; +import org.apache.iceberg.BaseCombinedScanTask; +import org.apache.iceberg.CombinedScanTask; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.FileScanTask; +import org.apache.iceberg.data.GenericAppenderFactory; +import org.apache.iceberg.data.RandomGenericData; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.FlinkConfigOptions; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.flink.TestHelpers; +import org.apache.iceberg.flink.source.DataIterator; +import org.apache.iceberg.io.CloseableIterator; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +public class TestArrayPoolDataIteratorBatcherRowData { + + @TempDir protected Path temporaryFolder; + private static final FileFormat FILE_FORMAT = FileFormat.PARQUET; + private final Configuration config = + new Configuration() + .set(SourceReaderOptions.ELEMENT_QUEUE_CAPACITY, 1) + .set(FlinkConfigOptions.SOURCE_READER_FETCH_BATCH_RECORD_COUNT, 2); + + private final GenericAppenderFactory appenderFactory = + new GenericAppenderFactory(TestFixtures.SCHEMA); + private final DataIteratorBatcher batcher = + new ArrayPoolDataIteratorBatcher<>(config, new RowDataRecordFactory(TestFixtures.ROW_TYPE)); + + /** Read a CombinedScanTask that contains a single file with less than a full batch of records */ + @Test + public void testSingleFileLessThanOneFullBatch() throws Exception { + List records = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 1); + FileScanTask fileTask = + ReaderUtil.createFileTask( + records, + File.createTempFile("junit", null, temporaryFolder.toFile()), + FILE_FORMAT, + appenderFactory); + CombinedScanTask combinedTask = new BaseCombinedScanTask(fileTask); + DataIterator dataIterator = ReaderUtil.createDataIterator(combinedTask); + String splitId = "someSplitId"; + CloseableIterator>> recordBatchIterator = + batcher.batch(splitId, dataIterator); + + ArrayBatchRecords batch = (ArrayBatchRecords) recordBatchIterator.next(); + assertThat(batch.finishedSplits()).isEmpty(); + assertThat(batch.nextSplit()).isEqualTo(splitId); + assertThat(batch.records()).hasSize(2); + assertThat(batch.numberOfRecords()).isEqualTo(1); + + RecordAndPosition recordAndPosition = batch.nextRecordFromSplit(); + + /////////////////////////////// + // assert first record + + assertThat(recordAndPosition.fileOffset()).isEqualTo(0); + assertThat(recordAndPosition.recordOffset()) + .as("The position points to where the reader should resume after this record is processed.") + .isEqualTo(1); + TestHelpers.assertRowData(TestFixtures.SCHEMA, records.get(0), recordAndPosition.record()); + + assertThat(batch.nextRecordFromSplit()).isNull(); + assertThat(batch.nextSplit()).isNull(); + batch.recycle(); + + assertThat(recordBatchIterator).isExhausted(); + } + + /** + * Read a CombinedScanTask that contains a single file with multiple batches. + * + *
    Insert 5 records in a single file that should result in 3 batches + */ + @Test + public void testSingleFileWithMultipleBatches() throws Exception { + List records = RandomGenericData.generate(TestFixtures.SCHEMA, 5, 1); + FileScanTask fileTask = + ReaderUtil.createFileTask( + records, + File.createTempFile("junit", null, temporaryFolder.toFile()), + FILE_FORMAT, + appenderFactory); + CombinedScanTask combinedTask = new BaseCombinedScanTask(fileTask); + DataIterator dataIterator = ReaderUtil.createDataIterator(combinedTask); + String splitId = "someSplitId"; + CloseableIterator>> recordBatchIterator = + batcher.batch(splitId, dataIterator); + + /////////////////////////////// + // assert first batch with full batch of 2 records + + ArrayBatchRecords batch0 = (ArrayBatchRecords) recordBatchIterator.next(); + assertThat(batch0.finishedSplits()).isEmpty(); + assertThat(batch0.nextSplit()).isEqualTo(splitId); + assertThat(batch0.records()).hasSize(2); + assertThat(batch0.numberOfRecords()).isEqualTo(2); + + RecordAndPosition recordAndPosition; + + // assert first record + recordAndPosition = batch0.nextRecordFromSplit(); + assertThat(recordAndPosition.fileOffset()).isEqualTo(0); + assertThat(recordAndPosition.recordOffset()) + .as("The position points to where the reader should resume after this record is processed.") + .isEqualTo(1); + TestHelpers.assertRowData(TestFixtures.SCHEMA, records.get(0), recordAndPosition.record()); + + // assert second record + recordAndPosition = batch0.nextRecordFromSplit(); + assertThat(recordAndPosition.fileOffset()).isEqualTo(0); + assertThat(recordAndPosition.recordOffset()) + .as("The position points to where the reader should resume after this record is processed.") + .isEqualTo(2); + TestHelpers.assertRowData(TestFixtures.SCHEMA, records.get(1), recordAndPosition.record()); + + assertThat(batch0.nextRecordFromSplit()).isNull(); + assertThat(batch0.nextSplit()).isNull(); + batch0.recycle(); + + /////////////////////////////// + // assert second batch with full batch of 2 records + + ArrayBatchRecords batch1 = (ArrayBatchRecords) recordBatchIterator.next(); + assertThat(batch1.records()).containsExactlyInAnyOrder(batch0.records()); + assertThat(batch1.finishedSplits()).isEmpty(); + assertThat(batch1.nextSplit()).isEqualTo(splitId); + assertThat(batch1.records()).hasSize(2); + assertThat(batch1.numberOfRecords()).isEqualTo(2); + + // assert third record + recordAndPosition = batch1.nextRecordFromSplit(); + assertThat(recordAndPosition.fileOffset()).isEqualTo(0); + assertThat(recordAndPosition.recordOffset()) + .as("The position points to where the reader should resume after this record is processed.") + .isEqualTo(3); + TestHelpers.assertRowData(TestFixtures.SCHEMA, records.get(2), recordAndPosition.record()); + + // assert fourth record + recordAndPosition = batch1.nextRecordFromSplit(); + assertThat(recordAndPosition.fileOffset()).isEqualTo(0); + assertThat(recordAndPosition.recordOffset()) + .as("The position points to where the reader should resume after this record is processed.") + .isEqualTo(4); + TestHelpers.assertRowData(TestFixtures.SCHEMA, records.get(3), recordAndPosition.record()); + + assertThat(batch1.nextRecordFromSplit()).isNull(); + assertThat(batch1.nextSplit()).isNull(); + batch1.recycle(); + + /////////////////////////////// + // assert third batch with partial batch of 1 record + + ArrayBatchRecords batch2 = (ArrayBatchRecords) recordBatchIterator.next(); + assertThat(batch2.records()).containsExactlyInAnyOrder(batch0.records()); + 
assertThat(batch2.finishedSplits()).isEmpty(); + assertThat(batch2.nextSplit()).isEqualTo(splitId); + assertThat(batch2.records()).hasSize(2); + assertThat(batch2.numberOfRecords()).isEqualTo(1); + + // assert fifth record + recordAndPosition = batch2.nextRecordFromSplit(); + assertThat(recordAndPosition.fileOffset()).isEqualTo(0); + assertThat(recordAndPosition.recordOffset()) + .as("The position points to where the reader should resume after this record is processed.") + .isEqualTo(5); + TestHelpers.assertRowData(TestFixtures.SCHEMA, records.get(4), recordAndPosition.record()); + + assertThat(batch2.nextRecordFromSplit()).isNull(); + assertThat(batch2.nextSplit()).isNull(); + batch2.recycle(); + + assertThat(recordBatchIterator).isExhausted(); + } + + /** + * Read a CombinedScanTask that contains multiple files. + * + *
    In this test, we also seek the iterator to starting position (1, 1). + */ + @Test + public void testMultipleFilesWithSeekPosition() throws Exception { + List records0 = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 1); + FileScanTask fileTask0 = + ReaderUtil.createFileTask( + records0, + File.createTempFile("junit", null, temporaryFolder.toFile()), + FILE_FORMAT, + appenderFactory); + List records1 = RandomGenericData.generate(TestFixtures.SCHEMA, 4, 2); + FileScanTask fileTask1 = + ReaderUtil.createFileTask( + records1, + File.createTempFile("junit", null, temporaryFolder.toFile()), + FILE_FORMAT, + appenderFactory); + List records2 = RandomGenericData.generate(TestFixtures.SCHEMA, 3, 3); + FileScanTask fileTask2 = + ReaderUtil.createFileTask( + records2, + File.createTempFile("junit", null, temporaryFolder.toFile()), + FILE_FORMAT, + appenderFactory); + CombinedScanTask combinedTask = + new BaseCombinedScanTask(Arrays.asList(fileTask0, fileTask1, fileTask2)); + + DataIterator dataIterator = ReaderUtil.createDataIterator(combinedTask); + dataIterator.seek(1, 1); + + String splitId = "someSplitId"; + CloseableIterator>> recordBatchIterator = + batcher.batch(splitId, dataIterator); + + /////////////////////////////// + // file0 is skipped by seek + + /////////////////////////////// + // file1 has 4 records. because the seek position, first record is skipped. + // we should read 3 remaining records in 2 batches: + // batch10 with 2 records and batch11 with 1 records. + + // assert first batch from file1 with full batch of 2 records + + // variable naming convention: batch + ArrayBatchRecords batch10 = (ArrayBatchRecords) recordBatchIterator.next(); + assertThat(batch10.finishedSplits()).isEmpty(); + assertThat(batch10.nextSplit()).isEqualTo(splitId); + assertThat(batch10.records()).hasSize(2); + assertThat(batch10.numberOfRecords()).isEqualTo(2); + + RecordAndPosition recordAndPosition; + + recordAndPosition = batch10.nextRecordFromSplit(); + assertThat(recordAndPosition.fileOffset()).isEqualTo(1); + assertThat(recordAndPosition.recordOffset()) + .as("seek should skip the first record in file1. 
starting from the second record") + .isEqualTo(2); + TestHelpers.assertRowData(TestFixtures.SCHEMA, records1.get(1), recordAndPosition.record()); + + recordAndPosition = batch10.nextRecordFromSplit(); + assertThat(recordAndPosition.fileOffset()).isEqualTo(1); + assertThat(recordAndPosition.recordOffset()) + .as("The position points to where the reader should resume after this record is processed.") + .isEqualTo(3); + TestHelpers.assertRowData(TestFixtures.SCHEMA, records1.get(2), recordAndPosition.record()); + + assertThat(batch10.nextRecordFromSplit()).isNull(); + assertThat(batch10.nextSplit()).isNull(); + batch10.recycle(); + + // assert second batch from file1 with partial batch of 1 record + + // variable naming convention: batch__ + ArrayBatchRecords batch11 = (ArrayBatchRecords) recordBatchIterator.next(); + assertThat(batch11.records()).containsExactlyInAnyOrder(batch10.records()); + assertThat(batch11.finishedSplits()).isEmpty(); + assertThat(batch11.nextSplit()).isEqualTo(splitId); + assertThat(batch11.records()).hasSize(2); + assertThat(batch11.numberOfRecords()).isEqualTo(1); + + recordAndPosition = batch11.nextRecordFromSplit(); + assertThat(recordAndPosition.fileOffset()).isEqualTo(1); + assertThat(recordAndPosition.recordOffset()) + .as("The position points to where the reader should resume after this record is processed.") + .isEqualTo(4); + TestHelpers.assertRowData(TestFixtures.SCHEMA, records1.get(3), recordAndPosition.record()); + + assertThat(batch11.nextRecordFromSplit()).isNull(); + assertThat(batch11.nextSplit()).isNull(); + batch11.recycle(); + + /////////////////////////////// + // file2 has 3 records. + // we should read 3 records in 2 batches: + // batch20 with 2 records and batch21 with 1 records + + // assert first batch from file2 with full batch of 2 records + + // variable naming convention: batch__ + ArrayBatchRecords batch20 = (ArrayBatchRecords) recordBatchIterator.next(); + assertThat(batch20.records()).containsExactlyInAnyOrder(batch10.records()); + assertThat(batch20.finishedSplits()).isEmpty(); + assertThat(batch20.nextSplit()).isEqualTo(splitId); + assertThat(batch20.records()).hasSize(2); + assertThat(batch20.numberOfRecords()).isEqualTo(2); + + recordAndPosition = batch20.nextRecordFromSplit(); + assertThat(recordAndPosition.fileOffset()).isEqualTo(2); + assertThat(recordAndPosition.recordOffset()) + .as("The position points to where the reader should resume after this record is processed.") + .isEqualTo(1); + TestHelpers.assertRowData(TestFixtures.SCHEMA, records2.get(0), recordAndPosition.record()); + + recordAndPosition = batch20.nextRecordFromSplit(); + assertThat(recordAndPosition.fileOffset()).isEqualTo(2); + assertThat(recordAndPosition.recordOffset()) + .as("The position points to where the reader should resume after this record is processed.") + .isEqualTo(2); + TestHelpers.assertRowData(TestFixtures.SCHEMA, records2.get(1), recordAndPosition.record()); + + assertThat(batch20.nextRecordFromSplit()).isNull(); + assertThat(batch20.nextSplit()).isNull(); + batch20.recycle(); + + /////////////////////////////// + // assert second batch from file2 with partial batch of 1 record + + // variable naming convention: batch__ + ArrayBatchRecords batch21 = (ArrayBatchRecords) recordBatchIterator.next(); + assertThat(batch21.records()).containsExactlyInAnyOrder(batch10.records()); + assertThat(batch21.finishedSplits()).isEmpty(); + assertThat(batch21.nextSplit()).isEqualTo(splitId); + assertThat(batch21.records()).hasSize(2); + 
assertThat(batch21.numberOfRecords()).isEqualTo(1); + + recordAndPosition = batch21.nextRecordFromSplit(); + assertThat(recordAndPosition.fileOffset()).isEqualTo(2); + + assertThat(recordAndPosition.recordOffset()) + .as("The position points to where the reader should resume after this record is processed.") + .isEqualTo(3); + TestHelpers.assertRowData(TestFixtures.SCHEMA, records2.get(2), recordAndPosition.record()); + + assertThat(batch21.nextRecordFromSplit()).isNull(); + assertThat(batch21.nextSplit()).isNull(); + batch21.recycle(); + + assertThat(recordBatchIterator).isExhausted(); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestColumnStatsWatermarkExtractor.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestColumnStatsWatermarkExtractor.java new file mode 100644 index 000000000000..af806d4c655d --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestColumnStatsWatermarkExtractor.java @@ -0,0 +1,176 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source.reader; + +import static org.apache.iceberg.flink.TestFixtures.DATABASE; +import static org.apache.iceberg.types.Types.NestedField.required; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; +import static org.assertj.core.api.Assumptions.assumeThat; + +import java.io.IOException; +import java.nio.file.Path; +import java.time.LocalDateTime; +import java.time.OffsetDateTime; +import java.time.ZoneOffset; +import java.util.Collection; +import java.util.List; +import java.util.Map; +import java.util.concurrent.TimeUnit; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Parameter; +import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.Schema; +import org.apache.iceberg.data.GenericAppenderFactory; +import org.apache.iceberg.data.RandomGenericData; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.HadoopTableExtension; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.extension.ExtendWith; +import org.junit.jupiter.api.extension.RegisterExtension; +import org.junit.jupiter.api.io.TempDir; + +@ExtendWith(ParameterizedTestExtension.class) +public class TestColumnStatsWatermarkExtractor { + public static final Schema SCHEMA = + new Schema( + required(1, "timestamp_column", Types.TimestampType.withoutZone()), + required(2, "timestamptz_column", Types.TimestampType.withZone()), + required(3, "long_column", Types.LongType.get()), + required(4, "string_column", Types.StringType.get())); + + private static final GenericAppenderFactory APPENDER_FACTORY = new GenericAppenderFactory(SCHEMA); + + private static final List> TEST_RECORDS = + ImmutableList.of( + RandomGenericData.generate(SCHEMA, 3, 2L), RandomGenericData.generate(SCHEMA, 3, 19L)); + + private static final List> MIN_VALUES = + ImmutableList.of(Maps.newHashMapWithExpectedSize(3), Maps.newHashMapWithExpectedSize(3)); + + @TempDir protected Path temporaryFolder; + + @RegisterExtension + private static final HadoopTableExtension SOURCE_TABLE_EXTENSION = + new HadoopTableExtension(DATABASE, TestFixtures.TABLE, SCHEMA); + + @Parameter(index = 0) + private String columnName; + + @BeforeAll + public static void updateMinValue() { + for (int i = 0; i < TEST_RECORDS.size(); ++i) { + for (Record r : TEST_RECORDS.get(i)) { + Map minValues = MIN_VALUES.get(i); + + LocalDateTime localDateTime = (LocalDateTime) r.get(0); + minValues.merge( + "timestamp_column", localDateTime.toInstant(ZoneOffset.UTC).toEpochMilli(), Math::min); + + OffsetDateTime offsetDateTime = (OffsetDateTime) r.get(1); + minValues.merge("timestamptz_column", offsetDateTime.toInstant().toEpochMilli(), Math::min); + + minValues.merge("long_column", (Long) r.get(2), Math::min); + } + } + } + + @Parameters(name = "columnName = {0}") + public static Collection data() { + return ImmutableList.of( + new Object[] {"timestamp_column"}, + new Object[] {"timestamptz_column"}, + new Object[] {"long_column"}); + } + + @TestTemplate + public void testSingle() throws IOException { + ColumnStatsWatermarkExtractor 
extractor = + new ColumnStatsWatermarkExtractor(SCHEMA, columnName, TimeUnit.MILLISECONDS); + + assertThat(extractor.extractWatermark(split(0))) + .isEqualTo(MIN_VALUES.get(0).get(columnName).longValue()); + } + + @TestTemplate + public void testTimeUnit() throws IOException { + assumeThat(columnName).isEqualTo("long_column"); + ColumnStatsWatermarkExtractor extractor = + new ColumnStatsWatermarkExtractor(SCHEMA, columnName, TimeUnit.MICROSECONDS); + + assertThat(extractor.extractWatermark(split(0))) + .isEqualTo(MIN_VALUES.get(0).get(columnName) / 1000L); + } + + @TestTemplate + public void testMultipleFiles() throws IOException { + assumeThat(columnName).isEqualTo("timestamp_column"); + IcebergSourceSplit combinedSplit = + IcebergSourceSplit.fromCombinedScanTask( + ReaderUtil.createCombinedScanTask( + TEST_RECORDS, temporaryFolder, FileFormat.PARQUET, APPENDER_FACTORY)); + + ColumnStatsWatermarkExtractor extractor = + new ColumnStatsWatermarkExtractor(SCHEMA, columnName, null); + + assertThat(extractor.extractWatermark(split(0))) + .isEqualTo(MIN_VALUES.get(0).get(columnName).longValue()); + assertThat(extractor.extractWatermark(split(1))) + .isEqualTo(MIN_VALUES.get(1).get(columnName).longValue()); + assertThat(extractor.extractWatermark(combinedSplit)) + .isEqualTo(Math.min(MIN_VALUES.get(0).get(columnName), MIN_VALUES.get(1).get(columnName))); + } + + @TestTemplate + public void testWrongColumn() { + assumeThat(columnName).isEqualTo("string_column"); + assertThatThrownBy(() -> new ColumnStatsWatermarkExtractor(SCHEMA, columnName, null)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining( + "Found STRING, expected a LONG or TIMESTAMP column for watermark generation."); + } + + @TestTemplate + public void testEmptyStatistics() throws IOException { + assumeThat(columnName).isEqualTo("timestamp_column"); + + // Create an extractor for a column we do not have statistics + ColumnStatsWatermarkExtractor extractor = + new ColumnStatsWatermarkExtractor(10, "missing_field"); + assertThatThrownBy(() -> extractor.extractWatermark(split(0))) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("Missing statistics for column"); + } + + private IcebergSourceSplit split(int id) throws IOException { + return IcebergSourceSplit.fromCombinedScanTask( + ReaderUtil.createCombinedScanTask( + ImmutableList.of(TEST_RECORDS.get(id)), + temporaryFolder, + FileFormat.PARQUET, + APPENDER_FACTORY)); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestIcebergSourceReader.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestIcebergSourceReader.java new file mode 100644 index 000000000000..8d6782586676 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestIcebergSourceReader.java @@ -0,0 +1,181 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.reader; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.nio.file.Path; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import org.apache.flink.api.connector.source.SourceReaderContext; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.connector.testutils.source.reader.TestingReaderContext; +import org.apache.flink.connector.testutils.source.reader.TestingReaderOutput; +import org.apache.flink.metrics.MetricGroup; +import org.apache.flink.table.data.RowData; +import org.apache.iceberg.CombinedScanTask; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.data.GenericAppenderFactory; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.encryption.PlaintextEncryptionManager; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.flink.TestHelpers; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; +import org.apache.iceberg.flink.source.split.SerializableComparator; +import org.apache.iceberg.hadoop.HadoopFileIO; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +public class TestIcebergSourceReader { + @TempDir protected Path temporaryFolder; + + private final GenericAppenderFactory appenderFactory = + new GenericAppenderFactory(TestFixtures.SCHEMA); + + @Test + public void testReaderMetrics() throws Exception { + TestingReaderOutput readerOutput = new TestingReaderOutput<>(); + TestingMetricGroup metricGroup = new TestingMetricGroup(); + TestingReaderContext readerContext = new TestingReaderContext(new Configuration(), metricGroup); + IcebergSourceReader reader = createReader(metricGroup, readerContext, null); + reader.start(); + + testOneSplitFetcher(reader, readerOutput, metricGroup, 1); + testOneSplitFetcher(reader, readerOutput, metricGroup, 2); + } + + @Test + public void testReaderOrder() throws Exception { + // Create 2 splits + List> recordBatchList1 = + ReaderUtil.createRecordBatchList(0L, TestFixtures.SCHEMA, 1, 1); + CombinedScanTask task1 = + ReaderUtil.createCombinedScanTask( + recordBatchList1, temporaryFolder, FileFormat.PARQUET, appenderFactory); + + List> recordBatchList2 = + ReaderUtil.createRecordBatchList(1L, TestFixtures.SCHEMA, 1, 1); + CombinedScanTask task2 = + ReaderUtil.createCombinedScanTask( + recordBatchList2, temporaryFolder, FileFormat.PARQUET, appenderFactory); + + // Sort the splits in one way + List rowDataList1 = + read( + Arrays.asList( + IcebergSourceSplit.fromCombinedScanTask(task1), + IcebergSourceSplit.fromCombinedScanTask(task2)), + 2); + + // Reverse the splits + List rowDataList2 = + read( + Arrays.asList( + IcebergSourceSplit.fromCombinedScanTask(task2), + IcebergSourceSplit.fromCombinedScanTask(task1)), + 2); + + // Check that the order of the elements is not changed + assertThat(rowDataList1).containsExactlyElementsOf(rowDataList2); + } + + private List read(List splits, long expected) throws Exception { + TestingMetricGroup metricGroup = new TestingMetricGroup(); + TestingReaderContext readerContext = 
new TestingReaderContext(new Configuration(), metricGroup); + // Using IdBasedComparator, so we can have a deterministic order of the splits + IcebergSourceReader reader = createReader(metricGroup, readerContext, new IdBasedComparator()); + reader.start(); + + reader.addSplits(splits); + TestingReaderOutput readerOutput = new TestingReaderOutput<>(); + while (readerOutput.getEmittedRecords().size() < expected) { + reader.pollNext(readerOutput); + } + + reader.pollNext(readerOutput); + + assertThat(readerOutput.getEmittedRecords()).hasSize((int) expected); + return readerOutput.getEmittedRecords(); + } + + private void testOneSplitFetcher( + IcebergSourceReader reader, + TestingReaderOutput readerOutput, + TestingMetricGroup metricGroup, + int expectedCount) + throws Exception { + long seed = expectedCount; + // Each split should contain only one file with one record + List> recordBatchList = + ReaderUtil.createRecordBatchList(seed, TestFixtures.SCHEMA, 1, 1); + CombinedScanTask task = + ReaderUtil.createCombinedScanTask( + recordBatchList, temporaryFolder, FileFormat.PARQUET, appenderFactory); + IcebergSourceSplit split = IcebergSourceSplit.fromCombinedScanTask(task); + reader.addSplits(Collections.singletonList(split)); + + while (readerOutput.getEmittedRecords().size() < expectedCount) { + reader.pollNext(readerOutput); + } + + assertThat(readerOutput.getEmittedRecords()).hasSize(expectedCount); + TestHelpers.assertRowData( + TestFixtures.SCHEMA, + recordBatchList.get(0).get(0), + readerOutput.getEmittedRecords().get(expectedCount - 1)); + assertThat(metricGroup.counters().get("assignedSplits").getCount()).isEqualTo(expectedCount); + + // One more poll will get null record batch. + // That will finish the split and cause split fetcher to be closed due to idleness. + // Then next split will create a new split reader. + reader.pollNext(readerOutput); + } + + private IcebergSourceReader createReader( + MetricGroup metricGroup, + SourceReaderContext readerContext, + SerializableComparator splitComparator) { + IcebergSourceReaderMetrics readerMetrics = + new IcebergSourceReaderMetrics(metricGroup, "db.tbl"); + RowDataReaderFunction readerFunction = + new RowDataReaderFunction( + new Configuration(), + TestFixtures.SCHEMA, + TestFixtures.SCHEMA, + null, + true, + new HadoopFileIO(new org.apache.hadoop.conf.Configuration()), + PlaintextEncryptionManager.instance(), + Collections.emptyList()); + return new IcebergSourceReader<>( + SerializableRecordEmitter.defaultEmitter(), + readerMetrics, + readerFunction, + splitComparator, + readerContext); + } + + private static class IdBasedComparator implements SerializableComparator { + @Override + public int compare(IcebergSourceSplit o1, IcebergSourceSplit o2) { + return o1.splitId().compareTo(o2.splitId()); + } + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestLimitableDataIterator.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestLimitableDataIterator.java new file mode 100644 index 000000000000..36749d3ec2dc --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestLimitableDataIterator.java @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
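A condensed wiring sketch of the pieces used in createReader() above, assuming the reader is parameterized by the emitted record type (RowData here); readerFunction, readerContext and splits are placeholders for the test doubles built in this class:

    IcebergSourceReaderMetrics metrics =
        new IcebergSourceReaderMetrics(new TestingMetricGroup(), "db.tbl");
    IcebergSourceReader<RowData> reader =
        new IcebergSourceReader<>(
            SerializableRecordEmitter.defaultEmitter(),
            metrics,
            readerFunction,          // e.g. the RowDataReaderFunction configured above
            new IdBasedComparator(), // or null to keep the reader's default split order
            readerContext);
    reader.start();
    reader.addSplits(splits);
    // Records are then pulled with reader.pollNext(output) until the expected count is reached,
    // as the read() helper above does.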
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.reader; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.nio.file.Path; +import java.util.Collections; +import java.util.List; +import org.apache.flink.table.data.RowData; +import org.apache.iceberg.CombinedScanTask; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.data.GenericAppenderFactory; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.encryption.EncryptionManager; +import org.apache.iceberg.encryption.PlaintextEncryptionManager; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.flink.source.RowDataFileScanTaskReader; +import org.apache.iceberg.hadoop.HadoopFileIO; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.io.TempDir; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; + +public class TestLimitableDataIterator { + @TempDir private static Path temporaryFolder; + + private final RowDataFileScanTaskReader reader = + new RowDataFileScanTaskReader( + TestFixtures.SCHEMA, TestFixtures.SCHEMA, null, true, Collections.emptyList()); + private final HadoopFileIO fileIO = new HadoopFileIO(new org.apache.hadoop.conf.Configuration()); + private final EncryptionManager encryptionManager = PlaintextEncryptionManager.instance(); + + private static CombinedScanTask combinedScanTask; + private static int totalRecords; + + @BeforeAll + public static void beforeClass() throws Exception { + GenericAppenderFactory appenderFactory = new GenericAppenderFactory(TestFixtures.SCHEMA); + List> recordBatchList = + ReaderUtil.createRecordBatchList(TestFixtures.SCHEMA, 3, 2); + combinedScanTask = + ReaderUtil.createCombinedScanTask( + recordBatchList, temporaryFolder, FileFormat.PARQUET, appenderFactory); + totalRecords = 3 * 2; + } + + @ParameterizedTest + @ValueSource(longs = {-1L, 0L, 1L, 6L, 7L}) + public void testUnlimited(long limit) { + LimitableDataIterator dataIterator = + new LimitableDataIterator<>( + reader, combinedScanTask, fileIO, encryptionManager, RecordLimiter.create(limit)); + + List result = Lists.newArrayList(); + while (dataIterator.hasNext()) { + result.add(dataIterator.next()); + } + + if (limit <= 0 || limit > totalRecords) { + // read all records + assertThat(result).hasSize(totalRecords); + } else { + assertThat(result).hasSize((int) limit); + } + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestRowDataReaderFunction.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestRowDataReaderFunction.java new file mode 100644 index 000000000000..55f9c0af3a29 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestRowDataReaderFunction.java @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * 
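A minimal sketch of the limit contract exercised by testUnlimited above; reader, task, fileIO and encryptionManager stand for the fixtures defined in that class:

    LimitableDataIterator<RowData> limited =
        new LimitableDataIterator<>(
            reader, task, fileIO, encryptionManager, RecordLimiter.create(3L));
    int count = 0;
    while (limited.hasNext()) {
      limited.next();
      count++;
    }
    // With six records in the task, count is 3 here; a non-positive limit such as
    // RecordLimiter.create(0L) reads all six, matching the branches asserted above.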
or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.reader; + +import java.util.Collections; +import java.util.List; +import java.util.stream.Collectors; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.conversion.DataStructureConverter; +import org.apache.flink.table.data.conversion.DataStructureConverters; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.table.types.utils.TypeConversions; +import org.apache.flink.types.Row; +import org.apache.iceberg.Schema; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.encryption.PlaintextEncryptionManager; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.flink.TestHelpers; +import org.apache.iceberg.hadoop.HadoopFileIO; + +public class TestRowDataReaderFunction extends ReaderFunctionTestBase { + + protected static final RowType ROW_TYPE = FlinkSchemaUtil.convert(TestFixtures.SCHEMA); + private static final DataStructureConverter ROW_DATA_CONVERTER = + DataStructureConverters.getConverter(TypeConversions.fromLogicalToDataType(ROW_TYPE)); + + @Override + protected ReaderFunction readerFunction() { + return new RowDataReaderFunction( + new Configuration(), + TestFixtures.SCHEMA, + TestFixtures.SCHEMA, + null, + true, + new HadoopFileIO(new org.apache.hadoop.conf.Configuration()), + PlaintextEncryptionManager.instance(), + Collections.emptyList()); + } + + @Override + protected void assertRecords(List expected, List actual, Schema schema) { + List rows = toRows(actual); + TestHelpers.assertRecords(rows, expected, TestFixtures.SCHEMA); + } + + private List toRows(List actual) { + return actual.stream() + .map(rowData -> (Row) ROW_DATA_CONVERTER.toExternal(rowData)) + .collect(Collectors.toList()); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestingMetricGroup.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestingMetricGroup.java new file mode 100644 index 000000000000..290628c5fc90 --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestingMetricGroup.java @@ -0,0 +1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
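The RowData-to-Row conversion used by TestRowDataReaderFunction above can be summarized in a standalone sketch; rowData is a placeholder value, and the converter types are my reading of the Flink table-conversion utilities the test imports:

    RowType rowType = FlinkSchemaUtil.convert(TestFixtures.SCHEMA);
    DataStructureConverter<Object, Object> converter =
        DataStructureConverters.getConverter(TypeConversions.fromLogicalToDataType(rowType));
    Row row = (Row) converter.toExternal(rowData);
    // The resulting Row values can then be compared against generic Records with
    // TestHelpers.assertRecords, as the test above does.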
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.reader; + +import java.util.Map; +import org.apache.flink.metrics.Counter; +import org.apache.flink.metrics.Gauge; +import org.apache.flink.metrics.MetricGroup; +import org.apache.flink.metrics.SimpleCounter; +import org.apache.flink.metrics.groups.OperatorIOMetricGroup; +import org.apache.flink.metrics.groups.SourceReaderMetricGroup; +import org.apache.flink.metrics.groups.UnregisteredMetricsGroup; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; + +class TestingMetricGroup extends UnregisteredMetricsGroup implements SourceReaderMetricGroup { + private final Map counters; + + TestingMetricGroup() { + this.counters = Maps.newHashMap(); + } + + /** Pass along the reference to share the map for child metric groups. */ + private TestingMetricGroup(Map counters) { + this.counters = counters; + } + + Map counters() { + return counters; + } + + @Override + public Counter counter(String name) { + Counter counter = new SimpleCounter(); + counters.put(name, counter); + return counter; + } + + @Override + public MetricGroup addGroup(String name) { + return new TestingMetricGroup(counters); + } + + @Override + public MetricGroup addGroup(String key, String value) { + return new TestingMetricGroup(counters); + } + + @Override + public OperatorIOMetricGroup getIOMetricGroup() { + return new TestingOperatorIOMetricGroup(); + } + + @Override + public Counter getNumRecordsInErrorsCounter() { + return new SimpleCounter(); + } + + @Override + public void setPendingBytesGauge(Gauge pendingBytesGauge) {} + + @Override + public void setPendingRecordsGauge(Gauge pendingRecordsGauge) {} + + private static class TestingOperatorIOMetricGroup extends UnregisteredMetricsGroup + implements OperatorIOMetricGroup { + @Override + public Counter getNumRecordsInCounter() { + return new SimpleCounter(); + } + + @Override + public Counter getNumRecordsOutCounter() { + return new SimpleCounter(); + } + + @Override + public Counter getNumBytesInCounter() { + return new SimpleCounter(); + } + + @Override + public Counter getNumBytesOutCounter() { + return new SimpleCounter(); + } + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/split/TestIcebergSourceSplitSerializer.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/split/TestIcebergSourceSplitSerializer.java new file mode 100644 index 000000000000..12bacdcd074d --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/source/split/TestIcebergSourceSplitSerializer.java @@ -0,0 +1,183 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
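The point of sharing one counter map across child groups in TestingMetricGroup is that counters registered anywhere in the hierarchy stay visible to the test; a small illustrative sketch, assuming package-private access as in the test package:

    TestingMetricGroup group = new TestingMetricGroup();
    group.addGroup("reader").counter("assignedSplits").inc();
    // Child groups reuse the same backing map, so the counter is visible at the root:
    long assigned = group.counters().get("assignedSplits").getCount(); // 1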
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.source.split; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.nio.file.Path; +import java.util.List; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.stream.Collectors; +import org.apache.iceberg.FileScanTask; +import org.apache.iceberg.flink.source.SplitHelpers; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +public class TestIcebergSourceSplitSerializer { + + @TempDir protected Path temporaryFolder; + + private final IcebergSourceSplitSerializer serializer = new IcebergSourceSplitSerializer(true); + + @Test + public void testLatestVersion() throws Exception { + serializeAndDeserialize(1, 1); + serializeAndDeserialize(10, 2); + } + + private void serializeAndDeserialize(int splitCount, int filesPerSplit) throws Exception { + final List splits = + SplitHelpers.createSplitsFromTransientHadoopTable( + temporaryFolder, splitCount, filesPerSplit); + for (IcebergSourceSplit split : splits) { + byte[] result = serializer.serialize(split); + IcebergSourceSplit deserialized = serializer.deserialize(serializer.getVersion(), result); + assertSplitEquals(split, deserialized); + + byte[] cachedResult = serializer.serialize(split); + assertThat(cachedResult).isSameAs(result); + IcebergSourceSplit deserialized2 = + serializer.deserialize(serializer.getVersion(), cachedResult); + assertSplitEquals(split, deserialized2); + + split.updatePosition(0, 100); + byte[] resultAfterUpdatePosition = serializer.serialize(split); + // after position change, serialized bytes should have changed + assertThat(resultAfterUpdatePosition).isNotSameAs(cachedResult); + IcebergSourceSplit deserialized3 = + serializer.deserialize(serializer.getVersion(), resultAfterUpdatePosition); + assertSplitEquals(split, deserialized3); + } + } + + @Test + public void testV1() throws Exception { + serializeAndDeserializeV1(1, 1); + serializeAndDeserializeV1(10, 2); + } + + private void serializeAndDeserializeV1(int splitCount, int filesPerSplit) throws Exception { + final List splits = + SplitHelpers.createSplitsFromTransientHadoopTable( + temporaryFolder, splitCount, filesPerSplit); + for (IcebergSourceSplit split : splits) { + byte[] result = split.serializeV1(); + IcebergSourceSplit deserialized = IcebergSourceSplit.deserializeV1(result); + assertSplitEquals(split, deserialized); + } + } + + @Test + public void testV2() throws Exception { + serializeAndDeserializeV2(1, 1); + serializeAndDeserializeV2(10, 2); + } + + private void serializeAndDeserializeV2(int splitCount, int filesPerSplit) throws Exception { + final List splits = + SplitHelpers.createSplitsFromTransientHadoopTable( + temporaryFolder, splitCount, filesPerSplit); + for (IcebergSourceSplit split : splits) { + byte[] result = split.serializeV2(); + IcebergSourceSplit deserialized = IcebergSourceSplit.deserializeV2(result, true); + assertSplitEquals(split, deserialized); + } + } + + @Test + public void testV3WithTooManyDeleteFiles() throws Exception { + 
serializeAndDeserializeV3(1, 1, 5000); + } + + private void serializeAndDeserializeV3(int splitCount, int filesPerSplit, int mockDeletesPerSplit) + throws Exception { + final List splits = + SplitHelpers.createSplitsFromTransientHadoopTable( + temporaryFolder, splitCount, filesPerSplit); + final List splitsWithMockDeleteFiles = + SplitHelpers.equipSplitsWithMockDeleteFiles(splits, temporaryFolder, mockDeletesPerSplit); + + for (IcebergSourceSplit split : splitsWithMockDeleteFiles) { + byte[] result = split.serializeV3(); + IcebergSourceSplit deserialized = IcebergSourceSplit.deserializeV3(result, true); + assertSplitEquals(split, deserialized); + } + } + + @Test + public void testDeserializeV1() throws Exception { + final List splits = + SplitHelpers.createSplitsFromTransientHadoopTable(temporaryFolder, 1, 1); + for (IcebergSourceSplit split : splits) { + byte[] result = split.serializeV1(); + IcebergSourceSplit deserialized = serializer.deserialize(1, result); + assertSplitEquals(split, deserialized); + } + } + + @Test + public void testCheckpointedPosition() throws Exception { + final AtomicInteger index = new AtomicInteger(); + final List splits = + SplitHelpers.createSplitsFromTransientHadoopTable(temporaryFolder, 10, 2).stream() + .map( + split -> { + IcebergSourceSplit result; + if (index.get() % 2 == 0) { + result = IcebergSourceSplit.fromCombinedScanTask(split.task(), 1, 1); + } else { + result = split; + } + index.incrementAndGet(); + return result; + }) + .collect(Collectors.toList()); + + for (IcebergSourceSplit split : splits) { + byte[] result = serializer.serialize(split); + IcebergSourceSplit deserialized = serializer.deserialize(serializer.getVersion(), result); + assertSplitEquals(split, deserialized); + + byte[] cachedResult = serializer.serialize(split); + assertThat(cachedResult).isSameAs(result); + IcebergSourceSplit deserialized2 = + serializer.deserialize(serializer.getVersion(), cachedResult); + assertSplitEquals(split, deserialized2); + } + } + + private void assertSplitEquals(IcebergSourceSplit expected, IcebergSourceSplit actual) { + List expectedTasks = Lists.newArrayList(expected.task().tasks().iterator()); + List actualTasks = Lists.newArrayList(actual.task().tasks().iterator()); + assertThat(actualTasks).hasSameSizeAs(expectedTasks); + for (int i = 0; i < expectedTasks.size(); ++i) { + FileScanTask expectedTask = expectedTasks.get(i); + FileScanTask actualTask = actualTasks.get(i); + assertThat(actualTask.file().path()).isEqualTo(expectedTask.file().path()); + assertThat(actualTask.sizeBytes()).isEqualTo(expectedTask.sizeBytes()); + assertThat(actualTask.filesCount()).isEqualTo(expectedTask.filesCount()); + assertThat(actualTask.start()).isEqualTo(expectedTask.start()); + assertThat(actualTask.length()).isEqualTo(expectedTask.length()); + } + + assertThat(actual.fileOffset()).isEqualTo(expected.fileOffset()); + assertThat(actual.recordOffset()).isEqualTo(expected.recordOffset()); + } +} diff --git a/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/util/TestFlinkPackage.java b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/util/TestFlinkPackage.java new file mode 100644 index 000000000000..4ba4f9d983dc --- /dev/null +++ b/flink/v1.19/flink/src/test/java/org/apache/iceberg/flink/util/TestFlinkPackage.java @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
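A round-trip sketch of the serializer contract checked above; IOException handling is left to the caller and split stands for a planned IcebergSourceSplit:

    static IcebergSourceSplit roundTrip(IcebergSourceSplit split) throws IOException {
      IcebergSourceSplitSerializer serializer = new IcebergSourceSplitSerializer(true);
      byte[] bytes = serializer.serialize(split);
      // serialize() returns a cached byte[] until updatePosition() changes the split,
      // which is what the isSameAs/isNotSameAs assertions above verify.
      return serializer.deserialize(serializer.getVersion(), bytes);
    }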
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.util; + +import static org.assertj.core.api.Assertions.assertThat; + +import org.junit.jupiter.api.Test; +import org.mockito.MockedStatic; +import org.mockito.Mockito; + +public class TestFlinkPackage { + + /** This unit test would need to be adjusted as new Flink version is supported. */ + @Test + public void testVersion() { + assertThat(FlinkPackage.version()).isEqualTo("1.19.0"); + } + + @Test + public void testDefaultVersion() { + // It's difficult to reproduce a reflection error in a unit test, so we just inject a mocked + // fault to test the default logic + + // First make sure we're not caching a version result from a previous test + FlinkPackage.setVersion(null); + try (MockedStatic<FlinkPackage> mockedStatic = Mockito.mockStatic(FlinkPackage.class)) { + mockedStatic.when(FlinkPackage::versionFromJar).thenThrow(RuntimeException.class); + mockedStatic.when(FlinkPackage::version).thenCallRealMethod(); + assertThat(FlinkPackage.version()).isEqualTo(FlinkPackage.FLINK_UNKNOWN_VERSION); + } + FlinkPackage.setVersion(null); + try (MockedStatic<FlinkPackage> mockedStatic = Mockito.mockStatic(FlinkPackage.class)) { + mockedStatic.when(FlinkPackage::versionFromJar).thenReturn(null); + mockedStatic.when(FlinkPackage::version).thenCallRealMethod(); + FlinkPackage.setVersion(null); + assertThat(FlinkPackage.version()).isEqualTo(FlinkPackage.FLINK_UNKNOWN_VERSION); + } + } +} diff --git a/flink/v1.19/flink/src/test/resources/META-INF/services/org.apache.flink.table.factories.Factory b/flink/v1.19/flink/src/test/resources/META-INF/services/org.apache.flink.table.factories.Factory new file mode 100644 index 000000000000..47a3c94aa991 --- /dev/null +++ b/flink/v1.19/flink/src/test/resources/META-INF/services/org.apache.flink.table.factories.Factory @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
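The static-mocking pattern in TestFlinkPackage above is plain Mockito; its shape, with the FlinkPackage names taken from the test, is roughly:

    FlinkPackage.setVersion(null); // clear any version cached by earlier tests
    try (MockedStatic<FlinkPackage> mocked = Mockito.mockStatic(FlinkPackage.class)) {
      mocked.when(FlinkPackage::versionFromJar).thenThrow(RuntimeException.class);
      mocked.when(FlinkPackage::version).thenCallRealMethod();
      // Inside this block version() falls back to FLINK_UNKNOWN_VERSION.
    }
    // Closing the MockedStatic (via try-with-resources) restores the real static behavior.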
+ +org.apache.iceberg.flink.source.BoundedTableFactory From 0d8f2c42ff0d1eff6a6d05f248da7085163ac93f Mon Sep 17 00:00:00 2001 From: Steven Wu Date: Mon, 5 Aug 2024 08:58:45 -0700 Subject: [PATCH 41/55] Flink: remove v1.17 module --- flink/v1.17/build.gradle | 268 ---- flink/v1.17/flink-runtime/LICENSE | 502 ------- flink/v1.17/flink-runtime/NOTICE | 91 -- .../flink/IcebergConnectorSmokeTest.java | 21 - .../shuffle/MapRangePartitionerBenchmark.java | 207 --- .../apache/iceberg/flink/CatalogLoader.java | 215 --- .../apache/iceberg/flink/FlinkCatalog.java | 834 ------------ .../iceberg/flink/FlinkCatalogFactory.java | 213 --- .../apache/iceberg/flink/FlinkConfParser.java | 261 ---- .../iceberg/flink/FlinkConfigOptions.java | 107 -- .../flink/FlinkDynamicTableFactory.java | 208 --- .../flink/FlinkEnvironmentContext.java | 31 - .../apache/iceberg/flink/FlinkFilters.java | 266 ---- .../apache/iceberg/flink/FlinkFixupTypes.java | 50 - .../apache/iceberg/flink/FlinkReadConf.java | 213 --- .../iceberg/flink/FlinkReadOptions.java | 123 -- .../apache/iceberg/flink/FlinkSchemaUtil.java | 232 ---- .../iceberg/flink/FlinkSourceFilter.java | 49 - .../apache/iceberg/flink/FlinkTypeToType.java | 203 --- .../iceberg/flink/FlinkTypeVisitor.java | 80 -- .../apache/iceberg/flink/FlinkWriteConf.java | 205 --- .../iceberg/flink/FlinkWriteOptions.java | 73 - .../iceberg/flink/IcebergTableSink.java | 121 -- .../apache/iceberg/flink/RowDataWrapper.java | 142 -- .../org/apache/iceberg/flink/TableLoader.java | 159 --- .../apache/iceberg/flink/TypeToFlinkType.java | 134 -- .../apache/iceberg/flink/actions/Actions.java | 52 - .../flink/actions/RewriteDataFilesAction.java | 72 - .../data/AvroWithFlinkSchemaVisitor.java | 75 -- .../iceberg/flink/data/FlinkAvroReader.java | 169 --- .../iceberg/flink/data/FlinkAvroWriter.java | 165 --- .../iceberg/flink/data/FlinkOrcReader.java | 131 -- .../iceberg/flink/data/FlinkOrcReaders.java | 283 ---- .../iceberg/flink/data/FlinkOrcWriter.java | 163 --- .../iceberg/flink/data/FlinkOrcWriters.java | 317 ----- .../flink/data/FlinkParquetReaders.java | 905 ------------- .../flink/data/FlinkParquetWriters.java | 504 ------- .../flink/data/FlinkSchemaVisitor.java | 161 --- .../iceberg/flink/data/FlinkValueReaders.java | 312 ----- .../iceberg/flink/data/FlinkValueWriters.java | 253 ---- .../data/ParquetWithFlinkSchemaVisitor.java | 222 ---- .../iceberg/flink/data/RowDataProjection.java | 341 ----- .../iceberg/flink/data/RowDataUtil.java | 123 -- .../iceberg/flink/data/StructRowData.java | 300 ----- .../AvroGenericRecordToRowDataMapper.java | 61 - .../flink/sink/BaseDeltaTaskWriter.java | 126 -- .../sink/BucketPartitionKeySelector.java | 70 - .../iceberg/flink/sink/BucketPartitioner.java | 103 -- .../flink/sink/BucketPartitionerUtil.java | 125 -- .../flink/sink/CachingTableSupplier.java | 91 -- .../iceberg/flink/sink/CommitSummary.java | 93 -- .../iceberg/flink/sink/DeltaManifests.java | 71 - .../flink/sink/DeltaManifestsSerializer.java | 122 -- .../flink/sink/EqualityFieldKeySelector.java | 86 -- .../flink/sink/FlinkAppenderFactory.java | 280 ---- .../flink/sink/FlinkFileWriterFactory.java | 293 ----- .../iceberg/flink/sink/FlinkManifestUtil.java | 132 -- .../apache/iceberg/flink/sink/FlinkSink.java | 654 --------- .../flink/sink/IcebergFilesCommitter.java | 516 -------- .../sink/IcebergFilesCommitterMetrics.java | 96 -- .../flink/sink/IcebergStreamWriter.java | 120 -- .../sink/IcebergStreamWriterMetrics.java | 89 -- .../flink/sink/ManifestOutputFileFactory.java | 94 -- 
.../flink/sink/PartitionKeySelector.java | 64 - .../flink/sink/PartitionedDeltaWriter.java | 97 -- .../flink/sink/RowDataTaskWriterFactory.java | 244 ---- .../iceberg/flink/sink/TaskWriterFactory.java | 45 - .../flink/sink/UnpartitionedDeltaWriter.java | 69 - .../shuffle/AggregatedStatisticsTracker.java | 262 ---- .../sink/shuffle/CompletedStatistics.java | 111 -- .../CompletedStatisticsSerializer.java | 178 --- .../flink/sink/shuffle/DataStatistics.java | 48 - .../shuffle/DataStatisticsCoordinator.java | 522 -------- .../DataStatisticsCoordinatorProvider.java | 70 - .../sink/shuffle/DataStatisticsOperator.java | 265 ---- .../shuffle/DataStatisticsSerializer.java | 206 --- .../flink/sink/shuffle/GlobalStatistics.java | 114 -- .../shuffle/GlobalStatisticsSerializer.java | 201 --- .../flink/sink/shuffle/KeyAssignment.java | 155 --- .../flink/sink/shuffle/MapAssignment.java | 242 ---- .../flink/sink/shuffle/MapDataStatistics.java | 88 -- .../sink/shuffle/MapRangePartitioner.java | 95 -- .../flink/sink/shuffle/RangePartitioner.java | 110 -- .../shuffle/RequestGlobalStatisticsEvent.java | 38 - .../sink/shuffle/SketchDataStatistics.java | 87 -- .../sink/shuffle/SketchRangePartitioner.java | 51 - .../flink/sink/shuffle/SketchUtil.java | 159 --- .../flink/sink/shuffle/SortKeySerializer.java | 373 ------ .../sink/shuffle/SortKeySketchSerializer.java | 143 -- .../flink/sink/shuffle/SortKeyUtil.java | 59 - .../flink/sink/shuffle/StatisticsEvent.java | 76 -- .../sink/shuffle/StatisticsOrRecord.java | 112 -- .../shuffle/StatisticsOrRecordSerializer.java | 210 --- .../flink/sink/shuffle/StatisticsType.java | 55 - .../flink/sink/shuffle/StatisticsUtil.java | 126 -- .../AvroGenericRecordFileScanTaskReader.java | 42 - .../iceberg/flink/source/DataIterator.java | 156 --- .../iceberg/flink/source/DataTaskReader.java | 47 - .../flink/source/FileScanTaskReader.java | 35 - .../flink/source/FlinkInputFormat.java | 141 -- .../iceberg/flink/source/FlinkInputSplit.java | 48 - .../iceberg/flink/source/FlinkSource.java | 307 ----- .../flink/source/FlinkSplitPlanner.java | 189 --- .../iceberg/flink/source/IcebergSource.java | 549 -------- .../flink/source/IcebergTableSource.java | 229 ---- .../source/RowDataFileScanTaskReader.java | 243 ---- .../iceberg/flink/source/RowDataRewriter.java | 172 --- .../RowDataToAvroGenericRecordConverter.java | 70 - .../iceberg/flink/source/ScanContext.java | 597 --------- .../iceberg/flink/source/SourceUtil.java | 77 -- .../source/StreamingMonitorFunction.java | 269 ---- .../flink/source/StreamingReaderOperator.java | 246 ---- .../source/StreamingStartingStrategy.java | 54 - .../source/assigner/DefaultSplitAssigner.java | 112 -- .../flink/source/assigner/GetSplitResult.java | 77 -- .../assigner/OrderedSplitAssignerFactory.java | 46 - .../assigner/SimpleSplitAssignerFactory.java | 37 - .../flink/source/assigner/SplitAssigner.java | 118 -- .../source/assigner/SplitAssignerFactory.java | 30 - .../source/assigner/SplitAssignerType.java | 33 - .../enumerator/AbstractIcebergEnumerator.java | 182 --- .../ContinuousEnumerationResult.java | 57 - .../ContinuousIcebergEnumerator.java | 178 --- .../enumerator/ContinuousSplitPlanner.java | 30 - .../ContinuousSplitPlannerImpl.java | 240 ---- .../source/enumerator/EnumerationHistory.java | 100 -- .../enumerator/IcebergEnumeratorPosition.java | 79 -- .../IcebergEnumeratorPositionSerializer.java | 90 -- .../enumerator/IcebergEnumeratorState.java | 65 - .../IcebergEnumeratorStateSerializer.java | 194 --- .../enumerator/StaticIcebergEnumerator.java | 51 
- .../source/reader/ArrayBatchRecords.java | 171 --- .../reader/ArrayPoolDataIteratorBatcher.java | 130 -- .../AvroGenericRecordReaderFunction.java | 102 -- .../reader/ColumnStatsWatermarkExtractor.java | 98 -- .../source/reader/DataIteratorBatcher.java | 36 - .../reader/DataIteratorReaderFunction.java | 43 - .../source/reader/IcebergSourceReader.java | 77 -- .../reader/IcebergSourceReaderMetrics.java | 61 - .../reader/IcebergSourceSplitReader.java | 167 --- .../source/reader/LimitableDataIterator.java | 56 - .../flink/source/reader/ListBatchRecords.java | 85 -- .../reader/ListDataIteratorBatcher.java | 94 -- .../source/reader/MetaDataReaderFunction.java | 65 - .../flink/source/reader/ReaderFunction.java | 31 - .../source/reader/RecordAndPosition.java | 78 -- .../flink/source/reader/RecordFactory.java | 34 - .../flink/source/reader/RecordLimiter.java | 45 - .../source/reader/RowDataReaderFunction.java | 115 -- .../source/reader/RowDataRecordFactory.java | 73 - .../reader/SerializableRecordEmitter.java | 40 - .../reader/SplitWatermarkExtractor.java | 28 - .../WatermarkExtractorRecordEmitter.java | 67 - .../source/split/IcebergSourceSplit.java | 220 ---- .../split/IcebergSourceSplitSerializer.java | 62 - .../source/split/IcebergSourceSplitState.java | 37 - .../split/IcebergSourceSplitStatus.java | 25 - .../source/split/SerializableComparator.java | 24 - .../flink/source/split/SerializerHelper.java | 206 --- .../flink/source/split/SplitComparators.java | 76 -- .../flink/source/split/SplitRequestEvent.java | 54 - .../flink/util/FlinkAlterTableUtil.java | 248 ---- .../flink/util/FlinkCompatibilityUtil.java | 47 - .../iceberg/flink/util/FlinkPackage.java | 61 - .../org.apache.flink.table.factories.Factory | 16 - ....apache.flink.table.factories.TableFactory | 16 - .../flink/AvroGenericRecordConverterBase.java | 90 -- .../apache/iceberg/flink/CatalogTestBase.java | 143 -- .../apache/iceberg/flink/DataGenerator.java | 42 - .../apache/iceberg/flink/DataGenerators.java | 1172 ----------------- .../iceberg/flink/HadoopCatalogExtension.java | 105 -- .../iceberg/flink/HadoopTableExtension.java | 59 - .../flink/MiniFlinkClusterExtension.java | 67 - .../iceberg/flink/RowDataConverter.java | 135 -- .../apache/iceberg/flink/SimpleDataUtil.java | 439 ------ .../org/apache/iceberg/flink/TestBase.java | 128 -- .../iceberg/flink/TestCatalogLoader.java | 116 -- .../iceberg/flink/TestCatalogTableLoader.java | 113 -- .../iceberg/flink/TestChangeLogTable.java | 296 ----- .../flink/TestDataFileSerialization.java | 203 --- .../apache/iceberg/flink/TestFixtures.java | 61 - .../flink/TestFlinkAnonymousTable.java | 65 - .../flink/TestFlinkCatalogDatabase.java | 253 ---- .../flink/TestFlinkCatalogFactory.java | 119 -- .../iceberg/flink/TestFlinkCatalogTable.java | 669 ---------- .../TestFlinkCatalogTablePartitions.java | 119 -- .../iceberg/flink/TestFlinkConfParser.java | 61 - .../iceberg/flink/TestFlinkFilters.java | 462 ------- .../iceberg/flink/TestFlinkHiveCatalog.java | 101 -- .../iceberg/flink/TestFlinkSchemaUtil.java | 416 ------ .../iceberg/flink/TestFlinkTableSink.java | 358 ----- .../apache/iceberg/flink/TestFlinkUpsert.java | 334 ----- .../org/apache/iceberg/flink/TestHelpers.java | 632 --------- .../iceberg/flink/TestIcebergConnector.java | 343 ----- .../flink/TestManifestFileSerialization.java | 173 --- .../iceberg/flink/TestRowDataWrapper.java | 93 -- .../apache/iceberg/flink/TestTableLoader.java | 57 - .../iceberg/flink/TestTableSerialization.java | 110 -- .../actions/TestRewriteDataFilesAction.java | 481 
------- .../iceberg/flink/data/RandomRowData.java | 38 - .../flink/data/RowDataToRowMapper.java | 50 - .../flink/data/TestFlinkAvroReaderWriter.java | 185 --- .../flink/data/TestFlinkOrcReaderWriter.java | 107 -- .../flink/data/TestFlinkParquetReader.java | 239 ---- .../flink/data/TestFlinkParquetWriter.java | 94 -- .../flink/data/TestRowDataProjection.java | 593 --------- .../iceberg/flink/data/TestRowProjection.java | 594 --------- .../iceberg/flink/data/TestStructRowData.java | 100 -- .../TestAvroGenericRecordToRowDataMapper.java | 38 - .../sink/TestBucketPartitionKeySelector.java | 67 - .../flink/sink/TestBucketPartitioner.java | 108 -- ...TestBucketPartitionerFlinkIcebergSink.java | 227 ---- .../flink/sink/TestBucketPartitionerUtil.java | 126 -- .../flink/sink/TestCachingTableSupplier.java | 81 -- .../flink/sink/TestCompressionSettings.java | 257 ---- .../flink/sink/TestDeltaTaskWriter.java | 429 ------ .../flink/sink/TestFlinkAppenderFactory.java | 65 - .../sink/TestFlinkFileWriterFactory.java | 66 - .../flink/sink/TestFlinkIcebergSink.java | 385 ------ .../flink/sink/TestFlinkIcebergSinkBase.java | 64 - .../sink/TestFlinkIcebergSinkBranch.java | 137 -- .../flink/sink/TestFlinkIcebergSinkV2.java | 235 ---- .../sink/TestFlinkIcebergSinkV2Base.java | 389 ------ .../sink/TestFlinkIcebergSinkV2Branch.java | 125 -- .../iceberg/flink/sink/TestFlinkManifest.java | 312 ----- .../sink/TestFlinkPartitioningWriters.java | 66 - .../sink/TestFlinkPositionDeltaWriters.java | 66 - .../sink/TestFlinkRollingFileWriters.java | 51 - .../flink/sink/TestFlinkWriterMetrics.java | 60 - .../flink/sink/TestIcebergFilesCommitter.java | 1148 ---------------- .../flink/sink/TestIcebergStreamWriter.java | 390 ------ .../flink/sink/TestRowDataPartitionKey.java | 251 ---- .../iceberg/flink/sink/TestTaskWriters.java | 242 ---- .../iceberg/flink/sink/shuffle/Fixtures.java | 100 -- .../TestAggregatedStatisticsTracker.java | 465 ------- .../TestCompletedStatisticsSerializer.java | 54 - .../TestDataStatisticsCoordinator.java | 246 ---- ...TestDataStatisticsCoordinatorProvider.java | 187 --- .../shuffle/TestDataStatisticsOperator.java | 350 ----- .../shuffle/TestDataStatisticsSerializer.java | 53 - .../TestGlobalStatisticsSerializer.java | 59 - .../sink/shuffle/TestMapDataStatistics.java | 67 - .../sink/shuffle/TestMapRangePartitioner.java | 434 ------ .../sink/shuffle/TestRangePartitioner.java | 65 - .../shuffle/TestSketchDataStatistics.java | 60 - .../shuffle/TestSketchRangePartitioner.java | 88 -- .../flink/sink/shuffle/TestSketchUtil.java | 189 --- .../shuffle/TestSortKeySerializerBase.java | 65 - .../TestSortKeySerializerNestedStruct.java | 55 - .../TestSortKeySerializerPrimitives.java | 90 -- .../TestSortKeySerializerSnapshot.java | 213 --- .../flink/sink/shuffle/TestSortKeyUtil.java | 73 - .../flink/source/BoundedTableFactory.java | 170 --- .../flink/source/BoundedTestSource.java | 108 -- .../flink/source/ChangeLogTableTestBase.java | 95 -- .../iceberg/flink/source/SplitHelpers.java | 200 --- .../iceberg/flink/source/SqlHelpers.java | 60 - .../flink/source/TableSourceTestBase.java | 104 -- .../flink/source/TestBoundedTableFactory.java | 81 -- .../flink/source/TestFlinkInputFormat.java | 211 --- .../TestFlinkInputFormatReaderDeletes.java | 70 - .../flink/source/TestFlinkMergingMetrics.java | 67 - .../flink/source/TestFlinkMetaDataTable.java | 813 ------------ .../source/TestFlinkReaderDeletesBase.java | 90 -- .../iceberg/flink/source/TestFlinkScan.java | 540 -------- .../flink/source/TestFlinkScanSql.java | 69 - 
.../iceberg/flink/source/TestFlinkSource.java | 90 -- .../flink/source/TestFlinkSourceConfig.java | 61 - .../flink/source/TestFlinkSourceSql.java | 85 -- .../flink/source/TestFlinkTableSource.java | 561 -------- .../source/TestIcebergSourceBounded.java | 147 --- ...TestIcebergSourceBoundedGenericRecord.java | 196 --- .../source/TestIcebergSourceBoundedSql.java | 76 -- .../source/TestIcebergSourceContinuous.java | 538 -------- .../source/TestIcebergSourceFailover.java | 394 ------ ...gSourceFailoverWithWatermarkExtractor.java | 130 -- .../TestIcebergSourceReaderDeletes.java | 102 -- .../flink/source/TestIcebergSourceSql.java | 158 --- ...stIcebergSourceWithWatermarkExtractor.java | 408 ------ ...estIcebergSpeculativeExecutionSupport.java | 184 --- .../TestMetadataTableReadableMetrics.java | 299 ----- .../flink/source/TestProjectMetaColumn.java | 188 --- ...stRowDataToAvroGenericRecordConverter.java | 36 - .../iceberg/flink/source/TestScanContext.java | 112 -- .../iceberg/flink/source/TestSourceUtil.java | 61 - .../iceberg/flink/source/TestSqlBase.java | 160 --- .../flink/source/TestStreamScanSql.java | 434 ------ .../source/TestStreamingMonitorFunction.java | 402 ------ .../source/TestStreamingReaderOperator.java | 293 ----- .../assigner/SplitAssignerTestBase.java | 132 -- .../assigner/TestDefaultSplitAssigner.java | 43 - ...tFileSequenceNumberBasedSplitAssigner.java | 81 -- .../TestWatermarkBasedSplitAssigner.java | 146 -- .../ManualContinuousSplitPlanner.java | 97 -- .../TestContinuousIcebergEnumerator.java | 352 ----- .../TestContinuousSplitPlannerImpl.java | 692 ---------- ...ntinuousSplitPlannerImplStartStrategy.java | 200 --- .../enumerator/TestEnumerationHistory.java | 135 -- .../TestIcebergEnumeratorStateSerializer.java | 146 -- .../source/reader/ReaderFunctionTestBase.java | 218 --- .../flink/source/reader/ReaderUtil.java | 128 -- .../source/reader/TestArrayBatchRecords.java | 69 - ...stArrayPoolDataIteratorBatcherRowData.java | 360 ----- .../TestColumnStatsWatermarkExtractor.java | 176 --- .../reader/TestIcebergSourceReader.java | 181 --- .../reader/TestLimitableDataIterator.java | 84 -- .../reader/TestRowDataReaderFunction.java | 69 - .../source/reader/TestingMetricGroup.java | 102 -- .../TestIcebergSourceSplitSerializer.java | 183 --- .../iceberg/flink/util/TestFlinkPackage.java | 55 - .../org.apache.flink.table.factories.Factory | 16 - 311 files changed, 55202 deletions(-) delete mode 100644 flink/v1.17/build.gradle delete mode 100644 flink/v1.17/flink-runtime/LICENSE delete mode 100644 flink/v1.17/flink-runtime/NOTICE delete mode 100644 flink/v1.17/flink-runtime/src/integration/java/org/apache/iceberg/flink/IcebergConnectorSmokeTest.java delete mode 100644 flink/v1.17/flink/src/jmh/java/org/apache/iceberg/flink/sink/shuffle/MapRangePartitionerBenchmark.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/CatalogLoader.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalog.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalogFactory.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/FlinkConfParser.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/FlinkConfigOptions.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/FlinkDynamicTableFactory.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/FlinkEnvironmentContext.java delete mode 100644 
flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/FlinkFilters.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/FlinkFixupTypes.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/FlinkReadConf.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/FlinkReadOptions.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/FlinkSchemaUtil.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/FlinkSourceFilter.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeToType.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeVisitor.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteConf.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteOptions.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/IcebergTableSink.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/RowDataWrapper.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/TableLoader.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/TypeToFlinkType.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/actions/Actions.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/actions/RewriteDataFilesAction.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/data/AvroWithFlinkSchemaVisitor.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/data/FlinkAvroReader.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/data/FlinkAvroWriter.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReaders.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriter.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriters.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetReaders.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetWriters.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/data/FlinkSchemaVisitor.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueReaders.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueWriters.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/data/ParquetWithFlinkSchemaVisitor.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/data/RowDataProjection.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/data/StructRowData.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/AvroGenericRecordToRowDataMapper.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/BaseDeltaTaskWriter.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitionKeySelector.java delete 
mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitioner.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitionerUtil.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/CachingTableSupplier.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/CommitSummary.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifests.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifestsSerializer.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/EqualityFieldKeySelector.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkAppenderFactory.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkFileWriterFactory.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkManifestUtil.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkSink.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitter.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitterMetrics.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriter.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriterMetrics.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/ManifestOutputFileFactory.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionKeySelector.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionedDeltaWriter.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/RowDataTaskWriterFactory.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/TaskWriterFactory.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/UnpartitionedDeltaWriter.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/AggregatedStatisticsTracker.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/CompletedStatistics.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/CompletedStatisticsSerializer.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatistics.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsCoordinator.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsCoordinatorProvider.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsOperator.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsSerializer.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/GlobalStatistics.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/GlobalStatisticsSerializer.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/KeyAssignment.java delete mode 
100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapAssignment.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapDataStatistics.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapRangePartitioner.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/RangePartitioner.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/RequestGlobalStatisticsEvent.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchDataStatistics.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchRangePartitioner.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchUtil.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeySerializer.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeySketchSerializer.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeyUtil.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsEvent.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsOrRecord.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsOrRecordSerializer.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsType.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsUtil.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/AvroGenericRecordFileScanTaskReader.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/DataIterator.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/DataTaskReader.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/FileScanTaskReader.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputFormat.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputSplit.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSource.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSplitPlanner.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/IcebergSource.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/IcebergTableSource.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/RowDataFileScanTaskReader.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/RowDataRewriter.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/RowDataToAvroGenericRecordConverter.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/ScanContext.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/SourceUtil.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/StreamingMonitorFunction.java delete mode 100644 
flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/StreamingReaderOperator.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/StreamingStartingStrategy.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/assigner/DefaultSplitAssigner.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/assigner/GetSplitResult.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/assigner/OrderedSplitAssignerFactory.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SimpleSplitAssignerFactory.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssigner.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssignerFactory.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssignerType.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/AbstractIcebergEnumerator.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousEnumerationResult.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousIcebergEnumerator.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousSplitPlanner.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousSplitPlannerImpl.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/EnumerationHistory.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorPosition.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorPositionSerializer.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorState.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorStateSerializer.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/StaticIcebergEnumerator.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayBatchRecords.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayPoolDataIteratorBatcher.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/AvroGenericRecordReaderFunction.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/ColumnStatsWatermarkExtractor.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorBatcher.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorReaderFunction.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceReader.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceReaderMetrics.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceSplitReader.java delete mode 100644 
flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/LimitableDataIterator.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/ListBatchRecords.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/ListDataIteratorBatcher.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/MetaDataReaderFunction.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/ReaderFunction.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordAndPosition.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordFactory.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordLimiter.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataReaderFunction.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataRecordFactory.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/SerializableRecordEmitter.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/SplitWatermarkExtractor.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/WatermarkExtractorRecordEmitter.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplit.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitSerializer.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitState.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitStatus.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/split/SerializableComparator.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/split/SerializerHelper.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/split/SplitComparators.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/split/SplitRequestEvent.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/util/FlinkAlterTableUtil.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/util/FlinkCompatibilityUtil.java delete mode 100644 flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/util/FlinkPackage.java delete mode 100644 flink/v1.17/flink/src/main/resources/META-INF/services/org.apache.flink.table.factories.Factory delete mode 100644 flink/v1.17/flink/src/main/resources/META-INF/services/org.apache.flink.table.factories.TableFactory delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/AvroGenericRecordConverterBase.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/CatalogTestBase.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/DataGenerator.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/DataGenerators.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/HadoopCatalogExtension.java delete mode 100644 
flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/HadoopTableExtension.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/MiniFlinkClusterExtension.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/RowDataConverter.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/SimpleDataUtil.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestBase.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestCatalogLoader.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestCatalogTableLoader.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestChangeLogTable.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestDataFileSerialization.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestFixtures.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestFlinkAnonymousTable.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogDatabase.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogFactory.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTable.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTablePartitions.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestFlinkConfParser.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestFlinkFilters.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestFlinkHiveCatalog.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestFlinkSchemaUtil.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSink.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestFlinkUpsert.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestHelpers.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestIcebergConnector.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestManifestFileSerialization.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestRowDataWrapper.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestTableLoader.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestTableSerialization.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/actions/TestRewriteDataFilesAction.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/data/RandomRowData.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/data/RowDataToRowMapper.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkAvroReaderWriter.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkOrcReaderWriter.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetReader.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetWriter.java delete mode 100644 
flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/data/TestRowDataProjection.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/data/TestRowProjection.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/data/TestStructRowData.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestAvroGenericRecordToRowDataMapper.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionKeySelector.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitioner.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionerFlinkIcebergSink.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionerUtil.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestCachingTableSupplier.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestCompressionSettings.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestDeltaTaskWriter.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkAppenderFactory.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkFileWriterFactory.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSink.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkBase.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkBranch.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2Base.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2Branch.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkManifest.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPartitioningWriters.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPositionDeltaWriters.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkRollingFileWriters.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkWriterMetrics.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergFilesCommitter.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergStreamWriter.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestRowDataPartitionKey.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestTaskWriters.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/Fixtures.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestAggregatedStatisticsTracker.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestCompletedStatisticsSerializer.java delete mode 100644 
flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsCoordinator.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsCoordinatorProvider.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsOperator.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsSerializer.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestGlobalStatisticsSerializer.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestMapDataStatistics.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestMapRangePartitioner.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestRangePartitioner.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchDataStatistics.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchRangePartitioner.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchUtil.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerBase.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerNestedStruct.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerPrimitives.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerSnapshot.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeyUtil.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTableFactory.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTestSource.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/ChangeLogTableTestBase.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/SplitHelpers.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/SqlHelpers.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TableSourceTestBase.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestBoundedTableFactory.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormat.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormatReaderDeletes.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkMergingMetrics.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkMetaDataTable.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkReaderDeletesBase.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScan.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScanSql.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSource.java delete mode 
100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSourceConfig.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSourceSql.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkTableSource.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBounded.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedGenericRecord.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedSql.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceContinuous.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceFailover.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceFailoverWithWatermarkExtractor.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceReaderDeletes.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceSql.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceWithWatermarkExtractor.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSpeculativeExecutionSupport.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestMetadataTableReadableMetrics.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestProjectMetaColumn.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestRowDataToAvroGenericRecordConverter.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestScanContext.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestSourceUtil.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestSqlBase.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamScanSql.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingMonitorFunction.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingReaderOperator.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/assigner/SplitAssignerTestBase.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestDefaultSplitAssigner.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestFileSequenceNumberBasedSplitAssigner.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestWatermarkBasedSplitAssigner.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/ManualContinuousSplitPlanner.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousIcebergEnumerator.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousSplitPlannerImpl.java delete mode 100644 
flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousSplitPlannerImplStartStrategy.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestEnumerationHistory.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestIcebergEnumeratorStateSerializer.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderFunctionTestBase.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderUtil.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestArrayBatchRecords.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestArrayPoolDataIteratorBatcherRowData.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestColumnStatsWatermarkExtractor.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestIcebergSourceReader.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestLimitableDataIterator.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestRowDataReaderFunction.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestingMetricGroup.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/split/TestIcebergSourceSplitSerializer.java delete mode 100644 flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/util/TestFlinkPackage.java delete mode 100644 flink/v1.17/flink/src/test/resources/META-INF/services/org.apache.flink.table.factories.Factory diff --git a/flink/v1.17/build.gradle b/flink/v1.17/build.gradle deleted file mode 100644 index 0278e4dc3b73..000000000000 --- a/flink/v1.17/build.gradle +++ /dev/null @@ -1,268 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -String flinkMajorVersion = '1.17' -String scalaVersion = System.getProperty("scalaVersion") != null ? 
System.getProperty("scalaVersion") : System.getProperty("defaultScalaVersion") - -project(":iceberg-flink:iceberg-flink-${flinkMajorVersion}") { - - dependencies { - implementation project(path: ':iceberg-bundled-guava', configuration: 'shadow') - api project(':iceberg-api') - implementation project(':iceberg-common') - implementation project(':iceberg-core') - api project(':iceberg-data') - implementation project(':iceberg-orc') - implementation project(':iceberg-parquet') - implementation project(':iceberg-hive-metastore') - - compileOnly libs.flink117.avro - // for dropwizard histogram metrics implementation - compileOnly libs.flink117.metrics.dropwizard - compileOnly libs.flink117.streaming.java - compileOnly "${libs.flink117.streaming.java.get().module}:${libs.flink117.streaming.java.get().getVersion()}:tests" - compileOnly libs.flink117.table.api.java.bridge - compileOnly "org.apache.flink:flink-table-planner_${scalaVersion}:${libs.versions.flink117.get()}" - compileOnly libs.flink117.connector.base - compileOnly libs.flink117.connector.files - - compileOnly libs.hadoop2.hdfs - compileOnly libs.hadoop2.common - compileOnly(libs.hadoop2.minicluster) { - exclude group: 'org.apache.avro', module: 'avro' - } - - implementation(libs.parquet.avro) { - exclude group: 'org.apache.avro', module: 'avro' - // already shaded by Parquet - exclude group: 'it.unimi.dsi' - exclude group: 'org.codehaus.jackson' - } - - compileOnly libs.avro.avro - - implementation("${libs.orc.core.get().module}:${libs.versions.orc.get()}:nohive") { - exclude group: 'org.apache.hadoop' - exclude group: 'commons-lang' - // These artifacts are shaded and included in the orc-core fat jar - exclude group: 'com.google.protobuf', module: 'protobuf-java' - exclude group: 'org.apache.hive', module: 'hive-storage-api' - exclude group: 'org.slf4j' - } - - implementation libs.datasketches - - testImplementation libs.flink117.connector.test.utils - testImplementation libs.flink117.core - testImplementation libs.flink117.runtime - testImplementation(libs.flink117.test.utilsjunit) { - exclude group: 'junit' - } - testImplementation(libs.flink117.test.utils) { - exclude group: "org.apache.curator", module: 'curator-test' - exclude group: 'junit' - } - - testImplementation project(path: ':iceberg-hive-metastore', configuration: 'testArtifacts') - testImplementation project(path: ':iceberg-api', configuration: 'testArtifacts') - testImplementation project(path: ':iceberg-core', configuration: 'testArtifacts') - testImplementation project(path: ':iceberg-data', configuration: 'testArtifacts') - - // By default, hive-exec is a fat/uber jar and it exports a guava library - // that's really old. We use the core classifier to be able to override our guava - // version. Luckily, hive-exec seems to work okay so far with this version of guava - // See: https://github.com/apache/hive/blob/master/ql/pom.xml#L911 for more context. 
- testImplementation("${libs.hive2.exec.get().module}:${libs.hive2.exec.get().getVersion()}:core") { - exclude group: 'org.apache.avro', module: 'avro' - exclude group: 'org.slf4j', module: 'slf4j-log4j12' - exclude group: 'org.pentaho' // missing dependency - exclude group: 'org.apache.hive', module: 'hive-llap-tez' - exclude group: 'org.apache.logging.log4j' - exclude group: 'com.google.protobuf', module: 'protobuf-java' - exclude group: 'org.apache.calcite' - exclude group: 'org.apache.calcite.avatica' - exclude group: 'com.google.code.findbugs', module: 'jsr305' - } - - testImplementation(libs.hive2.metastore) { - exclude group: 'org.apache.avro', module: 'avro' - exclude group: 'org.slf4j', module: 'slf4j-log4j12' - exclude group: 'org.pentaho' // missing dependency - exclude group: 'org.apache.hbase' - exclude group: 'org.apache.logging.log4j' - exclude group: 'co.cask.tephra' - exclude group: 'com.google.code.findbugs', module: 'jsr305' - exclude group: 'org.eclipse.jetty.aggregate', module: 'jetty-all' - exclude group: 'org.eclipse.jetty.orbit', module: 'javax.servlet' - exclude group: 'org.apache.parquet', module: 'parquet-hadoop-bundle' - exclude group: 'com.tdunning', module: 'json' - exclude group: 'javax.transaction', module: 'transaction-api' - exclude group: 'com.zaxxer', module: 'HikariCP' - exclude group: 'org.slf4j' - } - - testImplementation libs.awaitility - testImplementation libs.assertj.core - } - - test { - useJUnitPlatform() - } -} - -project(":iceberg-flink:iceberg-flink-runtime-${flinkMajorVersion}") { - apply plugin: 'io.github.goooler.shadow' - - tasks.jar.dependsOn tasks.shadowJar - - sourceSets { - integration { - java.srcDir "$projectDir/src/integration/java" - resources.srcDir "$projectDir/src/integration/resources" - } - } - - configurations { - implementation { - // included in Flink - exclude group: 'org.slf4j' - exclude group: 'org.apache.commons' - exclude group: 'commons-pool' - exclude group: 'commons-codec' - exclude group: 'org.xerial.snappy' - exclude group: 'javax.xml.bind' - exclude group: 'javax.annotation' - } - } - - dependencies { - implementation(project(":iceberg-flink:iceberg-flink-${flinkMajorVersion}")) { - exclude group: 'org.apache.flink' - } - implementation project(':iceberg-aws') - implementation project(':iceberg-azure') - implementation(project(':iceberg-aliyun')) { - exclude group: 'edu.umd.cs.findbugs', module: 'findbugs' - exclude group: 'org.apache.httpcomponents', module: 'httpclient' - exclude group: 'commons-logging', module: 'commons-logging' - } - implementation project(':iceberg-gcp') - implementation(project(':iceberg-nessie')) { - exclude group: 'com.google.code.findbugs', module: 'jsr305' - } - - // for dropwizard histogram metrics implementation - implementation libs.flink117.metrics.dropwizard - - // for integration testing with the flink-runtime-jar - // all of those dependencies are required because the integration test extends FlinkTestBase - integrationCompileOnly project(':iceberg-api') - integrationImplementation libs.junit.vintage.engine - integrationImplementation libs.assertj.core - integrationImplementation project(path: ":iceberg-flink:iceberg-flink-${flinkMajorVersion}", configuration: "testArtifacts") - integrationImplementation project(path: ':iceberg-api', configuration: 'testArtifacts') - integrationImplementation project(path: ':iceberg-hive-metastore', configuration: 'testArtifacts') - integrationImplementation(libs.flink117.test.utils) { - exclude group: "org.apache.curator", module: 
'curator-test' - exclude group: 'junit' - } - - integrationImplementation libs.flink117.table.api.java.bridge - integrationImplementation "org.apache.flink:flink-table-planner_${scalaVersion}:${libs.versions.flink117.get()}" - - integrationImplementation libs.hadoop2.common - integrationImplementation libs.hadoop2.hdfs - integrationImplementation(libs.hadoop2.minicluster) { - exclude group: 'org.apache.avro', module: 'avro' - } - - integrationImplementation(libs.hive2.metastore) { - exclude group: 'org.apache.avro', module: 'avro' - exclude group: 'org.slf4j', module: 'slf4j-log4j12' - exclude group: 'org.pentaho' // missing dependency - exclude group: 'org.apache.hbase' - exclude group: 'org.apache.logging.log4j' - exclude group: 'co.cask.tephra' - exclude group: 'com.google.code.findbugs', module: 'jsr305' - exclude group: 'org.eclipse.jetty.aggregate', module: 'jetty-all' - exclude group: 'org.eclipse.jetty.orbit', module: 'javax.servlet' - exclude group: 'org.apache.parquet', module: 'parquet-hadoop-bundle' - exclude group: 'com.tdunning', module: 'json' - exclude group: 'javax.transaction', module: 'transaction-api' - exclude group: 'com.zaxxer', module: 'HikariCP' - exclude group: 'org.slf4j' - } - - integrationImplementation("${libs.hive2.exec.get().module}:${libs.hive2.exec.get().getVersion()}:core") { - exclude group: 'org.apache.avro', module: 'avro' - exclude group: 'org.slf4j', module: 'slf4j-log4j12' - exclude group: 'org.pentaho' // missing dependency - exclude group: 'org.apache.hive', module: 'hive-llap-tez' - exclude group: 'org.apache.logging.log4j' - exclude group: 'com.google.protobuf', module: 'protobuf-java' - exclude group: 'org.apache.calcite' - exclude group: 'org.apache.calcite.avatica' - exclude group: 'com.google.code.findbugs', module: 'jsr305' - } - } - - shadowJar { - configurations = [project.configurations.runtimeClasspath] - - zip64 true - - // include the LICENSE and NOTICE files for the shaded Jar - from(projectDir) { - include 'LICENSE' - include 'NOTICE' - } - - // Relocate dependencies to avoid conflicts - relocate 'org.apache.avro', 'org.apache.iceberg.shaded.org.apache.avro' - relocate 'org.apache.parquet', 'org.apache.iceberg.shaded.org.apache.parquet' - relocate 'com.google.errorprone', 'org.apache.iceberg.shaded.com.google.errorprone' - relocate 'com.google.flatbuffers', 'org.apache.iceberg.shaded.com.google.flatbuffers' - relocate 'com.fasterxml', 'org.apache.iceberg.shaded.com.fasterxml' - relocate 'com.github.benmanes', 'org.apache.iceberg.shaded.com.github.benmanes' - relocate 'org.checkerframework', 'org.apache.iceberg.shaded.org.checkerframework' - relocate 'shaded.parquet', 'org.apache.iceberg.shaded.org.apache.parquet.shaded' - relocate 'org.apache.orc', 'org.apache.iceberg.shaded.org.apache.orc' - relocate 'io.airlift', 'org.apache.iceberg.shaded.io.airlift' - relocate 'org.threeten.extra', 'org.apache.iceberg.shaded.org.threeten.extra' - relocate 'org.apache.hc.client5', 'org.apache.iceberg.shaded.org.apache.hc.client5' - relocate 'org.apache.hc.core5', 'org.apache.iceberg.shaded.org.apache.hc.core5' - - archiveClassifier.set(null) - } - - task integrationTest(type: Test) { - description = "Test Flink Runtime Jar against Flink ${flinkMajorVersion}" - group = "verification" - jvmArgs += project.property('extraJvmArgs') - testClassesDirs = sourceSets.integration.output.classesDirs - classpath = sourceSets.integration.runtimeClasspath + files(shadowJar.archiveFile.get().asFile.path) - inputs.file(shadowJar.archiveFile.get().asFile.path) - 
} - integrationTest.dependsOn shadowJar - check.dependsOn integrationTest - - jar { - enabled = false - } -} diff --git a/flink/v1.17/flink-runtime/LICENSE b/flink/v1.17/flink-runtime/LICENSE deleted file mode 100644 index 8ab53469eb87..000000000000 --- a/flink/v1.17/flink-runtime/LICENSE +++ /dev/null @@ -1,502 +0,0 @@ - - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." 
- - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. 
- - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. 
We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. - --------------------------------------------------------------------------------- - -This binary artifact contains Apache Avro. - -Copyright: 2014-2020 The Apache Software Foundation. -Home page: https://parquet.apache.org/ -License: http://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This binary artifact contains the Jackson JSON processor. - -Copyright: 2007-2020 Tatu Saloranta and other contributors -Home page: http://jackson.codehaus.org/ -License: http://www.apache.org/licenses/LICENSE-2.0.txt - --------------------------------------------------------------------------------- - -This binary artifact contains Apache Parquet. - -Copyright: 2014-2020 The Apache Software Foundation. -Home page: https://parquet.apache.org/ -License: http://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This binary artifact contains Apache Thrift. - -Copyright: 2006-2010 The Apache Software Foundation. -Home page: https://thrift.apache.org/ -License: http://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This binary artifact contains fastutil. - -Copyright: 2002-2014 Sebastiano Vigna -Home page: http://fastutil.di.unimi.it/ -License: http://www.apache.org/licenses/LICENSE-2.0.html - --------------------------------------------------------------------------------- - -This binary artifact contains Apache ORC. - -Copyright: 2013-2020 The Apache Software Foundation. -Home page: https://orc.apache.org/ -License: http://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This binary artifact contains Apache Hive's storage API via ORC. - -Copyright: 2013-2020 The Apache Software Foundation. -Home page: https://hive.apache.org/ -License: http://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This binary artifact contains Airlift Aircompressor. - -Copyright: 2011-2020 Aircompressor authors. -Home page: https://github.com/airlift/aircompressor -License: http://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This binary artifact contains Airlift Slice. - -Copyright: 2013-2020 Slice authors. -Home page: https://github.com/airlift/slice -License: http://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This binary artifact contains JetBrains annotations. - -Copyright: 2000-2020 JetBrains s.r.o. 
-Home page: https://github.com/JetBrains/java-annotations -License: http://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This binary artifact contains Google Guava. - -Copyright: 2006-2020 The Guava Authors -Home page: https://github.com/google/guava -License: http://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This binary artifact contains Google Error Prone Annotations. - -Copyright: Copyright 2011-2019 The Error Prone Authors -Home page: https://github.com/google/error-prone -License: http://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This binary artifact contains checkerframework checker-qual Annotations. - -Copyright: 2004-2020 the Checker Framework developers -Home page: https://github.com/typetools/checker-framework -License: https://github.com/typetools/checker-framework/blob/master/LICENSE.txt (MIT license) - -License text: -| The annotations are licensed under the MIT License. (The text of this -| license appears below.) More specifically, all the parts of the Checker -| Framework that you might want to include with your own program use the -| MIT License. This is the checker-qual.jar file and all the files that -| appear in it: every file in a qual/ directory, plus utility files such -| as NullnessUtil.java, RegexUtil.java, SignednessUtil.java, etc. -| In addition, the cleanroom implementations of third-party annotations, -| which the Checker Framework recognizes as aliases for its own -| annotations, are licensed under the MIT License. -| -| Permission is hereby granted, free of charge, to any person obtaining a copy -| of this software and associated documentation files (the "Software"), to deal -| in the Software without restriction, including without limitation the rights -| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -| copies of the Software, and to permit persons to whom the Software is -| furnished to do so, subject to the following conditions: -| -| The above copyright notice and this permission notice shall be included in -| all copies or substantial portions of the Software. -| -| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -| THE SOFTWARE. - --------------------------------------------------------------------------------- - -This binary artifact contains Animal Sniffer Annotations. - -Copyright: 2009-2018 codehaus.org -Home page: https://www.mojohaus.org/animal-sniffer/animal-sniffer-annotations/ -License: https://www.mojohaus.org/animal-sniffer/animal-sniffer-annotations/license.html (MIT license) - -License text: -| The MIT License -| -| Copyright (c) 2009 codehaus.org. 
-| -| Permission is hereby granted, free of charge, to any person obtaining a copy -| of this software and associated documentation files (the "Software"), to deal -| in the Software without restriction, including without limitation the rights -| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -| copies of the Software, and to permit persons to whom the Software is -| furnished to do so, subject to the following conditions: -| -| The above copyright notice and this permission notice shall be included in -| all copies or substantial portions of the Software. -| -| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -| THE SOFTWARE. - --------------------------------------------------------------------------------- - -This binary artifact contains Caffeine by Ben Manes. - -Copyright: 2014-2020 Ben Manes and contributors -Home page: https://github.com/ben-manes/caffeine -License: http://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This binary artifact contains Apache Yetus audience annotations. - -Copyright: 2008-2020 The Apache Software Foundation. -Home page: https://yetus.apache.org/ -License: http://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This binary artifact contains Google protobuf. - -Copyright: 2008 Google Inc. -Home page: https://developers.google.com/protocol-buffers -License: https://github.com/protocolbuffers/protobuf/blob/master/LICENSE (BSD) - -License text: - -| Copyright 2008 Google Inc. All rights reserved. -| -| Redistribution and use in source and binary forms, with or without -| modification, are permitted provided that the following conditions are -| met: -| -| * Redistributions of source code must retain the above copyright -| notice, this list of conditions and the following disclaimer. -| * Redistributions in binary form must reproduce the above -| copyright notice, this list of conditions and the following disclaimer -| in the documentation and/or other materials provided with the -| distribution. -| * Neither the name of Google Inc. nor the names of its -| contributors may be used to endorse or promote products derived from -| this software without specific prior written permission. -| -| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -| "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -| OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -| SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -| LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -| DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -| THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -| (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-| -| Code generated by the Protocol Buffer compiler is owned by the owner -| of the input file used when generating it. This code is not -| standalone and requires a support library to be linked with it. This -| support library is itself covered by the above license. - --------------------------------------------------------------------------------- - -This binary artifact contains ThreeTen. - -Copyright: 2007-present, Stephen Colebourne & Michael Nascimento Santos. -Home page: https://www.threeten.org/threeten-extra/ -License: https://github.com/ThreeTen/threeten-extra/blob/master/LICENSE.txt (BSD 3-clause) - -License text: - -| All rights reserved. -| -| * Redistribution and use in source and binary forms, with or without -| modification, are permitted provided that the following conditions are met: -| -| * Redistributions of source code must retain the above copyright notice, -| this list of conditions and the following disclaimer. -| -| * Redistributions in binary form must reproduce the above copyright notice, -| this list of conditions and the following disclaimer in the documentation -| and/or other materials provided with the distribution. -| -| * Neither the name of JSR-310 nor the names of its contributors -| may be used to endorse or promote products derived from this software -| without specific prior written permission. -| -| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -| "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -| CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -| EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -| PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -| PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -| LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -| NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -| SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - --------------------------------------------------------------------------------- - -This binary artifact includes Project Nessie with the following in its NOTICE -file: - -| Dremio -| Copyright 2015-2017 Dremio Corporation -| -| This product includes software developed at -| The Apache Software Foundation (http://www.apache.org/). - --------------------------------------------------------------------------------- - -This binary includes code from Apache Commons. - -* Core ArrayUtil. - -Copyright: 2020 The Apache Software Foundation -Home page: https://commons.apache.org/ -License: https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This binary artifact contains Apache HttpComponents Client. - -Copyright: 1999-2022 The Apache Software Foundation. -Home page: https://hc.apache.org/ -License: http://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product includes code from Apache HttpComponents Client. - -* retry and error handling logic in ExponentialHttpRequestRetryStrategy.java - -Copyright: 1999-2022 The Apache Software Foundation. 
-Home page: https://hc.apache.org/ -License: https://www.apache.org/licenses/LICENSE-2.0 diff --git a/flink/v1.17/flink-runtime/NOTICE b/flink/v1.17/flink-runtime/NOTICE deleted file mode 100644 index dc36f84c4ac5..000000000000 --- a/flink/v1.17/flink-runtime/NOTICE +++ /dev/null @@ -1,91 +0,0 @@ - -Apache Iceberg -Copyright 2017-2024 The Apache Software Foundation - -This product includes software developed at -The Apache Software Foundation (http://www.apache.org/). - --------------------------------------------------------------------------------- - -This binary artifact includes Apache ORC with the following in its NOTICE file: - -| Apache ORC -| Copyright 2013-2019 The Apache Software Foundation -| -| This product includes software developed by The Apache Software -| Foundation (http://www.apache.org/). -| -| This product includes software developed by Hewlett-Packard: -| (c) Copyright [2014-2015] Hewlett-Packard Development Company, L.P - --------------------------------------------------------------------------------- - -This binary artifact includes Airlift Aircompressor with the following in its -NOTICE file: - -| Snappy Copyright Notices -| ========================= -| -| * Copyright 2011 Dain Sundstrom -| * Copyright 2011, Google Inc. -| -| -| Snappy License -| =============== -| Copyright 2011, Google Inc. -| All rights reserved. -| -| Redistribution and use in source and binary forms, with or without -| modification, are permitted provided that the following conditions are -| met: -| -| * Redistributions of source code must retain the above copyright -| notice, this list of conditions and the following disclaimer. -| * Redistributions in binary form must reproduce the above -| copyright notice, this list of conditions and the following disclaimer -| in the documentation and/or other materials provided with the -| distribution. -| * Neither the name of Google Inc. nor the names of its -| contributors may be used to endorse or promote products derived from -| this software without specific prior written permission. -| -| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -| "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -| OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -| SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -| LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -| DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -| THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -| (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - --------------------------------------------------------------------------------- - -This binary artifact includes Apache Yetus with the following in its NOTICE -file: - -| Apache Yetus -| Copyright 2008-2020 The Apache Software Foundation -| -| This product includes software developed at -| The Apache Software Foundation (https://www.apache.org/). -| -| --- -| Additional licenses for the Apache Yetus Source/Website: -| --- -| -| -| See LICENSE for terms. 
- --------------------------------------------------------------------------------- - -This binary artifact includes Project Nessie with the following in its NOTICE -file: - -| Dremio -| Copyright 2015-2017 Dremio Corporation -| -| This product includes software developed at -| The Apache Software Foundation (http://www.apache.org/). diff --git a/flink/v1.17/flink-runtime/src/integration/java/org/apache/iceberg/flink/IcebergConnectorSmokeTest.java b/flink/v1.17/flink-runtime/src/integration/java/org/apache/iceberg/flink/IcebergConnectorSmokeTest.java deleted file mode 100644 index 0d9bbf9d3601..000000000000 --- a/flink/v1.17/flink-runtime/src/integration/java/org/apache/iceberg/flink/IcebergConnectorSmokeTest.java +++ /dev/null @@ -1,21 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -public class IcebergConnectorSmokeTest extends TestIcebergConnector {} diff --git a/flink/v1.17/flink/src/jmh/java/org/apache/iceberg/flink/sink/shuffle/MapRangePartitionerBenchmark.java b/flink/v1.17/flink/src/jmh/java/org/apache/iceberg/flink/sink/shuffle/MapRangePartitionerBenchmark.java deleted file mode 100644 index a9ad386a5a4a..000000000000 --- a/flink/v1.17/flink/src/jmh/java/org/apache/iceberg/flink/sink/shuffle/MapRangePartitionerBenchmark.java +++ /dev/null @@ -1,207 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink.shuffle; - -import java.nio.charset.StandardCharsets; -import java.util.Comparator; -import java.util.List; -import java.util.Map; -import java.util.NavigableMap; -import java.util.concurrent.ThreadLocalRandom; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.Schema; -import org.apache.iceberg.SortKey; -import org.apache.iceberg.SortOrder; -import org.apache.iceberg.SortOrderComparators; -import org.apache.iceberg.StructLike; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.types.Types; -import org.openjdk.jmh.annotations.Benchmark; -import org.openjdk.jmh.annotations.BenchmarkMode; -import org.openjdk.jmh.annotations.Fork; -import org.openjdk.jmh.annotations.Measurement; -import org.openjdk.jmh.annotations.Mode; -import org.openjdk.jmh.annotations.Scope; -import org.openjdk.jmh.annotations.Setup; -import org.openjdk.jmh.annotations.State; -import org.openjdk.jmh.annotations.TearDown; -import org.openjdk.jmh.annotations.Threads; -import org.openjdk.jmh.annotations.Warmup; -import org.openjdk.jmh.infra.Blackhole; - -@Fork(1) -@State(Scope.Benchmark) -@Warmup(iterations = 3) -@Measurement(iterations = 5) -@BenchmarkMode(Mode.SingleShotTime) -public class MapRangePartitionerBenchmark { - private static final String CHARS = - "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_-.!?"; - private static final int SAMPLE_SIZE = 100_000; - private static final Schema SCHEMA = - new Schema( - Types.NestedField.required(1, "id", Types.IntegerType.get()), - Types.NestedField.required(2, "name2", Types.StringType.get()), - Types.NestedField.required(3, "name3", Types.StringType.get()), - Types.NestedField.required(4, "name4", Types.StringType.get()), - Types.NestedField.required(5, "name5", Types.StringType.get()), - Types.NestedField.required(6, "name6", Types.StringType.get()), - Types.NestedField.required(7, "name7", Types.StringType.get()), - Types.NestedField.required(8, "name8", Types.StringType.get()), - Types.NestedField.required(9, "name9", Types.StringType.get())); - - private static final SortOrder SORT_ORDER = SortOrder.builderFor(SCHEMA).asc("id").build(); - private static final Comparator SORT_ORDER_COMPARTOR = - SortOrderComparators.forSchema(SCHEMA, SORT_ORDER); - private static final SortKey SORT_KEY = new SortKey(SCHEMA, SORT_ORDER); - - private MapRangePartitioner partitioner; - private RowData[] rows; - - @Setup - public void setupBenchmark() { - NavigableMap weights = longTailDistribution(100_000, 24, 240, 100, 2.0); - Map mapStatistics = Maps.newHashMapWithExpectedSize(weights.size()); - weights.forEach( - (id, weight) -> { - SortKey sortKey = SORT_KEY.copy(); - sortKey.set(0, id); - mapStatistics.put(sortKey, weight); - }); - - MapAssignment mapAssignment = - MapAssignment.fromKeyFrequency(2, mapStatistics, 0.0, SORT_ORDER_COMPARTOR); - this.partitioner = - new MapRangePartitioner( - SCHEMA, SortOrder.builderFor(SCHEMA).asc("id").build(), mapAssignment); - - List keys = Lists.newArrayList(weights.keySet().iterator()); - long[] weightsCDF = new long[keys.size()]; - long totalWeight = 0; - for (int i = 0; i < keys.size(); ++i) { - totalWeight += weights.get(keys.get(i)); - weightsCDF[i] = totalWeight; - } - - // pre-calculate the samples for benchmark run - 
this.rows = new GenericRowData[SAMPLE_SIZE]; - for (int i = 0; i < SAMPLE_SIZE; ++i) { - long weight = ThreadLocalRandom.current().nextLong(totalWeight); - int index = binarySearchIndex(weightsCDF, weight); - rows[i] = - GenericRowData.of( - keys.get(index), - randomString("name2-"), - randomString("name3-"), - randomString("name4-"), - randomString("name5-"), - randomString("name6-"), - randomString("name7-"), - randomString("name8-"), - randomString("name9-")); - } - } - - @TearDown - public void tearDownBenchmark() {} - - @Benchmark - @Threads(1) - public void testPartitionerLongTailDistribution(Blackhole blackhole) { - for (int i = 0; i < SAMPLE_SIZE; ++i) { - blackhole.consume(partitioner.partition(rows[i], 128)); - } - } - - private static String randomString(String prefix) { - int length = ThreadLocalRandom.current().nextInt(200); - byte[] buffer = new byte[length]; - - for (int i = 0; i < length; i += 1) { - buffer[i] = (byte) CHARS.charAt(ThreadLocalRandom.current().nextInt(CHARS.length())); - } - - // CHARS is all ASCII - return prefix + new String(buffer, StandardCharsets.US_ASCII); - } - - /** find the index where weightsUDF[index] < weight && weightsUDF[index+1] >= weight */ - private static int binarySearchIndex(long[] weightsUDF, long target) { - Preconditions.checkArgument( - target < weightsUDF[weightsUDF.length - 1], - "weight is out of range: total weight = %s, search target = %s", - weightsUDF[weightsUDF.length - 1], - target); - int start = 0; - int end = weightsUDF.length - 1; - while (start < end) { - int mid = (start + end) / 2; - if (weightsUDF[mid] < target && weightsUDF[mid + 1] >= target) { - return mid; - } - - if (weightsUDF[mid] >= target) { - end = mid - 1; - } else if (weightsUDF[mid + 1] < target) { - start = mid + 1; - } - } - return start; - } - - /** Key is the id string and value is the weight in long value. */ - private static NavigableMap longTailDistribution( - long startingWeight, - int longTailStartingIndex, - int longTailLength, - long longTailBaseWeight, - double weightRandomJitterPercentage) { - - NavigableMap weights = Maps.newTreeMap(); - - // first part just decays the weight by half - long currentWeight = startingWeight; - for (int index = 0; index < longTailStartingIndex; ++index) { - double jitter = ThreadLocalRandom.current().nextDouble(weightRandomJitterPercentage / 100); - long weight = (long) (currentWeight * (1.0 + jitter)); - weight = weight > 0 ? weight : 1; - weights.put(index, weight); - if (currentWeight > longTailBaseWeight) { - currentWeight = currentWeight / 2; - } - } - - // long tail part - for (int index = longTailStartingIndex; - index < longTailStartingIndex + longTailLength; - ++index) { - long longTailWeight = - (long) - (longTailBaseWeight - * ThreadLocalRandom.current().nextDouble(weightRandomJitterPercentage)); - longTailWeight = longTailWeight > 0 ? longTailWeight : 1; - weights.put(index, longTailWeight); - } - - return weights; - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/CatalogLoader.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/CatalogLoader.java deleted file mode 100644 index 18473bf4f190..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/CatalogLoader.java +++ /dev/null @@ -1,215 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import java.io.Serializable; -import java.util.Map; -import org.apache.hadoop.conf.Configuration; -import org.apache.iceberg.CatalogProperties; -import org.apache.iceberg.CatalogUtil; -import org.apache.iceberg.catalog.Catalog; -import org.apache.iceberg.hadoop.HadoopCatalog; -import org.apache.iceberg.hadoop.SerializableConfiguration; -import org.apache.iceberg.hive.HiveCatalog; -import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.rest.RESTCatalog; - -/** Serializable loader to load an Iceberg {@link Catalog}. */ -public interface CatalogLoader extends Serializable, Cloneable { - - /** - * Create a new catalog with the provided properties. NOTICE: for flink, we may initialize the - * {@link CatalogLoader} at flink sql client side or job manager side, and then serialize this - * catalog loader to task manager, finally deserialize it and create a new catalog at task manager - * side. - * - * @return a newly created {@link Catalog} - */ - Catalog loadCatalog(); - - /** Clone a CatalogLoader. 
*/ - @SuppressWarnings({"checkstyle:NoClone", "checkstyle:SuperClone"}) - CatalogLoader clone(); - - static CatalogLoader hadoop( - String name, Configuration hadoopConf, Map properties) { - return new HadoopCatalogLoader(name, hadoopConf, properties); - } - - static CatalogLoader hive(String name, Configuration hadoopConf, Map properties) { - return new HiveCatalogLoader(name, hadoopConf, properties); - } - - static CatalogLoader rest(String name, Configuration hadoopConf, Map properties) { - return new RESTCatalogLoader(name, hadoopConf, properties); - } - - static CatalogLoader custom( - String name, Map properties, Configuration hadoopConf, String impl) { - return new CustomCatalogLoader(name, properties, hadoopConf, impl); - } - - class HadoopCatalogLoader implements CatalogLoader { - private final String catalogName; - private final SerializableConfiguration hadoopConf; - private final String warehouseLocation; - private final Map properties; - - private HadoopCatalogLoader( - String catalogName, Configuration conf, Map properties) { - this.catalogName = catalogName; - this.hadoopConf = new SerializableConfiguration(conf); - this.warehouseLocation = properties.get(CatalogProperties.WAREHOUSE_LOCATION); - this.properties = Maps.newHashMap(properties); - } - - @Override - public Catalog loadCatalog() { - return CatalogUtil.loadCatalog( - HadoopCatalog.class.getName(), catalogName, properties, hadoopConf.get()); - } - - @Override - @SuppressWarnings({"checkstyle:NoClone", "checkstyle:SuperClone"}) - public CatalogLoader clone() { - return new HadoopCatalogLoader(catalogName, new Configuration(hadoopConf.get()), properties); - } - - @Override - public String toString() { - return MoreObjects.toStringHelper(this) - .add("catalogName", catalogName) - .add("warehouseLocation", warehouseLocation) - .toString(); - } - } - - class HiveCatalogLoader implements CatalogLoader { - private final String catalogName; - private final SerializableConfiguration hadoopConf; - private final String uri; - private final String warehouse; - private final int clientPoolSize; - private final Map properties; - - private HiveCatalogLoader( - String catalogName, Configuration conf, Map properties) { - this.catalogName = catalogName; - this.hadoopConf = new SerializableConfiguration(conf); - this.uri = properties.get(CatalogProperties.URI); - this.warehouse = properties.get(CatalogProperties.WAREHOUSE_LOCATION); - this.clientPoolSize = - properties.containsKey(CatalogProperties.CLIENT_POOL_SIZE) - ? 
Integer.parseInt(properties.get(CatalogProperties.CLIENT_POOL_SIZE)) - : CatalogProperties.CLIENT_POOL_SIZE_DEFAULT; - this.properties = Maps.newHashMap(properties); - } - - @Override - public Catalog loadCatalog() { - return CatalogUtil.loadCatalog( - HiveCatalog.class.getName(), catalogName, properties, hadoopConf.get()); - } - - @Override - @SuppressWarnings({"checkstyle:NoClone", "checkstyle:SuperClone"}) - public CatalogLoader clone() { - return new HiveCatalogLoader(catalogName, new Configuration(hadoopConf.get()), properties); - } - - @Override - public String toString() { - return MoreObjects.toStringHelper(this) - .add("catalogName", catalogName) - .add("uri", uri) - .add("warehouse", warehouse) - .add("clientPoolSize", clientPoolSize) - .toString(); - } - } - - class RESTCatalogLoader implements CatalogLoader { - private final String catalogName; - private final SerializableConfiguration hadoopConf; - private final Map properties; - - private RESTCatalogLoader( - String catalogName, Configuration conf, Map properties) { - this.catalogName = catalogName; - this.hadoopConf = new SerializableConfiguration(conf); - this.properties = Maps.newHashMap(properties); - } - - @Override - public Catalog loadCatalog() { - return CatalogUtil.loadCatalog( - RESTCatalog.class.getName(), catalogName, properties, hadoopConf.get()); - } - - @Override - @SuppressWarnings({"checkstyle:NoClone", "checkstyle:SuperClone"}) - public CatalogLoader clone() { - return new RESTCatalogLoader(catalogName, new Configuration(hadoopConf.get()), properties); - } - - @Override - public String toString() { - return MoreObjects.toStringHelper(this) - .add("catalogName", catalogName) - .add("properties", properties) - .toString(); - } - } - - class CustomCatalogLoader implements CatalogLoader { - - private final SerializableConfiguration hadoopConf; - private final Map properties; - private final String name; - private final String impl; - - private CustomCatalogLoader( - String name, Map properties, Configuration conf, String impl) { - this.hadoopConf = new SerializableConfiguration(conf); - this.properties = Maps.newHashMap(properties); // wrap into a hashmap for serialization - this.name = name; - this.impl = - Preconditions.checkNotNull( - impl, "Cannot initialize custom Catalog, impl class name is null"); - } - - @Override - public Catalog loadCatalog() { - return CatalogUtil.loadCatalog(impl, name, properties, hadoopConf.get()); - } - - @Override - @SuppressWarnings({"checkstyle:NoClone", "checkstyle:SuperClone"}) - public CatalogLoader clone() { - return new CustomCatalogLoader(name, properties, new Configuration(hadoopConf.get()), impl); - } - - @Override - public String toString() { - return MoreObjects.toStringHelper(this).add("name", name).add("impl", impl).toString(); - } - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalog.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalog.java deleted file mode 100644 index 988465a38f4d..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalog.java +++ /dev/null @@ -1,834 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import java.io.Closeable; -import java.io.IOException; -import java.util.Collections; -import java.util.List; -import java.util.Map; -import java.util.Objects; -import java.util.Optional; -import java.util.Set; -import java.util.stream.Collectors; -import org.apache.flink.table.api.TableSchema; -import org.apache.flink.table.catalog.AbstractCatalog; -import org.apache.flink.table.catalog.CatalogBaseTable; -import org.apache.flink.table.catalog.CatalogDatabase; -import org.apache.flink.table.catalog.CatalogDatabaseImpl; -import org.apache.flink.table.catalog.CatalogFunction; -import org.apache.flink.table.catalog.CatalogPartition; -import org.apache.flink.table.catalog.CatalogPartitionSpec; -import org.apache.flink.table.catalog.CatalogTable; -import org.apache.flink.table.catalog.CatalogTableImpl; -import org.apache.flink.table.catalog.ObjectPath; -import org.apache.flink.table.catalog.ResolvedCatalogTable; -import org.apache.flink.table.catalog.TableChange; -import org.apache.flink.table.catalog.exceptions.CatalogException; -import org.apache.flink.table.catalog.exceptions.DatabaseAlreadyExistException; -import org.apache.flink.table.catalog.exceptions.DatabaseNotEmptyException; -import org.apache.flink.table.catalog.exceptions.DatabaseNotExistException; -import org.apache.flink.table.catalog.exceptions.FunctionNotExistException; -import org.apache.flink.table.catalog.exceptions.TableAlreadyExistException; -import org.apache.flink.table.catalog.exceptions.TableNotExistException; -import org.apache.flink.table.catalog.exceptions.TableNotPartitionedException; -import org.apache.flink.table.catalog.stats.CatalogColumnStatistics; -import org.apache.flink.table.catalog.stats.CatalogTableStatistics; -import org.apache.flink.table.expressions.Expression; -import org.apache.flink.table.factories.Factory; -import org.apache.flink.util.StringUtils; -import org.apache.iceberg.CachingCatalog; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.FileScanTask; -import org.apache.iceberg.MetadataTableType; -import org.apache.iceberg.PartitionField; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.StructLike; -import org.apache.iceberg.Table; -import org.apache.iceberg.catalog.Catalog; -import org.apache.iceberg.catalog.Namespace; -import org.apache.iceberg.catalog.SupportsNamespaces; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.exceptions.AlreadyExistsException; -import org.apache.iceberg.exceptions.NamespaceNotEmptyException; -import org.apache.iceberg.exceptions.NoSuchNamespaceException; -import org.apache.iceberg.flink.util.FlinkAlterTableUtil; -import org.apache.iceberg.flink.util.FlinkCompatibilityUtil; -import org.apache.iceberg.io.CloseableIterable; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.base.Splitter; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import 
org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; - -/** - * A Flink Catalog implementation that wraps an Iceberg {@link Catalog}. - * - *

    The mapping between Flink database and Iceberg namespace: Supplying a base namespace for a - * given catalog, so if you have a catalog that supports a 2-level namespace, you would supply the - * first level in the catalog configuration and the second level would be exposed as Flink - * databases. - * - *

    The Iceberg table manages its partitions by itself. The partition of the Iceberg table is - * independent of the partition of Flink. - */ -public class FlinkCatalog extends AbstractCatalog { - private final CatalogLoader catalogLoader; - private final Catalog icebergCatalog; - private final Namespace baseNamespace; - private final SupportsNamespaces asNamespaceCatalog; - private final Closeable closeable; - private final boolean cacheEnabled; - - public FlinkCatalog( - String catalogName, - String defaultDatabase, - Namespace baseNamespace, - CatalogLoader catalogLoader, - boolean cacheEnabled, - long cacheExpirationIntervalMs) { - super(catalogName, defaultDatabase); - this.catalogLoader = catalogLoader; - this.baseNamespace = baseNamespace; - this.cacheEnabled = cacheEnabled; - - Catalog originalCatalog = catalogLoader.loadCatalog(); - icebergCatalog = - cacheEnabled - ? CachingCatalog.wrap(originalCatalog, cacheExpirationIntervalMs) - : originalCatalog; - asNamespaceCatalog = - originalCatalog instanceof SupportsNamespaces ? (SupportsNamespaces) originalCatalog : null; - closeable = originalCatalog instanceof Closeable ? (Closeable) originalCatalog : null; - - FlinkEnvironmentContext.init(); - } - - @Override - public void open() throws CatalogException {} - - @Override - public void close() throws CatalogException { - if (closeable != null) { - try { - closeable.close(); - } catch (IOException e) { - throw new CatalogException(e); - } - } - } - - public Catalog catalog() { - return icebergCatalog; - } - - /** Append a new level to the base namespace */ - private static Namespace appendLevel(Namespace baseNamespace, String newLevel) { - String[] namespace = new String[baseNamespace.levels().length + 1]; - System.arraycopy(baseNamespace.levels(), 0, namespace, 0, baseNamespace.levels().length); - namespace[baseNamespace.levels().length] = newLevel; - return Namespace.of(namespace); - } - - TableIdentifier toIdentifier(ObjectPath path) { - String objectName = path.getObjectName(); - List tableName = Splitter.on('$').splitToList(objectName); - - if (tableName.size() == 1) { - return TableIdentifier.of( - appendLevel(baseNamespace, path.getDatabaseName()), path.getObjectName()); - } else if (tableName.size() == 2 && MetadataTableType.from(tableName.get(1)) != null) { - return TableIdentifier.of( - appendLevel(appendLevel(baseNamespace, path.getDatabaseName()), tableName.get(0)), - tableName.get(1)); - } else { - throw new IllegalArgumentException("Illegal table name:" + objectName); - } - } - - @Override - public List listDatabases() throws CatalogException { - if (asNamespaceCatalog == null) { - return Collections.singletonList(getDefaultDatabase()); - } - - return asNamespaceCatalog.listNamespaces(baseNamespace).stream() - .map(n -> n.level(n.levels().length - 1)) - .collect(Collectors.toList()); - } - - @Override - public CatalogDatabase getDatabase(String databaseName) - throws DatabaseNotExistException, CatalogException { - if (asNamespaceCatalog == null) { - if (!getDefaultDatabase().equals(databaseName)) { - throw new DatabaseNotExistException(getName(), databaseName); - } else { - return new CatalogDatabaseImpl(Maps.newHashMap(), ""); - } - } else { - try { - Map metadata = - Maps.newHashMap( - asNamespaceCatalog.loadNamespaceMetadata(appendLevel(baseNamespace, databaseName))); - String comment = metadata.remove("comment"); - return new CatalogDatabaseImpl(metadata, comment); - } catch (NoSuchNamespaceException e) { - throw new DatabaseNotExistException(getName(), databaseName, 
e); - } - } - } - - @Override - public boolean databaseExists(String databaseName) throws CatalogException { - try { - getDatabase(databaseName); - return true; - } catch (DatabaseNotExistException ignore) { - return false; - } - } - - @Override - public void createDatabase(String name, CatalogDatabase database, boolean ignoreIfExists) - throws DatabaseAlreadyExistException, CatalogException { - createDatabase( - name, mergeComment(database.getProperties(), database.getComment()), ignoreIfExists); - } - - private void createDatabase( - String databaseName, Map metadata, boolean ignoreIfExists) - throws DatabaseAlreadyExistException, CatalogException { - if (asNamespaceCatalog != null) { - try { - asNamespaceCatalog.createNamespace(appendLevel(baseNamespace, databaseName), metadata); - } catch (AlreadyExistsException e) { - if (!ignoreIfExists) { - throw new DatabaseAlreadyExistException(getName(), databaseName, e); - } - } - } else { - throw new UnsupportedOperationException( - "Namespaces are not supported by catalog: " + getName()); - } - } - - private Map mergeComment(Map metadata, String comment) { - Map ret = Maps.newHashMap(metadata); - if (metadata.containsKey("comment")) { - throw new CatalogException("Database properties should not contain key: 'comment'."); - } - - if (!StringUtils.isNullOrWhitespaceOnly(comment)) { - ret.put("comment", comment); - } - return ret; - } - - @Override - public void dropDatabase(String name, boolean ignoreIfNotExists, boolean cascade) - throws DatabaseNotExistException, DatabaseNotEmptyException, CatalogException { - if (asNamespaceCatalog != null) { - try { - boolean success = asNamespaceCatalog.dropNamespace(appendLevel(baseNamespace, name)); - if (!success && !ignoreIfNotExists) { - throw new DatabaseNotExistException(getName(), name); - } - } catch (NoSuchNamespaceException e) { - if (!ignoreIfNotExists) { - throw new DatabaseNotExistException(getName(), name, e); - } - } catch (NamespaceNotEmptyException e) { - throw new DatabaseNotEmptyException(getName(), name, e); - } - } else { - if (!ignoreIfNotExists) { - throw new DatabaseNotExistException(getName(), name); - } - } - } - - @Override - public void alterDatabase(String name, CatalogDatabase newDatabase, boolean ignoreIfNotExists) - throws DatabaseNotExistException, CatalogException { - if (asNamespaceCatalog != null) { - Namespace namespace = appendLevel(baseNamespace, name); - Map updates = Maps.newHashMap(); - Set removals = Sets.newHashSet(); - - try { - Map oldProperties = asNamespaceCatalog.loadNamespaceMetadata(namespace); - Map newProperties = - mergeComment(newDatabase.getProperties(), newDatabase.getComment()); - - for (String key : oldProperties.keySet()) { - if (!newProperties.containsKey(key)) { - removals.add(key); - } - } - - for (Map.Entry entry : newProperties.entrySet()) { - if (!entry.getValue().equals(oldProperties.get(entry.getKey()))) { - updates.put(entry.getKey(), entry.getValue()); - } - } - - if (!updates.isEmpty()) { - asNamespaceCatalog.setProperties(namespace, updates); - } - - if (!removals.isEmpty()) { - asNamespaceCatalog.removeProperties(namespace, removals); - } - - } catch (NoSuchNamespaceException e) { - if (!ignoreIfNotExists) { - throw new DatabaseNotExistException(getName(), name, e); - } - } - } else { - if (getDefaultDatabase().equals(name)) { - throw new CatalogException( - "Can not alter the default database when the iceberg catalog doesn't support namespaces."); - } - if (!ignoreIfNotExists) { - throw new DatabaseNotExistException(getName(), name); 
- } - } - } - - @Override - public List listTables(String databaseName) - throws DatabaseNotExistException, CatalogException { - try { - return icebergCatalog.listTables(appendLevel(baseNamespace, databaseName)).stream() - .map(TableIdentifier::name) - .collect(Collectors.toList()); - } catch (NoSuchNamespaceException e) { - throw new DatabaseNotExistException(getName(), databaseName, e); - } - } - - @Override - public CatalogTable getTable(ObjectPath tablePath) - throws TableNotExistException, CatalogException { - Table table = loadIcebergTable(tablePath); - return toCatalogTable(table); - } - - private Table loadIcebergTable(ObjectPath tablePath) throws TableNotExistException { - try { - Table table = icebergCatalog.loadTable(toIdentifier(tablePath)); - if (cacheEnabled) { - table.refresh(); - } - - return table; - } catch (org.apache.iceberg.exceptions.NoSuchTableException e) { - throw new TableNotExistException(getName(), tablePath, e); - } - } - - @Override - public boolean tableExists(ObjectPath tablePath) throws CatalogException { - return icebergCatalog.tableExists(toIdentifier(tablePath)); - } - - @Override - public void dropTable(ObjectPath tablePath, boolean ignoreIfNotExists) - throws TableNotExistException, CatalogException { - try { - icebergCatalog.dropTable(toIdentifier(tablePath)); - } catch (org.apache.iceberg.exceptions.NoSuchTableException e) { - if (!ignoreIfNotExists) { - throw new TableNotExistException(getName(), tablePath, e); - } - } - } - - @Override - public void renameTable(ObjectPath tablePath, String newTableName, boolean ignoreIfNotExists) - throws TableNotExistException, TableAlreadyExistException, CatalogException { - try { - icebergCatalog.renameTable( - toIdentifier(tablePath), - toIdentifier(new ObjectPath(tablePath.getDatabaseName(), newTableName))); - } catch (org.apache.iceberg.exceptions.NoSuchTableException e) { - if (!ignoreIfNotExists) { - throw new TableNotExistException(getName(), tablePath, e); - } - } catch (AlreadyExistsException e) { - throw new TableAlreadyExistException(getName(), tablePath, e); - } - } - - @Override - public void createTable(ObjectPath tablePath, CatalogBaseTable table, boolean ignoreIfExists) - throws CatalogException, TableAlreadyExistException { - if (Objects.equals( - table.getOptions().get("connector"), FlinkDynamicTableFactory.FACTORY_IDENTIFIER)) { - throw new IllegalArgumentException( - "Cannot create the table with 'connector'='iceberg' table property in " - + "an iceberg catalog, Please create table with 'connector'='iceberg' property in a non-iceberg catalog or " - + "create table without 'connector'='iceberg' related properties in an iceberg table."); - } - Preconditions.checkArgument(table instanceof ResolvedCatalogTable, "table should be resolved"); - createIcebergTable(tablePath, (ResolvedCatalogTable) table, ignoreIfExists); - } - - void createIcebergTable(ObjectPath tablePath, ResolvedCatalogTable table, boolean ignoreIfExists) - throws CatalogException, TableAlreadyExistException { - validateFlinkTable(table); - - Schema icebergSchema = FlinkSchemaUtil.convert(table.getResolvedSchema()); - PartitionSpec spec = toPartitionSpec(((CatalogTable) table).getPartitionKeys(), icebergSchema); - - ImmutableMap.Builder properties = ImmutableMap.builder(); - String location = null; - for (Map.Entry entry : table.getOptions().entrySet()) { - if ("location".equalsIgnoreCase(entry.getKey())) { - location = entry.getValue(); - } else { - properties.put(entry.getKey(), entry.getValue()); - } - } - - try { - 
icebergCatalog.createTable( - toIdentifier(tablePath), icebergSchema, spec, location, properties.build()); - } catch (AlreadyExistsException e) { - if (!ignoreIfExists) { - throw new TableAlreadyExistException(getName(), tablePath, e); - } - } - } - - private static void validateTableSchemaAndPartition(CatalogTable ct1, CatalogTable ct2) { - TableSchema ts1 = ct1.getSchema(); - TableSchema ts2 = ct2.getSchema(); - boolean equalsPrimary = false; - - if (ts1.getPrimaryKey().isPresent() && ts2.getPrimaryKey().isPresent()) { - equalsPrimary = - Objects.equals(ts1.getPrimaryKey().get().getType(), ts2.getPrimaryKey().get().getType()) - && Objects.equals( - ts1.getPrimaryKey().get().getColumns(), ts2.getPrimaryKey().get().getColumns()); - } else if (!ts1.getPrimaryKey().isPresent() && !ts2.getPrimaryKey().isPresent()) { - equalsPrimary = true; - } - - if (!(Objects.equals(ts1.getTableColumns(), ts2.getTableColumns()) - && Objects.equals(ts1.getWatermarkSpecs(), ts2.getWatermarkSpecs()) - && equalsPrimary)) { - throw new UnsupportedOperationException( - "Altering schema is not supported in the old alterTable API. " - + "To alter schema, use the other alterTable API and provide a list of TableChange's."); - } - - validateTablePartition(ct1, ct2); - } - - private static void validateTablePartition(CatalogTable ct1, CatalogTable ct2) { - if (!ct1.getPartitionKeys().equals(ct2.getPartitionKeys())) { - throw new UnsupportedOperationException("Altering partition keys is not supported yet."); - } - } - - /** - * This alterTable API only supports altering table properties. - * - *

    Support for adding/removing/renaming columns cannot be done by comparing CatalogTable - * instances, unless the Flink schema contains Iceberg column IDs. - * - *

    To alter columns, use the other alterTable API and provide a list of TableChange's. - * - * @param tablePath path of the table or view to be modified - * @param newTable the new table definition - * @param ignoreIfNotExists flag to specify behavior when the table or view does not exist: if set - * to false, throw an exception, if set to true, do nothing. - * @throws CatalogException in case of any runtime exception - * @throws TableNotExistException if the table does not exist - */ - @Override - public void alterTable(ObjectPath tablePath, CatalogBaseTable newTable, boolean ignoreIfNotExists) - throws CatalogException, TableNotExistException { - validateFlinkTable(newTable); - - Table icebergTable; - try { - icebergTable = loadIcebergTable(tablePath); - } catch (TableNotExistException e) { - if (!ignoreIfNotExists) { - throw e; - } else { - return; - } - } - - CatalogTable table = toCatalogTable(icebergTable); - validateTableSchemaAndPartition(table, (CatalogTable) newTable); - - Map oldProperties = table.getOptions(); - Map setProperties = Maps.newHashMap(); - - String setLocation = null; - String setSnapshotId = null; - String pickSnapshotId = null; - - for (Map.Entry entry : newTable.getOptions().entrySet()) { - String key = entry.getKey(); - String value = entry.getValue(); - - if (Objects.equals(value, oldProperties.get(key))) { - continue; - } - - if ("location".equalsIgnoreCase(key)) { - setLocation = value; - } else if ("current-snapshot-id".equalsIgnoreCase(key)) { - setSnapshotId = value; - } else if ("cherry-pick-snapshot-id".equalsIgnoreCase(key)) { - pickSnapshotId = value; - } else { - setProperties.put(key, value); - } - } - - oldProperties - .keySet() - .forEach( - k -> { - if (!newTable.getOptions().containsKey(k)) { - setProperties.put(k, null); - } - }); - - FlinkAlterTableUtil.commitChanges( - icebergTable, setLocation, setSnapshotId, pickSnapshotId, setProperties); - } - - @Override - public void alterTable( - ObjectPath tablePath, - CatalogBaseTable newTable, - List tableChanges, - boolean ignoreIfNotExists) - throws TableNotExistException, CatalogException { - validateFlinkTable(newTable); - - Table icebergTable; - try { - icebergTable = loadIcebergTable(tablePath); - } catch (TableNotExistException e) { - if (!ignoreIfNotExists) { - throw e; - } else { - return; - } - } - - // Does not support altering partition yet. 
- validateTablePartition(toCatalogTable(icebergTable), (CatalogTable) newTable); - - String setLocation = null; - String setSnapshotId = null; - String cherrypickSnapshotId = null; - - List propertyChanges = Lists.newArrayList(); - List schemaChanges = Lists.newArrayList(); - for (TableChange change : tableChanges) { - if (change instanceof TableChange.SetOption) { - TableChange.SetOption set = (TableChange.SetOption) change; - - if ("location".equalsIgnoreCase(set.getKey())) { - setLocation = set.getValue(); - } else if ("current-snapshot-id".equalsIgnoreCase(set.getKey())) { - setSnapshotId = set.getValue(); - } else if ("cherry-pick-snapshot-id".equalsIgnoreCase(set.getKey())) { - cherrypickSnapshotId = set.getValue(); - } else { - propertyChanges.add(change); - } - } else if (change instanceof TableChange.ResetOption) { - propertyChanges.add(change); - } else { - schemaChanges.add(change); - } - } - - FlinkAlterTableUtil.commitChanges( - icebergTable, - setLocation, - setSnapshotId, - cherrypickSnapshotId, - schemaChanges, - propertyChanges); - } - - private static void validateFlinkTable(CatalogBaseTable table) { - Preconditions.checkArgument( - table instanceof CatalogTable, "The Table should be a CatalogTable."); - - TableSchema schema = table.getSchema(); - schema - .getTableColumns() - .forEach( - column -> { - if (!FlinkCompatibilityUtil.isPhysicalColumn(column)) { - throw new UnsupportedOperationException( - "Creating table with computed columns is not supported yet."); - } - }); - - if (!schema.getWatermarkSpecs().isEmpty()) { - throw new UnsupportedOperationException( - "Creating table with watermark specs is not supported yet."); - } - } - - private static PartitionSpec toPartitionSpec(List partitionKeys, Schema icebergSchema) { - PartitionSpec.Builder builder = PartitionSpec.builderFor(icebergSchema); - partitionKeys.forEach(builder::identity); - return builder.build(); - } - - private static List toPartitionKeys(PartitionSpec spec, Schema icebergSchema) { - ImmutableList.Builder partitionKeysBuilder = ImmutableList.builder(); - for (PartitionField field : spec.fields()) { - if (field.transform().isIdentity()) { - partitionKeysBuilder.add(icebergSchema.findColumnName(field.sourceId())); - } else { - // Not created by Flink SQL. - // For compatibility with iceberg tables, return empty. - // TODO modify this after Flink support partition transform. - return Collections.emptyList(); - } - } - return partitionKeysBuilder.build(); - } - - static CatalogTable toCatalogTable(Table table) { - TableSchema schema = FlinkSchemaUtil.toSchema(table.schema()); - List partitionKeys = toPartitionKeys(table.spec(), table.schema()); - - // NOTE: We can not create a IcebergCatalogTable extends CatalogTable, because Flink optimizer - // may use - // CatalogTableImpl to copy a new catalog table. - // Let's re-loading table from Iceberg catalog when creating source/sink operators. - // Iceberg does not have Table comment, so pass a null (Default comment value in Flink). 
- return new CatalogTableImpl(schema, partitionKeys, table.properties(), null); - } - - @Override - public Optional getFactory() { - return Optional.of(new FlinkDynamicTableFactory(this)); - } - - CatalogLoader getCatalogLoader() { - return catalogLoader; - } - - // ------------------------------ Unsupported methods - // --------------------------------------------- - - @Override - public List listViews(String databaseName) throws CatalogException { - return Collections.emptyList(); - } - - @Override - public CatalogPartition getPartition(ObjectPath tablePath, CatalogPartitionSpec partitionSpec) - throws CatalogException { - throw new UnsupportedOperationException(); - } - - @Override - public boolean partitionExists(ObjectPath tablePath, CatalogPartitionSpec partitionSpec) - throws CatalogException { - throw new UnsupportedOperationException(); - } - - @Override - public void createPartition( - ObjectPath tablePath, - CatalogPartitionSpec partitionSpec, - CatalogPartition partition, - boolean ignoreIfExists) - throws CatalogException { - throw new UnsupportedOperationException(); - } - - @Override - public void dropPartition( - ObjectPath tablePath, CatalogPartitionSpec partitionSpec, boolean ignoreIfNotExists) - throws CatalogException { - throw new UnsupportedOperationException(); - } - - @Override - public void alterPartition( - ObjectPath tablePath, - CatalogPartitionSpec partitionSpec, - CatalogPartition newPartition, - boolean ignoreIfNotExists) - throws CatalogException { - throw new UnsupportedOperationException(); - } - - @Override - public List listFunctions(String dbName) throws CatalogException { - return Collections.emptyList(); - } - - @Override - public CatalogFunction getFunction(ObjectPath functionPath) - throws FunctionNotExistException, CatalogException { - throw new FunctionNotExistException(getName(), functionPath); - } - - @Override - public boolean functionExists(ObjectPath functionPath) throws CatalogException { - return false; - } - - @Override - public void createFunction( - ObjectPath functionPath, CatalogFunction function, boolean ignoreIfExists) - throws CatalogException { - throw new UnsupportedOperationException(); - } - - @Override - public void alterFunction( - ObjectPath functionPath, CatalogFunction newFunction, boolean ignoreIfNotExists) - throws CatalogException { - throw new UnsupportedOperationException(); - } - - @Override - public void dropFunction(ObjectPath functionPath, boolean ignoreIfNotExists) - throws CatalogException { - throw new UnsupportedOperationException(); - } - - @Override - public void alterTableStatistics( - ObjectPath tablePath, CatalogTableStatistics tableStatistics, boolean ignoreIfNotExists) - throws CatalogException { - throw new UnsupportedOperationException(); - } - - @Override - public void alterTableColumnStatistics( - ObjectPath tablePath, CatalogColumnStatistics columnStatistics, boolean ignoreIfNotExists) - throws CatalogException { - throw new UnsupportedOperationException(); - } - - @Override - public void alterPartitionStatistics( - ObjectPath tablePath, - CatalogPartitionSpec partitionSpec, - CatalogTableStatistics partitionStatistics, - boolean ignoreIfNotExists) - throws CatalogException { - throw new UnsupportedOperationException(); - } - - @Override - public void alterPartitionColumnStatistics( - ObjectPath tablePath, - CatalogPartitionSpec partitionSpec, - CatalogColumnStatistics columnStatistics, - boolean ignoreIfNotExists) - throws CatalogException { - throw new UnsupportedOperationException(); - } - - 
@Override - public List listPartitions(ObjectPath tablePath) - throws TableNotExistException, TableNotPartitionedException, CatalogException { - Table table = loadIcebergTable(tablePath); - - if (table.spec().isUnpartitioned()) { - throw new TableNotPartitionedException(icebergCatalog.name(), tablePath); - } - - Set set = Sets.newHashSet(); - try (CloseableIterable tasks = table.newScan().planFiles()) { - for (DataFile dataFile : CloseableIterable.transform(tasks, FileScanTask::file)) { - Map map = Maps.newHashMap(); - StructLike structLike = dataFile.partition(); - PartitionSpec spec = table.specs().get(dataFile.specId()); - for (int i = 0; i < structLike.size(); i++) { - map.put(spec.fields().get(i).name(), String.valueOf(structLike.get(i, Object.class))); - } - set.add(new CatalogPartitionSpec(map)); - } - } catch (IOException e) { - throw new CatalogException( - String.format("Failed to list partitions of table %s", tablePath), e); - } - - return Lists.newArrayList(set); - } - - @Override - public List listPartitions( - ObjectPath tablePath, CatalogPartitionSpec partitionSpec) throws CatalogException { - throw new UnsupportedOperationException(); - } - - @Override - public List listPartitionsByFilter( - ObjectPath tablePath, List filters) throws CatalogException { - throw new UnsupportedOperationException(); - } - - // After partition pruning and filter push down, the statistics have become very inaccurate, so - // the statistics from - // here are of little significance. - // Flink will support something like SupportsReportStatistics in future. - - @Override - public CatalogTableStatistics getTableStatistics(ObjectPath tablePath) throws CatalogException { - return CatalogTableStatistics.UNKNOWN; - } - - @Override - public CatalogColumnStatistics getTableColumnStatistics(ObjectPath tablePath) - throws CatalogException { - return CatalogColumnStatistics.UNKNOWN; - } - - @Override - public CatalogTableStatistics getPartitionStatistics( - ObjectPath tablePath, CatalogPartitionSpec partitionSpec) throws CatalogException { - return CatalogTableStatistics.UNKNOWN; - } - - @Override - public CatalogColumnStatistics getPartitionColumnStatistics( - ObjectPath tablePath, CatalogPartitionSpec partitionSpec) throws CatalogException { - return CatalogColumnStatistics.UNKNOWN; - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalogFactory.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalogFactory.java deleted file mode 100644 index 1453753849ec..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalogFactory.java +++ /dev/null @@ -1,213 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink; - -import java.net.URL; -import java.nio.file.Files; -import java.nio.file.Paths; -import java.util.List; -import java.util.Locale; -import java.util.Map; -import org.apache.flink.configuration.GlobalConfiguration; -import org.apache.flink.runtime.util.HadoopUtils; -import org.apache.flink.table.catalog.Catalog; -import org.apache.flink.table.factories.CatalogFactory; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.iceberg.CatalogProperties; -import org.apache.iceberg.catalog.Namespace; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.base.Strings; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.util.PropertyUtil; - -/** - * A Flink Catalog factory implementation that creates {@link FlinkCatalog}. - * - *

    This supports the following catalog configuration options: - * - *

      - *
    • type - Flink catalog factory key, should be "iceberg" - *
    • catalog-type - iceberg catalog type, "hive", "hadoop" or "rest" - *
    • uri - the Hive Metastore URI (Hive catalog only) - *
    • clients - the Hive Client Pool Size (Hive catalog only) - *
    • warehouse - the warehouse path (Hadoop catalog only) - *
    • default-database - a database name to use as the default - *
    • base-namespace - a base namespace as the prefix for all databases (Hadoop - * catalog only) - *
    • cache-enabled - whether to enable catalog cache - *
    - * - *
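For reference, a minimal sketch (not part of this patch) of how the options listed above are typically supplied when registering an Iceberg catalog through Flink's Table API; the catalog name, warehouse path, and the particular option values are illustrative assumptions:

    import org.apache.flink.table.api.EnvironmentSettings;
    import org.apache.flink.table.api.TableEnvironment;

    public class IcebergCatalogExample {
      public static void main(String[] args) {
        TableEnvironment env = TableEnvironment.create(EnvironmentSettings.inStreamingMode());
        // The WITH keys mirror the factory options documented above
        // (type, catalog-type, warehouse, cache-enabled, ...).
        env.executeSql(
            "CREATE CATALOG hadoop_iceberg WITH ("
                + " 'type'='iceberg',"
                + " 'catalog-type'='hadoop',"
                + " 'warehouse'='file:///tmp/iceberg/warehouse',"
                + " 'cache-enabled'='false'"
                + ")");
        env.executeSql("USE CATALOG hadoop_iceberg");
      }
    }

The same property keys are consumed by FlinkCatalogFactory#createCatalogLoader below, which resolves them into a CatalogLoader for the chosen catalog-type.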

    To use a custom catalog that is not a Hive or Hadoop catalog, extend this class and override - * {@link #createCatalogLoader(String, Map, Configuration)}. - */ -public class FlinkCatalogFactory implements CatalogFactory { - - // Can not just use "type", it conflicts with CATALOG_TYPE. - public static final String ICEBERG_CATALOG_TYPE = "catalog-type"; - public static final String ICEBERG_CATALOG_TYPE_HADOOP = "hadoop"; - public static final String ICEBERG_CATALOG_TYPE_HIVE = "hive"; - public static final String ICEBERG_CATALOG_TYPE_REST = "rest"; - - public static final String HIVE_CONF_DIR = "hive-conf-dir"; - public static final String HADOOP_CONF_DIR = "hadoop-conf-dir"; - public static final String DEFAULT_DATABASE = "default-database"; - public static final String DEFAULT_DATABASE_NAME = "default"; - public static final String BASE_NAMESPACE = "base-namespace"; - - public static final String TYPE = "type"; - public static final String PROPERTY_VERSION = "property-version"; - - /** - * Create an Iceberg {@link org.apache.iceberg.catalog.Catalog} loader to be used by this Flink - * catalog adapter. - * - * @param name Flink's catalog name - * @param properties Flink's catalog properties - * @param hadoopConf Hadoop configuration for catalog - * @return an Iceberg catalog loader - */ - static CatalogLoader createCatalogLoader( - String name, Map properties, Configuration hadoopConf) { - String catalogImpl = properties.get(CatalogProperties.CATALOG_IMPL); - if (catalogImpl != null) { - String catalogType = properties.get(ICEBERG_CATALOG_TYPE); - Preconditions.checkArgument( - catalogType == null, - "Cannot create catalog %s, both catalog-type and catalog-impl are set: catalog-type=%s, catalog-impl=%s", - name, - catalogType, - catalogImpl); - return CatalogLoader.custom(name, properties, hadoopConf, catalogImpl); - } - - String catalogType = properties.getOrDefault(ICEBERG_CATALOG_TYPE, ICEBERG_CATALOG_TYPE_HIVE); - switch (catalogType.toLowerCase(Locale.ENGLISH)) { - case ICEBERG_CATALOG_TYPE_HIVE: - // The values of properties 'uri', 'warehouse', 'hive-conf-dir' are allowed to be null, in - // that case it will - // fallback to parse those values from hadoop configuration which is loaded from classpath. 
- String hiveConfDir = properties.get(HIVE_CONF_DIR); - String hadoopConfDir = properties.get(HADOOP_CONF_DIR); - Configuration newHadoopConf = mergeHiveConf(hadoopConf, hiveConfDir, hadoopConfDir); - return CatalogLoader.hive(name, newHadoopConf, properties); - - case ICEBERG_CATALOG_TYPE_HADOOP: - return CatalogLoader.hadoop(name, hadoopConf, properties); - - case ICEBERG_CATALOG_TYPE_REST: - return CatalogLoader.rest(name, hadoopConf, properties); - - default: - throw new UnsupportedOperationException( - "Unknown catalog-type: " + catalogType + " (Must be 'hive', 'hadoop' or 'rest')"); - } - } - - @Override - public Map requiredContext() { - Map context = Maps.newHashMap(); - context.put(TYPE, "iceberg"); - context.put(PROPERTY_VERSION, "1"); - return context; - } - - @Override - public List supportedProperties() { - return ImmutableList.of("*"); - } - - @Override - public Catalog createCatalog(String name, Map properties) { - return createCatalog(name, properties, clusterHadoopConf()); - } - - protected Catalog createCatalog( - String name, Map properties, Configuration hadoopConf) { - CatalogLoader catalogLoader = createCatalogLoader(name, properties, hadoopConf); - String defaultDatabase = properties.getOrDefault(DEFAULT_DATABASE, DEFAULT_DATABASE_NAME); - - Namespace baseNamespace = Namespace.empty(); - if (properties.containsKey(BASE_NAMESPACE)) { - baseNamespace = Namespace.of(properties.get(BASE_NAMESPACE).split("\\.")); - } - - boolean cacheEnabled = - PropertyUtil.propertyAsBoolean( - properties, CatalogProperties.CACHE_ENABLED, CatalogProperties.CACHE_ENABLED_DEFAULT); - - long cacheExpirationIntervalMs = - PropertyUtil.propertyAsLong( - properties, - CatalogProperties.CACHE_EXPIRATION_INTERVAL_MS, - CatalogProperties.CACHE_EXPIRATION_INTERVAL_MS_OFF); - Preconditions.checkArgument( - cacheExpirationIntervalMs != 0, - "%s is not allowed to be 0.", - CatalogProperties.CACHE_EXPIRATION_INTERVAL_MS); - - return new FlinkCatalog( - name, - defaultDatabase, - baseNamespace, - catalogLoader, - cacheEnabled, - cacheExpirationIntervalMs); - } - - private static Configuration mergeHiveConf( - Configuration hadoopConf, String hiveConfDir, String hadoopConfDir) { - Configuration newConf = new Configuration(hadoopConf); - if (!Strings.isNullOrEmpty(hiveConfDir)) { - Preconditions.checkState( - Files.exists(Paths.get(hiveConfDir, "hive-site.xml")), - "There should be a hive-site.xml file under the directory %s", - hiveConfDir); - newConf.addResource(new Path(hiveConfDir, "hive-site.xml")); - } else { - // If don't provide the hive-site.xml path explicitly, it will try to load resource from - // classpath. If still - // couldn't load the configuration file, then it will throw exception in HiveCatalog. 
- URL configFile = CatalogLoader.class.getClassLoader().getResource("hive-site.xml"); - if (configFile != null) { - newConf.addResource(configFile); - } - } - - if (!Strings.isNullOrEmpty(hadoopConfDir)) { - Preconditions.checkState( - Files.exists(Paths.get(hadoopConfDir, "hdfs-site.xml")), - "Failed to load Hadoop configuration: missing %s", - Paths.get(hadoopConfDir, "hdfs-site.xml")); - newConf.addResource(new Path(hadoopConfDir, "hdfs-site.xml")); - Preconditions.checkState( - Files.exists(Paths.get(hadoopConfDir, "core-site.xml")), - "Failed to load Hadoop configuration: missing %s", - Paths.get(hadoopConfDir, "core-site.xml")); - newConf.addResource(new Path(hadoopConfDir, "core-site.xml")); - } - - return newConf; - } - - public static Configuration clusterHadoopConf() { - return HadoopUtils.getHadoopConfiguration(GlobalConfiguration.loadConfiguration()); - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/FlinkConfParser.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/FlinkConfParser.java deleted file mode 100644 index 7167859e600c..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/FlinkConfParser.java +++ /dev/null @@ -1,261 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink; - -import java.time.Duration; -import java.util.List; -import java.util.Map; -import java.util.function.Function; -import org.apache.flink.configuration.ConfigOption; -import org.apache.flink.configuration.ReadableConfig; -import org.apache.flink.util.TimeUtils; -import org.apache.iceberg.Table; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; - -class FlinkConfParser { - - private final Map tableProperties; - private final Map options; - private final ReadableConfig readableConfig; - - FlinkConfParser(Table table, Map options, ReadableConfig readableConfig) { - this.tableProperties = table.properties(); - this.options = options; - this.readableConfig = readableConfig; - } - - public BooleanConfParser booleanConf() { - return new BooleanConfParser(); - } - - public IntConfParser intConf() { - return new IntConfParser(); - } - - public LongConfParser longConf() { - return new LongConfParser(); - } - - public > EnumConfParser enumConfParser(Class enumClass) { - return new EnumConfParser<>(enumClass); - } - - public StringConfParser stringConf() { - return new StringConfParser(); - } - - public DurationConfParser durationConf() { - return new DurationConfParser(); - } - - class BooleanConfParser extends ConfParser { - private Boolean defaultValue; - - @Override - protected BooleanConfParser self() { - return this; - } - - public BooleanConfParser defaultValue(boolean value) { - this.defaultValue = value; - return self(); - } - - public BooleanConfParser defaultValue(String value) { - this.defaultValue = Boolean.parseBoolean(value); - return self(); - } - - public boolean parse() { - Preconditions.checkArgument(defaultValue != null, "Default value cannot be null"); - return parse(Boolean::parseBoolean, defaultValue); - } - } - - class IntConfParser extends ConfParser { - private Integer defaultValue; - - @Override - protected IntConfParser self() { - return this; - } - - public IntConfParser defaultValue(int value) { - this.defaultValue = value; - return self(); - } - - public int parse() { - Preconditions.checkArgument(defaultValue != null, "Default value cannot be null"); - return parse(Integer::parseInt, defaultValue); - } - - public Integer parseOptional() { - return parse(Integer::parseInt, null); - } - } - - class LongConfParser extends ConfParser { - private Long defaultValue; - - @Override - protected LongConfParser self() { - return this; - } - - public LongConfParser defaultValue(long value) { - this.defaultValue = value; - return self(); - } - - public long parse() { - Preconditions.checkArgument(defaultValue != null, "Default value cannot be null"); - return parse(Long::parseLong, defaultValue); - } - - public Long parseOptional() { - return parse(Long::parseLong, null); - } - } - - class StringConfParser extends ConfParser { - private String defaultValue; - - @Override - protected StringConfParser self() { - return this; - } - - public StringConfParser defaultValue(String value) { - this.defaultValue = value; - return self(); - } - - public String parse() { - Preconditions.checkArgument(defaultValue != null, "Default value cannot be null"); - return parse(Function.identity(), defaultValue); - } - - public String parseOptional() { - return parse(Function.identity(), null); - } - } - - class EnumConfParser> extends ConfParser, E> { - private E defaultValue; - private final Class enumClass; - - EnumConfParser(Class enumClass) { - this.enumClass = 
enumClass; - } - - @Override - protected EnumConfParser self() { - return this; - } - - public EnumConfParser defaultValue(E value) { - this.defaultValue = value; - return self(); - } - - public E parse() { - Preconditions.checkArgument(defaultValue != null, "Default value cannot be null"); - return parse(s -> Enum.valueOf(enumClass, s), defaultValue); - } - - public E parseOptional() { - return parse(s -> Enum.valueOf(enumClass, s), null); - } - } - - class DurationConfParser extends ConfParser { - private Duration defaultValue; - - @Override - protected DurationConfParser self() { - return this; - } - - public DurationConfParser defaultValue(Duration value) { - this.defaultValue = value; - return self(); - } - - public Duration parse() { - Preconditions.checkArgument(defaultValue != null, "Default value cannot be null"); - return parse(TimeUtils::parseDuration, defaultValue); - } - - public Duration parseOptional() { - return parse(TimeUtils::parseDuration, null); - } - } - - abstract class ConfParser { - private final List optionNames = Lists.newArrayList(); - private String tablePropertyName; - private ConfigOption configOption; - - protected abstract ThisT self(); - - public ThisT option(String name) { - this.optionNames.add(name); - return self(); - } - - public ThisT flinkConfig(ConfigOption newConfigOption) { - this.configOption = newConfigOption; - return self(); - } - - public ThisT tableProperty(String name) { - this.tablePropertyName = name; - return self(); - } - - protected T parse(Function conversion, T defaultValue) { - if (!optionNames.isEmpty()) { - for (String optionName : optionNames) { - String optionValue = options.get(optionName); - if (optionValue != null) { - return conversion.apply(optionValue); - } - } - } - - if (configOption != null) { - T propertyValue = readableConfig.get(configOption); - if (propertyValue != null) { - return propertyValue; - } - } - - if (tablePropertyName != null) { - String propertyValue = tableProperties.get(tablePropertyName); - if (propertyValue != null) { - return conversion.apply(propertyValue); - } - } - - return defaultValue; - } - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/FlinkConfigOptions.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/FlinkConfigOptions.java deleted file mode 100644 index 7c7afd24ed8e..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/FlinkConfigOptions.java +++ /dev/null @@ -1,107 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
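The ConfParser.parse method above resolves a setting by consulting the per-job options first, then the Flink configuration, then the table properties, and finally the declared default (the real parser also allows a different key name per source). A minimal standalone sketch of that precedence, with plain maps standing in for ReadableConfig and Table.properties() and all names illustrative:

import java.util.Map;
import java.util.function.Function;

// Minimal sketch of the lookup order used by the removed FlinkConfParser:
// per-job options -> Flink configuration -> table properties -> default value.
public class ConfPrecedenceSketch {

  static <T> T resolve(
      Map<String, String> jobOptions,
      Map<String, String> flinkConf,
      Map<String, String> tableProps,
      String key,
      Function<String, T> parser,
      T defaultValue) {
    if (jobOptions.containsKey(key)) {
      return parser.apply(jobOptions.get(key));
    }

    if (flinkConf.containsKey(key)) {
      return parser.apply(flinkConf.get(key));
    }

    if (tableProps.containsKey(key)) {
      return parser.apply(tableProps.get(key));
    }

    return defaultValue;
  }

  public static void main(String[] args) {
    Map<String, String> jobOptions = Map.of("split-size", "134217728");
    Map<String, String> flinkConf = Map.of();
    Map<String, String> tableProps = Map.of("split-size", "268435456");

    // The per-job option wins even though the table property is also set.
    long splitSize =
        resolve(jobOptions, flinkConf, tableProps, "split-size", Long::parseLong, 128L * 1024 * 1024);
    System.out.println(splitSize); // 134217728
  }
}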
- */ -package org.apache.iceberg.flink; - -import org.apache.flink.configuration.ConfigOption; -import org.apache.flink.configuration.ConfigOptions; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.configuration.description.Description; -import org.apache.flink.configuration.description.TextElement; -import org.apache.flink.table.api.TableEnvironment; -import org.apache.iceberg.flink.source.assigner.SplitAssignerType; -import org.apache.iceberg.util.ThreadPools; - -/** - * When constructing Flink Iceberg source via Java API, configs can be set in {@link Configuration} - * passed to source builder. E.g. - * - *
- * <pre>
- *   configuration.setBoolean(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM, true);
- *   FlinkSource.forRowData()
- *       .flinkConf(configuration)
- *       ...
- * </pre>
- *
- * <p>When using Flink SQL/table API, connector options can be set in Flink's {@link
- * TableEnvironment}.
- *
- * <pre>
- *   TableEnvironment tEnv = createTableEnv();
- *   tEnv.getConfig()
- *        .getConfiguration()
- *        .setBoolean(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM, true);
- * </pre>
    - */ -public class FlinkConfigOptions { - - private FlinkConfigOptions() {} - - public static final ConfigOption TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM = - ConfigOptions.key("table.exec.iceberg.infer-source-parallelism") - .booleanType() - .defaultValue(true) - .withDescription( - "If is false, parallelism of source are set by config.\n" - + "If is true, source parallelism is inferred according to splits number.\n"); - - public static final ConfigOption TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM_MAX = - ConfigOptions.key("table.exec.iceberg.infer-source-parallelism.max") - .intType() - .defaultValue(100) - .withDescription("Sets max infer parallelism for source operator."); - - public static final ConfigOption TABLE_EXEC_ICEBERG_EXPOSE_SPLIT_LOCALITY_INFO = - ConfigOptions.key("table.exec.iceberg.expose-split-locality-info") - .booleanType() - .noDefaultValue() - .withDescription( - "Expose split host information to use Flink's locality aware split assigner."); - - public static final ConfigOption SOURCE_READER_FETCH_BATCH_RECORD_COUNT = - ConfigOptions.key("table.exec.iceberg.fetch-batch-record-count") - .intType() - .defaultValue(2048) - .withDescription("The target number of records for Iceberg reader fetch batch."); - - public static final ConfigOption TABLE_EXEC_ICEBERG_WORKER_POOL_SIZE = - ConfigOptions.key("table.exec.iceberg.worker-pool-size") - .intType() - .defaultValue(ThreadPools.WORKER_THREAD_POOL_SIZE) - .withDescription("The size of workers pool used to plan or scan manifests."); - - public static final ConfigOption TABLE_EXEC_ICEBERG_USE_FLIP27_SOURCE = - ConfigOptions.key("table.exec.iceberg.use-flip27-source") - .booleanType() - .defaultValue(false) - .withDescription("Use the FLIP-27 based Iceberg source implementation."); - - public static final ConfigOption TABLE_EXEC_SPLIT_ASSIGNER_TYPE = - ConfigOptions.key("table.exec.iceberg.split-assigner-type") - .enumType(SplitAssignerType.class) - .defaultValue(SplitAssignerType.SIMPLE) - .withDescription( - Description.builder() - .text("Split assigner type that determine how splits are assigned to readers.") - .linebreak() - .list( - TextElement.text( - SplitAssignerType.SIMPLE - + ": simple assigner that doesn't provide any guarantee on order or locality.")) - .build()); -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/FlinkDynamicTableFactory.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/FlinkDynamicTableFactory.java deleted file mode 100644 index b7f1be4b93fb..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/FlinkDynamicTableFactory.java +++ /dev/null @@ -1,208 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink; - -import java.util.Map; -import java.util.Set; -import org.apache.flink.configuration.ConfigOption; -import org.apache.flink.configuration.ConfigOptions; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.table.api.TableSchema; -import org.apache.flink.table.catalog.CatalogDatabaseImpl; -import org.apache.flink.table.catalog.ObjectIdentifier; -import org.apache.flink.table.catalog.ObjectPath; -import org.apache.flink.table.catalog.ResolvedCatalogTable; -import org.apache.flink.table.catalog.exceptions.DatabaseAlreadyExistException; -import org.apache.flink.table.catalog.exceptions.TableAlreadyExistException; -import org.apache.flink.table.connector.sink.DynamicTableSink; -import org.apache.flink.table.connector.source.DynamicTableSource; -import org.apache.flink.table.factories.DynamicTableSinkFactory; -import org.apache.flink.table.factories.DynamicTableSourceFactory; -import org.apache.flink.table.utils.TableSchemaUtils; -import org.apache.flink.util.Preconditions; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.exceptions.AlreadyExistsException; -import org.apache.iceberg.flink.source.IcebergTableSource; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; - -public class FlinkDynamicTableFactory - implements DynamicTableSinkFactory, DynamicTableSourceFactory { - static final String FACTORY_IDENTIFIER = "iceberg"; - - private static final ConfigOption CATALOG_NAME = - ConfigOptions.key("catalog-name") - .stringType() - .noDefaultValue() - .withDescription("Catalog name"); - - private static final ConfigOption CATALOG_TYPE = - ConfigOptions.key(FlinkCatalogFactory.ICEBERG_CATALOG_TYPE) - .stringType() - .noDefaultValue() - .withDescription("Catalog type, the optional types are: custom, hadoop, hive."); - - private static final ConfigOption CATALOG_DATABASE = - ConfigOptions.key("catalog-database") - .stringType() - .defaultValue(FlinkCatalogFactory.DEFAULT_DATABASE_NAME) - .withDescription("Database name managed in the iceberg catalog."); - - private static final ConfigOption CATALOG_TABLE = - ConfigOptions.key("catalog-table") - .stringType() - .noDefaultValue() - .withDescription("Table name managed in the underlying iceberg catalog and database."); - - private final FlinkCatalog catalog; - - public FlinkDynamicTableFactory() { - this.catalog = null; - } - - public FlinkDynamicTableFactory(FlinkCatalog catalog) { - this.catalog = catalog; - } - - @Override - public DynamicTableSource createDynamicTableSource(Context context) { - ObjectIdentifier objectIdentifier = context.getObjectIdentifier(); - ResolvedCatalogTable resolvedCatalogTable = context.getCatalogTable(); - Map tableProps = resolvedCatalogTable.getOptions(); - TableSchema tableSchema = TableSchemaUtils.getPhysicalSchema(resolvedCatalogTable.getSchema()); - - TableLoader tableLoader; - if (catalog != null) { - tableLoader = createTableLoader(catalog, objectIdentifier.toObjectPath()); - } else { - tableLoader = - createTableLoader( - resolvedCatalogTable, - tableProps, - objectIdentifier.getDatabaseName(), - objectIdentifier.getObjectName()); - } - - return new IcebergTableSource(tableLoader, tableSchema, tableProps, context.getConfiguration()); - } - - @Override - public DynamicTableSink createDynamicTableSink(Context context) { - ObjectIdentifier objectIdentifier = context.getObjectIdentifier(); - ResolvedCatalogTable resolvedCatalogTable = 
context.getCatalogTable(); - Map writeProps = resolvedCatalogTable.getOptions(); - TableSchema tableSchema = TableSchemaUtils.getPhysicalSchema(resolvedCatalogTable.getSchema()); - - TableLoader tableLoader; - if (catalog != null) { - tableLoader = createTableLoader(catalog, objectIdentifier.toObjectPath()); - } else { - tableLoader = - createTableLoader( - resolvedCatalogTable, - writeProps, - objectIdentifier.getDatabaseName(), - objectIdentifier.getObjectName()); - } - - return new IcebergTableSink(tableLoader, tableSchema, context.getConfiguration(), writeProps); - } - - @Override - public Set> requiredOptions() { - Set> options = Sets.newHashSet(); - options.add(CATALOG_TYPE); - options.add(CATALOG_NAME); - return options; - } - - @Override - public Set> optionalOptions() { - Set> options = Sets.newHashSet(); - options.add(CATALOG_DATABASE); - options.add(CATALOG_TABLE); - return options; - } - - @Override - public String factoryIdentifier() { - return FACTORY_IDENTIFIER; - } - - private static TableLoader createTableLoader( - ResolvedCatalogTable resolvedCatalogTable, - Map tableProps, - String databaseName, - String tableName) { - Configuration flinkConf = new Configuration(); - tableProps.forEach(flinkConf::setString); - - String catalogName = flinkConf.getString(CATALOG_NAME); - Preconditions.checkNotNull( - catalogName, "Table property '%s' cannot be null", CATALOG_NAME.key()); - - String catalogDatabase = flinkConf.getString(CATALOG_DATABASE, databaseName); - Preconditions.checkNotNull(catalogDatabase, "The iceberg database name cannot be null"); - - String catalogTable = flinkConf.getString(CATALOG_TABLE, tableName); - Preconditions.checkNotNull(catalogTable, "The iceberg table name cannot be null"); - - org.apache.hadoop.conf.Configuration hadoopConf = FlinkCatalogFactory.clusterHadoopConf(); - FlinkCatalogFactory factory = new FlinkCatalogFactory(); - FlinkCatalog flinkCatalog = - (FlinkCatalog) factory.createCatalog(catalogName, tableProps, hadoopConf); - ObjectPath objectPath = new ObjectPath(catalogDatabase, catalogTable); - - // Create database if not exists in the external catalog. - if (!flinkCatalog.databaseExists(catalogDatabase)) { - try { - flinkCatalog.createDatabase( - catalogDatabase, new CatalogDatabaseImpl(Maps.newHashMap(), null), true); - } catch (DatabaseAlreadyExistException e) { - throw new AlreadyExistsException( - e, - "Database %s already exists in the iceberg catalog %s.", - catalogName, - catalogDatabase); - } - } - - // Create table if not exists in the external catalog. 
- if (!flinkCatalog.tableExists(objectPath)) { - try { - flinkCatalog.createIcebergTable(objectPath, resolvedCatalogTable, true); - } catch (TableAlreadyExistException e) { - throw new AlreadyExistsException( - e, - "Table %s already exists in the database %s and catalog %s", - catalogTable, - catalogDatabase, - catalogName); - } - } - - return TableLoader.fromCatalog( - flinkCatalog.getCatalogLoader(), TableIdentifier.of(catalogDatabase, catalogTable)); - } - - private static TableLoader createTableLoader(FlinkCatalog catalog, ObjectPath objectPath) { - Preconditions.checkNotNull(catalog, "Flink catalog cannot be null"); - return TableLoader.fromCatalog(catalog.getCatalogLoader(), catalog.toIdentifier(objectPath)); - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/FlinkEnvironmentContext.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/FlinkEnvironmentContext.java deleted file mode 100644 index f35bb577fbba..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/FlinkEnvironmentContext.java +++ /dev/null @@ -1,31 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import org.apache.iceberg.EnvironmentContext; -import org.apache.iceberg.flink.util.FlinkPackage; - -class FlinkEnvironmentContext { - private FlinkEnvironmentContext() {} - - public static void init() { - EnvironmentContext.put(EnvironmentContext.ENGINE_NAME, "flink"); - EnvironmentContext.put(EnvironmentContext.ENGINE_VERSION, FlinkPackage.version()); - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/FlinkFilters.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/FlinkFilters.java deleted file mode 100644 index f2244d5137a1..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/FlinkFilters.java +++ /dev/null @@ -1,266 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
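The factory removed above is selected by the 'iceberg' connector identifier and configured through the catalog-name, catalog-type, catalog-database and catalog-table options it declares. A hedged usage sketch follows; the catalog, database, table names and warehouse path are placeholders, and the warehouse property is assumed to be forwarded to the underlying catalog.

import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.TableEnvironment;

// Illustrative sketch: declares a Flink table that is resolved through the
// removed FlinkDynamicTableFactory. All names and paths are placeholders.
public class IcebergConnectorTableExample {
  public static void main(String[] args) {
    TableEnvironment tEnv =
        TableEnvironment.create(EnvironmentSettings.newInstance().inBatchMode().build());

    tEnv.executeSql(
        "CREATE TABLE flink_sample (id BIGINT, data STRING) WITH ("
            + " 'connector' = 'iceberg',"
            + " 'catalog-name' = 'hadoop_prod',"
            + " 'catalog-type' = 'hadoop',"
            + " 'catalog-database' = 'db',"
            + " 'catalog-table' = 'sample',"
            + " 'warehouse' = 'file:///tmp/iceberg-warehouse'"
            + ")");

    tEnv.executeSql("INSERT INTO flink_sample VALUES (1, 'a'), (2, 'b')");
  }
}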
- */ -package org.apache.iceberg.flink; - -import java.time.Instant; -import java.time.LocalDate; -import java.time.LocalDateTime; -import java.time.LocalTime; -import java.util.List; -import java.util.Map; -import java.util.Optional; -import java.util.function.BiFunction; -import java.util.regex.Matcher; -import java.util.regex.Pattern; -import org.apache.flink.table.expressions.CallExpression; -import org.apache.flink.table.expressions.FieldReferenceExpression; -import org.apache.flink.table.expressions.ResolvedExpression; -import org.apache.flink.table.expressions.ValueLiteralExpression; -import org.apache.flink.table.functions.BuiltInFunctionDefinitions; -import org.apache.flink.table.functions.FunctionDefinition; -import org.apache.iceberg.expressions.Expression; -import org.apache.iceberg.expressions.Expression.Operation; -import org.apache.iceberg.expressions.Expressions; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.util.DateTimeUtil; -import org.apache.iceberg.util.NaNUtil; - -public class FlinkFilters { - private FlinkFilters() {} - - private static final Pattern STARTS_WITH_PATTERN = Pattern.compile("([^%]+)%"); - - private static final Map FILTERS = - ImmutableMap.builder() - .put(BuiltInFunctionDefinitions.EQUALS, Operation.EQ) - .put(BuiltInFunctionDefinitions.NOT_EQUALS, Operation.NOT_EQ) - .put(BuiltInFunctionDefinitions.GREATER_THAN, Operation.GT) - .put(BuiltInFunctionDefinitions.GREATER_THAN_OR_EQUAL, Operation.GT_EQ) - .put(BuiltInFunctionDefinitions.LESS_THAN, Operation.LT) - .put(BuiltInFunctionDefinitions.LESS_THAN_OR_EQUAL, Operation.LT_EQ) - .put(BuiltInFunctionDefinitions.IS_NULL, Operation.IS_NULL) - .put(BuiltInFunctionDefinitions.IS_NOT_NULL, Operation.NOT_NULL) - .put(BuiltInFunctionDefinitions.AND, Operation.AND) - .put(BuiltInFunctionDefinitions.OR, Operation.OR) - .put(BuiltInFunctionDefinitions.NOT, Operation.NOT) - .put(BuiltInFunctionDefinitions.LIKE, Operation.STARTS_WITH) - .buildOrThrow(); - - /** - * Convert flink expression to iceberg expression. - * - *
    the BETWEEN, NOT_BETWEEN, IN expression will be converted by flink automatically. the - * BETWEEN will be converted to (GT_EQ AND LT_EQ), the NOT_BETWEEN will be converted to (LT_EQ OR - * GT_EQ), the IN will be converted to OR, so we do not add the conversion here - * - * @param flinkExpression the flink expression - * @return the iceberg expression - */ - public static Optional convert( - org.apache.flink.table.expressions.Expression flinkExpression) { - if (!(flinkExpression instanceof CallExpression)) { - return Optional.empty(); - } - - CallExpression call = (CallExpression) flinkExpression; - Operation op = FILTERS.get(call.getFunctionDefinition()); - if (op != null) { - switch (op) { - case IS_NULL: - return onlyChildAs(call, FieldReferenceExpression.class) - .map(FieldReferenceExpression::getName) - .map(Expressions::isNull); - - case NOT_NULL: - return onlyChildAs(call, FieldReferenceExpression.class) - .map(FieldReferenceExpression::getName) - .map(Expressions::notNull); - - case LT: - return convertFieldAndLiteral(Expressions::lessThan, Expressions::greaterThan, call); - - case LT_EQ: - return convertFieldAndLiteral( - Expressions::lessThanOrEqual, Expressions::greaterThanOrEqual, call); - - case GT: - return convertFieldAndLiteral(Expressions::greaterThan, Expressions::lessThan, call); - - case GT_EQ: - return convertFieldAndLiteral( - Expressions::greaterThanOrEqual, Expressions::lessThanOrEqual, call); - - case EQ: - return convertFieldAndLiteral( - (ref, lit) -> { - if (NaNUtil.isNaN(lit)) { - return Expressions.isNaN(ref); - } else { - return Expressions.equal(ref, lit); - } - }, - call); - - case NOT_EQ: - return convertFieldAndLiteral( - (ref, lit) -> { - if (NaNUtil.isNaN(lit)) { - return Expressions.notNaN(ref); - } else { - return Expressions.notEqual(ref, lit); - } - }, - call); - - case NOT: - return onlyChildAs(call, CallExpression.class) - .flatMap(FlinkFilters::convert) - .map(Expressions::not); - - case AND: - return convertLogicExpression(Expressions::and, call); - - case OR: - return convertLogicExpression(Expressions::or, call); - - case STARTS_WITH: - return convertLike(call); - } - } - - return Optional.empty(); - } - - private static Optional onlyChildAs( - CallExpression call, Class expectedChildClass) { - List children = call.getResolvedChildren(); - if (children.size() != 1) { - return Optional.empty(); - } - - ResolvedExpression child = children.get(0); - if (!expectedChildClass.isInstance(child)) { - return Optional.empty(); - } - - return Optional.of(expectedChildClass.cast(child)); - } - - private static Optional convertLike(CallExpression call) { - List args = call.getResolvedChildren(); - if (args.size() != 2) { - return Optional.empty(); - } - - org.apache.flink.table.expressions.Expression left = args.get(0); - org.apache.flink.table.expressions.Expression right = args.get(1); - - if (left instanceof FieldReferenceExpression && right instanceof ValueLiteralExpression) { - String name = ((FieldReferenceExpression) left).getName(); - return convertLiteral((ValueLiteralExpression) right) - .flatMap( - lit -> { - if (lit instanceof String) { - String pattern = (String) lit; - Matcher matcher = STARTS_WITH_PATTERN.matcher(pattern); - // exclude special char of LIKE - // '_' is the wildcard of the SQL LIKE - if (!pattern.contains("_") && matcher.matches()) { - return Optional.of(Expressions.startsWith(name, matcher.group(1))); - } - } - - return Optional.empty(); - }); - } - - return Optional.empty(); - } - - private static Optional 
convertLogicExpression( - BiFunction function, CallExpression call) { - List args = call.getResolvedChildren(); - if (args == null || args.size() != 2) { - return Optional.empty(); - } - - Optional left = convert(args.get(0)); - Optional right = convert(args.get(1)); - if (left.isPresent() && right.isPresent()) { - return Optional.of(function.apply(left.get(), right.get())); - } - - return Optional.empty(); - } - - private static Optional convertLiteral(ValueLiteralExpression expression) { - Optional value = - expression.getValueAs( - expression.getOutputDataType().getLogicalType().getDefaultConversion()); - return value.map( - o -> { - if (o instanceof LocalDateTime) { - return DateTimeUtil.microsFromTimestamp((LocalDateTime) o); - } else if (o instanceof Instant) { - return DateTimeUtil.microsFromInstant((Instant) o); - } else if (o instanceof LocalTime) { - return DateTimeUtil.microsFromTime((LocalTime) o); - } else if (o instanceof LocalDate) { - return DateTimeUtil.daysFromDate((LocalDate) o); - } - - return o; - }); - } - - private static Optional convertFieldAndLiteral( - BiFunction expr, CallExpression call) { - return convertFieldAndLiteral(expr, expr, call); - } - - private static Optional convertFieldAndLiteral( - BiFunction convertLR, - BiFunction convertRL, - CallExpression call) { - List args = call.getResolvedChildren(); - if (args.size() != 2) { - return Optional.empty(); - } - - org.apache.flink.table.expressions.Expression left = args.get(0); - org.apache.flink.table.expressions.Expression right = args.get(1); - - if (left instanceof FieldReferenceExpression && right instanceof ValueLiteralExpression) { - String name = ((FieldReferenceExpression) left).getName(); - Optional lit = convertLiteral((ValueLiteralExpression) right); - if (lit.isPresent()) { - return Optional.of(convertLR.apply(name, lit.get())); - } - } else if (left instanceof ValueLiteralExpression - && right instanceof FieldReferenceExpression) { - Optional lit = convertLiteral((ValueLiteralExpression) left); - String name = ((FieldReferenceExpression) right).getName(); - if (lit.isPresent()) { - return Optional.of(convertRL.apply(name, lit.get())); - } - } - - return Optional.empty(); - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/FlinkFixupTypes.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/FlinkFixupTypes.java deleted file mode 100644 index 767d4497ac91..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/FlinkFixupTypes.java +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
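One detail in the conversion above deserves a note: an EQUALS or NOT_EQUALS call whose literal is NaN is turned into isNaN/notNaN rather than an equality predicate. A small sketch of that rule, reusing the Iceberg expression API the removed class already depends on; the column name is illustrative.

import org.apache.iceberg.expressions.Expression;
import org.apache.iceberg.expressions.Expressions;
import org.apache.iceberg.util.NaNUtil;

// Sketch of the NaN special-casing applied by the removed FlinkFilters
// when converting EQUALS / NOT_EQUALS calls.
public class NaNPredicateSketch {

  static Expression eq(String column, Object literal) {
    // NaN never compares equal, so an explicit is_nan predicate is used instead.
    return NaNUtil.isNaN(literal) ? Expressions.isNaN(column) : Expressions.equal(column, literal);
  }

  static Expression notEq(String column, Object literal) {
    return NaNUtil.isNaN(literal)
        ? Expressions.notNaN(column)
        : Expressions.notEqual(column, literal);
  }

  public static void main(String[] args) {
    System.out.println(eq("measurement", Double.NaN)); // an is_nan predicate
    System.out.println(eq("measurement", 1.5D));       // a plain equality predicate
  }
}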
- */ -package org.apache.iceberg.flink; - -import org.apache.iceberg.Schema; -import org.apache.iceberg.types.FixupTypes; -import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.TypeUtil; -import org.apache.iceberg.types.Types; - -/** - * The uuid and fixed are converted to the same Flink type. Conversion back can produce only one, - * which may not be correct. - */ -class FlinkFixupTypes extends FixupTypes { - - private FlinkFixupTypes(Schema referenceSchema) { - super(referenceSchema); - } - - static Schema fixup(Schema schema, Schema referenceSchema) { - return new Schema( - TypeUtil.visit(schema, new FlinkFixupTypes(referenceSchema)).asStructType().fields()); - } - - @Override - protected boolean fixupPrimitive(Type.PrimitiveType type, Type source) { - if (type instanceof Types.FixedType) { - int length = ((Types.FixedType) type).length(); - return source.typeId() == Type.TypeID.UUID && length == 16; - } - return false; - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/FlinkReadConf.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/FlinkReadConf.java deleted file mode 100644 index 804a956ec9b9..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/FlinkReadConf.java +++ /dev/null @@ -1,213 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink; - -import java.time.Duration; -import java.util.Map; -import java.util.concurrent.TimeUnit; -import org.apache.flink.configuration.ReadableConfig; -import org.apache.flink.util.TimeUtils; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.flink.source.StreamingStartingStrategy; - -public class FlinkReadConf { - - private final FlinkConfParser confParser; - - public FlinkReadConf( - Table table, Map readOptions, ReadableConfig readableConfig) { - this.confParser = new FlinkConfParser(table, readOptions, readableConfig); - } - - public Long snapshotId() { - return confParser.longConf().option(FlinkReadOptions.SNAPSHOT_ID.key()).parseOptional(); - } - - public String tag() { - return confParser.stringConf().option(FlinkReadOptions.TAG.key()).parseOptional(); - } - - public String startTag() { - return confParser.stringConf().option(FlinkReadOptions.START_TAG.key()).parseOptional(); - } - - public String endTag() { - return confParser.stringConf().option(FlinkReadOptions.END_TAG.key()).parseOptional(); - } - - public String branch() { - return confParser.stringConf().option(FlinkReadOptions.BRANCH.key()).parseOptional(); - } - - public boolean caseSensitive() { - return confParser - .booleanConf() - .option(FlinkReadOptions.CASE_SENSITIVE) - .flinkConfig(FlinkReadOptions.CASE_SENSITIVE_OPTION) - .defaultValue(FlinkReadOptions.CASE_SENSITIVE_OPTION.defaultValue()) - .parse(); - } - - public Long asOfTimestamp() { - return confParser.longConf().option(FlinkReadOptions.AS_OF_TIMESTAMP.key()).parseOptional(); - } - - public StreamingStartingStrategy startingStrategy() { - return confParser - .enumConfParser(StreamingStartingStrategy.class) - .option(FlinkReadOptions.STARTING_STRATEGY) - .flinkConfig(FlinkReadOptions.STARTING_STRATEGY_OPTION) - .defaultValue(StreamingStartingStrategy.INCREMENTAL_FROM_LATEST_SNAPSHOT) - .parse(); - } - - public Long startSnapshotTimestamp() { - return confParser - .longConf() - .option(FlinkReadOptions.START_SNAPSHOT_TIMESTAMP.key()) - .parseOptional(); - } - - public Long startSnapshotId() { - return confParser.longConf().option(FlinkReadOptions.START_SNAPSHOT_ID.key()).parseOptional(); - } - - public Long endSnapshotId() { - return confParser.longConf().option(FlinkReadOptions.END_SNAPSHOT_ID.key()).parseOptional(); - } - - public long splitSize() { - return confParser - .longConf() - .option(FlinkReadOptions.SPLIT_SIZE) - .flinkConfig(FlinkReadOptions.SPLIT_SIZE_OPTION) - .tableProperty(TableProperties.SPLIT_SIZE) - .defaultValue(TableProperties.SPLIT_SIZE_DEFAULT) - .parse(); - } - - public int splitLookback() { - return confParser - .intConf() - .option(FlinkReadOptions.SPLIT_LOOKBACK) - .flinkConfig(FlinkReadOptions.SPLIT_LOOKBACK_OPTION) - .tableProperty(TableProperties.SPLIT_LOOKBACK) - .defaultValue(TableProperties.SPLIT_LOOKBACK_DEFAULT) - .parse(); - } - - public long splitFileOpenCost() { - return confParser - .longConf() - .option(FlinkReadOptions.SPLIT_FILE_OPEN_COST) - .flinkConfig(FlinkReadOptions.SPLIT_FILE_OPEN_COST_OPTION) - .tableProperty(TableProperties.SPLIT_OPEN_FILE_COST) - .defaultValue(TableProperties.SPLIT_OPEN_FILE_COST_DEFAULT) - .parse(); - } - - public boolean streaming() { - return confParser - .booleanConf() - .option(FlinkReadOptions.STREAMING) - .flinkConfig(FlinkReadOptions.STREAMING_OPTION) - .defaultValue(FlinkReadOptions.STREAMING_OPTION.defaultValue()) - .parse(); - } - - public Duration monitorInterval() { - String duration = - 
confParser - .stringConf() - .option(FlinkReadOptions.MONITOR_INTERVAL) - .flinkConfig(FlinkReadOptions.MONITOR_INTERVAL_OPTION) - .defaultValue(FlinkReadOptions.MONITOR_INTERVAL_OPTION.defaultValue()) - .parse(); - - return TimeUtils.parseDuration(duration); - } - - public boolean includeColumnStats() { - return confParser - .booleanConf() - .option(FlinkReadOptions.INCLUDE_COLUMN_STATS) - .flinkConfig(FlinkReadOptions.INCLUDE_COLUMN_STATS_OPTION) - .defaultValue(FlinkReadOptions.INCLUDE_COLUMN_STATS_OPTION.defaultValue()) - .parse(); - } - - public int maxPlanningSnapshotCount() { - return confParser - .intConf() - .option(FlinkReadOptions.MAX_PLANNING_SNAPSHOT_COUNT) - .flinkConfig(FlinkReadOptions.MAX_PLANNING_SNAPSHOT_COUNT_OPTION) - .defaultValue(FlinkReadOptions.MAX_PLANNING_SNAPSHOT_COUNT_OPTION.defaultValue()) - .parse(); - } - - public String nameMapping() { - return confParser.stringConf().option(TableProperties.DEFAULT_NAME_MAPPING).parseOptional(); - } - - public long limit() { - return confParser - .longConf() - .option(FlinkReadOptions.LIMIT) - .flinkConfig(FlinkReadOptions.LIMIT_OPTION) - .defaultValue(FlinkReadOptions.LIMIT_OPTION.defaultValue()) - .parse(); - } - - public int workerPoolSize() { - return confParser - .intConf() - .option(FlinkConfigOptions.TABLE_EXEC_ICEBERG_WORKER_POOL_SIZE.key()) - .flinkConfig(FlinkConfigOptions.TABLE_EXEC_ICEBERG_WORKER_POOL_SIZE) - .defaultValue(FlinkConfigOptions.TABLE_EXEC_ICEBERG_WORKER_POOL_SIZE.defaultValue()) - .parse(); - } - - public int maxAllowedPlanningFailures() { - return confParser - .intConf() - .option(FlinkReadOptions.MAX_ALLOWED_PLANNING_FAILURES) - .flinkConfig(FlinkReadOptions.MAX_ALLOWED_PLANNING_FAILURES_OPTION) - .defaultValue(FlinkReadOptions.MAX_ALLOWED_PLANNING_FAILURES_OPTION.defaultValue()) - .parse(); - } - - public String watermarkColumn() { - return confParser - .stringConf() - .option(FlinkReadOptions.WATERMARK_COLUMN) - .flinkConfig(FlinkReadOptions.WATERMARK_COLUMN_OPTION) - .defaultValue(FlinkReadOptions.WATERMARK_COLUMN_OPTION.defaultValue()) - .parseOptional(); - } - - public TimeUnit watermarkColumnTimeUnit() { - return confParser - .enumConfParser(TimeUnit.class) - .option(FlinkReadOptions.WATERMARK_COLUMN_TIME_UNIT) - .flinkConfig(FlinkReadOptions.WATERMARK_COLUMN_TIME_UNIT_OPTION) - .defaultValue(FlinkReadOptions.WATERMARK_COLUMN_TIME_UNIT_OPTION.defaultValue()) - .parse(); - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/FlinkReadOptions.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/FlinkReadOptions.java deleted file mode 100644 index 1bbd88146c8f..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/FlinkReadOptions.java +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. 
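The option names parsed above ('streaming', 'monitor-interval', and so on) are plain strings precisely so they can also be supplied per query. A hedged example using Flink's dynamic table options hint; the catalog/database/table path is a placeholder, and the hint assumes dynamic table options are enabled in the session.

import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.TableEnvironment;

// Illustrative sketch: per-query read options that end up in the removed FlinkReadConf.
public class ReadOptionsHintExample {
  public static void main(String[] args) {
    TableEnvironment tEnv =
        TableEnvironment.create(EnvironmentSettings.newInstance().inStreamingMode().build());

    tEnv.executeSql(
            "SELECT * FROM iceberg_catalog.db.events "
                + "/*+ OPTIONS('streaming'='true', 'monitor-interval'='30s') */")
        .print();
  }
}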
See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import java.util.concurrent.TimeUnit; -import org.apache.flink.configuration.ConfigOption; -import org.apache.flink.configuration.ConfigOptions; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.flink.source.StreamingStartingStrategy; - -/** Flink source read options */ -public class FlinkReadOptions { - private static final String PREFIX = "connector.iceberg."; - - private FlinkReadOptions() {} - - public static final ConfigOption SNAPSHOT_ID = - ConfigOptions.key("snapshot-id").longType().defaultValue(null); - - public static final ConfigOption TAG = - ConfigOptions.key("tag").stringType().defaultValue(null); - - public static final ConfigOption BRANCH = - ConfigOptions.key("branch").stringType().defaultValue(null); - - public static final ConfigOption START_TAG = - ConfigOptions.key("start-tag").stringType().defaultValue(null); - - public static final ConfigOption END_TAG = - ConfigOptions.key("end-tag").stringType().defaultValue(null); - - public static final String CASE_SENSITIVE = "case-sensitive"; - public static final ConfigOption CASE_SENSITIVE_OPTION = - ConfigOptions.key(PREFIX + CASE_SENSITIVE).booleanType().defaultValue(false); - - public static final ConfigOption AS_OF_TIMESTAMP = - ConfigOptions.key("as-of-timestamp").longType().defaultValue(null); - - public static final String STARTING_STRATEGY = "starting-strategy"; - public static final ConfigOption STARTING_STRATEGY_OPTION = - ConfigOptions.key(PREFIX + STARTING_STRATEGY) - .enumType(StreamingStartingStrategy.class) - .defaultValue(StreamingStartingStrategy.INCREMENTAL_FROM_LATEST_SNAPSHOT); - - public static final ConfigOption START_SNAPSHOT_TIMESTAMP = - ConfigOptions.key("start-snapshot-timestamp").longType().defaultValue(null); - - public static final ConfigOption START_SNAPSHOT_ID = - ConfigOptions.key("start-snapshot-id").longType().defaultValue(null); - - public static final ConfigOption END_SNAPSHOT_ID = - ConfigOptions.key("end-snapshot-id").longType().defaultValue(null); - - public static final String SPLIT_SIZE = "split-size"; - public static final ConfigOption SPLIT_SIZE_OPTION = - ConfigOptions.key(PREFIX + SPLIT_SIZE) - .longType() - .defaultValue(TableProperties.SPLIT_SIZE_DEFAULT); - - public static final String SPLIT_LOOKBACK = "split-lookback"; - public static final ConfigOption SPLIT_LOOKBACK_OPTION = - ConfigOptions.key(PREFIX + SPLIT_LOOKBACK) - .intType() - .defaultValue(TableProperties.SPLIT_LOOKBACK_DEFAULT); - - public static final String SPLIT_FILE_OPEN_COST = "split-file-open-cost"; - public static final ConfigOption SPLIT_FILE_OPEN_COST_OPTION = - ConfigOptions.key(PREFIX + SPLIT_FILE_OPEN_COST) - .longType() - .defaultValue(TableProperties.SPLIT_OPEN_FILE_COST_DEFAULT); - - public static final String STREAMING = "streaming"; - public static final ConfigOption STREAMING_OPTION = - ConfigOptions.key(PREFIX + STREAMING).booleanType().defaultValue(false); - - public static final String MONITOR_INTERVAL = "monitor-interval"; - public static final ConfigOption MONITOR_INTERVAL_OPTION = - ConfigOptions.key(PREFIX + MONITOR_INTERVAL).stringType().defaultValue("60s"); - - public static final String INCLUDE_COLUMN_STATS = "include-column-stats"; - public static final ConfigOption INCLUDE_COLUMN_STATS_OPTION = - ConfigOptions.key(PREFIX + INCLUDE_COLUMN_STATS).booleanType().defaultValue(false); - - public static final String 
MAX_PLANNING_SNAPSHOT_COUNT = "max-planning-snapshot-count"; - public static final ConfigOption MAX_PLANNING_SNAPSHOT_COUNT_OPTION = - ConfigOptions.key(PREFIX + MAX_PLANNING_SNAPSHOT_COUNT) - .intType() - .defaultValue(Integer.MAX_VALUE); - - public static final String LIMIT = "limit"; - public static final ConfigOption LIMIT_OPTION = - ConfigOptions.key(PREFIX + LIMIT).longType().defaultValue(-1L); - - public static final String MAX_ALLOWED_PLANNING_FAILURES = "max-allowed-planning-failures"; - public static final ConfigOption MAX_ALLOWED_PLANNING_FAILURES_OPTION = - ConfigOptions.key(PREFIX + MAX_ALLOWED_PLANNING_FAILURES).intType().defaultValue(3); - - public static final String WATERMARK_COLUMN = "watermark-column"; - public static final ConfigOption WATERMARK_COLUMN_OPTION = - ConfigOptions.key(PREFIX + WATERMARK_COLUMN).stringType().noDefaultValue(); - - public static final String WATERMARK_COLUMN_TIME_UNIT = "watermark-column-time-unit"; - public static final ConfigOption WATERMARK_COLUMN_TIME_UNIT_OPTION = - ConfigOptions.key(PREFIX + WATERMARK_COLUMN_TIME_UNIT) - .enumType(TimeUnit.class) - .defaultValue(TimeUnit.MICROSECONDS); -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/FlinkSchemaUtil.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/FlinkSchemaUtil.java deleted file mode 100644 index 4790dc85bf28..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/FlinkSchemaUtil.java +++ /dev/null @@ -1,232 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import java.util.List; -import java.util.Set; -import org.apache.flink.table.api.DataTypes; -import org.apache.flink.table.api.TableSchema; -import org.apache.flink.table.catalog.Column; -import org.apache.flink.table.catalog.ResolvedSchema; -import org.apache.flink.table.types.logical.LogicalType; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.table.types.utils.TypeConversions; -import org.apache.iceberg.Schema; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.TypeUtil; -import org.apache.iceberg.types.Types; - -/** - * Converter between Flink types and Iceberg type. The conversion is not a 1:1 mapping that not - * allows back-and-forth conversion. So some information might get lost during the back-and-forth - * conversion. - * - *
- * <p>This inconsistent types:
- *
- * <ul>
- *   <li>map Iceberg UUID type to Flink BinaryType(16)
- *   <li>map Flink VarCharType(_) and CharType(_) to Iceberg String type
- *   <li>map Flink VarBinaryType(_) to Iceberg Binary type
- *   <li>map Flink TimeType(_) to Iceberg Time type (microseconds)
- *   <li>map Flink TimestampType(_) to Iceberg Timestamp without zone type (microseconds)
- *   <li>map Flink LocalZonedTimestampType(_) to Iceberg Timestamp with zone type (microseconds)
- *   <li>map Flink MultiSetType to Iceberg Map type(element, int)
- * </ul>
- *
    - */ -public class FlinkSchemaUtil { - - private FlinkSchemaUtil() {} - - /** @deprecated Use {@link #convert(ResolvedSchema)} instead. */ - @Deprecated - public static Schema convert(TableSchema schema) { - LogicalType schemaType = schema.toRowDataType().getLogicalType(); - Preconditions.checkArgument( - schemaType instanceof RowType, "Schema logical type should be row type."); - - RowType root = (RowType) schemaType; - Type converted = root.accept(new FlinkTypeToType(root)); - - Schema icebergSchema = new Schema(converted.asStructType().fields()); - if (schema.getPrimaryKey().isPresent()) { - return freshIdentifierFieldIds(icebergSchema, schema.getPrimaryKey().get().getColumns()); - } else { - return icebergSchema; - } - } - - /** Convert the flink table schema to apache iceberg schema with column comment. */ - public static Schema convert(ResolvedSchema flinkSchema) { - List tableColumns = flinkSchema.getColumns(); - // copy from org.apache.flink.table.api.Schema#toRowDataType - DataTypes.Field[] fields = - tableColumns.stream() - .map( - column -> { - if (column.getComment().isPresent()) { - return DataTypes.FIELD( - column.getName(), column.getDataType(), column.getComment().get()); - } else { - return DataTypes.FIELD(column.getName(), column.getDataType()); - } - }) - .toArray(DataTypes.Field[]::new); - - LogicalType schemaType = DataTypes.ROW(fields).notNull().getLogicalType(); - Preconditions.checkArgument( - schemaType instanceof RowType, "Schema logical type should be row type."); - - RowType root = (RowType) schemaType; - Type converted = root.accept(new FlinkTypeToType(root)); - Schema icebergSchema = new Schema(converted.asStructType().fields()); - if (flinkSchema.getPrimaryKey().isPresent()) { - return freshIdentifierFieldIds(icebergSchema, flinkSchema.getPrimaryKey().get().getColumns()); - } else { - return icebergSchema; - } - } - - private static Schema freshIdentifierFieldIds(Schema icebergSchema, List primaryKeys) { - // Locate the identifier field id list. - Set identifierFieldIds = Sets.newHashSet(); - for (String primaryKey : primaryKeys) { - Types.NestedField field = icebergSchema.findField(primaryKey); - Preconditions.checkNotNull( - field, - "Cannot find field ID for the primary key column %s in schema %s", - primaryKey, - icebergSchema); - identifierFieldIds.add(field.fieldId()); - } - return new Schema( - icebergSchema.schemaId(), icebergSchema.asStruct().fields(), identifierFieldIds); - } - - /** - * Convert a Flink {@link TableSchema} to a {@link Schema} based on the given schema. - * - *
    This conversion does not assign new ids; it uses ids from the base schema. - * - *
    Data types, field order, and nullability will match the Flink type. This conversion may - * return a schema that is not compatible with base schema. - * - * @param baseSchema a Schema on which conversion is based - * @param flinkSchema a Flink TableSchema - * @return the equivalent Schema - * @throws IllegalArgumentException if the type cannot be converted or there are missing ids - */ - public static Schema convert(Schema baseSchema, TableSchema flinkSchema) { - // convert to a type with fresh ids - Types.StructType struct = convert(flinkSchema).asStruct(); - // reassign ids to match the base schema - Schema schema = TypeUtil.reassignIds(new Schema(struct.fields()), baseSchema); - // reassign doc to match the base schema - schema = TypeUtil.reassignDoc(schema, baseSchema); - - // fix types that can't be represented in Flink (UUID) - Schema fixedSchema = FlinkFixupTypes.fixup(schema, baseSchema); - if (flinkSchema.getPrimaryKey().isPresent()) { - return freshIdentifierFieldIds(fixedSchema, flinkSchema.getPrimaryKey().get().getColumns()); - } else { - return fixedSchema; - } - } - - /** - * Convert a {@link Schema} to a {@link RowType Flink type}. - * - * @param schema a Schema - * @return the equivalent Flink type - * @throws IllegalArgumentException if the type cannot be converted to Flink - */ - public static RowType convert(Schema schema) { - return (RowType) TypeUtil.visit(schema, new TypeToFlinkType()); - } - - /** - * Convert a {@link Type} to a {@link LogicalType Flink type}. - * - * @param type a Type - * @return the equivalent Flink type - * @throws IllegalArgumentException if the type cannot be converted to Flink - */ - public static LogicalType convert(Type type) { - return TypeUtil.visit(type, new TypeToFlinkType()); - } - - /** - * Convert a {@link LogicalType Flink type} to a {@link Type}. - * - * @param flinkType a FlinkType - * @return the equivalent Iceberg type - */ - public static Type convert(LogicalType flinkType) { - return flinkType.accept(new FlinkTypeToType()); - } - - /** - * Convert a {@link RowType} to a {@link TableSchema}. - * - * @param rowType a RowType - * @return Flink TableSchema - */ - public static TableSchema toSchema(RowType rowType) { - TableSchema.Builder builder = TableSchema.builder(); - for (RowType.RowField field : rowType.getFields()) { - builder.field(field.getName(), TypeConversions.fromLogicalToDataType(field.getType())); - } - return builder.build(); - } - - /** - * Convert a {@link Schema} to a {@link TableSchema}. - * - * @param schema iceberg schema to convert. - * @return Flink TableSchema. - */ - public static TableSchema toSchema(Schema schema) { - TableSchema.Builder builder = TableSchema.builder(); - - // Add columns. - for (RowType.RowField field : convert(schema).getFields()) { - builder.field(field.getName(), TypeConversions.fromLogicalToDataType(field.getType())); - } - - // Add primary key. 
- Set identifierFieldIds = schema.identifierFieldIds(); - if (!identifierFieldIds.isEmpty()) { - List columns = Lists.newArrayListWithExpectedSize(identifierFieldIds.size()); - for (Integer identifierFieldId : identifierFieldIds) { - String columnName = schema.findColumnName(identifierFieldId); - Preconditions.checkNotNull( - columnName, "Cannot find field with id %s in schema %s", identifierFieldId, schema); - - columns.add(columnName); - } - builder.primaryKey(columns.toArray(new String[0])); - } - - return builder.build(); - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/FlinkSourceFilter.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/FlinkSourceFilter.java deleted file mode 100644 index 5fbd84909d69..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/FlinkSourceFilter.java +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import org.apache.flink.api.common.functions.FilterFunction; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.Schema; -import org.apache.iceberg.expressions.Evaluator; -import org.apache.iceberg.expressions.Expression; -import org.apache.iceberg.types.Types; - -public class FlinkSourceFilter implements FilterFunction { - - private final RowType rowType; - private final Evaluator evaluator; - private final Types.StructType struct; - private volatile RowDataWrapper wrapper; - - public FlinkSourceFilter(Schema schema, Expression expr, boolean caseSensitive) { - this.rowType = FlinkSchemaUtil.convert(schema); - this.struct = schema.asStruct(); - this.evaluator = new Evaluator(struct, expr, caseSensitive); - } - - @Override - public boolean filter(RowData value) { - if (wrapper == null) { - this.wrapper = new RowDataWrapper(rowType, struct); - } - return evaluator.eval(wrapper.wrap(value)); - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeToType.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeToType.java deleted file mode 100644 index 408065f06057..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeToType.java +++ /dev/null @@ -1,203 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
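Taken together, the overloads above allow round-tripping between Flink and Iceberg schema representations. A short sketch follows; field names are illustrative, and TableSchema is the deprecated Flink type these overloads still accept.

import org.apache.flink.table.api.DataTypes;
import org.apache.flink.table.api.TableSchema;
import org.apache.flink.table.types.logical.RowType;
import org.apache.iceberg.Schema;
import org.apache.iceberg.flink.FlinkSchemaUtil;

// Illustrative sketch: round-trips a simple schema through the converters shown above.
public class SchemaConversionSketch {
  public static void main(String[] args) {
    TableSchema flinkSchema =
        TableSchema.builder()
            .field("id", DataTypes.BIGINT().notNull())
            .field("data", DataTypes.STRING())
            .build();

    Schema icebergSchema = FlinkSchemaUtil.convert(flinkSchema); // Flink -> Iceberg
    RowType rowType = FlinkSchemaUtil.convert(icebergSchema);    // Iceberg -> Flink row type
    TableSchema back = FlinkSchemaUtil.toSchema(rowType);        // Flink row type -> TableSchema

    System.out.println(icebergSchema);
    System.out.println(back);
  }
}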
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import java.util.List; -import java.util.stream.Collectors; -import org.apache.flink.table.types.logical.ArrayType; -import org.apache.flink.table.types.logical.BigIntType; -import org.apache.flink.table.types.logical.BinaryType; -import org.apache.flink.table.types.logical.BooleanType; -import org.apache.flink.table.types.logical.CharType; -import org.apache.flink.table.types.logical.DateType; -import org.apache.flink.table.types.logical.DecimalType; -import org.apache.flink.table.types.logical.DoubleType; -import org.apache.flink.table.types.logical.FloatType; -import org.apache.flink.table.types.logical.IntType; -import org.apache.flink.table.types.logical.LocalZonedTimestampType; -import org.apache.flink.table.types.logical.MapType; -import org.apache.flink.table.types.logical.MultisetType; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.table.types.logical.SmallIntType; -import org.apache.flink.table.types.logical.TimeType; -import org.apache.flink.table.types.logical.TimestampType; -import org.apache.flink.table.types.logical.TinyIntType; -import org.apache.flink.table.types.logical.VarBinaryType; -import org.apache.flink.table.types.logical.VarCharType; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.Types; - -class FlinkTypeToType extends FlinkTypeVisitor { - - private final RowType root; - private int nextId; - - FlinkTypeToType() { - this.root = null; - } - - FlinkTypeToType(RowType root) { - this.root = root; - // the root struct's fields use the first ids - this.nextId = root.getFieldCount(); - } - - private int getNextId() { - int next = nextId; - nextId += 1; - return next; - } - - @Override - public Type visit(CharType charType) { - return Types.StringType.get(); - } - - @Override - public Type visit(VarCharType varCharType) { - return Types.StringType.get(); - } - - @Override - public Type visit(BooleanType booleanType) { - return Types.BooleanType.get(); - } - - @Override - public Type visit(BinaryType binaryType) { - return Types.FixedType.ofLength(binaryType.getLength()); - } - - @Override - public Type visit(VarBinaryType varBinaryType) { - return Types.BinaryType.get(); - } - - @Override - public Type visit(DecimalType decimalType) { - return Types.DecimalType.of(decimalType.getPrecision(), decimalType.getScale()); - } - - @Override - public Type visit(TinyIntType tinyIntType) { - return Types.IntegerType.get(); - } - - @Override - public Type visit(SmallIntType smallIntType) { - return Types.IntegerType.get(); - } - - @Override - public Type visit(IntType intType) { - return Types.IntegerType.get(); - } - - @Override - public Type visit(BigIntType bigIntType) { - return Types.LongType.get(); - } - - @Override - public Type visit(FloatType floatType) { - return Types.FloatType.get(); - } - - @Override - public Type visit(DoubleType doubleType) { - return Types.DoubleType.get(); - } - - @Override - public Type visit(DateType dateType) { - return Types.DateType.get(); - } - - @Override - 
public Type visit(TimeType timeType) { - return Types.TimeType.get(); - } - - @Override - public Type visit(TimestampType timestampType) { - return Types.TimestampType.withoutZone(); - } - - @Override - public Type visit(LocalZonedTimestampType localZonedTimestampType) { - return Types.TimestampType.withZone(); - } - - @Override - public Type visit(ArrayType arrayType) { - Type elementType = arrayType.getElementType().accept(this); - if (arrayType.getElementType().isNullable()) { - return Types.ListType.ofOptional(getNextId(), elementType); - } else { - return Types.ListType.ofRequired(getNextId(), elementType); - } - } - - @Override - public Type visit(MultisetType multisetType) { - Type elementType = multisetType.getElementType().accept(this); - return Types.MapType.ofRequired(getNextId(), getNextId(), elementType, Types.IntegerType.get()); - } - - @Override - public Type visit(MapType mapType) { - // keys in map are not allowed to be null. - Type keyType = mapType.getKeyType().accept(this); - Type valueType = mapType.getValueType().accept(this); - if (mapType.getValueType().isNullable()) { - return Types.MapType.ofOptional(getNextId(), getNextId(), keyType, valueType); - } else { - return Types.MapType.ofRequired(getNextId(), getNextId(), keyType, valueType); - } - } - - @Override - @SuppressWarnings("ReferenceEquality") - public Type visit(RowType rowType) { - List newFields = Lists.newArrayListWithExpectedSize(rowType.getFieldCount()); - boolean isRoot = root == rowType; - - List types = - rowType.getFields().stream() - .map(f -> f.getType().accept(this)) - .collect(Collectors.toList()); - - for (int i = 0; i < rowType.getFieldCount(); i++) { - int id = isRoot ? i : getNextId(); - - RowType.RowField field = rowType.getFields().get(i); - String name = field.getName(); - String comment = field.getDescription().orElse(null); - - if (field.getType().isNullable()) { - newFields.add(Types.NestedField.optional(id, name, types.get(i), comment)); - } else { - newFields.add(Types.NestedField.required(id, name, types.get(i), comment)); - } - } - - return Types.StructType.of(newFields); - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeVisitor.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeVisitor.java deleted file mode 100644 index f3de2416088c..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeVisitor.java +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink; - -import org.apache.flink.table.types.logical.DayTimeIntervalType; -import org.apache.flink.table.types.logical.DistinctType; -import org.apache.flink.table.types.logical.LogicalType; -import org.apache.flink.table.types.logical.LogicalTypeVisitor; -import org.apache.flink.table.types.logical.NullType; -import org.apache.flink.table.types.logical.RawType; -import org.apache.flink.table.types.logical.StructuredType; -import org.apache.flink.table.types.logical.SymbolType; -import org.apache.flink.table.types.logical.YearMonthIntervalType; -import org.apache.flink.table.types.logical.ZonedTimestampType; - -public abstract class FlinkTypeVisitor implements LogicalTypeVisitor { - - // ------------------------- Unsupported types ------------------------------ - - @Override - public T visit(ZonedTimestampType zonedTimestampType) { - throw new UnsupportedOperationException("Unsupported ZonedTimestampType."); - } - - @Override - public T visit(YearMonthIntervalType yearMonthIntervalType) { - throw new UnsupportedOperationException("Unsupported YearMonthIntervalType."); - } - - @Override - public T visit(DayTimeIntervalType dayTimeIntervalType) { - throw new UnsupportedOperationException("Unsupported DayTimeIntervalType."); - } - - @Override - public T visit(DistinctType distinctType) { - throw new UnsupportedOperationException("Unsupported DistinctType."); - } - - @Override - public T visit(StructuredType structuredType) { - throw new UnsupportedOperationException("Unsupported StructuredType."); - } - - @Override - public T visit(NullType nullType) { - throw new UnsupportedOperationException("Unsupported NullType."); - } - - @Override - public T visit(RawType rawType) { - throw new UnsupportedOperationException("Unsupported RawType."); - } - - @Override - public T visit(SymbolType symbolType) { - throw new UnsupportedOperationException("Unsupported SymbolType."); - } - - @Override - public T visit(LogicalType other) { - throw new UnsupportedOperationException("Unsupported type: " + other); - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteConf.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteConf.java deleted file mode 100644 index ca7b1120bc81..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteConf.java +++ /dev/null @@ -1,205 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink; - -import java.time.Duration; -import java.util.Map; -import org.apache.flink.annotation.Experimental; -import org.apache.flink.configuration.ReadableConfig; -import org.apache.iceberg.DistributionMode; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; - -/** - * A class for common Iceberg configs for Flink writes. - * - *
- * <p>If a config is set at multiple levels, the following order of precedence is used (top to
- * bottom):
- *
- * <ol>
- *   <li>Write options
- *   <li>flink ReadableConfig
- *   <li>Table metadata
- * </ol>
- *
- * The most specific value is set in write options and takes precedence over all other configs. If
- * no write option is provided, this class checks the flink configuration for any overrides. If no
- * applicable value is found in the write options, this class uses the table metadata.
- *
- * <p>
    Note this class is NOT meant to be serialized. - */ -public class FlinkWriteConf { - - private final FlinkConfParser confParser; - - public FlinkWriteConf( - Table table, Map writeOptions, ReadableConfig readableConfig) { - this.confParser = new FlinkConfParser(table, writeOptions, readableConfig); - } - - public boolean overwriteMode() { - return confParser - .booleanConf() - .option(FlinkWriteOptions.OVERWRITE_MODE.key()) - .flinkConfig(FlinkWriteOptions.OVERWRITE_MODE) - .defaultValue(FlinkWriteOptions.OVERWRITE_MODE.defaultValue()) - .parse(); - } - - public boolean upsertMode() { - return confParser - .booleanConf() - .option(FlinkWriteOptions.WRITE_UPSERT_ENABLED.key()) - .flinkConfig(FlinkWriteOptions.WRITE_UPSERT_ENABLED) - .tableProperty(TableProperties.UPSERT_ENABLED) - .defaultValue(TableProperties.UPSERT_ENABLED_DEFAULT) - .parse(); - } - - public FileFormat dataFileFormat() { - String valueAsString = - confParser - .stringConf() - .option(FlinkWriteOptions.WRITE_FORMAT.key()) - .flinkConfig(FlinkWriteOptions.WRITE_FORMAT) - .tableProperty(TableProperties.DEFAULT_FILE_FORMAT) - .defaultValue(TableProperties.DEFAULT_FILE_FORMAT_DEFAULT) - .parse(); - return FileFormat.fromString(valueAsString); - } - - public long targetDataFileSize() { - return confParser - .longConf() - .option(FlinkWriteOptions.TARGET_FILE_SIZE_BYTES.key()) - .flinkConfig(FlinkWriteOptions.TARGET_FILE_SIZE_BYTES) - .tableProperty(TableProperties.WRITE_TARGET_FILE_SIZE_BYTES) - .defaultValue(TableProperties.WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT) - .parse(); - } - - public String parquetCompressionCodec() { - return confParser - .stringConf() - .option(FlinkWriteOptions.COMPRESSION_CODEC.key()) - .flinkConfig(FlinkWriteOptions.COMPRESSION_CODEC) - .tableProperty(TableProperties.PARQUET_COMPRESSION) - .defaultValue(TableProperties.PARQUET_COMPRESSION_DEFAULT) - .parse(); - } - - public String parquetCompressionLevel() { - return confParser - .stringConf() - .option(FlinkWriteOptions.COMPRESSION_LEVEL.key()) - .flinkConfig(FlinkWriteOptions.COMPRESSION_LEVEL) - .tableProperty(TableProperties.PARQUET_COMPRESSION_LEVEL) - .defaultValue(TableProperties.PARQUET_COMPRESSION_LEVEL_DEFAULT) - .parseOptional(); - } - - public String avroCompressionCodec() { - return confParser - .stringConf() - .option(FlinkWriteOptions.COMPRESSION_CODEC.key()) - .flinkConfig(FlinkWriteOptions.COMPRESSION_CODEC) - .tableProperty(TableProperties.AVRO_COMPRESSION) - .defaultValue(TableProperties.AVRO_COMPRESSION_DEFAULT) - .parse(); - } - - public String avroCompressionLevel() { - return confParser - .stringConf() - .option(FlinkWriteOptions.COMPRESSION_LEVEL.key()) - .flinkConfig(FlinkWriteOptions.COMPRESSION_LEVEL) - .tableProperty(TableProperties.AVRO_COMPRESSION_LEVEL) - .defaultValue(TableProperties.AVRO_COMPRESSION_LEVEL_DEFAULT) - .parseOptional(); - } - - public String orcCompressionCodec() { - return confParser - .stringConf() - .option(FlinkWriteOptions.COMPRESSION_CODEC.key()) - .flinkConfig(FlinkWriteOptions.COMPRESSION_CODEC) - .tableProperty(TableProperties.ORC_COMPRESSION) - .defaultValue(TableProperties.ORC_COMPRESSION_DEFAULT) - .parse(); - } - - public String orcCompressionStrategy() { - return confParser - .stringConf() - .option(FlinkWriteOptions.COMPRESSION_STRATEGY.key()) - .flinkConfig(FlinkWriteOptions.COMPRESSION_STRATEGY) - .tableProperty(TableProperties.ORC_COMPRESSION_STRATEGY) - .defaultValue(TableProperties.ORC_COMPRESSION_STRATEGY_DEFAULT) - .parse(); - } - - public DistributionMode distributionMode() { - 
String modeName = - confParser - .stringConf() - .option(FlinkWriteOptions.DISTRIBUTION_MODE.key()) - .flinkConfig(FlinkWriteOptions.DISTRIBUTION_MODE) - .tableProperty(TableProperties.WRITE_DISTRIBUTION_MODE) - .defaultValue(TableProperties.WRITE_DISTRIBUTION_MODE_NONE) - .parse(); - return DistributionMode.fromName(modeName); - } - - public int workerPoolSize() { - return confParser - .intConf() - .flinkConfig(FlinkConfigOptions.TABLE_EXEC_ICEBERG_WORKER_POOL_SIZE) - .defaultValue(FlinkConfigOptions.TABLE_EXEC_ICEBERG_WORKER_POOL_SIZE.defaultValue()) - .parse(); - } - - public String branch() { - return confParser - .stringConf() - .option(FlinkWriteOptions.BRANCH.key()) - .defaultValue(FlinkWriteOptions.BRANCH.defaultValue()) - .parse(); - } - - public Integer writeParallelism() { - return confParser.intConf().option(FlinkWriteOptions.WRITE_PARALLELISM.key()).parseOptional(); - } - - /** - * NOTE: This may be removed or changed in a future release. This value specifies the interval for - * refreshing the table instances in sink writer subtasks. If not specified then the default - * behavior is to not refresh the table. - * - * @return the interval for refreshing the table in sink writer subtasks - */ - @Experimental - public Duration tableRefreshInterval() { - return confParser - .durationConf() - .option(FlinkWriteOptions.TABLE_REFRESH_INTERVAL.key()) - .flinkConfig(FlinkWriteOptions.TABLE_REFRESH_INTERVAL) - .parseOptional(); - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteOptions.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteOptions.java deleted file mode 100644 index df73f2e09cac..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteOptions.java +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink; - -import java.time.Duration; -import org.apache.flink.annotation.Experimental; -import org.apache.flink.configuration.ConfigOption; -import org.apache.flink.configuration.ConfigOptions; -import org.apache.iceberg.SnapshotRef; - -/** Flink sink write options */ -public class FlinkWriteOptions { - - private FlinkWriteOptions() {} - - // File format for write operations(default: Table write.format.default ) - public static final ConfigOption WRITE_FORMAT = - ConfigOptions.key("write-format").stringType().noDefaultValue(); - - // Overrides this table's write.target-file-size-bytes - public static final ConfigOption TARGET_FILE_SIZE_BYTES = - ConfigOptions.key("target-file-size-bytes").longType().noDefaultValue(); - - // Overrides this table's write..compression-codec - public static final ConfigOption COMPRESSION_CODEC = - ConfigOptions.key("compression-codec").stringType().noDefaultValue(); - - // Overrides this table's write..compression-level - public static final ConfigOption COMPRESSION_LEVEL = - ConfigOptions.key("compression-level").stringType().noDefaultValue(); - - // Overrides this table's write..compression-strategy - public static final ConfigOption COMPRESSION_STRATEGY = - ConfigOptions.key("compression-strategy").stringType().noDefaultValue(); - - // Overrides this table's write.upsert.enabled - public static final ConfigOption WRITE_UPSERT_ENABLED = - ConfigOptions.key("upsert-enabled").booleanType().noDefaultValue(); - - public static final ConfigOption OVERWRITE_MODE = - ConfigOptions.key("overwrite-enabled").booleanType().defaultValue(false); - - // Overrides the table's write.distribution-mode - public static final ConfigOption DISTRIBUTION_MODE = - ConfigOptions.key("distribution-mode").stringType().noDefaultValue(); - - // Branch to write to - public static final ConfigOption BRANCH = - ConfigOptions.key("branch").stringType().defaultValue(SnapshotRef.MAIN_BRANCH); - - public static final ConfigOption WRITE_PARALLELISM = - ConfigOptions.key("write-parallelism").intType().noDefaultValue(); - - @Experimental - public static final ConfigOption TABLE_REFRESH_INTERVAL = - ConfigOptions.key("table-refresh-interval").durationType().noDefaultValue(); -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/IcebergTableSink.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/IcebergTableSink.java deleted file mode 100644 index 1b9268569d9a..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/IcebergTableSink.java +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink; - -import java.util.List; -import java.util.Map; -import org.apache.flink.configuration.ReadableConfig; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.datastream.DataStreamSink; -import org.apache.flink.table.api.TableSchema; -import org.apache.flink.table.api.constraints.UniqueConstraint; -import org.apache.flink.table.connector.ChangelogMode; -import org.apache.flink.table.connector.ProviderContext; -import org.apache.flink.table.connector.sink.DataStreamSinkProvider; -import org.apache.flink.table.connector.sink.DynamicTableSink; -import org.apache.flink.table.connector.sink.abilities.SupportsOverwrite; -import org.apache.flink.table.connector.sink.abilities.SupportsPartitioning; -import org.apache.flink.table.data.RowData; -import org.apache.flink.types.RowKind; -import org.apache.flink.util.Preconditions; -import org.apache.iceberg.flink.sink.FlinkSink; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; - -public class IcebergTableSink implements DynamicTableSink, SupportsPartitioning, SupportsOverwrite { - private final TableLoader tableLoader; - private final TableSchema tableSchema; - private final ReadableConfig readableConfig; - private final Map writeProps; - - private boolean overwrite = false; - - private IcebergTableSink(IcebergTableSink toCopy) { - this.tableLoader = toCopy.tableLoader; - this.tableSchema = toCopy.tableSchema; - this.overwrite = toCopy.overwrite; - this.readableConfig = toCopy.readableConfig; - this.writeProps = toCopy.writeProps; - } - - public IcebergTableSink( - TableLoader tableLoader, - TableSchema tableSchema, - ReadableConfig readableConfig, - Map writeProps) { - this.tableLoader = tableLoader; - this.tableSchema = tableSchema; - this.readableConfig = readableConfig; - this.writeProps = writeProps; - } - - @Override - public SinkRuntimeProvider getSinkRuntimeProvider(Context context) { - Preconditions.checkState( - !overwrite || context.isBounded(), - "Unbounded data stream doesn't support overwrite operation."); - - List equalityColumns = - tableSchema.getPrimaryKey().map(UniqueConstraint::getColumns).orElseGet(ImmutableList::of); - - return new DataStreamSinkProvider() { - @Override - public DataStreamSink consumeDataStream( - ProviderContext providerContext, DataStream dataStream) { - return FlinkSink.forRowData(dataStream) - .tableLoader(tableLoader) - .tableSchema(tableSchema) - .equalityFieldColumns(equalityColumns) - .overwrite(overwrite) - .setAll(writeProps) - .flinkConf(readableConfig) - .append(); - } - }; - } - - @Override - public void applyStaticPartition(Map partition) { - // The flink's PartitionFanoutWriter will handle the static partition write policy - // automatically. 
- } - - @Override - public ChangelogMode getChangelogMode(ChangelogMode requestedMode) { - ChangelogMode.Builder builder = ChangelogMode.newBuilder(); - for (RowKind kind : requestedMode.getContainedKinds()) { - builder.addContainedKind(kind); - } - return builder.build(); - } - - @Override - public DynamicTableSink copy() { - return new IcebergTableSink(this); - } - - @Override - public String asSummaryString() { - return "Iceberg table sink"; - } - - @Override - public void applyOverwrite(boolean newOverwrite) { - this.overwrite = newOverwrite; - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/RowDataWrapper.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/RowDataWrapper.java deleted file mode 100644 index d4cec7a3e80b..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/RowDataWrapper.java +++ /dev/null @@ -1,142 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import java.lang.reflect.Array; -import java.nio.ByteBuffer; -import java.time.LocalDateTime; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.TimestampData; -import org.apache.flink.table.types.logical.DecimalType; -import org.apache.flink.table.types.logical.LocalZonedTimestampType; -import org.apache.flink.table.types.logical.LogicalType; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.table.types.logical.TimestampType; -import org.apache.iceberg.StructLike; -import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.Types; -import org.apache.iceberg.util.DateTimeUtil; -import org.apache.iceberg.util.UUIDUtil; - -public class RowDataWrapper implements StructLike { - - private final LogicalType[] types; - private final PositionalGetter[] getters; - private RowData rowData = null; - - public RowDataWrapper(RowType rowType, Types.StructType struct) { - int size = rowType.getFieldCount(); - - types = (LogicalType[]) Array.newInstance(LogicalType.class, size); - getters = (PositionalGetter[]) Array.newInstance(PositionalGetter.class, size); - - for (int i = 0; i < size; i++) { - types[i] = rowType.getTypeAt(i); - getters[i] = buildGetter(types[i], struct.fields().get(i).type()); - } - } - - public RowDataWrapper wrap(RowData data) { - this.rowData = data; - return this; - } - - @Override - public int size() { - return types.length; - } - - @Override - public T get(int pos, Class javaClass) { - if (rowData.isNullAt(pos)) { - return null; - } else if (getters[pos] != null) { - return javaClass.cast(getters[pos].get(rowData, pos)); - } - - Object value = RowData.createFieldGetter(types[pos], pos).getFieldOrNull(rowData); - return javaClass.cast(value); - } - - @Override - public void set(int 
pos, T value) { - throw new UnsupportedOperationException( - "Could not set a field in the RowDataWrapper because rowData is read-only"); - } - - private interface PositionalGetter { - T get(RowData data, int pos); - } - - private static PositionalGetter buildGetter(LogicalType logicalType, Type type) { - switch (logicalType.getTypeRoot()) { - case TINYINT: - return (row, pos) -> (int) row.getByte(pos); - case SMALLINT: - return (row, pos) -> (int) row.getShort(pos); - case CHAR: - case VARCHAR: - return (row, pos) -> row.getString(pos).toString(); - - case BINARY: - case VARBINARY: - if (Type.TypeID.UUID == type.typeId()) { - return (row, pos) -> UUIDUtil.convert(row.getBinary(pos)); - } else { - return (row, pos) -> ByteBuffer.wrap(row.getBinary(pos)); - } - - case DECIMAL: - DecimalType decimalType = (DecimalType) logicalType; - return (row, pos) -> - row.getDecimal(pos, decimalType.getPrecision(), decimalType.getScale()).toBigDecimal(); - - case TIME_WITHOUT_TIME_ZONE: - // Time in RowData is in milliseconds (Integer), while iceberg's time is microseconds - // (Long). - return (row, pos) -> ((long) row.getInt(pos)) * 1_000; - - case TIMESTAMP_WITHOUT_TIME_ZONE: - TimestampType timestampType = (TimestampType) logicalType; - return (row, pos) -> { - LocalDateTime localDateTime = - row.getTimestamp(pos, timestampType.getPrecision()).toLocalDateTime(); - return DateTimeUtil.microsFromTimestamp(localDateTime); - }; - - case TIMESTAMP_WITH_LOCAL_TIME_ZONE: - LocalZonedTimestampType lzTs = (LocalZonedTimestampType) logicalType; - return (row, pos) -> { - TimestampData timestampData = row.getTimestamp(pos, lzTs.getPrecision()); - return timestampData.getMillisecond() * 1000 - + timestampData.getNanoOfMillisecond() / 1000; - }; - - case ROW: - RowType rowType = (RowType) logicalType; - Types.StructType structType = (Types.StructType) type; - - RowDataWrapper nestedWrapper = new RowDataWrapper(rowType, structType); - return (row, pos) -> nestedWrapper.wrap(row.getRow(pos, rowType.getFieldCount())); - - default: - return null; - } - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/TableLoader.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/TableLoader.java deleted file mode 100644 index da509451fee7..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/TableLoader.java +++ /dev/null @@ -1,159 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink; - -import java.io.Closeable; -import java.io.IOException; -import java.io.Serializable; -import org.apache.hadoop.conf.Configuration; -import org.apache.iceberg.Table; -import org.apache.iceberg.catalog.Catalog; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.hadoop.HadoopTables; -import org.apache.iceberg.hadoop.SerializableConfiguration; -import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; - -/** - * Serializable loader to load an Iceberg {@link Table}. Flink needs to get {@link Table} objects in - * the cluster (for example, to get splits), not just on the client side. So we need an Iceberg - * table loader to get the {@link Table} object. - */ -public interface TableLoader extends Closeable, Serializable, Cloneable { - - void open(); - - boolean isOpen(); - - Table loadTable(); - - /** Clone a TableLoader */ - @SuppressWarnings({"checkstyle:NoClone", "checkstyle:SuperClone"}) - TableLoader clone(); - - static TableLoader fromCatalog(CatalogLoader catalogLoader, TableIdentifier identifier) { - return new CatalogTableLoader(catalogLoader, identifier); - } - - static TableLoader fromHadoopTable(String location) { - return fromHadoopTable(location, FlinkCatalogFactory.clusterHadoopConf()); - } - - static TableLoader fromHadoopTable(String location, Configuration hadoopConf) { - return new HadoopTableLoader(location, hadoopConf); - } - - class HadoopTableLoader implements TableLoader { - - private static final long serialVersionUID = 1L; - - private final String location; - private final SerializableConfiguration hadoopConf; - - private transient HadoopTables tables; - - private HadoopTableLoader(String location, Configuration conf) { - this.location = location; - this.hadoopConf = new SerializableConfiguration(conf); - } - - @Override - public void open() { - tables = new HadoopTables(hadoopConf.get()); - } - - @Override - public boolean isOpen() { - return tables != null; - } - - @Override - public Table loadTable() { - FlinkEnvironmentContext.init(); - return tables.load(location); - } - - @Override - @SuppressWarnings({"checkstyle:NoClone", "checkstyle:SuperClone"}) - public TableLoader clone() { - return new HadoopTableLoader(location, new Configuration(hadoopConf.get())); - } - - @Override - public void close() {} - - @Override - public String toString() { - return MoreObjects.toStringHelper(this).add("location", location).toString(); - } - } - - class CatalogTableLoader implements TableLoader { - - private static final long serialVersionUID = 1L; - - private final CatalogLoader catalogLoader; - private final String identifier; - - private transient Catalog catalog; - - private CatalogTableLoader(CatalogLoader catalogLoader, TableIdentifier tableIdentifier) { - this.catalogLoader = catalogLoader; - this.identifier = tableIdentifier.toString(); - } - - @Override - public void open() { - catalog = catalogLoader.loadCatalog(); - } - - @Override - public boolean isOpen() { - return catalog != null; - } - - @Override - public Table loadTable() { - FlinkEnvironmentContext.init(); - return catalog.loadTable(TableIdentifier.parse(identifier)); - } - - @Override - public void close() throws IOException { - if (catalog instanceof Closeable) { - ((Closeable) catalog).close(); - } - - catalog = null; - } - - @Override - @SuppressWarnings({"checkstyle:NoClone", "checkstyle:SuperClone"}) - public TableLoader clone() { - return new CatalogTableLoader(catalogLoader.clone(), TableIdentifier.parse(identifier)); - 
} - - @Override - public String toString() { - return MoreObjects.toStringHelper(this) - .add("tableIdentifier", identifier) - .add("catalogLoader", catalogLoader) - .toString(); - } - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/TypeToFlinkType.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/TypeToFlinkType.java deleted file mode 100644 index f8f1b74b1ceb..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/TypeToFlinkType.java +++ /dev/null @@ -1,134 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import java.util.List; -import org.apache.flink.table.types.logical.ArrayType; -import org.apache.flink.table.types.logical.BigIntType; -import org.apache.flink.table.types.logical.BinaryType; -import org.apache.flink.table.types.logical.BooleanType; -import org.apache.flink.table.types.logical.DateType; -import org.apache.flink.table.types.logical.DecimalType; -import org.apache.flink.table.types.logical.DoubleType; -import org.apache.flink.table.types.logical.FloatType; -import org.apache.flink.table.types.logical.IntType; -import org.apache.flink.table.types.logical.LocalZonedTimestampType; -import org.apache.flink.table.types.logical.LogicalType; -import org.apache.flink.table.types.logical.MapType; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.table.types.logical.TimeType; -import org.apache.flink.table.types.logical.TimestampType; -import org.apache.flink.table.types.logical.VarBinaryType; -import org.apache.flink.table.types.logical.VarCharType; -import org.apache.iceberg.Schema; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.TypeUtil; -import org.apache.iceberg.types.Types; - -class TypeToFlinkType extends TypeUtil.SchemaVisitor { - TypeToFlinkType() {} - - @Override - public LogicalType schema(Schema schema, LogicalType structType) { - return structType; - } - - @Override - public LogicalType struct(Types.StructType struct, List fieldResults) { - List fields = struct.fields(); - - List flinkFields = Lists.newArrayListWithExpectedSize(fieldResults.size()); - for (int i = 0; i < fields.size(); i += 1) { - Types.NestedField field = fields.get(i); - LogicalType type = fieldResults.get(i); - RowType.RowField flinkField = - new RowType.RowField(field.name(), type.copy(field.isOptional()), field.doc()); - flinkFields.add(flinkField); - } - - return new RowType(flinkFields); - } - - @Override - public LogicalType field(Types.NestedField field, LogicalType fieldResult) { - return fieldResult; - } - - @Override - public LogicalType list(Types.ListType list, LogicalType elementResult) { - return new 
ArrayType(elementResult.copy(list.isElementOptional())); - } - - @Override - public LogicalType map(Types.MapType map, LogicalType keyResult, LogicalType valueResult) { - // keys in map are not allowed to be null. - return new MapType(keyResult.copy(false), valueResult.copy(map.isValueOptional())); - } - - @Override - public LogicalType primitive(Type.PrimitiveType primitive) { - switch (primitive.typeId()) { - case BOOLEAN: - return new BooleanType(); - case INTEGER: - return new IntType(); - case LONG: - return new BigIntType(); - case FLOAT: - return new FloatType(); - case DOUBLE: - return new DoubleType(); - case DATE: - return new DateType(); - case TIME: - // For the type: Flink only support TimeType with default precision (second) now. The - // precision of time is - // not supported in Flink, so we can think of it as a simple time type directly. - // For the data: Flink uses int that support mills to represent time data, so it supports - // mills precision. - return new TimeType(); - case TIMESTAMP: - Types.TimestampType timestamp = (Types.TimestampType) primitive; - if (timestamp.shouldAdjustToUTC()) { - // MICROS - return new LocalZonedTimestampType(6); - } else { - // MICROS - return new TimestampType(6); - } - case STRING: - return new VarCharType(VarCharType.MAX_LENGTH); - case UUID: - // UUID length is 16 - return new BinaryType(16); - case FIXED: - Types.FixedType fixedType = (Types.FixedType) primitive; - return new BinaryType(fixedType.length()); - case BINARY: - return new VarBinaryType(VarBinaryType.MAX_LENGTH); - case DECIMAL: - Types.DecimalType decimal = (Types.DecimalType) primitive; - return new DecimalType(decimal.precision(), decimal.scale()); - default: - throw new UnsupportedOperationException( - "Cannot convert unknown type to Flink: " + primitive); - } - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/actions/Actions.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/actions/Actions.java deleted file mode 100644 index b96b47c5a785..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/actions/Actions.java +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.actions; - -import org.apache.flink.configuration.Configuration; -import org.apache.flink.configuration.CoreOptions; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.iceberg.Table; - -public class Actions { - - public static final Configuration CONFIG = - new Configuration() - // disable classloader check as Avro may cache class/object in the serializers. 
- .set(CoreOptions.CHECK_LEAKED_CLASSLOADER, false); - - private final StreamExecutionEnvironment env; - private final Table table; - - private Actions(StreamExecutionEnvironment env, Table table) { - this.env = env; - this.table = table; - } - - public static Actions forTable(StreamExecutionEnvironment env, Table table) { - return new Actions(env, table); - } - - public static Actions forTable(Table table) { - return new Actions(StreamExecutionEnvironment.getExecutionEnvironment(CONFIG), table); - } - - public RewriteDataFilesAction rewriteDataFiles() { - return new RewriteDataFilesAction(env, table); - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/actions/RewriteDataFilesAction.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/actions/RewriteDataFilesAction.java deleted file mode 100644 index 670abebcb58a..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/actions/RewriteDataFilesAction.java +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.actions; - -import java.util.List; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.iceberg.CombinedScanTask; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.Table; -import org.apache.iceberg.actions.BaseRewriteDataFilesAction; -import org.apache.iceberg.flink.source.RowDataRewriter; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; - -public class RewriteDataFilesAction extends BaseRewriteDataFilesAction { - - private final StreamExecutionEnvironment env; - private int maxParallelism; - - public RewriteDataFilesAction(StreamExecutionEnvironment env, Table table) { - super(table); - this.env = env; - this.maxParallelism = env.getParallelism(); - } - - @Override - protected FileIO fileIO() { - return table().io(); - } - - @Override - protected List rewriteDataForTasks(List combinedScanTasks) { - int size = combinedScanTasks.size(); - int parallelism = Math.min(size, maxParallelism); - DataStream dataStream = env.fromCollection(combinedScanTasks); - RowDataRewriter rowDataRewriter = - new RowDataRewriter(table(), caseSensitive(), fileIO(), encryptionManager()); - try { - return rowDataRewriter.rewriteDataForTasks(dataStream, parallelism); - } catch (Exception e) { - throw new RuntimeException("Rewrite data file error.", e); - } - } - - @Override - protected RewriteDataFilesAction self() { - return this; - } - - public RewriteDataFilesAction maxParallelism(int parallelism) { - Preconditions.checkArgument(parallelism > 0, "Invalid max parallelism %s", parallelism); - this.maxParallelism = parallelism; - return this; - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/data/AvroWithFlinkSchemaVisitor.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/data/AvroWithFlinkSchemaVisitor.java deleted file mode 100644 index 8103224a0b6c..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/data/AvroWithFlinkSchemaVisitor.java +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.data; - -import org.apache.flink.table.types.logical.ArrayType; -import org.apache.flink.table.types.logical.LogicalType; -import org.apache.flink.table.types.logical.LogicalTypeFamily; -import org.apache.flink.table.types.logical.MapType; -import org.apache.flink.table.types.logical.NullType; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.avro.AvroWithPartnerByStructureVisitor; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.util.Pair; - -public abstract class AvroWithFlinkSchemaVisitor - extends AvroWithPartnerByStructureVisitor { - - @Override - protected boolean isStringType(LogicalType logicalType) { - return logicalType.getTypeRoot().getFamilies().contains(LogicalTypeFamily.CHARACTER_STRING); - } - - @Override - protected boolean isMapType(LogicalType logicalType) { - return logicalType instanceof MapType; - } - - @Override - protected LogicalType arrayElementType(LogicalType arrayType) { - Preconditions.checkArgument( - arrayType instanceof ArrayType, "Invalid array: %s is not an array", arrayType); - return ((ArrayType) arrayType).getElementType(); - } - - @Override - protected LogicalType mapKeyType(LogicalType mapType) { - Preconditions.checkArgument(isMapType(mapType), "Invalid map: %s is not a map", mapType); - return ((MapType) mapType).getKeyType(); - } - - @Override - protected LogicalType mapValueType(LogicalType mapType) { - Preconditions.checkArgument(isMapType(mapType), "Invalid map: %s is not a map", mapType); - return ((MapType) mapType).getValueType(); - } - - @Override - protected Pair fieldNameAndType(LogicalType structType, int pos) { - Preconditions.checkArgument( - structType instanceof RowType, "Invalid struct: %s is not a struct", structType); - RowType.RowField field = ((RowType) structType).getFields().get(pos); - return Pair.of(field.getName(), field.getType()); - } - - @Override - protected LogicalType nullType() { - return new NullType(); - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/data/FlinkAvroReader.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/data/FlinkAvroReader.java deleted file mode 100644 index 86404959735a..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/data/FlinkAvroReader.java +++ /dev/null @@ -1,169 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.data; - -import java.io.IOException; -import java.util.List; -import java.util.Map; -import java.util.function.Supplier; -import org.apache.avro.LogicalType; -import org.apache.avro.LogicalTypes; -import org.apache.avro.Schema; -import org.apache.avro.io.DatumReader; -import org.apache.avro.io.Decoder; -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.avro.AvroSchemaWithTypeVisitor; -import org.apache.iceberg.avro.SupportsRowPosition; -import org.apache.iceberg.avro.ValueReader; -import org.apache.iceberg.avro.ValueReaders; -import org.apache.iceberg.data.avro.DecoderResolver; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.Types; - -public class FlinkAvroReader implements DatumReader, SupportsRowPosition { - - private final Schema readSchema; - private final ValueReader reader; - private Schema fileSchema = null; - - public FlinkAvroReader(org.apache.iceberg.Schema expectedSchema, Schema readSchema) { - this(expectedSchema, readSchema, ImmutableMap.of()); - } - - @SuppressWarnings("unchecked") - public FlinkAvroReader( - org.apache.iceberg.Schema expectedSchema, Schema readSchema, Map constants) { - this.readSchema = readSchema; - this.reader = - (ValueReader) - AvroSchemaWithTypeVisitor.visit(expectedSchema, readSchema, new ReadBuilder(constants)); - } - - @Override - public void setSchema(Schema newFileSchema) { - this.fileSchema = Schema.applyAliases(newFileSchema, readSchema); - } - - @Override - public RowData read(RowData reuse, Decoder decoder) throws IOException { - return DecoderResolver.resolveAndRead(decoder, readSchema, fileSchema, reader, reuse); - } - - @Override - public void setRowPositionSupplier(Supplier posSupplier) { - if (reader instanceof SupportsRowPosition) { - ((SupportsRowPosition) reader).setRowPositionSupplier(posSupplier); - } - } - - private static class ReadBuilder extends AvroSchemaWithTypeVisitor> { - private final Map idToConstant; - - private ReadBuilder(Map idToConstant) { - this.idToConstant = idToConstant; - } - - @Override - public ValueReader record( - Types.StructType expected, Schema record, List names, List> fields) { - return FlinkValueReaders.struct(fields, expected.asStructType(), idToConstant); - } - - @Override - public ValueReader union(Type expected, Schema union, List> options) { - return ValueReaders.union(options); - } - - @Override - public ValueReader array( - Types.ListType expected, Schema array, ValueReader elementReader) { - return FlinkValueReaders.array(elementReader); - } - - @Override - public ValueReader map( - Types.MapType expected, Schema map, ValueReader keyReader, ValueReader valueReader) { - return FlinkValueReaders.arrayMap(keyReader, valueReader); - } - - @Override - public ValueReader map(Types.MapType expected, Schema map, ValueReader valueReader) { - return FlinkValueReaders.map(FlinkValueReaders.strings(), valueReader); - } - - @Override - public ValueReader primitive(Type.PrimitiveType expected, Schema primitive) { - LogicalType logicalType = primitive.getLogicalType(); - if (logicalType != null) { - switch (logicalType.getName()) { - case "date": - return ValueReaders.ints(); - - case "time-micros": - return FlinkValueReaders.timeMicros(); - - case "timestamp-millis": - return FlinkValueReaders.timestampMills(); - - case "timestamp-micros": - return FlinkValueReaders.timestampMicros(); - - case "decimal": - LogicalTypes.Decimal decimal = 
(LogicalTypes.Decimal) logicalType; - return FlinkValueReaders.decimal( - ValueReaders.decimalBytesReader(primitive), - decimal.getPrecision(), - decimal.getScale()); - - case "uuid": - return FlinkValueReaders.uuids(); - - default: - throw new IllegalArgumentException("Unknown logical type: " + logicalType); - } - } - - switch (primitive.getType()) { - case NULL: - return ValueReaders.nulls(); - case BOOLEAN: - return ValueReaders.booleans(); - case INT: - return ValueReaders.ints(); - case LONG: - return ValueReaders.longs(); - case FLOAT: - return ValueReaders.floats(); - case DOUBLE: - return ValueReaders.doubles(); - case STRING: - return FlinkValueReaders.strings(); - case FIXED: - return ValueReaders.fixed(primitive.getFixedSize()); - case BYTES: - return ValueReaders.bytes(); - case ENUM: - return FlinkValueReaders.enums(primitive.getEnumSymbols()); - default: - throw new IllegalArgumentException("Unsupported type: " + primitive); - } - } - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/data/FlinkAvroWriter.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/data/FlinkAvroWriter.java deleted file mode 100644 index 873e65783119..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/data/FlinkAvroWriter.java +++ /dev/null @@ -1,165 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.data; - -import java.io.IOException; -import java.util.List; -import java.util.stream.Collectors; -import java.util.stream.IntStream; -import java.util.stream.Stream; -import org.apache.avro.LogicalTypes; -import org.apache.avro.Schema; -import org.apache.avro.io.Encoder; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.LogicalType; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.FieldMetrics; -import org.apache.iceberg.avro.MetricsAwareDatumWriter; -import org.apache.iceberg.avro.ValueWriter; -import org.apache.iceberg.avro.ValueWriters; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; - -public class FlinkAvroWriter implements MetricsAwareDatumWriter { - private final RowType rowType; - private ValueWriter writer = null; - - public FlinkAvroWriter(RowType rowType) { - this.rowType = rowType; - } - - @Override - @SuppressWarnings("unchecked") - public void setSchema(Schema schema) { - this.writer = - (ValueWriter) - AvroWithFlinkSchemaVisitor.visit(rowType, schema, new WriteBuilder()); - } - - @Override - public void write(RowData datum, Encoder out) throws IOException { - writer.write(datum, out); - } - - @Override - public Stream metrics() { - return writer.metrics(); - } - - private static class WriteBuilder extends AvroWithFlinkSchemaVisitor> { - @Override - public ValueWriter record( - LogicalType struct, Schema record, List names, List> fields) { - return FlinkValueWriters.row( - fields, - IntStream.range(0, names.size()) - .mapToObj(i -> fieldNameAndType(struct, i).second()) - .collect(Collectors.toList())); - } - - @Override - public ValueWriter union(LogicalType type, Schema union, List> options) { - Preconditions.checkArgument( - options.contains(ValueWriters.nulls()), - "Cannot create writer for non-option union: %s", - union); - Preconditions.checkArgument( - options.size() == 2, "Cannot create writer for non-option union: %s", union); - if (union.getTypes().get(0).getType() == Schema.Type.NULL) { - return ValueWriters.option(0, options.get(1)); - } else { - return ValueWriters.option(1, options.get(0)); - } - } - - @Override - public ValueWriter array(LogicalType sArray, Schema array, ValueWriter elementWriter) { - return FlinkValueWriters.array(elementWriter, arrayElementType(sArray)); - } - - @Override - public ValueWriter map(LogicalType sMap, Schema map, ValueWriter valueReader) { - return FlinkValueWriters.map( - FlinkValueWriters.strings(), mapKeyType(sMap), valueReader, mapValueType(sMap)); - } - - @Override - public ValueWriter map( - LogicalType sMap, Schema map, ValueWriter keyWriter, ValueWriter valueWriter) { - return FlinkValueWriters.arrayMap( - keyWriter, mapKeyType(sMap), valueWriter, mapValueType(sMap)); - } - - @Override - public ValueWriter primitive(LogicalType type, Schema primitive) { - org.apache.avro.LogicalType logicalType = primitive.getLogicalType(); - if (logicalType != null) { - switch (logicalType.getName()) { - case "date": - return ValueWriters.ints(); - - case "time-micros": - return FlinkValueWriters.timeMicros(); - - case "timestamp-micros": - return FlinkValueWriters.timestampMicros(); - - case "decimal": - LogicalTypes.Decimal decimal = (LogicalTypes.Decimal) logicalType; - return FlinkValueWriters.decimal(decimal.getPrecision(), decimal.getScale()); - - case "uuid": - return ValueWriters.uuids(); - - default: - throw new IllegalArgumentException("Unsupported logical type: " + logicalType); - } - } - - 
switch (primitive.getType()) { - case NULL: - return ValueWriters.nulls(); - case BOOLEAN: - return ValueWriters.booleans(); - case INT: - switch (type.getTypeRoot()) { - case TINYINT: - return ValueWriters.tinyints(); - case SMALLINT: - return ValueWriters.shorts(); - default: - return ValueWriters.ints(); - } - case LONG: - return ValueWriters.longs(); - case FLOAT: - return ValueWriters.floats(); - case DOUBLE: - return ValueWriters.doubles(); - case STRING: - return FlinkValueWriters.strings(); - case FIXED: - return ValueWriters.fixed(primitive.getFixedSize()); - case BYTES: - return ValueWriters.bytes(); - default: - throw new IllegalArgumentException("Unsupported type: " + primitive); - } - } - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java deleted file mode 100644 index 65b9d44ad4b8..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java +++ /dev/null @@ -1,131 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.data; - -import java.util.List; -import java.util.Map; -import org.apache.flink.table.data.ArrayData; -import org.apache.flink.table.data.MapData; -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.Schema; -import org.apache.iceberg.orc.OrcRowReader; -import org.apache.iceberg.orc.OrcSchemaWithTypeVisitor; -import org.apache.iceberg.orc.OrcValueReader; -import org.apache.iceberg.orc.OrcValueReaders; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.Types; -import org.apache.orc.TypeDescription; -import org.apache.orc.storage.ql.exec.vector.StructColumnVector; -import org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch; - -public class FlinkOrcReader implements OrcRowReader { - private final OrcValueReader reader; - - public FlinkOrcReader(Schema iSchema, TypeDescription readSchema) { - this(iSchema, readSchema, ImmutableMap.of()); - } - - public FlinkOrcReader(Schema iSchema, TypeDescription readSchema, Map idToConstant) { - this.reader = - OrcSchemaWithTypeVisitor.visit(iSchema, readSchema, new ReadBuilder(idToConstant)); - } - - @Override - public RowData read(VectorizedRowBatch batch, int row) { - return (RowData) reader.read(new StructColumnVector(batch.size, batch.cols), row); - } - - @Override - public void setBatchContext(long batchOffsetInFile) { - reader.setBatchContext(batchOffsetInFile); - } - - private static class ReadBuilder extends OrcSchemaWithTypeVisitor> { - private final Map idToConstant; - - private ReadBuilder(Map idToConstant) { - this.idToConstant = idToConstant; - } - - @Override - public OrcValueReader record( - Types.StructType iStruct, - TypeDescription record, - List names, - List> fields) { - return FlinkOrcReaders.struct(fields, iStruct, idToConstant); - } - - @Override - public OrcValueReader list( - Types.ListType iList, TypeDescription array, OrcValueReader elementReader) { - return FlinkOrcReaders.array(elementReader); - } - - @Override - public OrcValueReader map( - Types.MapType iMap, - TypeDescription map, - OrcValueReader keyReader, - OrcValueReader valueReader) { - return FlinkOrcReaders.map(keyReader, valueReader); - } - - @Override - public OrcValueReader primitive(Type.PrimitiveType iPrimitive, TypeDescription primitive) { - switch (iPrimitive.typeId()) { - case BOOLEAN: - return OrcValueReaders.booleans(); - case INTEGER: - return OrcValueReaders.ints(); - case LONG: - return OrcValueReaders.longs(); - case FLOAT: - return OrcValueReaders.floats(); - case DOUBLE: - return OrcValueReaders.doubles(); - case DATE: - return FlinkOrcReaders.dates(); - case TIME: - return FlinkOrcReaders.times(); - case TIMESTAMP: - Types.TimestampType timestampType = (Types.TimestampType) iPrimitive; - if (timestampType.shouldAdjustToUTC()) { - return FlinkOrcReaders.timestampTzs(); - } else { - return FlinkOrcReaders.timestamps(); - } - case STRING: - return FlinkOrcReaders.strings(); - case UUID: - case FIXED: - case BINARY: - return OrcValueReaders.bytes(); - case DECIMAL: - Types.DecimalType decimalType = (Types.DecimalType) iPrimitive; - return FlinkOrcReaders.decimals(decimalType.precision(), decimalType.scale()); - default: - throw new IllegalArgumentException( - String.format( - "Invalid iceberg type %s corresponding to ORC type %s", iPrimitive, primitive)); - } - } - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReaders.java 
b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReaders.java deleted file mode 100644 index 7a4a15c7e600..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReaders.java +++ /dev/null @@ -1,283 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.data; - -import java.math.BigDecimal; -import java.time.Instant; -import java.time.LocalDateTime; -import java.time.ZoneOffset; -import java.util.List; -import java.util.Map; -import org.apache.flink.table.data.ArrayData; -import org.apache.flink.table.data.DecimalData; -import org.apache.flink.table.data.GenericArrayData; -import org.apache.flink.table.data.GenericMapData; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.MapData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.flink.table.data.TimestampData; -import org.apache.iceberg.orc.OrcValueReader; -import org.apache.iceberg.orc.OrcValueReaders; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.types.Types; -import org.apache.orc.storage.ql.exec.vector.BytesColumnVector; -import org.apache.orc.storage.ql.exec.vector.ColumnVector; -import org.apache.orc.storage.ql.exec.vector.DecimalColumnVector; -import org.apache.orc.storage.ql.exec.vector.ListColumnVector; -import org.apache.orc.storage.ql.exec.vector.LongColumnVector; -import org.apache.orc.storage.ql.exec.vector.MapColumnVector; -import org.apache.orc.storage.ql.exec.vector.TimestampColumnVector; -import org.apache.orc.storage.serde2.io.HiveDecimalWritable; - -class FlinkOrcReaders { - private FlinkOrcReaders() {} - - static OrcValueReader strings() { - return StringReader.INSTANCE; - } - - static OrcValueReader dates() { - return DateReader.INSTANCE; - } - - static OrcValueReader decimals(int precision, int scale) { - if (precision <= 18) { - return new Decimal18Reader(precision, scale); - } else if (precision <= 38) { - return new Decimal38Reader(precision, scale); - } else { - throw new IllegalArgumentException("Invalid precision: " + precision); - } - } - - static OrcValueReader times() { - return TimeReader.INSTANCE; - } - - static OrcValueReader timestamps() { - return TimestampReader.INSTANCE; - } - - static OrcValueReader timestampTzs() { - return TimestampTzReader.INSTANCE; - } - - static OrcValueReader array(OrcValueReader elementReader) { - return new ArrayReader<>(elementReader); - } - - public static OrcValueReader map( - OrcValueReader keyReader, OrcValueReader valueReader) { - 
return new MapReader<>(keyReader, valueReader); - } - - public static OrcValueReader struct( - List> readers, Types.StructType struct, Map idToConstant) { - return new StructReader(readers, struct, idToConstant); - } - - private static class StringReader implements OrcValueReader { - private static final StringReader INSTANCE = new StringReader(); - - @Override - public StringData nonNullRead(ColumnVector vector, int row) { - BytesColumnVector bytesVector = (BytesColumnVector) vector; - return StringData.fromBytes( - bytesVector.vector[row], bytesVector.start[row], bytesVector.length[row]); - } - } - - private static class DateReader implements OrcValueReader { - private static final DateReader INSTANCE = new DateReader(); - - @Override - public Integer nonNullRead(ColumnVector vector, int row) { - return (int) ((LongColumnVector) vector).vector[row]; - } - } - - private static class Decimal18Reader implements OrcValueReader { - private final int precision; - private final int scale; - - Decimal18Reader(int precision, int scale) { - this.precision = precision; - this.scale = scale; - } - - @Override - public DecimalData nonNullRead(ColumnVector vector, int row) { - HiveDecimalWritable value = ((DecimalColumnVector) vector).vector[row]; - - // The hive ORC writer may will adjust the scale of decimal data. - Preconditions.checkArgument( - value.precision() <= precision, - "Cannot read value as decimal(%s,%s), too large: %s", - precision, - scale, - value); - - return DecimalData.fromUnscaledLong(value.serialize64(scale), precision, scale); - } - } - - private static class Decimal38Reader implements OrcValueReader { - private final int precision; - private final int scale; - - Decimal38Reader(int precision, int scale) { - this.precision = precision; - this.scale = scale; - } - - @Override - public DecimalData nonNullRead(ColumnVector vector, int row) { - BigDecimal value = - ((DecimalColumnVector) vector).vector[row].getHiveDecimal().bigDecimalValue(); - - Preconditions.checkArgument( - value.precision() <= precision, - "Cannot read value as decimal(%s,%s), too large: %s", - precision, - scale, - value); - - return DecimalData.fromBigDecimal(value, precision, scale); - } - } - - private static class TimeReader implements OrcValueReader { - private static final TimeReader INSTANCE = new TimeReader(); - - @Override - public Integer nonNullRead(ColumnVector vector, int row) { - long micros = ((LongColumnVector) vector).vector[row]; - // Flink only support time mills, just erase micros. 
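// (Illustrative note, not part of the original file.) Iceberg and ORC store TIME as
// microseconds in a long, while Flink's TIME type carries milliseconds in an int, so the
// division below simply drops the sub-millisecond digits: e.g. 34_200_123_456L micros
// becomes 34_200_123 ms, i.e. 09:30:00.123.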
- return (int) (micros / 1000); - } - } - - private static class TimestampReader implements OrcValueReader { - private static final TimestampReader INSTANCE = new TimestampReader(); - - @Override - public TimestampData nonNullRead(ColumnVector vector, int row) { - TimestampColumnVector tcv = (TimestampColumnVector) vector; - LocalDateTime localDate = - Instant.ofEpochSecond(Math.floorDiv(tcv.time[row], 1_000), tcv.nanos[row]) - .atOffset(ZoneOffset.UTC) - .toLocalDateTime(); - return TimestampData.fromLocalDateTime(localDate); - } - } - - private static class TimestampTzReader implements OrcValueReader { - private static final TimestampTzReader INSTANCE = new TimestampTzReader(); - - @Override - public TimestampData nonNullRead(ColumnVector vector, int row) { - TimestampColumnVector tcv = (TimestampColumnVector) vector; - Instant instant = - Instant.ofEpochSecond(Math.floorDiv(tcv.time[row], 1_000), tcv.nanos[row]) - .atOffset(ZoneOffset.UTC) - .toInstant(); - return TimestampData.fromInstant(instant); - } - } - - private static class ArrayReader implements OrcValueReader { - private final OrcValueReader elementReader; - - private ArrayReader(OrcValueReader elementReader) { - this.elementReader = elementReader; - } - - @Override - public ArrayData nonNullRead(ColumnVector vector, int row) { - ListColumnVector listVector = (ListColumnVector) vector; - int offset = (int) listVector.offsets[row]; - int length = (int) listVector.lengths[row]; - List elements = Lists.newArrayListWithExpectedSize(length); - for (int c = 0; c < length; ++c) { - elements.add(elementReader.read(listVector.child, offset + c)); - } - return new GenericArrayData(elements.toArray()); - } - - @Override - public void setBatchContext(long batchOffsetInFile) { - elementReader.setBatchContext(batchOffsetInFile); - } - } - - private static class MapReader implements OrcValueReader { - private final OrcValueReader keyReader; - private final OrcValueReader valueReader; - - private MapReader(OrcValueReader keyReader, OrcValueReader valueReader) { - this.keyReader = keyReader; - this.valueReader = valueReader; - } - - @Override - public MapData nonNullRead(ColumnVector vector, int row) { - MapColumnVector mapVector = (MapColumnVector) vector; - int offset = (int) mapVector.offsets[row]; - long length = mapVector.lengths[row]; - - Map map = Maps.newHashMap(); - for (int c = 0; c < length; c++) { - K key = keyReader.read(mapVector.keys, offset + c); - V value = valueReader.read(mapVector.values, offset + c); - map.put(key, value); - } - - return new GenericMapData(map); - } - - @Override - public void setBatchContext(long batchOffsetInFile) { - keyReader.setBatchContext(batchOffsetInFile); - valueReader.setBatchContext(batchOffsetInFile); - } - } - - private static class StructReader extends OrcValueReaders.StructReader { - private final int numFields; - - StructReader( - List> readers, Types.StructType struct, Map idToConstant) { - super(readers, struct, idToConstant); - this.numFields = struct.fields().size(); - } - - @Override - protected RowData create() { - return new GenericRowData(numFields); - } - - @Override - protected void set(RowData struct, int pos, Object value) { - ((GenericRowData) struct).setField(pos, value); - } - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriter.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriter.java deleted file mode 100644 index 6a31accffd22..000000000000 --- 
a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriter.java +++ /dev/null @@ -1,163 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.data; - -import java.util.Deque; -import java.util.List; -import java.util.stream.Stream; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.LogicalType; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.FieldMetrics; -import org.apache.iceberg.Schema; -import org.apache.iceberg.data.orc.GenericOrcWriters; -import org.apache.iceberg.orc.OrcRowWriter; -import org.apache.iceberg.orc.OrcValueWriter; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.Types; -import org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch; - -public class FlinkOrcWriter implements OrcRowWriter { - private final FlinkOrcWriters.RowDataWriter writer; - - private FlinkOrcWriter(RowType rowType, Schema iSchema) { - this.writer = - (FlinkOrcWriters.RowDataWriter) - FlinkSchemaVisitor.visit(rowType, iSchema, new WriteBuilder()); - } - - public static OrcRowWriter buildWriter(RowType rowType, Schema iSchema) { - return new FlinkOrcWriter(rowType, iSchema); - } - - @Override - public void write(RowData row, VectorizedRowBatch output) { - Preconditions.checkArgument(row != null, "value must not be null"); - writer.writeRow(row, output); - } - - @Override - public List> writers() { - return writer.writers(); - } - - @Override - public Stream> metrics() { - return writer.metrics(); - } - - private static class WriteBuilder extends FlinkSchemaVisitor> { - private final Deque fieldIds = Lists.newLinkedList(); - - private WriteBuilder() {} - - @Override - public void beforeField(Types.NestedField field) { - fieldIds.push(field.fieldId()); - } - - @Override - public void afterField(Types.NestedField field) { - fieldIds.pop(); - } - - @Override - public OrcValueWriter record( - Types.StructType iStruct, List> results, List fieldType) { - return FlinkOrcWriters.struct(results, fieldType); - } - - @Override - public OrcValueWriter map( - Types.MapType iMap, - OrcValueWriter key, - OrcValueWriter value, - LogicalType keyType, - LogicalType valueType) { - return FlinkOrcWriters.map(key, value, keyType, valueType); - } - - @Override - public OrcValueWriter list( - Types.ListType iList, OrcValueWriter element, LogicalType elementType) { - return FlinkOrcWriters.list(element, elementType); - } - - @Override - public OrcValueWriter primitive(Type.PrimitiveType iPrimitive, LogicalType flinkPrimitive) { - switch (iPrimitive.typeId()) { - case BOOLEAN: - 
return GenericOrcWriters.booleans(); - case INTEGER: - switch (flinkPrimitive.getTypeRoot()) { - case TINYINT: - return GenericOrcWriters.bytes(); - case SMALLINT: - return GenericOrcWriters.shorts(); - } - return GenericOrcWriters.ints(); - case LONG: - return GenericOrcWriters.longs(); - case FLOAT: - Preconditions.checkArgument( - fieldIds.peek() != null, - String.format( - "[BUG] Cannot find field id for primitive field with type %s. This is likely because id " - + "information is not properly pushed during schema visiting.", - iPrimitive)); - return GenericOrcWriters.floats(fieldIds.peek()); - case DOUBLE: - Preconditions.checkArgument( - fieldIds.peek() != null, - String.format( - "[BUG] Cannot find field id for primitive field with type %s. This is likely because id " - + "information is not properly pushed during schema visiting.", - iPrimitive)); - return GenericOrcWriters.doubles(fieldIds.peek()); - case DATE: - return FlinkOrcWriters.dates(); - case TIME: - return FlinkOrcWriters.times(); - case TIMESTAMP: - Types.TimestampType timestampType = (Types.TimestampType) iPrimitive; - if (timestampType.shouldAdjustToUTC()) { - return FlinkOrcWriters.timestampTzs(); - } else { - return FlinkOrcWriters.timestamps(); - } - case STRING: - return FlinkOrcWriters.strings(); - case UUID: - case FIXED: - case BINARY: - return GenericOrcWriters.byteArrays(); - case DECIMAL: - Types.DecimalType decimalType = (Types.DecimalType) iPrimitive; - return FlinkOrcWriters.decimals(decimalType.precision(), decimalType.scale()); - default: - throw new IllegalArgumentException( - String.format( - "Invalid iceberg type %s corresponding to Flink logical type %s", - iPrimitive, flinkPrimitive)); - } - } - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriters.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriters.java deleted file mode 100644 index da2f95cf822f..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriters.java +++ /dev/null @@ -1,317 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.data; - -import java.time.Instant; -import java.time.OffsetDateTime; -import java.time.ZoneOffset; -import java.util.List; -import java.util.stream.Stream; -import org.apache.flink.table.data.ArrayData; -import org.apache.flink.table.data.DecimalData; -import org.apache.flink.table.data.MapData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.flink.table.data.TimestampData; -import org.apache.flink.table.types.logical.LogicalType; -import org.apache.iceberg.FieldMetrics; -import org.apache.iceberg.data.orc.GenericOrcWriters; -import org.apache.iceberg.orc.OrcValueWriter; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.orc.storage.common.type.HiveDecimal; -import org.apache.orc.storage.ql.exec.vector.BytesColumnVector; -import org.apache.orc.storage.ql.exec.vector.ColumnVector; -import org.apache.orc.storage.ql.exec.vector.DecimalColumnVector; -import org.apache.orc.storage.ql.exec.vector.ListColumnVector; -import org.apache.orc.storage.ql.exec.vector.LongColumnVector; -import org.apache.orc.storage.ql.exec.vector.MapColumnVector; -import org.apache.orc.storage.ql.exec.vector.TimestampColumnVector; - -class FlinkOrcWriters { - - private FlinkOrcWriters() {} - - static OrcValueWriter strings() { - return StringWriter.INSTANCE; - } - - static OrcValueWriter dates() { - return DateWriter.INSTANCE; - } - - static OrcValueWriter times() { - return TimeWriter.INSTANCE; - } - - static OrcValueWriter timestamps() { - return TimestampWriter.INSTANCE; - } - - static OrcValueWriter timestampTzs() { - return TimestampTzWriter.INSTANCE; - } - - static OrcValueWriter decimals(int precision, int scale) { - if (precision <= 18) { - return new Decimal18Writer(precision, scale); - } else if (precision <= 38) { - return new Decimal38Writer(precision, scale); - } else { - throw new IllegalArgumentException("Invalid precision: " + precision); - } - } - - static OrcValueWriter list( - OrcValueWriter elementWriter, LogicalType elementType) { - return new ListWriter<>(elementWriter, elementType); - } - - static OrcValueWriter map( - OrcValueWriter keyWriter, - OrcValueWriter valueWriter, - LogicalType keyType, - LogicalType valueType) { - return new MapWriter<>(keyWriter, valueWriter, keyType, valueType); - } - - static OrcValueWriter struct(List> writers, List types) { - return new RowDataWriter(writers, types); - } - - private static class StringWriter implements OrcValueWriter { - private static final StringWriter INSTANCE = new StringWriter(); - - @Override - public void nonNullWrite(int rowId, StringData data, ColumnVector output) { - byte[] value = data.toBytes(); - ((BytesColumnVector) output).setRef(rowId, value, 0, value.length); - } - } - - private static class DateWriter implements OrcValueWriter { - private static final DateWriter INSTANCE = new DateWriter(); - - @Override - public void nonNullWrite(int rowId, Integer data, ColumnVector output) { - ((LongColumnVector) output).vector[rowId] = data; - } - } - - private static class TimeWriter implements OrcValueWriter { - private static final TimeWriter INSTANCE = new TimeWriter(); - - @Override - public void nonNullWrite(int rowId, Integer millis, ColumnVector output) { - // The time in flink is in millisecond, while the standard time in iceberg is microsecond. - // So we need to transform it to microsecond. 
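// (Illustrative note, not part of the original file.) This is the inverse of
// FlinkOrcReaders.TimeReader: a Flink TIME of 34_200_123 ms is written as
// 34_200_123_000L micros, so millisecond-precision values round-trip losslessly through ORC.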
- ((LongColumnVector) output).vector[rowId] = millis * 1000L; - } - } - - private static class TimestampWriter implements OrcValueWriter { - private static final TimestampWriter INSTANCE = new TimestampWriter(); - - @Override - public void nonNullWrite(int rowId, TimestampData data, ColumnVector output) { - TimestampColumnVector cv = (TimestampColumnVector) output; - cv.setIsUTC(true); - // millis - OffsetDateTime offsetDateTime = data.toInstant().atOffset(ZoneOffset.UTC); - cv.time[rowId] = - offsetDateTime.toEpochSecond() * 1_000 + offsetDateTime.getNano() / 1_000_000; - // truncate nanos to only keep microsecond precision. - cv.nanos[rowId] = (offsetDateTime.getNano() / 1_000) * 1_000; - } - } - - private static class TimestampTzWriter implements OrcValueWriter { - private static final TimestampTzWriter INSTANCE = new TimestampTzWriter(); - - @SuppressWarnings("JavaInstantGetSecondsGetNano") - @Override - public void nonNullWrite(int rowId, TimestampData data, ColumnVector output) { - TimestampColumnVector cv = (TimestampColumnVector) output; - // millis - Instant instant = data.toInstant(); - cv.time[rowId] = instant.toEpochMilli(); - // truncate nanos to only keep microsecond precision. - cv.nanos[rowId] = (instant.getNano() / 1_000) * 1_000; - } - } - - private static class Decimal18Writer implements OrcValueWriter { - private final int precision; - private final int scale; - - Decimal18Writer(int precision, int scale) { - this.precision = precision; - this.scale = scale; - } - - @Override - public void nonNullWrite(int rowId, DecimalData data, ColumnVector output) { - Preconditions.checkArgument( - scale == data.scale(), - "Cannot write value as decimal(%s,%s), wrong scale: %s", - precision, - scale, - data); - Preconditions.checkArgument( - data.precision() <= precision, - "Cannot write value as decimal(%s,%s), too large: %s", - precision, - scale, - data); - - ((DecimalColumnVector) output) - .vector[rowId].setFromLongAndScale(data.toUnscaledLong(), data.scale()); - } - } - - private static class Decimal38Writer implements OrcValueWriter { - private final int precision; - private final int scale; - - Decimal38Writer(int precision, int scale) { - this.precision = precision; - this.scale = scale; - } - - @Override - public void nonNullWrite(int rowId, DecimalData data, ColumnVector output) { - Preconditions.checkArgument( - scale == data.scale(), - "Cannot write value as decimal(%s,%s), wrong scale: %s", - precision, - scale, - data); - Preconditions.checkArgument( - data.precision() <= precision, - "Cannot write value as decimal(%s,%s), too large: %s", - precision, - scale, - data); - - ((DecimalColumnVector) output) - .vector[rowId].set(HiveDecimal.create(data.toBigDecimal(), false)); - } - } - - static class ListWriter implements OrcValueWriter { - private final OrcValueWriter elementWriter; - private final ArrayData.ElementGetter elementGetter; - - ListWriter(OrcValueWriter elementWriter, LogicalType elementType) { - this.elementWriter = elementWriter; - this.elementGetter = ArrayData.createElementGetter(elementType); - } - - @Override - @SuppressWarnings("unchecked") - public void nonNullWrite(int rowId, ArrayData data, ColumnVector output) { - ListColumnVector cv = (ListColumnVector) output; - cv.lengths[rowId] = data.size(); - cv.offsets[rowId] = cv.childCount; - cv.childCount = (int) (cv.childCount + cv.lengths[rowId]); - // make sure the child is big enough. 
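// (Illustrative note, not part of the original file.) ORC's ListColumnVector flattens every
// row's elements into a single child vector: for the rows ["a","b","c"] and ["d"] the child
// holds [a, b, c, d] with offsets [0, 3], lengths [3, 1] and childCount 4, which is why the
// child must be grown to childCount before the elements are written below.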
- growColumnVector(cv.child, cv.childCount); - - for (int e = 0; e < cv.lengths[rowId]; ++e) { - Object value = elementGetter.getElementOrNull(data, e); - elementWriter.write((int) (e + cv.offsets[rowId]), (T) value, cv.child); - } - } - - @Override - public Stream> metrics() { - return elementWriter.metrics(); - } - } - - static class MapWriter implements OrcValueWriter { - private final OrcValueWriter keyWriter; - private final OrcValueWriter valueWriter; - private final ArrayData.ElementGetter keyGetter; - private final ArrayData.ElementGetter valueGetter; - - MapWriter( - OrcValueWriter keyWriter, - OrcValueWriter valueWriter, - LogicalType keyType, - LogicalType valueType) { - this.keyWriter = keyWriter; - this.valueWriter = valueWriter; - this.keyGetter = ArrayData.createElementGetter(keyType); - this.valueGetter = ArrayData.createElementGetter(valueType); - } - - @Override - @SuppressWarnings("unchecked") - public void nonNullWrite(int rowId, MapData data, ColumnVector output) { - MapColumnVector cv = (MapColumnVector) output; - ArrayData keyArray = data.keyArray(); - ArrayData valArray = data.valueArray(); - - // record the length and start of the list elements - cv.lengths[rowId] = data.size(); - cv.offsets[rowId] = cv.childCount; - cv.childCount = (int) (cv.childCount + cv.lengths[rowId]); - // make sure the child is big enough - growColumnVector(cv.keys, cv.childCount); - growColumnVector(cv.values, cv.childCount); - // Add each element - for (int e = 0; e < cv.lengths[rowId]; ++e) { - int pos = (int) (e + cv.offsets[rowId]); - keyWriter.write(pos, (K) keyGetter.getElementOrNull(keyArray, e), cv.keys); - valueWriter.write(pos, (V) valueGetter.getElementOrNull(valArray, e), cv.values); - } - } - - @Override - public Stream> metrics() { - return Stream.concat(keyWriter.metrics(), valueWriter.metrics()); - } - } - - static class RowDataWriter extends GenericOrcWriters.StructWriter { - private final List fieldGetters; - - RowDataWriter(List> writers, List types) { - super(writers); - - this.fieldGetters = Lists.newArrayListWithExpectedSize(types.size()); - for (int i = 0; i < types.size(); i++) { - fieldGetters.add(RowData.createFieldGetter(types.get(i), i)); - } - } - - @Override - protected Object get(RowData struct, int index) { - return fieldGetters.get(index).getFieldOrNull(struct); - } - } - - private static void growColumnVector(ColumnVector cv, int requestedSize) { - if (cv.isNull.length < requestedSize) { - // Use growth factor of 3 to avoid frequent array allocations - cv.ensureSize(requestedSize * 3, true); - } - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetReaders.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetReaders.java deleted file mode 100644 index a5f2bb738960..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetReaders.java +++ /dev/null @@ -1,905 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.data; - -import java.math.BigDecimal; -import java.math.BigInteger; -import java.nio.ByteBuffer; -import java.time.Instant; -import java.time.ZoneOffset; -import java.util.Arrays; -import java.util.List; -import java.util.Map; -import java.util.Optional; -import org.apache.flink.table.data.ArrayData; -import org.apache.flink.table.data.DecimalData; -import org.apache.flink.table.data.GenericArrayData; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.MapData; -import org.apache.flink.table.data.RawValueData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.flink.table.data.TimestampData; -import org.apache.iceberg.MetadataColumns; -import org.apache.iceberg.Schema; -import org.apache.iceberg.parquet.ParquetSchemaUtil; -import org.apache.iceberg.parquet.ParquetValueReader; -import org.apache.iceberg.parquet.ParquetValueReaders; -import org.apache.iceberg.parquet.TypeWithSchemaVisitor; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.types.Types; -import org.apache.iceberg.util.ArrayUtil; -import org.apache.parquet.column.ColumnDescriptor; -import org.apache.parquet.io.api.Binary; -import org.apache.parquet.schema.GroupType; -import org.apache.parquet.schema.LogicalTypeAnnotation; -import org.apache.parquet.schema.LogicalTypeAnnotation.DecimalLogicalTypeAnnotation; -import org.apache.parquet.schema.MessageType; -import org.apache.parquet.schema.PrimitiveType; -import org.apache.parquet.schema.Type; - -public class FlinkParquetReaders { - private FlinkParquetReaders() {} - - public static ParquetValueReader buildReader( - Schema expectedSchema, MessageType fileSchema) { - return buildReader(expectedSchema, fileSchema, ImmutableMap.of()); - } - - @SuppressWarnings("unchecked") - public static ParquetValueReader buildReader( - Schema expectedSchema, MessageType fileSchema, Map idToConstant) { - return (ParquetValueReader) - TypeWithSchemaVisitor.visit( - expectedSchema.asStruct(), fileSchema, new ReadBuilder(fileSchema, idToConstant)); - } - - private static class ReadBuilder extends TypeWithSchemaVisitor> { - private final MessageType type; - private final Map idToConstant; - - ReadBuilder(MessageType type, Map idToConstant) { - this.type = type; - this.idToConstant = idToConstant; - } - - @Override - public ParquetValueReader message( - Types.StructType expected, MessageType message, List> fieldReaders) { - return struct(expected, message.asGroupType(), fieldReaders); - } - - @Override - public ParquetValueReader struct( - Types.StructType expected, GroupType struct, List> fieldReaders) { - // match the expected struct's order - Map> readersById = Maps.newHashMap(); - Map typesById = Maps.newHashMap(); - Map maxDefinitionLevelsById = Maps.newHashMap(); - List fields = 
struct.getFields(); - for (int i = 0; i < fields.size(); i += 1) { - Type fieldType = fields.get(i); - if (fieldReaders.get(i) != null) { - int fieldD = type.getMaxDefinitionLevel(path(fieldType.getName())) - 1; - if (fieldType.getId() != null) { - int id = fieldType.getId().intValue(); - readersById.put(id, ParquetValueReaders.option(fieldType, fieldD, fieldReaders.get(i))); - typesById.put(id, fieldType); - if (idToConstant.containsKey(id)) { - maxDefinitionLevelsById.put(id, fieldD); - } - } - } - } - - List expectedFields = - expected != null ? expected.fields() : ImmutableList.of(); - List> reorderedFields = - Lists.newArrayListWithExpectedSize(expectedFields.size()); - List types = Lists.newArrayListWithExpectedSize(expectedFields.size()); - // Defaulting to parent max definition level - int defaultMaxDefinitionLevel = type.getMaxDefinitionLevel(currentPath()); - for (Types.NestedField field : expectedFields) { - int id = field.fieldId(); - if (idToConstant.containsKey(id)) { - // containsKey is used because the constant may be null - int fieldMaxDefinitionLevel = - maxDefinitionLevelsById.getOrDefault(id, defaultMaxDefinitionLevel); - reorderedFields.add( - ParquetValueReaders.constant(idToConstant.get(id), fieldMaxDefinitionLevel)); - types.add(null); - } else if (id == MetadataColumns.ROW_POSITION.fieldId()) { - reorderedFields.add(ParquetValueReaders.position()); - types.add(null); - } else if (id == MetadataColumns.IS_DELETED.fieldId()) { - reorderedFields.add(ParquetValueReaders.constant(false)); - types.add(null); - } else { - ParquetValueReader reader = readersById.get(id); - if (reader != null) { - reorderedFields.add(reader); - types.add(typesById.get(id)); - } else { - reorderedFields.add(ParquetValueReaders.nulls()); - types.add(null); - } - } - } - - return new RowDataReader(types, reorderedFields); - } - - @Override - public ParquetValueReader list( - Types.ListType expectedList, GroupType array, ParquetValueReader elementReader) { - if (expectedList == null) { - return null; - } - - String[] repeatedPath = currentPath(); - - int repeatedD = type.getMaxDefinitionLevel(repeatedPath) - 1; - int repeatedR = type.getMaxRepetitionLevel(repeatedPath) - 1; - - Type elementType = ParquetSchemaUtil.determineListElementType(array); - int elementD = type.getMaxDefinitionLevel(path(elementType.getName())) - 1; - - return new ArrayReader<>( - repeatedD, repeatedR, ParquetValueReaders.option(elementType, elementD, elementReader)); - } - - @Override - public ParquetValueReader map( - Types.MapType expectedMap, - GroupType map, - ParquetValueReader keyReader, - ParquetValueReader valueReader) { - if (expectedMap == null) { - return null; - } - - GroupType repeatedKeyValue = map.getFields().get(0).asGroupType(); - String[] repeatedPath = currentPath(); - - int repeatedD = type.getMaxDefinitionLevel(repeatedPath) - 1; - int repeatedR = type.getMaxRepetitionLevel(repeatedPath) - 1; - - Type keyType = repeatedKeyValue.getType(0); - int keyD = type.getMaxDefinitionLevel(path(keyType.getName())) - 1; - Type valueType = repeatedKeyValue.getType(1); - int valueD = type.getMaxDefinitionLevel(path(valueType.getName())) - 1; - - return new MapReader<>( - repeatedD, - repeatedR, - ParquetValueReaders.option(keyType, keyD, keyReader), - ParquetValueReaders.option(valueType, valueD, valueReader)); - } - - private static class LogicalTypeAnnotationParquetValueReaderVisitor - implements LogicalTypeAnnotation.LogicalTypeAnnotationVisitor> { - - private final PrimitiveType primitive; - private final 
ColumnDescriptor desc; - private final org.apache.iceberg.types.Type.PrimitiveType expected; - - LogicalTypeAnnotationParquetValueReaderVisitor( - PrimitiveType primitive, - ColumnDescriptor desc, - org.apache.iceberg.types.Type.PrimitiveType expected) { - this.primitive = primitive; - this.desc = desc; - this.expected = expected; - } - - @Override - public Optional> visit( - LogicalTypeAnnotation.StringLogicalTypeAnnotation stringLogicalType) { - return Optional.of(new StringReader(desc)); - } - - @Override - public Optional> visit( - LogicalTypeAnnotation.EnumLogicalTypeAnnotation enumLogicalType) { - return Optional.of(new StringReader(desc)); - } - - @Override - public Optional> visit( - LogicalTypeAnnotation.JsonLogicalTypeAnnotation jsonLogicalType) { - return Optional.of(new StringReader(desc)); - } - - @Override - public Optional> visit( - DecimalLogicalTypeAnnotation decimalLogicalType) { - switch (primitive.getPrimitiveTypeName()) { - case BINARY: - case FIXED_LEN_BYTE_ARRAY: - return Optional.of( - new BinaryDecimalReader( - desc, decimalLogicalType.getPrecision(), decimalLogicalType.getScale())); - case INT64: - return Optional.of( - new LongDecimalReader( - desc, decimalLogicalType.getPrecision(), decimalLogicalType.getScale())); - case INT32: - return Optional.of( - new IntegerDecimalReader( - desc, decimalLogicalType.getPrecision(), decimalLogicalType.getScale())); - } - - return LogicalTypeAnnotation.LogicalTypeAnnotationVisitor.super.visit(decimalLogicalType); - } - - @Override - public Optional> visit( - LogicalTypeAnnotation.DateLogicalTypeAnnotation dateLogicalType) { - return Optional.of(new ParquetValueReaders.UnboxedReader<>(desc)); - } - - @Override - public Optional> visit( - LogicalTypeAnnotation.TimeLogicalTypeAnnotation timeLogicalType) { - if (timeLogicalType.getUnit() == LogicalTypeAnnotation.TimeUnit.MILLIS) { - return Optional.of(new MillisTimeReader(desc)); - } else if (timeLogicalType.getUnit() == LogicalTypeAnnotation.TimeUnit.MICROS) { - return Optional.of(new LossyMicrosToMillisTimeReader(desc)); - } - - return LogicalTypeAnnotation.LogicalTypeAnnotationVisitor.super.visit(timeLogicalType); - } - - @Override - public Optional> visit( - LogicalTypeAnnotation.TimestampLogicalTypeAnnotation timestampLogicalType) { - if (timestampLogicalType.getUnit() == LogicalTypeAnnotation.TimeUnit.MILLIS) { - if (timestampLogicalType.isAdjustedToUTC()) { - return Optional.of(new MillisToTimestampTzReader(desc)); - } else { - return Optional.of(new MillisToTimestampReader(desc)); - } - } else if (timestampLogicalType.getUnit() == LogicalTypeAnnotation.TimeUnit.MICROS) { - if (timestampLogicalType.isAdjustedToUTC()) { - return Optional.of(new MicrosToTimestampTzReader(desc)); - } else { - return Optional.of(new MicrosToTimestampReader(desc)); - } - } - - return LogicalTypeAnnotation.LogicalTypeAnnotationVisitor.super.visit(timestampLogicalType); - } - - @Override - public Optional> visit( - LogicalTypeAnnotation.IntLogicalTypeAnnotation intLogicalType) { - int width = intLogicalType.getBitWidth(); - if (width <= 32) { - if (expected.typeId() == Types.LongType.get().typeId()) { - return Optional.of(new ParquetValueReaders.IntAsLongReader(desc)); - } else { - return Optional.of(new ParquetValueReaders.UnboxedReader<>(desc)); - } - } else if (width <= 64) { - return Optional.of(new ParquetValueReaders.UnboxedReader<>(desc)); - } - - return LogicalTypeAnnotation.LogicalTypeAnnotationVisitor.super.visit(intLogicalType); - } - - @Override - public Optional> visit( - 
LogicalTypeAnnotation.BsonLogicalTypeAnnotation bsonLogicalType) { - return Optional.of(new ParquetValueReaders.ByteArrayReader(desc)); - } - } - - @Override - @SuppressWarnings("CyclomaticComplexity") - public ParquetValueReader primitive( - org.apache.iceberg.types.Type.PrimitiveType expected, PrimitiveType primitive) { - if (expected == null) { - return null; - } - - ColumnDescriptor desc = type.getColumnDescription(currentPath()); - LogicalTypeAnnotation logicalTypeAnnotation = primitive.getLogicalTypeAnnotation(); - if (logicalTypeAnnotation != null) { - return logicalTypeAnnotation - .accept(new LogicalTypeAnnotationParquetValueReaderVisitor(primitive, desc, expected)) - .orElseThrow( - () -> - new UnsupportedOperationException( - "Unsupported logical type: " + primitive.getLogicalTypeAnnotation())); - } - - switch (primitive.getPrimitiveTypeName()) { - case FIXED_LEN_BYTE_ARRAY: - case BINARY: - return new ParquetValueReaders.ByteArrayReader(desc); - case INT32: - if (expected.typeId() == org.apache.iceberg.types.Type.TypeID.LONG) { - return new ParquetValueReaders.IntAsLongReader(desc); - } else { - return new ParquetValueReaders.UnboxedReader<>(desc); - } - case FLOAT: - if (expected.typeId() == org.apache.iceberg.types.Type.TypeID.DOUBLE) { - return new ParquetValueReaders.FloatAsDoubleReader(desc); - } else { - return new ParquetValueReaders.UnboxedReader<>(desc); - } - case BOOLEAN: - case INT64: - case DOUBLE: - return new ParquetValueReaders.UnboxedReader<>(desc); - default: - throw new UnsupportedOperationException("Unsupported type: " + primitive); - } - } - } - - private static class BinaryDecimalReader - extends ParquetValueReaders.PrimitiveReader { - private final int precision; - private final int scale; - - BinaryDecimalReader(ColumnDescriptor desc, int precision, int scale) { - super(desc); - this.precision = precision; - this.scale = scale; - } - - @Override - public DecimalData read(DecimalData ignored) { - Binary binary = column.nextBinary(); - BigDecimal bigDecimal = new BigDecimal(new BigInteger(binary.getBytes()), scale); - // TODO: need a unit test to write-read-validate decimal via FlinkParquetWrite/Reader - return DecimalData.fromBigDecimal(bigDecimal, precision, scale); - } - } - - private static class IntegerDecimalReader - extends ParquetValueReaders.PrimitiveReader { - private final int precision; - private final int scale; - - IntegerDecimalReader(ColumnDescriptor desc, int precision, int scale) { - super(desc); - this.precision = precision; - this.scale = scale; - } - - @Override - public DecimalData read(DecimalData ignored) { - return DecimalData.fromUnscaledLong(column.nextInteger(), precision, scale); - } - } - - private static class LongDecimalReader extends ParquetValueReaders.PrimitiveReader { - private final int precision; - private final int scale; - - LongDecimalReader(ColumnDescriptor desc, int precision, int scale) { - super(desc); - this.precision = precision; - this.scale = scale; - } - - @Override - public DecimalData read(DecimalData ignored) { - return DecimalData.fromUnscaledLong(column.nextLong(), precision, scale); - } - } - - private static class MicrosToTimestampTzReader - extends ParquetValueReaders.UnboxedReader { - MicrosToTimestampTzReader(ColumnDescriptor desc) { - super(desc); - } - - @Override - public TimestampData read(TimestampData ignored) { - long value = readLong(); - return TimestampData.fromLocalDateTime( - Instant.ofEpochSecond( - Math.floorDiv(value, 1000_000L), Math.floorMod(value, 1000_000L) * 1000L) - 
.atOffset(ZoneOffset.UTC) - .toLocalDateTime()); - } - - @Override - public long readLong() { - return column.nextLong(); - } - } - - private static class MicrosToTimestampReader - extends ParquetValueReaders.UnboxedReader { - MicrosToTimestampReader(ColumnDescriptor desc) { - super(desc); - } - - @Override - public TimestampData read(TimestampData ignored) { - long value = readLong(); - return TimestampData.fromInstant( - Instant.ofEpochSecond( - Math.floorDiv(value, 1000_000L), Math.floorMod(value, 1000_000L) * 1000L)); - } - - @Override - public long readLong() { - return column.nextLong(); - } - } - - private static class MillisToTimestampReader - extends ParquetValueReaders.UnboxedReader { - MillisToTimestampReader(ColumnDescriptor desc) { - super(desc); - } - - @Override - public TimestampData read(TimestampData ignored) { - long millis = readLong(); - return TimestampData.fromEpochMillis(millis); - } - - @Override - public long readLong() { - return column.nextLong(); - } - } - - private static class MillisToTimestampTzReader - extends ParquetValueReaders.UnboxedReader { - MillisToTimestampTzReader(ColumnDescriptor desc) { - super(desc); - } - - @Override - public TimestampData read(TimestampData ignored) { - long millis = readLong(); - return TimestampData.fromLocalDateTime( - Instant.ofEpochMilli(millis).atOffset(ZoneOffset.UTC).toLocalDateTime()); - } - - @Override - public long readLong() { - return column.nextLong(); - } - } - - private static class StringReader extends ParquetValueReaders.PrimitiveReader { - StringReader(ColumnDescriptor desc) { - super(desc); - } - - @Override - public StringData read(StringData ignored) { - Binary binary = column.nextBinary(); - ByteBuffer buffer = binary.toByteBuffer(); - if (buffer.hasArray()) { - return StringData.fromBytes( - buffer.array(), buffer.arrayOffset() + buffer.position(), buffer.remaining()); - } else { - return StringData.fromBytes(binary.getBytes()); - } - } - } - - private static class LossyMicrosToMillisTimeReader - extends ParquetValueReaders.PrimitiveReader { - LossyMicrosToMillisTimeReader(ColumnDescriptor desc) { - super(desc); - } - - @Override - public Integer read(Integer reuse) { - // Discard microseconds since Flink uses millisecond unit for TIME type. 
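// (Illustrative note, not part of the original file.) floorDiv rounds toward negative
// infinity, so 12_345_678L micros becomes 12_345 ms, and a (theoretical) negative value such
// as -1_500L micros becomes -2 ms rather than the -1 ms plain integer division would yield.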
- return (int) Math.floorDiv(column.nextLong(), 1000L); - } - } - - private static class MillisTimeReader extends ParquetValueReaders.PrimitiveReader { - MillisTimeReader(ColumnDescriptor desc) { - super(desc); - } - - @Override - public Integer read(Integer reuse) { - return (int) column.nextLong(); - } - } - - private static class ArrayReader - extends ParquetValueReaders.RepeatedReader { - private int readPos = 0; - private int writePos = 0; - - ArrayReader(int definitionLevel, int repetitionLevel, ParquetValueReader reader) { - super(definitionLevel, repetitionLevel, reader); - } - - @Override - protected ReusableArrayData newListData(ArrayData reuse) { - this.readPos = 0; - this.writePos = 0; - - if (reuse instanceof ReusableArrayData) { - return (ReusableArrayData) reuse; - } else { - return new ReusableArrayData(); - } - } - - @Override - @SuppressWarnings("unchecked") - protected E getElement(ReusableArrayData list) { - E value = null; - if (readPos < list.capacity()) { - value = (E) list.values[readPos]; - } - - readPos += 1; - - return value; - } - - @Override - protected void addElement(ReusableArrayData reused, E element) { - if (writePos >= reused.capacity()) { - reused.grow(); - } - - reused.values[writePos] = element; - - writePos += 1; - } - - @Override - protected ArrayData buildList(ReusableArrayData list) { - // Since ReusableArrayData is not accepted by Flink, use GenericArrayData temporarily to walk - // around it. - // Revert this to use ReusableArrayData once it is fixed in Flink. - // For your reference, https://issues.apache.org/jira/browse/FLINK-25238. - return new GenericArrayData(Arrays.copyOf(list.values, writePos)); - } - } - - private static class MapReader - extends ParquetValueReaders.RepeatedKeyValueReader { - private int readPos = 0; - private int writePos = 0; - - private final ParquetValueReaders.ReusableEntry entry = - new ParquetValueReaders.ReusableEntry<>(); - private final ParquetValueReaders.ReusableEntry nullEntry = - new ParquetValueReaders.ReusableEntry<>(); - - MapReader( - int definitionLevel, - int repetitionLevel, - ParquetValueReader keyReader, - ParquetValueReader valueReader) { - super(definitionLevel, repetitionLevel, keyReader, valueReader); - } - - @Override - protected ReusableMapData newMapData(MapData reuse) { - this.readPos = 0; - this.writePos = 0; - - if (reuse instanceof ReusableMapData) { - return (ReusableMapData) reuse; - } else { - return new ReusableMapData(); - } - } - - @Override - @SuppressWarnings("unchecked") - protected Map.Entry getPair(ReusableMapData map) { - Map.Entry kv = nullEntry; - if (readPos < map.capacity()) { - entry.set((K) map.keys.values[readPos], (V) map.values.values[readPos]); - kv = entry; - } - - readPos += 1; - - return kv; - } - - @Override - protected void addPair(ReusableMapData map, K key, V value) { - if (writePos >= map.capacity()) { - map.grow(); - } - - map.keys.values[writePos] = key; - map.values.values[writePos] = value; - - writePos += 1; - } - - @Override - protected MapData buildMap(ReusableMapData map) { - map.setNumElements(writePos); - return map; - } - } - - private static class RowDataReader - extends ParquetValueReaders.StructReader { - private final int numFields; - - RowDataReader(List types, List> readers) { - super(types, readers); - this.numFields = readers.size(); - } - - @Override - protected GenericRowData newStructData(RowData reuse) { - if (reuse instanceof GenericRowData) { - return (GenericRowData) reuse; - } else { - return new GenericRowData(numFields); - } - } - 
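// (Note, not part of the original file.) ArrayReader, MapReader and RowDataReader share the
// same object-reuse pattern: newListData/newMapData/newStructData recycle the container passed
// in as "reuse" whenever possible, and ArrayReader.buildList copies into a GenericArrayData
// only as a workaround for FLINK-25238, as the comment above notes.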
- @Override - protected Object getField(GenericRowData intermediate, int pos) { - return intermediate.getField(pos); - } - - @Override - protected RowData buildStruct(GenericRowData struct) { - return struct; - } - - @Override - protected void set(GenericRowData row, int pos, Object value) { - row.setField(pos, value); - } - - @Override - protected void setNull(GenericRowData row, int pos) { - row.setField(pos, null); - } - - @Override - protected void setBoolean(GenericRowData row, int pos, boolean value) { - row.setField(pos, value); - } - - @Override - protected void setInteger(GenericRowData row, int pos, int value) { - row.setField(pos, value); - } - - @Override - protected void setLong(GenericRowData row, int pos, long value) { - row.setField(pos, value); - } - - @Override - protected void setFloat(GenericRowData row, int pos, float value) { - row.setField(pos, value); - } - - @Override - protected void setDouble(GenericRowData row, int pos, double value) { - row.setField(pos, value); - } - } - - private static class ReusableMapData implements MapData { - private final ReusableArrayData keys; - private final ReusableArrayData values; - - private int numElements; - - private ReusableMapData() { - this.keys = new ReusableArrayData(); - this.values = new ReusableArrayData(); - } - - private void grow() { - keys.grow(); - values.grow(); - } - - private int capacity() { - return keys.capacity(); - } - - public void setNumElements(int numElements) { - this.numElements = numElements; - keys.setNumElements(numElements); - values.setNumElements(numElements); - } - - @Override - public int size() { - return numElements; - } - - @Override - public ReusableArrayData keyArray() { - return keys; - } - - @Override - public ReusableArrayData valueArray() { - return values; - } - } - - private static class ReusableArrayData implements ArrayData { - private static final Object[] EMPTY = new Object[0]; - - private Object[] values = EMPTY; - private int numElements = 0; - - private void grow() { - if (values.length == 0) { - this.values = new Object[20]; - } else { - Object[] old = values; - this.values = new Object[old.length << 1]; - // copy the old array in case it has values that can be reused - System.arraycopy(old, 0, values, 0, old.length); - } - } - - private int capacity() { - return values.length; - } - - public void setNumElements(int numElements) { - this.numElements = numElements; - } - - @Override - public int size() { - return numElements; - } - - @Override - public boolean isNullAt(int ordinal) { - return null == values[ordinal]; - } - - @Override - public boolean getBoolean(int ordinal) { - return (boolean) values[ordinal]; - } - - @Override - public byte getByte(int ordinal) { - return (byte) values[ordinal]; - } - - @Override - public short getShort(int ordinal) { - return (short) values[ordinal]; - } - - @Override - public int getInt(int ordinal) { - return (int) values[ordinal]; - } - - @Override - public long getLong(int ordinal) { - return (long) values[ordinal]; - } - - @Override - public float getFloat(int ordinal) { - return (float) values[ordinal]; - } - - @Override - public double getDouble(int ordinal) { - return (double) values[ordinal]; - } - - @Override - public StringData getString(int pos) { - return (StringData) values[pos]; - } - - @Override - public DecimalData getDecimal(int pos, int precision, int scale) { - return (DecimalData) values[pos]; - } - - @Override - public TimestampData getTimestamp(int pos, int precision) { - return (TimestampData) values[pos]; - } - - 
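// (Note, not part of the original file.) ReusableMapData and ReusableArrayData are grow-only
// buffers recycled across rows: grow() starts at 20 slots and then doubles the backing array
// (old.length << 1), while setNumElements(...) records how many of those slots hold valid
// values for the row currently being read.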
@SuppressWarnings("unchecked") - @Override - public RawValueData getRawValue(int pos) { - return (RawValueData) values[pos]; - } - - @Override - public byte[] getBinary(int ordinal) { - return (byte[]) values[ordinal]; - } - - @Override - public ArrayData getArray(int ordinal) { - return (ArrayData) values[ordinal]; - } - - @Override - public MapData getMap(int ordinal) { - return (MapData) values[ordinal]; - } - - @Override - public RowData getRow(int pos, int numFields) { - return (RowData) values[pos]; - } - - @Override - public boolean[] toBooleanArray() { - return ArrayUtil.toPrimitive((Boolean[]) values); - } - - @Override - public byte[] toByteArray() { - return ArrayUtil.toPrimitive((Byte[]) values); - } - - @Override - public short[] toShortArray() { - return ArrayUtil.toPrimitive((Short[]) values); - } - - @Override - public int[] toIntArray() { - return ArrayUtil.toPrimitive((Integer[]) values); - } - - @Override - public long[] toLongArray() { - return ArrayUtil.toPrimitive((Long[]) values); - } - - @Override - public float[] toFloatArray() { - return ArrayUtil.toPrimitive((Float[]) values); - } - - @Override - public double[] toDoubleArray() { - return ArrayUtil.toPrimitive((Double[]) values); - } - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetWriters.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetWriters.java deleted file mode 100644 index db4f1730a134..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetWriters.java +++ /dev/null @@ -1,504 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.data; - -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.NoSuchElementException; -import org.apache.flink.table.data.ArrayData; -import org.apache.flink.table.data.DecimalData; -import org.apache.flink.table.data.MapData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.flink.table.data.TimestampData; -import org.apache.flink.table.types.logical.ArrayType; -import org.apache.flink.table.types.logical.LogicalType; -import org.apache.flink.table.types.logical.MapType; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.table.types.logical.RowType.RowField; -import org.apache.flink.table.types.logical.SmallIntType; -import org.apache.flink.table.types.logical.TinyIntType; -import org.apache.iceberg.parquet.ParquetValueReaders; -import org.apache.iceberg.parquet.ParquetValueWriter; -import org.apache.iceberg.parquet.ParquetValueWriters; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.TypeUtil; -import org.apache.iceberg.util.DecimalUtil; -import org.apache.parquet.column.ColumnDescriptor; -import org.apache.parquet.io.api.Binary; -import org.apache.parquet.schema.GroupType; -import org.apache.parquet.schema.LogicalTypeAnnotation.DecimalLogicalTypeAnnotation; -import org.apache.parquet.schema.MessageType; -import org.apache.parquet.schema.PrimitiveType; -import org.apache.parquet.schema.Type; - -public class FlinkParquetWriters { - private FlinkParquetWriters() {} - - @SuppressWarnings("unchecked") - public static ParquetValueWriter buildWriter(LogicalType schema, MessageType type) { - return (ParquetValueWriter) - ParquetWithFlinkSchemaVisitor.visit(schema, type, new WriteBuilder(type)); - } - - private static class WriteBuilder extends ParquetWithFlinkSchemaVisitor> { - private final MessageType type; - - WriteBuilder(MessageType type) { - this.type = type; - } - - @Override - public ParquetValueWriter message( - RowType sStruct, MessageType message, List> fields) { - return struct(sStruct, message.asGroupType(), fields); - } - - @Override - public ParquetValueWriter struct( - RowType sStruct, GroupType struct, List> fieldWriters) { - List fields = struct.getFields(); - List flinkFields = sStruct.getFields(); - List> writers = Lists.newArrayListWithExpectedSize(fieldWriters.size()); - List flinkTypes = Lists.newArrayList(); - for (int i = 0; i < fields.size(); i += 1) { - writers.add(newOption(struct.getType(i), fieldWriters.get(i))); - flinkTypes.add(flinkFields.get(i).getType()); - } - - return new RowDataWriter(writers, flinkTypes); - } - - @Override - public ParquetValueWriter list( - ArrayType sArray, GroupType array, ParquetValueWriter elementWriter) { - GroupType repeated = array.getFields().get(0).asGroupType(); - String[] repeatedPath = currentPath(); - - int repeatedD = type.getMaxDefinitionLevel(repeatedPath); - int repeatedR = type.getMaxRepetitionLevel(repeatedPath); - - return new ArrayDataWriter<>( - repeatedD, - repeatedR, - newOption(repeated.getType(0), elementWriter), - sArray.getElementType()); - } - - @Override - public ParquetValueWriter map( - MapType sMap, - GroupType map, - ParquetValueWriter keyWriter, - ParquetValueWriter valueWriter) { - GroupType repeatedKeyValue = map.getFields().get(0).asGroupType(); - String[] repeatedPath = currentPath(); - - int repeatedD = 
type.getMaxDefinitionLevel(repeatedPath); - int repeatedR = type.getMaxRepetitionLevel(repeatedPath); - - return new MapDataWriter<>( - repeatedD, - repeatedR, - newOption(repeatedKeyValue.getType(0), keyWriter), - newOption(repeatedKeyValue.getType(1), valueWriter), - sMap.getKeyType(), - sMap.getValueType()); - } - - private ParquetValueWriter newOption(Type fieldType, ParquetValueWriter writer) { - int maxD = type.getMaxDefinitionLevel(path(fieldType.getName())); - return ParquetValueWriters.option(fieldType, maxD, writer); - } - - @Override - public ParquetValueWriter primitive(LogicalType fType, PrimitiveType primitive) { - ColumnDescriptor desc = type.getColumnDescription(currentPath()); - - if (primitive.getOriginalType() != null) { - switch (primitive.getOriginalType()) { - case ENUM: - case JSON: - case UTF8: - return strings(desc); - case DATE: - case INT_8: - case INT_16: - case INT_32: - return ints(fType, desc); - case INT_64: - return ParquetValueWriters.longs(desc); - case TIME_MICROS: - return timeMicros(desc); - case TIMESTAMP_MICROS: - return timestamps(desc); - case DECIMAL: - DecimalLogicalTypeAnnotation decimal = - (DecimalLogicalTypeAnnotation) primitive.getLogicalTypeAnnotation(); - switch (primitive.getPrimitiveTypeName()) { - case INT32: - return decimalAsInteger(desc, decimal.getPrecision(), decimal.getScale()); - case INT64: - return decimalAsLong(desc, decimal.getPrecision(), decimal.getScale()); - case BINARY: - case FIXED_LEN_BYTE_ARRAY: - return decimalAsFixed(desc, decimal.getPrecision(), decimal.getScale()); - default: - throw new UnsupportedOperationException( - "Unsupported base type for decimal: " + primitive.getPrimitiveTypeName()); - } - case BSON: - return byteArrays(desc); - default: - throw new UnsupportedOperationException( - "Unsupported logical type: " + primitive.getOriginalType()); - } - } - - switch (primitive.getPrimitiveTypeName()) { - case FIXED_LEN_BYTE_ARRAY: - case BINARY: - return byteArrays(desc); - case BOOLEAN: - return ParquetValueWriters.booleans(desc); - case INT32: - return ints(fType, desc); - case INT64: - return ParquetValueWriters.longs(desc); - case FLOAT: - return ParquetValueWriters.floats(desc); - case DOUBLE: - return ParquetValueWriters.doubles(desc); - default: - throw new UnsupportedOperationException("Unsupported type: " + primitive); - } - } - } - - private static ParquetValueWriters.PrimitiveWriter ints( - LogicalType type, ColumnDescriptor desc) { - if (type instanceof TinyIntType) { - return ParquetValueWriters.tinyints(desc); - } else if (type instanceof SmallIntType) { - return ParquetValueWriters.shorts(desc); - } - return ParquetValueWriters.ints(desc); - } - - private static ParquetValueWriters.PrimitiveWriter strings(ColumnDescriptor desc) { - return new StringDataWriter(desc); - } - - private static ParquetValueWriters.PrimitiveWriter timeMicros(ColumnDescriptor desc) { - return new TimeMicrosWriter(desc); - } - - private static ParquetValueWriters.PrimitiveWriter decimalAsInteger( - ColumnDescriptor desc, int precision, int scale) { - Preconditions.checkArgument( - precision <= 9, - "Cannot write decimal value as integer with precision larger than 9," - + " wrong precision %s", - precision); - return new IntegerDecimalWriter(desc, precision, scale); - } - - private static ParquetValueWriters.PrimitiveWriter decimalAsLong( - ColumnDescriptor desc, int precision, int scale) { - Preconditions.checkArgument( - precision <= 18, - "Cannot write decimal value as long with precision larger than 18, " - + " wrong 
precision %s", - precision); - return new LongDecimalWriter(desc, precision, scale); - } - - private static ParquetValueWriters.PrimitiveWriter decimalAsFixed( - ColumnDescriptor desc, int precision, int scale) { - return new FixedDecimalWriter(desc, precision, scale); - } - - private static ParquetValueWriters.PrimitiveWriter timestamps( - ColumnDescriptor desc) { - return new TimestampDataWriter(desc); - } - - private static ParquetValueWriters.PrimitiveWriter byteArrays(ColumnDescriptor desc) { - return new ByteArrayWriter(desc); - } - - private static class StringDataWriter extends ParquetValueWriters.PrimitiveWriter { - private StringDataWriter(ColumnDescriptor desc) { - super(desc); - } - - @Override - public void write(int repetitionLevel, StringData value) { - column.writeBinary(repetitionLevel, Binary.fromReusedByteArray(value.toBytes())); - } - } - - private static class TimeMicrosWriter extends ParquetValueWriters.PrimitiveWriter { - private TimeMicrosWriter(ColumnDescriptor desc) { - super(desc); - } - - @Override - public void write(int repetitionLevel, Integer value) { - long micros = value.longValue() * 1000; - column.writeLong(repetitionLevel, micros); - } - } - - private static class IntegerDecimalWriter - extends ParquetValueWriters.PrimitiveWriter { - private final int precision; - private final int scale; - - private IntegerDecimalWriter(ColumnDescriptor desc, int precision, int scale) { - super(desc); - this.precision = precision; - this.scale = scale; - } - - @Override - public void write(int repetitionLevel, DecimalData decimal) { - Preconditions.checkArgument( - decimal.scale() == scale, - "Cannot write value as decimal(%s,%s), wrong scale: %s", - precision, - scale, - decimal); - Preconditions.checkArgument( - decimal.precision() <= precision, - "Cannot write value as decimal(%s,%s), too large: %s", - precision, - scale, - decimal); - - column.writeInteger(repetitionLevel, (int) decimal.toUnscaledLong()); - } - } - - private static class LongDecimalWriter extends ParquetValueWriters.PrimitiveWriter { - private final int precision; - private final int scale; - - private LongDecimalWriter(ColumnDescriptor desc, int precision, int scale) { - super(desc); - this.precision = precision; - this.scale = scale; - } - - @Override - public void write(int repetitionLevel, DecimalData decimal) { - Preconditions.checkArgument( - decimal.scale() == scale, - "Cannot write value as decimal(%s,%s), wrong scale: %s", - precision, - scale, - decimal); - Preconditions.checkArgument( - decimal.precision() <= precision, - "Cannot write value as decimal(%s,%s), too large: %s", - precision, - scale, - decimal); - - column.writeLong(repetitionLevel, decimal.toUnscaledLong()); - } - } - - private static class FixedDecimalWriter extends ParquetValueWriters.PrimitiveWriter { - private final int precision; - private final int scale; - private final ThreadLocal bytes; - - private FixedDecimalWriter(ColumnDescriptor desc, int precision, int scale) { - super(desc); - this.precision = precision; - this.scale = scale; - this.bytes = - ThreadLocal.withInitial(() -> new byte[TypeUtil.decimalRequiredBytes(precision)]); - } - - @Override - public void write(int repetitionLevel, DecimalData decimal) { - byte[] binary = - DecimalUtil.toReusedFixLengthBytes(precision, scale, decimal.toBigDecimal(), bytes.get()); - column.writeBinary(repetitionLevel, Binary.fromReusedByteArray(binary)); - } - } - - private static class TimestampDataWriter - extends ParquetValueWriters.PrimitiveWriter { - private 
TimestampDataWriter(ColumnDescriptor desc) { - super(desc); - } - - @Override - public void write(int repetitionLevel, TimestampData value) { - column.writeLong( - repetitionLevel, value.getMillisecond() * 1000 + value.getNanoOfMillisecond() / 1000); - } - } - - private static class ByteArrayWriter extends ParquetValueWriters.PrimitiveWriter { - private ByteArrayWriter(ColumnDescriptor desc) { - super(desc); - } - - @Override - public void write(int repetitionLevel, byte[] bytes) { - column.writeBinary(repetitionLevel, Binary.fromReusedByteArray(bytes)); - } - } - - private static class ArrayDataWriter extends ParquetValueWriters.RepeatedWriter { - private final LogicalType elementType; - - private ArrayDataWriter( - int definitionLevel, - int repetitionLevel, - ParquetValueWriter writer, - LogicalType elementType) { - super(definitionLevel, repetitionLevel, writer); - this.elementType = elementType; - } - - @Override - protected Iterator elements(ArrayData list) { - return new ElementIterator<>(list); - } - - private class ElementIterator implements Iterator { - private final int size; - private final ArrayData list; - private final ArrayData.ElementGetter getter; - private int index; - - private ElementIterator(ArrayData list) { - this.list = list; - size = list.size(); - getter = ArrayData.createElementGetter(elementType); - index = 0; - } - - @Override - public boolean hasNext() { - return index != size; - } - - @Override - @SuppressWarnings("unchecked") - public E next() { - if (index >= size) { - throw new NoSuchElementException(); - } - - E element = (E) getter.getElementOrNull(list, index); - index += 1; - - return element; - } - } - } - - private static class MapDataWriter - extends ParquetValueWriters.RepeatedKeyValueWriter { - private final LogicalType keyType; - private final LogicalType valueType; - - private MapDataWriter( - int definitionLevel, - int repetitionLevel, - ParquetValueWriter keyWriter, - ParquetValueWriter valueWriter, - LogicalType keyType, - LogicalType valueType) { - super(definitionLevel, repetitionLevel, keyWriter, valueWriter); - this.keyType = keyType; - this.valueType = valueType; - } - - @Override - protected Iterator> pairs(MapData map) { - return new EntryIterator<>(map); - } - - private class EntryIterator implements Iterator> { - private final int size; - private final ArrayData keys; - private final ArrayData values; - private final ParquetValueReaders.ReusableEntry entry; - private final ArrayData.ElementGetter keyGetter; - private final ArrayData.ElementGetter valueGetter; - private int index; - - private EntryIterator(MapData map) { - size = map.size(); - keys = map.keyArray(); - values = map.valueArray(); - entry = new ParquetValueReaders.ReusableEntry<>(); - keyGetter = ArrayData.createElementGetter(keyType); - valueGetter = ArrayData.createElementGetter(valueType); - index = 0; - } - - @Override - public boolean hasNext() { - return index != size; - } - - @Override - @SuppressWarnings("unchecked") - public Map.Entry next() { - if (index >= size) { - throw new NoSuchElementException(); - } - - entry.set( - (K) keyGetter.getElementOrNull(keys, index), - (V) valueGetter.getElementOrNull(values, index)); - index += 1; - - return entry; - } - } - } - - private static class RowDataWriter extends ParquetValueWriters.StructWriter { - private final RowData.FieldGetter[] fieldGetter; - - RowDataWriter(List> writers, List types) { - super(writers); - fieldGetter = new RowData.FieldGetter[types.size()]; - for (int i = 0; i < types.size(); i += 1) { - 
fieldGetter[i] = RowData.createFieldGetter(types.get(i), i); - } - } - - @Override - protected Object get(RowData struct, int index) { - return fieldGetter[index].getFieldOrNull(struct); - } - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/data/FlinkSchemaVisitor.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/data/FlinkSchemaVisitor.java deleted file mode 100644 index ba4e1a7a7aec..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/data/FlinkSchemaVisitor.java +++ /dev/null @@ -1,161 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.data; - -import java.util.List; -import org.apache.flink.table.types.logical.ArrayType; -import org.apache.flink.table.types.logical.LogicalType; -import org.apache.flink.table.types.logical.MapType; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.Schema; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.Types; - -abstract class FlinkSchemaVisitor { - - static T visit(RowType flinkType, Schema schema, FlinkSchemaVisitor visitor) { - return visit(flinkType, schema.asStruct(), visitor); - } - - private static T visit(LogicalType flinkType, Type iType, FlinkSchemaVisitor visitor) { - switch (iType.typeId()) { - case STRUCT: - return visitRecord(flinkType, iType.asStructType(), visitor); - - case MAP: - MapType mapType = (MapType) flinkType; - Types.MapType iMapType = iType.asMapType(); - T key; - T value; - - Types.NestedField keyField = iMapType.field(iMapType.keyId()); - visitor.beforeMapKey(keyField); - try { - key = visit(mapType.getKeyType(), iMapType.keyType(), visitor); - } finally { - visitor.afterMapKey(keyField); - } - - Types.NestedField valueField = iMapType.field(iMapType.valueId()); - visitor.beforeMapValue(valueField); - try { - value = visit(mapType.getValueType(), iMapType.valueType(), visitor); - } finally { - visitor.afterMapValue(valueField); - } - - return visitor.map(iMapType, key, value, mapType.getKeyType(), mapType.getValueType()); - - case LIST: - ArrayType listType = (ArrayType) flinkType; - Types.ListType iListType = iType.asListType(); - T element; - - Types.NestedField elementField = iListType.field(iListType.elementId()); - visitor.beforeListElement(elementField); - try { - element = visit(listType.getElementType(), iListType.elementType(), visitor); - } finally { - visitor.afterListElement(elementField); - } - - return visitor.list(iListType, element, listType.getElementType()); - - default: - return visitor.primitive(iType.asPrimitiveType(), flinkType); - } - } - - 
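/*
 * A minimal, standalone sketch (not part of the deleted sources) of the precision cutoffs used by
 * the decimalAsInteger/decimalAsLong/decimalAsFixed helpers removed above: an unscaled decimal with
 * precision <= 9 fits in an INT32, precision <= 18 fits in an INT64, and anything wider falls back
 * to a fixed-length byte array sized from the precision (TypeUtil.decimalRequiredBytes). The enum
 * and method names below are illustrative only.
 */
import java.math.BigDecimal;

public class DecimalPhysicalTypeSketch {

  enum Physical { INT32, INT64, FIXED_LEN_BYTE_ARRAY }

  // Mirrors the dispatch in the deleted FlinkParquetWriters, assuming only the declared
  // precision matters when choosing the physical representation.
  static Physical forPrecision(int precision) {
    if (precision <= 9) {
      return Physical.INT32;                // unscaled value fits in a Java int
    } else if (precision <= 18) {
      return Physical.INT64;                // unscaled value fits in a Java long
    }
    return Physical.FIXED_LEN_BYTE_ARRAY;   // wider decimals need a sized binary
  }

  public static void main(String[] args) {
    BigDecimal d = new BigDecimal("1234.56"); // precision 6, scale 2
    System.out.println(forPrecision(d.precision())); // INT32
  }
}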
private static T visitRecord( - LogicalType flinkType, Types.StructType struct, FlinkSchemaVisitor visitor) { - Preconditions.checkArgument(flinkType instanceof RowType, "%s is not a RowType.", flinkType); - RowType rowType = (RowType) flinkType; - - int fieldSize = struct.fields().size(); - List results = Lists.newArrayListWithExpectedSize(fieldSize); - List fieldTypes = Lists.newArrayListWithExpectedSize(fieldSize); - List nestedFields = struct.fields(); - - for (int i = 0; i < fieldSize; i++) { - Types.NestedField iField = nestedFields.get(i); - int fieldIndex = rowType.getFieldIndex(iField.name()); - Preconditions.checkArgument( - fieldIndex >= 0, "NestedField: %s is not found in flink RowType: %s", iField, rowType); - - LogicalType fieldFlinkType = rowType.getTypeAt(fieldIndex); - - fieldTypes.add(fieldFlinkType); - - visitor.beforeField(iField); - try { - results.add(visit(fieldFlinkType, iField.type(), visitor)); - } finally { - visitor.afterField(iField); - } - } - - return visitor.record(struct, results, fieldTypes); - } - - public T record(Types.StructType iStruct, List results, List fieldTypes) { - return null; - } - - public T list(Types.ListType iList, T element, LogicalType elementType) { - return null; - } - - public T map(Types.MapType iMap, T key, T value, LogicalType keyType, LogicalType valueType) { - return null; - } - - public T primitive(Type.PrimitiveType iPrimitive, LogicalType flinkPrimitive) { - return null; - } - - public void beforeField(Types.NestedField field) {} - - public void afterField(Types.NestedField field) {} - - public void beforeListElement(Types.NestedField elementField) { - beforeField(elementField); - } - - public void afterListElement(Types.NestedField elementField) { - afterField(elementField); - } - - public void beforeMapKey(Types.NestedField keyField) { - beforeField(keyField); - } - - public void afterMapKey(Types.NestedField keyField) { - afterField(keyField); - } - - public void beforeMapValue(Types.NestedField valueField) { - beforeField(valueField); - } - - public void afterMapValue(Types.NestedField valueField) { - afterField(valueField); - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueReaders.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueReaders.java deleted file mode 100644 index 32f6c3a2ccfd..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueReaders.java +++ /dev/null @@ -1,312 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.data; - -import java.io.IOException; -import java.math.BigDecimal; -import java.math.BigInteger; -import java.nio.charset.StandardCharsets; -import java.util.List; -import java.util.Map; -import org.apache.avro.io.Decoder; -import org.apache.avro.util.Utf8; -import org.apache.flink.table.data.ArrayData; -import org.apache.flink.table.data.DecimalData; -import org.apache.flink.table.data.GenericArrayData; -import org.apache.flink.table.data.GenericMapData; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.MapData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.flink.table.data.TimestampData; -import org.apache.iceberg.avro.ValueReader; -import org.apache.iceberg.avro.ValueReaders; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.types.Types; - -public class FlinkValueReaders { - - private FlinkValueReaders() {} - - static ValueReader strings() { - return StringReader.INSTANCE; - } - - static ValueReader enums(List symbols) { - return new EnumReader(symbols); - } - - static ValueReader uuids() { - return ValueReaders.fixed(16); - } - - static ValueReader timeMicros() { - return TimeMicrosReader.INSTANCE; - } - - static ValueReader timestampMills() { - return TimestampMillsReader.INSTANCE; - } - - static ValueReader timestampMicros() { - return TimestampMicrosReader.INSTANCE; - } - - static ValueReader decimal( - ValueReader unscaledReader, int precision, int scale) { - return new DecimalReader(unscaledReader, precision, scale); - } - - static ValueReader array(ValueReader elementReader) { - return new ArrayReader(elementReader); - } - - static ValueReader arrayMap(ValueReader keyReader, ValueReader valueReader) { - return new ArrayMapReader(keyReader, valueReader); - } - - static ValueReader map(ValueReader keyReader, ValueReader valueReader) { - return new MapReader(keyReader, valueReader); - } - - static ValueReader struct( - List> readers, Types.StructType struct, Map idToConstant) { - return new StructReader(readers, struct, idToConstant); - } - - private static class StringReader implements ValueReader { - private static final StringReader INSTANCE = new StringReader(); - - private StringReader() {} - - @Override - public StringData read(Decoder decoder, Object reuse) throws IOException { - // use the decoder's readString(Utf8) method because it may be a resolving decoder - Utf8 utf8 = null; - if (reuse instanceof StringData) { - utf8 = new Utf8(((StringData) reuse).toBytes()); - } - - Utf8 string = decoder.readString(utf8); - return StringData.fromBytes(string.getBytes(), 0, string.getByteLength()); - } - } - - private static class EnumReader implements ValueReader { - private final StringData[] symbols; - - private EnumReader(List symbols) { - this.symbols = new StringData[symbols.size()]; - for (int i = 0; i < this.symbols.length; i += 1) { - this.symbols[i] = StringData.fromBytes(symbols.get(i).getBytes(StandardCharsets.UTF_8)); - } - } - - @Override - public StringData read(Decoder decoder, Object ignore) throws IOException { - int index = decoder.readEnum(); - return symbols[index]; - } - } - - private static class DecimalReader implements ValueReader { - private final ValueReader bytesReader; - private final int precision; - private final int scale; - - private DecimalReader(ValueReader bytesReader, int precision, int scale) 
{ - this.bytesReader = bytesReader; - this.precision = precision; - this.scale = scale; - } - - @Override - public DecimalData read(Decoder decoder, Object reuse) throws IOException { - byte[] bytes = bytesReader.read(decoder, null); - return DecimalData.fromBigDecimal( - new BigDecimal(new BigInteger(bytes), scale), precision, scale); - } - } - - private static class TimeMicrosReader implements ValueReader { - private static final TimeMicrosReader INSTANCE = new TimeMicrosReader(); - - @Override - public Integer read(Decoder decoder, Object reuse) throws IOException { - long micros = decoder.readLong(); - // Flink only support time mills, just erase micros. - return (int) (micros / 1000); - } - } - - private static class TimestampMillsReader implements ValueReader { - private static final TimestampMillsReader INSTANCE = new TimestampMillsReader(); - - @Override - public TimestampData read(Decoder decoder, Object reuse) throws IOException { - return TimestampData.fromEpochMillis(decoder.readLong()); - } - } - - private static class TimestampMicrosReader implements ValueReader { - private static final TimestampMicrosReader INSTANCE = new TimestampMicrosReader(); - - @Override - public TimestampData read(Decoder decoder, Object reuse) throws IOException { - long micros = decoder.readLong(); - long mills = micros / 1000; - int nanos = ((int) (micros % 1000)) * 1000; - if (nanos < 0) { - nanos += 1_000_000; - mills -= 1; - } - return TimestampData.fromEpochMillis(mills, nanos); - } - } - - private static class ArrayReader implements ValueReader { - private final ValueReader elementReader; - private final List reusedList = Lists.newArrayList(); - - private ArrayReader(ValueReader elementReader) { - this.elementReader = elementReader; - } - - @Override - public GenericArrayData read(Decoder decoder, Object reuse) throws IOException { - reusedList.clear(); - long chunkLength = decoder.readArrayStart(); - - while (chunkLength > 0) { - for (int i = 0; i < chunkLength; i += 1) { - reusedList.add(elementReader.read(decoder, null)); - } - - chunkLength = decoder.arrayNext(); - } - - // this will convert the list to an array so it is okay to reuse the list - return new GenericArrayData(reusedList.toArray()); - } - } - - private static MapData kvArrayToMap(List keyList, List valueList) { - Map map = Maps.newHashMap(); - Object[] keys = keyList.toArray(); - Object[] values = valueList.toArray(); - for (int i = 0; i < keys.length; i++) { - map.put(keys[i], values[i]); - } - - return new GenericMapData(map); - } - - private static class ArrayMapReader implements ValueReader { - private final ValueReader keyReader; - private final ValueReader valueReader; - - private final List reusedKeyList = Lists.newArrayList(); - private final List reusedValueList = Lists.newArrayList(); - - private ArrayMapReader(ValueReader keyReader, ValueReader valueReader) { - this.keyReader = keyReader; - this.valueReader = valueReader; - } - - @Override - public MapData read(Decoder decoder, Object reuse) throws IOException { - reusedKeyList.clear(); - reusedValueList.clear(); - - long chunkLength = decoder.readArrayStart(); - - while (chunkLength > 0) { - for (int i = 0; i < chunkLength; i += 1) { - reusedKeyList.add(keyReader.read(decoder, null)); - reusedValueList.add(valueReader.read(decoder, null)); - } - - chunkLength = decoder.arrayNext(); - } - - return kvArrayToMap(reusedKeyList, reusedValueList); - } - } - - private static class MapReader implements ValueReader { - private final ValueReader keyReader; - private final 
ValueReader valueReader; - - private final List reusedKeyList = Lists.newArrayList(); - private final List reusedValueList = Lists.newArrayList(); - - private MapReader(ValueReader keyReader, ValueReader valueReader) { - this.keyReader = keyReader; - this.valueReader = valueReader; - } - - @Override - public MapData read(Decoder decoder, Object reuse) throws IOException { - reusedKeyList.clear(); - reusedValueList.clear(); - - long chunkLength = decoder.readMapStart(); - - while (chunkLength > 0) { - for (int i = 0; i < chunkLength; i += 1) { - reusedKeyList.add(keyReader.read(decoder, null)); - reusedValueList.add(valueReader.read(decoder, null)); - } - - chunkLength = decoder.mapNext(); - } - - return kvArrayToMap(reusedKeyList, reusedValueList); - } - } - - private static class StructReader extends ValueReaders.StructReader { - private final int numFields; - - private StructReader( - List> readers, Types.StructType struct, Map idToConstant) { - super(readers, struct, idToConstant); - this.numFields = readers.size(); - } - - @Override - protected RowData reuseOrCreate(Object reuse) { - if (reuse instanceof GenericRowData && ((GenericRowData) reuse).getArity() == numFields) { - return (GenericRowData) reuse; - } - return new GenericRowData(numFields); - } - - @Override - protected Object get(RowData struct, int pos) { - return null; - } - - @Override - protected void set(RowData struct, int pos, Object value) { - ((GenericRowData) struct).setField(pos, value); - } - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueWriters.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueWriters.java deleted file mode 100644 index 4e86ecce28b5..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueWriters.java +++ /dev/null @@ -1,253 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
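/*
 * A hedged, self-contained sketch of the microsecond handling in the TimestampMicrosReader removed
 * above: epoch microseconds are split into epoch milliseconds plus a nanos-of-millisecond
 * remainder, borrowing one millisecond when the remainder is negative. The class name is
 * illustrative only.
 */
import org.apache.flink.table.data.TimestampData;

public class MicrosToTimestampDataSketch {

  static TimestampData fromMicros(long micros) {
    long millis = micros / 1000;
    int nanosOfMilli = (int) (micros % 1000) * 1000;
    if (nanosOfMilli < 0) {
      // e.g. -1 micro since the epoch becomes -1 milli plus 999_000 nanos
      nanosOfMilli += 1_000_000;
      millis -= 1;
    }
    return TimestampData.fromEpochMillis(millis, nanosOfMilli);
  }

  public static void main(String[] args) {
    System.out.println(fromMicros(1_000_001L)); // about 1970-01-01T00:00:01.000001 (UTC)
    System.out.println(fromMicros(-1L));        // about 1969-12-31T23:59:59.999999 (UTC)
  }
}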
- */ -package org.apache.iceberg.flink.data; - -import java.io.IOException; -import java.lang.reflect.Array; -import java.util.List; -import org.apache.avro.io.Encoder; -import org.apache.avro.util.Utf8; -import org.apache.flink.table.data.ArrayData; -import org.apache.flink.table.data.DecimalData; -import org.apache.flink.table.data.MapData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.flink.table.data.TimestampData; -import org.apache.flink.table.types.logical.LogicalType; -import org.apache.iceberg.avro.ValueWriter; -import org.apache.iceberg.types.TypeUtil; -import org.apache.iceberg.util.DecimalUtil; - -public class FlinkValueWriters { - - private FlinkValueWriters() {} - - static ValueWriter strings() { - return StringWriter.INSTANCE; - } - - static ValueWriter timeMicros() { - return TimeMicrosWriter.INSTANCE; - } - - static ValueWriter timestampMicros() { - return TimestampMicrosWriter.INSTANCE; - } - - static ValueWriter decimal(int precision, int scale) { - return new DecimalWriter(precision, scale); - } - - static ValueWriter array(ValueWriter elementWriter, LogicalType elementType) { - return new ArrayWriter<>(elementWriter, elementType); - } - - static ValueWriter arrayMap( - ValueWriter keyWriter, - LogicalType keyType, - ValueWriter valueWriter, - LogicalType valueType) { - return new ArrayMapWriter<>(keyWriter, keyType, valueWriter, valueType); - } - - static ValueWriter map( - ValueWriter keyWriter, - LogicalType keyType, - ValueWriter valueWriter, - LogicalType valueType) { - return new MapWriter<>(keyWriter, keyType, valueWriter, valueType); - } - - static ValueWriter row(List> writers, List types) { - return new RowWriter(writers, types); - } - - private static class StringWriter implements ValueWriter { - private static final StringWriter INSTANCE = new StringWriter(); - - private StringWriter() {} - - @Override - public void write(StringData s, Encoder encoder) throws IOException { - // toBytes is cheaper than Avro calling toString, which incurs encoding costs - encoder.writeString(new Utf8(s.toBytes())); - } - } - - private static class DecimalWriter implements ValueWriter { - private final int precision; - private final int scale; - private final ThreadLocal bytes; - - private DecimalWriter(int precision, int scale) { - this.precision = precision; - this.scale = scale; - this.bytes = - ThreadLocal.withInitial(() -> new byte[TypeUtil.decimalRequiredBytes(precision)]); - } - - @Override - public void write(DecimalData d, Encoder encoder) throws IOException { - encoder.writeFixed( - DecimalUtil.toReusedFixLengthBytes(precision, scale, d.toBigDecimal(), bytes.get())); - } - } - - private static class TimeMicrosWriter implements ValueWriter { - private static final TimeMicrosWriter INSTANCE = new TimeMicrosWriter(); - - @Override - public void write(Integer timeMills, Encoder encoder) throws IOException { - encoder.writeLong(timeMills * 1000L); - } - } - - private static class TimestampMicrosWriter implements ValueWriter { - private static final TimestampMicrosWriter INSTANCE = new TimestampMicrosWriter(); - - @Override - public void write(TimestampData timestampData, Encoder encoder) throws IOException { - long micros = - timestampData.getMillisecond() * 1000 + timestampData.getNanoOfMillisecond() / 1000; - encoder.writeLong(micros); - } - } - - private static class ArrayWriter implements ValueWriter { - private final ValueWriter elementWriter; - private final ArrayData.ElementGetter elementGetter; - - 
private ArrayWriter(ValueWriter elementWriter, LogicalType elementType) { - this.elementWriter = elementWriter; - this.elementGetter = ArrayData.createElementGetter(elementType); - } - - @Override - @SuppressWarnings("unchecked") - public void write(ArrayData array, Encoder encoder) throws IOException { - encoder.writeArrayStart(); - int numElements = array.size(); - encoder.setItemCount(numElements); - for (int i = 0; i < numElements; i += 1) { - encoder.startItem(); - elementWriter.write((T) elementGetter.getElementOrNull(array, i), encoder); - } - encoder.writeArrayEnd(); - } - } - - private static class ArrayMapWriter implements ValueWriter { - private final ValueWriter keyWriter; - private final ValueWriter valueWriter; - private final ArrayData.ElementGetter keyGetter; - private final ArrayData.ElementGetter valueGetter; - - private ArrayMapWriter( - ValueWriter keyWriter, - LogicalType keyType, - ValueWriter valueWriter, - LogicalType valueType) { - this.keyWriter = keyWriter; - this.keyGetter = ArrayData.createElementGetter(keyType); - this.valueWriter = valueWriter; - this.valueGetter = ArrayData.createElementGetter(valueType); - } - - @Override - @SuppressWarnings("unchecked") - public void write(MapData map, Encoder encoder) throws IOException { - encoder.writeArrayStart(); - int numElements = map.size(); - encoder.setItemCount(numElements); - ArrayData keyArray = map.keyArray(); - ArrayData valueArray = map.valueArray(); - for (int i = 0; i < numElements; i += 1) { - encoder.startItem(); - keyWriter.write((K) keyGetter.getElementOrNull(keyArray, i), encoder); - valueWriter.write((V) valueGetter.getElementOrNull(valueArray, i), encoder); - } - encoder.writeArrayEnd(); - } - } - - private static class MapWriter implements ValueWriter { - private final ValueWriter keyWriter; - private final ValueWriter valueWriter; - private final ArrayData.ElementGetter keyGetter; - private final ArrayData.ElementGetter valueGetter; - - private MapWriter( - ValueWriter keyWriter, - LogicalType keyType, - ValueWriter valueWriter, - LogicalType valueType) { - this.keyWriter = keyWriter; - this.keyGetter = ArrayData.createElementGetter(keyType); - this.valueWriter = valueWriter; - this.valueGetter = ArrayData.createElementGetter(valueType); - } - - @Override - @SuppressWarnings("unchecked") - public void write(MapData map, Encoder encoder) throws IOException { - encoder.writeMapStart(); - int numElements = map.size(); - encoder.setItemCount(numElements); - ArrayData keyArray = map.keyArray(); - ArrayData valueArray = map.valueArray(); - for (int i = 0; i < numElements; i += 1) { - encoder.startItem(); - keyWriter.write((K) keyGetter.getElementOrNull(keyArray, i), encoder); - valueWriter.write((V) valueGetter.getElementOrNull(valueArray, i), encoder); - } - encoder.writeMapEnd(); - } - } - - static class RowWriter implements ValueWriter { - private final ValueWriter[] writers; - private final RowData.FieldGetter[] getters; - - private RowWriter(List> writers, List types) { - this.writers = (ValueWriter[]) Array.newInstance(ValueWriter.class, writers.size()); - this.getters = new RowData.FieldGetter[writers.size()]; - for (int i = 0; i < writers.size(); i += 1) { - this.writers[i] = writers.get(i); - this.getters[i] = RowData.createFieldGetter(types.get(i), i); - } - } - - @Override - public void write(RowData row, Encoder encoder) throws IOException { - for (int i = 0; i < writers.length; i += 1) { - if (row.isNullAt(i)) { - writers[i].write(null, encoder); - } else { - write(row, i, writers[i], 
encoder); - } - } - } - - @SuppressWarnings("unchecked") - private void write(RowData row, int pos, ValueWriter writer, Encoder encoder) - throws IOException { - writer.write((T) getters[pos].getFieldOrNull(row), encoder); - } - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/data/ParquetWithFlinkSchemaVisitor.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/data/ParquetWithFlinkSchemaVisitor.java deleted file mode 100644 index 33feb2e32118..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/data/ParquetWithFlinkSchemaVisitor.java +++ /dev/null @@ -1,222 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.data; - -import java.util.Deque; -import java.util.List; -import org.apache.flink.table.types.logical.ArrayType; -import org.apache.flink.table.types.logical.LogicalType; -import org.apache.flink.table.types.logical.MapType; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.table.types.logical.RowType.RowField; -import org.apache.iceberg.avro.AvroSchemaUtil; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.parquet.schema.GroupType; -import org.apache.parquet.schema.MessageType; -import org.apache.parquet.schema.OriginalType; -import org.apache.parquet.schema.PrimitiveType; -import org.apache.parquet.schema.Type; - -public class ParquetWithFlinkSchemaVisitor { - private final Deque fieldNames = Lists.newLinkedList(); - - public static T visit( - LogicalType sType, Type type, ParquetWithFlinkSchemaVisitor visitor) { - Preconditions.checkArgument(sType != null, "Invalid DataType: null"); - if (type instanceof MessageType) { - Preconditions.checkArgument( - sType instanceof RowType, "Invalid struct: %s is not a struct", sType); - RowType struct = (RowType) sType; - return visitor.message( - struct, (MessageType) type, visitFields(struct, type.asGroupType(), visitor)); - } else if (type.isPrimitive()) { - return visitor.primitive(sType, type.asPrimitiveType()); - } else { - // if not a primitive, the typeId must be a group - GroupType group = type.asGroupType(); - OriginalType annotation = group.getOriginalType(); - if (annotation != null) { - switch (annotation) { - case LIST: - Preconditions.checkArgument( - !group.isRepetition(Type.Repetition.REPEATED), - "Invalid list: top-level group is repeated: %s", - group); - Preconditions.checkArgument( - group.getFieldCount() == 1, - "Invalid list: does not contain single repeated field: %s", - group); - - GroupType repeatedElement = group.getFields().get(0).asGroupType(); - Preconditions.checkArgument( - 
repeatedElement.isRepetition(Type.Repetition.REPEATED), - "Invalid list: inner group is not repeated"); - Preconditions.checkArgument( - repeatedElement.getFieldCount() <= 1, - "Invalid list: repeated group is not a single field: %s", - group); - - Preconditions.checkArgument( - sType instanceof ArrayType, "Invalid list: %s is not an array", sType); - ArrayType array = (ArrayType) sType; - RowType.RowField element = - new RowField( - "element", array.getElementType(), "element of " + array.asSummaryString()); - - visitor.fieldNames.push(repeatedElement.getName()); - try { - T elementResult = null; - if (repeatedElement.getFieldCount() > 0) { - elementResult = visitField(element, repeatedElement.getType(0), visitor); - } - - return visitor.list(array, group, elementResult); - - } finally { - visitor.fieldNames.pop(); - } - - case MAP: - Preconditions.checkArgument( - !group.isRepetition(Type.Repetition.REPEATED), - "Invalid map: top-level group is repeated: %s", - group); - Preconditions.checkArgument( - group.getFieldCount() == 1, - "Invalid map: does not contain single repeated field: %s", - group); - - GroupType repeatedKeyValue = group.getType(0).asGroupType(); - Preconditions.checkArgument( - repeatedKeyValue.isRepetition(Type.Repetition.REPEATED), - "Invalid map: inner group is not repeated"); - Preconditions.checkArgument( - repeatedKeyValue.getFieldCount() <= 2, - "Invalid map: repeated group does not have 2 fields"); - - Preconditions.checkArgument( - sType instanceof MapType, "Invalid map: %s is not a map", sType); - MapType map = (MapType) sType; - RowField keyField = - new RowField("key", map.getKeyType(), "key of " + map.asSummaryString()); - RowField valueField = - new RowField("value", map.getValueType(), "value of " + map.asSummaryString()); - - visitor.fieldNames.push(repeatedKeyValue.getName()); - try { - T keyResult = null; - T valueResult = null; - switch (repeatedKeyValue.getFieldCount()) { - case 2: - // if there are 2 fields, both key and value are projected - keyResult = visitField(keyField, repeatedKeyValue.getType(0), visitor); - valueResult = visitField(valueField, repeatedKeyValue.getType(1), visitor); - break; - case 1: - // if there is just one, use the name to determine what it is - Type keyOrValue = repeatedKeyValue.getType(0); - if (keyOrValue.getName().equalsIgnoreCase("key")) { - keyResult = visitField(keyField, keyOrValue, visitor); - // value result remains null - } else { - valueResult = visitField(valueField, keyOrValue, visitor); - // key result remains null - } - break; - default: - // both results will remain null - } - - return visitor.map(map, group, keyResult, valueResult); - - } finally { - visitor.fieldNames.pop(); - } - - default: - } - } - Preconditions.checkArgument( - sType instanceof RowType, "Invalid struct: %s is not a struct", sType); - RowType struct = (RowType) sType; - return visitor.struct(struct, group, visitFields(struct, group, visitor)); - } - } - - private static T visitField( - RowType.RowField sField, Type field, ParquetWithFlinkSchemaVisitor visitor) { - visitor.fieldNames.push(field.getName()); - try { - return visit(sField.getType(), field, visitor); - } finally { - visitor.fieldNames.pop(); - } - } - - private static List visitFields( - RowType struct, GroupType group, ParquetWithFlinkSchemaVisitor visitor) { - List sFields = struct.getFields(); - Preconditions.checkArgument( - sFields.size() == group.getFieldCount(), "Structs do not match: %s and %s", struct, group); - List results = 
Lists.newArrayListWithExpectedSize(group.getFieldCount()); - for (int i = 0; i < sFields.size(); i += 1) { - Type field = group.getFields().get(i); - RowType.RowField sField = sFields.get(i); - Preconditions.checkArgument( - field.getName().equals(AvroSchemaUtil.makeCompatibleName(sField.getName())), - "Structs do not match: field %s != %s", - field.getName(), - sField.getName()); - results.add(visitField(sField, field, visitor)); - } - - return results; - } - - public T message(RowType sStruct, MessageType message, List fields) { - return null; - } - - public T struct(RowType sStruct, GroupType struct, List fields) { - return null; - } - - public T list(ArrayType sArray, GroupType array, T element) { - return null; - } - - public T map(MapType sMap, GroupType map, T key, T value) { - return null; - } - - public T primitive(LogicalType sPrimitive, PrimitiveType primitive) { - return null; - } - - protected String[] currentPath() { - return Lists.newArrayList(fieldNames.descendingIterator()).toArray(new String[0]); - } - - protected String[] path(String name) { - List list = Lists.newArrayList(fieldNames.descendingIterator()); - list.add(name); - return list.toArray(new String[0]); - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/data/RowDataProjection.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/data/RowDataProjection.java deleted file mode 100644 index 33816c97ac29..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/data/RowDataProjection.java +++ /dev/null @@ -1,341 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.data; - -import java.util.Arrays; -import java.util.Map; -import java.util.Objects; -import org.apache.flink.table.data.ArrayData; -import org.apache.flink.table.data.DecimalData; -import org.apache.flink.table.data.MapData; -import org.apache.flink.table.data.RawValueData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.flink.table.data.TimestampData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.types.RowKind; -import org.apache.flink.util.StringUtils; -import org.apache.iceberg.Schema; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.types.Types; - -public class RowDataProjection implements RowData { - /** - * Creates a projecting wrapper for {@link RowData} rows. - * - *

    This projection will not project the nested children types of repeated types like lists and - * maps. - * - * @param schema schema of rows wrapped by this projection - * @param projectedSchema result schema of the projected rows - * @return a wrapper to project rows - */ - public static RowDataProjection create(Schema schema, Schema projectedSchema) { - return RowDataProjection.create( - FlinkSchemaUtil.convert(schema), schema.asStruct(), projectedSchema.asStruct()); - } - - /** - * Creates a projecting wrapper for {@link RowData} rows. - * - *
    This projection will not project the nested children types of repeated types like lists and - * maps. - * - * @param rowType flink row type of rows wrapped by this projection - * @param schema schema of rows wrapped by this projection - * @param projectedSchema result schema of the projected rows - * @return a wrapper to project rows - */ - public static RowDataProjection create( - RowType rowType, Types.StructType schema, Types.StructType projectedSchema) { - return new RowDataProjection(rowType, schema, projectedSchema); - } - - private final RowData.FieldGetter[] getters; - private RowData rowData; - - private RowDataProjection( - RowType rowType, Types.StructType rowStruct, Types.StructType projectType) { - Map fieldIdToPosition = Maps.newHashMap(); - for (int i = 0; i < rowStruct.fields().size(); i++) { - fieldIdToPosition.put(rowStruct.fields().get(i).fieldId(), i); - } - - this.getters = new RowData.FieldGetter[projectType.fields().size()]; - for (int i = 0; i < getters.length; i++) { - Types.NestedField projectField = projectType.fields().get(i); - Types.NestedField rowField = rowStruct.field(projectField.fieldId()); - - Preconditions.checkNotNull( - rowField, - "Cannot locate the project field <%s> in the iceberg struct <%s>", - projectField, - rowStruct); - - getters[i] = - createFieldGetter( - rowType, fieldIdToPosition.get(projectField.fieldId()), rowField, projectField); - } - } - - private static RowData.FieldGetter createFieldGetter( - RowType rowType, int position, Types.NestedField rowField, Types.NestedField projectField) { - Preconditions.checkArgument( - rowField.type().typeId() == projectField.type().typeId(), - "Different iceberg type between row field <%s> and project field <%s>", - rowField, - projectField); - - switch (projectField.type().typeId()) { - case STRUCT: - RowType nestedRowType = (RowType) rowType.getTypeAt(position); - return row -> { - // null nested struct value - if (row.isNullAt(position)) { - return null; - } - - RowData nestedRow = row.getRow(position, nestedRowType.getFieldCount()); - return RowDataProjection.create( - nestedRowType, rowField.type().asStructType(), projectField.type().asStructType()) - .wrap(nestedRow); - }; - - case MAP: - Types.MapType projectedMap = projectField.type().asMapType(); - Types.MapType originalMap = rowField.type().asMapType(); - - boolean keyProjectable = - !projectedMap.keyType().isNestedType() - || projectedMap.keyType().equals(originalMap.keyType()); - boolean valueProjectable = - !projectedMap.valueType().isNestedType() - || projectedMap.valueType().equals(originalMap.valueType()); - Preconditions.checkArgument( - keyProjectable && valueProjectable, - "Cannot project a partial map key or value with non-primitive type. Trying to project <%s> out of <%s>", - projectField, - rowField); - - return RowData.createFieldGetter(rowType.getTypeAt(position), position); - - case LIST: - Types.ListType projectedList = projectField.type().asListType(); - Types.ListType originalList = rowField.type().asListType(); - - boolean elementProjectable = - !projectedList.elementType().isNestedType() - || projectedList.elementType().equals(originalList.elementType()); - Preconditions.checkArgument( - elementProjectable, - "Cannot project a partial list element with non-primitive type. 
Trying to project <%s> out of <%s>", - projectField, - rowField); - - return RowData.createFieldGetter(rowType.getTypeAt(position), position); - - default: - return RowData.createFieldGetter(rowType.getTypeAt(position), position); - } - } - - public RowData wrap(RowData row) { - // StructProjection allow wrapping null root struct object. - // See more discussions in https://github.com/apache/iceberg/pull/7517. - // RowDataProjection never allowed null root object to be wrapped. - // Hence, it is fine to enforce strict Preconditions check here. - Preconditions.checkArgument(row != null, "Invalid row data: null"); - this.rowData = row; - return this; - } - - private Object getValue(int pos) { - Preconditions.checkState(rowData != null, "Row data not wrapped"); - return getters[pos].getFieldOrNull(rowData); - } - - @Override - public int getArity() { - return getters.length; - } - - @Override - public RowKind getRowKind() { - Preconditions.checkState(rowData != null, "Row data not wrapped"); - return rowData.getRowKind(); - } - - @Override - public void setRowKind(RowKind kind) { - throw new UnsupportedOperationException("Cannot set row kind in the RowDataProjection"); - } - - @Override - public boolean isNullAt(int pos) { - return getValue(pos) == null; - } - - @Override - public boolean getBoolean(int pos) { - return (boolean) getValue(pos); - } - - @Override - public byte getByte(int pos) { - return (byte) getValue(pos); - } - - @Override - public short getShort(int pos) { - return (short) getValue(pos); - } - - @Override - public int getInt(int pos) { - return (int) getValue(pos); - } - - @Override - public long getLong(int pos) { - return (long) getValue(pos); - } - - @Override - public float getFloat(int pos) { - return (float) getValue(pos); - } - - @Override - public double getDouble(int pos) { - return (double) getValue(pos); - } - - @Override - public StringData getString(int pos) { - return (StringData) getValue(pos); - } - - @Override - public DecimalData getDecimal(int pos, int precision, int scale) { - return (DecimalData) getValue(pos); - } - - @Override - public TimestampData getTimestamp(int pos, int precision) { - return (TimestampData) getValue(pos); - } - - @Override - @SuppressWarnings("unchecked") - public RawValueData getRawValue(int pos) { - return (RawValueData) getValue(pos); - } - - @Override - public byte[] getBinary(int pos) { - return (byte[]) getValue(pos); - } - - @Override - public ArrayData getArray(int pos) { - return (ArrayData) getValue(pos); - } - - @Override - public MapData getMap(int pos) { - return (MapData) getValue(pos); - } - - @Override - public RowData getRow(int pos, int numFields) { - return (RowData) getValue(pos); - } - - @Override - public boolean equals(Object o) { - if (this == o) { - return true; - } - - if (!(o instanceof RowDataProjection)) { - return false; - } - - RowDataProjection that = (RowDataProjection) o; - return deepEquals(that); - } - - @Override - public int hashCode() { - int result = Objects.hashCode(getRowKind()); - for (int pos = 0; pos < getArity(); pos++) { - if (!isNullAt(pos)) { - // Arrays.deepHashCode handles array object properly - result = 31 * result + Arrays.deepHashCode(new Object[] {getValue(pos)}); - } - } - - return result; - } - - @Override - public String toString() { - StringBuilder sb = new StringBuilder(); - sb.append(getRowKind().shortString()).append("("); - for (int pos = 0; pos < getArity(); pos++) { - if (pos != 0) { - sb.append(","); - } - // copied the behavior from Flink GenericRowData - 
sb.append(StringUtils.arrayAwareToString(getValue(pos))); - } - - sb.append(")"); - return sb.toString(); - } - - private boolean deepEquals(RowDataProjection other) { - if (getRowKind() != other.getRowKind()) { - return false; - } - - if (getArity() != other.getArity()) { - return false; - } - - for (int pos = 0; pos < getArity(); ++pos) { - if (isNullAt(pos) && other.isNullAt(pos)) { - continue; - } - - if ((isNullAt(pos) && !other.isNullAt(pos)) || (!isNullAt(pos) && other.isNullAt(pos))) { - return false; - } - - // Objects.deepEquals handles array object properly - if (!Objects.deepEquals(getValue(pos), other.getValue(pos))) { - return false; - } - } - - return true; - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java deleted file mode 100644 index 3a8f5ccc6c03..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
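/*
 * A minimal usage sketch, assuming a simple two-column schema that is not taken from this patch,
 * for the RowDataProjection removed above: create(...) binds the full and projected Iceberg
 * schemas, and wrap(...) exposes an incoming RowData through the projected field order.
 */
import org.apache.flink.table.data.GenericRowData;
import org.apache.flink.table.data.RowData;
import org.apache.flink.table.data.StringData;
import org.apache.iceberg.Schema;
import org.apache.iceberg.flink.data.RowDataProjection;
import org.apache.iceberg.types.Types;

public class RowDataProjectionSketch {
  public static void main(String[] args) {
    Schema schema =
        new Schema(
            Types.NestedField.required(1, "id", Types.LongType.get()),
            Types.NestedField.optional(2, "data", Types.StringType.get()));
    Schema projected = schema.select("data");

    RowData row = GenericRowData.of(1L, StringData.fromString("a"));
    RowData view = RowDataProjection.create(schema, projected).wrap(row);

    // Only the projected column is visible; position 0 of the view maps to "data".
    System.out.println(view.getString(0)); // a
  }
}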
- */ -package org.apache.iceberg.flink.data; - -import java.math.BigDecimal; -import java.nio.ByteBuffer; -import org.apache.avro.generic.GenericData; -import org.apache.avro.util.Utf8; -import org.apache.flink.api.common.typeutils.TypeSerializer; -import org.apache.flink.table.data.DecimalData; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.flink.table.data.TimestampData; -import org.apache.flink.table.runtime.typeutils.RowDataSerializer; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.Types; -import org.apache.iceberg.util.ByteBuffers; -import org.apache.iceberg.util.DateTimeUtil; - -public class RowDataUtil { - - private RowDataUtil() {} - - public static Object convertConstant(Type type, Object value) { - if (value == null) { - return null; - } - - switch (type.typeId()) { - case DECIMAL: // DecimalData - Types.DecimalType decimal = (Types.DecimalType) type; - return DecimalData.fromBigDecimal((BigDecimal) value, decimal.precision(), decimal.scale()); - case STRING: // StringData - if (value instanceof Utf8) { - Utf8 utf8 = (Utf8) value; - return StringData.fromBytes(utf8.getBytes(), 0, utf8.getByteLength()); - } - return StringData.fromString(value.toString()); - case FIXED: // byte[] - if (value instanceof byte[]) { - return value; - } else if (value instanceof GenericData.Fixed) { - return ((GenericData.Fixed) value).bytes(); - } - return ByteBuffers.toByteArray((ByteBuffer) value); - case BINARY: // byte[] - return ByteBuffers.toByteArray((ByteBuffer) value); - case TIME: // int mills instead of long - return (int) ((Long) value / 1000); - case TIMESTAMP: // TimestampData - return TimestampData.fromLocalDateTime(DateTimeUtil.timestampFromMicros((Long) value)); - default: - } - return value; - } - - /** - * Similar to the private {@link RowDataSerializer#copyRowData(RowData, RowData)} method. This - * skips the check the arity of rowType and from, because the from RowData may contains additional - * column for position deletes. Using {@link RowDataSerializer#copy(RowData, RowData)} will fail - * the arity check. - */ - public static RowData clone( - RowData from, - RowData reuse, - RowType rowType, - TypeSerializer[] fieldSerializers, - RowData.FieldGetter[] fieldGetters) { - GenericRowData ret; - if (reuse instanceof GenericRowData) { - ret = (GenericRowData) reuse; - } else { - ret = new GenericRowData(from.getArity()); - } - - ret.setRowKind(from.getRowKind()); - for (int i = 0; i < rowType.getFieldCount(); i++) { - if (!from.isNullAt(i)) { - ret.setField(i, fieldSerializers[i].copy(fieldGetters[i].getFieldOrNull(from))); - } else { - ret.setField(i, null); - } - } - - return ret; - } - - /** - * @deprecated will be removed in 1.7.0; Not reusing FieldGetter in this method could lead to - * performance degradation, use {@link #clone(RowData, RowData, RowType, TypeSerializer[], - * RowData.FieldGetter[])} instead. 
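/*
 * A hedged sketch of the reuse pattern the deprecation note above points at: build the
 * TypeSerializer[] and RowData.FieldGetter[] once per row type and hand them to clone(...) for
 * every record, rather than letting the deprecated overload rebuild the getters on each call.
 * The two-column row type here is an assumed example, not taken from this patch.
 */
import org.apache.flink.api.common.typeutils.TypeSerializer;
import org.apache.flink.table.data.GenericRowData;
import org.apache.flink.table.data.RowData;
import org.apache.flink.table.data.StringData;
import org.apache.flink.table.runtime.typeutils.InternalSerializers;
import org.apache.flink.table.types.logical.BigIntType;
import org.apache.flink.table.types.logical.RowType;
import org.apache.flink.table.types.logical.VarCharType;
import org.apache.iceberg.flink.data.RowDataUtil;

public class RowDataCloneSketch {
  public static void main(String[] args) {
    RowType rowType = RowType.of(new BigIntType(), new VarCharType(VarCharType.MAX_LENGTH));

    // One-time setup per row type.
    TypeSerializer[] serializers = new TypeSerializer[rowType.getFieldCount()];
    RowData.FieldGetter[] getters = new RowData.FieldGetter[rowType.getFieldCount()];
    for (int i = 0; i < rowType.getFieldCount(); i++) {
      serializers[i] = InternalSerializers.create(rowType.getTypeAt(i));
      getters[i] = RowData.createFieldGetter(rowType.getTypeAt(i), i);
    }

    // Per-record call path: the same serializers and getters are reused for every row.
    RowData from = GenericRowData.of(1L, StringData.fromString("a"));
    RowData reuse = new GenericRowData(rowType.getFieldCount());
    RowData copy = RowDataUtil.clone(from, reuse, rowType, serializers, getters);
    System.out.println(copy.getLong(0)); // 1
  }
}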
- */ - @Deprecated - public static RowData clone( - RowData from, RowData reuse, RowType rowType, TypeSerializer[] fieldSerializers) { - RowData.FieldGetter[] fieldGetters = new RowData.FieldGetter[rowType.getFieldCount()]; - for (int i = 0; i < rowType.getFieldCount(); ++i) { - if (!from.isNullAt(i)) { - fieldGetters[i] = RowData.createFieldGetter(rowType.getTypeAt(i), i); - } - } - - return clone(from, reuse, rowType, fieldSerializers, fieldGetters); - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/data/StructRowData.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/data/StructRowData.java deleted file mode 100644 index 1019285018d0..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/data/StructRowData.java +++ /dev/null @@ -1,300 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.data; - -import java.math.BigDecimal; -import java.nio.ByteBuffer; -import java.time.Duration; -import java.time.Instant; -import java.time.LocalDate; -import java.time.LocalDateTime; -import java.time.LocalTime; -import java.time.OffsetDateTime; -import java.time.ZoneOffset; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.UUID; -import org.apache.flink.annotation.Internal; -import org.apache.flink.table.data.ArrayData; -import org.apache.flink.table.data.DecimalData; -import org.apache.flink.table.data.GenericArrayData; -import org.apache.flink.table.data.GenericMapData; -import org.apache.flink.table.data.MapData; -import org.apache.flink.table.data.RawValueData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.flink.table.data.TimestampData; -import org.apache.flink.types.RowKind; -import org.apache.iceberg.StructLike; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.Types; -import org.apache.iceberg.util.ByteBuffers; - -@Internal -public class StructRowData implements RowData { - private final Types.StructType type; - private RowKind kind; - private StructLike struct; - - public StructRowData(Types.StructType type) { - this(type, RowKind.INSERT); - } - - public StructRowData(Types.StructType type, RowKind kind) { - this(type, null, kind); - } - - private StructRowData(Types.StructType type, StructLike struct) { - this(type, struct, RowKind.INSERT); - } - - private StructRowData(Types.StructType type, StructLike struct, RowKind kind) { - this.type = type; - this.struct = struct; - this.kind = kind; - } - - public StructRowData setStruct(StructLike newStruct) { - this.struct = 
newStruct; - return this; - } - - @Override - public int getArity() { - return struct.size(); - } - - @Override - public RowKind getRowKind() { - return kind; - } - - @Override - public void setRowKind(RowKind newKind) { - Preconditions.checkNotNull(newKind, "kind can not be null"); - this.kind = newKind; - } - - @Override - public boolean isNullAt(int pos) { - return struct.get(pos, Object.class) == null; - } - - @Override - public boolean getBoolean(int pos) { - return struct.get(pos, Boolean.class); - } - - @Override - public byte getByte(int pos) { - return (byte) (int) struct.get(pos, Integer.class); - } - - @Override - public short getShort(int pos) { - return (short) (int) struct.get(pos, Integer.class); - } - - @Override - public int getInt(int pos) { - Object integer = struct.get(pos, Object.class); - - if (integer instanceof Integer) { - return (int) integer; - } else if (integer instanceof LocalDate) { - return (int) ((LocalDate) integer).toEpochDay(); - } else if (integer instanceof LocalTime) { - return (int) (((LocalTime) integer).toNanoOfDay() / 1000_000); - } else { - throw new IllegalStateException( - "Unknown type for int field. Type name: " + integer.getClass().getName()); - } - } - - @Override - public long getLong(int pos) { - Object longVal = struct.get(pos, Object.class); - - if (longVal instanceof Long) { - return (long) longVal; - } else if (longVal instanceof OffsetDateTime) { - return Duration.between(Instant.EPOCH, (OffsetDateTime) longVal).toNanos() / 1000; - } else if (longVal instanceof LocalDate) { - return ((LocalDate) longVal).toEpochDay(); - } else if (longVal instanceof LocalTime) { - return ((LocalTime) longVal).toNanoOfDay(); - } else if (longVal instanceof LocalDateTime) { - return Duration.between(Instant.EPOCH, ((LocalDateTime) longVal).atOffset(ZoneOffset.UTC)) - .toNanos() - / 1000; - } else { - throw new IllegalStateException( - "Unknown type for long field. Type name: " + longVal.getClass().getName()); - } - } - - @Override - public float getFloat(int pos) { - return struct.get(pos, Float.class); - } - - @Override - public double getDouble(int pos) { - return struct.get(pos, Double.class); - } - - @Override - public StringData getString(int pos) { - return isNullAt(pos) ? null : getStringDataInternal(pos); - } - - private StringData getStringDataInternal(int pos) { - CharSequence seq = struct.get(pos, CharSequence.class); - return StringData.fromString(seq.toString()); - } - - @Override - public DecimalData getDecimal(int pos, int precision, int scale) { - return isNullAt(pos) - ? null - : DecimalData.fromBigDecimal(getDecimalInternal(pos), precision, scale); - } - - private BigDecimal getDecimalInternal(int pos) { - return struct.get(pos, BigDecimal.class); - } - - @Override - public TimestampData getTimestamp(int pos, int precision) { - long timeLong = getLong(pos); - return TimestampData.fromEpochMillis(timeLong / 1000, (int) (timeLong % 1000) * 1000); - } - - @Override - public RawValueData getRawValue(int pos) { - throw new UnsupportedOperationException("Not supported yet."); - } - - @Override - public byte[] getBinary(int pos) { - return isNullAt(pos) ? 
null : getBinaryInternal(pos); - } - - private byte[] getBinaryInternal(int pos) { - Object bytes = struct.get(pos, Object.class); - - // should only be either ByteBuffer or byte[] - if (bytes instanceof ByteBuffer) { - return ByteBuffers.toByteArray((ByteBuffer) bytes); - } else if (bytes instanceof byte[]) { - return (byte[]) bytes; - } else if (bytes instanceof UUID) { - UUID uuid = (UUID) bytes; - ByteBuffer bb = ByteBuffer.allocate(16); - bb.putLong(uuid.getMostSignificantBits()); - bb.putLong(uuid.getLeastSignificantBits()); - return bb.array(); - } else { - throw new IllegalStateException( - "Unknown type for binary field. Type name: " + bytes.getClass().getName()); - } - } - - @Override - public ArrayData getArray(int pos) { - return isNullAt(pos) - ? null - : (ArrayData) - convertValue(type.fields().get(pos).type().asListType(), struct.get(pos, List.class)); - } - - @Override - public MapData getMap(int pos) { - return isNullAt(pos) - ? null - : (MapData) - convertValue(type.fields().get(pos).type().asMapType(), struct.get(pos, Map.class)); - } - - @Override - public RowData getRow(int pos, int numFields) { - return isNullAt(pos) ? null : getStructRowData(pos, numFields); - } - - private StructRowData getStructRowData(int pos, int numFields) { - return new StructRowData( - type.fields().get(pos).type().asStructType(), struct.get(pos, StructLike.class)); - } - - private Object convertValue(Type elementType, Object value) { - switch (elementType.typeId()) { - case BOOLEAN: - case INTEGER: - case DATE: - case TIME: - case LONG: - case FLOAT: - case DOUBLE: - case DECIMAL: - return value; - case TIMESTAMP: - long millisecond = (long) value / 1000; - int nanoOfMillisecond = (int) ((Long) value % 1000) * 1000; - return TimestampData.fromEpochMillis(millisecond, nanoOfMillisecond); - case STRING: - return StringData.fromString(value.toString()); - case FIXED: - case BINARY: - return ByteBuffers.toByteArray((ByteBuffer) value); - case STRUCT: - return new StructRowData(elementType.asStructType(), (StructLike) value); - case LIST: - List list = (List) value; - Object[] array = new Object[list.size()]; - - int index = 0; - for (Object element : list) { - if (element == null) { - array[index] = null; - } else { - array[index] = convertValue(elementType.asListType().elementType(), element); - } - - index += 1; - } - return new GenericArrayData(array); - case MAP: - Types.MapType mapType = elementType.asMapType(); - Set> entries = ((Map) value).entrySet(); - Map result = Maps.newHashMap(); - for (Map.Entry entry : entries) { - final Object keyValue = convertValue(mapType.keyType(), entry.getKey()); - final Object valueValue = convertValue(mapType.valueType(), entry.getValue()); - result.put(keyValue, valueValue); - } - - return new GenericMapData(result); - default: - throw new UnsupportedOperationException("Unsupported element type: " + elementType); - } - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/AvroGenericRecordToRowDataMapper.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/AvroGenericRecordToRowDataMapper.java deleted file mode 100644 index f7e8e0c884cf..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/AvroGenericRecordToRowDataMapper.java +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink; - -import org.apache.avro.Schema; -import org.apache.avro.generic.GenericRecord; -import org.apache.flink.api.common.functions.MapFunction; -import org.apache.flink.formats.avro.AvroToRowDataConverters; -import org.apache.flink.formats.avro.typeutils.AvroSchemaConverter; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.DataType; -import org.apache.flink.table.types.logical.LogicalType; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.table.types.utils.TypeConversions; -import org.apache.iceberg.avro.AvroSchemaUtil; - -/** - * This util class converts Avro GenericRecord to Flink RowData.
- *
    - * Internally it uses Flink {@link AvroToRowDataConverters}. Because of the precision difference - * between how Iceberg schema (micro) and Flink {@link AvroToRowDataConverters} (milli) deal with - * time type, we can't directly use the Avro Schema converted from Iceberg schema via {@link - * AvroSchemaUtil#convert(org.apache.iceberg.Schema, String)}. - */ -public class AvroGenericRecordToRowDataMapper implements MapFunction { - - private final AvroToRowDataConverters.AvroToRowDataConverter converter; - - AvroGenericRecordToRowDataMapper(RowType rowType) { - this.converter = AvroToRowDataConverters.createRowConverter(rowType); - } - - @Override - public RowData map(GenericRecord genericRecord) throws Exception { - return (RowData) converter.convert(genericRecord); - } - - /** Create a mapper based on Avro schema. */ - public static AvroGenericRecordToRowDataMapper forAvroSchema(Schema avroSchema) { - DataType dataType = AvroSchemaConverter.convertToDataType(avroSchema.toString()); - LogicalType logicalType = TypeConversions.fromDataToLogicalType(dataType); - RowType rowType = RowType.of(logicalType.getChildren().toArray(new LogicalType[0])); - return new AvroGenericRecordToRowDataMapper(rowType); - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/BaseDeltaTaskWriter.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/BaseDeltaTaskWriter.java deleted file mode 100644 index e8a46c5becd7..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/BaseDeltaTaskWriter.java +++ /dev/null @@ -1,126 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
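For reference, a minimal sketch of how the removed AvroGenericRecordToRowDataMapper is typically exercised (illustrative only, not part of this patch). The schema, field values, and the class name AvroToRowDataSketch are assumptions; the schema deliberately avoids time types because of the micro/milli precision caveat the javadoc above describes.

    import org.apache.avro.generic.GenericData;
    import org.apache.avro.generic.GenericRecord;
    import org.apache.flink.table.data.RowData;
    import org.apache.iceberg.Schema;
    import org.apache.iceberg.avro.AvroSchemaUtil;
    import org.apache.iceberg.flink.sink.AvroGenericRecordToRowDataMapper;
    import org.apache.iceberg.types.Types;

    public class AvroToRowDataSketch {
      public static void main(String[] args) throws Exception {
        // Hypothetical schema with no time types (see the precision note above).
        Schema icebergSchema =
            new Schema(
                Types.NestedField.required(1, "id", Types.LongType.get()),
                Types.NestedField.optional(2, "data", Types.StringType.get()));
        org.apache.avro.Schema avroSchema = AvroSchemaUtil.convert(icebergSchema, "row");

        // Build the mapper from the Avro schema and convert one record.
        AvroGenericRecordToRowDataMapper mapper =
            AvroGenericRecordToRowDataMapper.forAvroSchema(avroSchema);

        GenericRecord record = new GenericData.Record(avroSchema);
        record.put("id", 1L);
        record.put("data", "a");

        RowData row = mapper.map(record);
        System.out.println(row.getArity()); // 2
      }
    }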
- */ -package org.apache.iceberg.flink.sink; - -import java.io.IOException; -import java.util.List; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.PartitionKey; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.StructLike; -import org.apache.iceberg.deletes.DeleteGranularity; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.RowDataWrapper; -import org.apache.iceberg.flink.data.RowDataProjection; -import org.apache.iceberg.io.BaseTaskWriter; -import org.apache.iceberg.io.FileAppenderFactory; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.io.OutputFileFactory; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.apache.iceberg.types.TypeUtil; - -abstract class BaseDeltaTaskWriter extends BaseTaskWriter { - - private final Schema schema; - private final Schema deleteSchema; - private final RowDataWrapper wrapper; - private final RowDataWrapper keyWrapper; - private final RowDataProjection keyProjection; - private final boolean upsert; - - BaseDeltaTaskWriter( - PartitionSpec spec, - FileFormat format, - FileAppenderFactory appenderFactory, - OutputFileFactory fileFactory, - FileIO io, - long targetFileSize, - Schema schema, - RowType flinkSchema, - List equalityFieldIds, - boolean upsert) { - super(spec, format, appenderFactory, fileFactory, io, targetFileSize); - this.schema = schema; - this.deleteSchema = TypeUtil.select(schema, Sets.newHashSet(equalityFieldIds)); - this.wrapper = new RowDataWrapper(flinkSchema, schema.asStruct()); - this.keyWrapper = - new RowDataWrapper(FlinkSchemaUtil.convert(deleteSchema), deleteSchema.asStruct()); - this.keyProjection = - RowDataProjection.create(flinkSchema, schema.asStruct(), deleteSchema.asStruct()); - this.upsert = upsert; - } - - abstract RowDataDeltaWriter route(RowData row); - - RowDataWrapper wrapper() { - return wrapper; - } - - @Override - public void write(RowData row) throws IOException { - RowDataDeltaWriter writer = route(row); - - switch (row.getRowKind()) { - case INSERT: - case UPDATE_AFTER: - if (upsert) { - writer.deleteKey(keyProjection.wrap(row)); - } - writer.write(row); - break; - - case UPDATE_BEFORE: - if (upsert) { - break; // UPDATE_BEFORE is not necessary for UPSERT, we do nothing to prevent delete one - // row twice - } - writer.delete(row); - break; - case DELETE: - if (upsert) { - writer.deleteKey(keyProjection.wrap(row)); - } else { - writer.delete(row); - } - break; - - default: - throw new UnsupportedOperationException("Unknown row kind: " + row.getRowKind()); - } - } - - protected class RowDataDeltaWriter extends BaseEqualityDeltaWriter { - RowDataDeltaWriter(PartitionKey partition) { - super(partition, schema, deleteSchema, DeleteGranularity.FILE); - } - - @Override - protected StructLike asStructLike(RowData data) { - return wrapper.wrap(data); - } - - @Override - protected StructLike asStructLikeKey(RowData data) { - return keyWrapper.wrap(data); - } - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitionKeySelector.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitionKeySelector.java deleted file mode 100644 index 1cb6e013bd2c..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitionKeySelector.java +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Licensed to the 
Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink; - -import java.util.stream.IntStream; -import org.apache.flink.api.java.functions.KeySelector; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.PartitionKey; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.flink.RowDataWrapper; - -/** - * A {@link KeySelector} that extracts the bucketId from a data row's bucket partition as the key. - * To be used with the {@link BucketPartitioner}. - */ -class BucketPartitionKeySelector implements KeySelector { - - private final Schema schema; - private final PartitionKey partitionKey; - private final RowType flinkSchema; - private final int bucketFieldPosition; - - private transient RowDataWrapper rowDataWrapper; - - BucketPartitionKeySelector(PartitionSpec partitionSpec, Schema schema, RowType flinkSchema) { - this.schema = schema; - this.partitionKey = new PartitionKey(partitionSpec, schema); - this.flinkSchema = flinkSchema; - this.bucketFieldPosition = getBucketFieldPosition(partitionSpec); - } - - private int getBucketFieldPosition(PartitionSpec partitionSpec) { - int bucketFieldId = BucketPartitionerUtil.getBucketFieldId(partitionSpec); - return IntStream.range(0, partitionSpec.fields().size()) - .filter(i -> partitionSpec.fields().get(i).fieldId() == bucketFieldId) - .toArray()[0]; - } - - private RowDataWrapper lazyRowDataWrapper() { - if (rowDataWrapper == null) { - rowDataWrapper = new RowDataWrapper(flinkSchema, schema.asStruct()); - } - - return rowDataWrapper; - } - - @Override - public Integer getKey(RowData rowData) { - partitionKey.partition(lazyRowDataWrapper().wrap(rowData)); - return partitionKey.get(bucketFieldPosition, Integer.class); - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitioner.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitioner.java deleted file mode 100644 index 9c9a117906e2..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitioner.java +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink; - -import org.apache.flink.api.common.functions.Partitioner; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; - -/** - * This partitioner will redirect records to writers deterministically based on the Bucket partition - * spec. It'll attempt to optimize the file size written depending on whether numPartitions is - * greater, less or equal than the maxNumBuckets. Note: The current implementation only supports ONE - * bucket in the partition spec. - */ -class BucketPartitioner implements Partitioner { - - static final String BUCKET_NULL_MESSAGE = "bucketId cannot be null"; - static final String BUCKET_LESS_THAN_LOWER_BOUND_MESSAGE = - "Invalid bucket ID %s: must be non-negative."; - static final String BUCKET_GREATER_THAN_UPPER_BOUND_MESSAGE = - "Invalid bucket ID %s: must be less than bucket limit: %s."; - - private final int maxNumBuckets; - - // To hold the OFFSET of the next writer to use for any bucket, only used when writers > the - // number of buckets - private final int[] currentBucketWriterOffset; - - BucketPartitioner(PartitionSpec partitionSpec) { - this.maxNumBuckets = BucketPartitionerUtil.getMaxNumBuckets(partitionSpec); - this.currentBucketWriterOffset = new int[maxNumBuckets]; - } - - /** - * Determine the partition id based on the following criteria: If the number of writers <= the - * number of buckets, an evenly distributed number of buckets will be assigned to each writer (one - * writer -> many buckets). Conversely, if the number of writers > the number of buckets the logic - * is handled by the {@link #getPartitionWithMoreWritersThanBuckets - * getPartitionWritersGreaterThanBuckets} method. - * - * @param bucketId the bucketId for each request - * @param numPartitions the total number of partitions - * @return the partition id (writer) to use for each request - */ - @Override - public int partition(Integer bucketId, int numPartitions) { - Preconditions.checkNotNull(bucketId, BUCKET_NULL_MESSAGE); - Preconditions.checkArgument(bucketId >= 0, BUCKET_LESS_THAN_LOWER_BOUND_MESSAGE, bucketId); - Preconditions.checkArgument( - bucketId < maxNumBuckets, BUCKET_GREATER_THAN_UPPER_BOUND_MESSAGE, bucketId, maxNumBuckets); - - if (numPartitions <= maxNumBuckets) { - return bucketId % numPartitions; - } else { - return getPartitionWithMoreWritersThanBuckets(bucketId, numPartitions); - } - } - - /*- - * If the number of writers > the number of buckets each partitioner will keep a state of multiple - * writers per bucket as evenly as possible, and will round-robin the requests across them, in this - * case each writer will target only one bucket at all times (many writers -> one bucket). 
Example: - * Configuration: numPartitions (writers) = 5, maxBuckets = 2 - * Expected behavior: - * - Records for Bucket 0 will be "round robin" between Writers 0, 2 and 4 - * - Records for Bucket 1 will always use Writer 1 and 3 - * Notes: - * - maxNumWritersPerBucket determines when to reset the currentBucketWriterOffset to 0 for this bucketId - * - When numPartitions is not evenly divisible by maxBuckets, some buckets will have one more writer (extraWriter). - * In this example Bucket 0 has an "extra writer" to consider before resetting its offset to 0. - * - * @return the destination partition index (writer subtask id) - */ - private int getPartitionWithMoreWritersThanBuckets(int bucketId, int numPartitions) { - int currentOffset = currentBucketWriterOffset[bucketId]; - // Determine if this bucket requires an "extra writer" - int extraWriter = bucketId < (numPartitions % maxNumBuckets) ? 1 : 0; - // The max number of writers this bucket can have - int maxNumWritersPerBucket = (numPartitions / maxNumBuckets) + extraWriter; - - // Increment the writer offset or reset if it's reached the max for this bucket - int nextOffset = currentOffset == maxNumWritersPerBucket - 1 ? 0 : currentOffset + 1; - currentBucketWriterOffset[bucketId] = nextOffset; - - return bucketId + (maxNumBuckets * currentOffset); - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitionerUtil.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitionerUtil.java deleted file mode 100644 index c33207728d3e..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/BucketPartitionerUtil.java +++ /dev/null @@ -1,125 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink; - -import java.util.List; -import java.util.Objects; -import java.util.stream.Collectors; -import org.apache.flink.api.java.tuple.Tuple2; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.transforms.PartitionSpecVisitor; - -final class BucketPartitionerUtil { - static final String BAD_NUMBER_OF_BUCKETS_ERROR_MESSAGE = - "Invalid number of buckets: %s (must be 1)"; - - private BucketPartitionerUtil() {} - - /** - * Determines whether the PartitionSpec has one and only one Bucket definition - * - * @param partitionSpec the partition spec in question - * @return whether the PartitionSpec has only one Bucket - */ - static boolean hasOneBucketField(PartitionSpec partitionSpec) { - List> bucketFields = getBucketFields(partitionSpec); - return bucketFields != null && bucketFields.size() == 1; - } - - /** - * Extracts the Bucket definition from a PartitionSpec. 
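The assignment rule spelled out in the comment above can be reproduced with a short, test-style sketch (illustrative, not part of this patch). It assumes the code lives in org.apache.iceberg.flink.sink, since BucketPartitioner is package-private, and mirrors the numbers in the comment: 5 writers and one bucket transform with 2 buckets. In a real job the bucket id would come from BucketPartitionKeySelector wired through partitionCustom.

    package org.apache.iceberg.flink.sink;

    import org.apache.iceberg.PartitionSpec;
    import org.apache.iceberg.Schema;
    import org.apache.iceberg.types.Types;

    public class BucketPartitionerSketch {
      public static void main(String[] args) {
        Schema schema = new Schema(Types.NestedField.required(1, "id", Types.IntegerType.get()));
        // One bucket transform with maxNumBuckets = 2, matching the example in the comment.
        PartitionSpec spec = PartitionSpec.builderFor(schema).bucket("id", 2).build();

        BucketPartitioner partitioner = new BucketPartitioner(spec);
        int numWriters = 5;
        for (int i = 0; i < 3; i++) {
          // Bucket 0 rotates over writers 0, 2, 4; bucket 1 over writers 1 and 3.
          System.out.println("bucket 0 -> writer " + partitioner.partition(0, numWriters));
          System.out.println("bucket 1 -> writer " + partitioner.partition(1, numWriters));
        }
      }
    }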
- * - * @param partitionSpec the partition spec in question - * @return the Bucket definition in the form of a tuple (fieldId, maxNumBuckets) - */ - private static Tuple2 getBucketFieldInfo(PartitionSpec partitionSpec) { - List> bucketFields = getBucketFields(partitionSpec); - Preconditions.checkArgument( - bucketFields.size() == 1, - BucketPartitionerUtil.BAD_NUMBER_OF_BUCKETS_ERROR_MESSAGE, - bucketFields.size()); - return bucketFields.get(0); - } - - static int getBucketFieldId(PartitionSpec partitionSpec) { - return getBucketFieldInfo(partitionSpec).f0; - } - - static int getMaxNumBuckets(PartitionSpec partitionSpec) { - return getBucketFieldInfo(partitionSpec).f1; - } - - private static List> getBucketFields(PartitionSpec spec) { - return PartitionSpecVisitor.visit(spec, new BucketPartitionSpecVisitor()).stream() - .filter(Objects::nonNull) - .collect(Collectors.toList()); - } - - private static class BucketPartitionSpecVisitor - implements PartitionSpecVisitor> { - @Override - public Tuple2 identity(int fieldId, String sourceName, int sourceId) { - return null; - } - - @Override - public Tuple2 bucket( - int fieldId, String sourceName, int sourceId, int numBuckets) { - return new Tuple2<>(fieldId, numBuckets); - } - - @Override - public Tuple2 truncate( - int fieldId, String sourceName, int sourceId, int width) { - return null; - } - - @Override - public Tuple2 year(int fieldId, String sourceName, int sourceId) { - return null; - } - - @Override - public Tuple2 month(int fieldId, String sourceName, int sourceId) { - return null; - } - - @Override - public Tuple2 day(int fieldId, String sourceName, int sourceId) { - return null; - } - - @Override - public Tuple2 hour(int fieldId, String sourceName, int sourceId) { - return null; - } - - @Override - public Tuple2 alwaysNull(int fieldId, String sourceName, int sourceId) { - return null; - } - - @Override - public Tuple2 unknown( - int fieldId, String sourceName, int sourceId, String transform) { - return null; - } - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/CachingTableSupplier.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/CachingTableSupplier.java deleted file mode 100644 index e9f9786f9190..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/CachingTableSupplier.java +++ /dev/null @@ -1,91 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
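A small same-package sketch of the extraction described above (the util and its static methods are package-private); the schema and the bucket width 16 are assumptions.

    package org.apache.iceberg.flink.sink;

    import org.apache.iceberg.PartitionSpec;
    import org.apache.iceberg.Schema;
    import org.apache.iceberg.types.Types;

    public class BucketPartitionerUtilSketch {
      public static void main(String[] args) {
        Schema schema =
            new Schema(
                Types.NestedField.required(1, "id", Types.IntegerType.get()),
                Types.NestedField.required(2, "ts", Types.TimestampType.withZone()));
        // Exactly one bucket field next to a non-bucket (day) field.
        PartitionSpec spec = PartitionSpec.builderFor(schema).bucket("id", 16).day("ts").build();

        System.out.println(BucketPartitionerUtil.hasOneBucketField(spec)); // true
        System.out.println(BucketPartitionerUtil.getBucketFieldId(spec));  // id of the bucket partition field
        System.out.println(BucketPartitionerUtil.getMaxNumBuckets(spec));  // 16
      }
    }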
- */ -package org.apache.iceberg.flink.sink; - -import java.time.Duration; -import org.apache.flink.util.Preconditions; -import org.apache.iceberg.SerializableTable; -import org.apache.iceberg.Table; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.util.DateTimeUtil; -import org.apache.iceberg.util.SerializableSupplier; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * A table loader that will only reload a table after a certain interval has passed. WARNING: This - * table loader should be used carefully when used with writer tasks. It could result in heavy load - * on a catalog for jobs with many writers. - */ -class CachingTableSupplier implements SerializableSupplier
<Table>
  • { - - private static final Logger LOG = LoggerFactory.getLogger(CachingTableSupplier.class); - - private final Table initialTable; - private final TableLoader tableLoader; - private final Duration tableRefreshInterval; - private long lastLoadTimeMillis; - private transient Table table; - - CachingTableSupplier( - SerializableTable initialTable, TableLoader tableLoader, Duration tableRefreshInterval) { - Preconditions.checkArgument(initialTable != null, "initialTable cannot be null"); - Preconditions.checkArgument(tableLoader != null, "tableLoader cannot be null"); - Preconditions.checkArgument( - tableRefreshInterval != null, "tableRefreshInterval cannot be null"); - this.initialTable = initialTable; - this.table = initialTable; - this.tableLoader = tableLoader; - this.tableRefreshInterval = tableRefreshInterval; - this.lastLoadTimeMillis = System.currentTimeMillis(); - } - - @Override - public Table get() { - if (table == null) { - this.table = initialTable; - } - return table; - } - - Table initialTable() { - return initialTable; - } - - void refreshTable() { - if (System.currentTimeMillis() > lastLoadTimeMillis + tableRefreshInterval.toMillis()) { - try { - if (!tableLoader.isOpen()) { - tableLoader.open(); - } - - this.table = tableLoader.loadTable(); - this.lastLoadTimeMillis = System.currentTimeMillis(); - - LOG.info( - "Table {} reloaded, next min load time threshold is {}", - table.name(), - DateTimeUtil.formatTimestampMillis( - lastLoadTimeMillis + tableRefreshInterval.toMillis())); - } catch (Exception e) { - LOG.warn("An error occurred reloading table {}, table was not reloaded", table.name(), e); - } - } - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/CommitSummary.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/CommitSummary.java deleted file mode 100644 index 9a2f57181708..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/CommitSummary.java +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
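A minimal sketch of the intended usage of the removed CachingTableSupplier, assuming a Hadoop table already exists at the hypothetical location below and that the code sits in org.apache.iceberg.flink.sink (the class is package-private).

    package org.apache.iceberg.flink.sink;

    import java.time.Duration;
    import org.apache.iceberg.SerializableTable;
    import org.apache.iceberg.Table;
    import org.apache.iceberg.flink.TableLoader;

    public class CachingTableSupplierSketch {
      public static void main(String[] args) {
        // Hypothetical warehouse path; the table is assumed to exist there.
        TableLoader tableLoader = TableLoader.fromHadoopTable("file:///tmp/warehouse/db/tbl");
        tableLoader.open();
        Table table = tableLoader.loadTable();

        CachingTableSupplier supplier =
            new CachingTableSupplier(
                (SerializableTable) SerializableTable.copyOf(table),
                tableLoader,
                Duration.ofMinutes(5));

        supplier.get();          // returns the cached table
        supplier.refreshTable(); // reloads only once the 5 minute interval has elapsed
      }
    }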
- */ -package org.apache.iceberg.flink.sink; - -import java.util.Arrays; -import java.util.NavigableMap; -import java.util.concurrent.atomic.AtomicLong; -import org.apache.iceberg.io.WriteResult; -import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; - -class CommitSummary { - - private final AtomicLong dataFilesCount = new AtomicLong(); - private final AtomicLong dataFilesRecordCount = new AtomicLong(); - private final AtomicLong dataFilesByteCount = new AtomicLong(); - private final AtomicLong deleteFilesCount = new AtomicLong(); - private final AtomicLong deleteFilesRecordCount = new AtomicLong(); - private final AtomicLong deleteFilesByteCount = new AtomicLong(); - - CommitSummary(NavigableMap pendingResults) { - pendingResults - .values() - .forEach( - writeResult -> { - dataFilesCount.addAndGet(writeResult.dataFiles().length); - Arrays.stream(writeResult.dataFiles()) - .forEach( - dataFile -> { - dataFilesRecordCount.addAndGet(dataFile.recordCount()); - dataFilesByteCount.addAndGet(dataFile.fileSizeInBytes()); - }); - deleteFilesCount.addAndGet(writeResult.deleteFiles().length); - Arrays.stream(writeResult.deleteFiles()) - .forEach( - deleteFile -> { - deleteFilesRecordCount.addAndGet(deleteFile.recordCount()); - deleteFilesByteCount.addAndGet(deleteFile.fileSizeInBytes()); - }); - }); - } - - long dataFilesCount() { - return dataFilesCount.get(); - } - - long dataFilesRecordCount() { - return dataFilesRecordCount.get(); - } - - long dataFilesByteCount() { - return dataFilesByteCount.get(); - } - - long deleteFilesCount() { - return deleteFilesCount.get(); - } - - long deleteFilesRecordCount() { - return deleteFilesRecordCount.get(); - } - - long deleteFilesByteCount() { - return deleteFilesByteCount.get(); - } - - @Override - public String toString() { - return MoreObjects.toStringHelper(this) - .add("dataFilesCount", dataFilesCount) - .add("dataFilesRecordCount", dataFilesRecordCount) - .add("dataFilesByteCount", dataFilesByteCount) - .add("deleteFilesCount", deleteFilesCount) - .add("deleteFilesRecordCount", deleteFilesRecordCount) - .add("deleteFilesByteCount", deleteFilesByteCount) - .toString(); - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifests.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifests.java deleted file mode 100644 index 036970c06d5b..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifests.java +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink; - -import java.util.List; -import org.apache.iceberg.ManifestFile; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; - -class DeltaManifests { - - private static final CharSequence[] EMPTY_REF_DATA_FILES = new CharSequence[0]; - - private final ManifestFile dataManifest; - private final ManifestFile deleteManifest; - private final CharSequence[] referencedDataFiles; - - DeltaManifests(ManifestFile dataManifest, ManifestFile deleteManifest) { - this(dataManifest, deleteManifest, EMPTY_REF_DATA_FILES); - } - - DeltaManifests( - ManifestFile dataManifest, ManifestFile deleteManifest, CharSequence[] referencedDataFiles) { - Preconditions.checkNotNull(referencedDataFiles, "Referenced data files shouldn't be null."); - - this.dataManifest = dataManifest; - this.deleteManifest = deleteManifest; - this.referencedDataFiles = referencedDataFiles; - } - - ManifestFile dataManifest() { - return dataManifest; - } - - ManifestFile deleteManifest() { - return deleteManifest; - } - - CharSequence[] referencedDataFiles() { - return referencedDataFiles; - } - - List manifests() { - List manifests = Lists.newArrayListWithCapacity(2); - if (dataManifest != null) { - manifests.add(dataManifest); - } - - if (deleteManifest != null) { - manifests.add(deleteManifest); - } - - return manifests; - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifestsSerializer.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifestsSerializer.java deleted file mode 100644 index 92ca284b12ba..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifestsSerializer.java +++ /dev/null @@ -1,122 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink; - -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; -import java.io.DataInputStream; -import java.io.DataOutputStream; -import java.io.IOException; -import org.apache.flink.core.io.SimpleVersionedSerializer; -import org.apache.iceberg.ManifestFile; -import org.apache.iceberg.ManifestFiles; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; - -class DeltaManifestsSerializer implements SimpleVersionedSerializer { - private static final int VERSION_1 = 1; - private static final int VERSION_2 = 2; - private static final byte[] EMPTY_BINARY = new byte[0]; - - static final DeltaManifestsSerializer INSTANCE = new DeltaManifestsSerializer(); - - @Override - public int getVersion() { - return VERSION_2; - } - - @Override - public byte[] serialize(DeltaManifests deltaManifests) throws IOException { - Preconditions.checkNotNull( - deltaManifests, "DeltaManifests to be serialized should not be null"); - - ByteArrayOutputStream binaryOut = new ByteArrayOutputStream(); - DataOutputStream out = new DataOutputStream(binaryOut); - - byte[] dataManifestBinary = EMPTY_BINARY; - if (deltaManifests.dataManifest() != null) { - dataManifestBinary = ManifestFiles.encode(deltaManifests.dataManifest()); - } - - out.writeInt(dataManifestBinary.length); - out.write(dataManifestBinary); - - byte[] deleteManifestBinary = EMPTY_BINARY; - if (deltaManifests.deleteManifest() != null) { - deleteManifestBinary = ManifestFiles.encode(deltaManifests.deleteManifest()); - } - - out.writeInt(deleteManifestBinary.length); - out.write(deleteManifestBinary); - - CharSequence[] referencedDataFiles = deltaManifests.referencedDataFiles(); - out.writeInt(referencedDataFiles.length); - for (CharSequence referencedDataFile : referencedDataFiles) { - out.writeUTF(referencedDataFile.toString()); - } - - return binaryOut.toByteArray(); - } - - @Override - public DeltaManifests deserialize(int version, byte[] serialized) throws IOException { - if (version == VERSION_1) { - return deserializeV1(serialized); - } else if (version == VERSION_2) { - return deserializeV2(serialized); - } else { - throw new RuntimeException("Unknown serialize version: " + version); - } - } - - private DeltaManifests deserializeV1(byte[] serialized) throws IOException { - return new DeltaManifests(ManifestFiles.decode(serialized), null); - } - - private DeltaManifests deserializeV2(byte[] serialized) throws IOException { - ManifestFile dataManifest = null; - ManifestFile deleteManifest = null; - - ByteArrayInputStream binaryIn = new ByteArrayInputStream(serialized); - DataInputStream in = new DataInputStream(binaryIn); - - int dataManifestSize = in.readInt(); - if (dataManifestSize > 0) { - byte[] dataManifestBinary = new byte[dataManifestSize]; - Preconditions.checkState(in.read(dataManifestBinary) == dataManifestSize); - - dataManifest = ManifestFiles.decode(dataManifestBinary); - } - - int deleteManifestSize = in.readInt(); - if (deleteManifestSize > 0) { - byte[] deleteManifestBinary = new byte[deleteManifestSize]; - Preconditions.checkState(in.read(deleteManifestBinary) == deleteManifestSize); - - deleteManifest = ManifestFiles.decode(deleteManifestBinary); - } - - int referenceDataFileNum = in.readInt(); - CharSequence[] referencedDataFiles = new CharSequence[referenceDataFileNum]; - for (int i = 0; i < referenceDataFileNum; i++) { - referencedDataFiles[i] = in.readUTF(); - } - - return new DeltaManifests(dataManifest, deleteManifest, referencedDataFiles); - } -} 
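The versioned serialization above round-trips as in the following same-package sketch; an empty DeltaManifests keeps it self-contained, whereas real callers pass manifests produced by FlinkManifestUtil.

    package org.apache.iceberg.flink.sink;

    public class DeltaManifestsSerializerSketch {
      public static void main(String[] args) throws Exception {
        DeltaManifestsSerializer serializer = DeltaManifestsSerializer.INSTANCE;

        // No data manifest, no delete manifest: every length field is written as 0.
        DeltaManifests empty = new DeltaManifests(null, null);

        byte[] bytes = serializer.serialize(empty);
        DeltaManifests restored = serializer.deserialize(serializer.getVersion(), bytes);

        System.out.println(restored.manifests().isEmpty()); // true
      }
    }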
diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/EqualityFieldKeySelector.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/EqualityFieldKeySelector.java deleted file mode 100644 index 18b269d6c3e9..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/EqualityFieldKeySelector.java +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink; - -import java.util.List; -import org.apache.flink.api.java.functions.KeySelector; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.Schema; -import org.apache.iceberg.flink.RowDataWrapper; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.apache.iceberg.types.TypeUtil; -import org.apache.iceberg.util.StructLikeWrapper; -import org.apache.iceberg.util.StructProjection; - -/** - * Create a {@link KeySelector} to shuffle by equality fields, to ensure same equality fields record - * will be emitted to same writer in order. - */ -class EqualityFieldKeySelector implements KeySelector { - - private final Schema schema; - private final RowType flinkSchema; - private final Schema deleteSchema; - - private transient RowDataWrapper rowDataWrapper; - private transient StructProjection structProjection; - private transient StructLikeWrapper structLikeWrapper; - - EqualityFieldKeySelector(Schema schema, RowType flinkSchema, List equalityFieldIds) { - this.schema = schema; - this.flinkSchema = flinkSchema; - this.deleteSchema = TypeUtil.select(schema, Sets.newHashSet(equalityFieldIds)); - } - - /** - * Construct the {@link RowDataWrapper} lazily here because few members in it are not - * serializable. In this way, we don't have to serialize them with forcing. - */ - protected RowDataWrapper lazyRowDataWrapper() { - if (rowDataWrapper == null) { - rowDataWrapper = new RowDataWrapper(flinkSchema, schema.asStruct()); - } - return rowDataWrapper; - } - - /** Construct the {@link StructProjection} lazily because it is not serializable. */ - protected StructProjection lazyStructProjection() { - if (structProjection == null) { - structProjection = StructProjection.create(schema, deleteSchema); - } - return structProjection; - } - - /** Construct the {@link StructLikeWrapper} lazily because it is not serializable. 
*/ - protected StructLikeWrapper lazyStructLikeWrapper() { - if (structLikeWrapper == null) { - structLikeWrapper = StructLikeWrapper.forType(deleteSchema.asStruct()); - } - return structLikeWrapper; - } - - @Override - public Integer getKey(RowData row) { - RowDataWrapper wrappedRowData = lazyRowDataWrapper().wrap(row); - StructProjection projectedRowData = lazyStructProjection().wrap(wrappedRowData); - StructLikeWrapper wrapper = lazyStructLikeWrapper().set(projectedRowData); - return wrapper.hashCode(); - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkAppenderFactory.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkAppenderFactory.java deleted file mode 100644 index eacef58a8d5d..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkAppenderFactory.java +++ /dev/null @@ -1,280 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink; - -import java.io.IOException; -import java.io.Serializable; -import java.io.UncheckedIOException; -import java.util.Map; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.MetricsConfig; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.StructLike; -import org.apache.iceberg.Table; -import org.apache.iceberg.avro.Avro; -import org.apache.iceberg.deletes.EqualityDeleteWriter; -import org.apache.iceberg.deletes.PositionDeleteWriter; -import org.apache.iceberg.encryption.EncryptedOutputFile; -import org.apache.iceberg.encryption.EncryptionUtil; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.data.FlinkAvroWriter; -import org.apache.iceberg.flink.data.FlinkOrcWriter; -import org.apache.iceberg.flink.data.FlinkParquetWriters; -import org.apache.iceberg.io.DataWriter; -import org.apache.iceberg.io.DeleteSchemaUtil; -import org.apache.iceberg.io.FileAppender; -import org.apache.iceberg.io.FileAppenderFactory; -import org.apache.iceberg.io.OutputFile; -import org.apache.iceberg.orc.ORC; -import org.apache.iceberg.parquet.Parquet; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; - -public class FlinkAppenderFactory implements FileAppenderFactory, Serializable { - private final Schema schema; - private final RowType flinkSchema; - private final Map props; - private final PartitionSpec spec; - private final int[] equalityFieldIds; - private final Schema eqDeleteRowSchema; - private final Schema posDeleteRowSchema; - private final Table table; - - private RowType eqDeleteFlinkSchema = null; - private RowType 
posDeleteFlinkSchema = null; - - public FlinkAppenderFactory( - Table table, - Schema schema, - RowType flinkSchema, - Map props, - PartitionSpec spec, - int[] equalityFieldIds, - Schema eqDeleteRowSchema, - Schema posDeleteRowSchema) { - Preconditions.checkNotNull(table, "Table shouldn't be null"); - this.table = table; - this.schema = schema; - this.flinkSchema = flinkSchema; - this.props = props; - this.spec = spec; - this.equalityFieldIds = equalityFieldIds; - this.eqDeleteRowSchema = eqDeleteRowSchema; - this.posDeleteRowSchema = posDeleteRowSchema; - } - - private RowType lazyEqDeleteFlinkSchema() { - if (eqDeleteFlinkSchema == null) { - Preconditions.checkNotNull(eqDeleteRowSchema, "Equality delete row schema shouldn't be null"); - this.eqDeleteFlinkSchema = FlinkSchemaUtil.convert(eqDeleteRowSchema); - } - return eqDeleteFlinkSchema; - } - - private RowType lazyPosDeleteFlinkSchema() { - if (posDeleteFlinkSchema == null) { - Preconditions.checkNotNull(posDeleteRowSchema, "Pos-delete row schema shouldn't be null"); - this.posDeleteFlinkSchema = FlinkSchemaUtil.convert(posDeleteRowSchema); - } - return this.posDeleteFlinkSchema; - } - - @Override - public FileAppender newAppender(OutputFile outputFile, FileFormat format) { - return newAppender(EncryptionUtil.plainAsEncryptedOutput(outputFile), format); - } - - @Override - public FileAppender newAppender(EncryptedOutputFile outputFile, FileFormat format) { - MetricsConfig metricsConfig = MetricsConfig.forTable(table); - try { - switch (format) { - case AVRO: - return Avro.write(outputFile) - .createWriterFunc(ignore -> new FlinkAvroWriter(flinkSchema)) - .setAll(props) - .schema(schema) - .metricsConfig(metricsConfig) - .overwrite() - .build(); - - case ORC: - return ORC.write(outputFile) - .createWriterFunc( - (iSchema, typDesc) -> FlinkOrcWriter.buildWriter(flinkSchema, iSchema)) - .setAll(props) - .metricsConfig(metricsConfig) - .schema(schema) - .overwrite() - .build(); - - case PARQUET: - return Parquet.write(outputFile) - .createWriterFunc(msgType -> FlinkParquetWriters.buildWriter(flinkSchema, msgType)) - .setAll(props) - .metricsConfig(metricsConfig) - .schema(schema) - .overwrite() - .build(); - - default: - throw new UnsupportedOperationException("Cannot write unknown file format: " + format); - } - } catch (IOException e) { - throw new UncheckedIOException(e); - } - } - - @Override - public DataWriter newDataWriter( - EncryptedOutputFile file, FileFormat format, StructLike partition) { - return new DataWriter<>( - newAppender(file, format), - format, - file.encryptingOutputFile().location(), - spec, - partition, - file.keyMetadata()); - } - - @Override - public EqualityDeleteWriter newEqDeleteWriter( - EncryptedOutputFile outputFile, FileFormat format, StructLike partition) { - Preconditions.checkState( - equalityFieldIds != null && equalityFieldIds.length > 0, - "Equality field ids shouldn't be null or empty when creating equality-delete writer"); - Preconditions.checkNotNull( - eqDeleteRowSchema, - "Equality delete row schema shouldn't be null when creating equality-delete writer"); - - MetricsConfig metricsConfig = MetricsConfig.forTable(table); - try { - switch (format) { - case AVRO: - return Avro.writeDeletes(outputFile) - .createWriterFunc(ignore -> new FlinkAvroWriter(lazyEqDeleteFlinkSchema())) - .withPartition(partition) - .overwrite() - .setAll(props) - .metricsConfig(metricsConfig) - .rowSchema(eqDeleteRowSchema) - .withSpec(spec) - .withKeyMetadata(outputFile.keyMetadata()) - .equalityFieldIds(equalityFieldIds) 
- .buildEqualityWriter(); - - case ORC: - return ORC.writeDeletes(outputFile) - .createWriterFunc( - (iSchema, typDesc) -> FlinkOrcWriter.buildWriter(flinkSchema, iSchema)) - .withPartition(partition) - .overwrite() - .setAll(props) - .metricsConfig(metricsConfig) - .rowSchema(eqDeleteRowSchema) - .withSpec(spec) - .withKeyMetadata(outputFile.keyMetadata()) - .equalityFieldIds(equalityFieldIds) - .buildEqualityWriter(); - - case PARQUET: - return Parquet.writeDeletes(outputFile) - .createWriterFunc( - msgType -> FlinkParquetWriters.buildWriter(lazyEqDeleteFlinkSchema(), msgType)) - .withPartition(partition) - .overwrite() - .setAll(props) - .metricsConfig(metricsConfig) - .rowSchema(eqDeleteRowSchema) - .withSpec(spec) - .withKeyMetadata(outputFile.keyMetadata()) - .equalityFieldIds(equalityFieldIds) - .buildEqualityWriter(); - - default: - throw new UnsupportedOperationException( - "Cannot write equality-deletes for unsupported file format: " + format); - } - } catch (IOException e) { - throw new UncheckedIOException(e); - } - } - - @Override - public PositionDeleteWriter newPosDeleteWriter( - EncryptedOutputFile outputFile, FileFormat format, StructLike partition) { - MetricsConfig metricsConfig = MetricsConfig.forPositionDelete(table); - try { - switch (format) { - case AVRO: - return Avro.writeDeletes(outputFile) - .createWriterFunc(ignore -> new FlinkAvroWriter(lazyPosDeleteFlinkSchema())) - .withPartition(partition) - .overwrite() - .setAll(props) - .metricsConfig(metricsConfig) - .rowSchema(posDeleteRowSchema) - .withSpec(spec) - .withKeyMetadata(outputFile.keyMetadata()) - .buildPositionWriter(); - - case ORC: - RowType orcPosDeleteSchema = - FlinkSchemaUtil.convert(DeleteSchemaUtil.posDeleteSchema(posDeleteRowSchema)); - return ORC.writeDeletes(outputFile) - .createWriterFunc( - (iSchema, typDesc) -> FlinkOrcWriter.buildWriter(orcPosDeleteSchema, iSchema)) - .withPartition(partition) - .overwrite() - .setAll(props) - .metricsConfig(metricsConfig) - .rowSchema(posDeleteRowSchema) - .withSpec(spec) - .withKeyMetadata(outputFile.keyMetadata()) - .transformPaths(path -> StringData.fromString(path.toString())) - .buildPositionWriter(); - - case PARQUET: - RowType flinkPosDeleteSchema = - FlinkSchemaUtil.convert(DeleteSchemaUtil.posDeleteSchema(posDeleteRowSchema)); - return Parquet.writeDeletes(outputFile) - .createWriterFunc( - msgType -> FlinkParquetWriters.buildWriter(flinkPosDeleteSchema, msgType)) - .withPartition(partition) - .overwrite() - .setAll(props) - .metricsConfig(metricsConfig) - .rowSchema(posDeleteRowSchema) - .withSpec(spec) - .withKeyMetadata(outputFile.keyMetadata()) - .transformPaths(path -> StringData.fromString(path.toString())) - .buildPositionWriter(); - - default: - throw new UnsupportedOperationException( - "Cannot write pos-deletes for unsupported file format: " + format); - } - } catch (IOException e) { - throw new UncheckedIOException(e); - } - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkFileWriterFactory.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkFileWriterFactory.java deleted file mode 100644 index 2183fe062af4..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkFileWriterFactory.java +++ /dev/null @@ -1,293 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink; - -import static org.apache.iceberg.MetadataColumns.DELETE_FILE_ROW_FIELD_NAME; -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT_DEFAULT; -import static org.apache.iceberg.TableProperties.DELETE_DEFAULT_FILE_FORMAT; - -import java.io.Serializable; -import java.util.Map; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Schema; -import org.apache.iceberg.SortOrder; -import org.apache.iceberg.Table; -import org.apache.iceberg.avro.Avro; -import org.apache.iceberg.data.BaseFileWriterFactory; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.data.FlinkAvroWriter; -import org.apache.iceberg.flink.data.FlinkOrcWriter; -import org.apache.iceberg.flink.data.FlinkParquetWriters; -import org.apache.iceberg.io.DeleteSchemaUtil; -import org.apache.iceberg.orc.ORC; -import org.apache.iceberg.parquet.Parquet; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; - -class FlinkFileWriterFactory extends BaseFileWriterFactory implements Serializable { - private RowType dataFlinkType; - private RowType equalityDeleteFlinkType; - private RowType positionDeleteFlinkType; - - FlinkFileWriterFactory( - Table table, - FileFormat dataFileFormat, - Schema dataSchema, - RowType dataFlinkType, - SortOrder dataSortOrder, - FileFormat deleteFileFormat, - int[] equalityFieldIds, - Schema equalityDeleteRowSchema, - RowType equalityDeleteFlinkType, - SortOrder equalityDeleteSortOrder, - Schema positionDeleteRowSchema, - RowType positionDeleteFlinkType) { - - super( - table, - dataFileFormat, - dataSchema, - dataSortOrder, - deleteFileFormat, - equalityFieldIds, - equalityDeleteRowSchema, - equalityDeleteSortOrder, - positionDeleteRowSchema); - - this.dataFlinkType = dataFlinkType; - this.equalityDeleteFlinkType = equalityDeleteFlinkType; - this.positionDeleteFlinkType = positionDeleteFlinkType; - } - - static Builder builderFor(Table table) { - return new Builder(table); - } - - @Override - protected void configureDataWrite(Avro.DataWriteBuilder builder) { - builder.createWriterFunc(ignore -> new FlinkAvroWriter(dataFlinkType())); - } - - @Override - protected void configureEqualityDelete(Avro.DeleteWriteBuilder builder) { - builder.createWriterFunc(ignored -> new FlinkAvroWriter(equalityDeleteFlinkType())); - } - - @Override - protected void configurePositionDelete(Avro.DeleteWriteBuilder builder) { - int rowFieldIndex = positionDeleteFlinkType().getFieldIndex(DELETE_FILE_ROW_FIELD_NAME); - if (rowFieldIndex >= 0) { - // FlinkAvroWriter accepts just the Flink type of the row ignoring the path and pos - RowType positionDeleteRowFlinkType = - (RowType) 
positionDeleteFlinkType().getTypeAt(rowFieldIndex); - builder.createWriterFunc(ignored -> new FlinkAvroWriter(positionDeleteRowFlinkType)); - } - } - - @Override - protected void configureDataWrite(Parquet.DataWriteBuilder builder) { - builder.createWriterFunc(msgType -> FlinkParquetWriters.buildWriter(dataFlinkType(), msgType)); - } - - @Override - protected void configureEqualityDelete(Parquet.DeleteWriteBuilder builder) { - builder.createWriterFunc( - msgType -> FlinkParquetWriters.buildWriter(equalityDeleteFlinkType(), msgType)); - } - - @Override - protected void configurePositionDelete(Parquet.DeleteWriteBuilder builder) { - builder.createWriterFunc( - msgType -> FlinkParquetWriters.buildWriter(positionDeleteFlinkType(), msgType)); - builder.transformPaths(path -> StringData.fromString(path.toString())); - } - - @Override - protected void configureDataWrite(ORC.DataWriteBuilder builder) { - builder.createWriterFunc( - (iSchema, typDesc) -> FlinkOrcWriter.buildWriter(dataFlinkType(), iSchema)); - } - - @Override - protected void configureEqualityDelete(ORC.DeleteWriteBuilder builder) { - builder.createWriterFunc( - (iSchema, typDesc) -> FlinkOrcWriter.buildWriter(equalityDeleteFlinkType(), iSchema)); - } - - @Override - protected void configurePositionDelete(ORC.DeleteWriteBuilder builder) { - builder.createWriterFunc( - (iSchema, typDesc) -> FlinkOrcWriter.buildWriter(positionDeleteFlinkType(), iSchema)); - builder.transformPaths(path -> StringData.fromString(path.toString())); - } - - private RowType dataFlinkType() { - if (dataFlinkType == null) { - Preconditions.checkNotNull(dataSchema(), "Data schema must not be null"); - this.dataFlinkType = FlinkSchemaUtil.convert(dataSchema()); - } - - return dataFlinkType; - } - - private RowType equalityDeleteFlinkType() { - if (equalityDeleteFlinkType == null) { - Preconditions.checkNotNull( - equalityDeleteRowSchema(), "Equality delete schema must not be null"); - this.equalityDeleteFlinkType = FlinkSchemaUtil.convert(equalityDeleteRowSchema()); - } - - return equalityDeleteFlinkType; - } - - private RowType positionDeleteFlinkType() { - if (positionDeleteFlinkType == null) { - // wrap the optional row schema into the position delete schema that contains path and - // position - Schema positionDeleteSchema = DeleteSchemaUtil.posDeleteSchema(positionDeleteRowSchema()); - this.positionDeleteFlinkType = FlinkSchemaUtil.convert(positionDeleteSchema); - } - - return positionDeleteFlinkType; - } - - static class Builder { - private final Table table; - private FileFormat dataFileFormat; - private Schema dataSchema; - private RowType dataFlinkType; - private SortOrder dataSortOrder; - private FileFormat deleteFileFormat; - private int[] equalityFieldIds; - private Schema equalityDeleteRowSchema; - private RowType equalityDeleteFlinkType; - private SortOrder equalityDeleteSortOrder; - private Schema positionDeleteRowSchema; - private RowType positionDeleteFlinkType; - - Builder(Table table) { - this.table = table; - - Map properties = table.properties(); - - String dataFileFormatName = - properties.getOrDefault(DEFAULT_FILE_FORMAT, DEFAULT_FILE_FORMAT_DEFAULT); - this.dataFileFormat = FileFormat.fromString(dataFileFormatName); - - String deleteFileFormatName = - properties.getOrDefault(DELETE_DEFAULT_FILE_FORMAT, dataFileFormatName); - this.deleteFileFormat = FileFormat.fromString(deleteFileFormatName); - } - - Builder dataFileFormat(FileFormat newDataFileFormat) { - this.dataFileFormat = newDataFileFormat; - return this; - } - - Builder 
dataSchema(Schema newDataSchema) { - this.dataSchema = newDataSchema; - return this; - } - - /** - * Sets a Flink type for data. - * - *
<p>
    If not set, the value is derived from the provided Iceberg schema. - */ - Builder dataFlinkType(RowType newDataFlinkType) { - this.dataFlinkType = newDataFlinkType; - return this; - } - - Builder dataSortOrder(SortOrder newDataSortOrder) { - this.dataSortOrder = newDataSortOrder; - return this; - } - - Builder deleteFileFormat(FileFormat newDeleteFileFormat) { - this.deleteFileFormat = newDeleteFileFormat; - return this; - } - - Builder equalityFieldIds(int[] newEqualityFieldIds) { - this.equalityFieldIds = newEqualityFieldIds; - return this; - } - - Builder equalityDeleteRowSchema(Schema newEqualityDeleteRowSchema) { - this.equalityDeleteRowSchema = newEqualityDeleteRowSchema; - return this; - } - - /** - * Sets a Flink type for equality deletes. - * - *
<p>
    If not set, the value is derived from the provided Iceberg schema. - */ - Builder equalityDeleteFlinkType(RowType newEqualityDeleteFlinkType) { - this.equalityDeleteFlinkType = newEqualityDeleteFlinkType; - return this; - } - - Builder equalityDeleteSortOrder(SortOrder newEqualityDeleteSortOrder) { - this.equalityDeleteSortOrder = newEqualityDeleteSortOrder; - return this; - } - - Builder positionDeleteRowSchema(Schema newPositionDeleteRowSchema) { - this.positionDeleteRowSchema = newPositionDeleteRowSchema; - return this; - } - - /** - * Sets a Flink type for position deletes. - * - *
<p>
    If not set, the value is derived from the provided Iceberg schema. - */ - Builder positionDeleteFlinkType(RowType newPositionDeleteFlinkType) { - this.positionDeleteFlinkType = newPositionDeleteFlinkType; - return this; - } - - FlinkFileWriterFactory build() { - boolean noEqualityDeleteConf = equalityFieldIds == null && equalityDeleteRowSchema == null; - boolean fullEqualityDeleteConf = equalityFieldIds != null && equalityDeleteRowSchema != null; - Preconditions.checkArgument( - noEqualityDeleteConf || fullEqualityDeleteConf, - "Equality field IDs and equality delete row schema must be set together"); - - return new FlinkFileWriterFactory( - table, - dataFileFormat, - dataSchema, - dataFlinkType, - dataSortOrder, - deleteFileFormat, - equalityFieldIds, - equalityDeleteRowSchema, - equalityDeleteFlinkType, - equalityDeleteSortOrder, - positionDeleteRowSchema, - positionDeleteFlinkType); - } - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkManifestUtil.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkManifestUtil.java deleted file mode 100644 index c7e8a2dea7cb..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkManifestUtil.java +++ /dev/null @@ -1,132 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
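A same-package sketch of the builder above (FlinkFileWriterFactory is package-private); the HadoopTables location and schema are assumptions. Formats default to the table properties read in the Builder constructor, and equality field ids must be paired with an equality delete row schema, as build() enforces.

    package org.apache.iceberg.flink.sink;

    import org.apache.iceberg.FileFormat;
    import org.apache.iceberg.PartitionSpec;
    import org.apache.iceberg.Schema;
    import org.apache.iceberg.Table;
    import org.apache.iceberg.hadoop.HadoopTables;
    import org.apache.iceberg.types.Types;

    public class FlinkFileWriterFactorySketch {
      public static void main(String[] args) {
        Schema schema =
            new Schema(
                Types.NestedField.required(1, "id", Types.LongType.get()),
                Types.NestedField.optional(2, "data", Types.StringType.get()));
        // Hypothetical throwaway table, only needed to seed the builder defaults.
        Table table =
            new HadoopTables().create(schema, PartitionSpec.unpartitioned(), "file:///tmp/wh/t");

        FlinkFileWriterFactory writerFactory =
            FlinkFileWriterFactory.builderFor(table)
                .dataSchema(table.schema())
                .dataFileFormat(FileFormat.PARQUET)
                .equalityFieldIds(new int[] {1})
                .equalityDeleteRowSchema(table.schema().select("id"))
                .build();

        System.out.println(writerFactory != null); // factory ready to create file writers
      }
    }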
- */ -package org.apache.iceberg.flink.sink; - -import java.io.IOException; -import java.util.List; -import java.util.Map; -import java.util.function.Supplier; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.DeleteFile; -import org.apache.iceberg.ManifestFile; -import org.apache.iceberg.ManifestFiles; -import org.apache.iceberg.ManifestWriter; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Table; -import org.apache.iceberg.io.CloseableIterable; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.io.OutputFile; -import org.apache.iceberg.io.WriteResult; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; - -class FlinkManifestUtil { - private static final int FORMAT_V2 = 2; - private static final Long DUMMY_SNAPSHOT_ID = 0L; - - private FlinkManifestUtil() {} - - static ManifestFile writeDataFiles( - OutputFile outputFile, PartitionSpec spec, List dataFiles) throws IOException { - ManifestWriter writer = - ManifestFiles.write(FORMAT_V2, spec, outputFile, DUMMY_SNAPSHOT_ID); - - try (ManifestWriter closeableWriter = writer) { - closeableWriter.addAll(dataFiles); - } - - return writer.toManifestFile(); - } - - static List readDataFiles( - ManifestFile manifestFile, FileIO io, Map specsById) - throws IOException { - try (CloseableIterable dataFiles = ManifestFiles.read(manifestFile, io, specsById)) { - return Lists.newArrayList(dataFiles); - } - } - - static ManifestOutputFileFactory createOutputFileFactory( - Supplier

    tableSupplier, - Map tableProps, - String flinkJobId, - String operatorUniqueId, - int subTaskId, - long attemptNumber) { - return new ManifestOutputFileFactory( - tableSupplier, tableProps, flinkJobId, operatorUniqueId, subTaskId, attemptNumber); - } - - /** - * Write the {@link WriteResult} to temporary manifest files. - * - * @param result all those DataFiles/DeleteFiles in this WriteResult should be written with same - * partition spec - */ - static DeltaManifests writeCompletedFiles( - WriteResult result, Supplier outputFileSupplier, PartitionSpec spec) - throws IOException { - - ManifestFile dataManifest = null; - ManifestFile deleteManifest = null; - - // Write the completed data files into a newly created data manifest file. - if (result.dataFiles() != null && result.dataFiles().length > 0) { - dataManifest = - writeDataFiles(outputFileSupplier.get(), spec, Lists.newArrayList(result.dataFiles())); - } - - // Write the completed delete files into a newly created delete manifest file. - if (result.deleteFiles() != null && result.deleteFiles().length > 0) { - OutputFile deleteManifestFile = outputFileSupplier.get(); - - ManifestWriter deleteManifestWriter = - ManifestFiles.writeDeleteManifest(FORMAT_V2, spec, deleteManifestFile, DUMMY_SNAPSHOT_ID); - try (ManifestWriter writer = deleteManifestWriter) { - for (DeleteFile deleteFile : result.deleteFiles()) { - writer.add(deleteFile); - } - } - - deleteManifest = deleteManifestWriter.toManifestFile(); - } - - return new DeltaManifests(dataManifest, deleteManifest, result.referencedDataFiles()); - } - - static WriteResult readCompletedFiles( - DeltaManifests deltaManifests, FileIO io, Map specsById) - throws IOException { - WriteResult.Builder builder = WriteResult.builder(); - - // Read the completed data files from persisted data manifest file. - if (deltaManifests.dataManifest() != null) { - builder.addDataFiles(readDataFiles(deltaManifests.dataManifest(), io, specsById)); - } - - // Read the completed delete files from persisted delete manifests file. - if (deltaManifests.deleteManifest() != null) { - try (CloseableIterable deleteFiles = - ManifestFiles.readDeleteManifest(deltaManifests.deleteManifest(), io, specsById)) { - builder.addDeleteFiles(deleteFiles); - } - } - - return builder.addReferencedDataFiles(deltaManifests.referencedDataFiles()).build(); - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkSink.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkSink.java deleted file mode 100644 index 769af7d77140..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkSink.java +++ /dev/null @@ -1,654 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
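The two helpers above are what the committer uses to stage a checkpoint's files outside of Flink state and to rebuild them on restore. A rough sketch of that round trip (exception handling omitted), assuming a loaded table, a ManifestOutputFileFactory named outputFileFactory, a list of data files, and a checkpoint id are already in hand:

    // `table`, `outputFileFactory`, `dataFiles` and `checkpointId` stand in for state the committer
    // already holds; the classes referenced here are the ones defined in this package.
    WriteResult pending = WriteResult.builder().addDataFiles(dataFiles).build();
    DeltaManifests deltaManifests =
        FlinkManifestUtil.writeCompletedFiles(
            pending, () -> outputFileFactory.create(checkpointId), table.spec());
    byte[] serialized =
        SimpleVersionedSerialization.writeVersionAndSerialize(
            DeltaManifestsSerializer.INSTANCE, deltaManifests);

    // `serialized` is kept in Flink operator state; on restore the files come back via:
    DeltaManifests restored =
        SimpleVersionedSerialization.readVersionAndDeSerialize(
            DeltaManifestsSerializer.INSTANCE, serialized);
    WriteResult recovered =
        FlinkManifestUtil.readCompletedFiles(restored, table.io(), table.specs());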
- */ -package org.apache.iceberg.flink.sink; - -import static org.apache.iceberg.TableProperties.AVRO_COMPRESSION; -import static org.apache.iceberg.TableProperties.AVRO_COMPRESSION_LEVEL; -import static org.apache.iceberg.TableProperties.ORC_COMPRESSION; -import static org.apache.iceberg.TableProperties.ORC_COMPRESSION_STRATEGY; -import static org.apache.iceberg.TableProperties.PARQUET_COMPRESSION; -import static org.apache.iceberg.TableProperties.PARQUET_COMPRESSION_LEVEL; -import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE; - -import java.io.IOException; -import java.io.UncheckedIOException; -import java.time.Duration; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.function.Function; -import org.apache.flink.api.common.functions.MapFunction; -import org.apache.flink.api.common.typeinfo.TypeInformation; -import org.apache.flink.api.common.typeinfo.Types; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.configuration.ReadableConfig; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.datastream.DataStreamSink; -import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator; -import org.apache.flink.streaming.api.functions.sink.DiscardingSink; -import org.apache.flink.table.api.TableSchema; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.util.DataFormatConverters; -import org.apache.flink.table.types.DataType; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.types.Row; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.DistributionMode; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.PartitionField; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.SerializableTable; -import org.apache.iceberg.Table; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.FlinkWriteConf; -import org.apache.iceberg.flink.FlinkWriteOptions; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.flink.util.FlinkCompatibilityUtil; -import org.apache.iceberg.io.WriteResult; -import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.apache.iceberg.types.TypeUtil; -import org.apache.iceberg.util.SerializableSupplier; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -public class FlinkSink { - private static final Logger LOG = LoggerFactory.getLogger(FlinkSink.class); - - private static final String ICEBERG_STREAM_WRITER_NAME = - IcebergStreamWriter.class.getSimpleName(); - private static final String ICEBERG_FILES_COMMITTER_NAME = - IcebergFilesCommitter.class.getSimpleName(); - - private FlinkSink() {} - - /** - * Initialize a {@link Builder} to export the data from generic input data stream into iceberg - * table. We use {@link RowData} inside the sink connector, so users need to provide a mapper - * function and a {@link TypeInformation} to convert those generic records to a RowData - * DataStream. - * - * @param input the generic source input data stream. 
- * @param mapper function to convert the generic data to {@link RowData} - * @param outputType to define the {@link TypeInformation} for the input data. - * @param the data type of records. - * @return {@link Builder} to connect the iceberg table. - */ - public static Builder builderFor( - DataStream input, MapFunction mapper, TypeInformation outputType) { - return new Builder().forMapperOutputType(input, mapper, outputType); - } - - /** - * Initialize a {@link Builder} to export the data from input data stream with {@link Row}s into - * iceberg table. We use {@link RowData} inside the sink connector, so users need to provide a - * {@link TableSchema} for builder to convert those {@link Row}s to a {@link RowData} DataStream. - * - * @param input the source input data stream with {@link Row}s. - * @param tableSchema defines the {@link TypeInformation} for input data. - * @return {@link Builder} to connect the iceberg table. - */ - public static Builder forRow(DataStream input, TableSchema tableSchema) { - RowType rowType = (RowType) tableSchema.toRowDataType().getLogicalType(); - DataType[] fieldDataTypes = tableSchema.getFieldDataTypes(); - - DataFormatConverters.RowConverter rowConverter = - new DataFormatConverters.RowConverter(fieldDataTypes); - return builderFor(input, rowConverter::toInternal, FlinkCompatibilityUtil.toTypeInfo(rowType)) - .tableSchema(tableSchema); - } - - /** - * Initialize a {@link Builder} to export the data from input data stream with {@link RowData}s - * into iceberg table. - * - * @param input the source input data stream with {@link RowData}s. - * @return {@link Builder} to connect the iceberg table. - */ - public static Builder forRowData(DataStream input) { - return new Builder().forRowData(input); - } - - public static class Builder { - private Function> inputCreator = null; - private TableLoader tableLoader; - private Table table; - private TableSchema tableSchema; - private List equalityFieldColumns = null; - private String uidPrefix = null; - private final Map snapshotProperties = Maps.newHashMap(); - private ReadableConfig readableConfig = new Configuration(); - private final Map writeOptions = Maps.newHashMap(); - private FlinkWriteConf flinkWriteConf = null; - - private Builder() {} - - private Builder forRowData(DataStream newRowDataInput) { - this.inputCreator = ignored -> newRowDataInput; - return this; - } - - private Builder forMapperOutputType( - DataStream input, MapFunction mapper, TypeInformation outputType) { - this.inputCreator = - newUidPrefix -> { - // Input stream order is crucial for some situation(e.g. in cdc case). Therefore, we - // need to set the parallelism - // of map operator same as its input to keep map operator chaining its input, and avoid - // rebalanced by default. - SingleOutputStreamOperator inputStream = - input.map(mapper, outputType).setParallelism(input.getParallelism()); - if (newUidPrefix != null) { - inputStream.name(operatorName(newUidPrefix)).uid(newUidPrefix + "-mapper"); - } - return inputStream; - }; - return this; - } - - /** - * This iceberg {@link Table} instance is used for initializing {@link IcebergStreamWriter} - * which will write all the records into {@link DataFile}s and emit them to downstream operator. - * Providing a table would avoid so many table loading from each separate task. - * - * @param newTable the loaded iceberg table instance. - * @return {@link Builder} to connect the iceberg table. 
- */ - public Builder table(Table newTable) { - this.table = newTable; - return this; - } - - /** - * The table loader is used for loading tables in {@link IcebergFilesCommitter} lazily, we need - * this loader because {@link Table} is not serializable and could not just use the loaded table - * from Builder#table in the remote task manager. - * - * @param newTableLoader to load iceberg table inside tasks. - * @return {@link Builder} to connect the iceberg table. - */ - public Builder tableLoader(TableLoader newTableLoader) { - this.tableLoader = newTableLoader; - return this; - } - - /** - * Set the write properties for Flink sink. View the supported properties in {@link - * FlinkWriteOptions} - */ - public Builder set(String property, String value) { - writeOptions.put(property, value); - return this; - } - - /** - * Set the write properties for Flink sink. View the supported properties in {@link - * FlinkWriteOptions} - */ - public Builder setAll(Map properties) { - writeOptions.putAll(properties); - return this; - } - - public Builder tableSchema(TableSchema newTableSchema) { - this.tableSchema = newTableSchema; - return this; - } - - public Builder overwrite(boolean newOverwrite) { - writeOptions.put(FlinkWriteOptions.OVERWRITE_MODE.key(), Boolean.toString(newOverwrite)); - return this; - } - - public Builder flinkConf(ReadableConfig config) { - this.readableConfig = config; - return this; - } - - /** - * Configure the write {@link DistributionMode} that the flink sink will use. Currently, flink - * support {@link DistributionMode#NONE} and {@link DistributionMode#HASH}. - * - * @param mode to specify the write distribution mode. - * @return {@link Builder} to connect the iceberg table. - */ - public Builder distributionMode(DistributionMode mode) { - Preconditions.checkArgument( - !DistributionMode.RANGE.equals(mode), - "Flink does not support 'range' write distribution mode now."); - if (mode != null) { - writeOptions.put(FlinkWriteOptions.DISTRIBUTION_MODE.key(), mode.modeName()); - } - return this; - } - - /** - * Configuring the write parallel number for iceberg stream writer. - * - * @param newWriteParallelism the number of parallel iceberg stream writer. - * @return {@link Builder} to connect the iceberg table. - */ - public Builder writeParallelism(int newWriteParallelism) { - writeOptions.put( - FlinkWriteOptions.WRITE_PARALLELISM.key(), Integer.toString(newWriteParallelism)); - return this; - } - - /** - * All INSERT/UPDATE_AFTER events from input stream will be transformed to UPSERT events, which - * means it will DELETE the old records and then INSERT the new records. In partitioned table, - * the partition fields should be a subset of equality fields, otherwise the old row that - * located in partition-A could not be deleted by the new row that located in partition-B. - * - * @param enabled indicate whether it should transform all INSERT/UPDATE_AFTER events to UPSERT. - * @return {@link Builder} to connect the iceberg table. - */ - public Builder upsert(boolean enabled) { - writeOptions.put(FlinkWriteOptions.WRITE_UPSERT_ENABLED.key(), Boolean.toString(enabled)); - return this; - } - - /** - * Configuring the equality field columns for iceberg table that accept CDC or UPSERT events. - * - * @param columns defines the iceberg table's key. - * @return {@link Builder} to connect the iceberg table. - */ - public Builder equalityFieldColumns(List columns) { - this.equalityFieldColumns = columns; - return this; - } - - /** - * Set the uid prefix for FlinkSink operators. 
Note that FlinkSink internally consists of - * multiple operators (like writer, committer, dummy sink, etc.), so the actual operator uid will be - * appended with a suffix such as "uidPrefix-writer".
    - *
    - * If provided, this prefix is also applied to operator names.
    - *
    - * Flink auto-generates an operator uid if it is not set explicitly. It is recommended - * best practice to set the uid for all operators before deploying to production. Flink offers - * the {@code pipeline.auto-generate-uid=false} option to disable auto-generation and force - * explicit setting of all operator uids.
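    - * <p>A minimal sketch (the streams and the prefix value here are hypothetical):
    - *
    - * <pre>{@code
    - * FlinkSink.forRowData(rowDataStream)
    - *     .tableLoader(tableLoader)
    - *     .uidPrefix("iceberg-sink")   // yields operator uids such as "iceberg-sink-writer"
    - *     .append();
    - * }</pre>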
    - *
    - * Be careful with setting this for an existing job, because now we are changing the operator - * uid from an auto-generated one to this new value. When deploying the change with a - * checkpoint, Flink won't be able to restore the previous Flink sink operator state (more - * specifically the committer operator state). You need to use {@code --allowNonRestoredState} - * to ignore the previous sink state. During restore Flink sink state is used to check if last - * commit was actually successful or not. {@code --allowNonRestoredState} can lead to data loss - * if the Iceberg commit failed in the last completed checkpoint. - * - * @param newPrefix prefix for Flink sink operator uid and name - * @return {@link Builder} to connect the iceberg table. - */ - public Builder uidPrefix(String newPrefix) { - this.uidPrefix = newPrefix; - return this; - } - - public Builder setSnapshotProperties(Map properties) { - snapshotProperties.putAll(properties); - return this; - } - - public Builder setSnapshotProperty(String property, String value) { - snapshotProperties.put(property, value); - return this; - } - - public Builder toBranch(String branch) { - writeOptions.put(FlinkWriteOptions.BRANCH.key(), branch); - return this; - } - - private DataStreamSink chainIcebergOperators() { - Preconditions.checkArgument( - inputCreator != null, - "Please use forRowData() or forMapperOutputType() to initialize the input DataStream."); - Preconditions.checkNotNull(tableLoader, "Table loader shouldn't be null"); - - DataStream rowDataInput = inputCreator.apply(uidPrefix); - - if (table == null) { - if (!tableLoader.isOpen()) { - tableLoader.open(); - } - - try (TableLoader loader = tableLoader) { - this.table = loader.loadTable(); - } catch (IOException e) { - throw new UncheckedIOException( - "Failed to load iceberg table from table loader: " + tableLoader, e); - } - } - - flinkWriteConf = new FlinkWriteConf(table, writeOptions, readableConfig); - - // Find out the equality field id list based on the user-provided equality field column names. - List equalityFieldIds = checkAndGetEqualityFieldIds(); - - // Convert the requested flink table schema to flink row type. - RowType flinkRowType = toFlinkRowType(table.schema(), tableSchema); - - // Distribute the records from input data stream based on the write.distribution-mode and - // equality fields. - DataStream distributeStream = - distributeDataStream( - rowDataInput, equalityFieldIds, table.spec(), table.schema(), flinkRowType); - - // Add parallel writers that append rows to files - SingleOutputStreamOperator writerStream = - appendWriter(distributeStream, flinkRowType, equalityFieldIds); - - // Add single-parallelism committer that commits files - // after successful checkpoint or end of input - SingleOutputStreamOperator committerStream = appendCommitter(writerStream); - - // Add dummy discard sink - return appendDummySink(committerStream); - } - - /** - * Append the iceberg sink operators to write records to iceberg table. - * - * @return {@link DataStreamSink} for sink. - */ - public DataStreamSink append() { - return chainIcebergOperators(); - } - - private String operatorName(String suffix) { - return uidPrefix != null ? 
uidPrefix + "-" + suffix : suffix; - } - - @VisibleForTesting - List checkAndGetEqualityFieldIds() { - List equalityFieldIds = Lists.newArrayList(table.schema().identifierFieldIds()); - if (equalityFieldColumns != null && !equalityFieldColumns.isEmpty()) { - Set equalityFieldSet = - Sets.newHashSetWithExpectedSize(equalityFieldColumns.size()); - for (String column : equalityFieldColumns) { - org.apache.iceberg.types.Types.NestedField field = table.schema().findField(column); - Preconditions.checkNotNull( - field, - "Missing required equality field column '%s' in table schema %s", - column, - table.schema()); - equalityFieldSet.add(field.fieldId()); - } - - if (!equalityFieldSet.equals(table.schema().identifierFieldIds())) { - LOG.warn( - "The configured equality field column IDs {} are not matched with the schema identifier field IDs" - + " {}, use job specified equality field columns as the equality fields by default.", - equalityFieldSet, - table.schema().identifierFieldIds()); - } - equalityFieldIds = Lists.newArrayList(equalityFieldSet); - } - return equalityFieldIds; - } - - @SuppressWarnings("unchecked") - private DataStreamSink appendDummySink( - SingleOutputStreamOperator committerStream) { - DataStreamSink resultStream = - committerStream - .addSink(new DiscardingSink()) - .name(operatorName(String.format("IcebergSink %s", this.table.name()))) - .setParallelism(1); - if (uidPrefix != null) { - resultStream = resultStream.uid(uidPrefix + "-dummysink"); - } - return resultStream; - } - - private SingleOutputStreamOperator appendCommitter( - SingleOutputStreamOperator writerStream) { - IcebergFilesCommitter filesCommitter = - new IcebergFilesCommitter( - tableLoader, - flinkWriteConf.overwriteMode(), - snapshotProperties, - flinkWriteConf.workerPoolSize(), - flinkWriteConf.branch(), - table.spec()); - SingleOutputStreamOperator committerStream = - writerStream - .transform(operatorName(ICEBERG_FILES_COMMITTER_NAME), Types.VOID, filesCommitter) - .setParallelism(1) - .setMaxParallelism(1); - if (uidPrefix != null) { - committerStream = committerStream.uid(uidPrefix + "-committer"); - } - return committerStream; - } - - private SingleOutputStreamOperator appendWriter( - DataStream input, RowType flinkRowType, List equalityFieldIds) { - // Validate the equality fields and partition fields if we enable the upsert mode. - if (flinkWriteConf.upsertMode()) { - Preconditions.checkState( - !flinkWriteConf.overwriteMode(), - "OVERWRITE mode shouldn't be enable when configuring to use UPSERT data stream."); - Preconditions.checkState( - !equalityFieldIds.isEmpty(), - "Equality field columns shouldn't be empty when configuring to use UPSERT data stream."); - if (!table.spec().isUnpartitioned()) { - for (PartitionField partitionField : table.spec().fields()) { - Preconditions.checkState( - equalityFieldIds.contains(partitionField.sourceId()), - "In UPSERT mode, partition field '%s' should be included in equality fields: '%s'", - partitionField, - equalityFieldColumns); - } - } - } - - SerializableTable serializableTable = (SerializableTable) SerializableTable.copyOf(table); - Duration tableRefreshInterval = flinkWriteConf.tableRefreshInterval(); - - SerializableSupplier
    tableSupplier; - if (tableRefreshInterval != null) { - tableSupplier = - new CachingTableSupplier(serializableTable, tableLoader, tableRefreshInterval); - } else { - tableSupplier = () -> serializableTable; - } - - IcebergStreamWriter streamWriter = - createStreamWriter(tableSupplier, flinkWriteConf, flinkRowType, equalityFieldIds); - - int parallelism = - flinkWriteConf.writeParallelism() == null - ? input.getParallelism() - : flinkWriteConf.writeParallelism(); - SingleOutputStreamOperator writerStream = - input - .transform( - operatorName(ICEBERG_STREAM_WRITER_NAME), - TypeInformation.of(WriteResult.class), - streamWriter) - .setParallelism(parallelism); - if (uidPrefix != null) { - writerStream = writerStream.uid(uidPrefix + "-writer"); - } - return writerStream; - } - - private DataStream distributeDataStream( - DataStream input, - List equalityFieldIds, - PartitionSpec partitionSpec, - Schema iSchema, - RowType flinkRowType) { - DistributionMode writeMode = flinkWriteConf.distributionMode(); - - LOG.info("Write distribution mode is '{}'", writeMode.modeName()); - switch (writeMode) { - case NONE: - if (equalityFieldIds.isEmpty()) { - return input; - } else { - LOG.info("Distribute rows by equality fields, because there are equality fields set"); - return input.keyBy( - new EqualityFieldKeySelector(iSchema, flinkRowType, equalityFieldIds)); - } - - case HASH: - if (equalityFieldIds.isEmpty()) { - if (partitionSpec.isUnpartitioned()) { - LOG.warn( - "Fallback to use 'none' distribution mode, because there are no equality fields set " - + "and table is unpartitioned"); - return input; - } else { - return input.keyBy(new PartitionKeySelector(partitionSpec, iSchema, flinkRowType)); - } - } else { - if (partitionSpec.isUnpartitioned()) { - LOG.info( - "Distribute rows by equality fields, because there are equality fields set " - + "and table is unpartitioned"); - return input.keyBy( - new EqualityFieldKeySelector(iSchema, flinkRowType, equalityFieldIds)); - } else { - for (PartitionField partitionField : partitionSpec.fields()) { - Preconditions.checkState( - equalityFieldIds.contains(partitionField.sourceId()), - "In 'hash' distribution mode with equality fields set, partition field '%s' " - + "should be included in equality fields: '%s'", - partitionField, - equalityFieldColumns); - } - return input.keyBy(new PartitionKeySelector(partitionSpec, iSchema, flinkRowType)); - } - } - - case RANGE: - if (equalityFieldIds.isEmpty()) { - LOG.warn( - "Fallback to use 'none' distribution mode, because there are no equality fields set " - + "and {}=range is not supported yet in flink", - WRITE_DISTRIBUTION_MODE); - return input; - } else { - LOG.info( - "Distribute rows by equality fields, because there are equality fields set " - + "and{}=range is not supported yet in flink", - WRITE_DISTRIBUTION_MODE); - return input.keyBy( - new EqualityFieldKeySelector(iSchema, flinkRowType, equalityFieldIds)); - } - - default: - throw new RuntimeException("Unrecognized " + WRITE_DISTRIBUTION_MODE + ": " + writeMode); - } - } - } - - static RowType toFlinkRowType(Schema schema, TableSchema requestedSchema) { - if (requestedSchema != null) { - // Convert the flink schema to iceberg schema firstly, then reassign ids to match the existing - // iceberg schema. - Schema writeSchema = TypeUtil.reassignIds(FlinkSchemaUtil.convert(requestedSchema), schema); - TypeUtil.validateWriteSchema(schema, writeSchema, true, true); - - // We use this flink schema to read values from RowData. 
Flink's TINYINT and SMALLINT are - // promoted to iceberg INTEGER, which means that reading a TINYINT (backed by 1 byte) through the - // iceberg table schema would read 4 bytes instead of 1 and corrupt the byte array in - // BinaryRowData. So here we must use the flink schema. - return (RowType) requestedSchema.toRowDataType().getLogicalType(); - } else { - return FlinkSchemaUtil.convert(schema); - } - } - - static IcebergStreamWriter createStreamWriter( - SerializableSupplier
    tableSupplier, - FlinkWriteConf flinkWriteConf, - RowType flinkRowType, - List equalityFieldIds) { - Preconditions.checkArgument(tableSupplier != null, "Iceberg table supplier shouldn't be null"); - - Table initTable = tableSupplier.get(); - FileFormat format = flinkWriteConf.dataFileFormat(); - TaskWriterFactory taskWriterFactory = - new RowDataTaskWriterFactory( - tableSupplier, - flinkRowType, - flinkWriteConf.targetDataFileSize(), - format, - writeProperties(initTable, format, flinkWriteConf), - equalityFieldIds, - flinkWriteConf.upsertMode()); - - return new IcebergStreamWriter<>(initTable.name(), taskWriterFactory); - } - - /** - * Based on the {@link FileFormat} overwrites the table level compression properties for the table - * write. - * - * @param table The table to get the table level settings - * @param format The FileFormat to use - * @param conf The write configuration - * @return The properties to use for writing - */ - private static Map writeProperties( - Table table, FileFormat format, FlinkWriteConf conf) { - Map writeProperties = Maps.newHashMap(table.properties()); - - switch (format) { - case PARQUET: - writeProperties.put(PARQUET_COMPRESSION, conf.parquetCompressionCodec()); - String parquetCompressionLevel = conf.parquetCompressionLevel(); - if (parquetCompressionLevel != null) { - writeProperties.put(PARQUET_COMPRESSION_LEVEL, parquetCompressionLevel); - } - - break; - case AVRO: - writeProperties.put(AVRO_COMPRESSION, conf.avroCompressionCodec()); - String avroCompressionLevel = conf.avroCompressionLevel(); - if (avroCompressionLevel != null) { - writeProperties.put(AVRO_COMPRESSION_LEVEL, conf.avroCompressionLevel()); - } - - break; - case ORC: - writeProperties.put(ORC_COMPRESSION, conf.orcCompressionCodec()); - writeProperties.put(ORC_COMPRESSION_STRATEGY, conf.orcCompressionStrategy()); - break; - default: - throw new IllegalArgumentException(String.format("Unknown file format %s", format)); - } - - return writeProperties; - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitter.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitter.java deleted file mode 100644 index b9bceaa9311d..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitter.java +++ /dev/null @@ -1,516 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
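Putting the builder above together, a typical streaming job wires a RowData stream into the sink and lets checkpoints drive the commits. A rough sketch; the upstream DataStream, the Hadoop table location, and the "id" key column are placeholders:

    static void appendToIceberg(StreamExecutionEnvironment env, DataStream<RowData> rowDataStream)
        throws Exception {
      // Commits happen on successful checkpoints, so checkpointing should be enabled on `env`.
      TableLoader tableLoader = TableLoader.fromHadoopTable("hdfs://nn:8020/warehouse/db/tbl");

      FlinkSink.forRowData(rowDataStream)
          .tableLoader(tableLoader)
          .equalityFieldColumns(java.util.Arrays.asList("id"))   // hypothetical key column
          .upsert(true)                 // rewrite INSERT/UPDATE_AFTER events as UPSERTs
          .writeParallelism(2)
          .uidPrefix("iceberg-sink")
          .append();

      env.execute("iceberg-sink-example");
    }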
- */ -package org.apache.iceberg.flink.sink; - -import java.io.IOException; -import java.util.Arrays; -import java.util.Comparator; -import java.util.List; -import java.util.Map; -import java.util.NavigableMap; -import java.util.SortedMap; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.TimeUnit; -import org.apache.flink.api.common.state.ListState; -import org.apache.flink.api.common.state.ListStateDescriptor; -import org.apache.flink.api.common.typeinfo.BasicTypeInfo; -import org.apache.flink.api.common.typeinfo.PrimitiveArrayTypeInfo; -import org.apache.flink.core.io.SimpleVersionedSerialization; -import org.apache.flink.runtime.state.StateInitializationContext; -import org.apache.flink.runtime.state.StateSnapshotContext; -import org.apache.flink.streaming.api.operators.AbstractStreamOperator; -import org.apache.flink.streaming.api.operators.BoundedOneInput; -import org.apache.flink.streaming.api.operators.OneInputStreamOperator; -import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; -import org.apache.flink.table.runtime.typeutils.SortedMapTypeInfo; -import org.apache.iceberg.AppendFiles; -import org.apache.iceberg.ManifestFile; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.ReplacePartitions; -import org.apache.iceberg.RowDelta; -import org.apache.iceberg.Snapshot; -import org.apache.iceberg.SnapshotUpdate; -import org.apache.iceberg.Table; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.io.WriteResult; -import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; -import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.base.Strings; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.types.Comparators; -import org.apache.iceberg.types.Types; -import org.apache.iceberg.util.PropertyUtil; -import org.apache.iceberg.util.ThreadPools; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -class IcebergFilesCommitter extends AbstractStreamOperator - implements OneInputStreamOperator, BoundedOneInput { - - private static final long serialVersionUID = 1L; - private static final long INITIAL_CHECKPOINT_ID = -1L; - private static final byte[] EMPTY_MANIFEST_DATA = new byte[0]; - - private static final Logger LOG = LoggerFactory.getLogger(IcebergFilesCommitter.class); - private static final String FLINK_JOB_ID = "flink.job-id"; - private static final String OPERATOR_ID = "flink.operator-id"; - - // The max checkpoint id we've committed to iceberg table. As the flink's checkpoint is always - // increasing, so we could correctly commit all the data files whose checkpoint id is greater than - // the max committed one to iceberg table, for avoiding committing the same data files twice. This - // id will be attached to iceberg's meta when committing the iceberg transaction. - private static final String MAX_COMMITTED_CHECKPOINT_ID = "flink.max-committed-checkpoint-id"; - static final String MAX_CONTINUOUS_EMPTY_COMMITS = "flink.max-continuous-empty-commits"; - - // TableLoader to load iceberg table lazily. 
- private final TableLoader tableLoader; - private final boolean replacePartitions; - private final Map snapshotProperties; - - // A sorted map to maintain the completed data files for each pending checkpointId (which have not - // been committed to iceberg table). We need a sorted map here because there's possible that few - // checkpoints snapshot failed, for example: the 1st checkpoint have 2 data files <1, >, the 2st checkpoint have 1 data files <2, >. Snapshot for checkpoint#1 - // interrupted because of network/disk failure etc, while we don't expect any data loss in iceberg - // table. So we keep the finished files <1, > in memory and retry to commit iceberg - // table when the next checkpoint happen. - private final NavigableMap dataFilesPerCheckpoint = Maps.newTreeMap(); - - // The completed files cache for current checkpoint. Once the snapshot barrier received, it will - // be flushed to the 'dataFilesPerCheckpoint'. - private final List writeResultsOfCurrentCkpt = Lists.newArrayList(); - private final String branch; - - // It will have an unique identifier for one job. - private transient String flinkJobId; - private transient String operatorUniqueId; - private transient Table table; - private transient IcebergFilesCommitterMetrics committerMetrics; - private transient ManifestOutputFileFactory manifestOutputFileFactory; - private transient long maxCommittedCheckpointId; - private transient int continuousEmptyCheckpoints; - private transient int maxContinuousEmptyCommits; - // There're two cases that we restore from flink checkpoints: the first case is restoring from - // snapshot created by the same flink job; another case is restoring from snapshot created by - // another different job. For the second case, we need to maintain the old flink job's id in flink - // state backend to find the max-committed-checkpoint-id when traversing iceberg table's - // snapshots. - private static final ListStateDescriptor JOB_ID_DESCRIPTOR = - new ListStateDescriptor<>("iceberg-flink-job-id", BasicTypeInfo.STRING_TYPE_INFO); - private transient ListState jobIdState; - // All pending checkpoints states for this function. - private static final ListStateDescriptor> STATE_DESCRIPTOR = - buildStateDescriptor(); - private transient ListState> checkpointsState; - - private final Integer workerPoolSize; - private final PartitionSpec spec; - private transient ExecutorService workerPool; - - IcebergFilesCommitter( - TableLoader tableLoader, - boolean replacePartitions, - Map snapshotProperties, - Integer workerPoolSize, - String branch, - PartitionSpec spec) { - this.tableLoader = tableLoader; - this.replacePartitions = replacePartitions; - this.snapshotProperties = snapshotProperties; - this.workerPoolSize = workerPoolSize; - this.branch = branch; - this.spec = spec; - } - - @Override - public void initializeState(StateInitializationContext context) throws Exception { - super.initializeState(context); - this.flinkJobId = getContainingTask().getEnvironment().getJobID().toString(); - this.operatorUniqueId = getRuntimeContext().getOperatorUniqueID(); - - // Open the table loader and load the table. 
- this.tableLoader.open(); - this.table = tableLoader.loadTable(); - this.committerMetrics = new IcebergFilesCommitterMetrics(super.metrics, table.name()); - - maxContinuousEmptyCommits = - PropertyUtil.propertyAsInt(table.properties(), MAX_CONTINUOUS_EMPTY_COMMITS, 10); - Preconditions.checkArgument( - maxContinuousEmptyCommits > 0, MAX_CONTINUOUS_EMPTY_COMMITS + " must be positive"); - - int subTaskId = getRuntimeContext().getIndexOfThisSubtask(); - int attemptId = getRuntimeContext().getAttemptNumber(); - this.manifestOutputFileFactory = - FlinkManifestUtil.createOutputFileFactory( - () -> table, table.properties(), flinkJobId, operatorUniqueId, subTaskId, attemptId); - this.maxCommittedCheckpointId = INITIAL_CHECKPOINT_ID; - - this.checkpointsState = context.getOperatorStateStore().getListState(STATE_DESCRIPTOR); - this.jobIdState = context.getOperatorStateStore().getListState(JOB_ID_DESCRIPTOR); - if (context.isRestored()) { - Iterable jobIdIterable = jobIdState.get(); - if (jobIdIterable == null || !jobIdIterable.iterator().hasNext()) { - LOG.warn( - "Failed to restore committer state. This can happen when operator uid changed and Flink " - + "allowNonRestoredState is enabled. Best practice is to explicitly set the operator id " - + "via FlinkSink#Builder#uidPrefix() so that the committer operator uid is stable. " - + "Otherwise, Flink auto generate an operator uid based on job topology." - + "With that, operator uid is subjective to change upon topology change."); - return; - } - - String restoredFlinkJobId = jobIdIterable.iterator().next(); - Preconditions.checkState( - !Strings.isNullOrEmpty(restoredFlinkJobId), - "Flink job id parsed from checkpoint snapshot shouldn't be null or empty"); - - // Since flink's checkpoint id will start from the max-committed-checkpoint-id + 1 in the new - // flink job even if it's restored from a snapshot created by another different flink job, so - // it's safe to assign the max committed checkpoint id from restored flink job to the current - // flink job. - this.maxCommittedCheckpointId = - getMaxCommittedCheckpointId(table, restoredFlinkJobId, operatorUniqueId, branch); - - NavigableMap uncommittedDataFiles = - Maps.newTreeMap(checkpointsState.get().iterator().next()) - .tailMap(maxCommittedCheckpointId, false); - if (!uncommittedDataFiles.isEmpty()) { - // Committed all uncommitted data files from the old flink job to iceberg table. - long maxUncommittedCheckpointId = uncommittedDataFiles.lastKey(); - commitUpToCheckpoint( - uncommittedDataFiles, restoredFlinkJobId, operatorUniqueId, maxUncommittedCheckpointId); - } - } - } - - @Override - public void snapshotState(StateSnapshotContext context) throws Exception { - super.snapshotState(context); - long checkpointId = context.getCheckpointId(); - LOG.info( - "Start to flush snapshot state to state backend, table: {}, checkpointId: {}", - table, - checkpointId); - - // Update the checkpoint state. - long startNano = System.nanoTime(); - dataFilesPerCheckpoint.put(checkpointId, writeToManifest(checkpointId)); - // Reset the snapshot state to the latest state. - checkpointsState.clear(); - checkpointsState.add(dataFilesPerCheckpoint); - - jobIdState.clear(); - jobIdState.add(flinkJobId); - - // Clear the local buffer for current checkpoint. 
- writeResultsOfCurrentCkpt.clear(); - committerMetrics.checkpointDuration( - TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startNano)); - } - - @Override - public void notifyCheckpointComplete(long checkpointId) throws Exception { - super.notifyCheckpointComplete(checkpointId); - // It's possible that we have the following events: - // 1. snapshotState(ckpId); - // 2. snapshotState(ckpId+1); - // 3. notifyCheckpointComplete(ckpId+1); - // 4. notifyCheckpointComplete(ckpId); - // For step#4, we don't need to commit iceberg table again because in step#3 we've committed all - // the files, - // Besides, we need to maintain the max-committed-checkpoint-id to be increasing. - if (checkpointId > maxCommittedCheckpointId) { - LOG.info("Checkpoint {} completed. Attempting commit.", checkpointId); - commitUpToCheckpoint(dataFilesPerCheckpoint, flinkJobId, operatorUniqueId, checkpointId); - this.maxCommittedCheckpointId = checkpointId; - } else { - LOG.info( - "Skipping committing checkpoint {}. {} is already committed.", - checkpointId, - maxCommittedCheckpointId); - } - - // reload the table in case new configuration is needed - this.table = tableLoader.loadTable(); - } - - private void commitUpToCheckpoint( - NavigableMap deltaManifestsMap, - String newFlinkJobId, - String operatorId, - long checkpointId) - throws IOException { - NavigableMap pendingMap = deltaManifestsMap.headMap(checkpointId, true); - List manifests = Lists.newArrayList(); - NavigableMap pendingResults = Maps.newTreeMap(); - for (Map.Entry e : pendingMap.entrySet()) { - if (Arrays.equals(EMPTY_MANIFEST_DATA, e.getValue())) { - // Skip the empty flink manifest. - continue; - } - - DeltaManifests deltaManifests = - SimpleVersionedSerialization.readVersionAndDeSerialize( - DeltaManifestsSerializer.INSTANCE, e.getValue()); - pendingResults.put( - e.getKey(), - FlinkManifestUtil.readCompletedFiles(deltaManifests, table.io(), table.specs())); - manifests.addAll(deltaManifests.manifests()); - } - - CommitSummary summary = new CommitSummary(pendingResults); - commitPendingResult(pendingResults, summary, newFlinkJobId, operatorId, checkpointId); - committerMetrics.updateCommitSummary(summary); - pendingMap.clear(); - deleteCommittedManifests(manifests, newFlinkJobId, checkpointId); - } - - private void commitPendingResult( - NavigableMap pendingResults, - CommitSummary summary, - String newFlinkJobId, - String operatorId, - long checkpointId) { - long totalFiles = summary.dataFilesCount() + summary.deleteFilesCount(); - continuousEmptyCheckpoints = totalFiles == 0 ? continuousEmptyCheckpoints + 1 : 0; - if (totalFiles != 0 || continuousEmptyCheckpoints % maxContinuousEmptyCommits == 0) { - if (replacePartitions) { - replacePartitions(pendingResults, summary, newFlinkJobId, operatorId, checkpointId); - } else { - commitDeltaTxn(pendingResults, summary, newFlinkJobId, operatorId, checkpointId); - } - continuousEmptyCheckpoints = 0; - } else { - LOG.info("Skip commit for checkpoint {} due to no data files or delete files.", checkpointId); - } - } - - private void deleteCommittedManifests( - List manifests, String newFlinkJobId, long checkpointId) { - for (ManifestFile manifest : manifests) { - try { - table.io().deleteFile(manifest.path()); - } catch (Exception e) { - // The flink manifests cleaning failure shouldn't abort the completed checkpoint. 
- String details = - MoreObjects.toStringHelper(this) - .add("flinkJobId", newFlinkJobId) - .add("checkpointId", checkpointId) - .add("manifestPath", manifest.path()) - .toString(); - LOG.warn( - "The iceberg transaction has been committed, but we failed to clean the temporary flink manifests: {}", - details, - e); - } - } - } - - private void replacePartitions( - NavigableMap pendingResults, - CommitSummary summary, - String newFlinkJobId, - String operatorId, - long checkpointId) { - Preconditions.checkState( - summary.deleteFilesCount() == 0, "Cannot overwrite partitions with delete files."); - // Commit the overwrite transaction. - ReplacePartitions dynamicOverwrite = table.newReplacePartitions().scanManifestsWith(workerPool); - for (WriteResult result : pendingResults.values()) { - Preconditions.checkState( - result.referencedDataFiles().length == 0, "Should have no referenced data files."); - Arrays.stream(result.dataFiles()).forEach(dynamicOverwrite::addFile); - } - - commitOperation( - dynamicOverwrite, - summary, - "dynamic partition overwrite", - newFlinkJobId, - operatorId, - checkpointId); - } - - private void commitDeltaTxn( - NavigableMap pendingResults, - CommitSummary summary, - String newFlinkJobId, - String operatorId, - long checkpointId) { - if (summary.deleteFilesCount() == 0) { - // To be compatible with iceberg format V1. - AppendFiles appendFiles = table.newAppend().scanManifestsWith(workerPool); - for (WriteResult result : pendingResults.values()) { - Preconditions.checkState( - result.referencedDataFiles().length == 0, - "Should have no referenced data files for append."); - Arrays.stream(result.dataFiles()).forEach(appendFiles::appendFile); - } - commitOperation(appendFiles, summary, "append", newFlinkJobId, operatorId, checkpointId); - } else { - // To be compatible with iceberg format V2. - for (Map.Entry e : pendingResults.entrySet()) { - // We don't commit the merged result into a single transaction because for the sequential - // transaction txn1 and txn2, the equality-delete files of txn2 are required to be applied - // to data files from txn1. Committing the merged one will lead to the incorrect delete - // semantic. - WriteResult result = e.getValue(); - - // Row delta validations are not needed for streaming changes that write equality deletes. - // Equality deletes are applied to data in all previous sequence numbers, so retries may - // push deletes further in the future, but do not affect correctness. Position deletes - // committed to the table in this path are used only to delete rows from data files that are - // being added in this commit. There is no way for data files added along with the delete - // files to be concurrently removed, so there is no need to validate the files referenced by - // the position delete files that are being committed. 
- RowDelta rowDelta = table.newRowDelta().scanManifestsWith(workerPool); - - Arrays.stream(result.dataFiles()).forEach(rowDelta::addRows); - Arrays.stream(result.deleteFiles()).forEach(rowDelta::addDeletes); - commitOperation(rowDelta, summary, "rowDelta", newFlinkJobId, operatorId, e.getKey()); - } - } - } - - private void commitOperation( - SnapshotUpdate operation, - CommitSummary summary, - String description, - String newFlinkJobId, - String operatorId, - long checkpointId) { - LOG.info( - "Committing {} for checkpoint {} to table {} branch {} with summary: {}", - description, - checkpointId, - table.name(), - branch, - summary); - snapshotProperties.forEach(operation::set); - // custom snapshot metadata properties will be overridden if they conflict with internal ones - // used by the sink. - operation.set(MAX_COMMITTED_CHECKPOINT_ID, Long.toString(checkpointId)); - operation.set(FLINK_JOB_ID, newFlinkJobId); - operation.set(OPERATOR_ID, operatorId); - operation.toBranch(branch); - - long startNano = System.nanoTime(); - operation.commit(); // abort is automatically called if this fails. - long durationMs = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startNano); - LOG.info( - "Committed {} to table: {}, branch: {}, checkpointId {} in {} ms", - description, - table.name(), - branch, - checkpointId, - durationMs); - committerMetrics.commitDuration(durationMs); - } - - @Override - public void processElement(StreamRecord element) { - this.writeResultsOfCurrentCkpt.add(element.getValue()); - } - - @Override - public void endInput() throws IOException { - // Flush the buffered data files into 'dataFilesPerCheckpoint' firstly. - long currentCheckpointId = Long.MAX_VALUE; - dataFilesPerCheckpoint.put(currentCheckpointId, writeToManifest(currentCheckpointId)); - writeResultsOfCurrentCkpt.clear(); - - commitUpToCheckpoint(dataFilesPerCheckpoint, flinkJobId, operatorUniqueId, currentCheckpointId); - } - - /** - * Write all the complete data files to a newly created manifest file and return the manifest's - * avro serialized bytes. - */ - private byte[] writeToManifest(long checkpointId) throws IOException { - if (writeResultsOfCurrentCkpt.isEmpty()) { - return EMPTY_MANIFEST_DATA; - } - - WriteResult result = WriteResult.builder().addAll(writeResultsOfCurrentCkpt).build(); - DeltaManifests deltaManifests = - FlinkManifestUtil.writeCompletedFiles( - result, () -> manifestOutputFileFactory.create(checkpointId), spec); - - return SimpleVersionedSerialization.writeVersionAndSerialize( - DeltaManifestsSerializer.INSTANCE, deltaManifests); - } - - @Override - public void open() throws Exception { - super.open(); - - final String operatorID = getRuntimeContext().getOperatorUniqueID(); - this.workerPool = - ThreadPools.newWorkerPool("iceberg-worker-pool-" + operatorID, workerPoolSize); - } - - @Override - public void close() throws Exception { - if (tableLoader != null) { - tableLoader.close(); - } - - if (workerPool != null) { - workerPool.shutdown(); - } - } - - @VisibleForTesting - static ListStateDescriptor> buildStateDescriptor() { - Comparator longComparator = Comparators.forType(Types.LongType.get()); - // Construct a SortedMapTypeInfo. 
- SortedMapTypeInfo sortedMapTypeInfo = - new SortedMapTypeInfo<>( - BasicTypeInfo.LONG_TYPE_INFO, - PrimitiveArrayTypeInfo.BYTE_PRIMITIVE_ARRAY_TYPE_INFO, - longComparator); - return new ListStateDescriptor<>("iceberg-files-committer-state", sortedMapTypeInfo); - } - - static long getMaxCommittedCheckpointId( - Table table, String flinkJobId, String operatorId, String branch) { - Snapshot snapshot = table.snapshot(branch); - long lastCommittedCheckpointId = INITIAL_CHECKPOINT_ID; - - while (snapshot != null) { - Map summary = snapshot.summary(); - String snapshotFlinkJobId = summary.get(FLINK_JOB_ID); - String snapshotOperatorId = summary.get(OPERATOR_ID); - if (flinkJobId.equals(snapshotFlinkJobId) - && (snapshotOperatorId == null || snapshotOperatorId.equals(operatorId))) { - String value = summary.get(MAX_COMMITTED_CHECKPOINT_ID); - if (value != null) { - lastCommittedCheckpointId = Long.parseLong(value); - break; - } - } - Long parentSnapshotId = snapshot.parentId(); - snapshot = parentSnapshotId != null ? table.snapshot(parentSnapshotId) : null; - } - - return lastCommittedCheckpointId; - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitterMetrics.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitterMetrics.java deleted file mode 100644 index 9de0d6aaa551..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitterMetrics.java +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
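On restore, the committer above first resolves the last checkpoint already committed to the branch and then keeps only the newer entries from its state. A rough sketch of that lookup; the table, the restored job and operator ids, the branch name, and the restored checkpoint map are placeholders:

    // `table`, `restoredFlinkJobId`, `operatorUniqueId`, `branch` and `restoredState` stand in for
    // values the committer reads from Flink operator state on restore.
    long maxCommitted =
        IcebergFilesCommitter.getMaxCommittedCheckpointId(
            table, restoredFlinkJobId, operatorUniqueId, branch);
    NavigableMap<Long, byte[]> stillPending =
        Maps.newTreeMap(restoredState).tailMap(maxCommitted, false);
    // everything left in `stillPending` has not been committed yet and is replayed oldest-first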
- */ -package org.apache.iceberg.flink.sink; - -import java.util.concurrent.TimeUnit; -import java.util.concurrent.atomic.AtomicLong; -import org.apache.flink.metrics.Counter; -import org.apache.flink.metrics.Gauge; -import org.apache.flink.metrics.MetricGroup; - -class IcebergFilesCommitterMetrics { - private final AtomicLong lastCheckpointDurationMs = new AtomicLong(); - private final AtomicLong lastCommitDurationMs = new AtomicLong(); - private final ElapsedTimeGauge elapsedSecondsSinceLastSuccessfulCommit; - private final Counter committedDataFilesCount; - private final Counter committedDataFilesRecordCount; - private final Counter committedDataFilesByteCount; - private final Counter committedDeleteFilesCount; - private final Counter committedDeleteFilesRecordCount; - private final Counter committedDeleteFilesByteCount; - - IcebergFilesCommitterMetrics(MetricGroup metrics, String fullTableName) { - MetricGroup committerMetrics = - metrics.addGroup("IcebergFilesCommitter").addGroup("table", fullTableName); - committerMetrics.gauge("lastCheckpointDurationMs", lastCheckpointDurationMs::get); - committerMetrics.gauge("lastCommitDurationMs", lastCommitDurationMs::get); - this.elapsedSecondsSinceLastSuccessfulCommit = new ElapsedTimeGauge(TimeUnit.SECONDS); - committerMetrics.gauge( - "elapsedSecondsSinceLastSuccessfulCommit", elapsedSecondsSinceLastSuccessfulCommit); - this.committedDataFilesCount = committerMetrics.counter("committedDataFilesCount"); - this.committedDataFilesRecordCount = committerMetrics.counter("committedDataFilesRecordCount"); - this.committedDataFilesByteCount = committerMetrics.counter("committedDataFilesByteCount"); - this.committedDeleteFilesCount = committerMetrics.counter("committedDeleteFilesCount"); - this.committedDeleteFilesRecordCount = - committerMetrics.counter("committedDeleteFilesRecordCount"); - this.committedDeleteFilesByteCount = committerMetrics.counter("committedDeleteFilesByteCount"); - } - - void checkpointDuration(long checkpointDurationMs) { - lastCheckpointDurationMs.set(checkpointDurationMs); - } - - void commitDuration(long commitDurationMs) { - lastCommitDurationMs.set(commitDurationMs); - } - - /** This is called upon a successful commit. */ - void updateCommitSummary(CommitSummary stats) { - elapsedSecondsSinceLastSuccessfulCommit.refreshLastRecordedTime(); - committedDataFilesCount.inc(stats.dataFilesCount()); - committedDataFilesRecordCount.inc(stats.dataFilesRecordCount()); - committedDataFilesByteCount.inc(stats.dataFilesByteCount()); - committedDeleteFilesCount.inc(stats.deleteFilesCount()); - committedDeleteFilesRecordCount.inc(stats.deleteFilesRecordCount()); - committedDeleteFilesByteCount.inc(stats.deleteFilesByteCount()); - } - - /** - * This gauge measures the elapsed time between now and last recorded time set by {@link - * ElapsedTimeGauge#refreshLastRecordedTime()}. 
- */ - private static class ElapsedTimeGauge implements Gauge { - private final TimeUnit reportUnit; - private volatile long lastRecordedTimeNano; - - ElapsedTimeGauge(TimeUnit timeUnit) { - this.reportUnit = timeUnit; - this.lastRecordedTimeNano = System.nanoTime(); - } - - void refreshLastRecordedTime() { - this.lastRecordedTimeNano = System.nanoTime(); - } - - @Override - public Long getValue() { - return reportUnit.convert(System.nanoTime() - lastRecordedTimeNano, TimeUnit.NANOSECONDS); - } - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriter.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriter.java deleted file mode 100644 index 9ea0349fb057..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriter.java +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink; - -import java.io.IOException; -import java.util.concurrent.TimeUnit; -import org.apache.flink.streaming.api.operators.AbstractStreamOperator; -import org.apache.flink.streaming.api.operators.BoundedOneInput; -import org.apache.flink.streaming.api.operators.ChainingStrategy; -import org.apache.flink.streaming.api.operators.OneInputStreamOperator; -import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; -import org.apache.iceberg.io.TaskWriter; -import org.apache.iceberg.io.WriteResult; -import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; - -class IcebergStreamWriter extends AbstractStreamOperator - implements OneInputStreamOperator, BoundedOneInput { - - private static final long serialVersionUID = 1L; - - private final String fullTableName; - private final TaskWriterFactory taskWriterFactory; - - private transient TaskWriter writer; - private transient int subTaskId; - private transient int attemptId; - private transient IcebergStreamWriterMetrics writerMetrics; - - IcebergStreamWriter(String fullTableName, TaskWriterFactory taskWriterFactory) { - this.fullTableName = fullTableName; - this.taskWriterFactory = taskWriterFactory; - setChainingStrategy(ChainingStrategy.ALWAYS); - } - - @Override - public void open() { - this.subTaskId = getRuntimeContext().getIndexOfThisSubtask(); - this.attemptId = getRuntimeContext().getAttemptNumber(); - this.writerMetrics = new IcebergStreamWriterMetrics(super.metrics, fullTableName); - - // Initialize the task writer factory. - this.taskWriterFactory.initialize(subTaskId, attemptId); - - // Initialize the task writer. 
- this.writer = taskWriterFactory.create(); - } - - @Override - public void prepareSnapshotPreBarrier(long checkpointId) throws Exception { - flush(); - this.writer = taskWriterFactory.create(); - } - - @Override - public void processElement(StreamRecord element) throws Exception { - writer.write(element.getValue()); - } - - @Override - public void close() throws Exception { - super.close(); - if (writer != null) { - writer.close(); - writer = null; - } - } - - @Override - public void endInput() throws IOException { - // For bounded stream, it may don't enable the checkpoint mechanism so we'd better to emit the - // remaining completed files to downstream before closing the writer so that we won't miss any - // of them. - // Note that if the task is not closed after calling endInput, checkpoint may be triggered again - // causing files to be sent repeatedly, the writer is marked as null after the last file is sent - // to guard against duplicated writes. - flush(); - } - - @Override - public String toString() { - return MoreObjects.toStringHelper(this) - .add("table_name", fullTableName) - .add("subtask_id", subTaskId) - .add("attempt_id", attemptId) - .toString(); - } - - /** close all open files and emit files to downstream committer operator */ - private void flush() throws IOException { - if (writer == null) { - return; - } - - long startNano = System.nanoTime(); - WriteResult result = writer.complete(); - writerMetrics.updateFlushResult(result); - output.collect(new StreamRecord<>(result)); - writerMetrics.flushDuration(TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startNano)); - - // Set writer to null to prevent duplicate flushes in the corner case of - // prepareSnapshotPreBarrier happening after endInput. - writer = null; - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriterMetrics.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriterMetrics.java deleted file mode 100644 index ce2a6c583fdf..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriterMetrics.java +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink; - -import com.codahale.metrics.SlidingWindowReservoir; -import java.util.Arrays; -import java.util.concurrent.atomic.AtomicLong; -import org.apache.flink.dropwizard.metrics.DropwizardHistogramWrapper; -import org.apache.flink.metrics.Counter; -import org.apache.flink.metrics.Histogram; -import org.apache.flink.metrics.MetricGroup; -import org.apache.iceberg.io.WriteResult; - -class IcebergStreamWriterMetrics { - // 1,024 reservoir size should cost about 8KB, which is quite small. 
- // It should also produce good accuracy for histogram distribution (like percentiles). - private static final int HISTOGRAM_RESERVOIR_SIZE = 1024; - - private final Counter flushedDataFiles; - private final Counter flushedDeleteFiles; - private final Counter flushedReferencedDataFiles; - private final AtomicLong lastFlushDurationMs; - private final Histogram dataFilesSizeHistogram; - private final Histogram deleteFilesSizeHistogram; - - IcebergStreamWriterMetrics(MetricGroup metrics, String fullTableName) { - MetricGroup writerMetrics = - metrics.addGroup("IcebergStreamWriter").addGroup("table", fullTableName); - this.flushedDataFiles = writerMetrics.counter("flushedDataFiles"); - this.flushedDeleteFiles = writerMetrics.counter("flushedDeleteFiles"); - this.flushedReferencedDataFiles = writerMetrics.counter("flushedReferencedDataFiles"); - this.lastFlushDurationMs = new AtomicLong(); - writerMetrics.gauge("lastFlushDurationMs", lastFlushDurationMs::get); - - com.codahale.metrics.Histogram dropwizardDataFilesSizeHistogram = - new com.codahale.metrics.Histogram(new SlidingWindowReservoir(HISTOGRAM_RESERVOIR_SIZE)); - this.dataFilesSizeHistogram = - writerMetrics.histogram( - "dataFilesSizeHistogram", - new DropwizardHistogramWrapper(dropwizardDataFilesSizeHistogram)); - com.codahale.metrics.Histogram dropwizardDeleteFilesSizeHistogram = - new com.codahale.metrics.Histogram(new SlidingWindowReservoir(HISTOGRAM_RESERVOIR_SIZE)); - this.deleteFilesSizeHistogram = - writerMetrics.histogram( - "deleteFilesSizeHistogram", - new DropwizardHistogramWrapper(dropwizardDeleteFilesSizeHistogram)); - } - - void updateFlushResult(WriteResult result) { - flushedDataFiles.inc(result.dataFiles().length); - flushedDeleteFiles.inc(result.deleteFiles().length); - flushedReferencedDataFiles.inc(result.referencedDataFiles().length); - - // For file size distribution histogram, we don't have to update them after successful commits. - // This should works equally well and we avoided the overhead of tracking the list of file sizes - // in the {@link CommitSummary}, which currently stores simple stats for counters and gauges - // metrics. - Arrays.stream(result.dataFiles()) - .forEach( - dataFile -> { - dataFilesSizeHistogram.update(dataFile.fileSizeInBytes()); - }); - Arrays.stream(result.deleteFiles()) - .forEach( - deleteFile -> { - deleteFilesSizeHistogram.update(deleteFile.fileSizeInBytes()); - }); - } - - void flushDuration(long flushDurationMs) { - lastFlushDurationMs.set(flushDurationMs); - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/ManifestOutputFileFactory.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/ManifestOutputFileFactory.java deleted file mode 100644 index da5e6e7627ae..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/ManifestOutputFileFactory.java +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink; - -import java.util.Map; -import java.util.concurrent.atomic.AtomicInteger; -import java.util.function.Supplier; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.HasTableOperations; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableOperations; -import org.apache.iceberg.io.OutputFile; -import org.apache.iceberg.relocated.com.google.common.base.Strings; - -class ManifestOutputFileFactory { - // Users could define their own flink manifests directory by setting this value in table - // properties. - static final String FLINK_MANIFEST_LOCATION = "flink.manifests.location"; - - private final Supplier
    tableSupplier; - private final Map props; - private final String flinkJobId; - private final String operatorUniqueId; - private final int subTaskId; - private final long attemptNumber; - private final AtomicInteger fileCount = new AtomicInteger(0); - - ManifestOutputFileFactory( - Supplier
    tableSupplier, - Map props, - String flinkJobId, - String operatorUniqueId, - int subTaskId, - long attemptNumber) { - this.tableSupplier = tableSupplier; - this.props = props; - this.flinkJobId = flinkJobId; - this.operatorUniqueId = operatorUniqueId; - this.subTaskId = subTaskId; - this.attemptNumber = attemptNumber; - } - - private String generatePath(long checkpointId) { - return FileFormat.AVRO.addExtension( - String.format( - "%s-%s-%05d-%d-%d-%05d", - flinkJobId, - operatorUniqueId, - subTaskId, - attemptNumber, - checkpointId, - fileCount.incrementAndGet())); - } - - OutputFile create(long checkpointId) { - String flinkManifestDir = props.get(FLINK_MANIFEST_LOCATION); - TableOperations ops = ((HasTableOperations) tableSupplier.get()).operations(); - - String newManifestFullPath; - if (Strings.isNullOrEmpty(flinkManifestDir)) { - // User don't specify any flink manifest directory, so just use the default metadata path. - newManifestFullPath = ops.metadataFileLocation(generatePath(checkpointId)); - } else { - newManifestFullPath = - String.format("%s/%s", stripTrailingSlash(flinkManifestDir), generatePath(checkpointId)); - } - - return tableSupplier.get().io().newOutputFile(newManifestFullPath); - } - - private static String stripTrailingSlash(String path) { - String result = path; - while (result.endsWith("/")) { - result = result.substring(0, result.length() - 1); - } - return result; - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionKeySelector.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionKeySelector.java deleted file mode 100644 index df951684b446..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionKeySelector.java +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink; - -import org.apache.flink.api.java.functions.KeySelector; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.PartitionKey; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.flink.RowDataWrapper; - -/** - * Create a {@link KeySelector} to shuffle by partition key, then each partition/bucket will be - * wrote by only one task. That will reduce lots of small files in partitioned fanout write policy - * for {@link FlinkSink}. 
- */ -class PartitionKeySelector implements KeySelector { - - private final Schema schema; - private final PartitionKey partitionKey; - private final RowType flinkSchema; - - private transient RowDataWrapper rowDataWrapper; - - PartitionKeySelector(PartitionSpec spec, Schema schema, RowType flinkSchema) { - this.schema = schema; - this.partitionKey = new PartitionKey(spec, schema); - this.flinkSchema = flinkSchema; - } - - /** - * Construct the {@link RowDataWrapper} lazily here because few members in it are not - * serializable. In this way, we don't have to serialize them with forcing. - */ - private RowDataWrapper lazyRowDataWrapper() { - if (rowDataWrapper == null) { - rowDataWrapper = new RowDataWrapper(flinkSchema, schema.asStruct()); - } - return rowDataWrapper; - } - - @Override - public String getKey(RowData row) { - partitionKey.partition(lazyRowDataWrapper().wrap(row)); - return partitionKey.toPath(); - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionedDeltaWriter.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionedDeltaWriter.java deleted file mode 100644 index 38062dd1a2c4..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionedDeltaWriter.java +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
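// PartitionKeySelector above keys each row by its partition path, so a keyBy() on it
// routes every row of a given partition to a single writer subtask instead of fanning
// out across all subtasks. A sketch of the same wiring with a hypothetical date-derived
// key (DataStream and KeySelector are Flink types; the key function below is a stand-in
// for PartitionKey.toPath(), not code from this patch):
static DataStream<RowData> clusterByPartition(DataStream<RowData> input) {
  KeySelector<RowData, String> byPartitionPath = row -> "dt=" + row.getString(0);
  return input.keyBy(byPartitionPath);
}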
- */ -package org.apache.iceberg.flink.sink; - -import java.io.IOException; -import java.io.UncheckedIOException; -import java.util.List; -import java.util.Map; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.PartitionKey; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.io.FileAppenderFactory; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.io.OutputFileFactory; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.util.Tasks; - -class PartitionedDeltaWriter extends BaseDeltaTaskWriter { - - private final PartitionKey partitionKey; - - private final Map writers = Maps.newHashMap(); - - PartitionedDeltaWriter( - PartitionSpec spec, - FileFormat format, - FileAppenderFactory appenderFactory, - OutputFileFactory fileFactory, - FileIO io, - long targetFileSize, - Schema schema, - RowType flinkSchema, - List equalityFieldIds, - boolean upsert) { - super( - spec, - format, - appenderFactory, - fileFactory, - io, - targetFileSize, - schema, - flinkSchema, - equalityFieldIds, - upsert); - this.partitionKey = new PartitionKey(spec, schema); - } - - @Override - RowDataDeltaWriter route(RowData row) { - partitionKey.partition(wrapper().wrap(row)); - - RowDataDeltaWriter writer = writers.get(partitionKey); - if (writer == null) { - // NOTICE: we need to copy a new partition key here, in case of messing up the keys in - // writers. - PartitionKey copiedKey = partitionKey.copy(); - writer = new RowDataDeltaWriter(copiedKey); - writers.put(copiedKey, writer); - } - - return writer; - } - - @Override - public void close() { - try { - Tasks.foreach(writers.values()) - .throwFailureWhenFinished() - .noRetry() - .run(RowDataDeltaWriter::close, IOException.class); - - writers.clear(); - } catch (IOException e) { - throw new UncheckedIOException("Failed to close equality delta writer", e); - } - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/RowDataTaskWriterFactory.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/RowDataTaskWriterFactory.java deleted file mode 100644 index 67422a1afeb1..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/RowDataTaskWriterFactory.java +++ /dev/null @@ -1,244 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
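// PartitionedDeltaWriter above reuses one mutable PartitionKey for map lookups and only
// copies it when a new writer is created, so keys stored in the writers map are never
// mutated afterwards. A generic, self-contained sketch of that lookup-then-copy pattern
// (java.util.Map and java.util.function.Function; the type parameters are illustrative):
static <K, V> V routeWithReusableKey(
    Map<K, V> writers, K reusableKey, Function<K, K> copyKey, Function<K, V> newWriter) {
  V writer = writers.get(reusableKey);
  if (writer == null) {
    // Copy before storing: the reusable key keeps changing on every route() call.
    K frozenKey = copyKey.apply(reusableKey);
    writer = newWriter.apply(frozenKey);
    writers.put(frozenKey, writer);
  }
  return writer;
}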
- */ -package org.apache.iceberg.flink.sink; - -import java.util.List; -import java.util.Map; -import java.util.function.Supplier; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.PartitionKey; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.flink.RowDataWrapper; -import org.apache.iceberg.io.FileAppenderFactory; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.io.OutputFileFactory; -import org.apache.iceberg.io.PartitionedFanoutWriter; -import org.apache.iceberg.io.TaskWriter; -import org.apache.iceberg.io.UnpartitionedWriter; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.apache.iceberg.types.TypeUtil; -import org.apache.iceberg.util.ArrayUtil; -import org.apache.iceberg.util.SerializableSupplier; - -public class RowDataTaskWriterFactory implements TaskWriterFactory { - private final Supplier
    tableSupplier; - private final Schema schema; - private final RowType flinkSchema; - private final PartitionSpec spec; - private final long targetFileSizeBytes; - private final FileFormat format; - private final List equalityFieldIds; - private final boolean upsert; - private final FileAppenderFactory appenderFactory; - - private transient OutputFileFactory outputFileFactory; - - public RowDataTaskWriterFactory( - Table table, - RowType flinkSchema, - long targetFileSizeBytes, - FileFormat format, - Map writeProperties, - List equalityFieldIds, - boolean upsert) { - this( - () -> table, - flinkSchema, - targetFileSizeBytes, - format, - writeProperties, - equalityFieldIds, - upsert); - } - - public RowDataTaskWriterFactory( - SerializableSupplier
    tableSupplier, - RowType flinkSchema, - long targetFileSizeBytes, - FileFormat format, - Map writeProperties, - List equalityFieldIds, - boolean upsert) { - this.tableSupplier = tableSupplier; - - Table table; - if (tableSupplier instanceof CachingTableSupplier) { - // rely on the initial table metadata for schema, etc., until schema evolution is supported - table = ((CachingTableSupplier) tableSupplier).initialTable(); - } else { - table = tableSupplier.get(); - } - - this.schema = table.schema(); - this.flinkSchema = flinkSchema; - this.spec = table.spec(); - this.targetFileSizeBytes = targetFileSizeBytes; - this.format = format; - this.equalityFieldIds = equalityFieldIds; - this.upsert = upsert; - - if (equalityFieldIds == null || equalityFieldIds.isEmpty()) { - this.appenderFactory = - new FlinkAppenderFactory( - table, schema, flinkSchema, writeProperties, spec, null, null, null); - } else if (upsert) { - // In upsert mode, only the new row is emitted using INSERT row kind. Therefore, any column of - // the inserted row - // may differ from the deleted row other than the primary key fields, and the delete file must - // contain values - // that are correct for the deleted row. Therefore, only write the equality delete fields. - this.appenderFactory = - new FlinkAppenderFactory( - table, - schema, - flinkSchema, - writeProperties, - spec, - ArrayUtil.toIntArray(equalityFieldIds), - TypeUtil.select(schema, Sets.newHashSet(equalityFieldIds)), - null); - } else { - this.appenderFactory = - new FlinkAppenderFactory( - table, - schema, - flinkSchema, - writeProperties, - spec, - ArrayUtil.toIntArray(equalityFieldIds), - schema, - null); - } - } - - @Override - public void initialize(int taskId, int attemptId) { - Table table; - if (tableSupplier instanceof CachingTableSupplier) { - // rely on the initial table metadata for schema, etc., until schema evolution is supported - table = ((CachingTableSupplier) tableSupplier).initialTable(); - } else { - table = tableSupplier.get(); - } - - refreshTable(); - - this.outputFileFactory = - OutputFileFactory.builderFor(table, taskId, attemptId) - .format(format) - .ioSupplier(() -> tableSupplier.get().io()) - .build(); - } - - @Override - public TaskWriter create() { - Preconditions.checkNotNull( - outputFileFactory, - "The outputFileFactory shouldn't be null if we have invoked the initialize()."); - - refreshTable(); - - if (equalityFieldIds == null || equalityFieldIds.isEmpty()) { - // Initialize a task writer to write INSERT only. - if (spec.isUnpartitioned()) { - return new UnpartitionedWriter<>( - spec, - format, - appenderFactory, - outputFileFactory, - tableSupplier.get().io(), - targetFileSizeBytes); - } else { - return new RowDataPartitionedFanoutWriter( - spec, - format, - appenderFactory, - outputFileFactory, - tableSupplier.get().io(), - targetFileSizeBytes, - schema, - flinkSchema); - } - } else { - // Initialize a task writer to write both INSERT and equality DELETE. 
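// For reference, create() as a whole resolves to one of four writers. This is a summary
// of the branching in this method, not new behavior:
//   no equality fields + unpartitioned  -> UnpartitionedWriter            (INSERT only)
//   no equality fields + partitioned    -> RowDataPartitionedFanoutWriter (INSERT only)
//   equality fields    + unpartitioned  -> UnpartitionedDeltaWriter       (INSERT + equality DELETE)
//   equality fields    + partitioned    -> PartitionedDeltaWriter         (INSERT + equality DELETE)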
- if (spec.isUnpartitioned()) { - return new UnpartitionedDeltaWriter( - spec, - format, - appenderFactory, - outputFileFactory, - tableSupplier.get().io(), - targetFileSizeBytes, - schema, - flinkSchema, - equalityFieldIds, - upsert); - } else { - return new PartitionedDeltaWriter( - spec, - format, - appenderFactory, - outputFileFactory, - tableSupplier.get().io(), - targetFileSizeBytes, - schema, - flinkSchema, - equalityFieldIds, - upsert); - } - } - } - - void refreshTable() { - if (tableSupplier instanceof CachingTableSupplier) { - ((CachingTableSupplier) tableSupplier).refreshTable(); - } - } - - private static class RowDataPartitionedFanoutWriter extends PartitionedFanoutWriter { - - private final PartitionKey partitionKey; - private final RowDataWrapper rowDataWrapper; - - RowDataPartitionedFanoutWriter( - PartitionSpec spec, - FileFormat format, - FileAppenderFactory appenderFactory, - OutputFileFactory fileFactory, - FileIO io, - long targetFileSize, - Schema schema, - RowType flinkSchema) { - super(spec, format, appenderFactory, fileFactory, io, targetFileSize); - this.partitionKey = new PartitionKey(spec, schema); - this.rowDataWrapper = new RowDataWrapper(flinkSchema, schema.asStruct()); - } - - @Override - protected PartitionKey partition(RowData row) { - partitionKey.partition(rowDataWrapper.wrap(row)); - return partitionKey; - } - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/TaskWriterFactory.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/TaskWriterFactory.java deleted file mode 100644 index e3a1245e8cbd..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/TaskWriterFactory.java +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink; - -import java.io.Serializable; -import org.apache.iceberg.io.TaskWriter; - -/** - * Factory to create {@link TaskWriter} - * - * @param data type of record. - */ -public interface TaskWriterFactory extends Serializable { - - /** - * Initialize the factory with a given taskId and attemptId. - * - * @param taskId the identifier of task. - * @param attemptId the attempt id of this task. - */ - void initialize(int taskId, int attemptId); - - /** - * Initialize a {@link TaskWriter} with given task id and attempt id. - * - * @return a newly created task writer. 
- */ - TaskWriter create(); -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/UnpartitionedDeltaWriter.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/UnpartitionedDeltaWriter.java deleted file mode 100644 index 7680fb933b20..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/UnpartitionedDeltaWriter.java +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink; - -import java.io.IOException; -import java.util.List; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.io.FileAppenderFactory; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.io.OutputFileFactory; - -class UnpartitionedDeltaWriter extends BaseDeltaTaskWriter { - private final RowDataDeltaWriter writer; - - UnpartitionedDeltaWriter( - PartitionSpec spec, - FileFormat format, - FileAppenderFactory appenderFactory, - OutputFileFactory fileFactory, - FileIO io, - long targetFileSize, - Schema schema, - RowType flinkSchema, - List equalityFieldIds, - boolean upsert) { - super( - spec, - format, - appenderFactory, - fileFactory, - io, - targetFileSize, - schema, - flinkSchema, - equalityFieldIds, - upsert); - this.writer = new RowDataDeltaWriter(null); - } - - @Override - RowDataDeltaWriter route(RowData row) { - return writer; - } - - @Override - public void close() throws IOException { - writer.close(); - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/AggregatedStatisticsTracker.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/AggregatedStatisticsTracker.java deleted file mode 100644 index 5525f02c873e..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/AggregatedStatisticsTracker.java +++ /dev/null @@ -1,262 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. 
See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.shuffle; - -import java.util.Map; -import java.util.NavigableMap; -import java.util.Set; -import javax.annotation.Nullable; -import org.apache.datasketches.sampling.ReservoirItemsSketch; -import org.apache.datasketches.sampling.ReservoirItemsUnion; -import org.apache.flink.api.common.typeutils.TypeSerializer; -import org.apache.iceberg.Schema; -import org.apache.iceberg.SortKey; -import org.apache.iceberg.SortOrder; -import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * AggregatedStatisticsTracker tracks the statistics aggregation received from {@link - * DataStatisticsOperator} subtasks for every checkpoint. - */ -class AggregatedStatisticsTracker { - private static final Logger LOG = LoggerFactory.getLogger(AggregatedStatisticsTracker.class); - - private final String operatorName; - private final int parallelism; - private final TypeSerializer statisticsSerializer; - private final int downstreamParallelism; - private final StatisticsType statisticsType; - private final int switchToSketchThreshold; - private final NavigableMap aggregationsPerCheckpoint; - - private CompletedStatistics completedStatistics; - - AggregatedStatisticsTracker( - String operatorName, - int parallelism, - Schema schema, - SortOrder sortOrder, - int downstreamParallelism, - StatisticsType statisticsType, - int switchToSketchThreshold, - @Nullable CompletedStatistics restoredStatistics) { - this.operatorName = operatorName; - this.parallelism = parallelism; - this.statisticsSerializer = - new DataStatisticsSerializer(new SortKeySerializer(schema, sortOrder)); - this.downstreamParallelism = downstreamParallelism; - this.statisticsType = statisticsType; - this.switchToSketchThreshold = switchToSketchThreshold; - this.completedStatistics = restoredStatistics; - - this.aggregationsPerCheckpoint = Maps.newTreeMap(); - } - - CompletedStatistics updateAndCheckCompletion(int subtask, StatisticsEvent event) { - long checkpointId = event.checkpointId(); - LOG.debug( - "Handling statistics event from subtask {} of operator {} for checkpoint {}", - subtask, - operatorName, - checkpointId); - - if (completedStatistics != null && completedStatistics.checkpointId() > checkpointId) { - LOG.info( - "Ignore stale statistics event from operator {} subtask {} for older checkpoint {}. 
" - + "Was expecting data statistics from checkpoint higher than {}", - operatorName, - subtask, - checkpointId, - completedStatistics.checkpointId()); - return null; - } - - Aggregation aggregation = - aggregationsPerCheckpoint.computeIfAbsent( - checkpointId, - ignored -> - new Aggregation( - parallelism, - downstreamParallelism, - switchToSketchThreshold, - statisticsType, - StatisticsUtil.collectType(statisticsType, completedStatistics))); - DataStatistics dataStatistics = - StatisticsUtil.deserializeDataStatistics(event.statisticsBytes(), statisticsSerializer); - if (!aggregation.merge(subtask, dataStatistics)) { - LOG.debug( - "Ignore duplicate data statistics from operator {} subtask {} for checkpoint {}.", - operatorName, - subtask, - checkpointId); - } - - if (aggregation.isComplete()) { - this.completedStatistics = aggregation.completedStatistics(checkpointId); - // clean up aggregations up to the completed checkpoint id - aggregationsPerCheckpoint.headMap(checkpointId, true).clear(); - return completedStatistics; - } - - return null; - } - - @VisibleForTesting - NavigableMap aggregationsPerCheckpoint() { - return aggregationsPerCheckpoint; - } - - static class Aggregation { - private static final Logger LOG = LoggerFactory.getLogger(Aggregation.class); - - private final Set subtaskSet; - private final int parallelism; - private final int downstreamParallelism; - private final int switchToSketchThreshold; - private final StatisticsType configuredType; - private StatisticsType currentType; - private Map mapStatistics; - private ReservoirItemsUnion sketchStatistics; - - Aggregation( - int parallelism, - int downstreamParallelism, - int switchToSketchThreshold, - StatisticsType configuredType, - StatisticsType currentType) { - this.subtaskSet = Sets.newHashSet(); - this.parallelism = parallelism; - this.downstreamParallelism = downstreamParallelism; - this.switchToSketchThreshold = switchToSketchThreshold; - this.configuredType = configuredType; - this.currentType = currentType; - - if (currentType == StatisticsType.Map) { - this.mapStatistics = Maps.newHashMap(); - this.sketchStatistics = null; - } else { - this.mapStatistics = null; - this.sketchStatistics = - ReservoirItemsUnion.newInstance( - SketchUtil.determineCoordinatorReservoirSize(downstreamParallelism)); - } - } - - @VisibleForTesting - Set subtaskSet() { - return subtaskSet; - } - - @VisibleForTesting - StatisticsType currentType() { - return currentType; - } - - @VisibleForTesting - Map mapStatistics() { - return mapStatistics; - } - - @VisibleForTesting - ReservoirItemsUnion sketchStatistics() { - return sketchStatistics; - } - - private boolean isComplete() { - return subtaskSet.size() == parallelism; - } - - /** @return false if duplicate */ - private boolean merge(int subtask, DataStatistics taskStatistics) { - if (subtaskSet.contains(subtask)) { - return false; - } - - subtaskSet.add(subtask); - merge(taskStatistics); - return true; - } - - @SuppressWarnings("unchecked") - private void merge(DataStatistics taskStatistics) { - if (taskStatistics.type() == StatisticsType.Map) { - Map taskMapStats = (Map) taskStatistics.result(); - if (currentType == StatisticsType.Map) { - taskMapStats.forEach((key, count) -> mapStatistics.merge(key, count, Long::sum)); - if (configuredType == StatisticsType.Auto - && mapStatistics.size() > switchToSketchThreshold) { - convertCoordinatorToSketch(); - } - } else { - // convert task stats to sketch first - ReservoirItemsSketch taskSketch = - ReservoirItemsSketch.newInstance( - 
SketchUtil.determineOperatorReservoirSize(parallelism, downstreamParallelism)); - SketchUtil.convertMapToSketch(taskMapStats, taskSketch::update); - sketchStatistics.update(taskSketch); - } - } else { - ReservoirItemsSketch taskSketch = - (ReservoirItemsSketch) taskStatistics.result(); - if (currentType == StatisticsType.Map) { - // convert global stats to sketch first - convertCoordinatorToSketch(); - } - - if (taskSketch.getNumSamples() > 0) { - sketchStatistics.update(taskSketch); - } - } - } - - private void convertCoordinatorToSketch() { - this.sketchStatistics = - ReservoirItemsUnion.newInstance( - SketchUtil.determineCoordinatorReservoirSize(downstreamParallelism)); - SketchUtil.convertMapToSketch(mapStatistics, sketchStatistics::update); - this.currentType = StatisticsType.Sketch; - this.mapStatistics = null; - } - - private CompletedStatistics completedStatistics(long checkpointId) { - if (currentType == StatisticsType.Map) { - LOG.info("Completed map statistics aggregation with {} keys", mapStatistics.size()); - return CompletedStatistics.fromKeyFrequency(checkpointId, mapStatistics); - } else { - ReservoirItemsSketch sketch = sketchStatistics.getResult(); - if (sketch != null) { - LOG.info( - "Completed sketch statistics aggregation: " - + "reservoir size = {}, number of items seen = {}, number of samples = {}", - sketch.getK(), - sketch.getN(), - sketch.getNumSamples()); - return CompletedStatistics.fromKeySamples(checkpointId, sketch.getSamples()); - } else { - LOG.info("Empty sketch statistics."); - return CompletedStatistics.fromKeySamples(checkpointId, new SortKey[0]); - } - } - } - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/CompletedStatistics.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/CompletedStatistics.java deleted file mode 100644 index e4cba174f0f2..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/CompletedStatistics.java +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.shuffle; - -import java.util.Arrays; -import java.util.Map; -import org.apache.iceberg.SortKey; -import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; -import org.apache.iceberg.relocated.com.google.common.base.Objects; - -/** - * This is what {@link AggregatedStatisticsTracker} returns upon a completed statistics aggregation - * from all subtasks. It contains the raw statistics (Map or reservoir samples). 
- */ -class CompletedStatistics { - private final long checkpointId; - private final StatisticsType type; - private final Map keyFrequency; - private final SortKey[] keySamples; - - static CompletedStatistics fromKeyFrequency(long checkpointId, Map stats) { - return new CompletedStatistics(checkpointId, StatisticsType.Map, stats, null); - } - - static CompletedStatistics fromKeySamples(long checkpointId, SortKey[] keySamples) { - return new CompletedStatistics(checkpointId, StatisticsType.Sketch, null, keySamples); - } - - CompletedStatistics( - long checkpointId, - StatisticsType type, - Map keyFrequency, - SortKey[] keySamples) { - this.checkpointId = checkpointId; - this.type = type; - this.keyFrequency = keyFrequency; - this.keySamples = keySamples; - } - - @Override - public String toString() { - return MoreObjects.toStringHelper(this) - .add("checkpointId", checkpointId) - .add("type", type) - .add("keyFrequency", keyFrequency) - .add("keySamples", keySamples) - .toString(); - } - - @Override - public boolean equals(Object o) { - if (this == o) { - return true; - } - - if (!(o instanceof CompletedStatistics)) { - return false; - } - - CompletedStatistics other = (CompletedStatistics) o; - return Objects.equal(checkpointId, other.checkpointId) - && Objects.equal(type, other.type) - && Objects.equal(keyFrequency, other.keyFrequency()) - && Arrays.equals(keySamples, other.keySamples()); - } - - @Override - public int hashCode() { - return Objects.hashCode(checkpointId, type, keyFrequency, keySamples); - } - - long checkpointId() { - return checkpointId; - } - - StatisticsType type() { - return type; - } - - Map keyFrequency() { - return keyFrequency; - } - - SortKey[] keySamples() { - return keySamples; - } - - boolean isEmpty() { - if (type == StatisticsType.Sketch) { - return keySamples.length == 0; - } else { - return keyFrequency().isEmpty(); - } - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/CompletedStatisticsSerializer.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/CompletedStatisticsSerializer.java deleted file mode 100644 index 7f55188e7f8c..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/CompletedStatisticsSerializer.java +++ /dev/null @@ -1,178 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
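// Wire layout written by CompletedStatisticsSerializer below (a descriptive summary of
// its serialize()/deserialize(), not new behavior):
//   long                  checkpointId
//   enum StatisticsType   Map or Sketch
//   if Map    -> MapSerializer<SortKey, Long>   key frequencies
//   if Sketch -> ListSerializer<SortKey>        reservoir key samples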
- */ -package org.apache.iceberg.flink.sink.shuffle; - -import java.io.IOException; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; -import java.util.Map; -import java.util.Objects; -import org.apache.flink.api.common.typeutils.CompositeTypeSerializerSnapshot; -import org.apache.flink.api.common.typeutils.TypeSerializer; -import org.apache.flink.api.common.typeutils.TypeSerializerSnapshot; -import org.apache.flink.api.common.typeutils.base.EnumSerializer; -import org.apache.flink.api.common.typeutils.base.ListSerializer; -import org.apache.flink.api.common.typeutils.base.LongSerializer; -import org.apache.flink.api.common.typeutils.base.MapSerializer; -import org.apache.flink.core.memory.DataInputView; -import org.apache.flink.core.memory.DataOutputView; -import org.apache.iceberg.SortKey; - -class CompletedStatisticsSerializer extends TypeSerializer { - private final TypeSerializer sortKeySerializer; - private final EnumSerializer statisticsTypeSerializer; - private final MapSerializer keyFrequencySerializer; - private final ListSerializer keySamplesSerializer; - - CompletedStatisticsSerializer(TypeSerializer sortKeySerializer) { - this.sortKeySerializer = sortKeySerializer; - this.statisticsTypeSerializer = new EnumSerializer<>(StatisticsType.class); - this.keyFrequencySerializer = new MapSerializer<>(sortKeySerializer, LongSerializer.INSTANCE); - this.keySamplesSerializer = new ListSerializer<>(sortKeySerializer); - } - - @Override - public boolean isImmutableType() { - return false; - } - - @Override - public TypeSerializer duplicate() { - return new CompletedStatisticsSerializer(sortKeySerializer); - } - - @Override - public CompletedStatistics createInstance() { - return CompletedStatistics.fromKeyFrequency(0L, Collections.emptyMap()); - } - - @Override - public CompletedStatistics copy(CompletedStatistics from) { - return new CompletedStatistics( - from.checkpointId(), from.type(), from.keyFrequency(), from.keySamples()); - } - - @Override - public CompletedStatistics copy(CompletedStatistics from, CompletedStatistics reuse) { - // no benefit of reuse - return copy(from); - } - - @Override - public int getLength() { - return -1; - } - - @Override - public void serialize(CompletedStatistics record, DataOutputView target) throws IOException { - target.writeLong(record.checkpointId()); - statisticsTypeSerializer.serialize(record.type(), target); - if (record.type() == StatisticsType.Map) { - keyFrequencySerializer.serialize(record.keyFrequency(), target); - } else { - keySamplesSerializer.serialize(Arrays.asList(record.keySamples()), target); - } - } - - @Override - public CompletedStatistics deserialize(DataInputView source) throws IOException { - long checkpointId = source.readLong(); - StatisticsType type = statisticsTypeSerializer.deserialize(source); - if (type == StatisticsType.Map) { - Map keyFrequency = keyFrequencySerializer.deserialize(source); - return CompletedStatistics.fromKeyFrequency(checkpointId, keyFrequency); - } else { - List sortKeys = keySamplesSerializer.deserialize(source); - SortKey[] keySamples = new SortKey[sortKeys.size()]; - keySamples = sortKeys.toArray(keySamples); - return CompletedStatistics.fromKeySamples(checkpointId, keySamples); - } - } - - @Override - public CompletedStatistics deserialize(CompletedStatistics reuse, DataInputView source) - throws IOException { - // not much benefit to reuse - return deserialize(source); - } - - @Override - public void copy(DataInputView source, DataOutputView target) throws 
IOException { - serialize(deserialize(source), target); - } - - @Override - public boolean equals(Object obj) { - if (this == obj) { - return true; - } - - if (obj == null || getClass() != obj.getClass()) { - return false; - } - - CompletedStatisticsSerializer other = (CompletedStatisticsSerializer) obj; - return Objects.equals(sortKeySerializer, other.sortKeySerializer); - } - - @Override - public int hashCode() { - return sortKeySerializer.hashCode(); - } - - @Override - public TypeSerializerSnapshot snapshotConfiguration() { - return new CompletedStatisticsSerializerSnapshot(this); - } - - public static class CompletedStatisticsSerializerSnapshot - extends CompositeTypeSerializerSnapshot { - private static final int CURRENT_VERSION = 1; - - /** Constructor for read instantiation. */ - @SuppressWarnings({"unused", "checkstyle:RedundantModifier"}) - public CompletedStatisticsSerializerSnapshot() { - super(CompletedStatisticsSerializer.class); - } - - @SuppressWarnings("checkstyle:RedundantModifier") - public CompletedStatisticsSerializerSnapshot(CompletedStatisticsSerializer serializer) { - super(serializer); - } - - @Override - protected int getCurrentOuterSnapshotVersion() { - return CURRENT_VERSION; - } - - @Override - protected TypeSerializer[] getNestedSerializers( - CompletedStatisticsSerializer outerSerializer) { - return new TypeSerializer[] {outerSerializer.sortKeySerializer}; - } - - @Override - protected CompletedStatisticsSerializer createOuterSerializerWithNestedSerializers( - TypeSerializer[] nestedSerializers) { - SortKeySerializer sortKeySerializer = (SortKeySerializer) nestedSerializers[0]; - return new CompletedStatisticsSerializer(sortKeySerializer); - } - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatistics.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatistics.java deleted file mode 100644 index 76c59cd5f4b8..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatistics.java +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.shuffle; - -import java.util.Map; -import org.apache.datasketches.sampling.ReservoirItemsSketch; -import org.apache.flink.annotation.Internal; -import org.apache.iceberg.SortKey; - -/** - * DataStatistics defines the interface to collect data distribution information. - * - *
    Data statistics tracks traffic volume distribution across data keys. For low-cardinality key, - * a simple map of (key, count) can be used. For high-cardinality key, probabilistic data structures - * (sketching) can be used. - */ -@Internal -interface DataStatistics { - - StatisticsType type(); - - boolean isEmpty(); - - /** Add row sortKey to data statistics. */ - void add(SortKey sortKey); - - /** - * Get the collected statistics. Could be a {@link Map} (low cardinality) or {@link - * ReservoirItemsSketch} (high cardinality) - */ - Object result(); -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsCoordinator.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsCoordinator.java deleted file mode 100644 index 4bfde7204acf..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsCoordinator.java +++ /dev/null @@ -1,522 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.shuffle; - -import java.util.Comparator; -import java.util.Map; -import java.util.concurrent.Callable; -import java.util.concurrent.CompletableFuture; -import java.util.concurrent.ExecutionException; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; -import java.util.concurrent.ThreadFactory; -import org.apache.flink.annotation.Internal; -import org.apache.flink.api.common.typeutils.TypeSerializer; -import org.apache.flink.runtime.operators.coordination.OperatorCoordinator; -import org.apache.flink.runtime.operators.coordination.OperatorEvent; -import org.apache.flink.util.ExceptionUtils; -import org.apache.flink.util.FatalExitExceptionHandler; -import org.apache.flink.util.FlinkRuntimeException; -import org.apache.flink.util.Preconditions; -import org.apache.flink.util.ThrowableCatchingRunnable; -import org.apache.flink.util.function.ThrowingRunnable; -import org.apache.iceberg.Schema; -import org.apache.iceberg.SortOrder; -import org.apache.iceberg.StructLike; -import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; -import org.apache.iceberg.relocated.com.google.common.collect.Iterables; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.types.Comparators; -import org.jetbrains.annotations.NotNull; -import org.jetbrains.annotations.Nullable; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * DataStatisticsCoordinator receives {@link StatisticsEvent} from {@link DataStatisticsOperator} - * every subtask and then merge them together. 
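// The two DataStatistics shapes described above, in miniature: low-cardinality keys use
// an exact (key -> count) map, while high-cardinality keys fall back to a fixed-size
// reservoir sample (ReservoirItemsSketch from org.apache.datasketches in the real code).
// A self-contained sketch of the map flavour only; the String key is a stand-in for
// SortKey and the method itself is illustrative, not part of the patch.
static Map<String, Long> mapStatistics(Iterable<String> keys) {
  Map<String, Long> frequency = new HashMap<>();
  for (String key : keys) {
    // The coordinator aggregates per-subtask maps with the same merge(key, count, Long::sum) call.
    frequency.merge(key, 1L, Long::sum);
  }
  return frequency;
}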
Once aggregation for all subtasks data statistics - * completes, DataStatisticsCoordinator will send the aggregated data statistics back to {@link - * DataStatisticsOperator}. In the end a custom partitioner will distribute traffic based on the - * aggregated data statistics to improve data clustering. - */ -@Internal -class DataStatisticsCoordinator implements OperatorCoordinator { - private static final Logger LOG = LoggerFactory.getLogger(DataStatisticsCoordinator.class); - - private final String operatorName; - private final OperatorCoordinator.Context context; - private final Schema schema; - private final SortOrder sortOrder; - private final Comparator comparator; - private final int downstreamParallelism; - private final StatisticsType statisticsType; - private final double closeFileCostWeightPercentage; - - private final ExecutorService coordinatorExecutor; - private final SubtaskGateways subtaskGateways; - private final CoordinatorExecutorThreadFactory coordinatorThreadFactory; - private final TypeSerializer completedStatisticsSerializer; - private final TypeSerializer globalStatisticsSerializer; - - private transient boolean started; - private transient AggregatedStatisticsTracker aggregatedStatisticsTracker; - private transient CompletedStatistics completedStatistics; - private transient GlobalStatistics globalStatistics; - - DataStatisticsCoordinator( - String operatorName, - OperatorCoordinator.Context context, - Schema schema, - SortOrder sortOrder, - int downstreamParallelism, - StatisticsType statisticsType, - double closeFileCostWeightPercentage) { - this.operatorName = operatorName; - this.context = context; - this.schema = schema; - this.sortOrder = sortOrder; - this.comparator = Comparators.forType(SortKeyUtil.sortKeySchema(schema, sortOrder).asStruct()); - this.downstreamParallelism = downstreamParallelism; - this.statisticsType = statisticsType; - this.closeFileCostWeightPercentage = closeFileCostWeightPercentage; - - this.coordinatorThreadFactory = - new CoordinatorExecutorThreadFactory( - "DataStatisticsCoordinator-" + operatorName, context.getUserCodeClassloader()); - this.coordinatorExecutor = Executors.newSingleThreadExecutor(coordinatorThreadFactory); - this.subtaskGateways = new SubtaskGateways(operatorName, context.currentParallelism()); - SortKeySerializer sortKeySerializer = new SortKeySerializer(schema, sortOrder); - this.completedStatisticsSerializer = new CompletedStatisticsSerializer(sortKeySerializer); - this.globalStatisticsSerializer = new GlobalStatisticsSerializer(sortKeySerializer); - } - - @Override - public void start() throws Exception { - LOG.info("Starting data statistics coordinator: {}.", operatorName); - this.started = true; - - // statistics are restored already in resetToCheckpoint() before start() called - this.aggregatedStatisticsTracker = - new AggregatedStatisticsTracker( - operatorName, - context.currentParallelism(), - schema, - sortOrder, - downstreamParallelism, - statisticsType, - SketchUtil.COORDINATOR_SKETCH_SWITCH_THRESHOLD, - completedStatistics); - } - - @Override - public void close() throws Exception { - coordinatorExecutor.shutdown(); - this.aggregatedStatisticsTracker = null; - this.started = false; - LOG.info("Closed data statistics coordinator: {}.", operatorName); - } - - @VisibleForTesting - void callInCoordinatorThread(Callable callable, String errorMessage) { - ensureStarted(); - // Ensure the task is done by the coordinator executor. 
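// Pattern used by callInCoordinatorThread(): all coordinator state is confined to a
// single executor thread; callers already on that thread run the action directly, and
// everyone else submits it and waits. A generic, self-contained sketch of that
// confinement idea (java.util.concurrent types; the owner-thread check is a simplified
// stand-in for CoordinatorExecutorThreadFactory.isCurrentThreadCoordinatorThread()):
static void runConfined(ExecutorService coordinatorExecutor, Thread ownerThread, Runnable action)
    throws Exception {
  if (Thread.currentThread() == ownerThread) {
    action.run(); // already on the coordinator thread
  } else {
    coordinatorExecutor.submit(action).get(); // hop onto the coordinator thread and block for the result
  }
}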
- if (!coordinatorThreadFactory.isCurrentThreadCoordinatorThread()) { - try { - Callable guardedCallable = - () -> { - try { - return callable.call(); - } catch (Throwable t) { - LOG.error( - "Uncaught Exception in data statistics coordinator: {} executor", - operatorName, - t); - ExceptionUtils.rethrowException(t); - return null; - } - }; - - coordinatorExecutor.submit(guardedCallable).get(); - } catch (InterruptedException | ExecutionException e) { - throw new FlinkRuntimeException(errorMessage, e); - } - } else { - try { - callable.call(); - } catch (Throwable t) { - LOG.error( - "Uncaught Exception in data statistics coordinator: {} executor", operatorName, t); - throw new FlinkRuntimeException(errorMessage, t); - } - } - } - - public void runInCoordinatorThread(Runnable runnable) { - this.coordinatorExecutor.execute( - new ThrowableCatchingRunnable( - throwable -> - this.coordinatorThreadFactory.uncaughtException(Thread.currentThread(), throwable), - runnable)); - } - - private void runInCoordinatorThread(ThrowingRunnable action, String actionString) { - ensureStarted(); - runInCoordinatorThread( - () -> { - try { - action.run(); - } catch (Throwable t) { - ExceptionUtils.rethrowIfFatalErrorOrOOM(t); - LOG.error( - "Uncaught exception in the data statistics coordinator: {} while {}. Triggering job failover", - operatorName, - actionString, - t); - context.failJob(t); - } - }); - } - - private void ensureStarted() { - Preconditions.checkState(started, "The coordinator of %s has not started yet.", operatorName); - } - - private void handleDataStatisticRequest(int subtask, StatisticsEvent event) { - CompletedStatistics maybeCompletedStatistics = - aggregatedStatisticsTracker.updateAndCheckCompletion(subtask, event); - - if (maybeCompletedStatistics != null) { - if (maybeCompletedStatistics.isEmpty()) { - LOG.info( - "Skip aggregated statistics for checkpoint {} as it is empty.", event.checkpointId()); - } else { - LOG.info("Completed statistics aggregation for checkpoint {}", event.checkpointId()); - // completedStatistics contains the complete samples, which is needed to compute - // the range bounds in globalStatistics if downstreamParallelism changed. - this.completedStatistics = maybeCompletedStatistics; - // globalStatistics only contains assignment calculated based on Map or Sketch statistics - this.globalStatistics = - globalStatistics( - maybeCompletedStatistics, - downstreamParallelism, - comparator, - closeFileCostWeightPercentage); - sendGlobalStatisticsToSubtasks(globalStatistics); - } - } - } - - private static GlobalStatistics globalStatistics( - CompletedStatistics completedStatistics, - int downstreamParallelism, - Comparator comparator, - double closeFileCostWeightPercentage) { - if (completedStatistics.type() == StatisticsType.Sketch) { - // range bound is a much smaller array compared to the complete samples. - // It helps reduce the amount of data transfer from coordinator to operator subtasks. 
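// Idea behind the range bounds mentioned above: from the complete set of key samples,
// only (downstreamParallelism - 1) split points are kept, one per partition boundary,
// and only those are shipped to the operator subtasks. A generic sketch of that
// reduction; the real SketchUtil.rangeBounds works on SortKey samples with the
// sort-order comparator, so this simplified helper is an illustration, not its code.
static <T> List<T> rangeBounds(List<T> sortedSamples, int parallelism) {
  List<T> bounds = new ArrayList<>();
  if (sortedSamples.isEmpty()) {
    return bounds;
  }
  for (int i = 1; i < parallelism; i++) {
    // Pick evenly spaced samples as the boundaries between consecutive partitions.
    int idx = Math.min(sortedSamples.size() - 1, i * sortedSamples.size() / parallelism);
    bounds.add(sortedSamples.get(idx));
  }
  return bounds;
}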
- return GlobalStatistics.fromRangeBounds( - completedStatistics.checkpointId(), - SketchUtil.rangeBounds( - downstreamParallelism, comparator, completedStatistics.keySamples())); - } else { - return GlobalStatistics.fromMapAssignment( - completedStatistics.checkpointId(), - MapAssignment.fromKeyFrequency( - downstreamParallelism, - completedStatistics.keyFrequency(), - closeFileCostWeightPercentage, - comparator)); - } - } - - @SuppressWarnings("FutureReturnValueIgnored") - private void sendGlobalStatisticsToSubtasks(GlobalStatistics statistics) { - runInCoordinatorThread( - () -> { - LOG.info( - "Broadcast latest global statistics from checkpoint {} to all subtasks", - statistics.checkpointId()); - // applyImmediately is set to false so that operator subtasks can - // apply the change at checkpoint boundary - StatisticsEvent statisticsEvent = - StatisticsEvent.createGlobalStatisticsEvent( - statistics, globalStatisticsSerializer, false); - for (int i = 0; i < context.currentParallelism(); ++i) { - // Ignore future return value for potential error (e.g. subtask down). - // Upon restart, subtasks send request to coordinator to refresh statistics - // if there is any difference - subtaskGateways.getSubtaskGateway(i).sendEvent(statisticsEvent); - } - }, - String.format( - "Failed to send operator %s coordinator global data statistics for checkpoint %d", - operatorName, statistics.checkpointId())); - } - - @SuppressWarnings("FutureReturnValueIgnored") - private void handleRequestGlobalStatisticsEvent(int subtask, RequestGlobalStatisticsEvent event) { - if (globalStatistics != null) { - runInCoordinatorThread( - () -> { - if (event.signature() != null && event.signature() != globalStatistics.hashCode()) { - LOG.debug( - "Skip responding to statistics request from subtask {}, as hashCode matches or not included in the request", - subtask); - } else { - LOG.info( - "Send latest global statistics from checkpoint {} to subtask {}", - globalStatistics.checkpointId(), - subtask); - StatisticsEvent statisticsEvent = - StatisticsEvent.createGlobalStatisticsEvent( - globalStatistics, globalStatisticsSerializer, true); - subtaskGateways.getSubtaskGateway(subtask).sendEvent(statisticsEvent); - } - }, - String.format( - "Failed to send operator %s coordinator global data statistics to requesting subtask %d for checkpoint %d", - operatorName, subtask, globalStatistics.checkpointId())); - } else { - LOG.info( - "Ignore global statistics request from subtask {} as statistics not available", subtask); - } - } - - @Override - public void handleEventFromOperator(int subtask, int attemptNumber, OperatorEvent event) { - runInCoordinatorThread( - () -> { - LOG.debug( - "Handling event from subtask {} (#{}) of {}: {}", - subtask, - attemptNumber, - operatorName, - event); - if (event instanceof StatisticsEvent) { - handleDataStatisticRequest(subtask, ((StatisticsEvent) event)); - } else if (event instanceof RequestGlobalStatisticsEvent) { - handleRequestGlobalStatisticsEvent(subtask, (RequestGlobalStatisticsEvent) event); - } else { - throw new IllegalArgumentException( - "Invalid operator event type: " + event.getClass().getCanonicalName()); - } - }, - String.format( - "handling operator event %s from subtask %d (#%d)", - event.getClass(), subtask, attemptNumber)); - } - - @Override - public void checkpointCoordinator(long checkpointId, CompletableFuture resultFuture) { - runInCoordinatorThread( - () -> { - LOG.debug( - "Snapshotting data statistics coordinator {} for checkpoint {}", - operatorName, - 
checkpointId); - if (completedStatistics == null) { - // null checkpoint result is not allowed, hence supply an empty byte array - resultFuture.complete(new byte[0]); - } else { - resultFuture.complete( - StatisticsUtil.serializeCompletedStatistics( - completedStatistics, completedStatisticsSerializer)); - } - }, - String.format("taking checkpoint %d", checkpointId)); - } - - @Override - public void notifyCheckpointComplete(long checkpointId) {} - - @Override - public void resetToCheckpoint(long checkpointId, byte[] checkpointData) { - Preconditions.checkState( - !started, "The coordinator %s can only be reset if it was not yet started", operatorName); - if (checkpointData == null || checkpointData.length == 0) { - LOG.info( - "Data statistic coordinator {} has nothing to restore from checkpoint {}", - operatorName, - checkpointId); - return; - } - - LOG.info( - "Restoring data statistic coordinator {} from checkpoint {}", operatorName, checkpointId); - this.completedStatistics = - StatisticsUtil.deserializeCompletedStatistics( - checkpointData, completedStatisticsSerializer); - // recompute global statistics in case downstream parallelism changed - this.globalStatistics = - globalStatistics( - completedStatistics, downstreamParallelism, comparator, closeFileCostWeightPercentage); - } - - @Override - public void subtaskReset(int subtask, long checkpointId) { - runInCoordinatorThread( - () -> { - LOG.info( - "Operator {} subtask {} is reset to checkpoint {}", - operatorName, - subtask, - checkpointId); - Preconditions.checkState( - this.coordinatorThreadFactory.isCurrentThreadCoordinatorThread()); - subtaskGateways.reset(subtask); - }, - String.format("handling subtask %d recovery to checkpoint %d", subtask, checkpointId)); - } - - @Override - public void executionAttemptFailed(int subtask, int attemptNumber, @Nullable Throwable reason) { - runInCoordinatorThread( - () -> { - LOG.info( - "Unregistering gateway after failure for subtask {} (#{}) of data statistics {}", - subtask, - attemptNumber, - operatorName); - Preconditions.checkState( - this.coordinatorThreadFactory.isCurrentThreadCoordinatorThread()); - subtaskGateways.unregisterSubtaskGateway(subtask, attemptNumber); - }, - String.format("handling subtask %d (#%d) failure", subtask, attemptNumber)); - } - - @Override - public void executionAttemptReady(int subtask, int attemptNumber, SubtaskGateway gateway) { - Preconditions.checkArgument(subtask == gateway.getSubtask()); - Preconditions.checkArgument(attemptNumber == gateway.getExecution().getAttemptNumber()); - runInCoordinatorThread( - () -> { - Preconditions.checkState( - this.coordinatorThreadFactory.isCurrentThreadCoordinatorThread()); - subtaskGateways.registerSubtaskGateway(gateway); - }, - String.format( - "making event gateway to subtask %d (#%d) available", subtask, attemptNumber)); - } - - @VisibleForTesting - CompletedStatistics completedStatistics() { - return completedStatistics; - } - - @VisibleForTesting - GlobalStatistics globalStatistics() { - return globalStatistics; - } - - private static class SubtaskGateways { - private final String operatorName; - private final Map[] gateways; - - @SuppressWarnings("unchecked") - private SubtaskGateways(String operatorName, int parallelism) { - this.operatorName = operatorName; - gateways = new Map[parallelism]; - - for (int i = 0; i < parallelism; ++i) { - gateways[i] = Maps.newHashMap(); - } - } - - private void registerSubtaskGateway(OperatorCoordinator.SubtaskGateway gateway) { - int subtaskIndex = gateway.getSubtask(); - 
int attemptNumber = gateway.getExecution().getAttemptNumber(); - Preconditions.checkState( - !gateways[subtaskIndex].containsKey(attemptNumber), - "Coordinator of %s already has a subtask gateway for %d (#%d)", - operatorName, - subtaskIndex, - attemptNumber); - LOG.debug( - "Coordinator of {} registers gateway for subtask {} attempt {}", - operatorName, - subtaskIndex, - attemptNumber); - gateways[subtaskIndex].put(attemptNumber, gateway); - } - - private void unregisterSubtaskGateway(int subtaskIndex, int attemptNumber) { - LOG.debug( - "Coordinator of {} unregisters gateway for subtask {} attempt {}", - operatorName, - subtaskIndex, - attemptNumber); - gateways[subtaskIndex].remove(attemptNumber); - } - - private OperatorCoordinator.SubtaskGateway getSubtaskGateway(int subtaskIndex) { - Preconditions.checkState( - !gateways[subtaskIndex].isEmpty(), - "Coordinator of %s subtask %d is not ready yet to receive events", - operatorName, - subtaskIndex); - return Iterables.getOnlyElement(gateways[subtaskIndex].values()); - } - - private void reset(int subtaskIndex) { - gateways[subtaskIndex].clear(); - } - } - - private static class CoordinatorExecutorThreadFactory - implements ThreadFactory, Thread.UncaughtExceptionHandler { - - private final String coordinatorThreadName; - private final ClassLoader classLoader; - private final Thread.UncaughtExceptionHandler errorHandler; - - @javax.annotation.Nullable private Thread thread; - - CoordinatorExecutorThreadFactory( - final String coordinatorThreadName, final ClassLoader contextClassLoader) { - this(coordinatorThreadName, contextClassLoader, FatalExitExceptionHandler.INSTANCE); - } - - @org.apache.flink.annotation.VisibleForTesting - CoordinatorExecutorThreadFactory( - final String coordinatorThreadName, - final ClassLoader contextClassLoader, - final Thread.UncaughtExceptionHandler errorHandler) { - this.coordinatorThreadName = coordinatorThreadName; - this.classLoader = contextClassLoader; - this.errorHandler = errorHandler; - } - - @Override - public synchronized Thread newThread(@NotNull Runnable runnable) { - thread = new Thread(runnable, coordinatorThreadName); - thread.setContextClassLoader(classLoader); - thread.setUncaughtExceptionHandler(this); - return thread; - } - - @Override - public synchronized void uncaughtException(Thread t, Throwable e) { - errorHandler.uncaughtException(t, e); - } - - boolean isCurrentThreadCoordinatorThread() { - return Thread.currentThread() == thread; - } - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsCoordinatorProvider.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsCoordinatorProvider.java deleted file mode 100644 index 9d7d989c298e..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsCoordinatorProvider.java +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.shuffle; - -import org.apache.flink.annotation.Internal; -import org.apache.flink.runtime.jobgraph.OperatorID; -import org.apache.flink.runtime.operators.coordination.OperatorCoordinator; -import org.apache.flink.runtime.operators.coordination.RecreateOnResetOperatorCoordinator; -import org.apache.iceberg.Schema; -import org.apache.iceberg.SortOrder; - -/** - * DataStatisticsCoordinatorProvider provides the method to create new {@link - * DataStatisticsCoordinator} - */ -@Internal -public class DataStatisticsCoordinatorProvider extends RecreateOnResetOperatorCoordinator.Provider { - - private final String operatorName; - private final Schema schema; - private final SortOrder sortOrder; - private final int downstreamParallelism; - private final StatisticsType type; - private final double closeFileCostWeightPercentage; - - public DataStatisticsCoordinatorProvider( - String operatorName, - OperatorID operatorID, - Schema schema, - SortOrder sortOrder, - int downstreamParallelism, - StatisticsType type, - double closeFileCostWeightPercentage) { - super(operatorID); - this.operatorName = operatorName; - this.schema = schema; - this.sortOrder = sortOrder; - this.downstreamParallelism = downstreamParallelism; - this.type = type; - this.closeFileCostWeightPercentage = closeFileCostWeightPercentage; - } - - @Override - public OperatorCoordinator getCoordinator(OperatorCoordinator.Context context) { - return new DataStatisticsCoordinator( - operatorName, - context, - schema, - sortOrder, - downstreamParallelism, - type, - closeFileCostWeightPercentage); - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsOperator.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsOperator.java deleted file mode 100644 index 59c38b239725..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsOperator.java +++ /dev/null @@ -1,265 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink.shuffle; - -import java.util.Map; -import org.apache.flink.annotation.Internal; -import org.apache.flink.api.common.state.ListState; -import org.apache.flink.api.common.state.ListStateDescriptor; -import org.apache.flink.api.common.typeutils.TypeSerializer; -import org.apache.flink.runtime.operators.coordination.OperatorEvent; -import org.apache.flink.runtime.operators.coordination.OperatorEventGateway; -import org.apache.flink.runtime.operators.coordination.OperatorEventHandler; -import org.apache.flink.runtime.state.StateInitializationContext; -import org.apache.flink.runtime.state.StateSnapshotContext; -import org.apache.flink.streaming.api.operators.AbstractStreamOperator; -import org.apache.flink.streaming.api.operators.OneInputStreamOperator; -import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.Schema; -import org.apache.iceberg.SortKey; -import org.apache.iceberg.SortOrder; -import org.apache.iceberg.StructLike; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.RowDataWrapper; -import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; - -/** - * DataStatisticsOperator collects traffic distribution statistics. A custom partitioner shall be - * attached to the DataStatisticsOperator output. The custom partitioner leverages the statistics to - * shuffle record to improve data clustering while maintaining relative balanced traffic - * distribution to downstream subtasks. - */ -@Internal -public class DataStatisticsOperator extends AbstractStreamOperator - implements OneInputStreamOperator, OperatorEventHandler { - - private static final long serialVersionUID = 1L; - - private final String operatorName; - private final RowDataWrapper rowDataWrapper; - private final SortKey sortKey; - private final OperatorEventGateway operatorEventGateway; - private final int downstreamParallelism; - private final StatisticsType statisticsType; - private final TypeSerializer taskStatisticsSerializer; - private final TypeSerializer globalStatisticsSerializer; - - private transient int parallelism; - private transient int subtaskIndex; - private transient ListState globalStatisticsState; - // current statistics type may be different from the config due to possible - // migration from Map statistics to Sketch statistics when high cardinality detected - private transient volatile StatisticsType taskStatisticsType; - private transient volatile DataStatistics localStatistics; - private transient volatile GlobalStatistics globalStatistics; - - DataStatisticsOperator( - String operatorName, - Schema schema, - SortOrder sortOrder, - OperatorEventGateway operatorEventGateway, - int downstreamParallelism, - StatisticsType statisticsType) { - this.operatorName = operatorName; - this.rowDataWrapper = new RowDataWrapper(FlinkSchemaUtil.convert(schema), schema.asStruct()); - this.sortKey = new SortKey(schema, sortOrder); - this.operatorEventGateway = operatorEventGateway; - this.downstreamParallelism = downstreamParallelism; - this.statisticsType = statisticsType; - - SortKeySerializer sortKeySerializer = new SortKeySerializer(schema, sortOrder); - this.taskStatisticsSerializer = new DataStatisticsSerializer(sortKeySerializer); - this.globalStatisticsSerializer = new GlobalStatisticsSerializer(sortKeySerializer); - } - - @Override - public void 
initializeState(StateInitializationContext context) throws Exception { - this.parallelism = getRuntimeContext().getNumberOfParallelSubtasks(); - this.subtaskIndex = getRuntimeContext().getIndexOfThisSubtask(); - - // Use union state so that new subtasks can also restore global statistics during scale-up. - this.globalStatisticsState = - context - .getOperatorStateStore() - .getUnionListState( - new ListStateDescriptor<>("globalStatisticsState", globalStatisticsSerializer)); - - if (context.isRestored()) { - if (globalStatisticsState.get() == null - || !globalStatisticsState.get().iterator().hasNext()) { - LOG.info( - "Operator {} subtask {} doesn't have global statistics state to restore", - operatorName, - subtaskIndex); - // If Flink deprecates union state in the future, RequestGlobalStatisticsEvent can be - // leveraged to request global statistics from coordinator if new subtasks (scale-up case) - // has nothing to restore from. - } else { - GlobalStatistics restoredStatistics = globalStatisticsState.get().iterator().next(); - LOG.info( - "Operator {} subtask {} restored global statistics state", operatorName, subtaskIndex); - this.globalStatistics = restoredStatistics; - } - - // Always request for new statistics from coordinator upon task initialization. - // There are a few scenarios this is needed - // 1. downstream writer parallelism changed due to rescale. - // 2. coordinator failed to send the aggregated statistics to subtask - // (e.g. due to subtask failure at the time). - // Records may flow before coordinator can respond. Range partitioner should be - // able to continue to operate with potentially suboptimal behavior (in sketch case). - LOG.info( - "Operator {} subtask {} requests new global statistics from coordinator ", - operatorName, - subtaskIndex); - // coordinator can use the hashCode (if available) in the request event to determine - // if operator already has the latest global statistics and respond can be skipped. - // This makes the handling cheap in most situations. - RequestGlobalStatisticsEvent event = - globalStatistics != null - ? 
new RequestGlobalStatisticsEvent(globalStatistics.hashCode()) - : new RequestGlobalStatisticsEvent(); - operatorEventGateway.sendEventToCoordinator(event); - } - - this.taskStatisticsType = StatisticsUtil.collectType(statisticsType, globalStatistics); - this.localStatistics = - StatisticsUtil.createTaskStatistics(taskStatisticsType, parallelism, downstreamParallelism); - } - - @Override - public void open() throws Exception { - if (globalStatistics != null) { - output.collect(new StreamRecord<>(StatisticsOrRecord.fromStatistics(globalStatistics))); - } - } - - @Override - public void handleOperatorEvent(OperatorEvent event) { - Preconditions.checkArgument( - event instanceof StatisticsEvent, - String.format( - "Operator %s subtask %s received unexpected operator event %s", - operatorName, subtaskIndex, event.getClass())); - StatisticsEvent statisticsEvent = (StatisticsEvent) event; - LOG.info( - "Operator {} subtask {} received global data event from coordinator checkpoint {}", - operatorName, - subtaskIndex, - statisticsEvent.checkpointId()); - this.globalStatistics = - StatisticsUtil.deserializeGlobalStatistics( - statisticsEvent.statisticsBytes(), globalStatisticsSerializer); - checkStatisticsTypeMigration(); - // if applyImmediately not set, wait until the checkpoint time to switch - if (statisticsEvent.applyImmediately()) { - output.collect(new StreamRecord<>(StatisticsOrRecord.fromStatistics(globalStatistics))); - } - } - - @Override - public void processElement(StreamRecord streamRecord) { - // collect data statistics - RowData record = streamRecord.getValue(); - StructLike struct = rowDataWrapper.wrap(record); - sortKey.wrap(struct); - localStatistics.add(sortKey); - - checkStatisticsTypeMigration(); - output.collect(new StreamRecord<>(StatisticsOrRecord.fromRecord(record))); - } - - @Override - public void snapshotState(StateSnapshotContext context) throws Exception { - long checkpointId = context.getCheckpointId(); - LOG.info( - "Operator {} subtask {} snapshotting data statistics for checkpoint {}", - operatorName, - subtaskIndex, - checkpointId); - - // Pass global statistics to partitioner so that all the operators refresh statistics - // at same checkpoint barrier - if (globalStatistics != null) { - output.collect(new StreamRecord<>(StatisticsOrRecord.fromStatistics(globalStatistics))); - } - - // Only subtask 0 saves the state so that globalStatisticsState(UnionListState) stores - // an exact copy of globalStatistics - if (globalStatistics != null && getRuntimeContext().getIndexOfThisSubtask() == 0) { - globalStatisticsState.clear(); - LOG.info( - "Operator {} subtask {} saving global statistics to state", operatorName, subtaskIndex); - globalStatisticsState.add(globalStatistics); - LOG.debug( - "Operator {} subtask {} saved global statistics to state: {}", - operatorName, - subtaskIndex, - globalStatistics); - } - - // For now, local statistics are sent to coordinator at checkpoint - LOG.info( - "Operator {} Subtask {} sending local statistics to coordinator for checkpoint {}", - operatorName, - subtaskIndex, - checkpointId); - operatorEventGateway.sendEventToCoordinator( - StatisticsEvent.createTaskStatisticsEvent( - checkpointId, localStatistics, taskStatisticsSerializer)); - - // Recreate the local statistics - localStatistics = - StatisticsUtil.createTaskStatistics(taskStatisticsType, parallelism, downstreamParallelism); - } - - @SuppressWarnings("unchecked") - private void checkStatisticsTypeMigration() { - // only check if the statisticsType config is Auto and 
localStatistics is currently Map type - if (statisticsType == StatisticsType.Auto && localStatistics.type() == StatisticsType.Map) { - Map mapStatistics = (Map) localStatistics.result(); - // convert if local statistics has cardinality over the threshold or - // if received global statistics is already sketch type - if (mapStatistics.size() > SketchUtil.OPERATOR_SKETCH_SWITCH_THRESHOLD - || (globalStatistics != null && globalStatistics.type() == StatisticsType.Sketch)) { - LOG.info( - "Operator {} subtask {} switched local statistics from Map to Sketch.", - operatorName, - subtaskIndex); - this.taskStatisticsType = StatisticsType.Sketch; - this.localStatistics = - StatisticsUtil.createTaskStatistics( - taskStatisticsType, parallelism, downstreamParallelism); - SketchUtil.convertMapToSketch(mapStatistics, localStatistics::add); - } - } - } - - @VisibleForTesting - DataStatistics localStatistics() { - return localStatistics; - } - - @VisibleForTesting - GlobalStatistics globalStatistics() { - return globalStatistics; - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsSerializer.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsSerializer.java deleted file mode 100644 index c25481b3c1f2..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/DataStatisticsSerializer.java +++ /dev/null @@ -1,206 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
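checkStatisticsTypeMigration above switches a subtask from exact Map counting to sketch-based sampling once the local key cardinality crosses a threshold, or once the coordinator has already moved to Sketch statistics. As a rough, JDK-only illustration of that idea (not the Iceberg implementation: the class name, threshold, and reservoir size below are made up), the sketch keeps exact counts in a HashMap and falls back to plain reservoir sampling after migration, replaying each counted key so the sample reflects the weights seen so far.

    import java.util.HashMap;
    import java.util.Map;
    import java.util.concurrent.ThreadLocalRandom;

    // JDK-only sketch of the Map -> sampling migration idea. Exact counts are kept while
    // the key space is small; past a threshold the counts are replayed into a reservoir
    // sample (Algorithm R) and then dropped.
    public class AutoStatisticsSketch {
      private static final int SWITCH_THRESHOLD = 10_000; // made-up value
      private static final int RESERVOIR_SIZE = 1_024;    // made-up value

      private Map<String, Long> counts = new HashMap<>();
      private String[] reservoir; // null until migration
      private long seen;

      void add(String key) {
        if (reservoir == null) {
          counts.merge(key, 1L, Long::sum);
          if (counts.size() > SWITCH_THRESHOLD) {
            migrate();
          }
        } else {
          sample(key);
        }
      }

      private void migrate() {
        reservoir = new String[RESERVOIR_SIZE];
        seen = 0;
        // replay every observed key weighted by its count, then drop the exact counts
        counts.forEach(
            (key, count) -> {
              for (long i = 0; i < count; i++) {
                sample(key);
              }
            });
        counts = null;
      }

      private void sample(String key) {
        seen++;
        if (seen <= RESERVOIR_SIZE) {
          reservoir[(int) (seen - 1)] = key;
        } else {
          long slot = ThreadLocalRandom.current().nextLong(seen);
          if (slot < RESERVOIR_SIZE) {
            reservoir[(int) slot] = key; // keep with probability RESERVOIR_SIZE / seen
          }
        }
      }
    }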
- */ -package org.apache.iceberg.flink.sink.shuffle; - -import java.io.IOException; -import java.util.Map; -import java.util.Objects; -import org.apache.datasketches.memory.Memory; -import org.apache.datasketches.sampling.ReservoirItemsSketch; -import org.apache.flink.annotation.Internal; -import org.apache.flink.api.common.typeutils.CompositeTypeSerializerSnapshot; -import org.apache.flink.api.common.typeutils.TypeSerializer; -import org.apache.flink.api.common.typeutils.TypeSerializerSnapshot; -import org.apache.flink.api.common.typeutils.base.EnumSerializer; -import org.apache.flink.api.common.typeutils.base.LongSerializer; -import org.apache.flink.api.common.typeutils.base.MapSerializer; -import org.apache.flink.core.memory.DataInputView; -import org.apache.flink.core.memory.DataOutputView; -import org.apache.iceberg.SortKey; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; - -@Internal -class DataStatisticsSerializer extends TypeSerializer { - private final TypeSerializer sortKeySerializer; - private final EnumSerializer statisticsTypeSerializer; - private final MapSerializer mapSerializer; - private final SortKeySketchSerializer sketchSerializer; - - DataStatisticsSerializer(TypeSerializer sortKeySerializer) { - this.sortKeySerializer = sortKeySerializer; - this.statisticsTypeSerializer = new EnumSerializer<>(StatisticsType.class); - this.mapSerializer = new MapSerializer<>(sortKeySerializer, LongSerializer.INSTANCE); - this.sketchSerializer = new SortKeySketchSerializer(sortKeySerializer); - } - - @Override - public boolean isImmutableType() { - return false; - } - - @SuppressWarnings("ReferenceEquality") - @Override - public TypeSerializer duplicate() { - TypeSerializer duplicateSortKeySerializer = sortKeySerializer.duplicate(); - return (duplicateSortKeySerializer == sortKeySerializer) - ? this - : new DataStatisticsSerializer(duplicateSortKeySerializer); - } - - @Override - public DataStatistics createInstance() { - return new MapDataStatistics(); - } - - @SuppressWarnings("unchecked") - @Override - public DataStatistics copy(DataStatistics obj) { - StatisticsType statisticsType = obj.type(); - if (statisticsType == StatisticsType.Map) { - MapDataStatistics from = (MapDataStatistics) obj; - Map fromStats = (Map) from.result(); - Map toStats = Maps.newHashMap(fromStats); - return new MapDataStatistics(toStats); - } else if (statisticsType == StatisticsType.Sketch) { - // because ReservoirItemsSketch doesn't expose enough public methods for cloning, - // this implementation adopted the less efficient serialization and deserialization. 
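The copy() branch that continues below clones a sketch by serializing it and heapifying the bytes back, since ReservoirItemsSketch exposes no public deep-copy method. A standalone sketch of that round trip is shown here; it assumes the datasketches-java API already used in this module (the packages of ArrayOfStringsSerDe and Memory moved between datasketches 3.x and later releases, so these imports are an assumption) and substitutes the library's ArrayOfStringsSerDe for Iceberg's SortKeySketchSerializer purely for illustration.

    // Assumed imports for the datasketches-java 3.x package layout; later releases relocate some.
    import org.apache.datasketches.ArrayOfStringsSerDe;
    import org.apache.datasketches.memory.Memory;
    import org.apache.datasketches.sampling.ReservoirItemsSketch;

    public class ReservoirCloneExample {
      public static void main(String[] args) {
        ArrayOfStringsSerDe serDe = new ArrayOfStringsSerDe();
        ReservoirItemsSketch<String> original = ReservoirItemsSketch.newInstance(128);
        original.update("a");
        original.update("b");

        // No public deep-copy API, so round-trip through the serialized form instead.
        byte[] bytes = original.toByteArray(serDe);
        ReservoirItemsSketch<String> clone =
            ReservoirItemsSketch.heapify(Memory.wrap(bytes), serDe);

        System.out.println(clone.getN()); // 2 items observed, independent of the original
      }
    }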
- SketchDataStatistics from = (SketchDataStatistics) obj; - ReservoirItemsSketch fromStats = (ReservoirItemsSketch) from.result(); - byte[] bytes = fromStats.toByteArray(sketchSerializer); - Memory memory = Memory.wrap(bytes); - ReservoirItemsSketch toStats = - ReservoirItemsSketch.heapify(memory, sketchSerializer); - return new SketchDataStatistics(toStats); - } else { - throw new IllegalArgumentException("Unsupported data statistics type: " + statisticsType); - } - } - - @Override - public DataStatistics copy(DataStatistics from, DataStatistics reuse) { - // not much benefit to reuse - return copy(from); - } - - @Override - public int getLength() { - return -1; - } - - @SuppressWarnings("unchecked") - @Override - public void serialize(DataStatistics obj, DataOutputView target) throws IOException { - StatisticsType statisticsType = obj.type(); - statisticsTypeSerializer.serialize(obj.type(), target); - if (statisticsType == StatisticsType.Map) { - Map mapStatistics = (Map) obj.result(); - mapSerializer.serialize(mapStatistics, target); - } else if (statisticsType == StatisticsType.Sketch) { - ReservoirItemsSketch sketch = (ReservoirItemsSketch) obj.result(); - byte[] sketchBytes = sketch.toByteArray(sketchSerializer); - target.writeInt(sketchBytes.length); - target.write(sketchBytes); - } else { - throw new IllegalArgumentException("Unsupported data statistics type: " + statisticsType); - } - } - - @Override - public DataStatistics deserialize(DataInputView source) throws IOException { - StatisticsType statisticsType = statisticsTypeSerializer.deserialize(source); - if (statisticsType == StatisticsType.Map) { - Map mapStatistics = mapSerializer.deserialize(source); - return new MapDataStatistics(mapStatistics); - } else if (statisticsType == StatisticsType.Sketch) { - int numBytes = source.readInt(); - byte[] sketchBytes = new byte[numBytes]; - source.read(sketchBytes); - Memory sketchMemory = Memory.wrap(sketchBytes); - ReservoirItemsSketch sketch = - ReservoirItemsSketch.heapify(sketchMemory, sketchSerializer); - return new SketchDataStatistics(sketch); - } else { - throw new IllegalArgumentException("Unsupported data statistics type: " + statisticsType); - } - } - - @Override - public DataStatistics deserialize(DataStatistics reuse, DataInputView source) throws IOException { - // not much benefit to reuse - return deserialize(source); - } - - @Override - public void copy(DataInputView source, DataOutputView target) throws IOException { - serialize(deserialize(source), target); - } - - @Override - public boolean equals(Object obj) { - if (!(obj instanceof DataStatisticsSerializer)) { - return false; - } - - DataStatisticsSerializer other = (DataStatisticsSerializer) obj; - return Objects.equals(sortKeySerializer, other.sortKeySerializer); - } - - @Override - public int hashCode() { - return sortKeySerializer.hashCode(); - } - - @Override - public TypeSerializerSnapshot snapshotConfiguration() { - return new DataStatisticsSerializerSnapshot(this); - } - - public static class DataStatisticsSerializerSnapshot - extends CompositeTypeSerializerSnapshot { - private static final int CURRENT_VERSION = 1; - - /** Constructor for read instantiation. 
*/ - @SuppressWarnings({"unused", "checkstyle:RedundantModifier"}) - public DataStatisticsSerializerSnapshot() { - super(DataStatisticsSerializer.class); - } - - @SuppressWarnings("checkstyle:RedundantModifier") - public DataStatisticsSerializerSnapshot(DataStatisticsSerializer serializer) { - super(serializer); - } - - @Override - protected int getCurrentOuterSnapshotVersion() { - return CURRENT_VERSION; - } - - @Override - protected TypeSerializer[] getNestedSerializers(DataStatisticsSerializer outerSerializer) { - return new TypeSerializer[] {outerSerializer.sortKeySerializer}; - } - - @Override - protected DataStatisticsSerializer createOuterSerializerWithNestedSerializers( - TypeSerializer[] nestedSerializers) { - SortKeySerializer sortKeySerializer = (SortKeySerializer) nestedSerializers[0]; - return new DataStatisticsSerializer(sortKeySerializer); - } - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/GlobalStatistics.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/GlobalStatistics.java deleted file mode 100644 index 50ec23e9f7a2..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/GlobalStatistics.java +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.shuffle; - -import java.util.Arrays; -import org.apache.iceberg.SortKey; -import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; -import org.apache.iceberg.relocated.com.google.common.base.Objects; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; - -/** - * This is used by {@link RangePartitioner} for guiding range partitioning. This is what is sent to - * the operator subtasks. For sketch statistics, it only contains much smaller range bounds than the - * complete raw samples. 
- */ -class GlobalStatistics { - private final long checkpointId; - private final StatisticsType type; - private final MapAssignment mapAssignment; - private final SortKey[] rangeBounds; - - private transient Integer hashCode; - - GlobalStatistics( - long checkpointId, StatisticsType type, MapAssignment mapAssignment, SortKey[] rangeBounds) { - Preconditions.checkArgument( - (mapAssignment != null && rangeBounds == null) - || (mapAssignment == null && rangeBounds != null), - "Invalid key assignment or range bounds: both are non-null or null"); - this.checkpointId = checkpointId; - this.type = type; - this.mapAssignment = mapAssignment; - this.rangeBounds = rangeBounds; - } - - static GlobalStatistics fromMapAssignment(long checkpointId, MapAssignment mapAssignment) { - return new GlobalStatistics(checkpointId, StatisticsType.Map, mapAssignment, null); - } - - static GlobalStatistics fromRangeBounds(long checkpointId, SortKey[] rangeBounds) { - return new GlobalStatistics(checkpointId, StatisticsType.Sketch, null, rangeBounds); - } - - @Override - public String toString() { - return MoreObjects.toStringHelper(this) - .add("checkpointId", checkpointId) - .add("type", type) - .add("mapAssignment", mapAssignment) - .add("rangeBounds", rangeBounds) - .toString(); - } - - @Override - public boolean equals(Object o) { - if (this == o) { - return true; - } - - if (!(o instanceof GlobalStatistics)) { - return false; - } - - GlobalStatistics other = (GlobalStatistics) o; - return Objects.equal(checkpointId, other.checkpointId) - && Objects.equal(type, other.type) - && Objects.equal(mapAssignment, other.mapAssignment()) - && Arrays.equals(rangeBounds, other.rangeBounds()); - } - - @Override - public int hashCode() { - // implemented caching because coordinator can call the hashCode many times. - // when subtasks request statistics refresh upon initialization for reconciliation purpose, - // hashCode is used to check if there is any difference btw coordinator and operator state. - if (hashCode == null) { - this.hashCode = Objects.hashCode(checkpointId, type, mapAssignment, rangeBounds); - } - - return hashCode; - } - - long checkpointId() { - return checkpointId; - } - - StatisticsType type() { - return type; - } - - MapAssignment mapAssignment() { - return mapAssignment; - } - - SortKey[] rangeBounds() { - return rangeBounds; - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/GlobalStatisticsSerializer.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/GlobalStatisticsSerializer.java deleted file mode 100644 index dfb947a84a0c..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/GlobalStatisticsSerializer.java +++ /dev/null @@ -1,201 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. 
See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.shuffle; - -import java.io.IOException; -import java.util.Arrays; -import java.util.List; -import java.util.Map; -import java.util.Objects; -import org.apache.flink.api.common.typeutils.CompositeTypeSerializerSnapshot; -import org.apache.flink.api.common.typeutils.TypeSerializer; -import org.apache.flink.api.common.typeutils.TypeSerializerSnapshot; -import org.apache.flink.api.common.typeutils.base.EnumSerializer; -import org.apache.flink.api.common.typeutils.base.IntSerializer; -import org.apache.flink.api.common.typeutils.base.ListSerializer; -import org.apache.flink.api.common.typeutils.base.LongSerializer; -import org.apache.flink.core.memory.DataInputView; -import org.apache.flink.core.memory.DataOutputView; -import org.apache.iceberg.SortKey; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; - -class GlobalStatisticsSerializer extends TypeSerializer { - private final TypeSerializer sortKeySerializer; - private final EnumSerializer statisticsTypeSerializer; - private final ListSerializer rangeBoundsSerializer; - private final ListSerializer intsSerializer; - private final ListSerializer longsSerializer; - - GlobalStatisticsSerializer(TypeSerializer sortKeySerializer) { - this.sortKeySerializer = sortKeySerializer; - this.statisticsTypeSerializer = new EnumSerializer<>(StatisticsType.class); - this.rangeBoundsSerializer = new ListSerializer<>(sortKeySerializer); - this.intsSerializer = new ListSerializer<>(IntSerializer.INSTANCE); - this.longsSerializer = new ListSerializer<>(LongSerializer.INSTANCE); - } - - @Override - public boolean isImmutableType() { - return false; - } - - @Override - public TypeSerializer duplicate() { - return new GlobalStatisticsSerializer(sortKeySerializer); - } - - @Override - public GlobalStatistics createInstance() { - return GlobalStatistics.fromRangeBounds(0L, new SortKey[0]); - } - - @Override - public GlobalStatistics copy(GlobalStatistics from) { - return new GlobalStatistics( - from.checkpointId(), from.type(), from.mapAssignment(), from.rangeBounds()); - } - - @Override - public GlobalStatistics copy(GlobalStatistics from, GlobalStatistics reuse) { - // no benefit of reuse - return copy(from); - } - - @Override - public int getLength() { - return -1; - } - - @Override - public void serialize(GlobalStatistics record, DataOutputView target) throws IOException { - target.writeLong(record.checkpointId()); - statisticsTypeSerializer.serialize(record.type(), target); - if (record.type() == StatisticsType.Map) { - MapAssignment mapAssignment = record.mapAssignment(); - target.writeInt(mapAssignment.numPartitions()); - target.writeInt(mapAssignment.keyAssignments().size()); - for (Map.Entry entry : mapAssignment.keyAssignments().entrySet()) { - sortKeySerializer.serialize(entry.getKey(), target); - KeyAssignment keyAssignment = entry.getValue(); - intsSerializer.serialize(keyAssignment.assignedSubtasks(), target); - longsSerializer.serialize(keyAssignment.subtaskWeightsWithCloseFileCost(), target); - target.writeLong(keyAssignment.closeFileCostWeight()); - } - } else { - rangeBoundsSerializer.serialize(Arrays.asList(record.rangeBounds()), target); - } - } - - @Override - public GlobalStatistics deserialize(DataInputView source) throws IOException { - long checkpointId = source.readLong(); - StatisticsType type = statisticsTypeSerializer.deserialize(source); - if (type == StatisticsType.Map) 
{ - int numPartitions = source.readInt(); - int mapSize = source.readInt(); - Map keyAssignments = Maps.newHashMapWithExpectedSize(mapSize); - for (int i = 0; i < mapSize; ++i) { - SortKey sortKey = sortKeySerializer.deserialize(source); - List assignedSubtasks = intsSerializer.deserialize(source); - List subtaskWeightsWithCloseFileCost = longsSerializer.deserialize(source); - long closeFileCostWeight = source.readLong(); - keyAssignments.put( - sortKey, - new KeyAssignment( - assignedSubtasks, subtaskWeightsWithCloseFileCost, closeFileCostWeight)); - } - - return GlobalStatistics.fromMapAssignment( - checkpointId, new MapAssignment(numPartitions, keyAssignments)); - } else { - List sortKeys = rangeBoundsSerializer.deserialize(source); - SortKey[] rangeBounds = new SortKey[sortKeys.size()]; - return GlobalStatistics.fromRangeBounds(checkpointId, sortKeys.toArray(rangeBounds)); - } - } - - @Override - public GlobalStatistics deserialize(GlobalStatistics reuse, DataInputView source) - throws IOException { - // not much benefit to reuse - return deserialize(source); - } - - @Override - public void copy(DataInputView source, DataOutputView target) throws IOException { - serialize(deserialize(source), target); - } - - @Override - public boolean equals(Object obj) { - if (this == obj) { - return true; - } - - if (obj == null || getClass() != obj.getClass()) { - return false; - } - - GlobalStatisticsSerializer other = (GlobalStatisticsSerializer) obj; - return Objects.equals(sortKeySerializer, other.sortKeySerializer); - } - - @Override - public int hashCode() { - return sortKeySerializer.hashCode(); - } - - @Override - public TypeSerializerSnapshot snapshotConfiguration() { - return new GlobalStatisticsSerializerSnapshot(this); - } - - public static class GlobalStatisticsSerializerSnapshot - extends CompositeTypeSerializerSnapshot { - private static final int CURRENT_VERSION = 1; - - /** Constructor for read instantiation. */ - @SuppressWarnings({"unused", "checkstyle:RedundantModifier"}) - public GlobalStatisticsSerializerSnapshot() { - super(GlobalStatisticsSerializer.class); - } - - @SuppressWarnings("checkstyle:RedundantModifier") - public GlobalStatisticsSerializerSnapshot(GlobalStatisticsSerializer serializer) { - super(serializer); - } - - @Override - protected int getCurrentOuterSnapshotVersion() { - return CURRENT_VERSION; - } - - @Override - protected TypeSerializer[] getNestedSerializers(GlobalStatisticsSerializer outerSerializer) { - return new TypeSerializer[] {outerSerializer.sortKeySerializer}; - } - - @Override - protected GlobalStatisticsSerializer createOuterSerializerWithNestedSerializers( - TypeSerializer[] nestedSerializers) { - SortKeySerializer sortKeySerializer = (SortKeySerializer) nestedSerializers[0]; - return new GlobalStatisticsSerializer(sortKeySerializer); - } - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/KeyAssignment.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/KeyAssignment.java deleted file mode 100644 index 781bcc646023..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/KeyAssignment.java +++ /dev/null @@ -1,155 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.shuffle; - -import java.util.Arrays; -import java.util.List; -import java.util.Objects; -import java.util.concurrent.ThreadLocalRandom; -import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; - -/** Subtask assignment for a key for Map statistics based */ -class KeyAssignment { - private final List assignedSubtasks; - private final List subtaskWeightsWithCloseFileCost; - private final long closeFileCostWeight; - private final long[] subtaskWeightsExcludingCloseCost; - private final long keyWeight; - private final long[] cumulativeWeights; - - /** - * @param assignedSubtasks assigned subtasks for this key. It could be a single subtask. It could - * also be multiple subtasks if the key has heavy weight that should be handled by multiple - * subtasks. - * @param subtaskWeightsWithCloseFileCost assigned weight for each subtask. E.g., if the keyWeight - * is 27 and the key is assigned to 3 subtasks, subtaskWeights could contain values as [10, - * 10, 7] for target weight of 10 per subtask. - */ - KeyAssignment( - List assignedSubtasks, - List subtaskWeightsWithCloseFileCost, - long closeFileCostWeight) { - Preconditions.checkArgument( - assignedSubtasks != null && !assignedSubtasks.isEmpty(), - "Invalid assigned subtasks: null or empty"); - Preconditions.checkArgument( - subtaskWeightsWithCloseFileCost != null && !subtaskWeightsWithCloseFileCost.isEmpty(), - "Invalid assigned subtasks weights: null or empty"); - Preconditions.checkArgument( - assignedSubtasks.size() == subtaskWeightsWithCloseFileCost.size(), - "Invalid assignment: size mismatch (tasks length = %s, weights length = %s)", - assignedSubtasks.size(), - subtaskWeightsWithCloseFileCost.size()); - subtaskWeightsWithCloseFileCost.forEach( - weight -> - Preconditions.checkArgument( - weight > closeFileCostWeight, - "Invalid weight: should be larger than close file cost: weight = %s, close file cost = %s", - weight, - closeFileCostWeight)); - - this.assignedSubtasks = assignedSubtasks; - this.subtaskWeightsWithCloseFileCost = subtaskWeightsWithCloseFileCost; - this.closeFileCostWeight = closeFileCostWeight; - // Exclude the close file cost for key routing - this.subtaskWeightsExcludingCloseCost = - subtaskWeightsWithCloseFileCost.stream() - .mapToLong(weightWithCloseFileCost -> weightWithCloseFileCost - closeFileCostWeight) - .toArray(); - this.keyWeight = Arrays.stream(subtaskWeightsExcludingCloseCost).sum(); - this.cumulativeWeights = new long[subtaskWeightsExcludingCloseCost.length]; - long cumulativeWeight = 0; - for (int i = 0; i < subtaskWeightsExcludingCloseCost.length; ++i) { - cumulativeWeight += subtaskWeightsExcludingCloseCost[i]; - cumulativeWeights[i] = cumulativeWeight; - } - } - - List assignedSubtasks() { - return assignedSubtasks; - } - - List subtaskWeightsWithCloseFileCost() { - return 
subtaskWeightsWithCloseFileCost; - } - - long closeFileCostWeight() { - return closeFileCostWeight; - } - - long[] subtaskWeightsExcludingCloseCost() { - return subtaskWeightsExcludingCloseCost; - } - - /** - * Select a subtask for the key. - * - * @return subtask id - */ - int select() { - if (assignedSubtasks.size() == 1) { - // only choice. no need to run random number generator. - return assignedSubtasks.get(0); - } else { - long randomNumber = ThreadLocalRandom.current().nextLong(keyWeight); - int index = Arrays.binarySearch(cumulativeWeights, randomNumber); - // choose the subtask where randomNumber < cumulativeWeights[pos]. - // this works regardless whether index is negative or not. - int position = Math.abs(index + 1); - Preconditions.checkState( - position < assignedSubtasks.size(), - "Invalid selected position: out of range. key weight = %s, random number = %s, cumulative weights array = %s", - keyWeight, - randomNumber, - cumulativeWeights); - return assignedSubtasks.get(position); - } - } - - @Override - public int hashCode() { - return Objects.hash(assignedSubtasks, subtaskWeightsWithCloseFileCost, closeFileCostWeight); - } - - @Override - public boolean equals(Object o) { - if (this == o) { - return true; - } - - if (o == null || getClass() != o.getClass()) { - return false; - } - - KeyAssignment that = (KeyAssignment) o; - return Objects.equals(assignedSubtasks, that.assignedSubtasks) - && Objects.equals(subtaskWeightsWithCloseFileCost, that.subtaskWeightsWithCloseFileCost) - && closeFileCostWeight == that.closeFileCostWeight; - } - - @Override - public String toString() { - return MoreObjects.toStringHelper(this) - .add("assignedSubtasks", assignedSubtasks) - .add("subtaskWeightsWithCloseFileCost", subtaskWeightsWithCloseFileCost) - .add("closeFileCostWeight", closeFileCostWeight) - .toString(); - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapAssignment.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapAssignment.java deleted file mode 100644 index 9d8167460a1b..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapAssignment.java +++ /dev/null @@ -1,242 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
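KeyAssignment.select() above picks a subtask with probability proportional to its share of the key weight: it draws a random number below the total weight and binary-searches a cumulative-weight array, and Math.abs(index + 1) maps both a hit and a miss from Arrays.binarySearch onto the owning bucket. A tiny JDK-only example with hypothetical weights [10, 10, 7]:

    import java.util.Arrays;
    import java.util.concurrent.ThreadLocalRandom;

    // Weighted pick over cumulative weights, mirroring the removed KeyAssignment.select().
    // Weights 10, 10, 7 give cumulative bounds [10, 20, 27]; bucket i covers the half-open
    // range [cumulative[i - 1], cumulative[i]).
    public class WeightedPickExample {
      public static void main(String[] args) {
        long[] cumulativeWeights = {10L, 20L, 27L};
        long totalWeight = cumulativeWeights[cumulativeWeights.length - 1];

        long r = ThreadLocalRandom.current().nextLong(totalWeight);
        int index = Arrays.binarySearch(cumulativeWeights, r);
        // Miss: binarySearch returns -(insertionPoint) - 1, so index + 1 == -insertionPoint
        // and the abs() recovers the insertion point. Exact hit: r equals
        // cumulativeWeights[index], which belongs to the next bucket, so index + 1 is correct.
        int bucket = Math.abs(index + 1);
        System.out.println("random=" + r + " -> bucket " + bucket);
      }
    }

Splitting a heavy key across several subtasks this way keeps traffic roughly balanced while light keys still land on a single writer.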
- */ -package org.apache.iceberg.flink.sink.shuffle; - -import java.util.Comparator; -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.NavigableMap; -import org.apache.iceberg.SortKey; -import org.apache.iceberg.StructLike; -import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; -import org.apache.iceberg.relocated.com.google.common.base.Objects; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.util.Pair; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** Key assignment to subtasks for Map statistics. */ -class MapAssignment { - private static final Logger LOG = LoggerFactory.getLogger(MapAssignment.class); - - private final int numPartitions; - private final Map keyAssignments; - - MapAssignment(int numPartitions, Map keyAssignments) { - Preconditions.checkArgument(keyAssignments != null, "Invalid key assignments: null"); - this.numPartitions = numPartitions; - this.keyAssignments = keyAssignments; - } - - static MapAssignment fromKeyFrequency( - int numPartitions, - Map mapStatistics, - double closeFileCostWeightPercentage, - Comparator comparator) { - return new MapAssignment( - numPartitions, - assignment(numPartitions, mapStatistics, closeFileCostWeightPercentage, comparator)); - } - - @Override - public int hashCode() { - return Objects.hashCode(numPartitions, keyAssignments); - } - - @Override - public boolean equals(Object o) { - if (this == o) { - return true; - } - - if (o == null || getClass() != o.getClass()) { - return false; - } - - MapAssignment that = (MapAssignment) o; - return numPartitions == that.numPartitions && keyAssignments.equals(that.keyAssignments); - } - - @Override - public String toString() { - return MoreObjects.toStringHelper(this) - .add("numPartitions", numPartitions) - .add("keyAssignments", keyAssignments) - .toString(); - } - - int numPartitions() { - return numPartitions; - } - - Map keyAssignments() { - return keyAssignments; - } - - /** - * Returns assignment summary for every subtask. - * - * @return assignment summary for every subtask. Key is subtaskId. 
Value pair is (weight assigned - * to the subtask, number of keys assigned to the subtask) - */ - Map> assignmentInfo() { - Map> assignmentInfo = Maps.newTreeMap(); - keyAssignments.forEach( - (key, keyAssignment) -> { - for (int i = 0; i < keyAssignment.assignedSubtasks().size(); ++i) { - int subtaskId = keyAssignment.assignedSubtasks().get(i); - long subtaskWeight = keyAssignment.subtaskWeightsExcludingCloseCost()[i]; - Pair oldValue = assignmentInfo.getOrDefault(subtaskId, Pair.of(0L, 0)); - assignmentInfo.put( - subtaskId, Pair.of(oldValue.first() + subtaskWeight, oldValue.second() + 1)); - } - }); - - return assignmentInfo; - } - - static Map assignment( - int numPartitions, - Map mapStatistics, - double closeFileCostWeightPercentage, - Comparator comparator) { - mapStatistics.forEach( - (key, value) -> - Preconditions.checkArgument( - value > 0, "Invalid statistics: weight is 0 for key %s", key)); - - long totalWeight = mapStatistics.values().stream().mapToLong(l -> l).sum(); - double targetWeightPerSubtask = ((double) totalWeight) / numPartitions; - long closeFileCostWeight = - (long) Math.ceil(targetWeightPerSubtask * closeFileCostWeightPercentage / 100); - - NavigableMap sortedStatsWithCloseFileCost = Maps.newTreeMap(comparator); - mapStatistics.forEach( - (k, v) -> { - int estimatedSplits = (int) Math.ceil(v / targetWeightPerSubtask); - long estimatedCloseFileCost = closeFileCostWeight * estimatedSplits; - sortedStatsWithCloseFileCost.put(k, v + estimatedCloseFileCost); - }); - - long totalWeightWithCloseFileCost = - sortedStatsWithCloseFileCost.values().stream().mapToLong(l -> l).sum(); - long targetWeightPerSubtaskWithCloseFileCost = - (long) Math.ceil(((double) totalWeightWithCloseFileCost) / numPartitions); - return buildAssignment( - numPartitions, - sortedStatsWithCloseFileCost, - targetWeightPerSubtaskWithCloseFileCost, - closeFileCostWeight); - } - - private static Map buildAssignment( - int numPartitions, - NavigableMap sortedStatistics, - long targetWeightPerSubtask, - long closeFileCostWeight) { - Map assignmentMap = - Maps.newHashMapWithExpectedSize(sortedStatistics.size()); - Iterator mapKeyIterator = sortedStatistics.keySet().iterator(); - int subtaskId = 0; - SortKey currentKey = null; - long keyRemainingWeight = 0L; - long subtaskRemainingWeight = targetWeightPerSubtask; - List assignedSubtasks = Lists.newArrayList(); - List subtaskWeights = Lists.newArrayList(); - while (mapKeyIterator.hasNext() || currentKey != null) { - // This should never happen because target weight is calculated using ceil function. - if (subtaskId >= numPartitions) { - LOG.error( - "Internal algorithm error: exhausted subtasks with unassigned keys left. 
number of partitions: {}, " - + "target weight per subtask: {}, close file cost in weight: {}, data statistics: {}", - numPartitions, - targetWeightPerSubtask, - closeFileCostWeight, - sortedStatistics); - throw new IllegalStateException( - "Internal algorithm error: exhausted subtasks with unassigned keys left"); - } - - if (currentKey == null) { - currentKey = mapKeyIterator.next(); - keyRemainingWeight = sortedStatistics.get(currentKey); - } - - assignedSubtasks.add(subtaskId); - if (keyRemainingWeight < subtaskRemainingWeight) { - // assign the remaining weight of the key to the current subtask - subtaskWeights.add(keyRemainingWeight); - subtaskRemainingWeight -= keyRemainingWeight; - keyRemainingWeight = 0L; - } else { - // filled up the current subtask - long assignedWeight = subtaskRemainingWeight; - keyRemainingWeight -= subtaskRemainingWeight; - - // If assigned weight is less than close file cost, pad it up with close file cost. - // This might cause the subtask assigned weight over the target weight. - // But it should be no more than one close file cost. Small skew is acceptable. - if (assignedWeight <= closeFileCostWeight) { - long paddingWeight = Math.min(keyRemainingWeight, closeFileCostWeight); - keyRemainingWeight -= paddingWeight; - assignedWeight += paddingWeight; - } - - subtaskWeights.add(assignedWeight); - // move on to the next subtask - subtaskId += 1; - subtaskRemainingWeight = targetWeightPerSubtask; - } - - Preconditions.checkState( - assignedSubtasks.size() == subtaskWeights.size(), - "List size mismatch: assigned subtasks = %s, subtask weights = %s", - assignedSubtasks, - subtaskWeights); - - // If the remaining key weight is smaller than the close file cost, simply skip the residual - // as it doesn't make sense to assign a weight smaller than close file cost to a new subtask. - // this might lead to some inaccuracy in weight calculation. E.g., assuming the key weight is - // 2 and close file cost is 2. key weight with close cost is 4. Let's assume the previous - // task has a weight of 3 available. So weight of 3 for this key is assigned to the task and - // the residual weight of 1 is dropped. Then the routing weight for this key is 1 (minus the - // close file cost), which is inaccurate as the true key weight should be 2. - // Again, this greedy algorithm is not intended to be perfect. Some small inaccuracy is - // expected and acceptable. Traffic distribution should still be balanced. - if (keyRemainingWeight > 0 && keyRemainingWeight <= closeFileCostWeight) { - keyRemainingWeight = 0; - } - - if (keyRemainingWeight == 0) { - // finishing up the assignment for the current key - KeyAssignment keyAssignment = - new KeyAssignment(assignedSubtasks, subtaskWeights, closeFileCostWeight); - assignmentMap.put(currentKey, keyAssignment); - assignedSubtasks = Lists.newArrayList(); - subtaskWeights = Lists.newArrayList(); - currentKey = null; - } - } - - return assignmentMap; - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapDataStatistics.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapDataStatistics.java deleted file mode 100644 index 05b943f6046f..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapDataStatistics.java +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
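The assignment() math above derives a per-subtask target weight, turns the close-file cost percentage into an absolute weight, and pads each key by that cost once per estimated split before the greedy packing loop. A small worked example with hypothetical numbers (total weight 100, 4 subtasks, 2% close-file cost, one hot key of weight 27) mirrors those formulas:

    // Hypothetical numbers illustrating the weight bookkeeping of the removed assignment().
    public class AssignmentWeightExample {
      public static void main(String[] args) {
        long totalWeight = 100L;
        int numPartitions = 4;
        double closeFileCostWeightPercentage = 2.0;

        double targetWeightPerSubtask = ((double) totalWeight) / numPartitions; // 25.0
        long closeFileCostWeight =
            (long) Math.ceil(targetWeightPerSubtask * closeFileCostWeightPercentage / 100); // 1

        long hotKeyWeight = 27L;
        int estimatedSplits = (int) Math.ceil(hotKeyWeight / targetWeightPerSubtask); // 2
        long weightWithCloseCost = hotKeyWeight + closeFileCostWeight * estimatedSplits; // 29

        System.out.printf(
            "target=%.1f closeCost=%d hotKeyWeightWithCost=%d%n",
            targetWeightPerSubtask, closeFileCostWeight, weightWithCloseCost);
      }
    }

Padding by one close-file cost per estimated split slightly inflates the hot key's weight, which is the small, bounded skew the surrounding comments describe as acceptable.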
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.shuffle; - -import java.util.Map; -import org.apache.iceberg.SortKey; -import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; -import org.apache.iceberg.relocated.com.google.common.base.Objects; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; - -/** MapDataStatistics uses map to count key frequency */ -class MapDataStatistics implements DataStatistics { - private final Map keyFrequency; - - MapDataStatistics() { - this.keyFrequency = Maps.newHashMap(); - } - - MapDataStatistics(Map keyFrequency) { - this.keyFrequency = keyFrequency; - } - - @Override - public StatisticsType type() { - return StatisticsType.Map; - } - - @Override - public boolean isEmpty() { - return keyFrequency.isEmpty(); - } - - @Override - public void add(SortKey sortKey) { - if (keyFrequency.containsKey(sortKey)) { - keyFrequency.merge(sortKey, 1L, Long::sum); - } else { - // clone the sort key before adding to map because input sortKey object can be reused - SortKey copiedKey = sortKey.copy(); - keyFrequency.put(copiedKey, 1L); - } - } - - @Override - public Object result() { - return keyFrequency; - } - - @Override - public String toString() { - return MoreObjects.toStringHelper(this).add("map", keyFrequency).toString(); - } - - @Override - public boolean equals(Object o) { - if (this == o) { - return true; - } - - if (!(o instanceof MapDataStatistics)) { - return false; - } - - MapDataStatistics other = (MapDataStatistics) o; - return Objects.equal(keyFrequency, other.keyFrequency); - } - - @Override - public int hashCode() { - return Objects.hashCode(keyFrequency); - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapRangePartitioner.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapRangePartitioner.java deleted file mode 100644 index f36a078c94e0..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/MapRangePartitioner.java +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. 
See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.shuffle; - -import java.util.concurrent.TimeUnit; -import org.apache.flink.api.common.functions.Partitioner; -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.Schema; -import org.apache.iceberg.SortKey; -import org.apache.iceberg.SortOrder; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.RowDataWrapper; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Internal partitioner implementation that supports MapDataStatistics, which is typically used for - * low-cardinality use cases. While MapDataStatistics can keep accurate counters, it can't be used - * for high-cardinality use cases. Otherwise, the memory footprint is too high. - * - *

    It is a greedy algorithm for bin packing. With close file cost, the calculation isn't always - * precise when calculating close cost for every file, target weight per subtask, padding residual - * weight, assigned weight without close cost. - * - *
    All actions should be executed in a single Flink mailbox thread. So there is no need to make - * it thread safe. - */ -class MapRangePartitioner implements Partitioner { - private static final Logger LOG = LoggerFactory.getLogger(MapRangePartitioner.class); - - private final RowDataWrapper rowDataWrapper; - private final SortKey sortKey; - private final MapAssignment mapAssignment; - - // Counter that tracks how many times a new key encountered - // where there is no traffic statistics learned about it. - private long newSortKeyCounter; - private long lastNewSortKeyLogTimeMilli; - - MapRangePartitioner(Schema schema, SortOrder sortOrder, MapAssignment mapAssignment) { - this.rowDataWrapper = new RowDataWrapper(FlinkSchemaUtil.convert(schema), schema.asStruct()); - this.sortKey = new SortKey(schema, sortOrder); - this.mapAssignment = mapAssignment; - this.newSortKeyCounter = 0; - this.lastNewSortKeyLogTimeMilli = System.currentTimeMillis(); - } - - @Override - public int partition(RowData row, int numPartitions) { - // reuse the sortKey and rowDataWrapper - sortKey.wrap(rowDataWrapper.wrap(row)); - KeyAssignment keyAssignment = mapAssignment.keyAssignments().get(sortKey); - - int partition; - if (keyAssignment == null) { - LOG.trace( - "Encountered new sort key: {}. Fall back to round robin as statistics not learned yet.", - sortKey); - // Ideally unknownKeyCounter should be published as a counter metric. - // It seems difficult to pass in MetricGroup into the partitioner. - // Just log an INFO message every minute. - newSortKeyCounter += 1; - long now = System.currentTimeMillis(); - if (now - lastNewSortKeyLogTimeMilli > TimeUnit.MINUTES.toMillis(1)) { - LOG.info( - "Encounter new sort keys {} times. Fall back to round robin as statistics not learned yet", - newSortKeyCounter); - lastNewSortKeyLogTimeMilli = now; - newSortKeyCounter = 0; - } - partition = (int) (newSortKeyCounter % numPartitions); - } else { - partition = keyAssignment.select(); - } - - return RangePartitioner.adjustPartitionWithRescale( - partition, mapAssignment.numPartitions(), numPartitions); - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/RangePartitioner.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/RangePartitioner.java deleted file mode 100644 index 6608b938f5a8..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/RangePartitioner.java +++ /dev/null @@ -1,110 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink.shuffle; - -import java.util.Random; -import java.util.concurrent.atomic.AtomicLong; -import org.apache.flink.annotation.Internal; -import org.apache.flink.api.common.functions.Partitioner; -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.DistributionMode; -import org.apache.iceberg.Schema; -import org.apache.iceberg.SortOrder; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** This custom partitioner implements the {@link DistributionMode#RANGE} for Flink sink. */ -@Internal -public class RangePartitioner implements Partitioner { - private static final Logger LOG = LoggerFactory.getLogger(RangePartitioner.class); - - private final Schema schema; - private final SortOrder sortOrder; - - private transient AtomicLong roundRobinCounter; - private transient Partitioner delegatePartitioner; - - public RangePartitioner(Schema schema, SortOrder sortOrder) { - this.schema = schema; - this.sortOrder = sortOrder; - } - - @Override - public int partition(StatisticsOrRecord wrapper, int numPartitions) { - if (wrapper.hasStatistics()) { - this.delegatePartitioner = delegatePartitioner(wrapper.statistics()); - return (int) (roundRobinCounter(numPartitions).getAndIncrement() % numPartitions); - } else { - if (delegatePartitioner != null) { - return delegatePartitioner.partition(wrapper.record(), numPartitions); - } else { - int partition = (int) (roundRobinCounter(numPartitions).getAndIncrement() % numPartitions); - LOG.trace("Statistics not available. Round robin to partition {}", partition); - return partition; - } - } - } - - private AtomicLong roundRobinCounter(int numPartitions) { - if (roundRobinCounter == null) { - // randomize the starting point to avoid synchronization across subtasks - this.roundRobinCounter = new AtomicLong(new Random().nextInt(numPartitions)); - } - - return roundRobinCounter; - } - - private Partitioner delegatePartitioner(GlobalStatistics statistics) { - if (statistics.type() == StatisticsType.Map) { - return new MapRangePartitioner(schema, sortOrder, statistics.mapAssignment()); - } else if (statistics.type() == StatisticsType.Sketch) { - return new SketchRangePartitioner(schema, sortOrder, statistics.rangeBounds()); - } else { - throw new IllegalArgumentException( - String.format("Invalid statistics type: %s. Should be Map or Sketch", statistics.type())); - } - } - - /** - * Util method that handles rescale (write parallelism / numPartitions change). - * - * @param partition partition caculated based on the existing statistics - * @param numPartitionsStatsCalculation number of partitions when the assignment was calculated - * based on - * @param numPartitions current number of partitions - * @return adjusted partition if necessary. - */ - static int adjustPartitionWithRescale( - int partition, int numPartitionsStatsCalculation, int numPartitions) { - if (numPartitionsStatsCalculation <= numPartitions) { - // no rescale or scale-up case. - // new subtasks are ignored and not assigned any keys, which is sub-optimal and only - // transient. when rescale is detected, operator requests new statistics from - // coordinator upon initialization. - return partition; - } else { - // scale-down case. - // Use mod % operation to distribution the over-range partitions. - // It can cause skew among subtasks. but the behavior is still better than - // discarding the statistics and falling back to round-robin (no clustering). 
- // Again, this is transient and stats refresh is requested when rescale is detected. - return partition % numPartitions; - } - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/RequestGlobalStatisticsEvent.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/RequestGlobalStatisticsEvent.java deleted file mode 100644 index ce17e1964392..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/RequestGlobalStatisticsEvent.java +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.shuffle; - -import org.apache.flink.runtime.operators.coordination.OperatorEvent; - -class RequestGlobalStatisticsEvent implements OperatorEvent { - private final Integer signature; - - RequestGlobalStatisticsEvent() { - this.signature = null; - } - - /** @param signature hashCode of the subtask's existing global statistics */ - RequestGlobalStatisticsEvent(int signature) { - this.signature = signature; - } - - Integer signature() { - return signature; - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchDataStatistics.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchDataStatistics.java deleted file mode 100644 index 35bbb27baf16..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchDataStatistics.java +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
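The scale-down comment above is easiest to see with concrete numbers. Below is a standalone sketch of the same mapping; the helper mirrors adjustPartitionWithRescale, and the 8-to-4 parallelism change is an arbitrary example.

// Illustrative only: partitions computed against 8 subtasks are folded onto 4
// subtasks with a modulo, matching the scale-down branch described above.
public class RescaleSketch {
  static int adjustPartitionWithRescale(int partition, int statsPartitions, int numPartitions) {
    if (statsPartitions <= numPartitions) {
      // scale-up (or no change): keep the partition as-is; new subtasks stay idle
      // until refreshed statistics arrive.
      return partition;
    }
    // scale-down: fold the out-of-range partitions back with modulo.
    return partition % numPartitions;
  }

  public static void main(String[] args) {
    for (int p = 0; p < 8; p++) {
      System.out.println(p + " -> " + adjustPartitionWithRescale(p, 8, 4));
    }
    // prints 0->0, 1->1, 2->2, 3->3, 4->0, 5->1, 6->2, 7->3
  }
}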
- */ -package org.apache.iceberg.flink.sink.shuffle; - -import java.util.Arrays; -import org.apache.datasketches.sampling.ReservoirItemsSketch; -import org.apache.iceberg.SortKey; -import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; -import org.apache.iceberg.relocated.com.google.common.base.Objects; - -/** MapDataStatistics uses map to count key frequency */ -class SketchDataStatistics implements DataStatistics { - - private final ReservoirItemsSketch sketch; - - SketchDataStatistics(int reservoirSize) { - this.sketch = ReservoirItemsSketch.newInstance(reservoirSize); - } - - SketchDataStatistics(ReservoirItemsSketch sketchStats) { - this.sketch = sketchStats; - } - - @Override - public StatisticsType type() { - return StatisticsType.Sketch; - } - - @Override - public boolean isEmpty() { - return sketch.getNumSamples() == 0; - } - - @Override - public void add(SortKey sortKey) { - // clone the sort key first because input sortKey object can be reused - SortKey copiedKey = sortKey.copy(); - sketch.update(copiedKey); - } - - @Override - public Object result() { - return sketch; - } - - @Override - public String toString() { - return MoreObjects.toStringHelper(this).add("sketch", sketch).toString(); - } - - @Override - public boolean equals(Object o) { - if (this == o) { - return true; - } - - if (!(o instanceof SketchDataStatistics)) { - return false; - } - - ReservoirItemsSketch otherSketch = ((SketchDataStatistics) o).sketch; - return Objects.equal(sketch.getK(), otherSketch.getK()) - && Objects.equal(sketch.getN(), otherSketch.getN()) - && Arrays.deepEquals(sketch.getSamples(), otherSketch.getSamples()); - } - - @Override - public int hashCode() { - return Objects.hashCode(sketch.getK(), sketch.getN(), sketch.getSamples()); - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchRangePartitioner.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchRangePartitioner.java deleted file mode 100644 index dddb0d8722c0..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchRangePartitioner.java +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
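For readers who have not used the DataSketches reservoir sampling that SketchDataStatistics wraps above, here is a minimal usage sketch; the String items and the sizes are arbitrary stand-ins for SortKey and the configured reservoir size.

import org.apache.datasketches.sampling.ReservoirItemsSketch;

// Minimal reservoir-sampling usage, mirroring how SketchDataStatistics feeds keys into a
// fixed-size uniform sample. Item type and sizes are illustration values only.
public class ReservoirSketchExample {
  public static void main(String[] args) {
    ReservoirItemsSketch<String> sketch = ReservoirItemsSketch.newInstance(1000);
    for (int i = 0; i < 1_000_000; i++) {
      sketch.update("key-" + (i % 50_000)); // high-cardinality stream
    }
    // At most 1000 uniformly sampled items are retained regardless of stream size.
    System.out.println(sketch.getNumSamples());
  }
}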
- */ -package org.apache.iceberg.flink.sink.shuffle; - -import java.util.Comparator; -import org.apache.flink.api.common.functions.Partitioner; -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.Schema; -import org.apache.iceberg.SortKey; -import org.apache.iceberg.SortOrder; -import org.apache.iceberg.StructLike; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.RowDataWrapper; -import org.apache.iceberg.types.Comparators; - -class SketchRangePartitioner implements Partitioner { - private final SortKey sortKey; - private final Comparator comparator; - private final SortKey[] rangeBounds; - private final RowDataWrapper rowDataWrapper; - - SketchRangePartitioner(Schema schema, SortOrder sortOrder, SortKey[] rangeBounds) { - this.sortKey = new SortKey(schema, sortOrder); - this.comparator = Comparators.forType(SortKeyUtil.sortKeySchema(schema, sortOrder).asStruct()); - this.rangeBounds = rangeBounds; - this.rowDataWrapper = new RowDataWrapper(FlinkSchemaUtil.convert(schema), schema.asStruct()); - } - - @Override - public int partition(RowData row, int numPartitions) { - // reuse the sortKey and rowDataWrapper - sortKey.wrap(rowDataWrapper.wrap(row)); - return SketchUtil.partition(sortKey, numPartitions, rangeBounds, comparator); - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchUtil.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchUtil.java deleted file mode 100644 index 871ef9ef1149..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SketchUtil.java +++ /dev/null @@ -1,159 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.shuffle; - -import java.util.Arrays; -import java.util.Comparator; -import java.util.Map; -import java.util.function.Consumer; -import org.apache.iceberg.SortKey; -import org.apache.iceberg.StructLike; - -class SketchUtil { - static final int COORDINATOR_MIN_RESERVOIR_SIZE = 10_000; - static final int COORDINATOR_MAX_RESERVOIR_SIZE = 1_000_000; - static final int COORDINATOR_TARGET_PARTITIONS_MULTIPLIER = 100; - static final int OPERATOR_OVER_SAMPLE_RATIO = 10; - - // switch the statistics tracking from map to sketch if the cardinality of the sort key is over - // this threshold. It is hardcoded for now, we can revisit in the future if config is needed. - static final int OPERATOR_SKETCH_SWITCH_THRESHOLD = 10_000; - static final int COORDINATOR_SKETCH_SWITCH_THRESHOLD = 100_000; - - private SketchUtil() {} - - /** - * The larger the reservoir size, the more accurate for range bounds calculation and the more - * balanced range distribution. - * - *

    Here are the heuristic rules - *
  • Target size: numPartitions x 100 to achieve good accuracy and is easier to calculate the - * range bounds - *
  • Min is 10K to achieve good accuracy while memory footprint is still relatively small - *
  • Max is 1M to cap the memory footprint on coordinator - * - * @param numPartitions number of range partitions which equals to downstream operator parallelism - * @return reservoir size - */ - static int determineCoordinatorReservoirSize(int numPartitions) { - int reservoirSize = numPartitions * COORDINATOR_TARGET_PARTITIONS_MULTIPLIER; - - if (reservoirSize < COORDINATOR_MIN_RESERVOIR_SIZE) { - // adjust it up and still make reservoirSize divisible by numPartitions - int remainder = COORDINATOR_MIN_RESERVOIR_SIZE % numPartitions; - reservoirSize = COORDINATOR_MIN_RESERVOIR_SIZE + (numPartitions - remainder); - } else if (reservoirSize > COORDINATOR_MAX_RESERVOIR_SIZE) { - // adjust it down and still make reservoirSize divisible by numPartitions - int remainder = COORDINATOR_MAX_RESERVOIR_SIZE % numPartitions; - reservoirSize = COORDINATOR_MAX_RESERVOIR_SIZE - remainder; - } - - return reservoirSize; - } - - /** - * Determine the sampling reservoir size where operator subtasks collect data statistics. - * - *
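As a worked example of the coordinator sizing rules above, together with the operator-side rule listed just below: the parallelism values are arbitrary, and the arithmetic mirrors determineCoordinatorReservoirSize and determineOperatorReservoirSize.

// Worked example of the sizing heuristics with arbitrary parallelism values.
public class ReservoirSizingExample {
  public static void main(String[] args) {
    int numPartitions = 32;        // downstream writer parallelism (example value)
    int operatorParallelism = 16;  // statistics operator parallelism (example value)

    // Coordinator reservoir: numPartitions * 100, raised to at least 10K and capped at 1M,
    // while staying divisible by numPartitions.
    int coordinatorSize = numPartitions * 100; // 3_200, below the 10K floor
    if (coordinatorSize < 10_000) {
      coordinatorSize = 10_000 + (numPartitions - 10_000 % numPartitions); // 10_016
    } else if (coordinatorSize > 1_000_000) {
      coordinatorSize = 1_000_000 - 1_000_000 % numPartitions;
    }

    // Operator reservoir: coordinator size with 10x over-sampling, spread across subtasks.
    int operatorSize = (int) Math.ceil((double) coordinatorSize * 10 / operatorParallelism);

    System.out.println(coordinatorSize); // 10016
    System.out.println(operatorSize);    // 6260
  }
}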

    Here are the heuristic rules - *
  • Target size is "coordinator reservoir size * over sampling ratio (10) / operator - * parallelism" - *
  • Min is 1K to achieve good accuracy while memory footprint is still relatively small - *
  • Max is 100K to cap the memory footprint on coordinator - * - * @param numPartitions number of range partitions which equals to downstream operator parallelism - * @param operatorParallelism data statistics operator parallelism - * @return reservoir size - */ - static int determineOperatorReservoirSize(int operatorParallelism, int numPartitions) { - int coordinatorReservoirSize = determineCoordinatorReservoirSize(numPartitions); - int totalOperatorSamples = coordinatorReservoirSize * OPERATOR_OVER_SAMPLE_RATIO; - return (int) Math.ceil((double) totalOperatorSamples / operatorParallelism); - } - - /** - * To understand how range bounds are used in range partitioning, here is an example for human - * ages with 4 partitions: [15, 32, 60]. The 4 ranges would be - * - *
      - *
    • age <= 15 - *
    • age > 15 && age <= 32 - *
    • age > 32 && age <= 60 - *
    • age > 60 - *
    - * - *
    Assumption is that a single key is not dominant enough to span multiple subtasks. - * - * @param numPartitions number of partitions which maps to downstream operator parallelism - * @param samples sampled keys - * @return array of range partition bounds. It should be a sorted list (ascending). Number of - * items should be {@code numPartitions - 1}. if numPartitions is 1, return an empty list - */ - static SortKey[] rangeBounds( - int numPartitions, Comparator comparator, SortKey[] samples) { - // sort the keys first - Arrays.sort(samples, comparator); - int numCandidates = numPartitions - 1; - SortKey[] candidates = new SortKey[numCandidates]; - int step = (int) Math.ceil((double) samples.length / numPartitions); - int position = step - 1; - int numChosen = 0; - while (position < samples.length && numChosen < numCandidates) { - SortKey candidate = samples[position]; - // skip duplicate values - if (numChosen > 0 && candidate.equals(candidates[numChosen - 1])) { - // linear probe for the next distinct value - position += 1; - } else { - candidates[numChosen] = candidate; - position += step; - numChosen += 1; - } - } - - return candidates; - } - - /** This can be a bit expensive since it is quadratic. */ - static void convertMapToSketch( - Map taskMapStats, Consumer sketchConsumer) { - taskMapStats.forEach( - (sortKey, count) -> { - for (int i = 0; i < count; ++i) { - sketchConsumer.accept(sortKey); - } - }); - } - - static int partition( - SortKey key, int numPartitions, SortKey[] rangeBounds, Comparator comparator) { - int partition = Arrays.binarySearch(rangeBounds, key, comparator); - - // binarySearch either returns the match location or -[insertion point]-1 - if (partition < 0) { - partition = -partition - 1; - } - - if (partition > rangeBounds.length) { - partition = rangeBounds.length; - } - - return RangePartitioner.adjustPartitionWithRescale( - partition, rangeBounds.length + 1, numPartitions); - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeySerializer.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeySerializer.java deleted file mode 100644 index d1d75019fa2e..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeySerializer.java +++ /dev/null @@ -1,373 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
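Tying the age example above to the lookup in SketchUtil.partition: once the bounds [15, 32, 60] are chosen, routing a record is a binary search over the bounds. The sketch below uses plain ints in place of SortKey and omits the rescale adjustment.

import java.util.Arrays;

// Standalone illustration of range-bound routing using the age example above:
// bounds [15, 32, 60] split records into 4 partitions.
public class RangeBoundsExample {
  static int partition(int key, int[] rangeBounds) {
    int pos = Arrays.binarySearch(rangeBounds, key);
    // binarySearch returns the match index, or -(insertion point) - 1 when not found
    if (pos < 0) {
      pos = -pos - 1;
    }
    return Math.min(pos, rangeBounds.length); // partitions 0..rangeBounds.length
  }

  public static void main(String[] args) {
    int[] bounds = {15, 32, 60};
    System.out.println(partition(12, bounds)); // 0  (age <= 15)
    System.out.println(partition(15, bounds)); // 0  (a bound value stays in its own range)
    System.out.println(partition(20, bounds)); // 1  (15 < age <= 32)
    System.out.println(partition(75, bounds)); // 3  (age > 60)
  }
}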
- */ -package org.apache.iceberg.flink.sink.shuffle; - -import java.io.IOException; -import java.math.BigDecimal; -import java.math.BigInteger; -import java.nio.ByteBuffer; -import java.util.List; -import java.util.Objects; -import java.util.Set; -import java.util.UUID; -import java.util.stream.Collectors; -import org.apache.flink.annotation.VisibleForTesting; -import org.apache.flink.api.common.typeutils.TypeSerializer; -import org.apache.flink.api.common.typeutils.TypeSerializerSchemaCompatibility; -import org.apache.flink.api.common.typeutils.TypeSerializerSnapshot; -import org.apache.flink.core.memory.DataInputView; -import org.apache.flink.core.memory.DataOutputView; -import org.apache.flink.util.Preconditions; -import org.apache.flink.util.StringUtils; -import org.apache.iceberg.Schema; -import org.apache.iceberg.SchemaParser; -import org.apache.iceberg.SortField; -import org.apache.iceberg.SortKey; -import org.apache.iceberg.SortOrder; -import org.apache.iceberg.SortOrderParser; -import org.apache.iceberg.types.CheckCompatibility; -import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.TypeUtil; -import org.apache.iceberg.types.Types; - -class SortKeySerializer extends TypeSerializer { - private final Schema schema; - private final SortOrder sortOrder; - private final int size; - private final Types.NestedField[] transformedFields; - - private transient SortKey sortKey; - - SortKeySerializer(Schema schema, SortOrder sortOrder) { - this.schema = schema; - this.sortOrder = sortOrder; - this.size = sortOrder.fields().size(); - - this.transformedFields = new Types.NestedField[size]; - for (int i = 0; i < size; ++i) { - SortField sortField = sortOrder.fields().get(i); - Types.NestedField sourceField = schema.findField(sortField.sourceId()); - Type resultType = sortField.transform().getResultType(sourceField.type()); - Types.NestedField transformedField = - Types.NestedField.of( - sourceField.fieldId(), - sourceField.isOptional(), - sourceField.name(), - resultType, - sourceField.doc()); - transformedFields[i] = transformedField; - } - } - - private SortKey lazySortKey() { - if (sortKey == null) { - this.sortKey = new SortKey(schema, sortOrder); - } - - return sortKey; - } - - @Override - public boolean isImmutableType() { - return false; - } - - @Override - public TypeSerializer duplicate() { - return new SortKeySerializer(schema, sortOrder); - } - - @Override - public SortKey createInstance() { - return new SortKey(schema, sortOrder); - } - - @Override - public SortKey copy(SortKey from) { - return from.copy(); - } - - @Override - public SortKey copy(SortKey from, SortKey reuse) { - // no benefit of reuse - return copy(from); - } - - @Override - public int getLength() { - return -1; - } - - @Override - public void serialize(SortKey record, DataOutputView target) throws IOException { - Preconditions.checkArgument( - record.size() == size, - "Invalid size of the sort key object: %s. 
Expected %s", - record.size(), - size); - for (int i = 0; i < size; ++i) { - int fieldId = transformedFields[i].fieldId(); - Type.TypeID typeId = transformedFields[i].type().typeId(); - switch (typeId) { - case BOOLEAN: - target.writeBoolean(record.get(i, Boolean.class)); - break; - case INTEGER: - case DATE: - target.writeInt(record.get(i, Integer.class)); - break; - case LONG: - case TIME: - case TIMESTAMP: - target.writeLong(record.get(i, Long.class)); - break; - case FLOAT: - target.writeFloat(record.get(i, Float.class)); - break; - case DOUBLE: - target.writeDouble(record.get(i, Double.class)); - break; - case STRING: - target.writeUTF(record.get(i, CharSequence.class).toString()); - break; - case UUID: - UUID uuid = record.get(i, UUID.class); - target.writeLong(uuid.getMostSignificantBits()); - target.writeLong(uuid.getLeastSignificantBits()); - break; - case FIXED: - case BINARY: - byte[] bytes = record.get(i, ByteBuffer.class).array(); - target.writeInt(bytes.length); - target.write(bytes); - break; - case DECIMAL: - BigDecimal decimal = record.get(i, BigDecimal.class); - byte[] decimalBytes = decimal.unscaledValue().toByteArray(); - target.writeInt(decimalBytes.length); - target.write(decimalBytes); - target.writeInt(decimal.scale()); - break; - case STRUCT: - case MAP: - case LIST: - default: - // SortKey transformation is a flattened struct without list and map - throw new UnsupportedOperationException( - String.format("Field %d has unsupported field type: %s", fieldId, typeId)); - } - } - } - - @Override - public SortKey deserialize(DataInputView source) throws IOException { - // copying is a little faster than constructing a new SortKey object - SortKey deserialized = lazySortKey().copy(); - deserialize(deserialized, source); - return deserialized; - } - - @Override - public SortKey deserialize(SortKey reuse, DataInputView source) throws IOException { - Preconditions.checkArgument( - reuse.size() == size, - "Invalid size of the sort key object: %s. 
Expected %s", - reuse.size(), - size); - for (int i = 0; i < size; ++i) { - int fieldId = transformedFields[i].fieldId(); - Type.TypeID typeId = transformedFields[i].type().typeId(); - switch (typeId) { - case BOOLEAN: - reuse.set(i, source.readBoolean()); - break; - case INTEGER: - case DATE: - reuse.set(i, source.readInt()); - break; - case LONG: - case TIME: - case TIMESTAMP: - reuse.set(i, source.readLong()); - break; - case FLOAT: - reuse.set(i, source.readFloat()); - break; - case DOUBLE: - reuse.set(i, source.readDouble()); - break; - case STRING: - reuse.set(i, source.readUTF()); - break; - case UUID: - long mostSignificantBits = source.readLong(); - long leastSignificantBits = source.readLong(); - reuse.set(i, new UUID(mostSignificantBits, leastSignificantBits)); - break; - case FIXED: - case BINARY: - byte[] bytes = new byte[source.readInt()]; - source.read(bytes); - reuse.set(i, ByteBuffer.wrap(bytes)); - break; - case DECIMAL: - byte[] unscaledBytes = new byte[source.readInt()]; - source.read(unscaledBytes); - int scale = source.readInt(); - BigDecimal decimal = new BigDecimal(new BigInteger(unscaledBytes), scale); - reuse.set(i, decimal); - break; - case STRUCT: - case MAP: - case LIST: - default: - // SortKey transformation is a flattened struct without list and map - throw new UnsupportedOperationException( - String.format("Field %d has unsupported field type: %s", fieldId, typeId)); - } - } - - return reuse; - } - - @Override - public void copy(DataInputView source, DataOutputView target) throws IOException { - // no optimization here - serialize(deserialize(source), target); - } - - @Override - public boolean equals(Object obj) { - if (!(obj instanceof SortKeySerializer)) { - return false; - } - - SortKeySerializer other = (SortKeySerializer) obj; - return Objects.equals(schema.asStruct(), other.schema.asStruct()) - && Objects.equals(sortOrder, other.sortOrder); - } - - @Override - public int hashCode() { - return schema.asStruct().hashCode() * 31 + sortOrder.hashCode(); - } - - @Override - public TypeSerializerSnapshot snapshotConfiguration() { - return new SortKeySerializerSnapshot(schema, sortOrder); - } - - public static class SortKeySerializerSnapshot implements TypeSerializerSnapshot { - private static final int CURRENT_VERSION = 1; - - private Schema schema; - private SortOrder sortOrder; - - /** Constructor for read instantiation. */ - @SuppressWarnings({"unused", "checkstyle:RedundantModifier"}) - public SortKeySerializerSnapshot() { - // this constructor is used when restoring from a checkpoint. 
- } - - @SuppressWarnings("checkstyle:RedundantModifier") - public SortKeySerializerSnapshot(Schema schema, SortOrder sortOrder) { - this.schema = schema; - this.sortOrder = sortOrder; - } - - @Override - public int getCurrentVersion() { - return CURRENT_VERSION; - } - - @Override - public void writeSnapshot(DataOutputView out) throws IOException { - Preconditions.checkState(schema != null, "Invalid schema: null"); - Preconditions.checkState(sortOrder != null, "Invalid sort order: null"); - - StringUtils.writeString(SchemaParser.toJson(schema), out); - StringUtils.writeString(SortOrderParser.toJson(sortOrder), out); - } - - @Override - public void readSnapshot(int readVersion, DataInputView in, ClassLoader userCodeClassLoader) - throws IOException { - if (readVersion == 1) { - readV1(in); - } else { - throw new IllegalArgumentException("Unknown read version: " + readVersion); - } - } - - @Override - public TypeSerializerSchemaCompatibility resolveSchemaCompatibility( - TypeSerializer newSerializer) { - if (!(newSerializer instanceof SortKeySerializer)) { - return TypeSerializerSchemaCompatibility.incompatible(); - } - - // Sort order should be identical - SortKeySerializerSnapshot newSnapshot = - (SortKeySerializerSnapshot) newSerializer.snapshotConfiguration(); - if (!sortOrder.sameOrder(newSnapshot.sortOrder)) { - return TypeSerializerSchemaCompatibility.incompatible(); - } - - Set sortFieldIds = - sortOrder.fields().stream().map(SortField::sourceId).collect(Collectors.toSet()); - // only care about the schema related to sort fields - Schema sortSchema = TypeUtil.project(schema, sortFieldIds); - Schema newSortSchema = TypeUtil.project(newSnapshot.schema, sortFieldIds); - - List compatibilityErrors = - CheckCompatibility.writeCompatibilityErrors(sortSchema, newSortSchema); - if (compatibilityErrors.isEmpty()) { - return TypeSerializerSchemaCompatibility.compatibleAsIs(); - } - - return TypeSerializerSchemaCompatibility.incompatible(); - } - - @Override - public TypeSerializer restoreSerializer() { - Preconditions.checkState(schema != null, "Invalid schema: null"); - Preconditions.checkState(sortOrder != null, "Invalid sort order: null"); - return new SortKeySerializer(schema, sortOrder); - } - - private void readV1(DataInputView in) throws IOException { - String schemaJson = StringUtils.readString(in); - String sortOrderJson = StringUtils.readString(in); - this.schema = SchemaParser.fromJson(schemaJson); - this.sortOrder = SortOrderParser.fromJson(sortOrderJson).bind(schema); - } - - @VisibleForTesting - static TypeSerializerSchemaCompatibility resolveSchemaCompatibility( - Schema readSchema, Schema writeSchema) { - List compatibilityErrors = - CheckCompatibility.writeCompatibilityErrors(readSchema, writeSchema); - if (compatibilityErrors.isEmpty()) { - return TypeSerializerSchemaCompatibility.compatibleAsIs(); - } - - return TypeSerializerSchemaCompatibility.incompatible(); - } - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeySketchSerializer.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeySketchSerializer.java deleted file mode 100644 index d6c23f035015..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeySketchSerializer.java +++ /dev/null @@ -1,143 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.shuffle; - -import java.io.IOException; -import java.io.Serializable; -import java.io.UncheckedIOException; -import java.util.Arrays; -import java.util.List; -import org.apache.datasketches.common.ArrayOfItemsSerDe; -import org.apache.datasketches.common.ArrayOfStringsSerDe; -import org.apache.datasketches.common.ByteArrayUtil; -import org.apache.datasketches.common.Util; -import org.apache.datasketches.memory.Memory; -import org.apache.datasketches.sampling.ReservoirItemsSketch; -import org.apache.flink.api.common.typeutils.TypeSerializer; -import org.apache.flink.api.common.typeutils.base.ListSerializer; -import org.apache.flink.core.memory.DataInputDeserializer; -import org.apache.flink.core.memory.DataOutputSerializer; -import org.apache.iceberg.SortKey; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; - -/** - * Only way to implement {@link ReservoirItemsSketch} serializer is to extend from {@link - * ArrayOfItemsSerDe}, as deserialization uses a private constructor from ReservoirItemsSketch. 
The - * implementation is modeled after {@link ArrayOfStringsSerDe} - */ -class SortKeySketchSerializer extends ArrayOfItemsSerDe implements Serializable { - private static final int DEFAULT_SORT_KEY_SIZE = 128; - - private final TypeSerializer itemSerializer; - private final ListSerializer listSerializer; - private final DataInputDeserializer input; - - SortKeySketchSerializer(TypeSerializer itemSerializer) { - this.itemSerializer = itemSerializer; - this.listSerializer = new ListSerializer<>(itemSerializer); - this.input = new DataInputDeserializer(); - } - - @Override - public byte[] serializeToByteArray(SortKey item) { - try { - DataOutputSerializer output = new DataOutputSerializer(DEFAULT_SORT_KEY_SIZE); - itemSerializer.serialize(item, output); - byte[] itemBytes = output.getSharedBuffer(); - int numBytes = output.length(); - byte[] out = new byte[numBytes + Integer.BYTES]; - ByteArrayUtil.copyBytes(itemBytes, 0, out, 4, numBytes); - ByteArrayUtil.putIntLE(out, 0, numBytes); - return out; - } catch (IOException e) { - throw new UncheckedIOException("Failed to serialize sort key", e); - } - } - - @Override - public byte[] serializeToByteArray(SortKey[] items) { - try { - DataOutputSerializer output = new DataOutputSerializer(DEFAULT_SORT_KEY_SIZE * items.length); - listSerializer.serialize(Arrays.asList(items), output); - byte[] itemsBytes = output.getSharedBuffer(); - int numBytes = output.length(); - byte[] out = new byte[Integer.BYTES + numBytes]; - ByteArrayUtil.putIntLE(out, 0, numBytes); - System.arraycopy(itemsBytes, 0, out, Integer.BYTES, numBytes); - return out; - } catch (IOException e) { - throw new UncheckedIOException("Failed to serialize sort key", e); - } - } - - @Override - public SortKey[] deserializeFromMemory(Memory mem, long startingOffset, int numItems) { - Preconditions.checkArgument(mem != null, "Invalid input memory: null"); - if (numItems <= 0) { - return new SortKey[0]; - } - - long offset = startingOffset; - Util.checkBounds(offset, Integer.BYTES, mem.getCapacity()); - int numBytes = mem.getInt(offset); - offset += Integer.BYTES; - - Util.checkBounds(offset, numBytes, mem.getCapacity()); - byte[] sortKeyBytes = new byte[numBytes]; - mem.getByteArray(offset, sortKeyBytes, 0, numBytes); - input.setBuffer(sortKeyBytes); - - try { - List sortKeys = listSerializer.deserialize(input); - SortKey[] array = new SortKey[numItems]; - sortKeys.toArray(array); - input.releaseArrays(); - return array; - } catch (IOException e) { - throw new UncheckedIOException("Failed to deserialize sort key sketch", e); - } - } - - @Override - public int sizeOf(SortKey item) { - return serializeToByteArray(item).length; - } - - @Override - public int sizeOf(Memory mem, long offset, int numItems) { - Preconditions.checkArgument(mem != null, "Invalid input memory: null"); - if (numItems <= 0) { - return 0; - } - - Util.checkBounds(offset, Integer.BYTES, mem.getCapacity()); - int numBytes = mem.getInt(offset); - return Integer.BYTES + numBytes; - } - - @Override - public String toString(SortKey item) { - return item.toString(); - } - - @Override - public Class getClassOfT() { - return SortKey.class; - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeyUtil.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeyUtil.java deleted file mode 100644 index 1e5bdbbac3e4..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/SortKeyUtil.java +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Licensed 
to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.shuffle; - -import java.util.List; -import org.apache.iceberg.Schema; -import org.apache.iceberg.SortField; -import org.apache.iceberg.SortOrder; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.Types; - -class SortKeyUtil { - private SortKeyUtil() {} - - /** Compute the result schema of {@code SortKey} transformation */ - static Schema sortKeySchema(Schema schema, SortOrder sortOrder) { - List sortFields = sortOrder.fields(); - int size = sortFields.size(); - List transformedFields = Lists.newArrayListWithCapacity(size); - for (int i = 0; i < size; ++i) { - int sourceFieldId = sortFields.get(i).sourceId(); - Types.NestedField sourceField = schema.findField(sourceFieldId); - Preconditions.checkArgument( - sourceField != null, "Cannot find source field: %s", sourceFieldId); - Type transformedType = sortFields.get(i).transform().getResultType(sourceField.type()); - // There could be multiple transformations on the same source column, like in the PartitionKey - // case. To resolve the collision, field id is set to transform index and field name is set to - // sourceFieldName_transformIndex - Types.NestedField transformedField = - Types.NestedField.of( - i, - sourceField.isOptional(), - sourceField.name() + '_' + i, - transformedType, - sourceField.doc()); - transformedFields.add(transformedField); - } - - return new Schema(transformedFields); - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsEvent.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsEvent.java deleted file mode 100644 index f6fcdb8b16ef..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsEvent.java +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. 
See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.shuffle; - -import org.apache.flink.annotation.Internal; -import org.apache.flink.api.common.typeutils.TypeSerializer; -import org.apache.flink.runtime.operators.coordination.OperatorEvent; - -/** - * DataStatisticsEvent is sent between data statistics coordinator and operator to transmit data - * statistics in bytes - */ -@Internal -class StatisticsEvent implements OperatorEvent { - - private static final long serialVersionUID = 1L; - private final long checkpointId; - private final byte[] statisticsBytes; - private final boolean applyImmediately; - - private StatisticsEvent(long checkpointId, byte[] statisticsBytes, boolean applyImmediately) { - this.checkpointId = checkpointId; - this.statisticsBytes = statisticsBytes; - this.applyImmediately = applyImmediately; - } - - static StatisticsEvent createTaskStatisticsEvent( - long checkpointId, - DataStatistics statistics, - TypeSerializer statisticsSerializer) { - // applyImmediately is really only relevant for coordinator to operator event. - // task reported statistics is always merged immediately by the coordinator. - return new StatisticsEvent( - checkpointId, - StatisticsUtil.serializeDataStatistics(statistics, statisticsSerializer), - true); - } - - static StatisticsEvent createGlobalStatisticsEvent( - GlobalStatistics statistics, - TypeSerializer statisticsSerializer, - boolean applyImmediately) { - return new StatisticsEvent( - statistics.checkpointId(), - StatisticsUtil.serializeGlobalStatistics(statistics, statisticsSerializer), - applyImmediately); - } - - long checkpointId() { - return checkpointId; - } - - byte[] statisticsBytes() { - return statisticsBytes; - } - - boolean applyImmediately() { - return applyImmediately; - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsOrRecord.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsOrRecord.java deleted file mode 100644 index bc28df2b0e22..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsOrRecord.java +++ /dev/null @@ -1,112 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.shuffle; - -import java.io.Serializable; -import org.apache.flink.annotation.Internal; -import org.apache.flink.api.common.typeutils.TypeSerializer; -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; - -/** - * The wrapper class for data statistics and record. 
It is the only way for data statistics operator - * to send global data statistics to custom partitioner to distribute data based on statistics - * - *

    DataStatisticsOrRecord contains either data statistics(globally aggregated) or a record. It is - * sent from {@link DataStatisticsOperator} to partitioner. Once partitioner receives the data - * statistics, it will use that to decide the coming record should send to which writer subtask. - * After shuffling, a filter and mapper are required to filter out the data distribution weight, - * unwrap the object and extract the original record type T. - */ -@Internal -public class StatisticsOrRecord implements Serializable { - - private static final long serialVersionUID = 1L; - - private GlobalStatistics statistics; - private RowData record; - - private StatisticsOrRecord(GlobalStatistics statistics, RowData record) { - Preconditions.checkArgument( - record != null ^ statistics != null, "DataStatistics or record, not neither or both"); - this.statistics = statistics; - this.record = record; - } - - static StatisticsOrRecord fromRecord(RowData record) { - return new StatisticsOrRecord(null, record); - } - - static StatisticsOrRecord fromStatistics(GlobalStatistics statistics) { - return new StatisticsOrRecord(statistics, null); - } - - static StatisticsOrRecord reuseRecord( - StatisticsOrRecord reuse, TypeSerializer recordSerializer) { - if (reuse.hasRecord()) { - return reuse; - } else { - // not reusable - return StatisticsOrRecord.fromRecord(recordSerializer.createInstance()); - } - } - - static StatisticsOrRecord reuseStatistics( - StatisticsOrRecord reuse, TypeSerializer statisticsSerializer) { - if (reuse.hasStatistics()) { - return reuse; - } else { - // not reusable - return StatisticsOrRecord.fromStatistics(statisticsSerializer.createInstance()); - } - } - - boolean hasStatistics() { - return statistics != null; - } - - public boolean hasRecord() { - return record != null; - } - - GlobalStatistics statistics() { - return statistics; - } - - void statistics(GlobalStatistics newStatistics) { - this.statistics = newStatistics; - } - - public RowData record() { - return record; - } - - void record(RowData newRecord) { - this.record = newRecord; - } - - @Override - public String toString() { - return MoreObjects.toStringHelper(this) - .add("statistics", statistics) - .add("record", record) - .toString(); - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsOrRecordSerializer.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsOrRecordSerializer.java deleted file mode 100644 index 6e403425938d..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsOrRecordSerializer.java +++ /dev/null @@ -1,210 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
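As a hedged sketch of the downstream wiring the javadoc above describes (the helper name and method shape are assumptions, not the actual sink code): after the range shuffle, statistics-only elements are filtered out and the wrapped RowData is extracted, using only the public hasRecord() and record() accessors shown above.

import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.table.data.RowData;
import org.apache.iceberg.flink.sink.shuffle.StatisticsOrRecord;

// Hypothetical helper showing how a downstream stage could unwrap records after the
// custom range partitioning.
class UnwrapStatisticsOrRecord {
  static DataStream<RowData> records(DataStream<StatisticsOrRecord> shuffled) {
    return shuffled
        .filter(StatisticsOrRecord::hasRecord)        // drop statistics-only elements
        .map(StatisticsOrRecord::record)              // extract the wrapped RowData
        .returns(TypeInformation.of(RowData.class));  // help Flink's type extraction
  }
}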
- */ -package org.apache.iceberg.flink.sink.shuffle; - -import java.io.IOException; -import java.util.Objects; -import org.apache.flink.annotation.Internal; -import org.apache.flink.api.common.typeutils.CompositeTypeSerializerSnapshot; -import org.apache.flink.api.common.typeutils.TypeSerializer; -import org.apache.flink.api.common.typeutils.TypeSerializerSnapshot; -import org.apache.flink.core.memory.DataInputView; -import org.apache.flink.core.memory.DataOutputView; -import org.apache.flink.table.data.RowData; - -@Internal -class StatisticsOrRecordSerializer extends TypeSerializer { - private final TypeSerializer statisticsSerializer; - private final TypeSerializer recordSerializer; - - StatisticsOrRecordSerializer( - TypeSerializer statisticsSerializer, - TypeSerializer recordSerializer) { - this.statisticsSerializer = statisticsSerializer; - this.recordSerializer = recordSerializer; - } - - @Override - public boolean isImmutableType() { - return false; - } - - @SuppressWarnings("ReferenceEquality") - @Override - public TypeSerializer duplicate() { - TypeSerializer duplicateStatisticsSerializer = - statisticsSerializer.duplicate(); - TypeSerializer duplicateRowDataSerializer = recordSerializer.duplicate(); - if ((statisticsSerializer != duplicateStatisticsSerializer) - || (recordSerializer != duplicateRowDataSerializer)) { - return new StatisticsOrRecordSerializer( - duplicateStatisticsSerializer, duplicateRowDataSerializer); - } else { - return this; - } - } - - @Override - public StatisticsOrRecord createInstance() { - // arbitrarily always create RowData value instance - return StatisticsOrRecord.fromRecord(recordSerializer.createInstance()); - } - - @Override - public StatisticsOrRecord copy(StatisticsOrRecord from) { - if (from.hasRecord()) { - return StatisticsOrRecord.fromRecord(recordSerializer.copy(from.record())); - } else { - return StatisticsOrRecord.fromStatistics(statisticsSerializer.copy(from.statistics())); - } - } - - @Override - public StatisticsOrRecord copy(StatisticsOrRecord from, StatisticsOrRecord reuse) { - StatisticsOrRecord to; - if (from.hasRecord()) { - to = StatisticsOrRecord.reuseRecord(reuse, recordSerializer); - RowData record = recordSerializer.copy(from.record(), to.record()); - to.record(record); - } else { - to = StatisticsOrRecord.reuseStatistics(reuse, statisticsSerializer); - GlobalStatistics statistics = statisticsSerializer.copy(from.statistics(), to.statistics()); - to.statistics(statistics); - } - - return to; - } - - @Override - public int getLength() { - return -1; - } - - @Override - public void serialize(StatisticsOrRecord statisticsOrRecord, DataOutputView target) - throws IOException { - if (statisticsOrRecord.hasRecord()) { - target.writeBoolean(true); - recordSerializer.serialize(statisticsOrRecord.record(), target); - } else { - target.writeBoolean(false); - statisticsSerializer.serialize(statisticsOrRecord.statistics(), target); - } - } - - @Override - public StatisticsOrRecord deserialize(DataInputView source) throws IOException { - boolean isRecord = source.readBoolean(); - if (isRecord) { - return StatisticsOrRecord.fromRecord(recordSerializer.deserialize(source)); - } else { - return StatisticsOrRecord.fromStatistics(statisticsSerializer.deserialize(source)); - } - } - - @Override - public StatisticsOrRecord deserialize(StatisticsOrRecord reuse, DataInputView source) - throws IOException { - StatisticsOrRecord to; - boolean isRecord = source.readBoolean(); - if (isRecord) { - to = StatisticsOrRecord.reuseRecord(reuse, 
recordSerializer); - RowData record = recordSerializer.deserialize(to.record(), source); - to.record(record); - } else { - to = StatisticsOrRecord.reuseStatistics(reuse, statisticsSerializer); - GlobalStatistics statistics = statisticsSerializer.deserialize(to.statistics(), source); - to.statistics(statistics); - } - - return to; - } - - @Override - public void copy(DataInputView source, DataOutputView target) throws IOException { - boolean hasRecord = source.readBoolean(); - target.writeBoolean(hasRecord); - if (hasRecord) { - recordSerializer.copy(source, target); - } else { - statisticsSerializer.copy(source, target); - } - } - - @Override - public boolean equals(Object obj) { - if (!(obj instanceof StatisticsOrRecordSerializer)) { - return false; - } - - StatisticsOrRecordSerializer other = (StatisticsOrRecordSerializer) obj; - return Objects.equals(statisticsSerializer, other.statisticsSerializer) - && Objects.equals(recordSerializer, other.recordSerializer); - } - - @Override - public int hashCode() { - return Objects.hash(statisticsSerializer, recordSerializer); - } - - @Override - public TypeSerializerSnapshot snapshotConfiguration() { - return new StatisticsOrRecordSerializerSnapshot(this); - } - - public static class StatisticsOrRecordSerializerSnapshot - extends CompositeTypeSerializerSnapshot { - private static final int CURRENT_VERSION = 1; - - /** Constructor for read instantiation. */ - @SuppressWarnings({"unused", "checkstyle:RedundantModifier"}) - public StatisticsOrRecordSerializerSnapshot() { - super(StatisticsOrRecordSerializer.class); - } - - @SuppressWarnings("checkstyle:RedundantModifier") - public StatisticsOrRecordSerializerSnapshot(StatisticsOrRecordSerializer serializer) { - super(serializer); - } - - @SuppressWarnings("checkstyle:RedundantModifier") - @Override - protected int getCurrentOuterSnapshotVersion() { - return CURRENT_VERSION; - } - - @Override - protected TypeSerializer[] getNestedSerializers( - StatisticsOrRecordSerializer outerSerializer) { - return new TypeSerializer[] { - outerSerializer.statisticsSerializer, outerSerializer.recordSerializer - }; - } - - @SuppressWarnings("unchecked") - @Override - protected StatisticsOrRecordSerializer createOuterSerializerWithNestedSerializers( - TypeSerializer[] nestedSerializers) { - TypeSerializer statisticsSerializer = - (TypeSerializer) nestedSerializers[0]; - TypeSerializer recordSerializer = (TypeSerializer) nestedSerializers[1]; - return new StatisticsOrRecordSerializer(statisticsSerializer, recordSerializer); - } - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsType.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsType.java deleted file mode 100644 index 43f72e336e06..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsType.java +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.shuffle; - -/** - * Range distribution requires gathering statistics on the sort keys to determine proper range - * boundaries to distribute/cluster rows before writer operators. - */ -public enum StatisticsType { - /** - * Tracks the data statistics as {@code Map} frequency. It works better for - * low-cardinality scenarios (like country, event_type, etc.) where the cardinalities are in - * hundreds or thousands. - * - *
- * <ul>
- *   <li>Pro: accurate measurement on the statistics/weight of every key.
- *   <li>Con: memory footprint can be large if the key cardinality is high.
- * </ul>
- */
- Map,
-
- /**
- * Sample the sort keys via reservoir sampling. Then split the range partitions via range bounds
- * from sampled values. It works better for high-cardinality scenarios (like device_id, user_id,
- * uuid etc.) where the cardinalities can be in millions or billions.
- *
- * <ul>
- *   <li>Pro: relatively low memory footprint for high-cardinality sort keys.
- *   <li>Con: non-precise approximation with potentially lower accuracy.
- * </ul>
    - */ - Sketch, - - /** - * Initially use Map for statistics tracking. If key cardinality turns out to be high, - * automatically switch to sketch sampling. - */ - Auto -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsUtil.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsUtil.java deleted file mode 100644 index 5d48ec57ca49..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/sink/shuffle/StatisticsUtil.java +++ /dev/null @@ -1,126 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.shuffle; - -import java.io.IOException; -import java.io.UncheckedIOException; -import javax.annotation.Nullable; -import org.apache.flink.api.common.typeutils.TypeSerializer; -import org.apache.flink.core.memory.DataInputDeserializer; -import org.apache.flink.core.memory.DataOutputSerializer; - -class StatisticsUtil { - - private StatisticsUtil() {} - - static DataStatistics createTaskStatistics( - StatisticsType type, int operatorParallelism, int numPartitions) { - if (type == StatisticsType.Map) { - return new MapDataStatistics(); - } else { - return new SketchDataStatistics( - SketchUtil.determineOperatorReservoirSize(operatorParallelism, numPartitions)); - } - } - - static byte[] serializeDataStatistics( - DataStatistics dataStatistics, TypeSerializer statisticsSerializer) { - DataOutputSerializer out = new DataOutputSerializer(64); - try { - statisticsSerializer.serialize(dataStatistics, out); - return out.getCopyOfBuffer(); - } catch (IOException e) { - throw new UncheckedIOException("Fail to serialize data statistics", e); - } - } - - static DataStatistics deserializeDataStatistics( - byte[] bytes, TypeSerializer statisticsSerializer) { - DataInputDeserializer input = new DataInputDeserializer(bytes, 0, bytes.length); - try { - return statisticsSerializer.deserialize(input); - } catch (IOException e) { - throw new UncheckedIOException("Fail to deserialize data statistics", e); - } - } - - static byte[] serializeCompletedStatistics( - CompletedStatistics completedStatistics, - TypeSerializer statisticsSerializer) { - try { - DataOutputSerializer out = new DataOutputSerializer(1024); - statisticsSerializer.serialize(completedStatistics, out); - return out.getCopyOfBuffer(); - } catch (IOException e) { - throw new UncheckedIOException("Fail to serialize aggregated statistics", e); - } - } - - static CompletedStatistics deserializeCompletedStatistics( - byte[] bytes, TypeSerializer statisticsSerializer) { - try { - DataInputDeserializer input = new DataInputDeserializer(bytes); - return statisticsSerializer.deserialize(input); - } catch (IOException e) { - throw new UncheckedIOException("Fail to 
deserialize aggregated statistics", e); - } - } - - static byte[] serializeGlobalStatistics( - GlobalStatistics globalStatistics, TypeSerializer statisticsSerializer) { - try { - DataOutputSerializer out = new DataOutputSerializer(1024); - statisticsSerializer.serialize(globalStatistics, out); - return out.getCopyOfBuffer(); - } catch (IOException e) { - throw new UncheckedIOException("Fail to serialize aggregated statistics", e); - } - } - - static GlobalStatistics deserializeGlobalStatistics( - byte[] bytes, TypeSerializer statisticsSerializer) { - try { - DataInputDeserializer input = new DataInputDeserializer(bytes); - return statisticsSerializer.deserialize(input); - } catch (IOException e) { - throw new UncheckedIOException("Fail to deserialize aggregated statistics", e); - } - } - - static StatisticsType collectType(StatisticsType config) { - return config == StatisticsType.Sketch ? StatisticsType.Sketch : StatisticsType.Map; - } - - static StatisticsType collectType(StatisticsType config, @Nullable GlobalStatistics statistics) { - if (statistics != null) { - return statistics.type(); - } - - return collectType(config); - } - - static StatisticsType collectType( - StatisticsType config, @Nullable CompletedStatistics statistics) { - if (statistics != null) { - return statistics.type(); - } - - return collectType(config); - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/AvroGenericRecordFileScanTaskReader.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/AvroGenericRecordFileScanTaskReader.java deleted file mode 100644 index 796434c45136..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/AvroGenericRecordFileScanTaskReader.java +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source; - -import org.apache.avro.generic.GenericRecord; -import org.apache.iceberg.FileScanTask; -import org.apache.iceberg.encryption.InputFilesDecryptor; -import org.apache.iceberg.io.CloseableIterator; - -public class AvroGenericRecordFileScanTaskReader implements FileScanTaskReader { - private final RowDataFileScanTaskReader rowDataReader; - private final RowDataToAvroGenericRecordConverter converter; - - public AvroGenericRecordFileScanTaskReader( - RowDataFileScanTaskReader rowDataReader, RowDataToAvroGenericRecordConverter converter) { - this.rowDataReader = rowDataReader; - this.converter = converter; - } - - @Override - public CloseableIterator open( - FileScanTask fileScanTask, InputFilesDecryptor inputFilesDecryptor) { - return CloseableIterator.transform( - rowDataReader.open(fileScanTask, inputFilesDecryptor), converter); - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/DataIterator.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/DataIterator.java deleted file mode 100644 index 91d975349b19..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/DataIterator.java +++ /dev/null @@ -1,156 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import java.io.IOException; -import java.io.UncheckedIOException; -import java.util.Iterator; -import org.apache.flink.annotation.Internal; -import org.apache.iceberg.CombinedScanTask; -import org.apache.iceberg.FileScanTask; -import org.apache.iceberg.encryption.EncryptionManager; -import org.apache.iceberg.encryption.InputFilesDecryptor; -import org.apache.iceberg.io.CloseableIterator; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; - -/** - * Flink data iterator that reads {@link CombinedScanTask} into a {@link CloseableIterator} - * - * @param is the output data type returned by this iterator. 
- */ -@Internal -public class DataIterator implements CloseableIterator { - - private final FileScanTaskReader fileScanTaskReader; - - private final InputFilesDecryptor inputFilesDecryptor; - private final CombinedScanTask combinedTask; - - private Iterator tasks; - private CloseableIterator currentIterator; - private int fileOffset; - private long recordOffset; - - public DataIterator( - FileScanTaskReader fileScanTaskReader, - CombinedScanTask task, - FileIO io, - EncryptionManager encryption) { - this.fileScanTaskReader = fileScanTaskReader; - - this.inputFilesDecryptor = new InputFilesDecryptor(task, io, encryption); - this.combinedTask = task; - - this.tasks = task.files().iterator(); - this.currentIterator = CloseableIterator.empty(); - - // fileOffset starts at -1 because we started - // from an empty iterator that is not from the split files. - this.fileOffset = -1; - // record offset points to the record that next() should return when called - this.recordOffset = 0L; - } - - /** - * (startingFileOffset, startingRecordOffset) points to the next row that reader should resume - * from. E.g., if the seek position is (file=0, record=1), seek moves the iterator position to the - * 2nd row in file 0. When next() is called after seek, 2nd row from file 0 should be returned. - */ - public void seek(int startingFileOffset, long startingRecordOffset) { - Preconditions.checkState( - fileOffset == -1, "Seek should be called before any other iterator actions"); - // skip files - Preconditions.checkState( - startingFileOffset < combinedTask.files().size(), - "Invalid starting file offset %s for combined scan task with %s files: %s", - startingFileOffset, - combinedTask.files().size(), - combinedTask); - for (long i = 0L; i < startingFileOffset; ++i) { - tasks.next(); - } - - updateCurrentIterator(); - // skip records within the file - for (long i = 0; i < startingRecordOffset; ++i) { - if (currentFileHasNext() && hasNext()) { - next(); - } else { - throw new IllegalStateException( - String.format( - "Invalid starting record offset %d for file %d from CombinedScanTask: %s", - startingRecordOffset, startingFileOffset, combinedTask)); - } - } - - fileOffset = startingFileOffset; - recordOffset = startingRecordOffset; - } - - @Override - public boolean hasNext() { - updateCurrentIterator(); - return currentIterator.hasNext(); - } - - @Override - public T next() { - updateCurrentIterator(); - recordOffset += 1; - return currentIterator.next(); - } - - public boolean currentFileHasNext() { - return currentIterator.hasNext(); - } - - /** Updates the current iterator field to ensure that the current Iterator is not exhausted. 
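A caller-side sketch may make the seek contract above clearer. The snippet below is illustrative only and assumes a FileScanTaskReader, CombinedScanTask, FileIO and EncryptionManager are already in scope; the names reader, combinedTask, io and encryption are placeholders, not values from this patch:

    static void resumeAndRead(
        FileScanTaskReader<RowData> reader,
        CombinedScanTask combinedTask,
        FileIO io,
        EncryptionManager encryption) throws IOException {
      try (DataIterator<RowData> iterator = new DataIterator<>(reader, combinedTask, io, encryption)) {
        // Resume from a previously recorded position, e.g. the 4th record of file 1.
        iterator.seek(1, 3L);
        while (iterator.hasNext()) {
          RowData row = iterator.next();
          // fileOffset()/recordOffset() now point at the next record, i.e. the position a
          // caller would record in order to resume later.
        }
      }
    }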
*/ - private void updateCurrentIterator() { - try { - while (!currentIterator.hasNext() && tasks.hasNext()) { - currentIterator.close(); - currentIterator = openTaskIterator(tasks.next()); - fileOffset += 1; - recordOffset = 0L; - } - } catch (IOException e) { - throw new UncheckedIOException(e); - } - } - - private CloseableIterator openTaskIterator(FileScanTask scanTask) { - return fileScanTaskReader.open(scanTask, inputFilesDecryptor); - } - - @Override - public void close() throws IOException { - // close the current iterator - currentIterator.close(); - tasks = null; - } - - public int fileOffset() { - return fileOffset; - } - - public long recordOffset() { - return recordOffset; - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/DataTaskReader.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/DataTaskReader.java deleted file mode 100644 index 4394dab4d4cc..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/DataTaskReader.java +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import org.apache.flink.annotation.Internal; -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.FileScanTask; -import org.apache.iceberg.Schema; -import org.apache.iceberg.encryption.InputFilesDecryptor; -import org.apache.iceberg.flink.data.StructRowData; -import org.apache.iceberg.io.CloseableIterable; -import org.apache.iceberg.io.CloseableIterator; - -@Internal -public class DataTaskReader implements FileScanTaskReader { - - private final Schema readSchema; - - public DataTaskReader(Schema readSchema) { - this.readSchema = readSchema; - } - - @Override - public CloseableIterator open( - FileScanTask task, InputFilesDecryptor inputFilesDecryptor) { - StructRowData row = new StructRowData(readSchema.asStruct()); - CloseableIterable iterable = - CloseableIterable.transform(task.asDataTask().rows(), row::setStruct); - return iterable.iterator(); - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/FileScanTaskReader.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/FileScanTaskReader.java deleted file mode 100644 index 927a804a4792..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/FileScanTaskReader.java +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import java.io.Serializable; -import org.apache.flink.annotation.Internal; -import org.apache.iceberg.FileScanTask; -import org.apache.iceberg.encryption.InputFilesDecryptor; -import org.apache.iceberg.io.CloseableIterator; - -/** - * Read a {@link FileScanTask} into a {@link CloseableIterator} - * - * @param is the output data type returned by this iterator. - */ -@Internal -public interface FileScanTaskReader extends Serializable { - CloseableIterator open(FileScanTask fileScanTask, InputFilesDecryptor inputFilesDecryptor); -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputFormat.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputFormat.java deleted file mode 100644 index 9a5123dc489e..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputFormat.java +++ /dev/null @@ -1,141 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import java.io.IOException; -import java.util.concurrent.ExecutorService; -import org.apache.flink.api.common.io.DefaultInputSplitAssigner; -import org.apache.flink.api.common.io.InputFormat; -import org.apache.flink.api.common.io.LocatableInputSplitAssigner; -import org.apache.flink.api.common.io.RichInputFormat; -import org.apache.flink.api.common.io.statistics.BaseStatistics; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.core.io.InputSplitAssigner; -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.BaseMetadataTable; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.encryption.EncryptionManager; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; -import org.apache.iceberg.util.ThreadPools; - -/** Flink {@link InputFormat} for Iceberg. 
*/ -public class FlinkInputFormat extends RichInputFormat { - - private static final long serialVersionUID = 1L; - - private final TableLoader tableLoader; - private final FileIO io; - private final EncryptionManager encryption; - private final ScanContext context; - private final FileScanTaskReader rowDataReader; - - private transient DataIterator iterator; - private transient long currentReadCount = 0L; - - FlinkInputFormat( - TableLoader tableLoader, - Schema tableSchema, - FileIO io, - EncryptionManager encryption, - ScanContext context) { - this.tableLoader = tableLoader; - this.io = io; - this.encryption = encryption; - this.context = context; - - tableLoader.open(); - Table table = tableLoader.loadTable(); - if (table instanceof BaseMetadataTable) { - this.rowDataReader = new DataTaskReader(context.project()); - } else { - this.rowDataReader = - new RowDataFileScanTaskReader( - tableSchema, - context.project(), - context.nameMapping(), - context.caseSensitive(), - context.filters()); - } - } - - @VisibleForTesting - Schema projectedSchema() { - return context.project(); - } - - @Override - public BaseStatistics getStatistics(BaseStatistics cachedStatistics) { - // Legacy method, not be used. - return null; - } - - @Override - public FlinkInputSplit[] createInputSplits(int minNumSplits) throws IOException { - // Called in Job manager, so it is OK to load table from catalog. - tableLoader.open(); - final ExecutorService workerPool = - ThreadPools.newWorkerPool("iceberg-plan-worker-pool", context.planParallelism()); - try (TableLoader loader = tableLoader) { - Table table = loader.loadTable(); - return FlinkSplitPlanner.planInputSplits(table, context, workerPool); - } finally { - workerPool.shutdown(); - } - } - - @Override - public InputSplitAssigner getInputSplitAssigner(FlinkInputSplit[] inputSplits) { - return context.exposeLocality() - ? new LocatableInputSplitAssigner(inputSplits) - : new DefaultInputSplitAssigner(inputSplits); - } - - @Override - public void configure(Configuration parameters) {} - - @Override - public void open(FlinkInputSplit split) { - this.iterator = new DataIterator<>(rowDataReader, split.getTask(), io, encryption); - } - - @Override - public boolean reachedEnd() { - if (context.limit() > 0 && currentReadCount >= context.limit()) { - return true; - } else { - return !iterator.hasNext(); - } - } - - @Override - public RowData nextRecord(RowData reuse) { - currentReadCount++; - return iterator.next(); - } - - @Override - public void close() throws IOException { - if (iterator != null) { - iterator.close(); - } - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputSplit.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputSplit.java deleted file mode 100644 index 16fd4f39596c..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputSplit.java +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import java.util.Arrays; -import javax.annotation.Nullable; -import org.apache.flink.core.io.LocatableInputSplit; -import org.apache.iceberg.CombinedScanTask; -import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; - -public class FlinkInputSplit extends LocatableInputSplit { - - private final CombinedScanTask task; - - FlinkInputSplit(int splitNumber, CombinedScanTask task, @Nullable String[] hostnames) { - super(splitNumber, hostnames); - this.task = task; - } - - CombinedScanTask getTask() { - return task; - } - - @Override - public String toString() { - return MoreObjects.toStringHelper(this) - .add("splitNumber", getSplitNumber()) - .add("task", task) - .add("hosts", Arrays.toString(getHostnames())) - .toString(); - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSource.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSource.java deleted file mode 100644 index b1431a32dd20..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSource.java +++ /dev/null @@ -1,307 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source; - -import java.io.IOException; -import java.io.UncheckedIOException; -import java.time.Duration; -import java.util.List; -import java.util.Map; -import org.apache.flink.api.common.typeinfo.TypeInformation; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.configuration.ReadableConfig; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.table.api.TableSchema; -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.TableScan; -import org.apache.iceberg.encryption.EncryptionManager; -import org.apache.iceberg.expressions.Expression; -import org.apache.iceberg.flink.FlinkConfigOptions; -import org.apache.iceberg.flink.FlinkReadOptions; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.flink.util.FlinkCompatibilityUtil; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.util.PropertyUtil; - -public class FlinkSource { - private FlinkSource() {} - - /** - * Initialize a {@link Builder} to read the data from iceberg table. Equivalent to {@link - * TableScan}. See more options in {@link ScanContext}. - * - *
- * <p>The Source can read static data in bounded mode. It can also continuously check the
- * arrival of new data and read records incrementally.
- *
- * <ul>
- *   <li>Without startSnapshotId: Bounded
- *   <li>With startSnapshotId and with endSnapshotId: Bounded
- *   <li>With startSnapshotId (-1 means unbounded preceding) and without endSnapshotId: Unbounded
- * </ul>
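For reference, a typical bounded read with this builder looks like the sketch below; the environment setup and the Hadoop table location are illustrative assumptions, not values taken from this patch:

    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    TableLoader tableLoader = TableLoader.fromHadoopTable("hdfs://nn:8020/warehouse/db/tbl");

    DataStream<RowData> batch =
        FlinkSource.forRowData()
            .env(env)
            .tableLoader(tableLoader)
            .streaming(false) // bounded read of the current table snapshot
            .build();

Switching to streaming(true) and supplying startSnapshotId(...) turns the same builder into an unbounded source that reads new snapshots incrementally, matching the list above.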
    - * - * @return {@link Builder} to connect the iceberg table. - */ - public static Builder forRowData() { - return new Builder(); - } - - /** Source builder to build {@link DataStream}. */ - public static class Builder { - private StreamExecutionEnvironment env; - private Table table; - private TableLoader tableLoader; - private TableSchema projectedSchema; - private ReadableConfig readableConfig = new Configuration(); - private final ScanContext.Builder contextBuilder = ScanContext.builder(); - private Boolean exposeLocality; - - private final Map readOptions = Maps.newHashMap(); - - public Builder tableLoader(TableLoader newLoader) { - this.tableLoader = newLoader; - return this; - } - - public Builder table(Table newTable) { - this.table = newTable; - return this; - } - - public Builder env(StreamExecutionEnvironment newEnv) { - this.env = newEnv; - return this; - } - - public Builder filters(List filters) { - contextBuilder.filters(filters); - return this; - } - - public Builder project(TableSchema schema) { - this.projectedSchema = schema; - return this; - } - - public Builder limit(Long newLimit) { - if (newLimit != null) { - readOptions.put(FlinkReadOptions.LIMIT, Long.toString(newLimit)); - } - return this; - } - - public Builder set(String property, String value) { - readOptions.put(property, value); - return this; - } - - public Builder setAll(Map properties) { - readOptions.putAll(properties); - return this; - } - - /** @deprecated Use {@link #setAll} instead. */ - @Deprecated - public Builder properties(Map properties) { - readOptions.putAll(properties); - return this; - } - - public Builder caseSensitive(boolean caseSensitive) { - readOptions.put(FlinkReadOptions.CASE_SENSITIVE, Boolean.toString(caseSensitive)); - return this; - } - - public Builder snapshotId(Long snapshotId) { - readOptions.put(FlinkReadOptions.SNAPSHOT_ID.key(), Long.toString(snapshotId)); - return this; - } - - public Builder branch(String branch) { - readOptions.put(FlinkReadOptions.BRANCH.key(), branch); - return this; - } - - public Builder tag(String tag) { - readOptions.put(FlinkReadOptions.TAG.key(), tag); - return this; - } - - public Builder startSnapshotId(Long startSnapshotId) { - readOptions.put(FlinkReadOptions.START_SNAPSHOT_ID.key(), Long.toString(startSnapshotId)); - return this; - } - - public Builder endSnapshotId(Long endSnapshotId) { - readOptions.put(FlinkReadOptions.END_SNAPSHOT_ID.key(), Long.toString(endSnapshotId)); - return this; - } - - public Builder startTag(String startTag) { - readOptions.put(FlinkReadOptions.START_TAG.key(), startTag); - return this; - } - - public Builder endTag(String endTag) { - readOptions.put(FlinkReadOptions.END_TAG.key(), endTag); - return this; - } - - public Builder asOfTimestamp(Long asOfTimestamp) { - readOptions.put(FlinkReadOptions.AS_OF_TIMESTAMP.key(), Long.toString(asOfTimestamp)); - return this; - } - - public Builder splitSize(Long splitSize) { - readOptions.put(FlinkReadOptions.SPLIT_SIZE, Long.toString(splitSize)); - return this; - } - - public Builder splitLookback(Integer splitLookback) { - readOptions.put(FlinkReadOptions.SPLIT_LOOKBACK, Integer.toString(splitLookback)); - return this; - } - - public Builder splitOpenFileCost(Long splitOpenFileCost) { - readOptions.put(FlinkReadOptions.SPLIT_FILE_OPEN_COST, Long.toString(splitOpenFileCost)); - return this; - } - - public Builder streaming(boolean streaming) { - readOptions.put(FlinkReadOptions.STREAMING, Boolean.toString(streaming)); - return this; - } - - public Builder 
exposeLocality(boolean newExposeLocality) { - this.exposeLocality = newExposeLocality; - return this; - } - - public Builder nameMapping(String nameMapping) { - readOptions.put(TableProperties.DEFAULT_NAME_MAPPING, nameMapping); - return this; - } - - public Builder monitorInterval(Duration interval) { - readOptions.put(FlinkReadOptions.MONITOR_INTERVAL, interval.toNanos() + " ns"); - return this; - } - - public Builder maxPlanningSnapshotCount(int newMaxPlanningSnapshotCount) { - readOptions.put( - FlinkReadOptions.MAX_PLANNING_SNAPSHOT_COUNT, - Integer.toString(newMaxPlanningSnapshotCount)); - return this; - } - - public Builder flinkConf(ReadableConfig config) { - this.readableConfig = config; - return this; - } - - public FlinkInputFormat buildFormat() { - Preconditions.checkNotNull(tableLoader, "TableLoader should not be null"); - - Schema icebergSchema; - FileIO io; - EncryptionManager encryption; - if (table == null) { - // load required fields by table loader. - tableLoader.open(); - try (TableLoader loader = tableLoader) { - table = loader.loadTable(); - icebergSchema = table.schema(); - io = table.io(); - encryption = table.encryption(); - } catch (IOException e) { - throw new UncheckedIOException(e); - } - } else { - icebergSchema = table.schema(); - io = table.io(); - encryption = table.encryption(); - } - - if (projectedSchema == null) { - contextBuilder.project(icebergSchema); - } else { - contextBuilder.project(FlinkSchemaUtil.convert(icebergSchema, projectedSchema)); - } - - contextBuilder.exposeLocality( - SourceUtil.isLocalityEnabled(table, readableConfig, exposeLocality)); - contextBuilder.planParallelism( - readableConfig.get(FlinkConfigOptions.TABLE_EXEC_ICEBERG_WORKER_POOL_SIZE)); - - contextBuilder.resolveConfig(table, readOptions, readableConfig); - - ScanContext context = contextBuilder.build(); - context.validate(); - return new FlinkInputFormat(tableLoader, icebergSchema, io, encryption, context); - } - - public DataStream build() { - Preconditions.checkNotNull(env, "StreamExecutionEnvironment should not be null"); - FlinkInputFormat format = buildFormat(); - - ScanContext context = contextBuilder.build(); - TypeInformation typeInfo = - FlinkCompatibilityUtil.toTypeInfo(FlinkSchemaUtil.convert(context.project())); - - if (!context.isStreaming()) { - int parallelism = - SourceUtil.inferParallelism( - readableConfig, - context.limit(), - () -> { - try { - return format.createInputSplits(0).length; - } catch (IOException e) { - throw new UncheckedIOException( - "Failed to create iceberg input splits for table: " + table, e); - } - }); - if (env.getMaxParallelism() > 0) { - parallelism = Math.min(parallelism, env.getMaxParallelism()); - } - return env.createInput(format, typeInfo).setParallelism(parallelism); - } else { - StreamingMonitorFunction function = new StreamingMonitorFunction(tableLoader, context); - - String monitorFunctionName = String.format("Iceberg table (%s) monitor", table); - String readerOperatorName = String.format("Iceberg table (%s) reader", table); - - return env.addSource(function, monitorFunctionName) - .transform(readerOperatorName, typeInfo, StreamingReaderOperator.factory(format)); - } - } - } - - public static boolean isBounded(Map properties) { - return !PropertyUtil.propertyAsBoolean(properties, FlinkReadOptions.STREAMING, false); - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSplitPlanner.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSplitPlanner.java deleted file 
mode 100644 index 15078809714f..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSplitPlanner.java +++ /dev/null @@ -1,189 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import java.io.IOException; -import java.io.UncheckedIOException; -import java.util.List; -import java.util.concurrent.ExecutorService; -import org.apache.flink.annotation.Internal; -import org.apache.iceberg.CombinedScanTask; -import org.apache.iceberg.FileScanTask; -import org.apache.iceberg.IncrementalAppendScan; -import org.apache.iceberg.Scan; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.TableScan; -import org.apache.iceberg.expressions.Expression; -import org.apache.iceberg.flink.source.split.IcebergSourceSplit; -import org.apache.iceberg.hadoop.Util; -import org.apache.iceberg.io.CloseableIterable; -import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.util.Tasks; - -@Internal -public class FlinkSplitPlanner { - private FlinkSplitPlanner() {} - - static FlinkInputSplit[] planInputSplits( - Table table, ScanContext context, ExecutorService workerPool) { - try (CloseableIterable tasksIterable = - planTasks(table, context, workerPool)) { - List tasks = Lists.newArrayList(tasksIterable); - FlinkInputSplit[] splits = new FlinkInputSplit[tasks.size()]; - boolean exposeLocality = context.exposeLocality(); - - Tasks.range(tasks.size()) - .stopOnFailure() - .executeWith(exposeLocality ? 
workerPool : null) - .run( - index -> { - CombinedScanTask task = tasks.get(index); - String[] hostnames = null; - if (exposeLocality) { - hostnames = Util.blockLocations(table.io(), task); - } - splits[index] = new FlinkInputSplit(index, task, hostnames); - }); - return splits; - } catch (IOException e) { - throw new UncheckedIOException("Failed to process tasks iterable", e); - } - } - - /** This returns splits for the FLIP-27 source */ - public static List planIcebergSourceSplits( - Table table, ScanContext context, ExecutorService workerPool) { - try (CloseableIterable tasksIterable = - planTasks(table, context, workerPool)) { - return Lists.newArrayList( - CloseableIterable.transform(tasksIterable, IcebergSourceSplit::fromCombinedScanTask)); - } catch (IOException e) { - throw new UncheckedIOException("Failed to process task iterable: ", e); - } - } - - static CloseableIterable planTasks( - Table table, ScanContext context, ExecutorService workerPool) { - ScanMode scanMode = checkScanMode(context); - if (scanMode == ScanMode.INCREMENTAL_APPEND_SCAN) { - IncrementalAppendScan scan = table.newIncrementalAppendScan(); - scan = refineScanWithBaseConfigs(scan, context, workerPool); - - if (context.startTag() != null) { - Preconditions.checkArgument( - table.snapshot(context.startTag()) != null, - "Cannot find snapshot with tag %s", - context.startTag()); - scan = scan.fromSnapshotExclusive(table.snapshot(context.startTag()).snapshotId()); - } - - if (context.startSnapshotId() != null) { - Preconditions.checkArgument( - context.startTag() == null, "START_SNAPSHOT_ID and START_TAG cannot both be set"); - scan = scan.fromSnapshotExclusive(context.startSnapshotId()); - } - - if (context.endTag() != null) { - Preconditions.checkArgument( - table.snapshot(context.endTag()) != null, - "Cannot find snapshot with tag %s", - context.endTag()); - scan = scan.toSnapshot(table.snapshot(context.endTag()).snapshotId()); - } - - if (context.endSnapshotId() != null) { - Preconditions.checkArgument( - context.endTag() == null, "END_SNAPSHOT_ID and END_TAG cannot both be set"); - scan = scan.toSnapshot(context.endSnapshotId()); - } - - return scan.planTasks(); - } else { - TableScan scan = table.newScan(); - scan = refineScanWithBaseConfigs(scan, context, workerPool); - - if (context.snapshotId() != null) { - scan = scan.useSnapshot(context.snapshotId()); - } else if (context.tag() != null) { - scan = scan.useRef(context.tag()); - } else if (context.branch() != null) { - scan = scan.useRef(context.branch()); - } - - if (context.asOfTimestamp() != null) { - scan = scan.asOfTime(context.asOfTimestamp()); - } - - return scan.planTasks(); - } - } - - @VisibleForTesting - enum ScanMode { - BATCH, - INCREMENTAL_APPEND_SCAN - } - - @VisibleForTesting - static ScanMode checkScanMode(ScanContext context) { - if (context.startSnapshotId() != null - || context.endSnapshotId() != null - || context.startTag() != null - || context.endTag() != null) { - return ScanMode.INCREMENTAL_APPEND_SCAN; - } else { - return ScanMode.BATCH; - } - } - - /** refine scan with common configs */ - private static > T refineScanWithBaseConfigs( - T scan, ScanContext context, ExecutorService workerPool) { - T refinedScan = - scan.caseSensitive(context.caseSensitive()).project(context.project()).planWith(workerPool); - - if (context.includeColumnStats()) { - refinedScan = refinedScan.includeColumnStats(); - } - - if (context.includeStatsForColumns() != null) { - refinedScan = refinedScan.includeColumnStats(context.includeStatsForColumns()); - 
} - - refinedScan = refinedScan.option(TableProperties.SPLIT_SIZE, context.splitSize().toString()); - - refinedScan = - refinedScan.option(TableProperties.SPLIT_LOOKBACK, context.splitLookback().toString()); - - refinedScan = - refinedScan.option( - TableProperties.SPLIT_OPEN_FILE_COST, context.splitOpenFileCost().toString()); - - if (context.filters() != null) { - for (Expression filter : context.filters()) { - refinedScan = refinedScan.filter(filter); - } - } - - return refinedScan; - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/IcebergSource.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/IcebergSource.java deleted file mode 100644 index ccbd0d9997ed..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/IcebergSource.java +++ /dev/null @@ -1,549 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import java.io.IOException; -import java.io.UncheckedIOException; -import java.time.Duration; -import java.util.List; -import java.util.Map; -import java.util.UUID; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.TimeUnit; -import javax.annotation.Nullable; -import org.apache.flink.annotation.Experimental; -import org.apache.flink.api.connector.source.Boundedness; -import org.apache.flink.api.connector.source.Source; -import org.apache.flink.api.connector.source.SourceReader; -import org.apache.flink.api.connector.source.SourceReaderContext; -import org.apache.flink.api.connector.source.SplitEnumerator; -import org.apache.flink.api.connector.source.SplitEnumeratorContext; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.configuration.ReadableConfig; -import org.apache.flink.core.io.SimpleVersionedSerializer; -import org.apache.flink.table.api.TableSchema; -import org.apache.flink.table.data.RowData; -import org.apache.flink.util.Preconditions; -import org.apache.iceberg.BaseMetadataTable; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.expressions.Expression; -import org.apache.iceberg.flink.FlinkConfigOptions; -import org.apache.iceberg.flink.FlinkReadConf; -import org.apache.iceberg.flink.FlinkReadOptions; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.flink.source.assigner.OrderedSplitAssignerFactory; -import org.apache.iceberg.flink.source.assigner.SimpleSplitAssignerFactory; -import org.apache.iceberg.flink.source.assigner.SplitAssigner; -import org.apache.iceberg.flink.source.assigner.SplitAssignerFactory; -import 
org.apache.iceberg.flink.source.enumerator.ContinuousIcebergEnumerator; -import org.apache.iceberg.flink.source.enumerator.ContinuousSplitPlanner; -import org.apache.iceberg.flink.source.enumerator.ContinuousSplitPlannerImpl; -import org.apache.iceberg.flink.source.enumerator.IcebergEnumeratorState; -import org.apache.iceberg.flink.source.enumerator.IcebergEnumeratorStateSerializer; -import org.apache.iceberg.flink.source.enumerator.StaticIcebergEnumerator; -import org.apache.iceberg.flink.source.reader.ColumnStatsWatermarkExtractor; -import org.apache.iceberg.flink.source.reader.IcebergSourceReader; -import org.apache.iceberg.flink.source.reader.IcebergSourceReaderMetrics; -import org.apache.iceberg.flink.source.reader.MetaDataReaderFunction; -import org.apache.iceberg.flink.source.reader.ReaderFunction; -import org.apache.iceberg.flink.source.reader.RowDataReaderFunction; -import org.apache.iceberg.flink.source.reader.SerializableRecordEmitter; -import org.apache.iceberg.flink.source.reader.SplitWatermarkExtractor; -import org.apache.iceberg.flink.source.split.IcebergSourceSplit; -import org.apache.iceberg.flink.source.split.IcebergSourceSplitSerializer; -import org.apache.iceberg.flink.source.split.SerializableComparator; -import org.apache.iceberg.flink.source.split.SplitComparators; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.apache.iceberg.util.ThreadPools; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -@Experimental -public class IcebergSource implements Source { - private static final Logger LOG = LoggerFactory.getLogger(IcebergSource.class); - - // This table loader can be closed, and it is only safe to use this instance for resource - // independent information (e.g. a table name). Copies of this are required to avoid lifecycle - // management conflicts with the user provided table loader. e.g. a copy of this is required for - // split planning, which uses the underlying io, and should be closed after split planning is - // complete. - private final TableLoader tableLoader; - private final ScanContext scanContext; - private final ReaderFunction readerFunction; - private final SplitAssignerFactory assignerFactory; - private final SerializableComparator splitComparator; - private final SerializableRecordEmitter emitter; - private final String tableName; - - IcebergSource( - TableLoader tableLoader, - ScanContext scanContext, - ReaderFunction readerFunction, - SplitAssignerFactory assignerFactory, - SerializableComparator splitComparator, - Table table, - SerializableRecordEmitter emitter) { - Preconditions.checkNotNull(tableLoader, "tableLoader is required."); - Preconditions.checkNotNull(readerFunction, "readerFunction is required."); - Preconditions.checkNotNull(assignerFactory, "assignerFactory is required."); - Preconditions.checkNotNull(table, "table is required."); - this.tableLoader = tableLoader; - this.scanContext = scanContext; - this.readerFunction = readerFunction; - this.assignerFactory = assignerFactory; - this.splitComparator = splitComparator; - this.emitter = emitter; - this.tableName = table.name(); - } - - String name() { - return "IcebergSource-" + tableName; - } - - private String planningThreadName() { - // Ideally, operatorId should be used as the threadPoolName as Flink guarantees its uniqueness - // within a job. SplitEnumeratorContext doesn't expose the OperatorCoordinator.Context, which - // would contain the OperatorID. 
Need to discuss with Flink community whether it is ok to expose - // a public API like the protected method "OperatorCoordinator.Context getCoordinatorContext()" - // from SourceCoordinatorContext implementation. For now,
  • - is used as - // the unique thread pool name. - return tableName + "-" + UUID.randomUUID(); - } - - private List planSplitsForBatch(String threadName) { - ExecutorService workerPool = - ThreadPools.newWorkerPool(threadName, scanContext.planParallelism()); - try (TableLoader loader = tableLoader.clone()) { - loader.open(); - List splits = - FlinkSplitPlanner.planIcebergSourceSplits(loader.loadTable(), scanContext, workerPool); - LOG.info( - "Discovered {} splits from table {} during job initialization", splits.size(), tableName); - return splits; - } catch (IOException e) { - throw new UncheckedIOException("Failed to close table loader", e); - } finally { - workerPool.shutdown(); - } - } - - @Override - public Boundedness getBoundedness() { - return scanContext.isStreaming() ? Boundedness.CONTINUOUS_UNBOUNDED : Boundedness.BOUNDED; - } - - @Override - public SourceReader createReader(SourceReaderContext readerContext) { - IcebergSourceReaderMetrics metrics = - new IcebergSourceReaderMetrics(readerContext.metricGroup(), tableName); - return new IcebergSourceReader<>( - emitter, metrics, readerFunction, splitComparator, readerContext); - } - - @Override - public SplitEnumerator createEnumerator( - SplitEnumeratorContext enumContext) { - return createEnumerator(enumContext, null); - } - - @Override - public SplitEnumerator restoreEnumerator( - SplitEnumeratorContext enumContext, IcebergEnumeratorState enumState) { - return createEnumerator(enumContext, enumState); - } - - @Override - public SimpleVersionedSerializer getSplitSerializer() { - return new IcebergSourceSplitSerializer(scanContext.caseSensitive()); - } - - @Override - public SimpleVersionedSerializer getEnumeratorCheckpointSerializer() { - return new IcebergEnumeratorStateSerializer(scanContext.caseSensitive()); - } - - private SplitEnumerator createEnumerator( - SplitEnumeratorContext enumContext, - @Nullable IcebergEnumeratorState enumState) { - SplitAssigner assigner; - if (enumState == null) { - assigner = assignerFactory.createAssigner(); - } else { - LOG.info( - "Iceberg source restored {} splits from state for table {}", - enumState.pendingSplits().size(), - tableName); - assigner = assignerFactory.createAssigner(enumState.pendingSplits()); - } - if (scanContext.isStreaming()) { - ContinuousSplitPlanner splitPlanner = - new ContinuousSplitPlannerImpl(tableLoader, scanContext, planningThreadName()); - return new ContinuousIcebergEnumerator( - enumContext, assigner, scanContext, splitPlanner, enumState); - } else { - if (enumState == null) { - // Only do scan planning if nothing is restored from checkpoint state - List splits = planSplitsForBatch(planningThreadName()); - assigner.onDiscoveredSplits(splits); - } - - return new StaticIcebergEnumerator(enumContext, assigner); - } - } - - public static Builder builder() { - return new Builder<>(); - } - - public static Builder forRowData() { - return new Builder<>(); - } - - public static class Builder { - private TableLoader tableLoader; - private Table table; - private SplitAssignerFactory splitAssignerFactory; - private SerializableComparator splitComparator; - private ReaderFunction readerFunction; - private ReadableConfig flinkConfig = new Configuration(); - private final ScanContext.Builder contextBuilder = ScanContext.builder(); - private TableSchema projectedFlinkSchema; - private Boolean exposeLocality; - - private final Map readOptions = Maps.newHashMap(); - - Builder() {} - - public Builder tableLoader(TableLoader loader) { - this.tableLoader = loader; - return 
this; - } - - public Builder table(Table newTable) { - this.table = newTable; - return this; - } - - public Builder assignerFactory(SplitAssignerFactory assignerFactory) { - this.splitAssignerFactory = assignerFactory; - return this; - } - - public Builder splitComparator( - SerializableComparator newSplitComparator) { - this.splitComparator = newSplitComparator; - return this; - } - - public Builder readerFunction(ReaderFunction newReaderFunction) { - this.readerFunction = newReaderFunction; - return this; - } - - public Builder flinkConfig(ReadableConfig config) { - this.flinkConfig = config; - return this; - } - - public Builder caseSensitive(boolean newCaseSensitive) { - readOptions.put(FlinkReadOptions.CASE_SENSITIVE, Boolean.toString(newCaseSensitive)); - return this; - } - - public Builder useSnapshotId(Long newSnapshotId) { - if (newSnapshotId != null) { - readOptions.put(FlinkReadOptions.SNAPSHOT_ID.key(), Long.toString(newSnapshotId)); - } - return this; - } - - public Builder streamingStartingStrategy(StreamingStartingStrategy newStartingStrategy) { - readOptions.put(FlinkReadOptions.STARTING_STRATEGY, newStartingStrategy.name()); - return this; - } - - public Builder startSnapshotTimestamp(Long newStartSnapshotTimestamp) { - if (newStartSnapshotTimestamp != null) { - readOptions.put( - FlinkReadOptions.START_SNAPSHOT_TIMESTAMP.key(), - Long.toString(newStartSnapshotTimestamp)); - } - return this; - } - - public Builder startSnapshotId(Long newStartSnapshotId) { - if (newStartSnapshotId != null) { - readOptions.put( - FlinkReadOptions.START_SNAPSHOT_ID.key(), Long.toString(newStartSnapshotId)); - } - return this; - } - - public Builder tag(String tag) { - readOptions.put(FlinkReadOptions.TAG.key(), tag); - return this; - } - - public Builder branch(String branch) { - readOptions.put(FlinkReadOptions.BRANCH.key(), branch); - return this; - } - - public Builder startTag(String startTag) { - readOptions.put(FlinkReadOptions.START_TAG.key(), startTag); - return this; - } - - public Builder endTag(String endTag) { - readOptions.put(FlinkReadOptions.END_TAG.key(), endTag); - return this; - } - - public Builder endSnapshotId(Long newEndSnapshotId) { - if (newEndSnapshotId != null) { - readOptions.put(FlinkReadOptions.END_SNAPSHOT_ID.key(), Long.toString(newEndSnapshotId)); - } - return this; - } - - public Builder asOfTimestamp(Long newAsOfTimestamp) { - if (newAsOfTimestamp != null) { - readOptions.put(FlinkReadOptions.AS_OF_TIMESTAMP.key(), Long.toString(newAsOfTimestamp)); - } - return this; - } - - public Builder splitSize(Long newSplitSize) { - if (newSplitSize != null) { - readOptions.put(FlinkReadOptions.SPLIT_SIZE, Long.toString(newSplitSize)); - } - return this; - } - - public Builder splitLookback(Integer newSplitLookback) { - if (newSplitLookback != null) { - readOptions.put(FlinkReadOptions.SPLIT_LOOKBACK, Integer.toString(newSplitLookback)); - } - return this; - } - - public Builder splitOpenFileCost(Long newSplitOpenFileCost) { - if (newSplitOpenFileCost != null) { - readOptions.put(FlinkReadOptions.SPLIT_FILE_OPEN_COST, Long.toString(newSplitOpenFileCost)); - } - - return this; - } - - public Builder streaming(boolean streaming) { - readOptions.put(FlinkReadOptions.STREAMING, Boolean.toString(streaming)); - return this; - } - - public Builder monitorInterval(Duration newMonitorInterval) { - if (newMonitorInterval != null) { - readOptions.put(FlinkReadOptions.MONITOR_INTERVAL, newMonitorInterval.toNanos() + " ns"); - } - return this; - } - - public Builder 
nameMapping(String newNameMapping) { - readOptions.put(TableProperties.DEFAULT_NAME_MAPPING, newNameMapping); - return this; - } - - public Builder project(Schema newProjectedSchema) { - this.contextBuilder.project(newProjectedSchema); - return this; - } - - public Builder project(TableSchema newProjectedFlinkSchema) { - this.projectedFlinkSchema = newProjectedFlinkSchema; - return this; - } - - public Builder filters(List newFilters) { - this.contextBuilder.filters(newFilters); - return this; - } - - public Builder limit(Long newLimit) { - if (newLimit != null) { - readOptions.put(FlinkReadOptions.LIMIT, Long.toString(newLimit)); - } - return this; - } - - public Builder includeColumnStats(boolean newIncludeColumnStats) { - readOptions.put( - FlinkReadOptions.INCLUDE_COLUMN_STATS, Boolean.toString(newIncludeColumnStats)); - return this; - } - - public Builder planParallelism(int planParallelism) { - readOptions.put( - FlinkConfigOptions.TABLE_EXEC_ICEBERG_WORKER_POOL_SIZE.key(), - Integer.toString(planParallelism)); - return this; - } - - public Builder exposeLocality(boolean newExposeLocality) { - this.exposeLocality = newExposeLocality; - return this; - } - - public Builder maxAllowedPlanningFailures(int maxAllowedPlanningFailures) { - readOptions.put( - FlinkReadOptions.MAX_ALLOWED_PLANNING_FAILURES_OPTION.key(), - Integer.toString(maxAllowedPlanningFailures)); - return this; - } - - /** - * Set the read properties for Flink source. View the supported properties in {@link - * FlinkReadOptions} - */ - public Builder set(String property, String value) { - readOptions.put(property, value); - return this; - } - - /** - * Set the read properties for Flink source. View the supported properties in {@link - * FlinkReadOptions} - */ - public Builder setAll(Map properties) { - readOptions.putAll(properties); - return this; - } - - /** - * Emits watermarks once per split based on the min value of column statistics from files - * metadata in the given split. The generated watermarks are also used for ordering the splits - * for read. Accepted column types are timestamp/timestamptz/long. For long columns consider - * setting {@link #watermarkColumnTimeUnit(TimeUnit)}. - * - *
<p>
    Consider setting `read.split.open-file-cost` to prevent combining small files to a single - * split when the watermark is used for watermark alignment. - */ - public Builder watermarkColumn(String columnName) { - Preconditions.checkArgument( - splitAssignerFactory == null, - "Watermark column and SplitAssigner should not be set in the same source"); - readOptions.put(FlinkReadOptions.WATERMARK_COLUMN, columnName); - return this; - } - - /** - * When the type of the {@link #watermarkColumn} is {@link - * org.apache.iceberg.types.Types.LongType}, then sets the {@link TimeUnit} to convert the - * value. The default value is {@link TimeUnit#MICROSECONDS}. - */ - public Builder watermarkColumnTimeUnit(TimeUnit timeUnit) { - readOptions.put(FlinkReadOptions.WATERMARK_COLUMN_TIME_UNIT, timeUnit.name()); - return this; - } - - /** @deprecated Use {@link #setAll} instead. */ - @Deprecated - public Builder properties(Map properties) { - readOptions.putAll(properties); - return this; - } - - public IcebergSource build() { - if (table == null) { - try (TableLoader loader = tableLoader) { - loader.open(); - this.table = tableLoader.loadTable(); - } catch (IOException e) { - throw new UncheckedIOException(e); - } - } - - contextBuilder.resolveConfig(table, readOptions, flinkConfig); - Schema icebergSchema = table.schema(); - if (projectedFlinkSchema != null) { - contextBuilder.project(FlinkSchemaUtil.convert(icebergSchema, projectedFlinkSchema)); - } - - SerializableRecordEmitter emitter = SerializableRecordEmitter.defaultEmitter(); - FlinkReadConf flinkReadConf = new FlinkReadConf(table, readOptions, flinkConfig); - String watermarkColumn = flinkReadConf.watermarkColumn(); - TimeUnit watermarkTimeUnit = flinkReadConf.watermarkColumnTimeUnit(); - - if (watermarkColumn != null) { - // Column statistics is needed for watermark generation - contextBuilder.includeColumnStats(Sets.newHashSet(watermarkColumn)); - - SplitWatermarkExtractor watermarkExtractor = - new ColumnStatsWatermarkExtractor(icebergSchema, watermarkColumn, watermarkTimeUnit); - emitter = SerializableRecordEmitter.emitterWithWatermark(watermarkExtractor); - splitAssignerFactory = - new OrderedSplitAssignerFactory(SplitComparators.watermark(watermarkExtractor)); - } - - ScanContext context = contextBuilder.build(); - context.validate(); - if (readerFunction == null) { - if (table instanceof BaseMetadataTable) { - MetaDataReaderFunction rowDataReaderFunction = - new MetaDataReaderFunction( - flinkConfig, table.schema(), context.project(), table.io(), table.encryption()); - this.readerFunction = (ReaderFunction) rowDataReaderFunction; - } else { - RowDataReaderFunction rowDataReaderFunction = - new RowDataReaderFunction( - flinkConfig, - table.schema(), - context.project(), - context.nameMapping(), - context.caseSensitive(), - table.io(), - table.encryption(), - context.filters(), - context.limit()); - this.readerFunction = (ReaderFunction) rowDataReaderFunction; - } - } - - if (splitAssignerFactory == null) { - if (splitComparator == null) { - splitAssignerFactory = new SimpleSplitAssignerFactory(); - } else { - splitAssignerFactory = new OrderedSplitAssignerFactory(splitComparator); - } - } - - // Since builder already load the table, pass it to the source to avoid double loading - return new IcebergSource<>( - tableLoader, - context, - readerFunction, - splitAssignerFactory, - splitComparator, - table, - emitter); - } - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/IcebergTableSource.java 
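For reference, the FLIP-27 IcebergSource removed above is normally wired into a job through the builder shown in this file; a minimal, self-contained sketch under assumed values (the warehouse path, source name, and the commented-out watermark column are illustrative, not part of this patch):

    import java.time.Duration;
    import org.apache.flink.api.common.eventtime.WatermarkStrategy;
    import org.apache.flink.api.common.typeinfo.TypeInformation;
    import org.apache.flink.streaming.api.datastream.DataStream;
    import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
    import org.apache.flink.table.data.RowData;
    import org.apache.iceberg.flink.TableLoader;
    import org.apache.iceberg.flink.source.IcebergSource;
    import org.apache.iceberg.flink.source.assigner.SimpleSplitAssignerFactory;

    public class IcebergFlip27ReadSketch {
      public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // Hypothetical Hadoop-catalog table path.
        TableLoader loader = TableLoader.fromHadoopTable("hdfs://nn:8020/warehouse/db/tbl");

        IcebergSource<RowData> source =
            IcebergSource.forRowData()
                .tableLoader(loader)
                .assignerFactory(new SimpleSplitAssignerFactory())
                .streaming(true)
                .monitorInterval(Duration.ofSeconds(60))
                // .watermarkColumn("event_ts")  // alternative to assignerFactory(...): orders
                //                               // splits by column stats, per the Javadoc above
                .build();

        DataStream<RowData> stream =
            env.fromSource(
                source,
                WatermarkStrategy.noWatermarks(),
                "iceberg-source",
                TypeInformation.of(RowData.class));

        stream.print();
        env.execute("iceberg-flip27-read-sketch");
      }
    }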
b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/IcebergTableSource.java deleted file mode 100644 index 610657e8d47b..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/IcebergTableSource.java +++ /dev/null @@ -1,229 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import java.util.Arrays; -import java.util.List; -import java.util.Map; -import java.util.Optional; -import org.apache.flink.annotation.Internal; -import org.apache.flink.api.common.eventtime.WatermarkStrategy; -import org.apache.flink.api.common.typeinfo.TypeInformation; -import org.apache.flink.configuration.ReadableConfig; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.datastream.DataStreamSource; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.table.api.TableSchema; -import org.apache.flink.table.connector.ChangelogMode; -import org.apache.flink.table.connector.ProviderContext; -import org.apache.flink.table.connector.source.DataStreamScanProvider; -import org.apache.flink.table.connector.source.DynamicTableSource; -import org.apache.flink.table.connector.source.ScanTableSource; -import org.apache.flink.table.connector.source.abilities.SupportsFilterPushDown; -import org.apache.flink.table.connector.source.abilities.SupportsLimitPushDown; -import org.apache.flink.table.connector.source.abilities.SupportsProjectionPushDown; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.expressions.ResolvedExpression; -import org.apache.flink.table.types.DataType; -import org.apache.iceberg.expressions.Expression; -import org.apache.iceberg.flink.FlinkConfigOptions; -import org.apache.iceberg.flink.FlinkFilters; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.flink.source.assigner.SplitAssignerType; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; - -/** Flink Iceberg table source. 
*/ -@Internal -public class IcebergTableSource - implements ScanTableSource, - SupportsProjectionPushDown, - SupportsFilterPushDown, - SupportsLimitPushDown { - - private int[] projectedFields; - private Long limit; - private List filters; - - private final TableLoader loader; - private final TableSchema schema; - private final Map properties; - private final boolean isLimitPushDown; - private final ReadableConfig readableConfig; - - private IcebergTableSource(IcebergTableSource toCopy) { - this.loader = toCopy.loader; - this.schema = toCopy.schema; - this.properties = toCopy.properties; - this.projectedFields = toCopy.projectedFields; - this.isLimitPushDown = toCopy.isLimitPushDown; - this.limit = toCopy.limit; - this.filters = toCopy.filters; - this.readableConfig = toCopy.readableConfig; - } - - public IcebergTableSource( - TableLoader loader, - TableSchema schema, - Map properties, - ReadableConfig readableConfig) { - this(loader, schema, properties, null, false, null, ImmutableList.of(), readableConfig); - } - - private IcebergTableSource( - TableLoader loader, - TableSchema schema, - Map properties, - int[] projectedFields, - boolean isLimitPushDown, - Long limit, - List filters, - ReadableConfig readableConfig) { - this.loader = loader; - this.schema = schema; - this.properties = properties; - this.projectedFields = projectedFields; - this.isLimitPushDown = isLimitPushDown; - this.limit = limit; - this.filters = filters; - this.readableConfig = readableConfig; - } - - @Override - public void applyProjection(int[][] projectFields) { - this.projectedFields = new int[projectFields.length]; - for (int i = 0; i < projectFields.length; i++) { - Preconditions.checkArgument( - projectFields[i].length == 1, "Don't support nested projection in iceberg source now."); - this.projectedFields[i] = projectFields[i][0]; - } - } - - private DataStream createDataStream(StreamExecutionEnvironment execEnv) { - return FlinkSource.forRowData() - .env(execEnv) - .tableLoader(loader) - .properties(properties) - .project(getProjectedSchema()) - .limit(limit) - .filters(filters) - .flinkConf(readableConfig) - .build(); - } - - private DataStreamSource createFLIP27Stream(StreamExecutionEnvironment env) { - SplitAssignerType assignerType = - readableConfig.get(FlinkConfigOptions.TABLE_EXEC_SPLIT_ASSIGNER_TYPE); - IcebergSource source = - IcebergSource.forRowData() - .tableLoader(loader) - .assignerFactory(assignerType.factory()) - .properties(properties) - .project(getProjectedSchema()) - .limit(limit) - .filters(filters) - .flinkConfig(readableConfig) - .build(); - DataStreamSource stream = - env.fromSource( - source, - WatermarkStrategy.noWatermarks(), - source.name(), - TypeInformation.of(RowData.class)); - return stream; - } - - private TableSchema getProjectedSchema() { - if (projectedFields == null) { - return schema; - } else { - String[] fullNames = schema.getFieldNames(); - DataType[] fullTypes = schema.getFieldDataTypes(); - return TableSchema.builder() - .fields( - Arrays.stream(projectedFields).mapToObj(i -> fullNames[i]).toArray(String[]::new), - Arrays.stream(projectedFields).mapToObj(i -> fullTypes[i]).toArray(DataType[]::new)) - .build(); - } - } - - @Override - public void applyLimit(long newLimit) { - this.limit = newLimit; - } - - @Override - public Result applyFilters(List flinkFilters) { - List acceptedFilters = Lists.newArrayList(); - List expressions = Lists.newArrayList(); - - for (ResolvedExpression resolvedExpression : flinkFilters) { - Optional icebergExpression = 
FlinkFilters.convert(resolvedExpression); - if (icebergExpression.isPresent()) { - expressions.add(icebergExpression.get()); - acceptedFilters.add(resolvedExpression); - } - } - - this.filters = expressions; - return Result.of(acceptedFilters, flinkFilters); - } - - @Override - public boolean supportsNestedProjection() { - // TODO: support nested projection - return false; - } - - @Override - public ChangelogMode getChangelogMode() { - return ChangelogMode.insertOnly(); - } - - @Override - public ScanRuntimeProvider getScanRuntimeProvider(ScanContext runtimeProviderContext) { - return new DataStreamScanProvider() { - @Override - public DataStream produceDataStream( - ProviderContext providerContext, StreamExecutionEnvironment execEnv) { - if (readableConfig.get(FlinkConfigOptions.TABLE_EXEC_ICEBERG_USE_FLIP27_SOURCE)) { - return createFLIP27Stream(execEnv); - } else { - return createDataStream(execEnv); - } - } - - @Override - public boolean isBounded() { - return FlinkSource.isBounded(properties); - } - }; - } - - @Override - public DynamicTableSource copy() { - return new IcebergTableSource(this); - } - - @Override - public String asSummaryString() { - return "Iceberg table source"; - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/RowDataFileScanTaskReader.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/RowDataFileScanTaskReader.java deleted file mode 100644 index 88364f4e87b1..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/RowDataFileScanTaskReader.java +++ /dev/null @@ -1,243 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
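The getScanRuntimeProvider() implementation above chooses between the legacy FlinkSource path and the FLIP-27 IcebergSource based on a Flink config option. A sketch of flipping that switch from the Table API (fragment; usual Flink and Iceberg imports assumed):

    TableEnvironment tEnv = TableEnvironment.create(EnvironmentSettings.inStreamingMode());
    tEnv.getConfig()
        .getConfiguration()
        .set(FlinkConfigOptions.TABLE_EXEC_ICEBERG_USE_FLIP27_SOURCE, true);
    // SELECTs against Iceberg tables now go through createFLIP27Stream(...) above
    // instead of createDataStream(...).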
- */ -package org.apache.iceberg.flink.source; - -import java.util.List; -import java.util.Map; -import org.apache.flink.annotation.Internal; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.FileScanTask; -import org.apache.iceberg.MetadataColumns; -import org.apache.iceberg.Schema; -import org.apache.iceberg.StructLike; -import org.apache.iceberg.avro.Avro; -import org.apache.iceberg.data.DeleteFilter; -import org.apache.iceberg.encryption.InputFilesDecryptor; -import org.apache.iceberg.expressions.Expression; -import org.apache.iceberg.expressions.Expressions; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.FlinkSourceFilter; -import org.apache.iceberg.flink.RowDataWrapper; -import org.apache.iceberg.flink.data.FlinkAvroReader; -import org.apache.iceberg.flink.data.FlinkOrcReader; -import org.apache.iceberg.flink.data.FlinkParquetReaders; -import org.apache.iceberg.flink.data.RowDataProjection; -import org.apache.iceberg.flink.data.RowDataUtil; -import org.apache.iceberg.io.CloseableIterable; -import org.apache.iceberg.io.CloseableIterator; -import org.apache.iceberg.io.InputFile; -import org.apache.iceberg.mapping.NameMappingParser; -import org.apache.iceberg.orc.ORC; -import org.apache.iceberg.parquet.Parquet; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.apache.iceberg.types.TypeUtil; -import org.apache.iceberg.util.PartitionUtil; - -@Internal -public class RowDataFileScanTaskReader implements FileScanTaskReader { - - private final Schema tableSchema; - private final Schema projectedSchema; - private final String nameMapping; - private final boolean caseSensitive; - private final FlinkSourceFilter rowFilter; - - public RowDataFileScanTaskReader( - Schema tableSchema, - Schema projectedSchema, - String nameMapping, - boolean caseSensitive, - List filters) { - this.tableSchema = tableSchema; - this.projectedSchema = projectedSchema; - this.nameMapping = nameMapping; - this.caseSensitive = caseSensitive; - - if (filters != null && !filters.isEmpty()) { - Expression combinedExpression = - filters.stream().reduce(Expressions.alwaysTrue(), Expressions::and); - this.rowFilter = - new FlinkSourceFilter(this.projectedSchema, combinedExpression, this.caseSensitive); - } else { - this.rowFilter = null; - } - } - - @Override - public CloseableIterator open( - FileScanTask task, InputFilesDecryptor inputFilesDecryptor) { - Schema partitionSchema = TypeUtil.select(projectedSchema, task.spec().identitySourceIds()); - - Map idToConstant = - partitionSchema.columns().isEmpty() - ? ImmutableMap.of() - : PartitionUtil.constantsMap(task, RowDataUtil::convertConstant); - - FlinkDeleteFilter deletes = - new FlinkDeleteFilter(task, tableSchema, projectedSchema, inputFilesDecryptor); - CloseableIterable iterable = - deletes.filter( - newIterable(task, deletes.requiredSchema(), idToConstant, inputFilesDecryptor)); - - // Project the RowData to remove the extra meta columns. 
- if (!projectedSchema.sameSchema(deletes.requiredSchema())) { - RowDataProjection rowDataProjection = - RowDataProjection.create( - deletes.requiredRowType(), - deletes.requiredSchema().asStruct(), - projectedSchema.asStruct()); - iterable = CloseableIterable.transform(iterable, rowDataProjection::wrap); - } - - return iterable.iterator(); - } - - private CloseableIterable newIterable( - FileScanTask task, - Schema schema, - Map idToConstant, - InputFilesDecryptor inputFilesDecryptor) { - CloseableIterable iter; - if (task.isDataTask()) { - throw new UnsupportedOperationException("Cannot read data task."); - } else { - switch (task.file().format()) { - case PARQUET: - iter = newParquetIterable(task, schema, idToConstant, inputFilesDecryptor); - break; - - case AVRO: - iter = newAvroIterable(task, schema, idToConstant, inputFilesDecryptor); - break; - - case ORC: - iter = newOrcIterable(task, schema, idToConstant, inputFilesDecryptor); - break; - - default: - throw new UnsupportedOperationException( - "Cannot read unknown format: " + task.file().format()); - } - } - - if (rowFilter != null) { - return CloseableIterable.filter(iter, rowFilter::filter); - } - return iter; - } - - private CloseableIterable newAvroIterable( - FileScanTask task, - Schema schema, - Map idToConstant, - InputFilesDecryptor inputFilesDecryptor) { - Avro.ReadBuilder builder = - Avro.read(inputFilesDecryptor.getInputFile(task)) - .reuseContainers() - .project(schema) - .split(task.start(), task.length()) - .createReaderFunc(readSchema -> new FlinkAvroReader(schema, readSchema, idToConstant)); - - if (nameMapping != null) { - builder.withNameMapping(NameMappingParser.fromJson(nameMapping)); - } - - return builder.build(); - } - - private CloseableIterable newParquetIterable( - FileScanTask task, - Schema schema, - Map idToConstant, - InputFilesDecryptor inputFilesDecryptor) { - Parquet.ReadBuilder builder = - Parquet.read(inputFilesDecryptor.getInputFile(task)) - .split(task.start(), task.length()) - .project(schema) - .createReaderFunc( - fileSchema -> FlinkParquetReaders.buildReader(schema, fileSchema, idToConstant)) - .filter(task.residual()) - .caseSensitive(caseSensitive) - .reuseContainers(); - - if (nameMapping != null) { - builder.withNameMapping(NameMappingParser.fromJson(nameMapping)); - } - - return builder.build(); - } - - private CloseableIterable newOrcIterable( - FileScanTask task, - Schema schema, - Map idToConstant, - InputFilesDecryptor inputFilesDecryptor) { - Schema readSchemaWithoutConstantAndMetadataFields = - TypeUtil.selectNot( - schema, Sets.union(idToConstant.keySet(), MetadataColumns.metadataFieldIds())); - - ORC.ReadBuilder builder = - ORC.read(inputFilesDecryptor.getInputFile(task)) - .project(readSchemaWithoutConstantAndMetadataFields) - .split(task.start(), task.length()) - .createReaderFunc( - readOrcSchema -> new FlinkOrcReader(schema, readOrcSchema, idToConstant)) - .filter(task.residual()) - .caseSensitive(caseSensitive); - - if (nameMapping != null) { - builder.withNameMapping(NameMappingParser.fromJson(nameMapping)); - } - - return builder.build(); - } - - private static class FlinkDeleteFilter extends DeleteFilter { - private final RowType requiredRowType; - private final RowDataWrapper asStructLike; - private final InputFilesDecryptor inputFilesDecryptor; - - FlinkDeleteFilter( - FileScanTask task, - Schema tableSchema, - Schema requestedSchema, - InputFilesDecryptor inputFilesDecryptor) { - super(task.file().path().toString(), task.deletes(), tableSchema, requestedSchema); - 
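For context, the RowDataFileScanTaskReader removed here was constructed directly with the table schema, a projection, an optional name mapping, and residual filters; roughly as below (fragment; `table`, `projectedSchema`, and the example filter are assumed to be in scope and are illustrative):

    List<Expression> filters = Arrays.asList(Expressions.greaterThanOrEqual("id", 100));
    FileScanTaskReader<RowData> reader =
        new RowDataFileScanTaskReader(
            table.schema(),   // full table schema
            projectedSchema,  // columns actually read
            null,             // no name mapping
            true,             // case sensitive
            filters);         // residuals, AND-ed into a single FlinkSourceFilter internally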
this.requiredRowType = FlinkSchemaUtil.convert(requiredSchema()); - this.asStructLike = new RowDataWrapper(requiredRowType, requiredSchema().asStruct()); - this.inputFilesDecryptor = inputFilesDecryptor; - } - - public RowType requiredRowType() { - return requiredRowType; - } - - @Override - protected StructLike asStructLike(RowData row) { - return asStructLike.wrap(row); - } - - @Override - protected InputFile getInputFile(String location) { - return inputFilesDecryptor.getInputFile(location); - } - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/RowDataRewriter.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/RowDataRewriter.java deleted file mode 100644 index c958604c004a..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/RowDataRewriter.java +++ /dev/null @@ -1,172 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import static org.apache.iceberg.TableProperties.DEFAULT_NAME_MAPPING; - -import java.util.Collection; -import java.util.Collections; -import java.util.List; -import java.util.stream.Collectors; -import org.apache.flink.api.common.functions.RichMapFunction; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.CombinedScanTask; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Schema; -import org.apache.iceberg.SerializableTable; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.encryption.EncryptionManager; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.sink.RowDataTaskWriterFactory; -import org.apache.iceberg.flink.sink.TaskWriterFactory; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.io.TaskWriter; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.util.PropertyUtil; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -public class RowDataRewriter { - - private static final Logger LOG = LoggerFactory.getLogger(RowDataRewriter.class); - - private final Schema schema; - private final String nameMapping; - private final FileIO io; - private final boolean caseSensitive; - private final EncryptionManager encryptionManager; - private final TaskWriterFactory taskWriterFactory; - private final String tableName; - - public RowDataRewriter( - Table table, boolean caseSensitive, FileIO io, EncryptionManager encryptionManager) { - this.schema = table.schema(); - this.caseSensitive = 
caseSensitive; - this.io = io; - this.encryptionManager = encryptionManager; - this.nameMapping = - PropertyUtil.propertyAsString(table.properties(), DEFAULT_NAME_MAPPING, null); - this.tableName = table.name(); - - String formatString = - PropertyUtil.propertyAsString( - table.properties(), - TableProperties.DEFAULT_FILE_FORMAT, - TableProperties.DEFAULT_FILE_FORMAT_DEFAULT); - FileFormat format = FileFormat.fromString(formatString); - RowType flinkSchema = FlinkSchemaUtil.convert(table.schema()); - this.taskWriterFactory = - new RowDataTaskWriterFactory( - SerializableTable.copyOf(table), - flinkSchema, - Long.MAX_VALUE, - format, - table.properties(), - null, - false); - } - - public List rewriteDataForTasks( - DataStream dataStream, int parallelism) throws Exception { - RewriteMap map = - new RewriteMap( - schema, nameMapping, io, caseSensitive, encryptionManager, taskWriterFactory); - DataStream> ds = dataStream.map(map).setParallelism(parallelism); - return Lists.newArrayList(ds.executeAndCollect("Rewrite table :" + tableName)).stream() - .flatMap(Collection::stream) - .collect(Collectors.toList()); - } - - public static class RewriteMap extends RichMapFunction> { - - private TaskWriter writer; - private int subTaskId; - private int attemptId; - - private final Schema schema; - private final String nameMapping; - private final FileIO io; - private final boolean caseSensitive; - private final EncryptionManager encryptionManager; - private final TaskWriterFactory taskWriterFactory; - private final RowDataFileScanTaskReader rowDataReader; - - public RewriteMap( - Schema schema, - String nameMapping, - FileIO io, - boolean caseSensitive, - EncryptionManager encryptionManager, - TaskWriterFactory taskWriterFactory) { - this.schema = schema; - this.nameMapping = nameMapping; - this.io = io; - this.caseSensitive = caseSensitive; - this.encryptionManager = encryptionManager; - this.taskWriterFactory = taskWriterFactory; - this.rowDataReader = - new RowDataFileScanTaskReader( - schema, schema, nameMapping, caseSensitive, Collections.emptyList()); - } - - @Override - public void open(Configuration parameters) { - this.subTaskId = getRuntimeContext().getIndexOfThisSubtask(); - this.attemptId = getRuntimeContext().getAttemptNumber(); - // Initialize the task writer factory. - this.taskWriterFactory.initialize(subTaskId, attemptId); - } - - @Override - public List map(CombinedScanTask task) throws Exception { - // Initialize the task writer. 
- this.writer = taskWriterFactory.create(); - try (DataIterator iterator = - new DataIterator<>(rowDataReader, task, io, encryptionManager)) { - while (iterator.hasNext()) { - RowData rowData = iterator.next(); - writer.write(rowData); - } - return Lists.newArrayList(writer.dataFiles()); - } catch (Throwable originalThrowable) { - try { - LOG.error("Aborting commit for (subTaskId {}, attemptId {})", subTaskId, attemptId); - writer.abort(); - LOG.error("Aborted commit for (subTaskId {}, attemptId {})", subTaskId, attemptId); - } catch (Throwable inner) { - if (originalThrowable != inner) { - originalThrowable.addSuppressed(inner); - LOG.warn("Suppressing exception in catch: {}", inner.getMessage(), inner); - } - } - - if (originalThrowable instanceof Exception) { - throw originalThrowable; - } else { - throw new RuntimeException(originalThrowable); - } - } - } - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/RowDataToAvroGenericRecordConverter.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/RowDataToAvroGenericRecordConverter.java deleted file mode 100644 index 8ef1f1fbb833..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/RowDataToAvroGenericRecordConverter.java +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import java.io.Serializable; -import java.util.function.Function; -import org.apache.avro.Schema; -import org.apache.avro.generic.GenericRecord; -import org.apache.flink.annotation.Internal; -import org.apache.flink.formats.avro.RowDataToAvroConverters; -import org.apache.flink.formats.avro.typeutils.AvroSchemaConverter; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.DataType; -import org.apache.flink.table.types.logical.LogicalType; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.table.types.utils.TypeConversions; -import org.apache.iceberg.avro.AvroSchemaUtil; -import org.apache.iceberg.flink.FlinkSchemaUtil; - -/** - * This is not serializable because Avro {@link Schema} is not actually serializable, even though it - * implements {@link Serializable} interface. 
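The RowDataRewriter removed above backed the Flink rewrite (compaction) path; it was used roughly as follows (fragment; `table` and the `combinedScanTasks` stream are assumed to be in scope, and the parallelism is illustrative):

    RowDataRewriter rewriter =
        new RowDataRewriter(table, /* caseSensitive= */ false, table.io(), table.encryption());
    // combinedScanTasks: DataStream<CombinedScanTask> of file groups planned for rewrite
    List<DataFile> rewrittenFiles = rewriter.rewriteDataForTasks(combinedScanTasks, 4);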
- */ -@Internal -public class RowDataToAvroGenericRecordConverter implements Function { - private final RowDataToAvroConverters.RowDataToAvroConverter converter; - private final Schema avroSchema; - - private RowDataToAvroGenericRecordConverter(RowType rowType, Schema avroSchema) { - this.converter = RowDataToAvroConverters.createConverter(rowType); - this.avroSchema = avroSchema; - } - - @Override - public GenericRecord apply(RowData rowData) { - return (GenericRecord) converter.convert(avroSchema, rowData); - } - - /** Create a converter based on Iceberg schema */ - public static RowDataToAvroGenericRecordConverter fromIcebergSchema( - String tableName, org.apache.iceberg.Schema icebergSchema) { - RowType rowType = FlinkSchemaUtil.convert(icebergSchema); - Schema avroSchema = AvroSchemaUtil.convert(icebergSchema, tableName); - return new RowDataToAvroGenericRecordConverter(rowType, avroSchema); - } - - /** Create a mapper based on Avro schema */ - public static RowDataToAvroGenericRecordConverter fromAvroSchema(Schema avroSchema) { - DataType dataType = AvroSchemaConverter.convertToDataType(avroSchema.toString()); - LogicalType logicalType = TypeConversions.fromDataToLogicalType(dataType); - RowType rowType = RowType.of(logicalType.getChildren().toArray(new LogicalType[0])); - return new RowDataToAvroGenericRecordConverter(rowType, avroSchema); - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/ScanContext.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/ScanContext.java deleted file mode 100644 index ab79a3173933..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/ScanContext.java +++ /dev/null @@ -1,597 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import java.io.Serializable; -import java.time.Duration; -import java.util.Collection; -import java.util.List; -import java.util.Map; -import java.util.concurrent.TimeUnit; -import org.apache.flink.annotation.Internal; -import org.apache.flink.configuration.ReadableConfig; -import org.apache.flink.util.Preconditions; -import org.apache.flink.util.TimeUtils; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.expressions.Expression; -import org.apache.iceberg.flink.FlinkConfigOptions; -import org.apache.iceberg.flink.FlinkReadConf; -import org.apache.iceberg.flink.FlinkReadOptions; - -/** Context object with optional arguments for a Flink Scan. 
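Similarly, the RowDataToAvroGenericRecordConverter removed above mapped Flink RowData into Avro records; a short usage sketch (fragment; the table name, `table`, and `rowData` are illustrative assumptions):

    RowDataToAvroGenericRecordConverter converter =
        RowDataToAvroGenericRecordConverter.fromIcebergSchema("db.tbl", table.schema());
    GenericRecord avroRecord = converter.apply(rowData);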
*/ -@Internal -public class ScanContext implements Serializable { - - private static final long serialVersionUID = 1L; - - private final boolean caseSensitive; - private final boolean exposeLocality; - private final Long snapshotId; - private final String branch; - private final String tag; - private final StreamingStartingStrategy startingStrategy; - private final Long startSnapshotId; - private final Long startSnapshotTimestamp; - private final Long endSnapshotId; - private final Long asOfTimestamp; - private final String startTag; - private final String endTag; - private final Long splitSize; - private final Integer splitLookback; - private final Long splitOpenFileCost; - private final boolean isStreaming; - private final Duration monitorInterval; - - private final String nameMapping; - private final Schema schema; - private final List filters; - private final long limit; - private final boolean includeColumnStats; - private final Collection includeStatsForColumns; - private final Integer planParallelism; - private final int maxPlanningSnapshotCount; - private final int maxAllowedPlanningFailures; - private final String watermarkColumn; - private final TimeUnit watermarkColumnTimeUnit; - - private ScanContext( - boolean caseSensitive, - Long snapshotId, - StreamingStartingStrategy startingStrategy, - Long startSnapshotTimestamp, - Long startSnapshotId, - Long endSnapshotId, - Long asOfTimestamp, - Long splitSize, - Integer splitLookback, - Long splitOpenFileCost, - boolean isStreaming, - Duration monitorInterval, - String nameMapping, - Schema schema, - List filters, - long limit, - boolean includeColumnStats, - Collection includeStatsForColumns, - boolean exposeLocality, - Integer planParallelism, - int maxPlanningSnapshotCount, - int maxAllowedPlanningFailures, - String watermarkColumn, - TimeUnit watermarkColumnTimeUnit, - String branch, - String tag, - String startTag, - String endTag) { - this.caseSensitive = caseSensitive; - this.snapshotId = snapshotId; - this.tag = tag; - this.branch = branch; - this.startingStrategy = startingStrategy; - this.startSnapshotTimestamp = startSnapshotTimestamp; - this.startSnapshotId = startSnapshotId; - this.endSnapshotId = endSnapshotId; - this.asOfTimestamp = asOfTimestamp; - this.startTag = startTag; - this.endTag = endTag; - this.splitSize = splitSize; - this.splitLookback = splitLookback; - this.splitOpenFileCost = splitOpenFileCost; - this.isStreaming = isStreaming; - this.monitorInterval = monitorInterval; - - this.nameMapping = nameMapping; - this.schema = schema; - this.filters = filters; - this.limit = limit; - this.includeColumnStats = includeColumnStats; - this.includeStatsForColumns = includeStatsForColumns; - this.exposeLocality = exposeLocality; - this.planParallelism = planParallelism; - this.maxPlanningSnapshotCount = maxPlanningSnapshotCount; - this.maxAllowedPlanningFailures = maxAllowedPlanningFailures; - this.watermarkColumn = watermarkColumn; - this.watermarkColumnTimeUnit = watermarkColumnTimeUnit; - } - - void validate() { - if (isStreaming) { - if (startingStrategy == StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) { - Preconditions.checkArgument( - startSnapshotId != null, - "Invalid starting snapshot id for SPECIFIC_START_SNAPSHOT_ID strategy: null"); - Preconditions.checkArgument( - startSnapshotTimestamp == null, - "Invalid starting snapshot timestamp for SPECIFIC_START_SNAPSHOT_ID strategy: not null"); - } - if (startingStrategy == StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) { - 
Preconditions.checkArgument( - startSnapshotTimestamp != null, - "Invalid starting snapshot timestamp for SPECIFIC_START_SNAPSHOT_TIMESTAMP strategy: null"); - Preconditions.checkArgument( - startSnapshotId == null, - "Invalid starting snapshot id for SPECIFIC_START_SNAPSHOT_ID strategy: not null"); - } - - Preconditions.checkArgument( - tag == null, - String.format("Cannot scan table using ref %s configured for streaming reader", tag)); - Preconditions.checkArgument( - snapshotId == null, "Cannot set snapshot-id option for streaming reader"); - Preconditions.checkArgument( - asOfTimestamp == null, "Cannot set as-of-timestamp option for streaming reader"); - Preconditions.checkArgument( - endSnapshotId == null, "Cannot set end-snapshot-id option for streaming reader"); - Preconditions.checkArgument(endTag == null, "Cannot set end-tag option for streaming reader"); - } - - Preconditions.checkArgument( - !(startTag != null && startSnapshotId() != null), - "START_SNAPSHOT_ID and START_TAG cannot both be set."); - - Preconditions.checkArgument( - !(endTag != null && endSnapshotId() != null), - "END_SNAPSHOT_ID and END_TAG cannot both be set."); - - Preconditions.checkArgument( - maxAllowedPlanningFailures >= -1, - "Cannot set maxAllowedPlanningFailures to a negative number other than -1."); - } - - public boolean caseSensitive() { - return caseSensitive; - } - - public Long snapshotId() { - return snapshotId; - } - - public String branch() { - return branch; - } - - public String tag() { - return tag; - } - - public String startTag() { - return startTag; - } - - public String endTag() { - return endTag; - } - - public StreamingStartingStrategy streamingStartingStrategy() { - return startingStrategy; - } - - public Long startSnapshotTimestamp() { - return startSnapshotTimestamp; - } - - public Long startSnapshotId() { - return startSnapshotId; - } - - public Long endSnapshotId() { - return endSnapshotId; - } - - public Long asOfTimestamp() { - return asOfTimestamp; - } - - public Long splitSize() { - return splitSize; - } - - public Integer splitLookback() { - return splitLookback; - } - - public Long splitOpenFileCost() { - return splitOpenFileCost; - } - - public boolean isStreaming() { - return isStreaming; - } - - public Duration monitorInterval() { - return monitorInterval; - } - - public String nameMapping() { - return nameMapping; - } - - public Schema project() { - return schema; - } - - public List filters() { - return filters; - } - - public long limit() { - return limit; - } - - public boolean includeColumnStats() { - return includeColumnStats; - } - - public Collection includeStatsForColumns() { - return includeStatsForColumns; - } - - public boolean exposeLocality() { - return exposeLocality; - } - - public Integer planParallelism() { - return planParallelism; - } - - public int maxPlanningSnapshotCount() { - return maxPlanningSnapshotCount; - } - - public int maxAllowedPlanningFailures() { - return maxAllowedPlanningFailures; - } - - public String watermarkColumn() { - return watermarkColumn; - } - - public TimeUnit watermarkColumnTimeUnit() { - return watermarkColumnTimeUnit; - } - - public ScanContext copyWithAppendsBetween(Long newStartSnapshotId, long newEndSnapshotId) { - return ScanContext.builder() - .caseSensitive(caseSensitive) - .useSnapshotId(null) - .useBranch(branch) - .useTag(null) - .startSnapshotId(newStartSnapshotId) - .endSnapshotId(newEndSnapshotId) - .startTag(null) - .endTag(null) - .asOfTimestamp(null) - .splitSize(splitSize) - .splitLookback(splitLookback) - 
.splitOpenFileCost(splitOpenFileCost) - .streaming(isStreaming) - .monitorInterval(monitorInterval) - .nameMapping(nameMapping) - .project(schema) - .filters(filters) - .limit(limit) - .includeColumnStats(includeColumnStats) - .includeColumnStats(includeStatsForColumns) - .exposeLocality(exposeLocality) - .planParallelism(planParallelism) - .maxPlanningSnapshotCount(maxPlanningSnapshotCount) - .maxAllowedPlanningFailures(maxAllowedPlanningFailures) - .watermarkColumn(watermarkColumn) - .watermarkColumnTimeUnit(watermarkColumnTimeUnit) - .build(); - } - - public ScanContext copyWithSnapshotId(long newSnapshotId) { - return ScanContext.builder() - .caseSensitive(caseSensitive) - .useSnapshotId(newSnapshotId) - .useBranch(branch) - .useTag(tag) - .startSnapshotId(null) - .endSnapshotId(null) - .startTag(null) - .endTag(null) - .asOfTimestamp(null) - .splitSize(splitSize) - .splitLookback(splitLookback) - .splitOpenFileCost(splitOpenFileCost) - .streaming(isStreaming) - .monitorInterval(monitorInterval) - .nameMapping(nameMapping) - .project(schema) - .filters(filters) - .limit(limit) - .includeColumnStats(includeColumnStats) - .includeColumnStats(includeStatsForColumns) - .exposeLocality(exposeLocality) - .planParallelism(planParallelism) - .maxPlanningSnapshotCount(maxPlanningSnapshotCount) - .maxAllowedPlanningFailures(maxAllowedPlanningFailures) - .watermarkColumn(watermarkColumn) - .watermarkColumnTimeUnit(watermarkColumnTimeUnit) - .build(); - } - - public static Builder builder() { - return new Builder(); - } - - public static class Builder { - private boolean caseSensitive = FlinkReadOptions.CASE_SENSITIVE_OPTION.defaultValue(); - private Long snapshotId = FlinkReadOptions.SNAPSHOT_ID.defaultValue(); - private String branch = FlinkReadOptions.BRANCH.defaultValue(); - private String tag = FlinkReadOptions.TAG.defaultValue(); - private String startTag = FlinkReadOptions.START_TAG.defaultValue(); - private String endTag = FlinkReadOptions.END_TAG.defaultValue(); - private StreamingStartingStrategy startingStrategy = - FlinkReadOptions.STARTING_STRATEGY_OPTION.defaultValue(); - private Long startSnapshotTimestamp = FlinkReadOptions.START_SNAPSHOT_TIMESTAMP.defaultValue(); - private Long startSnapshotId = FlinkReadOptions.START_SNAPSHOT_ID.defaultValue(); - private Long endSnapshotId = FlinkReadOptions.END_SNAPSHOT_ID.defaultValue(); - private Long asOfTimestamp = FlinkReadOptions.AS_OF_TIMESTAMP.defaultValue(); - private Long splitSize = FlinkReadOptions.SPLIT_SIZE_OPTION.defaultValue(); - private Integer splitLookback = FlinkReadOptions.SPLIT_LOOKBACK_OPTION.defaultValue(); - private Long splitOpenFileCost = FlinkReadOptions.SPLIT_FILE_OPEN_COST_OPTION.defaultValue(); - private boolean isStreaming = FlinkReadOptions.STREAMING_OPTION.defaultValue(); - private Duration monitorInterval = - TimeUtils.parseDuration(FlinkReadOptions.MONITOR_INTERVAL_OPTION.defaultValue()); - private String nameMapping; - private Schema projectedSchema; - private List filters; - private long limit = FlinkReadOptions.LIMIT_OPTION.defaultValue(); - private boolean includeColumnStats = - FlinkReadOptions.INCLUDE_COLUMN_STATS_OPTION.defaultValue(); - private Collection includeStatsForColumns = null; - private boolean exposeLocality; - private Integer planParallelism = - FlinkConfigOptions.TABLE_EXEC_ICEBERG_WORKER_POOL_SIZE.defaultValue(); - private int maxPlanningSnapshotCount = - FlinkReadOptions.MAX_PLANNING_SNAPSHOT_COUNT_OPTION.defaultValue(); - private int maxAllowedPlanningFailures = - 
FlinkReadOptions.MAX_ALLOWED_PLANNING_FAILURES_OPTION.defaultValue(); - private String watermarkColumn = FlinkReadOptions.WATERMARK_COLUMN_OPTION.defaultValue(); - private TimeUnit watermarkColumnTimeUnit = - FlinkReadOptions.WATERMARK_COLUMN_TIME_UNIT_OPTION.defaultValue(); - - private Builder() {} - - public Builder caseSensitive(boolean newCaseSensitive) { - this.caseSensitive = newCaseSensitive; - return this; - } - - public Builder useSnapshotId(Long newSnapshotId) { - this.snapshotId = newSnapshotId; - return this; - } - - public Builder useTag(String newTag) { - this.tag = newTag; - return this; - } - - public Builder useBranch(String newBranch) { - this.branch = newBranch; - return this; - } - - public Builder startingStrategy(StreamingStartingStrategy newStartingStrategy) { - this.startingStrategy = newStartingStrategy; - return this; - } - - public Builder startSnapshotTimestamp(Long newStartSnapshotTimestamp) { - this.startSnapshotTimestamp = newStartSnapshotTimestamp; - return this; - } - - public Builder startSnapshotId(Long newStartSnapshotId) { - this.startSnapshotId = newStartSnapshotId; - return this; - } - - public Builder endSnapshotId(Long newEndSnapshotId) { - this.endSnapshotId = newEndSnapshotId; - return this; - } - - public Builder startTag(String newStartTag) { - this.startTag = newStartTag; - return this; - } - - public Builder endTag(String newEndTag) { - this.endTag = newEndTag; - return this; - } - - public Builder asOfTimestamp(Long newAsOfTimestamp) { - this.asOfTimestamp = newAsOfTimestamp; - return this; - } - - public Builder splitSize(Long newSplitSize) { - this.splitSize = newSplitSize; - return this; - } - - public Builder splitLookback(Integer newSplitLookback) { - this.splitLookback = newSplitLookback; - return this; - } - - public Builder splitOpenFileCost(Long newSplitOpenFileCost) { - this.splitOpenFileCost = newSplitOpenFileCost; - return this; - } - - public Builder streaming(boolean streaming) { - this.isStreaming = streaming; - return this; - } - - public Builder monitorInterval(Duration newMonitorInterval) { - this.monitorInterval = newMonitorInterval; - return this; - } - - public Builder nameMapping(String newNameMapping) { - this.nameMapping = newNameMapping; - return this; - } - - public Builder project(Schema newProjectedSchema) { - this.projectedSchema = newProjectedSchema; - return this; - } - - public Builder filters(List newFilters) { - this.filters = newFilters; - return this; - } - - public Builder limit(long newLimit) { - this.limit = newLimit; - return this; - } - - public Builder includeColumnStats(boolean newIncludeColumnStats) { - this.includeColumnStats = newIncludeColumnStats; - return this; - } - - public Builder includeColumnStats(Collection newIncludeStatsForColumns) { - this.includeStatsForColumns = newIncludeStatsForColumns; - return this; - } - - public Builder exposeLocality(boolean newExposeLocality) { - this.exposeLocality = newExposeLocality; - return this; - } - - public Builder planParallelism(Integer parallelism) { - this.planParallelism = parallelism; - return this; - } - - public Builder maxPlanningSnapshotCount(int newMaxPlanningSnapshotCount) { - this.maxPlanningSnapshotCount = newMaxPlanningSnapshotCount; - return this; - } - - public Builder maxAllowedPlanningFailures(int newMaxAllowedPlanningFailures) { - this.maxAllowedPlanningFailures = newMaxAllowedPlanningFailures; - return this; - } - - public Builder watermarkColumn(String newWatermarkColumn) { - this.watermarkColumn = newWatermarkColumn; - return 
this; - } - - public Builder watermarkColumnTimeUnit(TimeUnit newWatermarkTimeUnit) { - this.watermarkColumnTimeUnit = newWatermarkTimeUnit; - return this; - } - - public Builder resolveConfig( - Table table, Map readOptions, ReadableConfig readableConfig) { - FlinkReadConf flinkReadConf = new FlinkReadConf(table, readOptions, readableConfig); - - return this.useSnapshotId(flinkReadConf.snapshotId()) - .useTag(flinkReadConf.tag()) - .useBranch(flinkReadConf.branch()) - .startTag(flinkReadConf.startTag()) - .endTag(flinkReadConf.endTag()) - .caseSensitive(flinkReadConf.caseSensitive()) - .asOfTimestamp(flinkReadConf.asOfTimestamp()) - .startingStrategy(flinkReadConf.startingStrategy()) - .startSnapshotTimestamp(flinkReadConf.startSnapshotTimestamp()) - .startSnapshotId(flinkReadConf.startSnapshotId()) - .endSnapshotId(flinkReadConf.endSnapshotId()) - .splitSize(flinkReadConf.splitSize()) - .splitLookback(flinkReadConf.splitLookback()) - .splitOpenFileCost(flinkReadConf.splitFileOpenCost()) - .streaming(flinkReadConf.streaming()) - .monitorInterval(flinkReadConf.monitorInterval()) - .nameMapping(flinkReadConf.nameMapping()) - .limit(flinkReadConf.limit()) - .planParallelism(flinkReadConf.workerPoolSize()) - .includeColumnStats(flinkReadConf.includeColumnStats()) - .maxPlanningSnapshotCount(flinkReadConf.maxPlanningSnapshotCount()) - .maxAllowedPlanningFailures(maxAllowedPlanningFailures) - .watermarkColumn(flinkReadConf.watermarkColumn()) - .watermarkColumnTimeUnit(flinkReadConf.watermarkColumnTimeUnit()); - } - - public ScanContext build() { - return new ScanContext( - caseSensitive, - snapshotId, - startingStrategy, - startSnapshotTimestamp, - startSnapshotId, - endSnapshotId, - asOfTimestamp, - splitSize, - splitLookback, - splitOpenFileCost, - isStreaming, - monitorInterval, - nameMapping, - projectedSchema, - filters, - limit, - includeColumnStats, - includeStatsForColumns, - exposeLocality, - planParallelism, - maxPlanningSnapshotCount, - maxAllowedPlanningFailures, - watermarkColumn, - watermarkColumnTimeUnit, - branch, - tag, - startTag, - endTag); - } - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/SourceUtil.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/SourceUtil.java deleted file mode 100644 index 7c3a69dbc141..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/SourceUtil.java +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source; - -import java.util.function.Supplier; -import org.apache.flink.configuration.ReadableConfig; -import org.apache.flink.table.api.config.ExecutionConfigOptions; -import org.apache.iceberg.Table; -import org.apache.iceberg.flink.FlinkConfigOptions; -import org.apache.iceberg.hadoop.Util; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; - -class SourceUtil { - private SourceUtil() {} - - static boolean isLocalityEnabled( - Table table, ReadableConfig readableConfig, Boolean exposeLocality) { - Boolean localityEnabled = - exposeLocality != null - ? exposeLocality - : readableConfig.get(FlinkConfigOptions.TABLE_EXEC_ICEBERG_EXPOSE_SPLIT_LOCALITY_INFO); - - if (localityEnabled != null && !localityEnabled) { - return false; - } - - return Util.mayHaveBlockLocations(table.io(), table.location()); - } - - /** - * Infer source parallelism. - * - * @param readableConfig Flink config. - * @param splitCountProvider Split count supplier. As the computation may involve expensive split - * discover, lazy evaluation is performed if inferring parallelism is enabled. - * @param limitCount limited output count. - */ - static int inferParallelism( - ReadableConfig readableConfig, long limitCount, Supplier splitCountProvider) { - int parallelism = - readableConfig.get(ExecutionConfigOptions.TABLE_EXEC_RESOURCE_DEFAULT_PARALLELISM); - if (readableConfig.get(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM)) { - int maxInferParallelism = - readableConfig.get(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM_MAX); - Preconditions.checkState( - maxInferParallelism >= 1, - FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM_MAX.key() - + " cannot be less than 1"); - parallelism = Math.min(splitCountProvider.get(), maxInferParallelism); - } - - if (limitCount > 0) { - int limit = limitCount >= Integer.MAX_VALUE ? Integer.MAX_VALUE : (int) limitCount; - parallelism = Math.min(parallelism, limit); - } - - // parallelism must be positive. - parallelism = Math.max(1, parallelism); - return parallelism; - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/StreamingMonitorFunction.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/StreamingMonitorFunction.java deleted file mode 100644 index a07613aee59b..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/StreamingMonitorFunction.java +++ /dev/null @@ -1,269 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
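To make the parallelism inference in SourceUtil above concrete, a worked example with illustrative numbers (infer-parallelism enabled, max inferred parallelism 100, 3 discovered splits, LIMIT 2):

    // With inference enabled, the configured default parallelism is replaced outright:
    int parallelism = Math.min(3, 100);      // min(splitCount, infer-source-parallelism-max) -> 3
    parallelism = Math.min(parallelism, 2);  // capped by the pushed-down limit               -> 2
    parallelism = Math.max(1, parallelism);  // never below 1                                 -> 2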
- */ -package org.apache.iceberg.flink.source; - -import java.io.IOException; -import java.io.UncheckedIOException; -import java.util.List; -import java.util.concurrent.ExecutorService; -import org.apache.flink.api.common.functions.RuntimeContext; -import org.apache.flink.api.common.state.ListState; -import org.apache.flink.api.common.state.ListStateDescriptor; -import org.apache.flink.api.common.typeutils.base.LongSerializer; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.runtime.state.FunctionInitializationContext; -import org.apache.flink.runtime.state.FunctionSnapshotContext; -import org.apache.flink.streaming.api.checkpoint.CheckpointedFunction; -import org.apache.flink.streaming.api.functions.source.RichSourceFunction; -import org.apache.flink.streaming.api.operators.StreamingRuntimeContext; -import org.apache.iceberg.Snapshot; -import org.apache.iceberg.Table; -import org.apache.iceberg.exceptions.ValidationException; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.util.SnapshotUtil; -import org.apache.iceberg.util.ThreadPools; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * This is the single (non-parallel) monitoring task which takes a {@link FlinkInputFormat}, it is - * responsible for: - * - *
<ol>
- *   <li>Monitoring snapshots of the Iceberg table.
- *   <li>Creating the {@link FlinkInputSplit splits} corresponding to the incremental files
- *   <li>Assigning them to downstream tasks for further processing.
- * </ol>
- *
- * <p>
    The splits to be read are forwarded to the downstream {@link StreamingReaderOperator} which - * can have parallelism greater than one. - */ -public class StreamingMonitorFunction extends RichSourceFunction - implements CheckpointedFunction { - - private static final Logger LOG = LoggerFactory.getLogger(StreamingMonitorFunction.class); - - private static final long INIT_LAST_SNAPSHOT_ID = -1L; - - private final TableLoader tableLoader; - private final ScanContext scanContext; - - private volatile boolean isRunning = true; - - // The checkpoint thread is not the same thread that running the function for SourceStreamTask - // now. It's necessary to - // mark this as volatile. - private volatile long lastSnapshotId = INIT_LAST_SNAPSHOT_ID; - - private transient SourceContext sourceContext; - private transient Table table; - private transient ListState lastSnapshotIdState; - private transient ExecutorService workerPool; - - public StreamingMonitorFunction(TableLoader tableLoader, ScanContext scanContext) { - Preconditions.checkArgument( - scanContext.snapshotId() == null, "Cannot set snapshot-id option for streaming reader"); - Preconditions.checkArgument( - scanContext.asOfTimestamp() == null, - "Cannot set as-of-timestamp option for streaming reader"); - Preconditions.checkArgument( - scanContext.endSnapshotId() == null, - "Cannot set end-snapshot-id option for streaming reader"); - Preconditions.checkArgument( - scanContext.endTag() == null, "Cannot set end-tag option for streaming reader"); - Preconditions.checkArgument( - scanContext.maxPlanningSnapshotCount() > 0, - "The max-planning-snapshot-count must be greater than zero"); - this.tableLoader = tableLoader; - this.scanContext = scanContext; - } - - @Override - public void open(Configuration parameters) throws Exception { - super.open(parameters); - - final RuntimeContext runtimeContext = getRuntimeContext(); - ValidationException.check( - runtimeContext instanceof StreamingRuntimeContext, - "context should be instance of StreamingRuntimeContext"); - final String operatorID = ((StreamingRuntimeContext) runtimeContext).getOperatorUniqueID(); - this.workerPool = - ThreadPools.newWorkerPool( - "iceberg-worker-pool-" + operatorID, scanContext.planParallelism()); - } - - @Override - public void initializeState(FunctionInitializationContext context) throws Exception { - // Load iceberg table from table loader. - tableLoader.open(); - table = tableLoader.loadTable(); - - // Initialize the flink state for last snapshot id. - lastSnapshotIdState = - context - .getOperatorStateStore() - .getListState(new ListStateDescriptor<>("snapshot-id-state", LongSerializer.INSTANCE)); - - // Restore the last-snapshot-id from flink's state if possible. 
- if (context.isRestored()) { - LOG.info("Restoring state for the {}.", getClass().getSimpleName()); - lastSnapshotId = lastSnapshotIdState.get().iterator().next(); - } else if (scanContext.startTag() != null || scanContext.startSnapshotId() != null) { - Preconditions.checkArgument( - !(scanContext.startTag() != null && scanContext.startSnapshotId() != null), - "START_SNAPSHOT_ID and START_TAG cannot both be set."); - Preconditions.checkNotNull( - table.currentSnapshot(), "Don't have any available snapshot in table."); - - long startSnapshotId; - if (scanContext.startTag() != null) { - Preconditions.checkArgument( - table.snapshot(scanContext.startTag()) != null, - "Cannot find snapshot with tag %s in table.", - scanContext.startTag()); - startSnapshotId = table.snapshot(scanContext.startTag()).snapshotId(); - } else { - startSnapshotId = scanContext.startSnapshotId(); - } - - long currentSnapshotId = table.currentSnapshot().snapshotId(); - Preconditions.checkState( - SnapshotUtil.isAncestorOf(table, currentSnapshotId, startSnapshotId), - "The option start-snapshot-id %s is not an ancestor of the current snapshot.", - startSnapshotId); - - lastSnapshotId = startSnapshotId; - } - } - - @Override - public void snapshotState(FunctionSnapshotContext context) throws Exception { - lastSnapshotIdState.clear(); - lastSnapshotIdState.add(lastSnapshotId); - } - - @Override - public void run(SourceContext ctx) throws Exception { - this.sourceContext = ctx; - while (isRunning) { - monitorAndForwardSplits(); - Thread.sleep(scanContext.monitorInterval().toMillis()); - } - } - - private long toSnapshotIdInclusive( - long lastConsumedSnapshotId, long currentSnapshotId, int maxPlanningSnapshotCount) { - List snapshotIds = - SnapshotUtil.snapshotIdsBetween(table, lastConsumedSnapshotId, currentSnapshotId); - if (snapshotIds.size() <= maxPlanningSnapshotCount) { - return currentSnapshotId; - } else { - // It uses reverted index since snapshotIdsBetween returns Ids that are ordered by committed - // time descending. - return snapshotIds.get(snapshotIds.size() - maxPlanningSnapshotCount); - } - } - - @VisibleForTesting - void sourceContext(SourceContext ctx) { - this.sourceContext = ctx; - } - - @VisibleForTesting - void monitorAndForwardSplits() { - // Refresh the table to get the latest committed snapshot. - table.refresh(); - - Snapshot snapshot = - scanContext.branch() != null - ? 
table.snapshot(scanContext.branch()) - : table.currentSnapshot(); - if (snapshot != null && snapshot.snapshotId() != lastSnapshotId) { - long snapshotId = snapshot.snapshotId(); - - ScanContext newScanContext; - if (lastSnapshotId == INIT_LAST_SNAPSHOT_ID) { - newScanContext = scanContext.copyWithSnapshotId(snapshotId); - } else { - snapshotId = - toSnapshotIdInclusive( - lastSnapshotId, snapshotId, scanContext.maxPlanningSnapshotCount()); - newScanContext = scanContext.copyWithAppendsBetween(lastSnapshotId, snapshotId); - } - - LOG.debug( - "Start discovering splits from {} (exclusive) to {} (inclusive)", - lastSnapshotId, - snapshotId); - long start = System.currentTimeMillis(); - FlinkInputSplit[] splits = - FlinkSplitPlanner.planInputSplits(table, newScanContext, workerPool); - LOG.debug( - "Discovered {} splits, time elapsed {}ms", - splits.length, - System.currentTimeMillis() - start); - - // only need to hold the checkpoint lock when emitting the splits and updating lastSnapshotId - start = System.currentTimeMillis(); - synchronized (sourceContext.getCheckpointLock()) { - for (FlinkInputSplit split : splits) { - sourceContext.collect(split); - } - - lastSnapshotId = snapshotId; - } - LOG.debug( - "Forwarded {} splits, time elapsed {}ms", - splits.length, - System.currentTimeMillis() - start); - } - } - - @Override - public void cancel() { - // this is to cover the case where cancel() is called before the run() - if (sourceContext != null) { - synchronized (sourceContext.getCheckpointLock()) { - isRunning = false; - } - } else { - isRunning = false; - } - - // Release all the resources here. - if (tableLoader != null) { - try { - tableLoader.close(); - } catch (IOException e) { - throw new UncheckedIOException(e); - } - } - } - - @Override - public void close() { - cancel(); - - if (workerPool != null) { - workerPool.shutdown(); - } - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/StreamingReaderOperator.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/StreamingReaderOperator.java deleted file mode 100644 index ee6f7b63988d..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/StreamingReaderOperator.java +++ /dev/null @@ -1,246 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source; - -import java.io.IOException; -import java.util.Queue; -import org.apache.flink.api.common.operators.MailboxExecutor; -import org.apache.flink.api.common.state.ListState; -import org.apache.flink.api.common.state.ListStateDescriptor; -import org.apache.flink.runtime.state.JavaSerializer; -import org.apache.flink.runtime.state.StateInitializationContext; -import org.apache.flink.runtime.state.StateSnapshotContext; -import org.apache.flink.streaming.api.functions.source.SourceFunction; -import org.apache.flink.streaming.api.operators.AbstractStreamOperator; -import org.apache.flink.streaming.api.operators.AbstractStreamOperatorFactory; -import org.apache.flink.streaming.api.operators.OneInputStreamOperator; -import org.apache.flink.streaming.api.operators.OneInputStreamOperatorFactory; -import org.apache.flink.streaming.api.operators.StreamOperator; -import org.apache.flink.streaming.api.operators.StreamOperatorParameters; -import org.apache.flink.streaming.api.operators.StreamSourceContexts; -import org.apache.flink.streaming.api.operators.YieldingOperatorFactory; -import org.apache.flink.streaming.api.watermark.Watermark; -import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; -import org.apache.flink.streaming.runtime.tasks.ProcessingTimeService; -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * The operator that reads the {@link FlinkInputSplit splits} received from the preceding {@link - * StreamingMonitorFunction}. Contrary to the {@link StreamingMonitorFunction} which has a - * parallelism of 1, this operator can have multiple parallelism. - * - *
    As soon as a split descriptor is received, it is put in a queue, and use {@link - * MailboxExecutor} read the actual data of the split. This architecture allows the separation of - * the reading thread from the one split processing the checkpoint barriers, thus removing any - * potential back-pressure. - */ -public class StreamingReaderOperator extends AbstractStreamOperator - implements OneInputStreamOperator { - - private static final Logger LOG = LoggerFactory.getLogger(StreamingReaderOperator.class); - - // It's the same thread that is running this operator and checkpoint actions. we use this executor - // to schedule only - // one split for future reading, so that a new checkpoint could be triggered without blocking long - // time for exhausting - // all scheduled splits. - private final MailboxExecutor executor; - private FlinkInputFormat format; - - private transient SourceFunction.SourceContext sourceContext; - - private transient ListState inputSplitsState; - private transient Queue splits; - - // Splits are read by the same thread that calls processElement. Each read task is submitted to - // that thread by adding - // them to the executor. This state is used to ensure that only one read task is in that queue at - // a time, so that read - // tasks do not accumulate ahead of checkpoint tasks. When there is a read task in the queue, this - // is set to RUNNING. - // When there are no more files to read, this will be set to IDLE. - private transient SplitState currentSplitState; - - private StreamingReaderOperator( - FlinkInputFormat format, ProcessingTimeService timeService, MailboxExecutor mailboxExecutor) { - this.format = Preconditions.checkNotNull(format, "The InputFormat should not be null."); - this.processingTimeService = timeService; - this.executor = - Preconditions.checkNotNull(mailboxExecutor, "The mailboxExecutor should not be null."); - } - - @Override - public void initializeState(StateInitializationContext context) throws Exception { - super.initializeState(context); - - // TODO Replace Java serialization with Avro approach to keep state compatibility. - // See issue: https://github.com/apache/iceberg/issues/1698 - inputSplitsState = - context - .getOperatorStateStore() - .getListState(new ListStateDescriptor<>("splits", new JavaSerializer<>())); - - // Initialize the current split state to IDLE. - currentSplitState = SplitState.IDLE; - - // Recover splits state from flink state backend if possible. - splits = Lists.newLinkedList(); - if (context.isRestored()) { - int subtaskIdx = getRuntimeContext().getIndexOfThisSubtask(); - LOG.info("Restoring state for the {} (taskIdx: {}).", getClass().getSimpleName(), subtaskIdx); - - for (FlinkInputSplit split : inputSplitsState.get()) { - splits.add(split); - } - } - - this.sourceContext = - StreamSourceContexts.getSourceContext( - getOperatorConfig().getTimeCharacteristic(), - getProcessingTimeService(), - new Object(), // no actual locking needed - output, - getRuntimeContext().getExecutionConfig().getAutoWatermarkInterval(), - -1, - true); - - // Enqueue to process the recovered input splits. 
- enqueueProcessSplits(); - } - - @Override - public void snapshotState(StateSnapshotContext context) throws Exception { - super.snapshotState(context); - - inputSplitsState.clear(); - inputSplitsState.addAll(Lists.newArrayList(splits)); - } - - @Override - public void processElement(StreamRecord element) { - splits.add(element.getValue()); - enqueueProcessSplits(); - } - - private void enqueueProcessSplits() { - if (currentSplitState == SplitState.IDLE && !splits.isEmpty()) { - currentSplitState = SplitState.RUNNING; - executor.execute(this::processSplits, this.getClass().getSimpleName()); - } - } - - private void processSplits() throws IOException { - FlinkInputSplit split = splits.poll(); - if (split == null) { - currentSplitState = SplitState.IDLE; - return; - } - - format.open(split); - try { - RowData nextElement = null; - while (!format.reachedEnd()) { - nextElement = format.nextRecord(nextElement); - sourceContext.collect(nextElement); - } - } finally { - currentSplitState = SplitState.IDLE; - format.close(); - } - - // Re-schedule to process the next split. - enqueueProcessSplits(); - } - - @Override - public void processWatermark(Watermark mark) { - // we do nothing because we emit our own watermarks if needed. - } - - @Override - public void close() throws Exception { - super.close(); - - if (format != null) { - format.close(); - format.closeInputFormat(); - format = null; - } - - sourceContext = null; - } - - @Override - public void finish() throws Exception { - super.finish(); - output.close(); - if (sourceContext != null) { - sourceContext.emitWatermark(Watermark.MAX_WATERMARK); - sourceContext.close(); - sourceContext = null; - } - } - - static OneInputStreamOperatorFactory factory(FlinkInputFormat format) { - return new OperatorFactory(format); - } - - private enum SplitState { - IDLE, - RUNNING - } - - private static class OperatorFactory extends AbstractStreamOperatorFactory - implements YieldingOperatorFactory, - OneInputStreamOperatorFactory { - - private final FlinkInputFormat format; - - private transient MailboxExecutor mailboxExecutor; - - private OperatorFactory(FlinkInputFormat format) { - this.format = format; - } - - @Override - public void setMailboxExecutor(MailboxExecutor mailboxExecutor) { - this.mailboxExecutor = mailboxExecutor; - } - - @SuppressWarnings("unchecked") - @Override - public > O createStreamOperator( - StreamOperatorParameters parameters) { - StreamingReaderOperator operator = - new StreamingReaderOperator(format, processingTimeService, mailboxExecutor); - operator.setup( - parameters.getContainingTask(), parameters.getStreamConfig(), parameters.getOutput()); - return (O) operator; - } - - @Override - public Class getStreamOperatorClass(ClassLoader classLoader) { - return StreamingReaderOperator.class; - } - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/StreamingStartingStrategy.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/StreamingStartingStrategy.java deleted file mode 100644 index 11707bf82a0f..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/StreamingStartingStrategy.java +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -/** Starting strategy for streaming execution. */ -public enum StreamingStartingStrategy { - /** - * Do a regular table scan then switch to the incremental mode. - * - *
    The incremental mode starts from the current snapshot exclusive. - */ - TABLE_SCAN_THEN_INCREMENTAL, - - /** - * Start incremental mode from the latest snapshot inclusive. - * - *
    If it is an empty map, all future append snapshots should be discovered. - */ - INCREMENTAL_FROM_LATEST_SNAPSHOT, - - /** - * Start incremental mode from the earliest snapshot inclusive. - * - *
    If it is an empty map, all future append snapshots should be discovered. - */ - INCREMENTAL_FROM_EARLIEST_SNAPSHOT, - - /** Start incremental mode from a snapshot with a specific id inclusive. */ - INCREMENTAL_FROM_SNAPSHOT_ID, - - /** - * Start incremental mode from a snapshot with a specific timestamp inclusive. - * - *
    If the timestamp is between two snapshots, it should start from the snapshot after the - * timestamp. - */ - INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/assigner/DefaultSplitAssigner.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/assigner/DefaultSplitAssigner.java deleted file mode 100644 index 37a0f1a6055f..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/assigner/DefaultSplitAssigner.java +++ /dev/null @@ -1,112 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.assigner; - -import java.util.ArrayDeque; -import java.util.Collection; -import java.util.PriorityQueue; -import java.util.Queue; -import java.util.concurrent.CompletableFuture; -import java.util.stream.Collectors; -import javax.annotation.Nullable; -import org.apache.flink.annotation.Internal; -import org.apache.iceberg.flink.source.split.IcebergSourceSplit; -import org.apache.iceberg.flink.source.split.IcebergSourceSplitState; -import org.apache.iceberg.flink.source.split.IcebergSourceSplitStatus; -import org.apache.iceberg.flink.source.split.SerializableComparator; - -/** - * Since all methods are called in the source coordinator thread by enumerator, there is no need for - * locking. - */ -@Internal -public class DefaultSplitAssigner implements SplitAssigner { - - private final Queue pendingSplits; - private CompletableFuture availableFuture; - - public DefaultSplitAssigner(SerializableComparator comparator) { - this.pendingSplits = comparator == null ? new ArrayDeque<>() : new PriorityQueue<>(comparator); - } - - public DefaultSplitAssigner( - SerializableComparator comparator, - Collection assignerState) { - this(comparator); - // Because default assigner only tracks unassigned splits, - // there is no need to filter splits based on status (unassigned) here. 
- assignerState.forEach(splitState -> pendingSplits.add(splitState.split())); - } - - @Override - public synchronized GetSplitResult getNext(@Nullable String hostname) { - if (pendingSplits.isEmpty()) { - return GetSplitResult.unavailable(); - } else { - IcebergSourceSplit split = pendingSplits.poll(); - return GetSplitResult.forSplit(split); - } - } - - @Override - public void onDiscoveredSplits(Collection splits) { - addSplits(splits); - } - - @Override - public void onUnassignedSplits(Collection splits) { - addSplits(splits); - } - - private synchronized void addSplits(Collection splits) { - if (!splits.isEmpty()) { - pendingSplits.addAll(splits); - // only complete pending future if new splits are discovered - completeAvailableFuturesIfNeeded(); - } - } - - /** Simple assigner only tracks unassigned splits */ - @Override - public synchronized Collection state() { - return pendingSplits.stream() - .map(split -> new IcebergSourceSplitState(split, IcebergSourceSplitStatus.UNASSIGNED)) - .collect(Collectors.toList()); - } - - @Override - public synchronized CompletableFuture isAvailable() { - if (availableFuture == null) { - availableFuture = new CompletableFuture<>(); - } - return availableFuture; - } - - @Override - public synchronized int pendingSplitCount() { - return pendingSplits.size(); - } - - private synchronized void completeAvailableFuturesIfNeeded() { - if (availableFuture != null && !pendingSplits.isEmpty()) { - availableFuture.complete(null); - } - availableFuture = null; - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/assigner/GetSplitResult.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/assigner/GetSplitResult.java deleted file mode 100644 index 72deaeb890f3..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/assigner/GetSplitResult.java +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.assigner; - -import org.apache.flink.annotation.Internal; -import org.apache.flink.util.Preconditions; -import org.apache.iceberg.flink.source.split.IcebergSourceSplit; - -@Internal -public class GetSplitResult { - - public enum Status { - AVAILABLE, - - /** - * There are pending splits. But they can't be assigned due to constraints (like event time - * alignment) - */ - CONSTRAINED, - - /** Assigner doesn't have pending splits. 
*/ - UNAVAILABLE - } - - private final Status status; - private final IcebergSourceSplit split; - - private GetSplitResult(Status status) { - this.status = status; - this.split = null; - } - - private GetSplitResult(IcebergSourceSplit split) { - Preconditions.checkNotNull(split, "Split cannot be null"); - this.status = Status.AVAILABLE; - this.split = split; - } - - public Status status() { - return status; - } - - public IcebergSourceSplit split() { - return split; - } - - private static final GetSplitResult UNAVAILABLE = new GetSplitResult(Status.UNAVAILABLE); - private static final GetSplitResult CONSTRAINED = new GetSplitResult(Status.CONSTRAINED); - - public static GetSplitResult unavailable() { - return UNAVAILABLE; - } - - public static GetSplitResult constrained() { - return CONSTRAINED; - } - - public static GetSplitResult forSplit(IcebergSourceSplit split) { - return new GetSplitResult(split); - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/assigner/OrderedSplitAssignerFactory.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/assigner/OrderedSplitAssignerFactory.java deleted file mode 100644 index e58478897aef..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/assigner/OrderedSplitAssignerFactory.java +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.assigner; - -import java.util.Collection; -import org.apache.iceberg.flink.source.split.IcebergSourceSplit; -import org.apache.iceberg.flink.source.split.IcebergSourceSplitState; -import org.apache.iceberg.flink.source.split.SerializableComparator; - -/** - * Create default assigner with a comparator that hands out splits where the order of the splits - * will be defined by the {@link SerializableComparator}. 
- */ -public class OrderedSplitAssignerFactory implements SplitAssignerFactory { - private final SerializableComparator comparator; - - public OrderedSplitAssignerFactory(SerializableComparator comparator) { - this.comparator = comparator; - } - - @Override - public SplitAssigner createAssigner() { - return new DefaultSplitAssigner(comparator); - } - - @Override - public SplitAssigner createAssigner(Collection assignerState) { - return new DefaultSplitAssigner(comparator, assignerState); - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SimpleSplitAssignerFactory.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SimpleSplitAssignerFactory.java deleted file mode 100644 index a2e2ff364d46..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SimpleSplitAssignerFactory.java +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.assigner; - -import java.util.Collection; -import org.apache.iceberg.flink.source.split.IcebergSourceSplitState; - -/** Create simple assigner that hands out splits without any guarantee in order or locality. */ -public class SimpleSplitAssignerFactory implements SplitAssignerFactory { - public SimpleSplitAssignerFactory() {} - - @Override - public SplitAssigner createAssigner() { - return new DefaultSplitAssigner(null); - } - - @Override - public SplitAssigner createAssigner(Collection assignerState) { - return new DefaultSplitAssigner(null, assignerState); - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssigner.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssigner.java deleted file mode 100644 index ca60612f0ec9..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssigner.java +++ /dev/null @@ -1,118 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. 
See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.assigner; - -import java.io.Closeable; -import java.util.Collection; -import java.util.concurrent.CompletableFuture; -import javax.annotation.Nullable; -import org.apache.flink.api.connector.source.SplitEnumeratorContext; -import org.apache.iceberg.flink.source.ScanContext; -import org.apache.iceberg.flink.source.split.IcebergSourceSplit; -import org.apache.iceberg.flink.source.split.IcebergSourceSplitState; - -/** - * SplitAssigner interface is extracted out as a separate component so that we can plug in different - * split assignment strategy for different requirements. E.g. - * - *
  • Simple assigner with no ordering guarantee or locality aware optimization. - *   • Locality aware assigner that prefer splits that are local. - *   • Snapshot aware assigner that assign splits based on the order they are committed. - *   • Event time alignment assigner that assign splits satisfying certain time ordering within a - * single source or across sources. - * - *
    Assigner implementation needs to be thread safe. Enumerator call the assigner APIs mostly from - * the coordinator thread. But enumerator may call the {@link SplitAssigner#pendingSplitCount()} - * from the I/O threads. - */ -public interface SplitAssigner extends Closeable { - - /** - * Some assigners may need to start background threads or perform other activity such as - * registering as listeners to updates from other event sources e.g., watermark tracker. - */ - default void start() {} - - /** - * Some assigners may need to perform certain actions when their corresponding enumerators are - * closed - */ - @Override - default void close() {} - - /** - * Request a new split from the assigner when enumerator trying to assign splits to awaiting - * readers. - * - *
    If enumerator wasn't able to assign the split (e.g., reader disconnected), enumerator should - * call {@link SplitAssigner#onUnassignedSplits} to return the split. - */ - GetSplitResult getNext(@Nullable String hostname); - - /** Add new splits discovered by enumerator */ - void onDiscoveredSplits(Collection splits); - - /** Forward addSplitsBack event (for failed reader) to assigner */ - void onUnassignedSplits(Collection splits); - - /** - * Some assigner (like event time alignment) may rack in-progress splits to advance watermark upon - * completed splits - */ - default void onCompletedSplits(Collection completedSplitIds) {} - - /** - * Get assigner state for checkpointing. This is a super-set API that works for all currently - * imagined assigners. - */ - Collection state(); - - /** - * Enumerator can get a notification via CompletableFuture when the assigner has more splits - * available later. Enumerator should schedule assignment in the thenAccept action of the future. - * - *
    Assigner will return the same future if this method is called again before the previous - * future is completed. - * - *
    The future can be completed from other thread, e.g. the coordinator thread from another - * thread for event time alignment. - * - *
    If enumerator need to trigger action upon the future completion, it may want to run it in - * the coordinator thread using {@link SplitEnumeratorContext#runInCoordinatorThread(Runnable)}. - */ - CompletableFuture isAvailable(); - - /** - * Return the number of pending splits that haven't been assigned yet. - * - *
    The enumerator can poll this API to publish a metric on the number of pending splits. - * - *
    The enumerator can also use this information to throttle split discovery for streaming read. - * If there are already many pending splits tracked by the assigner, it is undesirable to discover - * more splits and track them in the assigner. That will increase the memory footprint and - * enumerator checkpoint size. - * - *
    Throttling works better together with {@link ScanContext#maxPlanningSnapshotCount()}. - * Otherwise, the next split discovery after throttling will just discover all non-enumerated - * snapshots and splits, which defeats the purpose of throttling. - */ - int pendingSplitCount(); -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssignerFactory.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssignerFactory.java deleted file mode 100644 index 6e02a556ffcd..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssignerFactory.java +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.assigner; - -import java.io.Serializable; -import java.util.Collection; -import org.apache.iceberg.flink.source.split.IcebergSourceSplitState; - -public interface SplitAssignerFactory extends Serializable { - - SplitAssigner createAssigner(); - - SplitAssigner createAssigner(Collection assignerState); -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssignerType.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssignerType.java deleted file mode 100644 index 03ba67a554f9..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssignerType.java +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source.assigner; - -import org.apache.flink.annotation.Internal; - -@Internal -public enum SplitAssignerType { - SIMPLE { - @Override - public SplitAssignerFactory factory() { - return new SimpleSplitAssignerFactory(); - } - }; - - public abstract SplitAssignerFactory factory(); -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/AbstractIcebergEnumerator.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/AbstractIcebergEnumerator.java deleted file mode 100644 index 801baf77a612..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/AbstractIcebergEnumerator.java +++ /dev/null @@ -1,182 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.enumerator; - -import java.io.IOException; -import java.util.Iterator; -import java.util.LinkedHashMap; -import java.util.List; -import java.util.Map; -import java.util.concurrent.CompletableFuture; -import java.util.concurrent.atomic.AtomicReference; -import javax.annotation.Nullable; -import org.apache.flink.api.connector.source.SourceEvent; -import org.apache.flink.api.connector.source.SplitEnumerator; -import org.apache.flink.api.connector.source.SplitEnumeratorContext; -import org.apache.flink.api.connector.source.SupportsHandleExecutionAttemptSourceEvent; -import org.apache.iceberg.flink.source.assigner.GetSplitResult; -import org.apache.iceberg.flink.source.assigner.SplitAssigner; -import org.apache.iceberg.flink.source.split.IcebergSourceSplit; -import org.apache.iceberg.flink.source.split.SplitRequestEvent; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * TODO: publish enumerator monitor metrics like number of pending metrics after FLINK-21000 is - * resolved - */ -abstract class AbstractIcebergEnumerator - implements SplitEnumerator, - SupportsHandleExecutionAttemptSourceEvent { - private static final Logger LOG = LoggerFactory.getLogger(AbstractIcebergEnumerator.class); - - private final SplitEnumeratorContext enumeratorContext; - private final SplitAssigner assigner; - private final Map readersAwaitingSplit; - private final AtomicReference> availableFuture; - - AbstractIcebergEnumerator( - SplitEnumeratorContext enumeratorContext, SplitAssigner assigner) { - this.enumeratorContext = enumeratorContext; - this.assigner = assigner; - this.readersAwaitingSplit = new LinkedHashMap<>(); - this.availableFuture = new AtomicReference<>(); - } - - @Override - public void start() { - assigner.start(); - } - - @Override - public void close() throws IOException { - assigner.close(); - } - - @Override - public void handleSplitRequest(int subtaskId, @Nullable String requesterHostname) { - // 
Iceberg source uses custom split request event to piggyback finished split ids. - throw new UnsupportedOperationException( - String.format( - "Received invalid default split request event " - + "from subtask %d as Iceberg source uses custom split request event", - subtaskId)); - } - - @Override - public void handleSourceEvent(int subtaskId, SourceEvent sourceEvent) { - if (sourceEvent instanceof SplitRequestEvent) { - SplitRequestEvent splitRequestEvent = (SplitRequestEvent) sourceEvent; - LOG.info("Received request split event from subtask {}", subtaskId); - assigner.onCompletedSplits(splitRequestEvent.finishedSplitIds()); - readersAwaitingSplit.put(subtaskId, splitRequestEvent.requesterHostname()); - assignSplits(); - } else { - throw new IllegalArgumentException( - String.format( - "Received unknown event from subtask %d: %s", - subtaskId, sourceEvent.getClass().getCanonicalName())); - } - } - - // Flink's SourceCoordinator already keeps track of subTask to splits mapping. - // It already takes care of re-assigning splits to speculated attempts as well. - @Override - public void handleSourceEvent(int subTaskId, int attemptNumber, SourceEvent sourceEvent) { - handleSourceEvent(subTaskId, sourceEvent); - } - - @Override - public void addSplitsBack(List splits, int subtaskId) { - LOG.info("Add {} splits back to the pool for failed subtask {}", splits.size(), subtaskId); - assigner.onUnassignedSplits(splits); - assignSplits(); - } - - @Override - public void addReader(int subtaskId) { - LOG.info("Added reader: {}", subtaskId); - } - - private void assignSplits() { - LOG.info("Assigning splits for {} awaiting readers", readersAwaitingSplit.size()); - Iterator> awaitingReader = - readersAwaitingSplit.entrySet().iterator(); - while (awaitingReader.hasNext()) { - Map.Entry nextAwaiting = awaitingReader.next(); - // if the reader that requested another split has failed in the meantime, remove - // it from the list of waiting readers - if (!enumeratorContext.registeredReaders().containsKey(nextAwaiting.getKey())) { - awaitingReader.remove(); - continue; - } - - int awaitingSubtask = nextAwaiting.getKey(); - String hostname = nextAwaiting.getValue(); - GetSplitResult getResult = assigner.getNext(hostname); - if (getResult.status() == GetSplitResult.Status.AVAILABLE) { - LOG.info("Assign split to subtask {}: {}", awaitingSubtask, getResult.split()); - enumeratorContext.assignSplit(getResult.split(), awaitingSubtask); - awaitingReader.remove(); - } else if (getResult.status() == GetSplitResult.Status.CONSTRAINED) { - getAvailableFutureIfNeeded(); - break; - } else if (getResult.status() == GetSplitResult.Status.UNAVAILABLE) { - if (shouldWaitForMoreSplits()) { - getAvailableFutureIfNeeded(); - break; - } else { - LOG.info("No more splits available for subtask {}", awaitingSubtask); - enumeratorContext.signalNoMoreSplits(awaitingSubtask); - awaitingReader.remove(); - } - } else { - throw new IllegalArgumentException("Unsupported status: " + getResult.status()); - } - } - } - - /** return true if enumerator should wait for splits like in the continuous enumerator case */ - protected abstract boolean shouldWaitForMoreSplits(); - - private synchronized void getAvailableFutureIfNeeded() { - if (availableFuture.get() != null) { - return; - } - - CompletableFuture future = - assigner - .isAvailable() - .thenAccept( - ignore -> - // Must run assignSplits in coordinator thread - // because the future may be completed from other threads. 
- // E.g., in event time alignment assigner, - // watermark advancement from another source may - // cause the available future to be completed - enumeratorContext.runInCoordinatorThread( - () -> { - LOG.debug("Executing callback of assignSplits"); - availableFuture.set(null); - assignSplits(); - })); - availableFuture.set(future); - LOG.debug("Registered callback for future available splits"); - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousEnumerationResult.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousEnumerationResult.java deleted file mode 100644 index 41863ffee60b..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousEnumerationResult.java +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.enumerator; - -import java.util.Collection; -import org.apache.iceberg.flink.source.split.IcebergSourceSplit; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; - -class ContinuousEnumerationResult { - private final Collection splits; - private final IcebergEnumeratorPosition fromPosition; - private final IcebergEnumeratorPosition toPosition; - - /** - * @param splits should never be null. But it can be an empty collection - * @param fromPosition can be null - * @param toPosition should never be null. But it can have null snapshotId and snapshotTimestampMs - */ - ContinuousEnumerationResult( - Collection splits, - IcebergEnumeratorPosition fromPosition, - IcebergEnumeratorPosition toPosition) { - Preconditions.checkArgument(splits != null, "Invalid to splits collection: null"); - Preconditions.checkArgument(toPosition != null, "Invalid end position: null"); - this.splits = splits; - this.fromPosition = fromPosition; - this.toPosition = toPosition; - } - - public Collection splits() { - return splits; - } - - public IcebergEnumeratorPosition fromPosition() { - return fromPosition; - } - - public IcebergEnumeratorPosition toPosition() { - return toPosition; - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousIcebergEnumerator.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousIcebergEnumerator.java deleted file mode 100644 index 55451b105885..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousIcebergEnumerator.java +++ /dev/null @@ -1,178 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.enumerator; - -import java.io.IOException; -import java.util.Collections; -import java.util.Objects; -import java.util.concurrent.atomic.AtomicReference; -import javax.annotation.Nullable; -import org.apache.flink.annotation.Internal; -import org.apache.flink.api.connector.source.SplitEnumeratorContext; -import org.apache.iceberg.flink.source.ScanContext; -import org.apache.iceberg.flink.source.assigner.SplitAssigner; -import org.apache.iceberg.flink.source.split.IcebergSourceSplit; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -@Internal -public class ContinuousIcebergEnumerator extends AbstractIcebergEnumerator { - - private static final Logger LOG = LoggerFactory.getLogger(ContinuousIcebergEnumerator.class); - /** - * This is hardcoded, as {@link ScanContext#maxPlanningSnapshotCount()} could be the knob to - * control the total number of snapshots worth of splits tracked by assigner. - */ - private static final int ENUMERATION_SPLIT_COUNT_HISTORY_SIZE = 3; - - private final SplitEnumeratorContext enumeratorContext; - private final SplitAssigner assigner; - private final ScanContext scanContext; - private final ContinuousSplitPlanner splitPlanner; - - /** - * snapshotId for the last enumerated snapshot. next incremental enumeration should be based off - * this as the starting position. - */ - private final AtomicReference enumeratorPosition; - - /** Track enumeration result history for split discovery throttling. 
*/ - private final EnumerationHistory enumerationHistory; - - /** Count the consecutive failures and throw exception if the max allowed failres are reached */ - private transient int consecutiveFailures = 0; - - public ContinuousIcebergEnumerator( - SplitEnumeratorContext enumeratorContext, - SplitAssigner assigner, - ScanContext scanContext, - ContinuousSplitPlanner splitPlanner, - @Nullable IcebergEnumeratorState enumState) { - super(enumeratorContext, assigner); - - this.enumeratorContext = enumeratorContext; - this.assigner = assigner; - this.scanContext = scanContext; - this.splitPlanner = splitPlanner; - this.enumeratorPosition = new AtomicReference<>(); - this.enumerationHistory = new EnumerationHistory(ENUMERATION_SPLIT_COUNT_HISTORY_SIZE); - - if (enumState != null) { - this.enumeratorPosition.set(enumState.lastEnumeratedPosition()); - this.enumerationHistory.restore(enumState.enumerationSplitCountHistory()); - } - } - - @Override - public void start() { - super.start(); - enumeratorContext.callAsync( - this::discoverSplits, - this::processDiscoveredSplits, - 0L, - scanContext.monitorInterval().toMillis()); - } - - @Override - public void close() throws IOException { - splitPlanner.close(); - super.close(); - } - - @Override - protected boolean shouldWaitForMoreSplits() { - return true; - } - - @Override - public IcebergEnumeratorState snapshotState(long checkpointId) { - return new IcebergEnumeratorState( - enumeratorPosition.get(), assigner.state(), enumerationHistory.snapshot()); - } - - /** This method is executed in an IO thread pool. */ - private ContinuousEnumerationResult discoverSplits() { - int pendingSplitCountFromAssigner = assigner.pendingSplitCount(); - if (enumerationHistory.shouldPauseSplitDiscovery(pendingSplitCountFromAssigner)) { - // If the assigner already has many pending splits, it is better to pause split discovery. - // Otherwise, eagerly discovering more splits will just increase assigner memory footprint - // and enumerator checkpoint state size. - LOG.info( - "Pause split discovery as the assigner already has too many pending splits: {}", - pendingSplitCountFromAssigner); - return new ContinuousEnumerationResult( - Collections.emptyList(), enumeratorPosition.get(), enumeratorPosition.get()); - } else { - return splitPlanner.planSplits(enumeratorPosition.get()); - } - } - - /** This method is executed in a single coordinator thread. */ - private void processDiscoveredSplits(ContinuousEnumerationResult result, Throwable error) { - if (error == null) { - consecutiveFailures = 0; - if (!Objects.equals(result.fromPosition(), enumeratorPosition.get())) { - // Multiple discoverSplits() may be triggered with the same starting snapshot to the I/O - // thread pool. E.g., the splitDiscoveryInterval is very short (like 10 ms in some unit - // tests) or the thread pool is busy and multiple discovery actions are executed - // concurrently. Discovery result should only be accepted if the starting position - // matches the enumerator position (like compare-and-swap). - LOG.info( - "Skip {} discovered splits because the scan starting position doesn't match " - + "the current enumerator position: enumerator position = {}, scan starting position = {}", - result.splits().size(), - enumeratorPosition.get(), - result.fromPosition()); - } else { - // Sometimes, enumeration may yield no splits for a few reasons. - // - upstream paused or delayed streaming writes to the Iceberg table. - // - enumeration frequency is higher than the upstream write frequency. 
- if (!result.splits().isEmpty()) { - assigner.onDiscoveredSplits(result.splits()); - // EnumerationHistory makes throttling decision on split discovery - // based on the total number of splits discovered in the last a few cycles. - // Only update enumeration history when there are some discovered splits. - enumerationHistory.add(result.splits().size()); - LOG.info( - "Added {} splits discovered between ({}, {}] to the assigner", - result.splits().size(), - result.fromPosition(), - result.toPosition()); - } else { - LOG.info( - "No new splits discovered between ({}, {}]", - result.fromPosition(), - result.toPosition()); - } - // update the enumerator position even if there is no split discovered - // or the toPosition is empty (e.g. for empty table). - enumeratorPosition.set(result.toPosition()); - LOG.info("Update enumerator position to {}", result.toPosition()); - } - } else { - consecutiveFailures++; - if (scanContext.maxAllowedPlanningFailures() < 0 - || consecutiveFailures <= scanContext.maxAllowedPlanningFailures()) { - LOG.error("Failed to discover new splits", error); - } else { - throw new RuntimeException("Failed to discover new splits", error); - } - } - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousSplitPlanner.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousSplitPlanner.java deleted file mode 100644 index 2a1325178873..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousSplitPlanner.java +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.enumerator; - -import java.io.Closeable; -import org.apache.flink.annotation.Internal; - -/** This interface is introduced so that we can plug in different split planner for unit test */ -@Internal -public interface ContinuousSplitPlanner extends Closeable { - - /** Discover the files appended between {@code lastPosition} and current table snapshot */ - ContinuousEnumerationResult planSplits(IcebergEnumeratorPosition lastPosition); -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousSplitPlannerImpl.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousSplitPlannerImpl.java deleted file mode 100644 index fef4ec45ed8a..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousSplitPlannerImpl.java +++ /dev/null @@ -1,240 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.enumerator; - -import java.io.IOException; -import java.util.Collections; -import java.util.List; -import java.util.Optional; -import java.util.concurrent.ExecutorService; -import org.apache.flink.annotation.Internal; -import org.apache.flink.annotation.VisibleForTesting; -import org.apache.flink.util.Preconditions; -import org.apache.iceberg.Snapshot; -import org.apache.iceberg.Table; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.flink.source.FlinkSplitPlanner; -import org.apache.iceberg.flink.source.ScanContext; -import org.apache.iceberg.flink.source.StreamingStartingStrategy; -import org.apache.iceberg.flink.source.split.IcebergSourceSplit; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.util.SnapshotUtil; -import org.apache.iceberg.util.ThreadPools; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -@Internal -public class ContinuousSplitPlannerImpl implements ContinuousSplitPlanner { - private static final Logger LOG = LoggerFactory.getLogger(ContinuousSplitPlannerImpl.class); - - private final Table table; - private final ScanContext scanContext; - private final boolean isSharedPool; - private final ExecutorService workerPool; - private final TableLoader tableLoader; - - /** - * @param tableLoader A cloned tableLoader. - * @param threadName thread name prefix for worker pool to run the split planning. If null, a - * shared worker pool will be used. - */ - public ContinuousSplitPlannerImpl( - TableLoader tableLoader, ScanContext scanContext, String threadName) { - this.tableLoader = tableLoader.clone(); - this.tableLoader.open(); - this.table = this.tableLoader.loadTable(); - this.scanContext = scanContext; - this.isSharedPool = threadName == null; - this.workerPool = - isSharedPool - ? 
ThreadPools.getWorkerPool() - : ThreadPools.newWorkerPool( - "iceberg-plan-worker-pool-" + threadName, scanContext.planParallelism()); - } - - @Override - public void close() throws IOException { - if (!isSharedPool) { - workerPool.shutdown(); - } - tableLoader.close(); - } - - @Override - public ContinuousEnumerationResult planSplits(IcebergEnumeratorPosition lastPosition) { - table.refresh(); - if (lastPosition != null) { - return discoverIncrementalSplits(lastPosition); - } else { - return discoverInitialSplits(); - } - } - - private Snapshot toSnapshotInclusive( - Long lastConsumedSnapshotId, Snapshot currentSnapshot, int maxPlanningSnapshotCount) { - // snapshots are in reverse order (latest snapshot first) - List snapshots = - Lists.newArrayList( - SnapshotUtil.ancestorsBetween( - table, currentSnapshot.snapshotId(), lastConsumedSnapshotId)); - if (snapshots.size() <= maxPlanningSnapshotCount) { - return currentSnapshot; - } else { - // Because snapshots are in reverse order of commit history, this index returns - // the max allowed number of snapshots from the lastConsumedSnapshotId. - return snapshots.get(snapshots.size() - maxPlanningSnapshotCount); - } - } - - private ContinuousEnumerationResult discoverIncrementalSplits( - IcebergEnumeratorPosition lastPosition) { - Snapshot currentSnapshot = - scanContext.branch() != null - ? table.snapshot(scanContext.branch()) - : table.currentSnapshot(); - - if (currentSnapshot == null) { - // empty table - Preconditions.checkArgument( - lastPosition.snapshotId() == null, - "Invalid last enumerated position for an empty table: not null"); - LOG.info("Skip incremental scan because table is empty"); - return new ContinuousEnumerationResult(Collections.emptyList(), lastPosition, lastPosition); - } else if (lastPosition.snapshotId() != null - && currentSnapshot.snapshotId() == lastPosition.snapshotId()) { - LOG.info("Current table snapshot is already enumerated: {}", currentSnapshot.snapshotId()); - return new ContinuousEnumerationResult(Collections.emptyList(), lastPosition, lastPosition); - } else { - Long lastConsumedSnapshotId = lastPosition.snapshotId(); - Snapshot toSnapshotInclusive = - toSnapshotInclusive( - lastConsumedSnapshotId, currentSnapshot, scanContext.maxPlanningSnapshotCount()); - IcebergEnumeratorPosition newPosition = - IcebergEnumeratorPosition.of( - toSnapshotInclusive.snapshotId(), toSnapshotInclusive.timestampMillis()); - ScanContext incrementalScan = - scanContext.copyWithAppendsBetween( - lastPosition.snapshotId(), toSnapshotInclusive.snapshotId()); - List splits = - FlinkSplitPlanner.planIcebergSourceSplits(table, incrementalScan, workerPool); - LOG.info( - "Discovered {} splits from incremental scan: " - + "from snapshot (exclusive) is {}, to snapshot (inclusive) is {}", - splits.size(), - lastPosition, - newPosition); - return new ContinuousEnumerationResult(splits, lastPosition, newPosition); - } - } - - /** - * Discovery initial set of splits based on {@link StreamingStartingStrategy}. - *
  • {@link ContinuousEnumerationResult#splits()} should contain initial splits discovered from - * table scan for {@link StreamingStartingStrategy#TABLE_SCAN_THEN_INCREMENTAL}. For all other - * strategies, splits collection should be empty. - *
  • {@link ContinuousEnumerationResult#toPosition()} points to the starting position for the - * next incremental split discovery with exclusive behavior. Meaning files committed by the - * snapshot from the position in {@code ContinuousEnumerationResult} won't be included in the - * next incremental scan. - */ - private ContinuousEnumerationResult discoverInitialSplits() { - Optional startSnapshotOptional = startSnapshot(table, scanContext); - if (!startSnapshotOptional.isPresent()) { - return new ContinuousEnumerationResult( - Collections.emptyList(), null, IcebergEnumeratorPosition.empty()); - } - - Snapshot startSnapshot = startSnapshotOptional.get(); - LOG.info( - "Get starting snapshot id {} based on strategy {}", - startSnapshot.snapshotId(), - scanContext.streamingStartingStrategy()); - List splits; - IcebergEnumeratorPosition toPosition; - if (scanContext.streamingStartingStrategy() - == StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) { - // do a batch table scan first - splits = - FlinkSplitPlanner.planIcebergSourceSplits( - table, scanContext.copyWithSnapshotId(startSnapshot.snapshotId()), workerPool); - LOG.info( - "Discovered {} splits from initial batch table scan with snapshot Id {}", - splits.size(), - startSnapshot.snapshotId()); - // For TABLE_SCAN_THEN_INCREMENTAL, incremental mode starts exclusive from the startSnapshot - toPosition = - IcebergEnumeratorPosition.of(startSnapshot.snapshotId(), startSnapshot.timestampMillis()); - } else { - // For all other modes, starting snapshot should be consumed inclusively. - // Use parentId to achieve the inclusive behavior. It is fine if parentId is null. - splits = Collections.emptyList(); - Long parentSnapshotId = startSnapshot.parentId(); - if (parentSnapshotId != null) { - Snapshot parentSnapshot = table.snapshot(parentSnapshotId); - Long parentSnapshotTimestampMs = - parentSnapshot != null ? parentSnapshot.timestampMillis() : null; - toPosition = IcebergEnumeratorPosition.of(parentSnapshotId, parentSnapshotTimestampMs); - } else { - toPosition = IcebergEnumeratorPosition.empty(); - } - - LOG.info( - "Start incremental scan with start snapshot (inclusive): id = {}, timestamp = {}", - startSnapshot.snapshotId(), - startSnapshot.timestampMillis()); - } - - return new ContinuousEnumerationResult(splits, null, toPosition); - } - - /** - * Calculate the starting snapshot based on the {@link StreamingStartingStrategy} defined in - * {@code ScanContext}. - * - *
    If the {@link StreamingStartingStrategy} is not {@link - * StreamingStartingStrategy#TABLE_SCAN_THEN_INCREMENTAL}, the start snapshot should be consumed - * inclusively. - */ - @VisibleForTesting - static Optional startSnapshot(Table table, ScanContext scanContext) { - switch (scanContext.streamingStartingStrategy()) { - case TABLE_SCAN_THEN_INCREMENTAL: - case INCREMENTAL_FROM_LATEST_SNAPSHOT: - return Optional.ofNullable(table.currentSnapshot()); - case INCREMENTAL_FROM_EARLIEST_SNAPSHOT: - return Optional.ofNullable(SnapshotUtil.oldestAncestor(table)); - case INCREMENTAL_FROM_SNAPSHOT_ID: - Snapshot matchedSnapshotById = table.snapshot(scanContext.startSnapshotId()); - Preconditions.checkArgument( - matchedSnapshotById != null, - "Start snapshot id not found in history: " + scanContext.startSnapshotId()); - return Optional.of(matchedSnapshotById); - case INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP: - Snapshot matchedSnapshotByTimestamp = - SnapshotUtil.oldestAncestorAfter(table, scanContext.startSnapshotTimestamp()); - Preconditions.checkArgument( - matchedSnapshotByTimestamp != null, - "Cannot find a snapshot after: " + scanContext.startSnapshotTimestamp()); - return Optional.of(matchedSnapshotByTimestamp); - default: - throw new IllegalArgumentException( - "Unknown starting strategy: " + scanContext.streamingStartingStrategy()); - } - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/EnumerationHistory.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/EnumerationHistory.java deleted file mode 100644 index ec56a9ecdac1..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/EnumerationHistory.java +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.enumerator; - -import java.util.Arrays; -import javax.annotation.concurrent.ThreadSafe; -import org.apache.flink.annotation.VisibleForTesting; - -/** - * This enumeration history is used for split discovery throttling. It tracks the discovered split - * count per every non-empty enumeration. 
- */ -@ThreadSafe -class EnumerationHistory { - - private final int[] history; - // int (2B) should be enough without overflow for enumeration history - private int count; - - EnumerationHistory(int maxHistorySize) { - this.history = new int[maxHistorySize]; - } - - synchronized void restore(int[] restoredHistory) { - int startingOffset = 0; - int restoreSize = restoredHistory.length; - - if (restoredHistory.length > history.length) { - // keep the newest history - startingOffset = restoredHistory.length - history.length; - // only restore the latest history up to maxHistorySize - restoreSize = history.length; - } - - System.arraycopy(restoredHistory, startingOffset, history, 0, restoreSize); - count = restoreSize; - } - - synchronized int[] snapshot() { - int len = history.length; - if (count > len) { - int[] copy = new int[len]; - // this is like a circular buffer - int indexForOldest = count % len; - System.arraycopy(history, indexForOldest, copy, 0, len - indexForOldest); - System.arraycopy(history, 0, copy, len - indexForOldest, indexForOldest); - return copy; - } else { - return Arrays.copyOfRange(history, 0, count); - } - } - - /** Add the split count from the last enumeration result. */ - synchronized void add(int splitCount) { - int pos = count % history.length; - history[pos] = splitCount; - count += 1; - } - - @VisibleForTesting - synchronized boolean hasFullHistory() { - return count >= history.length; - } - - /** - * Checks whether split discovery should be paused. - * - * @return true if split discovery should pause because assigner has too many splits already. - */ - synchronized boolean shouldPauseSplitDiscovery(int pendingSplitCountFromAssigner) { - if (count < history.length) { - // only check throttling when full history is obtained. - return false; - } else { - // if ScanContext#maxPlanningSnapshotCount() is 10, each split enumeration can - // discovery splits up to 10 snapshots. if maxHistorySize is 3, the max number of - // splits tracked in assigner shouldn't be more than 10 * (3 + 1) snapshots - // worth of splits. +1 because there could be another enumeration when the - // pending splits fall just below the 10 * 3. - int totalSplitCountFromRecentDiscovery = Arrays.stream(history).reduce(0, Integer::sum); - return pendingSplitCountFromAssigner >= totalSplitCountFromRecentDiscovery; - } - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorPosition.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorPosition.java deleted file mode 100644 index 96aba296f8cf..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorPosition.java +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. 
See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.enumerator; - -import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; -import org.apache.iceberg.relocated.com.google.common.base.Objects; - -class IcebergEnumeratorPosition { - private final Long snapshotId; - // Track snapshot timestamp mainly for info logging - private final Long snapshotTimestampMs; - - static IcebergEnumeratorPosition empty() { - return new IcebergEnumeratorPosition(null, null); - } - - static IcebergEnumeratorPosition of(long snapshotId, Long snapshotTimestampMs) { - return new IcebergEnumeratorPosition(snapshotId, snapshotTimestampMs); - } - - private IcebergEnumeratorPosition(Long snapshotId, Long snapshotTimestampMs) { - this.snapshotId = snapshotId; - this.snapshotTimestampMs = snapshotTimestampMs; - } - - boolean isEmpty() { - return snapshotId == null; - } - - Long snapshotId() { - return snapshotId; - } - - Long snapshotTimestampMs() { - return snapshotTimestampMs; - } - - @Override - public String toString() { - return MoreObjects.toStringHelper(this) - .add("snapshotId", snapshotId) - .add("snapshotTimestampMs", snapshotTimestampMs) - .toString(); - } - - @Override - public int hashCode() { - return Objects.hashCode(snapshotId, snapshotTimestampMs); - } - - @Override - public boolean equals(Object o) { - if (this == o) { - return true; - } - if (o == null || getClass() != o.getClass()) { - return false; - } - IcebergEnumeratorPosition other = (IcebergEnumeratorPosition) o; - return Objects.equal(snapshotId, other.snapshotId()) - && Objects.equal(snapshotTimestampMs, other.snapshotTimestampMs()); - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorPositionSerializer.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorPositionSerializer.java deleted file mode 100644 index 1c63807361c5..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorPositionSerializer.java +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source.enumerator; - -import java.io.IOException; -import org.apache.flink.core.io.SimpleVersionedSerializer; -import org.apache.flink.core.memory.DataInputDeserializer; -import org.apache.flink.core.memory.DataOutputSerializer; - -class IcebergEnumeratorPositionSerializer - implements SimpleVersionedSerializer { - - public static final IcebergEnumeratorPositionSerializer INSTANCE = - new IcebergEnumeratorPositionSerializer(); - - private static final int VERSION = 1; - - private static final ThreadLocal SERIALIZER_CACHE = - ThreadLocal.withInitial(() -> new DataOutputSerializer(128)); - - @Override - public int getVersion() { - return VERSION; - } - - @Override - public byte[] serialize(IcebergEnumeratorPosition position) throws IOException { - return serializeV1(position); - } - - @Override - public IcebergEnumeratorPosition deserialize(int version, byte[] serialized) throws IOException { - switch (version) { - case 1: - return deserializeV1(serialized); - default: - throw new IOException("Unknown version: " + version); - } - } - - private byte[] serializeV1(IcebergEnumeratorPosition position) throws IOException { - DataOutputSerializer out = SERIALIZER_CACHE.get(); - out.writeBoolean(position.snapshotId() != null); - if (position.snapshotId() != null) { - out.writeLong(position.snapshotId()); - } - out.writeBoolean(position.snapshotTimestampMs() != null); - if (position.snapshotTimestampMs() != null) { - out.writeLong(position.snapshotTimestampMs()); - } - byte[] result = out.getCopyOfBuffer(); - out.clear(); - return result; - } - - private IcebergEnumeratorPosition deserializeV1(byte[] serialized) throws IOException { - DataInputDeserializer in = new DataInputDeserializer(serialized); - Long snapshotId = null; - if (in.readBoolean()) { - snapshotId = in.readLong(); - } - - Long snapshotTimestampMs = null; - if (in.readBoolean()) { - snapshotTimestampMs = in.readLong(); - } - - if (snapshotId != null) { - return IcebergEnumeratorPosition.of(snapshotId, snapshotTimestampMs); - } else { - return IcebergEnumeratorPosition.empty(); - } - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorState.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorState.java deleted file mode 100644 index 26fbad46c128..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorState.java +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source.enumerator; - -import java.io.Serializable; -import java.util.Collection; -import javax.annotation.Nullable; -import org.apache.flink.annotation.Internal; -import org.apache.iceberg.flink.source.split.IcebergSourceSplitState; - -/** Enumerator state for checkpointing */ -@Internal -public class IcebergEnumeratorState implements Serializable { - @Nullable private final IcebergEnumeratorPosition lastEnumeratedPosition; - private final Collection pendingSplits; - private final int[] enumerationSplitCountHistory; - - public IcebergEnumeratorState(Collection pendingSplits) { - this(null, pendingSplits); - } - - public IcebergEnumeratorState( - @Nullable IcebergEnumeratorPosition lastEnumeratedPosition, - Collection pendingSplits) { - this(lastEnumeratedPosition, pendingSplits, new int[0]); - } - - public IcebergEnumeratorState( - @Nullable IcebergEnumeratorPosition lastEnumeratedPosition, - Collection pendingSplits, - int[] enumerationSplitCountHistory) { - this.lastEnumeratedPosition = lastEnumeratedPosition; - this.pendingSplits = pendingSplits; - this.enumerationSplitCountHistory = enumerationSplitCountHistory; - } - - @Nullable - public IcebergEnumeratorPosition lastEnumeratedPosition() { - return lastEnumeratedPosition; - } - - public Collection pendingSplits() { - return pendingSplits; - } - - public int[] enumerationSplitCountHistory() { - return enumerationSplitCountHistory; - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorStateSerializer.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorStateSerializer.java deleted file mode 100644 index f76f8a69ff0e..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorStateSerializer.java +++ /dev/null @@ -1,194 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source.enumerator; - -import java.io.IOException; -import java.util.Collection; -import org.apache.flink.annotation.Internal; -import org.apache.flink.annotation.VisibleForTesting; -import org.apache.flink.core.io.SimpleVersionedSerializer; -import org.apache.flink.core.memory.DataInputDeserializer; -import org.apache.flink.core.memory.DataOutputSerializer; -import org.apache.iceberg.flink.source.split.IcebergSourceSplit; -import org.apache.iceberg.flink.source.split.IcebergSourceSplitSerializer; -import org.apache.iceberg.flink.source.split.IcebergSourceSplitState; -import org.apache.iceberg.flink.source.split.IcebergSourceSplitStatus; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; - -@Internal -public class IcebergEnumeratorStateSerializer - implements SimpleVersionedSerializer { - - private static final int VERSION = 2; - - private static final ThreadLocal SERIALIZER_CACHE = - ThreadLocal.withInitial(() -> new DataOutputSerializer(1024)); - - private final IcebergEnumeratorPositionSerializer positionSerializer = - IcebergEnumeratorPositionSerializer.INSTANCE; - private final IcebergSourceSplitSerializer splitSerializer; - - public IcebergEnumeratorStateSerializer(boolean caseSensitive) { - this.splitSerializer = new IcebergSourceSplitSerializer(caseSensitive); - } - - @Override - public int getVersion() { - return VERSION; - } - - @Override - public byte[] serialize(IcebergEnumeratorState enumState) throws IOException { - return serializeV2(enumState); - } - - @Override - public IcebergEnumeratorState deserialize(int version, byte[] serialized) throws IOException { - switch (version) { - case 1: - return deserializeV1(serialized); - case 2: - return deserializeV2(serialized); - default: - throw new IOException("Unknown version: " + version); - } - } - - @VisibleForTesting - byte[] serializeV1(IcebergEnumeratorState enumState) throws IOException { - DataOutputSerializer out = SERIALIZER_CACHE.get(); - serializeEnumeratorPosition(out, enumState.lastEnumeratedPosition(), positionSerializer); - serializePendingSplits(out, enumState.pendingSplits(), splitSerializer); - byte[] result = out.getCopyOfBuffer(); - out.clear(); - return result; - } - - @VisibleForTesting - IcebergEnumeratorState deserializeV1(byte[] serialized) throws IOException { - DataInputDeserializer in = new DataInputDeserializer(serialized); - IcebergEnumeratorPosition enumeratorPosition = - deserializeEnumeratorPosition(in, positionSerializer); - Collection pendingSplits = - deserializePendingSplits(in, splitSerializer); - return new IcebergEnumeratorState(enumeratorPosition, pendingSplits); - } - - @VisibleForTesting - byte[] serializeV2(IcebergEnumeratorState enumState) throws IOException { - DataOutputSerializer out = SERIALIZER_CACHE.get(); - serializeEnumeratorPosition(out, enumState.lastEnumeratedPosition(), positionSerializer); - serializePendingSplits(out, enumState.pendingSplits(), splitSerializer); - serializeEnumerationSplitCountHistory(out, enumState.enumerationSplitCountHistory()); - byte[] result = out.getCopyOfBuffer(); - out.clear(); - return result; - } - - @VisibleForTesting - IcebergEnumeratorState deserializeV2(byte[] serialized) throws IOException { - DataInputDeserializer in = new DataInputDeserializer(serialized); - IcebergEnumeratorPosition enumeratorPosition = - deserializeEnumeratorPosition(in, positionSerializer); - Collection pendingSplits = - deserializePendingSplits(in, splitSerializer); - int[] enumerationSplitCountHistory = 
deserializeEnumerationSplitCountHistory(in); - return new IcebergEnumeratorState( - enumeratorPosition, pendingSplits, enumerationSplitCountHistory); - } - - private static void serializeEnumeratorPosition( - DataOutputSerializer out, - IcebergEnumeratorPosition enumeratorPosition, - IcebergEnumeratorPositionSerializer positionSerializer) - throws IOException { - out.writeBoolean(enumeratorPosition != null); - if (enumeratorPosition != null) { - out.writeInt(positionSerializer.getVersion()); - byte[] positionBytes = positionSerializer.serialize(enumeratorPosition); - out.writeInt(positionBytes.length); - out.write(positionBytes); - } - } - - private static IcebergEnumeratorPosition deserializeEnumeratorPosition( - DataInputDeserializer in, IcebergEnumeratorPositionSerializer positionSerializer) - throws IOException { - IcebergEnumeratorPosition enumeratorPosition = null; - if (in.readBoolean()) { - int version = in.readInt(); - byte[] positionBytes = new byte[in.readInt()]; - in.read(positionBytes); - enumeratorPosition = positionSerializer.deserialize(version, positionBytes); - } - return enumeratorPosition; - } - - private static void serializePendingSplits( - DataOutputSerializer out, - Collection pendingSplits, - IcebergSourceSplitSerializer splitSerializer) - throws IOException { - out.writeInt(splitSerializer.getVersion()); - out.writeInt(pendingSplits.size()); - for (IcebergSourceSplitState splitState : pendingSplits) { - byte[] splitBytes = splitSerializer.serialize(splitState.split()); - out.writeInt(splitBytes.length); - out.write(splitBytes); - out.writeUTF(splitState.status().name()); - } - } - - private static Collection deserializePendingSplits( - DataInputDeserializer in, IcebergSourceSplitSerializer splitSerializer) throws IOException { - int splitSerializerVersion = in.readInt(); - int splitCount = in.readInt(); - Collection pendingSplits = Lists.newArrayListWithCapacity(splitCount); - for (int i = 0; i < splitCount; ++i) { - byte[] splitBytes = new byte[in.readInt()]; - in.read(splitBytes); - IcebergSourceSplit split = splitSerializer.deserialize(splitSerializerVersion, splitBytes); - String statusName = in.readUTF(); - pendingSplits.add( - new IcebergSourceSplitState(split, IcebergSourceSplitStatus.valueOf(statusName))); - } - return pendingSplits; - } - - private static void serializeEnumerationSplitCountHistory( - DataOutputSerializer out, int[] enumerationSplitCountHistory) throws IOException { - out.writeInt(enumerationSplitCountHistory.length); - for (int enumerationSplitCount : enumerationSplitCountHistory) { - out.writeInt(enumerationSplitCount); - } - } - - private static int[] deserializeEnumerationSplitCountHistory(DataInputDeserializer in) - throws IOException { - int historySize = in.readInt(); - int[] history = new int[historySize]; - if (historySize > 0) { - for (int i = 0; i < historySize; ++i) { - history[i] = in.readInt(); - } - } - - return history; - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/StaticIcebergEnumerator.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/StaticIcebergEnumerator.java deleted file mode 100644 index 4e55ea5d5fd6..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/StaticIcebergEnumerator.java +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.enumerator; - -import org.apache.flink.annotation.Internal; -import org.apache.flink.api.connector.source.SplitEnumeratorContext; -import org.apache.iceberg.flink.source.assigner.SplitAssigner; -import org.apache.iceberg.flink.source.split.IcebergSourceSplit; - -/** One-time split enumeration at the start-up for batch execution */ -@Internal -public class StaticIcebergEnumerator extends AbstractIcebergEnumerator { - private final SplitAssigner assigner; - - public StaticIcebergEnumerator( - SplitEnumeratorContext enumeratorContext, SplitAssigner assigner) { - super(enumeratorContext, assigner); - this.assigner = assigner; - } - - @Override - public void start() { - super.start(); - } - - @Override - protected boolean shouldWaitForMoreSplits() { - return false; - } - - @Override - public IcebergEnumeratorState snapshotState(long checkpointId) { - return new IcebergEnumeratorState(null, assigner.state(), new int[0]); - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayBatchRecords.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayBatchRecords.java deleted file mode 100644 index 7b94c364c976..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayBatchRecords.java +++ /dev/null @@ -1,171 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.reader; - -import java.util.Collections; -import java.util.Set; -import javax.annotation.Nullable; -import org.apache.flink.annotation.VisibleForTesting; -import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; -import org.apache.flink.connector.file.src.util.Pool; -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.flink.source.DataIterator; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; - -/** - * {@link RecordsWithSplitIds} is used to pass a batch of records from fetcher to source reader. 
- * Batching is to improve the efficiency for records handover. - * - *

    {@link RecordsWithSplitIds} interface can encapsulate batches from multiple splits. This is - * the case for Kafka source where fetchers can retrieve records from multiple Kafka partitions at - * the same time. - * - *

    For file-based sources like Iceberg, readers always read one split/file at a time. Hence, we - * will only have a batch of records for one split here. - * - *

    This class uses array to store a batch of records from the same file (with the same - * fileOffset). - */ -class ArrayBatchRecords implements RecordsWithSplitIds> { - @Nullable private String splitId; - @Nullable private final Pool.Recycler recycler; - @Nullable private final T[] records; - private final int numberOfRecords; - private final Set finishedSplits; - private final RecordAndPosition recordAndPosition; - - // point to current read position within the records array - private int position; - - private ArrayBatchRecords( - @Nullable String splitId, - @Nullable Pool.Recycler recycler, - @Nullable T[] records, - int numberOfRecords, - int fileOffset, - long startingRecordOffset, - Set finishedSplits) { - Preconditions.checkArgument(numberOfRecords >= 0, "numberOfRecords can't be negative"); - Preconditions.checkArgument(fileOffset >= 0, "fileOffset can't be negative"); - Preconditions.checkArgument(startingRecordOffset >= 0, "numberOfRecords can't be negative"); - - this.splitId = splitId; - this.recycler = recycler; - this.records = records; - this.numberOfRecords = numberOfRecords; - this.finishedSplits = - Preconditions.checkNotNull(finishedSplits, "finishedSplits can be empty but not null"); - this.recordAndPosition = new RecordAndPosition<>(); - - recordAndPosition.set(null, fileOffset, startingRecordOffset); - this.position = 0; - } - - @Nullable - @Override - public String nextSplit() { - String nextSplit = this.splitId; - // set the splitId to null to indicate no more splits - // this class only contains record for one split - this.splitId = null; - return nextSplit; - } - - @Nullable - @Override - public RecordAndPosition nextRecordFromSplit() { - if (position < numberOfRecords) { - recordAndPosition.record(records[position]); - position++; - return recordAndPosition; - } else { - return null; - } - } - - /** - * This method is called when all records from this batch has been emitted. If recycler is set, it - * should be called to return the records array back to pool. - */ - @Override - public void recycle() { - if (recycler != null) { - recycler.recycle(records); - } - } - - @Override - public Set finishedSplits() { - return finishedSplits; - } - - @VisibleForTesting - T[] records() { - return records; - } - - @VisibleForTesting - int numberOfRecords() { - return numberOfRecords; - } - - /** - * Create a ArrayBatchRecords backed up an array with records from the same file - * - * @param splitId Iceberg source only read from one split a time. We never have multiple records - * from multiple splits. - * @param recycler Because {@link DataIterator} with {@link RowData} returns an iterator of reused - * RowData object, we need to clone RowData eagerly when constructing a batch of records. We - * can use object pool to reuse the RowData array object which can be expensive to create. - * This recycler can be provided to recycle the array object back to pool after read is - * exhausted. If the {@link DataIterator} returns an iterator of non-reused objects, we don't - * need to clone objects. It is cheap to just create the batch array. Hence, we don't need - * object pool and recycler can be set to null. 
- * @param records an array (maybe reused) holding a batch of records - * @param numberOfRecords actual number of records in the array - * @param fileOffset fileOffset for all records in this batch - * @param startingRecordOffset starting recordOffset - * @param record type - */ - public static ArrayBatchRecords forRecords( - String splitId, - Pool.Recycler recycler, - T[] records, - int numberOfRecords, - int fileOffset, - long startingRecordOffset) { - return new ArrayBatchRecords<>( - splitId, - recycler, - records, - numberOfRecords, - fileOffset, - startingRecordOffset, - Collections.emptySet()); - } - - /** - * Create ab ArrayBatchRecords with only finished split id - * - * @param splitId for the split that is just exhausted - */ - public static ArrayBatchRecords finishedSplit(String splitId) { - return new ArrayBatchRecords<>(null, null, null, 0, 0, 0, Collections.singleton(splitId)); - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayPoolDataIteratorBatcher.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayPoolDataIteratorBatcher.java deleted file mode 100644 index 306afd1811be..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayPoolDataIteratorBatcher.java +++ /dev/null @@ -1,130 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source.reader; - -import java.io.IOException; -import java.util.NoSuchElementException; -import org.apache.flink.configuration.ReadableConfig; -import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; -import org.apache.flink.connector.base.source.reader.SourceReaderOptions; -import org.apache.flink.connector.file.src.util.Pool; -import org.apache.iceberg.flink.FlinkConfigOptions; -import org.apache.iceberg.flink.source.DataIterator; -import org.apache.iceberg.io.CloseableIterator; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; - -/** This implementation stores record batch in array from recyclable pool */ -class ArrayPoolDataIteratorBatcher implements DataIteratorBatcher { - private final int batchSize; - private final int handoverQueueSize; - private final RecordFactory recordFactory; - - private transient Pool pool; - - ArrayPoolDataIteratorBatcher(ReadableConfig config, RecordFactory recordFactory) { - this.batchSize = config.get(FlinkConfigOptions.SOURCE_READER_FETCH_BATCH_RECORD_COUNT); - this.handoverQueueSize = config.get(SourceReaderOptions.ELEMENT_QUEUE_CAPACITY); - this.recordFactory = recordFactory; - } - - @Override - public CloseableIterator>> batch( - String splitId, DataIterator inputIterator) { - Preconditions.checkArgument(inputIterator != null, "Input data iterator can't be null"); - // lazily create pool as it is not serializable - if (pool == null) { - this.pool = createPoolOfBatches(handoverQueueSize); - } - return new ArrayPoolBatchIterator(splitId, inputIterator, pool); - } - - private Pool createPoolOfBatches(int numBatches) { - Pool poolOfBatches = new Pool<>(numBatches); - for (int batchId = 0; batchId < numBatches; batchId++) { - T[] batch = recordFactory.createBatch(batchSize); - poolOfBatches.add(batch); - } - - return poolOfBatches; - } - - private class ArrayPoolBatchIterator - implements CloseableIterator>> { - - private final String splitId; - private final DataIterator inputIterator; - private final Pool pool; - - ArrayPoolBatchIterator(String splitId, DataIterator inputIterator, Pool pool) { - this.splitId = splitId; - this.inputIterator = inputIterator; - this.pool = pool; - } - - @Override - public boolean hasNext() { - return inputIterator.hasNext(); - } - - @Override - public RecordsWithSplitIds> next() { - if (!inputIterator.hasNext()) { - throw new NoSuchElementException(); - } - - T[] batch = getCachedEntry(); - int recordCount = 0; - while (inputIterator.hasNext() && recordCount < batchSize) { - // The record produced by inputIterator can be reused like for the RowData case. - // inputIterator.next() can't be called again until the copy is made - // since the record is not consumed immediately. - T nextRecord = inputIterator.next(); - recordFactory.clone(nextRecord, batch, recordCount); - recordCount++; - if (!inputIterator.currentFileHasNext()) { - // break early so that records in the ArrayResultIterator - // have the same fileOffset. 
- break; - } - } - - return ArrayBatchRecords.forRecords( - splitId, - pool.recycler(), - batch, - recordCount, - inputIterator.fileOffset(), - inputIterator.recordOffset() - recordCount); - } - - @Override - public void close() throws IOException { - inputIterator.close(); - } - - private T[] getCachedEntry() { - try { - return pool.pollEntry(); - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - throw new RuntimeException("Interrupted while waiting for array pool entry", e); - } - } - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/AvroGenericRecordReaderFunction.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/AvroGenericRecordReaderFunction.java deleted file mode 100644 index 66e59633fff2..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/AvroGenericRecordReaderFunction.java +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.reader; - -import java.util.List; -import org.apache.avro.generic.GenericRecord; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.configuration.ReadableConfig; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.encryption.EncryptionManager; -import org.apache.iceberg.expressions.Expression; -import org.apache.iceberg.flink.source.AvroGenericRecordFileScanTaskReader; -import org.apache.iceberg.flink.source.DataIterator; -import org.apache.iceberg.flink.source.RowDataFileScanTaskReader; -import org.apache.iceberg.flink.source.RowDataToAvroGenericRecordConverter; -import org.apache.iceberg.flink.source.split.IcebergSourceSplit; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; - -/** Read Iceberg rows as {@link GenericRecord}. */ -public class AvroGenericRecordReaderFunction extends DataIteratorReaderFunction { - private final String tableName; - private final Schema readSchema; - private final FileIO io; - private final EncryptionManager encryption; - private final RowDataFileScanTaskReader rowDataReader; - - private transient RowDataToAvroGenericRecordConverter converter; - - /** - * Create a reader function without projection and name mapping. Column name is case-insensitive. 
- */ - public static AvroGenericRecordReaderFunction fromTable(Table table) { - return new AvroGenericRecordReaderFunction( - table.name(), - new Configuration(), - table.schema(), - null, - null, - false, - table.io(), - table.encryption(), - null); - } - - public AvroGenericRecordReaderFunction( - String tableName, - ReadableConfig config, - Schema tableSchema, - Schema projectedSchema, - String nameMapping, - boolean caseSensitive, - FileIO io, - EncryptionManager encryption, - List filters) { - super(new ListDataIteratorBatcher<>(config)); - this.tableName = tableName; - this.readSchema = readSchema(tableSchema, projectedSchema); - this.io = io; - this.encryption = encryption; - this.rowDataReader = - new RowDataFileScanTaskReader(tableSchema, readSchema, nameMapping, caseSensitive, filters); - } - - @Override - protected DataIterator createDataIterator(IcebergSourceSplit split) { - return new DataIterator<>( - new AvroGenericRecordFileScanTaskReader(rowDataReader, lazyConverter()), - split.task(), - io, - encryption); - } - - private RowDataToAvroGenericRecordConverter lazyConverter() { - if (converter == null) { - this.converter = RowDataToAvroGenericRecordConverter.fromIcebergSchema(tableName, readSchema); - } - return converter; - } - - private static Schema readSchema(Schema tableSchema, Schema projectedSchema) { - Preconditions.checkNotNull(tableSchema, "Table schema can't be null"); - return projectedSchema == null ? tableSchema : projectedSchema; - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/ColumnStatsWatermarkExtractor.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/ColumnStatsWatermarkExtractor.java deleted file mode 100644 index 4bb6f0a98c4c..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/ColumnStatsWatermarkExtractor.java +++ /dev/null @@ -1,98 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.reader; - -import java.io.Serializable; -import java.util.Comparator; -import java.util.concurrent.TimeUnit; -import org.apache.flink.annotation.Internal; -import org.apache.iceberg.Schema; -import org.apache.iceberg.flink.source.split.IcebergSourceSplit; -import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.types.Conversions; -import org.apache.iceberg.types.Type.TypeID; -import org.apache.iceberg.types.Types; - -/** - * {@link SplitWatermarkExtractor} implementation which uses an Iceberg timestamp column statistics - * to get the watermarks for the {@link IcebergSourceSplit}. 
This watermark is emitted by the {@link - * WatermarkExtractorRecordEmitter} along with the actual records. - */ -@Internal -public class ColumnStatsWatermarkExtractor implements SplitWatermarkExtractor, Serializable { - private final int eventTimeFieldId; - private final String eventTimeFieldName; - private final TimeUnit timeUnit; - - /** - * Creates the extractor. - * - * @param schema The schema of the Table - * @param eventTimeFieldName The column which should be used as an event time - * @param timeUnit Used for converting the long value to epoch milliseconds - */ - public ColumnStatsWatermarkExtractor( - Schema schema, String eventTimeFieldName, TimeUnit timeUnit) { - Types.NestedField field = schema.findField(eventTimeFieldName); - TypeID typeID = field.type().typeId(); - Preconditions.checkArgument( - typeID.equals(TypeID.LONG) || typeID.equals(TypeID.TIMESTAMP), - "Found %s, expected a LONG or TIMESTAMP column for watermark generation.", - typeID); - this.eventTimeFieldId = field.fieldId(); - this.eventTimeFieldName = eventTimeFieldName; - // Use the timeUnit only for Long columns. - this.timeUnit = typeID.equals(TypeID.LONG) ? timeUnit : TimeUnit.MICROSECONDS; - } - - @VisibleForTesting - ColumnStatsWatermarkExtractor(int eventTimeFieldId, String eventTimeFieldName) { - this.eventTimeFieldId = eventTimeFieldId; - this.eventTimeFieldName = eventTimeFieldName; - this.timeUnit = TimeUnit.MICROSECONDS; - } - - /** - * Get the watermark for a split using column statistics. - * - * @param split The split - * @return The watermark - * @throws IllegalArgumentException if there is no statistics for the column - */ - @Override - public long extractWatermark(IcebergSourceSplit split) { - return split.task().files().stream() - .map( - scanTask -> { - Preconditions.checkArgument( - scanTask.file().lowerBounds() != null - && scanTask.file().lowerBounds().get(eventTimeFieldId) != null, - "Missing statistics for column name = %s in file = %s", - eventTimeFieldName, - eventTimeFieldId, - scanTask.file()); - return timeUnit.toMillis( - Conversions.fromByteBuffer( - Types.LongType.get(), scanTask.file().lowerBounds().get(eventTimeFieldId))); - }) - .min(Comparator.comparingLong(l -> l)) - .get(); - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorBatcher.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorBatcher.java deleted file mode 100644 index c376e359c600..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorBatcher.java +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source.reader; - -import java.io.Serializable; -import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; -import org.apache.flink.connector.base.source.reader.splitreader.SplitReader; -import org.apache.iceberg.flink.source.DataIterator; -import org.apache.iceberg.io.CloseableIterator; - -/** - * Batcher converts iterator of T into iterator of batched {@code - * RecordsWithSplitIds>}, as FLIP-27's {@link SplitReader#fetch()} returns - * batched records. - */ -@FunctionalInterface -public interface DataIteratorBatcher extends Serializable { - CloseableIterator>> batch( - String splitId, DataIterator inputIterator); -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorReaderFunction.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorReaderFunction.java deleted file mode 100644 index bbf797ef4aa8..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorReaderFunction.java +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.reader; - -import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; -import org.apache.iceberg.flink.source.DataIterator; -import org.apache.iceberg.flink.source.split.IcebergSourceSplit; -import org.apache.iceberg.io.CloseableIterator; - -/** A {@link ReaderFunction} implementation that uses {@link DataIterator}. */ -public abstract class DataIteratorReaderFunction implements ReaderFunction { - private final DataIteratorBatcher batcher; - - public DataIteratorReaderFunction(DataIteratorBatcher batcher) { - this.batcher = batcher; - } - - protected abstract DataIterator createDataIterator(IcebergSourceSplit split); - - @Override - public CloseableIterator>> apply( - IcebergSourceSplit split) { - DataIterator inputIterator = createDataIterator(split); - inputIterator.seek(split.fileOffset(), split.recordOffset()); - return batcher.batch(split.splitId(), inputIterator); - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceReader.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceReader.java deleted file mode 100644 index f143b8d2df2e..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceReader.java +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.reader; - -import java.util.Collection; -import java.util.Collections; -import java.util.Map; -import org.apache.flink.annotation.Internal; -import org.apache.flink.api.connector.source.SourceReaderContext; -import org.apache.flink.connector.base.source.reader.SingleThreadMultiplexSourceReaderBase; -import org.apache.iceberg.flink.source.split.IcebergSourceSplit; -import org.apache.iceberg.flink.source.split.SerializableComparator; -import org.apache.iceberg.flink.source.split.SplitRequestEvent; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; - -@Internal -public class IcebergSourceReader - extends SingleThreadMultiplexSourceReaderBase< - RecordAndPosition, T, IcebergSourceSplit, IcebergSourceSplit> { - - public IcebergSourceReader( - SerializableRecordEmitter emitter, - IcebergSourceReaderMetrics metrics, - ReaderFunction readerFunction, - SerializableComparator splitComparator, - SourceReaderContext context) { - super( - () -> new IcebergSourceSplitReader<>(metrics, readerFunction, splitComparator, context), - emitter, - context.getConfiguration(), - context); - } - - @Override - public void start() { - // We request a split only if we did not get splits during the checkpoint restore. - // Otherwise, reader restarts will keep requesting more and more splits. - if (getNumberOfCurrentlyAssignedSplits() == 0) { - requestSplit(Collections.emptyList()); - } - } - - @Override - protected void onSplitFinished(Map finishedSplitIds) { - requestSplit(Lists.newArrayList(finishedSplitIds.keySet())); - } - - @Override - protected IcebergSourceSplit initializedState(IcebergSourceSplit split) { - return split; - } - - @Override - protected IcebergSourceSplit toSplitType(String splitId, IcebergSourceSplit splitState) { - return splitState; - } - - private void requestSplit(Collection finishedSplitIds) { - context.sendSourceEventToCoordinator(new SplitRequestEvent(finishedSplitIds)); - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceReaderMetrics.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceReaderMetrics.java deleted file mode 100644 index 2a3e1dd86b95..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceReaderMetrics.java +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.reader; - -import org.apache.flink.metrics.Counter; -import org.apache.flink.metrics.MetricGroup; - -public class IcebergSourceReaderMetrics { - private final Counter assignedSplits; - private final Counter assignedBytes; - private final Counter finishedSplits; - private final Counter finishedBytes; - private final Counter splitReaderFetchCalls; - - public IcebergSourceReaderMetrics(MetricGroup metrics, String fullTableName) { - MetricGroup readerMetrics = - metrics.addGroup("IcebergSourceReader").addGroup("table", fullTableName); - - this.assignedSplits = readerMetrics.counter("assignedSplits"); - this.assignedBytes = readerMetrics.counter("assignedBytes"); - this.finishedSplits = readerMetrics.counter("finishedSplits"); - this.finishedBytes = readerMetrics.counter("finishedBytes"); - this.splitReaderFetchCalls = readerMetrics.counter("splitReaderFetchCalls"); - } - - public void incrementAssignedSplits(long count) { - assignedSplits.inc(count); - } - - public void incrementAssignedBytes(long count) { - assignedBytes.inc(count); - } - - public void incrementFinishedSplits(long count) { - finishedSplits.inc(count); - } - - public void incrementFinishedBytes(long count) { - finishedBytes.inc(count); - } - - public void incrementSplitReaderFetchCalls(long count) { - splitReaderFetchCalls.inc(count); - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceSplitReader.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceSplitReader.java deleted file mode 100644 index 9c20494fdbcd..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceSplitReader.java +++ /dev/null @@ -1,167 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source.reader; - -import java.io.IOException; -import java.io.UncheckedIOException; -import java.util.Collection; -import java.util.Collections; -import java.util.List; -import java.util.Queue; -import org.apache.flink.api.connector.source.SourceReaderContext; -import org.apache.flink.connector.base.source.reader.RecordsBySplits; -import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; -import org.apache.flink.connector.base.source.reader.splitreader.SplitReader; -import org.apache.flink.connector.base.source.reader.splitreader.SplitsAddition; -import org.apache.flink.connector.base.source.reader.splitreader.SplitsChange; -import org.apache.iceberg.FileScanTask; -import org.apache.iceberg.flink.source.split.IcebergSourceSplit; -import org.apache.iceberg.flink.source.split.SerializableComparator; -import org.apache.iceberg.io.CloseableIterator; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Queues; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -class IcebergSourceSplitReader implements SplitReader, IcebergSourceSplit> { - private static final Logger LOG = LoggerFactory.getLogger(IcebergSourceSplitReader.class); - - private final IcebergSourceReaderMetrics metrics; - private final ReaderFunction openSplitFunction; - private final SerializableComparator splitComparator; - private final int indexOfSubtask; - private final Queue splits; - - private CloseableIterator>> currentReader; - private IcebergSourceSplit currentSplit; - private String currentSplitId; - - IcebergSourceSplitReader( - IcebergSourceReaderMetrics metrics, - ReaderFunction openSplitFunction, - SerializableComparator splitComparator, - SourceReaderContext context) { - this.metrics = metrics; - this.openSplitFunction = openSplitFunction; - this.splitComparator = splitComparator; - this.indexOfSubtask = context.getIndexOfSubtask(); - this.splits = Queues.newArrayDeque(); - } - - /** - * The method reads a batch of records from the assigned splits. If all the records from the - * current split are returned then it will emit a {@link ArrayBatchRecords#finishedSplit(String)} - * batch to signal this event. In the next fetch loop the reader will continue with the next split - * (if any). - * - * @return The fetched records - * @throws IOException If there is an error during reading - */ - @Override - public RecordsWithSplitIds> fetch() throws IOException { - metrics.incrementSplitReaderFetchCalls(1); - if (currentReader == null) { - IcebergSourceSplit nextSplit = splits.poll(); - if (nextSplit != null) { - currentSplit = nextSplit; - currentSplitId = nextSplit.splitId(); - currentReader = openSplitFunction.apply(currentSplit); - } else { - // return an empty result, which will lead to split fetch to be idle. - // SplitFetcherManager will then close idle fetcher. 
- return new RecordsBySplits(Collections.emptyMap(), Collections.emptySet()); - } - } - - if (currentReader.hasNext()) { - // Because Iterator#next() doesn't support checked exception, - // we need to wrap and unwrap the checked IOException with UncheckedIOException - try { - return currentReader.next(); - } catch (UncheckedIOException e) { - throw e.getCause(); - } - } else { - return finishSplit(); - } - } - - @Override - public void handleSplitsChanges(SplitsChange splitsChange) { - if (!(splitsChange instanceof SplitsAddition)) { - throw new UnsupportedOperationException( - String.format("Unsupported split change: %s", splitsChange.getClass())); - } - - if (splitComparator != null) { - List newSplits = Lists.newArrayList(splitsChange.splits()); - newSplits.sort(splitComparator); - LOG.info("Add {} splits to reader: {}", newSplits.size(), newSplits); - splits.addAll(newSplits); - } else { - LOG.info("Add {} splits to reader", splitsChange.splits().size()); - splits.addAll(splitsChange.splits()); - } - metrics.incrementAssignedSplits(splitsChange.splits().size()); - metrics.incrementAssignedBytes(calculateBytes(splitsChange)); - } - - @Override - public void wakeUp() {} - - @Override - public void close() throws Exception { - currentSplitId = null; - if (currentReader != null) { - currentReader.close(); - } - } - - @Override - public void pauseOrResumeSplits( - Collection splitsToPause, Collection splitsToResume) { - // IcebergSourceSplitReader only reads splits sequentially. When waiting for watermark alignment - // the SourceOperator will stop processing and recycling the fetched batches. This exhausts the - // {@link ArrayPoolDataIteratorBatcher#pool} and the `currentReader.next()` call will be - // blocked even without split-level watermark alignment. Based on this the - // `pauseOrResumeSplits` and the `wakeUp` are left empty. - } - - private long calculateBytes(IcebergSourceSplit split) { - return split.task().files().stream().map(FileScanTask::length).reduce(0L, Long::sum); - } - - private long calculateBytes(SplitsChange splitsChanges) { - return splitsChanges.splits().stream().map(this::calculateBytes).reduce(0L, Long::sum); - } - - private ArrayBatchRecords finishSplit() throws IOException { - if (currentReader != null) { - currentReader.close(); - currentReader = null; - } - - ArrayBatchRecords finishRecords = ArrayBatchRecords.finishedSplit(currentSplitId); - LOG.info("Split reader {} finished split: {}", indexOfSubtask, currentSplitId); - metrics.incrementFinishedSplits(1); - metrics.incrementFinishedBytes(calculateBytes(currentSplit)); - currentSplitId = null; - return finishRecords; - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/LimitableDataIterator.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/LimitableDataIterator.java deleted file mode 100644 index 020e87646d05..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/LimitableDataIterator.java +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.reader; - -import org.apache.iceberg.CombinedScanTask; -import org.apache.iceberg.encryption.EncryptionManager; -import org.apache.iceberg.flink.source.DataIterator; -import org.apache.iceberg.flink.source.FileScanTaskReader; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; - -class LimitableDataIterator extends DataIterator { - private final RecordLimiter limiter; - - LimitableDataIterator( - FileScanTaskReader fileScanTaskReader, - CombinedScanTask task, - FileIO io, - EncryptionManager encryption, - RecordLimiter limiter) { - super(fileScanTaskReader, task, io, encryption); - Preconditions.checkArgument(limiter != null, "Invalid record limiter: null"); - this.limiter = limiter; - } - - @Override - public boolean hasNext() { - if (limiter.reachedLimit()) { - return false; - } - - return super.hasNext(); - } - - @Override - public T next() { - limiter.increment(); - return super.next(); - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/ListBatchRecords.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/ListBatchRecords.java deleted file mode 100644 index 1acb3df76102..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/ListBatchRecords.java +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source.reader; - -import java.util.Collections; -import java.util.List; -import java.util.Set; -import javax.annotation.Nullable; -import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; - -class ListBatchRecords implements RecordsWithSplitIds> { - private String splitId; - private final List records; - private final Set finishedSplits; - private final RecordAndPosition recordAndPosition; - - // point to current read position within the records list - private int position; - - ListBatchRecords( - String splitId, - List records, - int fileOffset, - long startingRecordOffset, - Set finishedSplits) { - this.splitId = splitId; - this.records = records; - this.finishedSplits = - Preconditions.checkNotNull(finishedSplits, "finishedSplits can be empty but not null"); - this.recordAndPosition = new RecordAndPosition<>(); - this.recordAndPosition.set(null, fileOffset, startingRecordOffset); - - this.position = 0; - } - - @Nullable - @Override - public String nextSplit() { - String nextSplit = this.splitId; - // set the splitId to null to indicate no more splits - // this class only contains record for one split - this.splitId = null; - return nextSplit; - } - - @Nullable - @Override - public RecordAndPosition nextRecordFromSplit() { - if (position < records.size()) { - recordAndPosition.record(records.get(position)); - position++; - return recordAndPosition; - } else { - return null; - } - } - - @Override - public Set finishedSplits() { - return finishedSplits; - } - - public static ListBatchRecords forRecords( - String splitId, List records, int fileOffset, long startingRecordOffset) { - return new ListBatchRecords<>( - splitId, records, fileOffset, startingRecordOffset, Collections.emptySet()); - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/ListDataIteratorBatcher.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/ListDataIteratorBatcher.java deleted file mode 100644 index 365416239d37..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/ListDataIteratorBatcher.java +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source.reader; - -import java.io.IOException; -import java.util.List; -import java.util.NoSuchElementException; -import org.apache.flink.configuration.ReadableConfig; -import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; -import org.apache.iceberg.flink.FlinkConfigOptions; -import org.apache.iceberg.flink.source.DataIterator; -import org.apache.iceberg.io.CloseableIterator; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; - -/** - * FlinkRecordReaderFunction essentially cloned objects already. So there is no need to use array - * pool to clone objects. Simply create a new ArrayList for each batch. - */ -class ListDataIteratorBatcher implements DataIteratorBatcher { - - private final int batchSize; - - ListDataIteratorBatcher(ReadableConfig config) { - this.batchSize = config.get(FlinkConfigOptions.SOURCE_READER_FETCH_BATCH_RECORD_COUNT); - } - - @Override - public CloseableIterator>> batch( - String splitId, DataIterator dataIterator) { - return new ListBatchIterator(splitId, dataIterator); - } - - private class ListBatchIterator - implements CloseableIterator>> { - - private final String splitId; - private final DataIterator inputIterator; - - ListBatchIterator(String splitId, DataIterator inputIterator) { - this.splitId = splitId; - this.inputIterator = inputIterator; - } - - @Override - public boolean hasNext() { - return inputIterator.hasNext(); - } - - @Override - public RecordsWithSplitIds> next() { - if (!inputIterator.hasNext()) { - throw new NoSuchElementException(); - } - - final List batch = Lists.newArrayListWithCapacity(batchSize); - int recordCount = 0; - while (inputIterator.hasNext() && recordCount < batchSize) { - T nextRecord = inputIterator.next(); - batch.add(nextRecord); - recordCount++; - if (!inputIterator.currentFileHasNext()) { - // break early so that records have the same fileOffset. - break; - } - } - - return ListBatchRecords.forRecords( - splitId, batch, inputIterator.fileOffset(), inputIterator.recordOffset() - recordCount); - } - - @Override - public void close() throws IOException { - if (inputIterator != null) { - inputIterator.close(); - } - } - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/MetaDataReaderFunction.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/MetaDataReaderFunction.java deleted file mode 100644 index fb4466913b90..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/MetaDataReaderFunction.java +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source.reader; - -import org.apache.flink.annotation.Internal; -import org.apache.flink.configuration.ReadableConfig; -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.Schema; -import org.apache.iceberg.encryption.EncryptionManager; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.source.DataIterator; -import org.apache.iceberg.flink.source.DataTaskReader; -import org.apache.iceberg.flink.source.split.IcebergSourceSplit; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; - -/** Reading metadata tables (like snapshots, manifests, etc.) */ -@Internal -public class MetaDataReaderFunction extends DataIteratorReaderFunction { - private final Schema readSchema; - private final FileIO io; - private final EncryptionManager encryption; - - public MetaDataReaderFunction( - ReadableConfig config, - Schema tableSchema, - Schema projectedSchema, - FileIO io, - EncryptionManager encryption) { - super( - new ArrayPoolDataIteratorBatcher<>( - config, - new RowDataRecordFactory( - FlinkSchemaUtil.convert(readSchema(tableSchema, projectedSchema))))); - this.readSchema = readSchema(tableSchema, projectedSchema); - this.io = io; - this.encryption = encryption; - } - - @Override - public DataIterator createDataIterator(IcebergSourceSplit split) { - return new DataIterator<>(new DataTaskReader(readSchema), split.task(), io, encryption); - } - - private static Schema readSchema(Schema tableSchema, Schema projectedSchema) { - Preconditions.checkNotNull(tableSchema, "Table schema can't be null"); - return projectedSchema == null ? tableSchema : projectedSchema; - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/ReaderFunction.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/ReaderFunction.java deleted file mode 100644 index 1ea91f10b4e7..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/ReaderFunction.java +++ /dev/null @@ -1,31 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source.reader; - -import java.io.Serializable; -import java.util.function.Function; -import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; -import org.apache.iceberg.flink.source.split.IcebergSourceSplit; -import org.apache.iceberg.io.CloseableIterator; - -@FunctionalInterface -public interface ReaderFunction - extends Serializable, - Function< - IcebergSourceSplit, CloseableIterator>>> {} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordAndPosition.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordAndPosition.java deleted file mode 100644 index 6ac92592b6aa..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordAndPosition.java +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.reader; - -import org.apache.flink.annotation.Internal; - -/** - * A record along with the reader position to be stored in the checkpoint. - * - *
    The position defines the point in the reader AFTER the record. Record processing and updating - * checkpointed state happens atomically. The position points to where the reader should resume - * after this record is processed. - * - *
    This mutable object is useful in cases where only one instance of a {@code RecordAndPosition} - * is needed at a time. Then the same instance of RecordAndPosition can be reused. - */ -@Internal -public class RecordAndPosition { - private T record; - private int fileOffset; - private long recordOffset; - - public RecordAndPosition(T record, int fileOffset, long recordOffset) { - this.record = record; - this.fileOffset = fileOffset; - this.recordOffset = recordOffset; - } - - public RecordAndPosition() {} - - // ------------------------------------------------------------------------ - - public T record() { - return record; - } - - public int fileOffset() { - return fileOffset; - } - - public long recordOffset() { - return recordOffset; - } - - /** Updates the record and position in this object. */ - public void set(T newRecord, int newFileOffset, long newRecordOffset) { - this.record = newRecord; - this.fileOffset = newFileOffset; - this.recordOffset = newRecordOffset; - } - - /** Sets the next record of a sequence. This increments the {@code recordOffset} by one. */ - public void record(T nextRecord) { - this.record = nextRecord; - this.recordOffset++; - } - - @Override - public String toString() { - return String.format("%s @ %d + %d", record, fileOffset, recordOffset); - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordFactory.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordFactory.java deleted file mode 100644 index ef92e2e6b81f..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordFactory.java +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.reader; - -import java.io.Serializable; - -/** - * In FLIP-27 source, SplitReader#fetch() returns a batch of records. Since DataIterator for RowData - * returns an iterator of reused RowData objects, RecordFactory is needed to (1) create object array - * that is recyclable via pool. (2) clone RowData element from DataIterator to the batch array. 
- */ -interface RecordFactory extends Serializable { - /** Create a batch of records */ - T[] createBatch(int batchSize); - - /** Clone record into the specified position of the batch array */ - void clone(T from, T[] batch, int position); -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordLimiter.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordLimiter.java deleted file mode 100644 index f260a53089ff..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordLimiter.java +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.reader; - -import java.util.concurrent.atomic.AtomicLong; -import org.apache.flink.annotation.Internal; - -@Internal -class RecordLimiter { - private final long limit; - private final AtomicLong counter; - - static RecordLimiter create(long limit) { - return new RecordLimiter(limit); - } - - private RecordLimiter(long limit) { - this.limit = limit; - this.counter = new AtomicLong(0); - } - - public boolean reachedLimit() { - return limit > 0 && counter.get() >= limit; - } - - public void increment() { - counter.incrementAndGet(); - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataReaderFunction.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataReaderFunction.java deleted file mode 100644 index c9208a0e1834..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataReaderFunction.java +++ /dev/null @@ -1,115 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source.reader; - -import java.util.List; -import org.apache.flink.configuration.ReadableConfig; -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.Schema; -import org.apache.iceberg.encryption.EncryptionManager; -import org.apache.iceberg.expressions.Expression; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.source.DataIterator; -import org.apache.iceberg.flink.source.RowDataFileScanTaskReader; -import org.apache.iceberg.flink.source.split.IcebergSourceSplit; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; - -public class RowDataReaderFunction extends DataIteratorReaderFunction { - private final Schema tableSchema; - private final Schema readSchema; - private final String nameMapping; - private final boolean caseSensitive; - private final FileIO io; - private final EncryptionManager encryption; - private final List filters; - private final long limit; - - private transient RecordLimiter recordLimiter = null; - - public RowDataReaderFunction( - ReadableConfig config, - Schema tableSchema, - Schema projectedSchema, - String nameMapping, - boolean caseSensitive, - FileIO io, - EncryptionManager encryption, - List filters) { - this( - config, - tableSchema, - projectedSchema, - nameMapping, - caseSensitive, - io, - encryption, - filters, - -1L); - } - - public RowDataReaderFunction( - ReadableConfig config, - Schema tableSchema, - Schema projectedSchema, - String nameMapping, - boolean caseSensitive, - FileIO io, - EncryptionManager encryption, - List filters, - long limit) { - super( - new ArrayPoolDataIteratorBatcher<>( - config, - new RowDataRecordFactory( - FlinkSchemaUtil.convert(readSchema(tableSchema, projectedSchema))))); - this.tableSchema = tableSchema; - this.readSchema = readSchema(tableSchema, projectedSchema); - this.nameMapping = nameMapping; - this.caseSensitive = caseSensitive; - this.io = io; - this.encryption = encryption; - this.filters = filters; - this.limit = limit; - } - - @Override - public DataIterator createDataIterator(IcebergSourceSplit split) { - return new LimitableDataIterator<>( - new RowDataFileScanTaskReader(tableSchema, readSchema, nameMapping, caseSensitive, filters), - split.task(), - io, - encryption, - lazyLimiter()); - } - - private static Schema readSchema(Schema tableSchema, Schema projectedSchema) { - Preconditions.checkNotNull(tableSchema, "Table schema can't be null"); - return projectedSchema == null ? tableSchema : projectedSchema; - } - - /** Lazily create RecordLimiter to avoid the need to make it serializable */ - private RecordLimiter lazyLimiter() { - if (recordLimiter == null) { - this.recordLimiter = RecordLimiter.create(limit); - } - - return recordLimiter; - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataRecordFactory.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataRecordFactory.java deleted file mode 100644 index 40d5c28d7bc7..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataRecordFactory.java +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.reader; - -import org.apache.flink.api.common.typeutils.TypeSerializer; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.runtime.typeutils.InternalSerializers; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.flink.data.RowDataUtil; - -class RowDataRecordFactory implements RecordFactory { - private final RowType rowType; - private final TypeSerializer[] fieldSerializers; - private final RowData.FieldGetter[] fieldGetters; - - RowDataRecordFactory(RowType rowType) { - this.rowType = rowType; - this.fieldSerializers = createFieldSerializers(rowType); - this.fieldGetters = createFieldGetters(rowType); - } - - static TypeSerializer[] createFieldSerializers(RowType rowType) { - return rowType.getChildren().stream() - .map(InternalSerializers::create) - .toArray(TypeSerializer[]::new); - } - - static RowData.FieldGetter[] createFieldGetters(RowType rowType) { - RowData.FieldGetter[] fieldGetters = new RowData.FieldGetter[rowType.getFieldCount()]; - for (int i = 0; i < rowType.getFieldCount(); ++i) { - fieldGetters[i] = RowData.createFieldGetter(rowType.getTypeAt(i), i); - } - - return fieldGetters; - } - - @Override - public RowData[] createBatch(int batchSize) { - RowData[] arr = new RowData[batchSize]; - for (int i = 0; i < batchSize; ++i) { - arr[i] = new GenericRowData(rowType.getFieldCount()); - } - return arr; - } - - @Override - public void clone(RowData from, RowData[] batch, int position) { - // Set the return value from RowDataUtil.clone back to the array. - // Clone method returns same clone target object (reused) if it is a GenericRowData. - // Clone method will allocate a new GenericRowData object - // if the target object is NOT a GenericRowData. - // So we should always set the clone return value back to the array. - batch[position] = - RowDataUtil.clone(from, batch[position], rowType, fieldSerializers, fieldGetters); - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/SerializableRecordEmitter.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/SerializableRecordEmitter.java deleted file mode 100644 index a6e2c1dae243..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/SerializableRecordEmitter.java +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.reader; - -import java.io.Serializable; -import org.apache.flink.annotation.Internal; -import org.apache.flink.connector.base.source.reader.RecordEmitter; -import org.apache.iceberg.flink.source.split.IcebergSourceSplit; - -@Internal -@FunctionalInterface -public interface SerializableRecordEmitter - extends RecordEmitter, T, IcebergSourceSplit>, Serializable { - static SerializableRecordEmitter defaultEmitter() { - return (element, output, split) -> { - output.collect(element.record()); - split.updatePosition(element.fileOffset(), element.recordOffset()); - }; - } - - static SerializableRecordEmitter emitterWithWatermark(SplitWatermarkExtractor extractor) { - return new WatermarkExtractorRecordEmitter<>(extractor); - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/SplitWatermarkExtractor.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/SplitWatermarkExtractor.java deleted file mode 100644 index d1c50ac8ca52..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/SplitWatermarkExtractor.java +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.reader; - -import java.io.Serializable; -import org.apache.iceberg.flink.source.split.IcebergSourceSplit; - -/** The interface used to extract watermarks from splits. */ -public interface SplitWatermarkExtractor extends Serializable { - /** Get the watermark for a split. */ - long extractWatermark(IcebergSourceSplit split); -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/WatermarkExtractorRecordEmitter.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/WatermarkExtractorRecordEmitter.java deleted file mode 100644 index 02ef57d344b1..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/reader/WatermarkExtractorRecordEmitter.java +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.reader; - -import org.apache.flink.api.common.eventtime.Watermark; -import org.apache.flink.api.connector.source.SourceOutput; -import org.apache.iceberg.flink.source.split.IcebergSourceSplit; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Emitter which emits the watermarks, records and updates the split position. - * - *
    The Emitter emits watermarks at the beginning of every split provided by the {@link - * SplitWatermarkExtractor}. - */ -class WatermarkExtractorRecordEmitter implements SerializableRecordEmitter { - private static final Logger LOG = LoggerFactory.getLogger(WatermarkExtractorRecordEmitter.class); - private final SplitWatermarkExtractor timeExtractor; - private String lastSplitId = null; - private long watermark; - - WatermarkExtractorRecordEmitter(SplitWatermarkExtractor timeExtractor) { - this.timeExtractor = timeExtractor; - } - - @Override - public void emitRecord( - RecordAndPosition element, SourceOutput output, IcebergSourceSplit split) { - if (!split.splitId().equals(lastSplitId)) { - long newWatermark = timeExtractor.extractWatermark(split); - if (newWatermark < watermark) { - LOG.info( - "Received a new split with lower watermark. Previous watermark = {}, current watermark = {}, previous split = {}, current split = {}", - watermark, - newWatermark, - lastSplitId, - split.splitId()); - } else { - watermark = newWatermark; - output.emitWatermark(new Watermark(watermark)); - LOG.debug("Watermark = {} emitted based on split = {}", watermark, lastSplitId); - } - - lastSplitId = split.splitId(); - } - - output.collect(element.record()); - split.updatePosition(element.fileOffset(), element.recordOffset()); - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplit.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplit.java deleted file mode 100644 index 344f64833b62..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplit.java +++ /dev/null @@ -1,220 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source.split; - -import java.io.IOException; -import java.io.Serializable; -import java.util.Collection; -import java.util.List; -import java.util.stream.Collectors; -import javax.annotation.Nullable; -import org.apache.flink.annotation.Internal; -import org.apache.flink.api.connector.source.SourceSplit; -import org.apache.flink.core.memory.DataInputDeserializer; -import org.apache.flink.core.memory.DataOutputSerializer; -import org.apache.flink.util.InstantiationUtil; -import org.apache.iceberg.BaseCombinedScanTask; -import org.apache.iceberg.CombinedScanTask; -import org.apache.iceberg.FileScanTask; -import org.apache.iceberg.ScanTaskParser; -import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Iterables; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; - -@Internal -public class IcebergSourceSplit implements SourceSplit, Serializable { - private static final long serialVersionUID = 1L; - private static final ThreadLocal SERIALIZER_CACHE = - ThreadLocal.withInitial(() -> new DataOutputSerializer(1024)); - - private final CombinedScanTask task; - - private int fileOffset; - private long recordOffset; - - // The splits are frequently serialized into checkpoints. - // Caching the byte representation makes repeated serialization cheap. - @Nullable private transient byte[] serializedBytesCache; - - private IcebergSourceSplit(CombinedScanTask task, int fileOffset, long recordOffset) { - this.task = task; - this.fileOffset = fileOffset; - this.recordOffset = recordOffset; - } - - public static IcebergSourceSplit fromCombinedScanTask(CombinedScanTask combinedScanTask) { - return fromCombinedScanTask(combinedScanTask, 0, 0L); - } - - public static IcebergSourceSplit fromCombinedScanTask( - CombinedScanTask combinedScanTask, int fileOffset, long recordOffset) { - return new IcebergSourceSplit(combinedScanTask, fileOffset, recordOffset); - } - - public CombinedScanTask task() { - return task; - } - - public int fileOffset() { - return fileOffset; - } - - public long recordOffset() { - return recordOffset; - } - - @Override - public String splitId() { - return MoreObjects.toStringHelper(this).add("files", toString(task.files())).toString(); - } - - public void updatePosition(int newFileOffset, long newRecordOffset) { - // invalidate the cache after position change - serializedBytesCache = null; - fileOffset = newFileOffset; - recordOffset = newRecordOffset; - } - - @Override - public String toString() { - return MoreObjects.toStringHelper(this) - .add("files", toString(task.files())) - .add("fileOffset", fileOffset) - .add("recordOffset", recordOffset) - .toString(); - } - - private String toString(Collection files) { - return Iterables.toString( - files.stream() - .map( - fileScanTask -> - MoreObjects.toStringHelper(fileScanTask) - .add("file", fileScanTask.file().path().toString()) - .add("start", fileScanTask.start()) - .add("length", fileScanTask.length()) - .toString()) - .collect(Collectors.toList())); - } - - byte[] serializeV1() throws IOException { - if (serializedBytesCache == null) { - serializedBytesCache = InstantiationUtil.serializeObject(this); - } - - return serializedBytesCache; - } - - static IcebergSourceSplit deserializeV1(byte[] serialized) throws IOException { - try { - return InstantiationUtil.deserializeObject( - serialized, 
IcebergSourceSplit.class.getClassLoader()); - } catch (ClassNotFoundException e) { - throw new RuntimeException("Failed to deserialize the split.", e); - } - } - - byte[] serializeV2() throws IOException { - return serialize(2); - } - - byte[] serializeV3() throws IOException { - return serialize(3); - } - - private byte[] serialize(int version) throws IOException { - if (serializedBytesCache == null) { - DataOutputSerializer out = SERIALIZER_CACHE.get(); - Collection fileScanTasks = task.tasks(); - Preconditions.checkArgument( - fileOffset >= 0 && fileOffset < fileScanTasks.size(), - "Invalid file offset: %s. Should be within the range of [0, %s)", - fileOffset, - fileScanTasks.size()); - - out.writeInt(fileOffset); - out.writeLong(recordOffset); - out.writeInt(fileScanTasks.size()); - - for (FileScanTask fileScanTask : fileScanTasks) { - String taskJson = ScanTaskParser.toJson(fileScanTask); - writeTaskJson(out, taskJson, version); - } - - serializedBytesCache = out.getCopyOfBuffer(); - out.clear(); - } - - return serializedBytesCache; - } - - private static void writeTaskJson(DataOutputSerializer out, String taskJson, int version) - throws IOException { - switch (version) { - case 2: - out.writeUTF(taskJson); - break; - case 3: - SerializerHelper.writeLongUTF(out, taskJson); - break; - default: - throw new IllegalArgumentException("Unsupported version: " + version); - } - } - - static IcebergSourceSplit deserializeV2(byte[] serialized, boolean caseSensitive) - throws IOException { - return deserialize(serialized, caseSensitive, 2); - } - - static IcebergSourceSplit deserializeV3(byte[] serialized, boolean caseSensitive) - throws IOException { - return deserialize(serialized, caseSensitive, 3); - } - - private static IcebergSourceSplit deserialize( - byte[] serialized, boolean caseSensitive, int version) throws IOException { - DataInputDeserializer in = new DataInputDeserializer(serialized); - int fileOffset = in.readInt(); - long recordOffset = in.readLong(); - int taskCount = in.readInt(); - - List tasks = Lists.newArrayListWithCapacity(taskCount); - for (int i = 0; i < taskCount; ++i) { - String taskJson = readTaskJson(in, version); - FileScanTask task = ScanTaskParser.fromJson(taskJson, caseSensitive); - tasks.add(task); - } - - CombinedScanTask combinedScanTask = new BaseCombinedScanTask(tasks); - return IcebergSourceSplit.fromCombinedScanTask(combinedScanTask, fileOffset, recordOffset); - } - - private static String readTaskJson(DataInputDeserializer in, int version) throws IOException { - switch (version) { - case 2: - return in.readUTF(); - case 3: - return SerializerHelper.readLongUTF(in); - default: - throw new IllegalArgumentException("Unsupported version: " + version); - } - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitSerializer.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitSerializer.java deleted file mode 100644 index d4b0f9e1977d..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitSerializer.java +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.split; - -import java.io.IOException; -import org.apache.flink.annotation.Internal; -import org.apache.flink.core.io.SimpleVersionedSerializer; - -@Internal -public class IcebergSourceSplitSerializer implements SimpleVersionedSerializer { - private static final int VERSION = 3; - - private final boolean caseSensitive; - - public IcebergSourceSplitSerializer(boolean caseSensitive) { - this.caseSensitive = caseSensitive; - } - - @Override - public int getVersion() { - return VERSION; - } - - @Override - public byte[] serialize(IcebergSourceSplit split) throws IOException { - return split.serializeV3(); - } - - @Override - public IcebergSourceSplit deserialize(int version, byte[] serialized) throws IOException { - switch (version) { - case 1: - return IcebergSourceSplit.deserializeV1(serialized); - case 2: - return IcebergSourceSplit.deserializeV2(serialized, caseSensitive); - case 3: - return IcebergSourceSplit.deserializeV3(serialized, caseSensitive); - default: - throw new IOException( - String.format( - "Failed to deserialize IcebergSourceSplit. " - + "Encountered unsupported version: %d. Supported version are [1]", - version)); - } - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitState.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitState.java deleted file mode 100644 index d9061e049e00..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitState.java +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source.split; - -public class IcebergSourceSplitState { - private final IcebergSourceSplit split; - private final IcebergSourceSplitStatus status; - - public IcebergSourceSplitState(IcebergSourceSplit split, IcebergSourceSplitStatus status) { - this.split = split; - this.status = status; - } - - public IcebergSourceSplit split() { - return split; - } - - public IcebergSourceSplitStatus status() { - return status; - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitStatus.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitStatus.java deleted file mode 100644 index d4a84a165e1a..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitStatus.java +++ /dev/null @@ -1,25 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.split; - -public enum IcebergSourceSplitStatus { - UNASSIGNED, - ASSIGNED, - COMPLETED -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/split/SerializableComparator.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/split/SerializableComparator.java deleted file mode 100644 index 319648ca275c..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/split/SerializableComparator.java +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source.split; - -import java.io.Serializable; -import java.util.Comparator; - -public interface SerializableComparator extends Comparator, Serializable {} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/split/SerializerHelper.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/split/SerializerHelper.java deleted file mode 100644 index 841969666ee5..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/split/SerializerHelper.java +++ /dev/null @@ -1,206 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.split; - -import java.io.IOException; -import java.io.Serializable; -import java.io.UTFDataFormatException; -import org.apache.flink.core.memory.DataInputDeserializer; -import org.apache.flink.core.memory.DataOutputSerializer; - -/** - * Helper class to serialize and deserialize strings longer than 65K. The inspiration is mostly - * taken from the class org.apache.flink.core.memory.DataInputSerializer.readUTF and - * org.apache.flink.core.memory.DataOutputSerializer.writeUTF. - */ -class SerializerHelper implements Serializable { - - private SerializerHelper() {} - - /** - * Similar to {@link DataOutputSerializer#writeUTF(String)}. Except this supports larger payloads - * which is up to max integer value. - * - *
    Note: This method can be removed when the method which does similar thing within the {@link - * DataOutputSerializer} already which does the same thing, so use that one instead once that is - * released on Flink version 1.20. - * - *
    See * FLINK-34228 * https://github.com/apache/flink/pull/24191 - * - * @param out the output stream to write the string to. - * @param str the string value to be written. - */ - public static void writeLongUTF(DataOutputSerializer out, String str) throws IOException { - int strlen = str.length(); - long utflen = 0; - int ch; - - /* use charAt instead of copying String to char array */ - for (int i = 0; i < strlen; i++) { - ch = str.charAt(i); - utflen += getUTFBytesSize(ch); - - if (utflen > Integer.MAX_VALUE) { - throw new UTFDataFormatException("Encoded string reached maximum length: " + utflen); - } - } - - if (utflen > Integer.MAX_VALUE - 4) { - throw new UTFDataFormatException("Encoded string is too long: " + utflen); - } - - out.writeInt((int) utflen); - writeUTFBytes(out, str, (int) utflen); - } - - /** - * Similar to {@link DataInputDeserializer#readUTF()}. Except this supports larger payloads which - * is up to max integer value. - * - *
    Note: This method can be removed once an equivalent method is available in - * {@link DataOutputSerializer}; use that method instead once it is released in Flink 1.20. - * - *
    See * FLINK-34228 * https://github.com/apache/flink/pull/24191 - * - * @param in the input stream to read the string from. - * @return the string value read from the input stream. - * @throws IOException if an I/O error occurs when reading from the input stream. - */ - public static String readLongUTF(DataInputDeserializer in) throws IOException { - int utflen = in.readInt(); - byte[] bytearr = new byte[utflen]; - char[] chararr = new char[utflen]; - - int ch; - int char2; - int char3; - int count = 0; - int chararrCount = 0; - - in.readFully(bytearr, 0, utflen); - - while (count < utflen) { - ch = (int) bytearr[count] & 0xff; - if (ch > 127) { - break; - } - count++; - chararr[chararrCount++] = (char) ch; - } - - while (count < utflen) { - ch = (int) bytearr[count] & 0xff; - switch (ch >> 4) { - case 0: - case 1: - case 2: - case 3: - case 4: - case 5: - case 6: - case 7: - /* 0xxxxxxx */ - count++; - chararr[chararrCount++] = (char) ch; - break; - case 12: - case 13: - /* 110x xxxx 10xx xxxx */ - count += 2; - if (count > utflen) { - throw new UTFDataFormatException("malformed input: partial character at end"); - } - char2 = bytearr[count - 1]; - if ((char2 & 0xC0) != 0x80) { - throw new UTFDataFormatException("malformed input around byte " + count); - } - chararr[chararrCount++] = (char) (((ch & 0x1F) << 6) | (char2 & 0x3F)); - break; - case 14: - /* 1110 xxxx 10xx xxxx 10xx xxxx */ - count += 3; - if (count > utflen) { - throw new UTFDataFormatException("malformed input: partial character at end"); - } - char2 = bytearr[count - 2]; - char3 = bytearr[count - 1]; - if (((char2 & 0xC0) != 0x80) || ((char3 & 0xC0) != 0x80)) { - throw new UTFDataFormatException("malformed input around byte " + (count - 1)); - } - chararr[chararrCount++] = - (char) (((ch & 0x0F) << 12) | ((char2 & 0x3F) << 6) | (char3 & 0x3F)); - break; - default: - /* 10xx xxxx, 1111 xxxx */ - throw new UTFDataFormatException("malformed input around byte " + count); - } - } - // The number of chars produced may be less than utflen - return new String(chararr, 0, chararrCount); - } - - private static int getUTFBytesSize(int ch) { - if ((ch >= 0x0001) && (ch <= 0x007F)) { - return 1; - } else if (ch > 0x07FF) { - return 3; - } else { - return 2; - } - } - - private static void writeUTFBytes(DataOutputSerializer out, String str, int utflen) - throws IOException { - int strlen = str.length(); - int ch; - - int len = Math.max(1024, utflen); - - byte[] bytearr = new byte[len]; - int count = 0; - - int index; - for (index = 0; index < strlen; index++) { - ch = str.charAt(index); - if (!((ch >= 0x0001) && (ch <= 0x007F))) { - break; - } - bytearr[count++] = (byte) ch; - } - - for (; index < strlen; index++) { - ch = str.charAt(index); - if ((ch >= 0x0001) && (ch <= 0x007F)) { - bytearr[count++] = (byte) ch; - } else if (ch > 0x07FF) { - bytearr[count++] = (byte) (0xE0 | ((ch >> 12) & 0x0F)); - bytearr[count++] = (byte) (0x80 | ((ch >> 6) & 0x3F)); - bytearr[count++] = (byte) (0x80 | (ch & 0x3F)); - } else { - bytearr[count++] = (byte) (0xC0 | ((ch >> 6) & 0x1F)); - bytearr[count++] = (byte) (0x80 | (ch & 0x3F)); - } - } - - out.write(bytearr, 0, count); - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/split/SplitComparators.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/split/SplitComparators.java deleted file mode 100644 index 56ee92014d12..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/split/SplitComparators.java +++ /dev/null @@ 
-1,76 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.split; - -import org.apache.iceberg.flink.source.reader.SplitWatermarkExtractor; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; - -/** - * Provides implementations of {@link org.apache.iceberg.flink.source.split.SerializableComparator} - * which could be used for ordering splits. These are used by the {@link - * org.apache.iceberg.flink.source.assigner.OrderedSplitAssignerFactory} and the {@link - * org.apache.iceberg.flink.source.reader.IcebergSourceReader} - */ -public class SplitComparators { - private SplitComparators() {} - - /** Comparator which orders the splits based on the file sequence number of the data files */ - public static SerializableComparator fileSequenceNumber() { - return (IcebergSourceSplit o1, IcebergSourceSplit o2) -> { - Preconditions.checkArgument( - o1.task().files().size() == 1 && o2.task().files().size() == 1, - "Could not compare combined task. Please use 'split-open-file-cost' to prevent combining multiple files to a split"); - - Long seq1 = o1.task().files().iterator().next().file().fileSequenceNumber(); - Long seq2 = o2.task().files().iterator().next().file().fileSequenceNumber(); - - Preconditions.checkNotNull( - seq1, - "Invalid file sequence number: null. Doesn't support splits written with V1 format: %s", - o1); - Preconditions.checkNotNull( - seq2, - "Invalid file sequence number: null. Doesn't support splits written with V1 format: %s", - o2); - - int temp = Long.compare(seq1, seq2); - if (temp != 0) { - return temp; - } else { - return o1.splitId().compareTo(o2.splitId()); - } - }; - } - - /** Comparator which orders the splits based on watermark of the splits */ - public static SerializableComparator watermark( - SplitWatermarkExtractor watermarkExtractor) { - return (IcebergSourceSplit o1, IcebergSourceSplit o2) -> { - long watermark1 = watermarkExtractor.extractWatermark(o1); - long watermark2 = watermarkExtractor.extractWatermark(o2); - - int temp = Long.compare(watermark1, watermark2); - if (temp != 0) { - return temp; - } else { - return o1.splitId().compareTo(o2.splitId()); - } - }; - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/split/SplitRequestEvent.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/split/SplitRequestEvent.java deleted file mode 100644 index eabd757aa638..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/source/split/SplitRequestEvent.java +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.split; - -import java.util.Collection; -import java.util.Collections; -import org.apache.flink.annotation.Internal; -import org.apache.flink.api.connector.source.SourceEvent; - -/** We can remove this class once FLINK-21364 is resolved. */ -@Internal -public class SplitRequestEvent implements SourceEvent { - private static final long serialVersionUID = 1L; - - private final Collection finishedSplitIds; - private final String requesterHostname; - - public SplitRequestEvent() { - this(Collections.emptyList()); - } - - public SplitRequestEvent(Collection finishedSplitIds) { - this(finishedSplitIds, null); - } - - public SplitRequestEvent(Collection finishedSplitIds, String requesterHostname) { - this.finishedSplitIds = finishedSplitIds; - this.requesterHostname = requesterHostname; - } - - public Collection finishedSplitIds() { - return finishedSplitIds; - } - - public String requesterHostname() { - return requesterHostname; - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/util/FlinkAlterTableUtil.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/util/FlinkAlterTableUtil.java deleted file mode 100644 index 2bbc9cf208fe..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/util/FlinkAlterTableUtil.java +++ /dev/null @@ -1,248 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.util; - -import java.util.List; -import java.util.Map; -import org.apache.flink.table.catalog.Column; -import org.apache.flink.table.catalog.TableChange; -import org.apache.flink.table.catalog.UniqueConstraint; -import org.apache.iceberg.Table; -import org.apache.iceberg.Transaction; -import org.apache.iceberg.UpdateProperties; -import org.apache.iceberg.UpdateSchema; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.types.Type; - -public class FlinkAlterTableUtil { - private FlinkAlterTableUtil() {} - - public static void commitChanges( - Table table, - String setLocation, - String setSnapshotId, - String pickSnapshotId, - Map setProperties) { - commitManageSnapshots(table, setSnapshotId, pickSnapshotId); - - Transaction transaction = table.newTransaction(); - - if (setLocation != null) { - transaction.updateLocation().setLocation(setLocation).commit(); - } - - if (!setProperties.isEmpty()) { - UpdateProperties updateProperties = transaction.updateProperties(); - setProperties.forEach( - (k, v) -> { - if (v == null) { - updateProperties.remove(k); - } else { - updateProperties.set(k, v); - } - }); - updateProperties.commit(); - } - - transaction.commitTransaction(); - } - - public static void commitChanges( - Table table, - String setLocation, - String setSnapshotId, - String pickSnapshotId, - List schemaChanges, - List propertyChanges) { - commitManageSnapshots(table, setSnapshotId, pickSnapshotId); - - Transaction transaction = table.newTransaction(); - - if (setLocation != null) { - transaction.updateLocation().setLocation(setLocation).commit(); - } - - if (!schemaChanges.isEmpty()) { - UpdateSchema updateSchema = transaction.updateSchema(); - FlinkAlterTableUtil.applySchemaChanges(updateSchema, schemaChanges); - updateSchema.commit(); - } - - if (!propertyChanges.isEmpty()) { - UpdateProperties updateProperties = transaction.updateProperties(); - FlinkAlterTableUtil.applyPropertyChanges(updateProperties, propertyChanges); - updateProperties.commit(); - } - - transaction.commitTransaction(); - } - - public static void commitManageSnapshots( - Table table, String setSnapshotId, String cherrypickSnapshotId) { - // don't allow setting the snapshot and picking a commit at the same time because order is - // ambiguous and choosing one order leads to different results - Preconditions.checkArgument( - setSnapshotId == null || cherrypickSnapshotId == null, - "Cannot set the current snapshot ID and cherry-pick snapshot changes"); - - if (setSnapshotId != null) { - long newSnapshotId = Long.parseLong(setSnapshotId); - table.manageSnapshots().setCurrentSnapshot(newSnapshotId).commit(); - } - - // if updating the table snapshot, perform that update first in case it fails - if (cherrypickSnapshotId != null) { - long newSnapshotId = Long.parseLong(cherrypickSnapshotId); - table.manageSnapshots().cherrypick(newSnapshotId).commit(); - } - } - - /** - * Applies a list of Flink table changes to an {@link UpdateSchema} operation. 
- * - * @param pendingUpdate an uncommitted UpdateSchema operation to configure - * @param schemaChanges a list of Flink table changes - */ - public static void applySchemaChanges( - UpdateSchema pendingUpdate, List schemaChanges) { - for (TableChange change : schemaChanges) { - if (change instanceof TableChange.AddColumn) { - TableChange.AddColumn addColumn = (TableChange.AddColumn) change; - Column flinkColumn = addColumn.getColumn(); - Preconditions.checkArgument( - FlinkCompatibilityUtil.isPhysicalColumn(flinkColumn), - "Unsupported table change: Adding computed column %s.", - flinkColumn.getName()); - Type icebergType = FlinkSchemaUtil.convert(flinkColumn.getDataType().getLogicalType()); - if (flinkColumn.getDataType().getLogicalType().isNullable()) { - pendingUpdate.addColumn( - flinkColumn.getName(), icebergType, flinkColumn.getComment().orElse(null)); - } else { - pendingUpdate.addRequiredColumn( - flinkColumn.getName(), icebergType, flinkColumn.getComment().orElse(null)); - } - } else if (change instanceof TableChange.ModifyColumn) { - TableChange.ModifyColumn modifyColumn = (TableChange.ModifyColumn) change; - applyModifyColumn(pendingUpdate, modifyColumn); - } else if (change instanceof TableChange.DropColumn) { - TableChange.DropColumn dropColumn = (TableChange.DropColumn) change; - pendingUpdate.deleteColumn(dropColumn.getColumnName()); - } else if (change instanceof TableChange.AddWatermark) { - throw new UnsupportedOperationException("Unsupported table change: AddWatermark."); - } else if (change instanceof TableChange.ModifyWatermark) { - throw new UnsupportedOperationException("Unsupported table change: ModifyWatermark."); - } else if (change instanceof TableChange.DropWatermark) { - throw new UnsupportedOperationException("Unsupported table change: DropWatermark."); - } else if (change instanceof TableChange.AddUniqueConstraint) { - TableChange.AddUniqueConstraint addPk = (TableChange.AddUniqueConstraint) change; - applyUniqueConstraint(pendingUpdate, addPk.getConstraint()); - } else if (change instanceof TableChange.ModifyUniqueConstraint) { - TableChange.ModifyUniqueConstraint modifyPk = (TableChange.ModifyUniqueConstraint) change; - applyUniqueConstraint(pendingUpdate, modifyPk.getNewConstraint()); - } else if (change instanceof TableChange.DropConstraint) { - throw new UnsupportedOperationException("Unsupported table change: DropConstraint."); - } else { - throw new UnsupportedOperationException("Cannot apply unknown table change: " + change); - } - } - } - - /** - * Applies a list of Flink table property changes to an {@link UpdateProperties} operation. 
- * - * @param pendingUpdate an uncommitted UpdateProperty operation to configure - * @param propertyChanges a list of Flink table changes - */ - public static void applyPropertyChanges( - UpdateProperties pendingUpdate, List propertyChanges) { - for (TableChange change : propertyChanges) { - if (change instanceof TableChange.SetOption) { - TableChange.SetOption setOption = (TableChange.SetOption) change; - pendingUpdate.set(setOption.getKey(), setOption.getValue()); - } else if (change instanceof TableChange.ResetOption) { - TableChange.ResetOption resetOption = (TableChange.ResetOption) change; - pendingUpdate.remove(resetOption.getKey()); - } else { - throw new UnsupportedOperationException( - "The given table change is not a property change: " + change); - } - } - } - - private static void applyModifyColumn( - UpdateSchema pendingUpdate, TableChange.ModifyColumn modifyColumn) { - if (modifyColumn instanceof TableChange.ModifyColumnName) { - TableChange.ModifyColumnName modifyName = (TableChange.ModifyColumnName) modifyColumn; - pendingUpdate.renameColumn(modifyName.getOldColumnName(), modifyName.getNewColumnName()); - } else if (modifyColumn instanceof TableChange.ModifyColumnPosition) { - TableChange.ModifyColumnPosition modifyPosition = - (TableChange.ModifyColumnPosition) modifyColumn; - applyModifyColumnPosition(pendingUpdate, modifyPosition); - } else if (modifyColumn instanceof TableChange.ModifyPhysicalColumnType) { - TableChange.ModifyPhysicalColumnType modifyType = - (TableChange.ModifyPhysicalColumnType) modifyColumn; - Type type = FlinkSchemaUtil.convert(modifyType.getNewType().getLogicalType()); - String columnName = modifyType.getOldColumn().getName(); - pendingUpdate.updateColumn(columnName, type.asPrimitiveType()); - if (modifyType.getNewColumn().getDataType().getLogicalType().isNullable()) { - pendingUpdate.makeColumnOptional(columnName); - } else { - pendingUpdate.requireColumn(columnName); - } - } else if (modifyColumn instanceof TableChange.ModifyColumnComment) { - TableChange.ModifyColumnComment modifyComment = - (TableChange.ModifyColumnComment) modifyColumn; - pendingUpdate.updateColumnDoc( - modifyComment.getOldColumn().getName(), modifyComment.getNewComment()); - } else { - throw new UnsupportedOperationException( - "Cannot apply unknown modify-column change: " + modifyColumn); - } - } - - private static void applyModifyColumnPosition( - UpdateSchema pendingUpdate, TableChange.ModifyColumnPosition modifyColumnPosition) { - TableChange.ColumnPosition newPosition = modifyColumnPosition.getNewPosition(); - if (newPosition instanceof TableChange.First) { - pendingUpdate.moveFirst(modifyColumnPosition.getOldColumn().getName()); - } else if (newPosition instanceof TableChange.After) { - TableChange.After after = (TableChange.After) newPosition; - pendingUpdate.moveAfter(modifyColumnPosition.getOldColumn().getName(), after.column()); - } else { - throw new UnsupportedOperationException( - "Cannot apply unknown modify-column-position change: " + modifyColumnPosition); - } - } - - private static void applyUniqueConstraint( - UpdateSchema pendingUpdate, UniqueConstraint constraint) { - switch (constraint.getType()) { - case PRIMARY_KEY: - pendingUpdate.setIdentifierFields(constraint.getColumns()); - break; - case UNIQUE_KEY: - throw new UnsupportedOperationException( - "Unsupported table change: setting unique key constraints."); - default: - throw new UnsupportedOperationException( - "Cannot apply unknown unique constraint: " + constraint.getType().name()); - } - } -} 
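For reference, the deleted FlinkAlterTableUtil above translated Flink ALTER TABLE changes into a single Iceberg transaction: schema changes were applied through UpdateSchema, property changes through UpdateProperties, and both were committed together. The minimal sketch below shows how a caller could have driven it; it is illustrative only and assumes Flink's TableChange and Column factory methods (TableChange.add, TableChange.set, Column.physical), an already-loaded Iceberg Table named table, and a hypothetical column name comment_col.

import java.util.List;
import org.apache.flink.table.api.DataTypes;
import org.apache.flink.table.catalog.Column;
import org.apache.flink.table.catalog.TableChange;
import org.apache.iceberg.Table;
import org.apache.iceberg.flink.util.FlinkAlterTableUtil;

class AlterTableSketch {
  // Adds one nullable string column and sets one table property in a single transaction.
  static void addColumnAndProperty(Table table) {
    // Schema change: "comment_col" is a hypothetical column name used for illustration.
    List<TableChange> schemaChanges =
        List.of(TableChange.add(Column.physical("comment_col", DataTypes.STRING())));
    // Property change: set the default write format (an existing Iceberg table property).
    List<TableChange> propertyChanges =
        List.of(TableChange.set("write.format.default", "parquet"));
    // No location update and no snapshot management (nulls); both change lists are
    // applied and committed in one Iceberg transaction by the removed utility.
    FlinkAlterTableUtil.commitChanges(table, null, null, null, schemaChanges, propertyChanges);
  }
}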
diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/util/FlinkCompatibilityUtil.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/util/FlinkCompatibilityUtil.java deleted file mode 100644 index f02af894e82b..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/util/FlinkCompatibilityUtil.java +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.util; - -import org.apache.flink.api.common.typeinfo.TypeInformation; -import org.apache.flink.table.api.TableColumn; -import org.apache.flink.table.catalog.Column; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.runtime.typeutils.InternalTypeInfo; -import org.apache.flink.table.types.logical.RowType; - -/** - * This is a small util class that try to hide calls to Flink Internal or PublicEvolve interfaces as - * Flink can change those APIs during minor version release. - */ -public class FlinkCompatibilityUtil { - - private FlinkCompatibilityUtil() {} - - public static TypeInformation toTypeInfo(RowType rowType) { - return InternalTypeInfo.of(rowType); - } - - public static boolean isPhysicalColumn(TableColumn column) { - return column.isPhysical(); - } - - public static boolean isPhysicalColumn(Column column) { - return column.isPhysical(); - } -} diff --git a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/util/FlinkPackage.java b/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/util/FlinkPackage.java deleted file mode 100644 index 20b33e615e5f..000000000000 --- a/flink/v1.17/flink/src/main/java/org/apache/iceberg/flink/util/FlinkPackage.java +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.util; - -import java.util.concurrent.atomic.AtomicReference; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; - -public class FlinkPackage { - - private static final AtomicReference VERSION = new AtomicReference<>(); - public static final String FLINK_UNKNOWN_VERSION = "FLINK-UNKNOWN-VERSION"; - - private FlinkPackage() {} - - /** Returns Flink version string like x.y.z */ - public static String version() { - if (null == VERSION.get()) { - String detectedVersion; - try { - detectedVersion = versionFromJar(); - // use unknown version in case exact implementation version can't be found from the jar - // (this can happen if the DataStream class appears multiple times in the same classpath - // such as with shading) - detectedVersion = detectedVersion != null ? detectedVersion : FLINK_UNKNOWN_VERSION; - } catch (Exception e) { - detectedVersion = FLINK_UNKNOWN_VERSION; - } - VERSION.set(detectedVersion); - } - - return VERSION.get(); - } - - @VisibleForTesting - static String versionFromJar() { - // Choose {@link DataStream} class because it is one of the core Flink API - return DataStream.class.getPackage().getImplementationVersion(); - } - - @VisibleForTesting - static void setVersion(String version) { - VERSION.set(version); - } -} diff --git a/flink/v1.17/flink/src/main/resources/META-INF/services/org.apache.flink.table.factories.Factory b/flink/v1.17/flink/src/main/resources/META-INF/services/org.apache.flink.table.factories.Factory deleted file mode 100644 index 29a9955a7e20..000000000000 --- a/flink/v1.17/flink/src/main/resources/META-INF/services/org.apache.flink.table.factories.Factory +++ /dev/null @@ -1,16 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -org.apache.iceberg.flink.FlinkDynamicTableFactory diff --git a/flink/v1.17/flink/src/main/resources/META-INF/services/org.apache.flink.table.factories.TableFactory b/flink/v1.17/flink/src/main/resources/META-INF/services/org.apache.flink.table.factories.TableFactory deleted file mode 100644 index 2b6bfa3cd579..000000000000 --- a/flink/v1.17/flink/src/main/resources/META-INF/services/org.apache.flink.table.factories.TableFactory +++ /dev/null @@ -1,16 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -org.apache.iceberg.flink.FlinkCatalogFactory diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/AvroGenericRecordConverterBase.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/AvroGenericRecordConverterBase.java deleted file mode 100644 index 4184526a6a1a..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/AvroGenericRecordConverterBase.java +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import org.junit.jupiter.api.Test; - -public abstract class AvroGenericRecordConverterBase { - protected abstract void testConverter(DataGenerator dataGenerator) throws Exception; - - @Test - public void testPrimitiveTypes() throws Exception { - testConverter(new DataGenerators.Primitives()); - } - - @Test - public void testStructOfPrimitive() throws Exception { - testConverter(new DataGenerators.StructOfPrimitive()); - } - - @Test - public void testStructOfArray() throws Exception { - testConverter(new DataGenerators.StructOfArray()); - } - - @Test - public void testStructOfMap() throws Exception { - testConverter(new DataGenerators.StructOfMap()); - } - - @Test - public void testStructOfStruct() throws Exception { - testConverter(new DataGenerators.StructOfStruct()); - } - - @Test - public void testArrayOfPrimitive() throws Exception { - testConverter(new DataGenerators.ArrayOfPrimitive()); - } - - @Test - public void testArrayOfArray() throws Exception { - testConverter(new DataGenerators.ArrayOfArray()); - } - - @Test - public void testArrayOfMap() throws Exception { - testConverter(new DataGenerators.ArrayOfMap()); - } - - @Test - public void testArrayOfStruct() throws Exception { - testConverter(new DataGenerators.ArrayOfStruct()); - } - - @Test - public void testMapOfPrimitives() throws Exception { - testConverter(new DataGenerators.MapOfPrimitives()); - } - - @Test - public void testMapOfArray() throws Exception { - testConverter(new DataGenerators.MapOfArray()); - } - - @Test - public void testMapOfMap() throws Exception { - testConverter(new DataGenerators.MapOfMap()); - } - - @Test - public void testMapOfStruct() throws Exception { - testConverter(new DataGenerators.MapOfStruct()); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/CatalogTestBase.java 
b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/CatalogTestBase.java deleted file mode 100644 index 91ed3c4adea3..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/CatalogTestBase.java +++ /dev/null @@ -1,143 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import java.io.File; -import java.util.Arrays; -import java.util.List; -import java.util.Map; -import org.apache.flink.util.ArrayUtils; -import org.apache.hadoop.hive.conf.HiveConf; -import org.apache.iceberg.CatalogProperties; -import org.apache.iceberg.Parameter; -import org.apache.iceberg.ParameterizedTestExtension; -import org.apache.iceberg.Parameters; -import org.apache.iceberg.catalog.Catalog; -import org.apache.iceberg.catalog.Namespace; -import org.apache.iceberg.catalog.SupportsNamespaces; -import org.apache.iceberg.hadoop.HadoopCatalog; -import org.apache.iceberg.relocated.com.google.common.base.Joiner; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.extension.ExtendWith; -import org.junit.jupiter.api.io.TempDir; - -@ExtendWith(ParameterizedTestExtension.class) -public abstract class CatalogTestBase extends TestBase { - - protected static final String DATABASE = "db"; - @TempDir protected File hiveWarehouse; - @TempDir protected File hadoopWarehouse; - - @Parameter(index = 0) - protected String catalogName; - - @Parameter(index = 1) - protected Namespace baseNamespace; - - protected Catalog validationCatalog; - protected SupportsNamespaces validationNamespaceCatalog; - protected Map config = Maps.newHashMap(); - - protected String flinkDatabase; - protected Namespace icebergNamespace; - protected boolean isHadoopCatalog; - - @Parameters(name = "catalogName={0}, baseNamespace={1}") - protected static List parameters() { - return Arrays.asList( - new Object[] {"testhive", Namespace.empty()}, - new Object[] {"testhadoop", Namespace.empty()}, - new Object[] {"testhadoop_basenamespace", Namespace.of("l0", "l1")}); - } - - @BeforeEach - public void before() { - this.isHadoopCatalog = catalogName.startsWith("testhadoop"); - this.validationCatalog = - isHadoopCatalog - ? 
new HadoopCatalog(hiveConf, "file:" + hadoopWarehouse.getPath()) - : catalog; - this.validationNamespaceCatalog = (SupportsNamespaces) validationCatalog; - - config.put("type", "iceberg"); - if (!baseNamespace.isEmpty()) { - config.put(FlinkCatalogFactory.BASE_NAMESPACE, baseNamespace.toString()); - } - if (isHadoopCatalog) { - config.put(FlinkCatalogFactory.ICEBERG_CATALOG_TYPE, "hadoop"); - } else { - config.put(FlinkCatalogFactory.ICEBERG_CATALOG_TYPE, "hive"); - config.put(CatalogProperties.URI, getURI(hiveConf)); - } - config.put(CatalogProperties.WAREHOUSE_LOCATION, String.format("file://%s", warehouseRoot())); - - this.flinkDatabase = catalogName + "." + DATABASE; - this.icebergNamespace = - Namespace.of(ArrayUtils.concat(baseNamespace.levels(), new String[] {DATABASE})); - sql("CREATE CATALOG %s WITH %s", catalogName, toWithClause(config)); - } - - @AfterEach - public void clean() { - dropCatalog(catalogName, true); - } - - protected String warehouseRoot() { - if (isHadoopCatalog) { - return hadoopWarehouse.getAbsolutePath(); - } else { - return hiveWarehouse.getAbsolutePath(); - } - } - - protected String getFullQualifiedTableName(String tableName) { - final List levels = Lists.newArrayList(icebergNamespace.levels()); - levels.add(tableName); - return Joiner.on('.').join(levels); - } - - static String getURI(HiveConf conf) { - return conf.get(HiveConf.ConfVars.METASTOREURIS.varname); - } - - static String toWithClause(Map props) { - StringBuilder builder = new StringBuilder(); - builder.append("("); - int propCount = 0; - for (Map.Entry entry : props.entrySet()) { - if (propCount > 0) { - builder.append(","); - } - builder - .append("'") - .append(entry.getKey()) - .append("'") - .append("=") - .append("'") - .append(entry.getValue()) - .append("'"); - propCount++; - } - builder.append(")"); - return builder.toString(); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/DataGenerator.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/DataGenerator.java deleted file mode 100644 index b1e3b20ff7ac..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/DataGenerator.java +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.Schema; -import org.apache.iceberg.data.GenericRecord; - -/** - * This interface defines test data generator. Different implementations for primitive and complex - * nested fields are defined in {@link DataGenerators}. 
- */ -public interface DataGenerator { - Schema icebergSchema(); - - RowType flinkRowType(); - - org.apache.avro.Schema avroSchema(); - - GenericRecord generateIcebergGenericRecord(); - - GenericRowData generateFlinkRowData(); - - org.apache.avro.generic.GenericRecord generateAvroGenericRecord(); -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/DataGenerators.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/DataGenerators.java deleted file mode 100644 index e2cd411d7069..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/DataGenerators.java +++ /dev/null @@ -1,1172 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import static org.apache.iceberg.types.Types.NestedField.required; - -import com.fasterxml.jackson.databind.node.IntNode; -import java.math.BigDecimal; -import java.nio.ByteBuffer; -import java.nio.charset.StandardCharsets; -import java.time.LocalDate; -import java.time.LocalDateTime; -import java.time.LocalTime; -import java.time.OffsetDateTime; -import java.time.ZoneOffset; -import java.util.Arrays; -import java.util.List; -import java.util.UUID; -import java.util.concurrent.TimeUnit; -import java.util.stream.Collectors; -import org.apache.avro.LogicalTypes; -import org.apache.avro.SchemaBuilder; -import org.apache.avro.generic.GenericData; -import org.apache.avro.util.Utf8; -import org.apache.flink.table.data.DecimalData; -import org.apache.flink.table.data.GenericArrayData; -import org.apache.flink.table.data.GenericMapData; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.StringData; -import org.apache.flink.table.data.TimestampData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.Schema; -import org.apache.iceberg.avro.AvroSchemaUtil; -import org.apache.iceberg.data.GenericRecord; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.types.Types; -import org.joda.time.DateTime; -import org.joda.time.DateTimeZone; -import org.joda.time.Days; - -/** - * Util class to generate test data with extensive coverage different field types: from primitives - * to complex nested types. 
- */ -public class DataGenerators { - - public static class Primitives implements DataGenerator { - private static final DateTime JODA_DATETIME_EPOC = - new DateTime(1970, 1, 1, 0, 0, 0, 0, DateTimeZone.UTC); - private static final DateTime JODA_DATETIME_20220110 = - new DateTime(2022, 1, 10, 0, 0, 0, 0, DateTimeZone.UTC); - private static final int DAYS_BTW_EPOC_AND_20220110 = - Days.daysBetween(JODA_DATETIME_EPOC, JODA_DATETIME_20220110).getDays(); - private static final int HOUR_8_IN_MILLI = (int) TimeUnit.HOURS.toMillis(8); - - private static final LocalDate JAVA_LOCAL_DATE_20220110 = LocalDate.of(2022, 1, 10); - private static final LocalTime JAVA_LOCAL_TIME_HOUR8 = LocalTime.of(8, 0); - private static final OffsetDateTime JAVA_OFFSET_DATE_TIME_20220110 = - OffsetDateTime.of(2022, 1, 10, 0, 0, 0, 0, ZoneOffset.UTC); - private static final LocalDateTime JAVA_LOCAL_DATE_TIME_20220110 = - LocalDateTime.of(2022, 1, 10, 0, 0, 0); - private static final BigDecimal BIG_DECIMAL_NEGATIVE = new BigDecimal("-1.50"); - private static final byte[] FIXED_BYTES = "012345689012345".getBytes(StandardCharsets.UTF_8); - - private final Schema icebergSchema = - new Schema( - Types.NestedField.required(1, "row_id", Types.StringType.get()), - // primitive types - Types.NestedField.optional(2, "boolean_field", Types.BooleanType.get()), - Types.NestedField.optional(3, "int_field", Types.IntegerType.get()), - Types.NestedField.optional(4, "long_field", Types.LongType.get()), - Types.NestedField.optional(5, "float_field", Types.FloatType.get()), - Types.NestedField.optional(6, "double_field", Types.DoubleType.get()), - Types.NestedField.required(7, "string_field", Types.StringType.get()), - Types.NestedField.required(8, "date_field", Types.DateType.get()), - Types.NestedField.required(9, "time_field", Types.TimeType.get()), - Types.NestedField.required(10, "ts_with_zone_field", Types.TimestampType.withZone()), - Types.NestedField.required( - 11, "ts_without_zone_field", Types.TimestampType.withoutZone()), - Types.NestedField.required(12, "uuid_field", Types.UUIDType.get()), - Types.NestedField.required(13, "binary_field", Types.BinaryType.get()), - Types.NestedField.required(14, "decimal_field", Types.DecimalType.of(9, 2)), - Types.NestedField.required(15, "fixed_field", Types.FixedType.ofLength(16))); - - private final RowType flinkRowType = FlinkSchemaUtil.convert(icebergSchema); - - /** - * Fix up Avro Schema that is converted from Iceberg Schema. - * - * @param schemaConvertedFromIceberg Avro Schema converted from Iceberg schema via {@link - * AvroSchemaUtil#convert(Schema, String)} - */ - private org.apache.avro.Schema fixupAvroSchemaConvertedFromIcebergSchema( - org.apache.avro.Schema schemaConvertedFromIceberg) { - List fixedFields = - schemaConvertedFromIceberg.getFields().stream() - .map( - field -> { - org.apache.avro.Schema.Field updatedField = field; - if (field.name().equals("time_field")) { - // Iceberg's AvroSchemaUtil uses timestamp-micros with Long value for time - // field, while AvroToRowDataConverters#convertToTime() always looks for - // Integer value assuming millis. The root problem is that - // AvroToRowDataConverters#createConverter() uses LogicalTypeRoot to - // determine converter and LogicalTypeRoot lost the timestamp precision - // carried by LogicalType like Time(6). 
- org.apache.avro.Schema fieldSchema = - LogicalTypes.timeMillis() - .addToSchema( - org.apache.avro.Schema.create(org.apache.avro.Schema.Type.INT)); - updatedField = new org.apache.avro.Schema.Field("time_field", fieldSchema); - } - - return new org.apache.avro.Schema.Field(updatedField, updatedField.schema()); - }) - .collect(Collectors.toList()); - return org.apache.avro.Schema.createRecord( - schemaConvertedFromIceberg.getName(), - schemaConvertedFromIceberg.getDoc(), - schemaConvertedFromIceberg.getNamespace(), - schemaConvertedFromIceberg.isError(), - fixedFields); - } - - private final org.apache.avro.Schema avroSchema = - fixupAvroSchemaConvertedFromIcebergSchema(AvroSchemaUtil.convert(icebergSchema, "table")); - - @Override - public Schema icebergSchema() { - return icebergSchema; - } - - @Override - public RowType flinkRowType() { - return flinkRowType; - } - - @Override - public org.apache.avro.Schema avroSchema() { - return avroSchema; - } - - @Override - public GenericRecord generateIcebergGenericRecord() { - GenericRecord genericRecord = GenericRecord.create(icebergSchema); - genericRecord.setField("row_id", "row_id_value"); - genericRecord.setField("boolean_field", false); - genericRecord.setField("int_field", Integer.MAX_VALUE); - genericRecord.setField("long_field", Long.MAX_VALUE); - genericRecord.setField("float_field", Float.MAX_VALUE); - genericRecord.setField("double_field", Double.MAX_VALUE); - genericRecord.setField("string_field", "str"); - - genericRecord.setField("date_field", JAVA_LOCAL_DATE_20220110); - genericRecord.setField("time_field", JAVA_LOCAL_TIME_HOUR8); - genericRecord.setField("ts_with_zone_field", JAVA_OFFSET_DATE_TIME_20220110); - genericRecord.setField("ts_without_zone_field", JAVA_LOCAL_DATE_TIME_20220110); - - byte[] uuidBytes = new byte[16]; - for (int i = 0; i < 16; ++i) { - uuidBytes[i] = (byte) i; - } - - genericRecord.setField("uuid_field", UUID.nameUUIDFromBytes(uuidBytes)); - - byte[] binaryBytes = new byte[7]; - for (int i = 0; i < 7; ++i) { - binaryBytes[i] = (byte) i; - } - genericRecord.setField("binary_field", ByteBuffer.wrap(binaryBytes)); - - genericRecord.setField("decimal_field", BIG_DECIMAL_NEGATIVE); - genericRecord.setField("fixed_field", FIXED_BYTES); - - return genericRecord; - } - - @Override - public GenericRowData generateFlinkRowData() { - byte[] uuidBytes = new byte[16]; - for (int i = 0; i < 16; ++i) { - uuidBytes[i] = (byte) i; - } - - byte[] binaryBytes = new byte[7]; - for (int i = 0; i < 7; ++i) { - binaryBytes[i] = (byte) i; - } - - return GenericRowData.of( - StringData.fromString("row_id_value"), - false, - Integer.MAX_VALUE, - Long.MAX_VALUE, - Float.MAX_VALUE, - Double.MAX_VALUE, - StringData.fromString("str"), - DAYS_BTW_EPOC_AND_20220110, - HOUR_8_IN_MILLI, - // Although Avro logical type for timestamp fields are in micro seconds, - // AvroToRowDataConverters only looks for long value in milliseconds. 
- TimestampData.fromEpochMillis(JODA_DATETIME_20220110.getMillis()), - TimestampData.fromEpochMillis(JODA_DATETIME_20220110.getMillis()), - uuidBytes, - binaryBytes, - DecimalData.fromBigDecimal(BIG_DECIMAL_NEGATIVE, 9, 2), - FIXED_BYTES); - } - - @Override - public org.apache.avro.generic.GenericRecord generateAvroGenericRecord() { - org.apache.avro.generic.GenericRecord genericRecord = new GenericData.Record(avroSchema); - genericRecord.put("row_id", new Utf8("row_id_value")); - genericRecord.put("boolean_field", false); - genericRecord.put("int_field", Integer.MAX_VALUE); - genericRecord.put("long_field", Long.MAX_VALUE); - genericRecord.put("float_field", Float.MAX_VALUE); - genericRecord.put("double_field", Double.MAX_VALUE); - genericRecord.put("string_field", new Utf8("str")); - - genericRecord.put("date_field", DAYS_BTW_EPOC_AND_20220110); - genericRecord.put("time_field", HOUR_8_IN_MILLI); - // Although Avro logical type for timestamp fields are in micro seconds, - // AvroToRowDataConverters only looks for long value in milliseconds. - genericRecord.put("ts_with_zone_field", JODA_DATETIME_20220110.getMillis()); - genericRecord.put("ts_without_zone_field", JODA_DATETIME_20220110.getMillis()); - - byte[] uuidBytes = new byte[16]; - for (int i = 0; i < 16; ++i) { - uuidBytes[i] = (byte) i; - } - genericRecord.put("uuid_field", ByteBuffer.wrap(uuidBytes)); - - byte[] binaryBytes = new byte[7]; - for (int i = 0; i < 7; ++i) { - binaryBytes[i] = (byte) i; - } - genericRecord.put("binary_field", ByteBuffer.wrap(binaryBytes)); - - BigDecimal bigDecimal = new BigDecimal("-1.50"); - // unscaledValue().toByteArray() is to match the behavior of RowDataToAvroConverters from - // Flink for decimal type - genericRecord.put("decimal_field", ByteBuffer.wrap(bigDecimal.unscaledValue().toByteArray())); - - genericRecord.put("fixed_field", ByteBuffer.wrap(FIXED_BYTES)); - - return genericRecord; - } - } - - public static class StructOfPrimitive implements DataGenerator { - private final Schema icebergSchema = - new Schema( - Types.NestedField.required(1, "row_id", Types.StringType.get()), - Types.NestedField.required( - 2, - "struct_of_primitive", - Types.StructType.of( - required(101, "id", Types.IntegerType.get()), - required(102, "name", Types.StringType.get())))); - - private final RowType flinkRowType = FlinkSchemaUtil.convert(icebergSchema); - - private final org.apache.avro.Schema avroSchema = - AvroSchemaUtil.convert(icebergSchema, "table"); - - @Override - public Schema icebergSchema() { - return icebergSchema; - } - - @Override - public RowType flinkRowType() { - return flinkRowType; - } - - @Override - public org.apache.avro.Schema avroSchema() { - return avroSchema; - } - - @Override - public GenericRecord generateIcebergGenericRecord() { - Schema structSchema = - new Schema(icebergSchema.findField("struct_of_primitive").type().asStructType().fields()); - GenericRecord struct = GenericRecord.create(structSchema); - struct.setField("id", 1); - struct.setField("name", "Jane"); - GenericRecord genericRecord = GenericRecord.create(icebergSchema); - genericRecord.setField("row_id", "row_id_value"); - genericRecord.setField("struct_of_primitive", struct); - return genericRecord; - } - - @Override - public GenericRowData generateFlinkRowData() { - return GenericRowData.of( - StringData.fromString("row_id_value"), - GenericRowData.of(1, StringData.fromString("Jane"))); - } - - @Override - public org.apache.avro.generic.GenericRecord generateAvroGenericRecord() { - org.apache.avro.Schema 
structSchema = avroSchema.getField("struct_of_primitive").schema(); - org.apache.avro.generic.GenericRecord struct = new GenericData.Record(structSchema); - struct.put("id", 1); - struct.put("name", "Jane"); - org.apache.avro.generic.GenericRecord genericRecord = new GenericData.Record(avroSchema); - genericRecord.put("row_id", "row_id_value"); - genericRecord.put("struct_of_primitive", struct); - return genericRecord; - } - } - - public static class StructOfArray implements DataGenerator { - private final Schema icebergSchema = - new Schema( - Types.NestedField.required(1, "row_id", Types.StringType.get()), - Types.NestedField.required( - 2, - "struct_of_array", - Types.StructType.of( - required(101, "id", Types.IntegerType.get()), - required( - 102, "names", Types.ListType.ofRequired(201, Types.StringType.get()))))); - - private final RowType flinkRowType = FlinkSchemaUtil.convert(icebergSchema); - - private final org.apache.avro.Schema avroSchema = - AvroSchemaUtil.convert(icebergSchema, "table"); - - @Override - public Schema icebergSchema() { - return icebergSchema; - } - - @Override - public RowType flinkRowType() { - return flinkRowType; - } - - @Override - public org.apache.avro.Schema avroSchema() { - return avroSchema; - } - - @Override - public GenericRecord generateIcebergGenericRecord() { - Schema structSchema = - new Schema(icebergSchema.findField("struct_of_array").type().asStructType().fields()); - GenericRecord struct = GenericRecord.create(structSchema); - struct.setField("id", 1); - struct.setField("names", Arrays.asList("Jane", "Joe")); - GenericRecord genericRecord = GenericRecord.create(icebergSchema); - genericRecord.setField("row_id", "row_id_value"); - genericRecord.setField("struct_of_array", struct); - return genericRecord; - } - - @Override - public GenericRowData generateFlinkRowData() { - StringData[] names = {StringData.fromString("Jane"), StringData.fromString("Joe")}; - return GenericRowData.of( - StringData.fromString("row_id_value"), GenericRowData.of(1, new GenericArrayData(names))); - } - - @Override - public org.apache.avro.generic.GenericRecord generateAvroGenericRecord() { - org.apache.avro.Schema structSchema = avroSchema.getField("struct_of_array").schema(); - org.apache.avro.generic.GenericRecord struct = new GenericData.Record(structSchema); - struct.put("id", 1); - struct.put("names", Arrays.asList("Jane", "Joe")); - org.apache.avro.generic.GenericRecord genericRecord = new GenericData.Record(avroSchema); - genericRecord.put("row_id", "row_id_value"); - genericRecord.put("struct_of_array", struct); - return genericRecord; - } - } - - public static class StructOfMap implements DataGenerator { - private final Schema icebergSchema = - new Schema( - Types.NestedField.required(1, "row_id", Types.StringType.get()), - Types.NestedField.required( - 2, - "struct_of_map", - Types.StructType.of( - required(101, "id", Types.IntegerType.get()), - required( - 102, - "names", - Types.MapType.ofRequired( - 201, 202, Types.StringType.get(), Types.StringType.get()))))); - - private final RowType flinkRowType = FlinkSchemaUtil.convert(icebergSchema); - - private final org.apache.avro.Schema avroSchema = - AvroSchemaUtil.convert(icebergSchema, "table"); - - @Override - public Schema icebergSchema() { - return icebergSchema; - } - - @Override - public RowType flinkRowType() { - return flinkRowType; - } - - @Override - public org.apache.avro.Schema avroSchema() { - return avroSchema; - } - - @Override - public GenericRecord generateIcebergGenericRecord() { - Schema 
structSchema = - new Schema(icebergSchema.findField("struct_of_map").type().asStructType().fields()); - GenericRecord struct = GenericRecord.create(structSchema); - struct.setField("id", 1); - struct.setField("names", ImmutableMap.of("Jane", "female", "Joe", "male")); - GenericRecord genericRecord = GenericRecord.create(icebergSchema); - genericRecord.setField("row_id", "row_id_value"); - genericRecord.setField("struct_of_map", struct); - return genericRecord; - } - - @Override - public GenericRowData generateFlinkRowData() { - return GenericRowData.of( - StringData.fromString("row_id_value"), - GenericRowData.of( - 1, - new GenericMapData( - ImmutableMap.of( - StringData.fromString("Jane"), - StringData.fromString("female"), - StringData.fromString("Joe"), - StringData.fromString("male"))))); - } - - @Override - public org.apache.avro.generic.GenericRecord generateAvroGenericRecord() { - org.apache.avro.Schema structSchema = avroSchema.getField("struct_of_map").schema(); - org.apache.avro.generic.GenericRecord struct = new GenericData.Record(structSchema); - struct.put("id", 1); - struct.put("names", ImmutableMap.of("Jane", new Utf8("female"), "Joe", new Utf8("male"))); - org.apache.avro.generic.GenericRecord genericRecord = new GenericData.Record(avroSchema); - genericRecord.put("row_id", "row_id_value"); - genericRecord.put("struct_of_map", struct); - return genericRecord; - } - } - - public static class StructOfStruct implements DataGenerator { - private final Schema icebergSchema = - new Schema( - Types.NestedField.required(1, "row_id", Types.StringType.get()), - Types.NestedField.required( - 2, - "struct_of_struct", - Types.StructType.of( - required(101, "id", Types.IntegerType.get()), - required( - 102, - "person_struct", - Types.StructType.of( - Types.NestedField.required(201, "name", Types.StringType.get()), - Types.NestedField.required(202, "address", Types.StringType.get())))))); - - private final RowType flinkRowType = FlinkSchemaUtil.convert(icebergSchema); - - private final org.apache.avro.Schema avroSchema = - AvroSchemaUtil.convert(icebergSchema, "table"); - - @Override - public Schema icebergSchema() { - return icebergSchema; - } - - @Override - public RowType flinkRowType() { - return flinkRowType; - } - - @Override - public org.apache.avro.Schema avroSchema() { - return avroSchema; - } - - @Override - public GenericRecord generateIcebergGenericRecord() { - Schema structSchema = - new Schema(icebergSchema.findField("struct_of_struct").type().asStructType().fields()); - Schema personSchema = - new Schema(structSchema.findField("person_struct").type().asStructType().fields()); - GenericRecord person = GenericRecord.create(personSchema); - person.setField("name", "Jane"); - person.setField("address", "Apple Park"); - GenericRecord struct = GenericRecord.create(structSchema); - struct.setField("id", 1); - struct.setField("person_struct", person); - GenericRecord genericRecord = GenericRecord.create(icebergSchema); - genericRecord.setField("row_id", "row_id_value"); - genericRecord.setField("struct_of_struct", struct); - return genericRecord; - } - - @Override - public GenericRowData generateFlinkRowData() { - return GenericRowData.of( - StringData.fromString("row_id_value"), - GenericRowData.of( - 1, - GenericRowData.of( - StringData.fromString("Jane"), StringData.fromString("Apple Park")))); - } - - @Override - public org.apache.avro.generic.GenericRecord generateAvroGenericRecord() { - org.apache.avro.Schema structSchema = avroSchema.getField("struct_of_struct").schema(); - 
org.apache.avro.Schema personSchema = structSchema.getField("person_struct").schema(); - org.apache.avro.generic.GenericRecord person = new GenericData.Record(personSchema); - person.put("name", "Jane"); - person.put("address", "Apple Park"); - org.apache.avro.generic.GenericRecord struct = new GenericData.Record(structSchema); - struct.put("id", 1); - struct.put("person_struct", person); - org.apache.avro.generic.GenericRecord genericRecord = new GenericData.Record(avroSchema); - genericRecord.put("row_id", "row_id_value"); - genericRecord.put("struct_of_struct", struct); - return genericRecord; - } - } - - public static class ArrayOfPrimitive implements DataGenerator { - private final Schema icebergSchema = - new Schema( - Types.NestedField.required(1, "row_id", Types.StringType.get()), - Types.NestedField.required( - 2, "array_of_int", Types.ListType.ofOptional(101, Types.IntegerType.get()))); - - private final RowType flinkRowType = FlinkSchemaUtil.convert(icebergSchema); - - private final org.apache.avro.Schema avroSchema = - AvroSchemaUtil.convert(icebergSchema, "table"); - - @Override - public Schema icebergSchema() { - return icebergSchema; - } - - @Override - public RowType flinkRowType() { - return flinkRowType; - } - - @Override - public org.apache.avro.Schema avroSchema() { - return avroSchema; - } - - @Override - public GenericRecord generateIcebergGenericRecord() { - GenericRecord genericRecord = GenericRecord.create(icebergSchema); - genericRecord.setField("row_id", "row_id_value"); - genericRecord.setField("array_of_int", Arrays.asList(1, 2, 3)); - return genericRecord; - } - - @Override - public GenericRowData generateFlinkRowData() { - Integer[] arr = {1, 2, 3}; - return GenericRowData.of(StringData.fromString("row_id_value"), new GenericArrayData(arr)); - } - - @Override - public org.apache.avro.generic.GenericRecord generateAvroGenericRecord() { - org.apache.avro.generic.GenericRecord genericRecord = new GenericData.Record(avroSchema); - genericRecord.put("row_id", "row_id_value"); - genericRecord.put("array_of_int", Arrays.asList(1, 2, 3)); - return genericRecord; - } - } - - public static class ArrayOfArray implements DataGenerator { - private final Schema icebergSchema = - new Schema( - Types.NestedField.required(1, "row_id", Types.StringType.get()), - Types.NestedField.required( - 2, - "array_of_array", - Types.ListType.ofRequired( - 101, Types.ListType.ofRequired(201, Types.IntegerType.get())))); - - private final RowType flinkRowType = FlinkSchemaUtil.convert(icebergSchema); - - private final org.apache.avro.Schema avroSchema = - AvroSchemaUtil.convert(icebergSchema, "table"); - - @Override - public Schema icebergSchema() { - return icebergSchema; - } - - @Override - public RowType flinkRowType() { - return flinkRowType; - } - - @Override - public org.apache.avro.Schema avroSchema() { - return avroSchema; - } - - @Override - public GenericRecord generateIcebergGenericRecord() { - GenericRecord genericRecord = GenericRecord.create(icebergSchema); - genericRecord.setField("row_id", "row_id_value"); - genericRecord.setField( - "array_of_array", Arrays.asList(Arrays.asList(1, 2, 3), Arrays.asList(4, 5, 6))); - return genericRecord; - } - - @Override - public GenericRowData generateFlinkRowData() { - // non-primitive - Integer[] array1 = {1, 2, 3}; - Integer[] array2 = {4, 5, 6}; - GenericArrayData[] arrayOfArrays = { - new GenericArrayData(array1), new GenericArrayData(array2) - }; - return GenericRowData.of( - StringData.fromString("row_id_value"), new 
GenericArrayData(arrayOfArrays)); - } - - @Override - public org.apache.avro.generic.GenericRecord generateAvroGenericRecord() { - org.apache.avro.generic.GenericRecord genericRecord = new GenericData.Record(avroSchema); - genericRecord.put("row_id", "row_id_value"); - genericRecord.put( - "array_of_array", Arrays.asList(Arrays.asList(1, 2, 3), Arrays.asList(4, 5, 6))); - return genericRecord; - } - } - - public static class ArrayOfMap implements DataGenerator { - private final Schema icebergSchema = - new Schema( - Types.NestedField.required(1, "row_id", Types.StringType.get()), - Types.NestedField.required( - 2, - "array_of_map", - Types.ListType.ofRequired( - 101, - Types.MapType.ofRequired( - 201, 202, Types.StringType.get(), Types.IntegerType.get())))); - - private final RowType flinkRowType = FlinkSchemaUtil.convert(icebergSchema); - - private final org.apache.avro.Schema avroSchema = - AvroSchemaUtil.convert(icebergSchema, "table"); - - @Override - public Schema icebergSchema() { - return icebergSchema; - } - - @Override - public RowType flinkRowType() { - return flinkRowType; - } - - @Override - public org.apache.avro.Schema avroSchema() { - return avroSchema; - } - - @Override - public GenericRecord generateIcebergGenericRecord() { - GenericRecord genericRecord = GenericRecord.create(icebergSchema); - genericRecord.setField("row_id", "row_id_value"); - genericRecord.setField( - "array_of_map", - Arrays.asList( - ImmutableMap.of("Jane", 1, "Joe", 2), ImmutableMap.of("Alice", 3, "Bob", 4))); - return genericRecord; - } - - @Override - public GenericRowData generateFlinkRowData() { - GenericMapData[] array = { - new GenericMapData( - ImmutableMap.of(StringData.fromString("Jane"), 1, StringData.fromString("Joe"), 2)), - new GenericMapData( - ImmutableMap.of(StringData.fromString("Alice"), 3, StringData.fromString("Bob"), 4)) - }; - return GenericRowData.of(StringData.fromString("row_id_value"), new GenericArrayData(array)); - } - - @Override - public org.apache.avro.generic.GenericRecord generateAvroGenericRecord() { - org.apache.avro.generic.GenericRecord genericRecord = new GenericData.Record(avroSchema); - genericRecord.put("row_id", "row_id_value"); - genericRecord.put( - "array_of_map", - Arrays.asList( - ImmutableMap.of("Jane", 1, "Joe", 2), ImmutableMap.of("Alice", 3, "Bob", 4))); - return genericRecord; - } - } - - public static class ArrayOfStruct implements DataGenerator { - private final Types.StructType structType = - Types.StructType.of( - required(201, "id", Types.IntegerType.get()), - required(202, "name", Types.StringType.get())); - private final Schema structIcebergSchema = new Schema(structType.fields()); - private final org.apache.avro.Schema structAvroSchema = - AvroSchemaUtil.convert(structIcebergSchema, "struct"); - - private final Schema icebergSchema = - new Schema( - Types.NestedField.required(1, "row_id", Types.StringType.get()), - Types.NestedField.optional( - 2, "array_of_struct", Types.ListType.ofRequired(101, structType))); - - private final RowType flinkRowType = FlinkSchemaUtil.convert(icebergSchema); - - private final org.apache.avro.Schema avroSchema = - AvroSchemaUtil.convert(icebergSchema, "table"); - - @Override - public Schema icebergSchema() { - return icebergSchema; - } - - @Override - public RowType flinkRowType() { - return flinkRowType; - } - - @Override - public org.apache.avro.Schema avroSchema() { - return avroSchema; - } - - @Override - public GenericRecord generateIcebergGenericRecord() { - GenericRecord struct1 = 
GenericRecord.create(structIcebergSchema); - struct1.setField("id", 1); - struct1.setField("name", "Jane"); - GenericRecord struct2 = GenericRecord.create(structIcebergSchema); - struct2.setField("id", 2); - struct2.setField("name", "Joe"); - GenericRecord genericRecord = GenericRecord.create(icebergSchema); - genericRecord.setField("row_id", "row_id_value"); - genericRecord.setField("array_of_struct", Arrays.asList(struct1, struct2)); - return genericRecord; - } - - @Override - public GenericRowData generateFlinkRowData() { - GenericRowData[] structArray = { - GenericRowData.of(1, StringData.fromString("Jane")), - GenericRowData.of(2, StringData.fromString("Joe")) - }; - return GenericRowData.of( - StringData.fromString("row_id_value"), new GenericArrayData(structArray)); - } - - @Override - public org.apache.avro.generic.GenericRecord generateAvroGenericRecord() { - org.apache.avro.generic.GenericRecord struct1 = new GenericData.Record(structAvroSchema); - struct1.put("id", 1); - struct1.put("name", "Jane"); - org.apache.avro.generic.GenericRecord struct2 = new GenericData.Record(structAvroSchema); - struct2.put("id", 2); - struct2.put("name", "Joe"); - org.apache.avro.generic.GenericRecord genericRecord = new GenericData.Record(avroSchema); - genericRecord.put("row_id", "row_id_value"); - genericRecord.put("array_of_struct", Arrays.asList(struct1, struct2)); - return genericRecord; - } - } - - public static class MapOfPrimitives implements DataGenerator { - private final Schema icebergSchema = - new Schema( - Types.NestedField.required(1, "row_id", Types.StringType.get()), - Types.NestedField.optional( - 2, - "map_of_primitives", - Types.MapType.ofRequired( - 101, 102, Types.StringType.get(), Types.IntegerType.get()))); - - private final RowType flinkRowType = FlinkSchemaUtil.convert(icebergSchema); - - private final org.apache.avro.Schema avroSchema = - AvroSchemaUtil.convert(icebergSchema, "table"); - - @Override - public Schema icebergSchema() { - return icebergSchema; - } - - @Override - public RowType flinkRowType() { - return flinkRowType; - } - - @Override - public org.apache.avro.Schema avroSchema() { - return avroSchema; - } - - @Override - public GenericRecord generateIcebergGenericRecord() { - GenericRecord genericRecord = GenericRecord.create(icebergSchema); - genericRecord.setField("row_id", "row_id_value"); - genericRecord.setField("map_of_primitives", ImmutableMap.of("Jane", 1, "Joe", 2)); - return genericRecord; - } - - @Override - public GenericRowData generateFlinkRowData() { - return GenericRowData.of( - StringData.fromString("row_id_value"), - new GenericMapData( - ImmutableMap.of(StringData.fromString("Jane"), 1, StringData.fromString("Joe"), 2))); - } - - @Override - public org.apache.avro.generic.GenericRecord generateAvroGenericRecord() { - org.apache.avro.generic.GenericRecord genericRecord = new GenericData.Record(avroSchema); - genericRecord.put("row_id", "row_id_value"); - genericRecord.put("map_of_primitives", ImmutableMap.of("Jane", 1, "Joe", 2)); - return genericRecord; - } - } - - public static class MapOfArray implements DataGenerator { - private final Schema icebergSchema = - new Schema( - Types.NestedField.required(1, "row_id", Types.StringType.get()), - Types.NestedField.required( - 2, - "map_of_array", - Types.MapType.ofRequired( - 101, - 102, - Types.StringType.get(), - Types.ListType.ofRequired(201, Types.IntegerType.get())))); - - private final RowType rowType = FlinkSchemaUtil.convert(icebergSchema); - - private final org.apache.avro.Schema 
avroSchema = - AvroSchemaUtil.convert(icebergSchema, "table"); - - @Override - public Schema icebergSchema() { - return icebergSchema; - } - - @Override - public RowType flinkRowType() { - return rowType; - } - - @Override - public org.apache.avro.Schema avroSchema() { - return avroSchema; - } - - @Override - public GenericRecord generateIcebergGenericRecord() { - GenericRecord genericRecord = GenericRecord.create(icebergSchema); - genericRecord.setField("row_id", "row_id_value"); - genericRecord.setField( - "map_of_array", - ImmutableMap.of( - "Jane", Arrays.asList(1, 2, 3), - "Joe", Arrays.asList(4, 5, 6))); - return genericRecord; - } - - @Override - public GenericRowData generateFlinkRowData() { - Integer[] janeArray = {1, 2, 3}; - Integer[] joeArray = {4, 5, 6}; - return GenericRowData.of( - StringData.fromString("row_id_value"), - new GenericMapData( - ImmutableMap.of( - StringData.fromString("Jane"), - new GenericArrayData(janeArray), - StringData.fromString("Joe"), - new GenericArrayData(joeArray)))); - } - - @Override - public org.apache.avro.generic.GenericRecord generateAvroGenericRecord() { - org.apache.avro.generic.GenericRecord genericRecord = new GenericData.Record(avroSchema); - genericRecord.put("row_id", "row_id_value"); - genericRecord.put( - "map_of_array", - ImmutableMap.of( - "Jane", Arrays.asList(1, 2, 3), - "Joe", Arrays.asList(4, 5, 6))); - return genericRecord; - } - } - - public static class MapOfMap implements DataGenerator { - private final Schema icebergSchema = - new Schema( - Types.NestedField.required(1, "row_id", Types.StringType.get()), - Types.NestedField.required( - 2, - "map_of_map", - Types.MapType.ofRequired( - 101, - 102, - Types.StringType.get(), - Types.MapType.ofRequired( - 301, 302, Types.StringType.get(), Types.IntegerType.get())))); - - private final RowType flinkRowType = FlinkSchemaUtil.convert(icebergSchema); - - private final org.apache.avro.Schema avroSchema = - AvroSchemaUtil.convert(icebergSchema, "table"); - - @Override - public Schema icebergSchema() { - return icebergSchema; - } - - @Override - public RowType flinkRowType() { - return flinkRowType; - } - - @Override - public org.apache.avro.Schema avroSchema() { - return avroSchema; - } - - @Override - public GenericRecord generateIcebergGenericRecord() { - GenericRecord genericRecord = GenericRecord.create(icebergSchema); - genericRecord.setField("row_id", "row_id_value"); - genericRecord.setField( - "map_of_map", - ImmutableMap.of( - "female", ImmutableMap.of("Jane", 1, "Alice", 2), - "male", ImmutableMap.of("Joe", 3, "Bob", 4))); - return genericRecord; - } - - @Override - public GenericRowData generateFlinkRowData() { - return GenericRowData.of( - StringData.fromString("row_id_value"), - new GenericMapData( - ImmutableMap.of( - StringData.fromString("female"), - new GenericMapData( - ImmutableMap.of( - StringData.fromString("Jane"), 1, StringData.fromString("Alice"), 2)), - StringData.fromString("male"), - new GenericMapData( - ImmutableMap.of( - StringData.fromString("Joe"), 3, StringData.fromString("Bob"), 4))))); - } - - @Override - public org.apache.avro.generic.GenericRecord generateAvroGenericRecord() { - org.apache.avro.generic.GenericRecord genericRecord = new GenericData.Record(avroSchema); - genericRecord.put("row_id", "row_id_value"); - genericRecord.put( - "map_of_map", - ImmutableMap.of( - "female", ImmutableMap.of("Jane", 1, "Alice", 2), - "male", ImmutableMap.of("Joe", 3, "Bob", 4))); - return genericRecord; - } - } - - public static class MapOfStruct implements 
DataGenerator { - private org.apache.avro.Schema createAvroSchemaIdField() { - org.apache.avro.Schema schema = SchemaBuilder.builder().intType(); - // this is needed to match the converter generated schema props - schema.addProp("field-id", IntNode.valueOf(201)); - return schema; - } - - private org.apache.avro.Schema createAvroSchemaNameField() { - org.apache.avro.Schema schema = SchemaBuilder.builder().stringType(); - // this is needed to match the converter generated schema props - schema.addProp("field-id", IntNode.valueOf(202)); - return schema; - } - - private final Types.StructType structType = - Types.StructType.of( - required(201, "id", Types.IntegerType.get()), - required(202, "name", Types.StringType.get())); - private final Schema structIcebergSchema = new Schema(structType.fields()); - - private final org.apache.avro.Schema structAvroSchema = - SchemaBuilder.builder() - .record("struct") - .fields() - .name("id") - .type(createAvroSchemaIdField()) - .noDefault() - .name("name") - .type(createAvroSchemaNameField()) - .noDefault() - .endRecord(); - - private final Schema icebergSchema = - new Schema( - Types.NestedField.required(1, "row_id", Types.StringType.get()), - Types.NestedField.required( - 2, - "map_of_struct", - Types.MapType.ofRequired(101, 102, Types.StringType.get(), structType))); - - private final RowType flinkRowType = FlinkSchemaUtil.convert(icebergSchema); - - // Can't use AvroSchemaUtil.convert otherwise the nested schema will have generated name like - // `r102` not the specified name like `struct`. - org.apache.avro.Schema avroSchema = - SchemaBuilder.builder() - .record("table") - .fields() - .requiredString("row_id") - .name("map_of_struct") - .type(SchemaBuilder.builder().map().values(structAvroSchema)) - .noDefault() - .endRecord(); - - @Override - public Schema icebergSchema() { - return icebergSchema; - } - - @Override - public RowType flinkRowType() { - return flinkRowType; - } - - @Override - public org.apache.avro.Schema avroSchema() { - return avroSchema; - } - - @Override - public GenericRecord generateIcebergGenericRecord() { - GenericRecord struct1 = GenericRecord.create(structIcebergSchema); - struct1.setField("id", 1); - struct1.setField("name", "Jane"); - GenericRecord struct2 = GenericRecord.create(structIcebergSchema); - struct2.setField("id", 2); - struct2.setField("name", "Joe"); - GenericRecord genericRecord = GenericRecord.create(icebergSchema); - genericRecord.setField("row_id", "row_id_value"); - genericRecord.setField( - "map_of_struct", ImmutableMap.of("struct1", struct1, "struct2", struct2)); - return genericRecord; - } - - @Override - public GenericRowData generateFlinkRowData() { - return GenericRowData.of( - StringData.fromString("row_id_value"), - new GenericMapData( - ImmutableMap.of( - StringData.fromString("struct1"), - GenericRowData.of(1, StringData.fromString("Jane")), - StringData.fromString("struct2"), - GenericRowData.of(2, StringData.fromString("Joe"))))); - } - - @Override - public org.apache.avro.generic.GenericRecord generateAvroGenericRecord() { - org.apache.avro.generic.GenericRecord struct1 = new GenericData.Record(structAvroSchema); - struct1.put("id", 1); - struct1.put("name", new Utf8("Jane")); - org.apache.avro.generic.GenericRecord struct2 = new GenericData.Record(structAvroSchema); - struct2.put("id", 2); - struct2.put("name", new Utf8("Joe")); - org.apache.avro.generic.GenericRecord genericRecord = new GenericData.Record(avroSchema); - genericRecord.put("row_id", new Utf8("row_id_value")); - 
genericRecord.put("map_of_struct", ImmutableMap.of("struct1", struct1, "struct2", struct2)); - return genericRecord; - } - } - - public static class MapOfStructStruct implements DataGenerator { - private final Schema icebergSchema = - new Schema( - Types.NestedField.required(1, "row_id", Types.StringType.get()), - Types.NestedField.optional( - 2, - "map", - Types.MapType.ofOptional( - 101, - 102, - Types.StructType.of( - Types.NestedField.required(201, "key", Types.LongType.get()), - Types.NestedField.optional(202, "keyData", Types.StringType.get())), - Types.StructType.of( - Types.NestedField.required(203, "value", Types.LongType.get()), - Types.NestedField.optional(204, "valueData", Types.StringType.get()))))); - - private final RowType flinkRowType = FlinkSchemaUtil.convert(icebergSchema); - - @Override - public Schema icebergSchema() { - return icebergSchema; - } - - @Override - public RowType flinkRowType() { - return flinkRowType; - } - - @Override - public org.apache.avro.Schema avroSchema() { - throw new UnsupportedOperationException( - "Not applicable as Avro Map only support string key type"); - } - - @Override - public GenericRecord generateIcebergGenericRecord() { - throw new UnsupportedOperationException("Not implemented yet"); - } - - @Override - public GenericRowData generateFlinkRowData() { - return GenericRowData.of( - StringData.fromString("row_id_value"), - new GenericMapData( - ImmutableMap.of( - GenericRowData.of(1L, StringData.fromString("key_data")), - GenericRowData.of(1L, StringData.fromString("value_data"))))); - } - - @Override - public org.apache.avro.generic.GenericRecord generateAvroGenericRecord() { - throw new UnsupportedOperationException("Avro Map only support string key type"); - } - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/HadoopCatalogExtension.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/HadoopCatalogExtension.java deleted file mode 100644 index fd5c6b76b683..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/HadoopCatalogExtension.java +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.UUID; -import org.apache.commons.io.FileUtils; -import org.apache.hadoop.conf.Configuration; -import org.apache.iceberg.CatalogProperties; -import org.apache.iceberg.catalog.Catalog; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.hadoop.HadoopCatalog; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.junit.jupiter.api.extension.AfterAllCallback; -import org.junit.jupiter.api.extension.AfterEachCallback; -import org.junit.jupiter.api.extension.BeforeAllCallback; -import org.junit.jupiter.api.extension.BeforeEachCallback; -import org.junit.jupiter.api.extension.ExtensionContext; - -public class HadoopCatalogExtension - implements BeforeAllCallback, BeforeEachCallback, AfterAllCallback, AfterEachCallback { - protected final String database; - protected final String tableName; - - protected Path temporaryFolder; - protected Catalog catalog; - protected CatalogLoader catalogLoader; - protected String warehouse; - protected TableLoader tableLoader; - - public HadoopCatalogExtension(String database, String tableName) { - this.database = database; - this.tableName = tableName; - } - - @Override - public void beforeAll(ExtensionContext context) throws Exception { - this.temporaryFolder = Files.createTempDirectory("junit5_hadoop_catalog-"); - } - - @Override - public void afterAll(ExtensionContext context) throws Exception { - FileUtils.deleteDirectory(temporaryFolder.toFile()); - } - - @Override - public void beforeEach(ExtensionContext context) throws Exception { - assertThat(temporaryFolder).exists().isDirectory(); - this.warehouse = "file:" + temporaryFolder + "/" + UUID.randomUUID(); - this.catalogLoader = - CatalogLoader.hadoop( - "hadoop", - new Configuration(), - ImmutableMap.of(CatalogProperties.WAREHOUSE_LOCATION, warehouse)); - this.catalog = catalogLoader.loadCatalog(); - this.tableLoader = - TableLoader.fromCatalog(catalogLoader, TableIdentifier.of(database, tableName)); - } - - @Override - public void afterEach(ExtensionContext context) throws Exception { - try { - catalog.dropTable(TableIdentifier.of(database, tableName)); - ((HadoopCatalog) catalog).close(); - tableLoader.close(); - } catch (Exception e) { - throw new RuntimeException("Failed to close catalog resource"); - } - } - - public TableLoader tableLoader() { - return tableLoader; - } - - public Catalog catalog() { - return catalog; - } - - public CatalogLoader catalogLoader() { - return catalogLoader; - } - - public String warehouse() { - return warehouse; - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/HadoopTableExtension.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/HadoopTableExtension.java deleted file mode 100644 index dc6ef400a4a9..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/HadoopTableExtension.java +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.catalog.TableIdentifier; -import org.junit.jupiter.api.extension.ExtensionContext; - -public class HadoopTableExtension extends HadoopCatalogExtension { - private final Schema schema; - private final PartitionSpec partitionSpec; - - private Table table; - - public HadoopTableExtension(String database, String tableName, Schema schema) { - this(database, tableName, schema, null); - } - - public HadoopTableExtension( - String database, String tableName, Schema schema, PartitionSpec partitionSpec) { - super(database, tableName); - this.schema = schema; - this.partitionSpec = partitionSpec; - } - - @Override - public void beforeEach(ExtensionContext context) throws Exception { - super.beforeEach(context); - if (partitionSpec == null) { - this.table = catalog.createTable(TableIdentifier.of(database, tableName), schema); - } else { - this.table = - catalog.createTable(TableIdentifier.of(database, tableName), schema, partitionSpec); - } - tableLoader.open(); - } - - public Table table() { - return table; - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/MiniFlinkClusterExtension.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/MiniFlinkClusterExtension.java deleted file mode 100644 index d2e086aa448e..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/MiniFlinkClusterExtension.java +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import org.apache.flink.configuration.Configuration; -import org.apache.flink.configuration.CoreOptions; -import org.apache.flink.runtime.testutils.InMemoryReporter; -import org.apache.flink.runtime.testutils.MiniClusterResourceConfiguration; -import org.apache.flink.test.junit5.MiniClusterExtension; - -public class MiniFlinkClusterExtension { - - private static final int DEFAULT_TM_NUM = 1; - private static final int DEFAULT_PARALLELISM = 4; - - public static final Configuration DISABLE_CLASSLOADER_CHECK_CONFIG = - new Configuration() - // disable classloader check as Avro may cache class/object in the serializers. 
- .set(CoreOptions.CHECK_LEAKED_CLASSLOADER, false); - - private MiniFlinkClusterExtension() {} - - /** - * It will start a mini cluster with classloader.check-leaked-classloader=false, so that we won't - * break the unit tests because of the class loader leak issue. In our iceberg integration tests, - * there're some that will assert the results after finished the flink jobs, so actually we may - * access the class loader that has been closed by the flink task managers if we enable the switch - * classloader.check-leaked-classloader by default. - */ - public static MiniClusterExtension createWithClassloaderCheckDisabled() { - return new MiniClusterExtension( - new MiniClusterResourceConfiguration.Builder() - .setNumberTaskManagers(DEFAULT_TM_NUM) - .setNumberSlotsPerTaskManager(DEFAULT_PARALLELISM) - .setConfiguration(DISABLE_CLASSLOADER_CHECK_CONFIG) - .build()); - } - - public static MiniClusterExtension createWithClassloaderCheckDisabled( - InMemoryReporter inMemoryReporter) { - Configuration configuration = new Configuration(DISABLE_CLASSLOADER_CHECK_CONFIG); - inMemoryReporter.addToConfiguration(configuration); - - return new MiniClusterExtension( - new MiniClusterResourceConfiguration.Builder() - .setNumberTaskManagers(DEFAULT_TM_NUM) - .setNumberSlotsPerTaskManager(DEFAULT_PARALLELISM) - .setConfiguration(configuration) - .build()); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/RowDataConverter.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/RowDataConverter.java deleted file mode 100644 index e532fb62615c..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/RowDataConverter.java +++ /dev/null @@ -1,135 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink; - -import java.math.BigDecimal; -import java.nio.ByteBuffer; -import java.time.Instant; -import java.time.LocalDate; -import java.time.LocalDateTime; -import java.time.LocalTime; -import java.time.OffsetDateTime; -import java.time.ZoneOffset; -import java.time.temporal.ChronoUnit; -import java.util.Arrays; -import java.util.List; -import java.util.Map; -import java.util.UUID; -import java.util.concurrent.TimeUnit; -import org.apache.flink.table.data.DecimalData; -import org.apache.flink.table.data.GenericArrayData; -import org.apache.flink.table.data.GenericMapData; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.flink.table.data.TimestampData; -import org.apache.iceberg.Schema; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.Types; - -public class RowDataConverter { - private static final OffsetDateTime EPOCH = Instant.ofEpochSecond(0).atOffset(ZoneOffset.UTC); - private static final LocalDate EPOCH_DAY = EPOCH.toLocalDate(); - - private RowDataConverter() {} - - public static RowData convert(Schema iSchema, Record record) { - return convert(iSchema.asStruct(), record); - } - - private static RowData convert(Types.StructType struct, Record record) { - GenericRowData rowData = new GenericRowData(struct.fields().size()); - List fields = struct.fields(); - for (int i = 0; i < fields.size(); i += 1) { - Types.NestedField field = fields.get(i); - - Type fieldType = field.type(); - rowData.setField(i, convert(fieldType, record.get(i))); - } - return rowData; - } - - private static Object convert(Type type, Object object) { - if (object == null) { - return null; - } - - switch (type.typeId()) { - case BOOLEAN: - case INTEGER: - case LONG: - case FLOAT: - case DOUBLE: - case FIXED: - return object; - case DATE: - return (int) ChronoUnit.DAYS.between(EPOCH_DAY, (LocalDate) object); - case TIME: - // Iceberg's time is in microseconds, while flink's time is in milliseconds. 
- LocalTime localTime = (LocalTime) object; - return (int) TimeUnit.NANOSECONDS.toMillis(localTime.toNanoOfDay()); - case TIMESTAMP: - if (((Types.TimestampType) type).shouldAdjustToUTC()) { - return TimestampData.fromInstant(((OffsetDateTime) object).toInstant()); - } else { - return TimestampData.fromLocalDateTime((LocalDateTime) object); - } - case STRING: - return StringData.fromString((String) object); - case UUID: - UUID uuid = (UUID) object; - ByteBuffer bb = ByteBuffer.allocate(16); - bb.putLong(uuid.getMostSignificantBits()); - bb.putLong(uuid.getLeastSignificantBits()); - return bb.array(); - case BINARY: - ByteBuffer buffer = (ByteBuffer) object; - return Arrays.copyOfRange( - buffer.array(), - buffer.arrayOffset() + buffer.position(), - buffer.arrayOffset() + buffer.remaining()); - case DECIMAL: - Types.DecimalType decimalType = (Types.DecimalType) type; - return DecimalData.fromBigDecimal( - (BigDecimal) object, decimalType.precision(), decimalType.scale()); - case STRUCT: - return convert(type.asStructType(), (Record) object); - case LIST: - List list = (List) object; - Object[] convertedArray = new Object[list.size()]; - for (int i = 0; i < convertedArray.length; i++) { - convertedArray[i] = convert(type.asListType().elementType(), list.get(i)); - } - return new GenericArrayData(convertedArray); - case MAP: - Map convertedMap = Maps.newLinkedHashMap(); - Map map = (Map) object; - for (Map.Entry entry : map.entrySet()) { - convertedMap.put( - convert(type.asMapType().keyType(), entry.getKey()), - convert(type.asMapType().valueType(), entry.getValue())); - } - return new GenericMapData(convertedMap); - default: - throw new UnsupportedOperationException("Not a supported type: " + type); - } - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/SimpleDataUtil.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/SimpleDataUtil.java deleted file mode 100644 index 1767f774922a..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/SimpleDataUtil.java +++ /dev/null @@ -1,439 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink; - -import static org.apache.iceberg.hadoop.HadoopOutputFile.fromPath; -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.IOException; -import java.time.Duration; -import java.util.List; -import java.util.Map; -import java.util.stream.Collectors; -import org.apache.flink.table.api.DataTypes; -import org.apache.flink.table.api.TableSchema; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.types.RowKind; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.DataFiles; -import org.apache.iceberg.DeleteFile; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.FileScanTask; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Snapshot; -import org.apache.iceberg.SnapshotRef; -import org.apache.iceberg.StructLike; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableScan; -import org.apache.iceberg.data.GenericRecord; -import org.apache.iceberg.data.IcebergGenerics; -import org.apache.iceberg.data.InternalRecordWrapper; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.deletes.EqualityDeleteWriter; -import org.apache.iceberg.deletes.PositionDelete; -import org.apache.iceberg.deletes.PositionDeleteWriter; -import org.apache.iceberg.encryption.EncryptedOutputFile; -import org.apache.iceberg.flink.sink.FlinkAppenderFactory; -import org.apache.iceberg.hadoop.HadoopInputFile; -import org.apache.iceberg.hadoop.HadoopTables; -import org.apache.iceberg.io.CloseableIterable; -import org.apache.iceberg.io.FileAppender; -import org.apache.iceberg.io.FileAppenderFactory; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Iterables; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.types.Types; -import org.apache.iceberg.util.Pair; -import org.apache.iceberg.util.StructLikeSet; -import org.apache.iceberg.util.StructLikeWrapper; -import org.awaitility.Awaitility; - -public class SimpleDataUtil { - - private SimpleDataUtil() {} - - public static final Schema SCHEMA = - new Schema( - Types.NestedField.optional(1, "id", Types.IntegerType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get())); - - public static final TableSchema FLINK_SCHEMA = - TableSchema.builder().field("id", DataTypes.INT()).field("data", DataTypes.STRING()).build(); - - public static final RowType ROW_TYPE = (RowType) FLINK_SCHEMA.toRowDataType().getLogicalType(); - - public static final Record RECORD = GenericRecord.create(SCHEMA); - - public static Table createTable( - String path, Map properties, boolean partitioned) { - PartitionSpec spec; - if (partitioned) { - spec = PartitionSpec.builderFor(SCHEMA).identity("data").build(); - } else { - spec = PartitionSpec.unpartitioned(); - } - return new HadoopTables().create(SCHEMA, spec, properties, path); - } - - public static Record createRecord(Integer id, String data) { - Record record = RECORD.copy(); - 
record.setField("id", id); - record.setField("data", data); - return record; - } - - public static RowData createRowData(Integer id, String data) { - return GenericRowData.of(id, StringData.fromString(data)); - } - - public static RowData createInsert(Integer id, String data) { - return GenericRowData.ofKind(RowKind.INSERT, id, StringData.fromString(data)); - } - - public static RowData createDelete(Integer id, String data) { - return GenericRowData.ofKind(RowKind.DELETE, id, StringData.fromString(data)); - } - - public static RowData createUpdateBefore(Integer id, String data) { - return GenericRowData.ofKind(RowKind.UPDATE_BEFORE, id, StringData.fromString(data)); - } - - public static RowData createUpdateAfter(Integer id, String data) { - return GenericRowData.ofKind(RowKind.UPDATE_AFTER, id, StringData.fromString(data)); - } - - public static DataFile writeFile( - Table table, - Schema schema, - PartitionSpec spec, - Configuration conf, - String location, - String filename, - List rows) - throws IOException { - return writeFile(table, schema, spec, conf, location, filename, rows, null); - } - - /** Write the list of {@link RowData} to the given path and with the given partition data */ - public static DataFile writeFile( - Table table, - Schema schema, - PartitionSpec spec, - Configuration conf, - String location, - String filename, - List rows, - StructLike partition) - throws IOException { - Path path = new Path(location, filename); - FileFormat fileFormat = FileFormat.fromFileName(filename); - Preconditions.checkNotNull(fileFormat, "Cannot determine format for file: %s", filename); - - RowType flinkSchema = FlinkSchemaUtil.convert(schema); - FileAppenderFactory appenderFactory = - new FlinkAppenderFactory( - table, schema, flinkSchema, ImmutableMap.of(), spec, null, null, null); - - FileAppender appender = appenderFactory.newAppender(fromPath(path, conf), fileFormat); - try (FileAppender closeableAppender = appender) { - closeableAppender.addAll(rows); - } - - DataFiles.Builder builder = - DataFiles.builder(spec) - .withInputFile(HadoopInputFile.fromPath(path, conf)) - .withMetrics(appender.metrics()); - - if (partition != null) { - builder = builder.withPartition(partition); - } - - return builder.build(); - } - - public static DeleteFile writeEqDeleteFile( - Table table, - FileFormat format, - String filename, - FileAppenderFactory appenderFactory, - List deletes) - throws IOException { - EncryptedOutputFile outputFile = - table - .encryption() - .encrypt(fromPath(new Path(table.location(), filename), new Configuration())); - - EqualityDeleteWriter eqWriter = - appenderFactory.newEqDeleteWriter(outputFile, format, null); - try (EqualityDeleteWriter writer = eqWriter) { - writer.write(deletes); - } - return eqWriter.toDeleteFile(); - } - - public static DeleteFile writePosDeleteFile( - Table table, - FileFormat format, - String filename, - FileAppenderFactory appenderFactory, - List> positions) - throws IOException { - EncryptedOutputFile outputFile = - table - .encryption() - .encrypt(fromPath(new Path(table.location(), filename), new Configuration())); - - PositionDeleteWriter posWriter = - appenderFactory.newPosDeleteWriter(outputFile, format, null); - PositionDelete posDelete = PositionDelete.create(); - try (PositionDeleteWriter writer = posWriter) { - for (Pair p : positions) { - writer.write(posDelete.set(p.first(), p.second(), null)); - } - } - return posWriter.toDeleteFile(); - } - - private static List convertToRecords(List rows) { - List records = Lists.newArrayList(); - 
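// ---------------------------------------------------------------------------------
// Editor's illustrative aside (not part of the deleted sources): a hedged sketch of
// how the SimpleDataUtil helpers are typically combined in the sink tests. The row
// values and the `table` handle are made-up placeholders, not taken from the patch:
//
//   List<RowData> expected =
//       Lists.newArrayList(
//           SimpleDataUtil.createRowData(1, "hello"),
//           SimpleDataUtil.createRowData(2, "world"));
//   // ... run a Flink job that writes `expected` into `table`, then verify:
//   SimpleDataUtil.assertTableRows(table, expected);
// ---------------------------------------------------------------------------------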
for (RowData row : rows) { - Integer id = row.isNullAt(0) ? null : row.getInt(0); - String data = row.isNullAt(1) ? null : row.getString(1).toString(); - records.add(createRecord(id, data)); - } - return records; - } - - public static void assertTableRows(String tablePath, List expected, String branch) - throws IOException { - assertTableRecords(tablePath, convertToRecords(expected), branch); - } - - public static void assertTableRows(Table table, List expected) throws IOException { - assertTableRecords(table, convertToRecords(expected), SnapshotRef.MAIN_BRANCH); - } - - public static void assertTableRows(Table table, List expected, String branch) - throws IOException { - assertTableRecords(table, convertToRecords(expected), branch); - } - - /** Get all rows for a table */ - public static List tableRecords(Table table) throws IOException { - table.refresh(); - List records = Lists.newArrayList(); - try (CloseableIterable iterable = IcebergGenerics.read(table).build()) { - for (Record record : iterable) { - records.add(record); - } - } - return records; - } - - public static boolean equalsRecords(List expected, List actual, Schema schema) { - if (expected.size() != actual.size()) { - return false; - } - Types.StructType type = schema.asStruct(); - StructLikeSet expectedSet = StructLikeSet.create(type); - expectedSet.addAll(expected); - StructLikeSet actualSet = StructLikeSet.create(type); - actualSet.addAll(actual); - return expectedSet.equals(actualSet); - } - - public static void assertRecordsEqual(List expected, List actual, Schema schema) { - assertThat(actual).hasSameSizeAs(expected); - Types.StructType type = schema.asStruct(); - StructLikeSet expectedSet = StructLikeSet.create(type); - expectedSet.addAll(expected); - StructLikeSet actualSet = StructLikeSet.create(type); - actualSet.addAll(actual); - assertThat(actualSet).containsExactlyInAnyOrderElementsOf(expectedSet); - } - - /** - * Assert table contains the expected list of records after waiting up to the configured {@code - * timeout} - */ - public static void assertTableRecords(Table table, List expected, Duration timeout) { - Awaitility.await("expected list of records should be produced") - .atMost(timeout) - .untilAsserted(() -> assertRecordsEqual(expected, tableRecords(table), table.schema())); - } - - public static void assertTableRecords(Table table, List expected) throws IOException { - assertTableRecords(table, expected, SnapshotRef.MAIN_BRANCH); - } - - public static void assertTableRecords(Table table, List expected, String branch) - throws IOException { - table.refresh(); - Snapshot snapshot = latestSnapshot(table, branch); - - if (snapshot == null) { - assertThat(expected).isEmpty(); - return; - } - - Types.StructType type = table.schema().asStruct(); - StructLikeSet expectedSet = StructLikeSet.create(type); - expectedSet.addAll(expected); - - try (CloseableIterable iterable = - IcebergGenerics.read(table).useSnapshot(snapshot.snapshotId()).build()) { - StructLikeSet actualSet = StructLikeSet.create(type); - - for (Record record : iterable) { - actualSet.add(record); - } - - assertThat(actualSet).containsExactlyInAnyOrderElementsOf(expectedSet); - } - } - - // Returns the latest snapshot of the given branch in the table - public static Snapshot latestSnapshot(Table table, String branch) { - // For the main branch, currentSnapshot() is used to validate that the API behavior has - // not changed since that was the API used for validation prior to addition of branches. 
- if (branch.equals(SnapshotRef.MAIN_BRANCH)) { - return table.currentSnapshot(); - } - - return table.snapshot(branch); - } - - public static void assertTableRecords(String tablePath, List expected) - throws IOException { - Preconditions.checkArgument(expected != null, "expected records shouldn't be null"); - assertTableRecords(new HadoopTables().load(tablePath), expected, SnapshotRef.MAIN_BRANCH); - } - - public static void assertTableRecords(String tablePath, List expected, String branch) - throws IOException { - Preconditions.checkArgument(expected != null, "expected records shouldn't be null"); - assertTableRecords(new HadoopTables().load(tablePath), expected, branch); - } - - public static StructLikeSet expectedRowSet(Table table, Record... records) { - StructLikeSet set = StructLikeSet.create(table.schema().asStruct()); - InternalRecordWrapper wrapper = new InternalRecordWrapper(table.schema().asStruct()); - for (Record record : records) { - set.add(wrapper.copyFor(record)); - } - return set; - } - - public static StructLikeSet actualRowSet(Table table, String... columns) throws IOException { - return actualRowSet(table, null, columns); - } - - public static StructLikeSet actualRowSet(Table table, Long snapshotId, String... columns) - throws IOException { - table.refresh(); - StructLikeSet set = StructLikeSet.create(table.schema().asStruct()); - InternalRecordWrapper wrapper = new InternalRecordWrapper(table.schema().asStruct()); - try (CloseableIterable reader = - IcebergGenerics.read(table) - .useSnapshot(snapshotId == null ? table.currentSnapshot().snapshotId() : snapshotId) - .select(columns) - .build()) { - reader.forEach(record -> set.add(wrapper.copyFor(record))); - } - return set; - } - - public static List partitionDataFiles(Table table, Map partitionValues) - throws IOException { - table.refresh(); - Types.StructType partitionType = table.spec().partitionType(); - - Record partitionRecord = GenericRecord.create(partitionType).copy(partitionValues); - StructLikeWrapper expectedWrapper = - StructLikeWrapper.forType(partitionType).set(partitionRecord); - - List dataFiles = Lists.newArrayList(); - try (CloseableIterable fileScanTasks = table.newScan().planFiles()) { - for (FileScanTask scanTask : fileScanTasks) { - StructLikeWrapper wrapper = - StructLikeWrapper.forType(partitionType).set(scanTask.file().partition()); - - if (expectedWrapper.equals(wrapper)) { - dataFiles.add(scanTask.file()); - } - } - } - - return dataFiles; - } - - public static Map> snapshotToDataFiles(Table table) throws IOException { - table.refresh(); - - Map> result = Maps.newHashMap(); - Snapshot current = table.currentSnapshot(); - while (current != null) { - TableScan tableScan = table.newScan(); - if (current.parentId() != null) { - // Collect the data files that was added only in current snapshot. - tableScan = tableScan.appendsBetween(current.parentId(), current.snapshotId()); - } else { - // Collect the data files that was added in the oldest snapshot. - tableScan = tableScan.useSnapshot(current.snapshotId()); - } - try (CloseableIterable scanTasks = tableScan.planFiles()) { - result.put( - current.snapshotId(), - ImmutableList.copyOf(Iterables.transform(scanTasks, FileScanTask::file))); - } - - // Continue to traverse the parent snapshot if exists. - if (current.parentId() == null) { - break; - } - // Iterate to the parent snapshot. 
- current = table.snapshot(current.parentId()); - } - return result; - } - - public static List matchingPartitions( - List dataFiles, PartitionSpec partitionSpec, Map partitionValues) { - Types.StructType partitionType = partitionSpec.partitionType(); - Record partitionRecord = GenericRecord.create(partitionType).copy(partitionValues); - StructLikeWrapper expected = StructLikeWrapper.forType(partitionType).set(partitionRecord); - return dataFiles.stream() - .filter( - df -> { - StructLikeWrapper wrapper = - StructLikeWrapper.forType(partitionType).set(df.partition()); - return wrapper.equals(expected); - }) - .collect(Collectors.toList()); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestBase.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestBase.java deleted file mode 100644 index a74226092f38..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestBase.java +++ /dev/null @@ -1,128 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.nio.file.Path; -import java.util.List; -import org.apache.flink.table.api.EnvironmentSettings; -import org.apache.flink.table.api.TableEnvironment; -import org.apache.flink.table.api.TableResult; -import org.apache.flink.test.junit5.MiniClusterExtension; -import org.apache.flink.test.util.TestBaseUtils; -import org.apache.flink.types.Row; -import org.apache.flink.util.CloseableIterator; -import org.apache.hadoop.hive.conf.HiveConf; -import org.apache.iceberg.CatalogUtil; -import org.apache.iceberg.hive.HiveCatalog; -import org.apache.iceberg.hive.TestHiveMetastore; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.junit.jupiter.api.AfterAll; -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.extension.RegisterExtension; -import org.junit.jupiter.api.io.TempDir; - -public abstract class TestBase extends TestBaseUtils { - - @RegisterExtension - public static MiniClusterExtension miniClusterExtension = - MiniFlinkClusterExtension.createWithClassloaderCheckDisabled(); - - @TempDir protected Path temporaryDirectory; - - private static TestHiveMetastore metastore = null; - protected static HiveConf hiveConf = null; - protected static HiveCatalog catalog = null; - - private volatile TableEnvironment tEnv = null; - - @BeforeAll - public static void startMetastore() { - TestBase.metastore = new TestHiveMetastore(); - metastore.start(); - TestBase.hiveConf = metastore.hiveConf(); - TestBase.catalog = - (HiveCatalog) - CatalogUtil.loadCatalog( - HiveCatalog.class.getName(), "hive", ImmutableMap.of(), hiveConf); - } - - @AfterAll - public static void stopMetastore() throws Exception { - metastore.stop(); - TestBase.catalog = null; - } - - protected TableEnvironment getTableEnv() { - if (tEnv == null) { - synchronized (this) { - if (tEnv == null) { - EnvironmentSettings settings = EnvironmentSettings.newInstance().inBatchMode().build(); - - TableEnvironment env = TableEnvironment.create(settings); - env.getConfig() - .getConfiguration() - .set(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM, false); - tEnv = env; - } - } - } - return tEnv; - } - - protected static TableResult exec(TableEnvironment env, String query, Object... args) { - return env.executeSql(String.format(query, args)); - } - - protected TableResult exec(String query, Object... args) { - return exec(getTableEnv(), query, args); - } - - protected List sql(String query, Object... args) { - TableResult tableResult = exec(query, args); - try (CloseableIterator iter = tableResult.collect()) { - return Lists.newArrayList(iter); - } catch (Exception e) { - throw new RuntimeException("Failed to collect table result", e); - } - } - - protected void assertSameElements(Iterable expected, Iterable actual) { - assertThat(actual).isNotNull().containsExactlyInAnyOrderElementsOf(expected); - } - - protected void assertSameElements(String message, Iterable expected, Iterable actual) { - assertThat(actual).isNotNull().as(message).containsExactlyInAnyOrderElementsOf(expected); - } - - /** - * We can not drop currently used catalog after FLINK-29677, so we have make sure that we do not - * use the current catalog before dropping it. This method switches to the 'default_catalog' and - * drops the one requested. 
- * - * @param catalogName The catalog to drop - * @param ifExists If we should use the 'IF EXISTS' when dropping the catalog - */ - protected void dropCatalog(String catalogName, boolean ifExists) { - sql("USE CATALOG default_catalog"); - sql("DROP CATALOG %s %s", ifExists ? "IF EXISTS" : "", catalogName); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestCatalogLoader.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestCatalogLoader.java deleted file mode 100644 index e8f65921c19a..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestCatalogLoader.java +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import static org.apache.iceberg.CatalogProperties.URI; -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.entry; - -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; -import java.io.File; -import java.io.IOException; -import java.io.ObjectInputStream; -import java.io.ObjectOutputStream; -import java.util.Map; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.iceberg.CatalogProperties; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.hadoop.HadoopFileIO; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.types.Types; -import org.junit.jupiter.api.AfterAll; -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.Test; - -/** Test for {@link CatalogLoader}. 
*/ -public class TestCatalogLoader extends TestBase { - - private static File warehouse = null; - private static final TableIdentifier IDENTIFIER = TableIdentifier.of("default", "my_table"); - private static final Schema SCHEMA = - new Schema(Types.NestedField.required(1, "f1", Types.StringType.get())); - - @BeforeAll - public static void createWarehouse() throws IOException { - warehouse = File.createTempFile("warehouse", null); - assertThat(warehouse.delete()).isTrue(); - hiveConf.set("my_key", "my_value"); - } - - @AfterAll - public static void dropWarehouse() throws IOException { - if (warehouse != null && warehouse.exists()) { - Path warehousePath = new Path(warehouse.getAbsolutePath()); - FileSystem fs = warehousePath.getFileSystem(hiveConf); - assertThat(fs.delete(warehousePath, true)).as("Failed to delete " + warehousePath).isTrue(); - } - } - - @Test - public void testHadoopCatalogLoader() throws IOException, ClassNotFoundException { - Map properties = Maps.newHashMap(); - properties.put(CatalogProperties.WAREHOUSE_LOCATION, "file:" + warehouse); - CatalogLoader loader = CatalogLoader.hadoop("my_catalog", hiveConf, properties); - validateCatalogLoader(loader); - } - - @Test - public void testHiveCatalogLoader() throws IOException, ClassNotFoundException { - CatalogLoader loader = CatalogLoader.hive("my_catalog", hiveConf, Maps.newHashMap()); - validateCatalogLoader(loader); - } - - @Test - public void testRESTCatalogLoader() { - Map properties = Maps.newHashMap(); - properties.put(URI, "http://localhost/"); - CatalogLoader.rest("my_catalog", hiveConf, Maps.newHashMap()); - } - - private static void validateCatalogLoader(CatalogLoader loader) - throws IOException, ClassNotFoundException { - Table table = javaSerAndDeSer(loader).loadCatalog().createTable(IDENTIFIER, SCHEMA); - validateHadoopConf(table); - } - - private static void validateHadoopConf(Table table) { - FileIO io = table.io(); - assertThat(io).as("FileIO should be a HadoopFileIO").isInstanceOf(HadoopFileIO.class); - HadoopFileIO hadoopIO = (HadoopFileIO) io; - assertThat(hadoopIO.conf()).contains(entry("my_key", "my_value")); - } - - @SuppressWarnings("unchecked") - private static T javaSerAndDeSer(T object) throws IOException, ClassNotFoundException { - ByteArrayOutputStream bytes = new ByteArrayOutputStream(); - try (ObjectOutputStream out = new ObjectOutputStream(bytes)) { - out.writeObject(object); - } - - try (ObjectInputStream in = - new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) { - return (T) in.readObject(); - } - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestCatalogTableLoader.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestCatalogTableLoader.java deleted file mode 100644 index f719c7bc0001..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestCatalogTableLoader.java +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; -import java.io.File; -import java.io.IOException; -import java.io.ObjectInputStream; -import java.io.ObjectOutputStream; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.hadoop.HadoopFileIO; -import org.apache.iceberg.hadoop.HadoopTables; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.types.Types; -import org.junit.jupiter.api.AfterAll; -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.Test; - -/** Test for {@link TableLoader}. */ -public class TestCatalogTableLoader extends TestBase { - - private static File warehouse = null; - private static final TableIdentifier IDENTIFIER = TableIdentifier.of("default", "my_table"); - private static final Schema SCHEMA = - new Schema(Types.NestedField.required(1, "f1", Types.StringType.get())); - - @BeforeAll - public static void createWarehouse() throws IOException { - warehouse = File.createTempFile("warehouse", null); - assertThat(warehouse.delete()).isTrue(); - hiveConf.set("my_key", "my_value"); - } - - @AfterAll - public static void dropWarehouse() throws IOException { - if (warehouse != null && warehouse.exists()) { - Path warehousePath = new Path(warehouse.getAbsolutePath()); - FileSystem fs = warehousePath.getFileSystem(hiveConf); - assertThat(fs.delete(warehousePath, true)).as("Failed to delete " + warehousePath).isTrue(); - } - } - - @Test - public void testHadoopTableLoader() throws IOException, ClassNotFoundException { - String location = "file:" + warehouse + "/my_table"; - new HadoopTables(hiveConf).create(SCHEMA, location); - validateTableLoader(TableLoader.fromHadoopTable(location, hiveConf)); - } - - @Test - public void testHiveCatalogTableLoader() throws IOException, ClassNotFoundException { - CatalogLoader loader = CatalogLoader.hive("my_catalog", hiveConf, Maps.newHashMap()); - javaSerdes(loader).loadCatalog().createTable(IDENTIFIER, SCHEMA); - - CatalogLoader catalogLoader = CatalogLoader.hive("my_catalog", hiveConf, Maps.newHashMap()); - validateTableLoader(TableLoader.fromCatalog(catalogLoader, IDENTIFIER)); - } - - private static void validateTableLoader(TableLoader loader) - throws IOException, ClassNotFoundException { - TableLoader copied = javaSerdes(loader); - copied.open(); - try { - validateHadoopConf(copied.loadTable()); - } finally { - copied.close(); - } - } - - private static void validateHadoopConf(Table table) { - FileIO io = table.io(); - assertThat(io).as("FileIO should be a HadoopFileIO").isInstanceOf(HadoopFileIO.class); - HadoopFileIO hadoopIO = (HadoopFileIO) io; - assertThat(hadoopIO.conf().get("my_key")).isEqualTo("my_value"); - } - - @SuppressWarnings("unchecked") - private static T javaSerdes(T object) throws IOException, 
ClassNotFoundException { - ByteArrayOutputStream bytes = new ByteArrayOutputStream(); - try (ObjectOutputStream out = new ObjectOutputStream(bytes)) { - out.writeObject(object); - } - - try (ObjectInputStream in = - new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) { - return (T) in.readObject(); - } - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestChangeLogTable.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestChangeLogTable.java deleted file mode 100644 index 7fc6ab82490d..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestChangeLogTable.java +++ /dev/null @@ -1,296 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.File; -import java.io.IOException; -import java.util.List; -import org.apache.flink.types.Row; -import org.apache.hadoop.conf.Configuration; -import org.apache.iceberg.BaseTable; -import org.apache.iceberg.CatalogProperties; -import org.apache.iceberg.Parameter; -import org.apache.iceberg.ParameterizedTestExtension; -import org.apache.iceberg.Parameters; -import org.apache.iceberg.Snapshot; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableMetadata; -import org.apache.iceberg.TableOperations; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.source.BoundedTableFactory; -import org.apache.iceberg.flink.source.ChangeLogTableTestBase; -import org.apache.iceberg.relocated.com.google.common.base.Joiner; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.util.StructLikeSet; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.TestTemplate; -import org.junit.jupiter.api.extension.ExtendWith; - -/** - * In this test case, we mainly cover the impact of primary key selection, multiple operations - * within a single transaction, and multiple operations between different txn on the correctness of - * the data. 
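The per-checkpoint changelog input in the tests below is built with the insertRow/deleteRow/updateBeforeRow/updateAfterRow helpers inherited from ChangeLogTableTestBase, which is not part of this hunk. A minimal sketch of such helpers, assuming they are thin wrappers around Flink's Row.ofKind, would be:

  import org.apache.flink.types.Row;
  import org.apache.flink.types.RowKind;

  // Hypothetical equivalents of the ChangeLogTableTestBase row helpers used below.
  final class ChangeLogRows {
    private ChangeLogRows() {}

    static Row insertRow(Object... fields) {
      return Row.ofKind(RowKind.INSERT, fields);
    }

    static Row deleteRow(Object... fields) {
      return Row.ofKind(RowKind.DELETE, fields);
    }

    static Row updateBeforeRow(Object... fields) {
      return Row.ofKind(RowKind.UPDATE_BEFORE, fields);
    }

    static Row updateAfterRow(Object... fields) {
      return Row.ofKind(RowKind.UPDATE_AFTER, fields);
    }
  }

Each helper tags a row with the RowKind that downstream operators use to distinguish inserts, deletes, and the before/after images of an update; the expected snapshots in the tests are derived by replaying those kinds against the chosen primary key.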
- */ -@ExtendWith(ParameterizedTestExtension.class) -public class TestChangeLogTable extends ChangeLogTableTestBase { - private static final Configuration CONF = new Configuration(); - private static final String SOURCE_TABLE = "default_catalog.default_database.source_change_logs"; - - private static final String CATALOG_NAME = "test_catalog"; - private static final String DATABASE_NAME = "test_db"; - private static final String TABLE_NAME = "test_table"; - private static String warehouse; - - @Parameter private boolean partitioned; - - @Parameters(name = "PartitionedTable={0}") - public static Iterable parameters() { - return ImmutableList.of(new Object[] {true}, new Object[] {false}); - } - - @BeforeEach - public void before() throws IOException { - File warehouseFile = File.createTempFile("junit", null, temporaryDirectory.toFile()); - assertThat(warehouseFile.delete()).isTrue(); - warehouse = String.format("file:%s", warehouseFile); - - sql( - "CREATE CATALOG %s WITH ('type'='iceberg', 'catalog-type'='hadoop', 'warehouse'='%s')", - CATALOG_NAME, warehouse); - sql("USE CATALOG %s", CATALOG_NAME); - sql("CREATE DATABASE %s", DATABASE_NAME); - sql("USE %s", DATABASE_NAME); - // Set the table.exec.sink.upsert-materialize=NONE, so that downstream operators will receive - // the - // records with the same order as the source operator, bypassing Flink's inferred shuffle. - getTableEnv().getConfig().set("table.exec.sink.upsert-materialize", "NONE"); - } - - @AfterEach - @Override - public void clean() { - sql("DROP TABLE IF EXISTS %s", TABLE_NAME); - sql("DROP DATABASE IF EXISTS %s", DATABASE_NAME); - dropCatalog(CATALOG_NAME, true); - BoundedTableFactory.clearDataSets(); - } - - @TestTemplate - public void testSqlChangeLogOnIdKey() throws Exception { - List> inputRowsPerCheckpoint = - ImmutableList.of( - ImmutableList.of( - insertRow(1, "aaa"), - deleteRow(1, "aaa"), - insertRow(1, "bbb"), - insertRow(2, "aaa"), - deleteRow(2, "aaa"), - insertRow(2, "bbb")), - ImmutableList.of( - updateBeforeRow(2, "bbb"), - updateAfterRow(2, "ccc"), - deleteRow(2, "ccc"), - insertRow(2, "ddd")), - ImmutableList.of( - deleteRow(1, "bbb"), - insertRow(1, "ccc"), - deleteRow(1, "ccc"), - insertRow(1, "ddd"))); - - List> expectedRecordsPerCheckpoint = - ImmutableList.of( - ImmutableList.of(insertRow(1, "bbb"), insertRow(2, "bbb")), - ImmutableList.of(insertRow(1, "bbb"), insertRow(2, "ddd")), - ImmutableList.of(insertRow(1, "ddd"), insertRow(2, "ddd"))); - - testSqlChangeLog( - TABLE_NAME, ImmutableList.of("id"), inputRowsPerCheckpoint, expectedRecordsPerCheckpoint); - } - - @TestTemplate - public void testChangeLogOnDataKey() throws Exception { - List> elementsPerCheckpoint = - ImmutableList.of( - ImmutableList.of( - insertRow(1, "aaa"), - deleteRow(1, "aaa"), - insertRow(2, "bbb"), - insertRow(1, "bbb"), - insertRow(2, "aaa")), - ImmutableList.of( - updateBeforeRow(2, "aaa"), updateAfterRow(1, "ccc"), insertRow(1, "aaa")), - ImmutableList.of(deleteRow(1, "bbb"), insertRow(2, "aaa"), insertRow(2, "ccc"))); - - List> expectedRecords = - ImmutableList.of( - ImmutableList.of(insertRow(1, "bbb"), insertRow(2, "aaa")), - ImmutableList.of(insertRow(1, "aaa"), insertRow(1, "bbb"), insertRow(1, "ccc")), - ImmutableList.of( - insertRow(1, "aaa"), - insertRow(1, "ccc"), - insertRow(2, "aaa"), - insertRow(2, "ccc"))); - - testSqlChangeLog(TABLE_NAME, ImmutableList.of("data"), elementsPerCheckpoint, expectedRecords); - } - - @TestTemplate - public void testChangeLogOnIdDataKey() throws Exception { - List> 
elementsPerCheckpoint = - ImmutableList.of( - ImmutableList.of( - insertRow(1, "aaa"), - deleteRow(1, "aaa"), - insertRow(2, "bbb"), - insertRow(1, "bbb"), - insertRow(2, "aaa")), - ImmutableList.of( - updateBeforeRow(2, "aaa"), updateAfterRow(1, "ccc"), insertRow(1, "aaa")), - ImmutableList.of(deleteRow(1, "bbb"), insertRow(2, "aaa"))); - - List> expectedRecords = - ImmutableList.of( - ImmutableList.of(insertRow(1, "bbb"), insertRow(2, "aaa"), insertRow(2, "bbb")), - ImmutableList.of( - insertRow(1, "aaa"), insertRow(1, "bbb"), insertRow(1, "ccc"), insertRow(2, "bbb")), - ImmutableList.of( - insertRow(1, "aaa"), - insertRow(1, "ccc"), - insertRow(2, "aaa"), - insertRow(2, "bbb"))); - - testSqlChangeLog( - TABLE_NAME, ImmutableList.of("data", "id"), elementsPerCheckpoint, expectedRecords); - } - - @TestTemplate - public void testPureInsertOnIdKey() throws Exception { - List> elementsPerCheckpoint = - ImmutableList.of( - ImmutableList.of(insertRow(1, "aaa"), insertRow(2, "bbb")), - ImmutableList.of(insertRow(3, "ccc"), insertRow(4, "ddd")), - ImmutableList.of(insertRow(5, "eee"), insertRow(6, "fff"))); - - List> expectedRecords = - ImmutableList.of( - ImmutableList.of(insertRow(1, "aaa"), insertRow(2, "bbb")), - ImmutableList.of( - insertRow(1, "aaa"), insertRow(2, "bbb"), insertRow(3, "ccc"), insertRow(4, "ddd")), - ImmutableList.of( - insertRow(1, "aaa"), - insertRow(2, "bbb"), - insertRow(3, "ccc"), - insertRow(4, "ddd"), - insertRow(5, "eee"), - insertRow(6, "fff"))); - - testSqlChangeLog(TABLE_NAME, ImmutableList.of("data"), elementsPerCheckpoint, expectedRecords); - } - - private static Record record(int id, String data) { - return SimpleDataUtil.createRecord(id, data); - } - - private Table createTable(String tableName, List key, boolean isPartitioned) { - String partitionByCause = isPartitioned ? "PARTITIONED BY (data)" : ""; - sql( - "CREATE TABLE %s(id INT, data VARCHAR, PRIMARY KEY(%s) NOT ENFORCED) %s", - tableName, Joiner.on(',').join(key), partitionByCause); - - // Upgrade the iceberg table to format v2. 
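  // (The changelog writes in these tests produce position/equality delete files, and
  // row-level deletes are only supported by Iceberg format v2, so the table created
  // above is explicitly upgraded before any data is inserted.)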
- CatalogLoader loader = - CatalogLoader.hadoop( - "my_catalog", CONF, ImmutableMap.of(CatalogProperties.WAREHOUSE_LOCATION, warehouse)); - Table table = loader.loadCatalog().loadTable(TableIdentifier.of(DATABASE_NAME, TABLE_NAME)); - TableOperations ops = ((BaseTable) table).operations(); - TableMetadata meta = ops.current(); - ops.commit(meta, meta.upgradeToFormatVersion(2)); - - return table; - } - - private void testSqlChangeLog( - String tableName, - List key, - List> inputRowsPerCheckpoint, - List> expectedRecordsPerCheckpoint) - throws Exception { - String dataId = BoundedTableFactory.registerDataSet(inputRowsPerCheckpoint); - sql( - "CREATE TABLE %s(id INT NOT NULL, data STRING NOT NULL)" - + " WITH ('connector'='BoundedSource', 'data-id'='%s')", - SOURCE_TABLE, dataId); - - assertThat(sql("SELECT * FROM %s", SOURCE_TABLE)).isEqualTo(listJoin(inputRowsPerCheckpoint)); - - Table table = createTable(tableName, key, partitioned); - sql("INSERT INTO %s SELECT * FROM %s", tableName, SOURCE_TABLE); - - table.refresh(); - List snapshots = findValidSnapshots(table); - int expectedSnapshotNum = expectedRecordsPerCheckpoint.size(); - assertThat(snapshots) - .as("Should have the expected snapshot number") - .hasSameSizeAs(expectedRecordsPerCheckpoint); - - for (int i = 0; i < expectedSnapshotNum; i++) { - long snapshotId = snapshots.get(i).snapshotId(); - List expectedRows = expectedRecordsPerCheckpoint.get(i); - assertThat(actualRowSet(table, snapshotId)) - .as("Should have the expected records for the checkpoint#" + i) - .isEqualTo(expectedRowSet(table, expectedRows)); - } - - if (expectedSnapshotNum > 0) { - assertThat(sql("SELECT * FROM %s", tableName)) - .as("Should have the expected rows in the final table") - .containsExactlyInAnyOrderElementsOf( - expectedRecordsPerCheckpoint.get(expectedSnapshotNum - 1)); - } - } - - private List findValidSnapshots(Table table) { - List validSnapshots = Lists.newArrayList(); - for (Snapshot snapshot : table.snapshots()) { - if (snapshot.allManifests(table.io()).stream() - .anyMatch(m -> snapshot.snapshotId() == m.snapshotId())) { - validSnapshots.add(snapshot); - } - } - return validSnapshots; - } - - private static StructLikeSet expectedRowSet(Table table, List rows) { - Record[] records = new Record[rows.size()]; - for (int i = 0; i < records.length; i++) { - records[i] = record((int) rows.get(i).getField(0), (String) rows.get(i).getField(1)); - } - return SimpleDataUtil.expectedRowSet(table, records); - } - - private static StructLikeSet actualRowSet(Table table, long snapshotId) throws IOException { - return SimpleDataUtil.actualRowSet(table, snapshotId, "*"); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestDataFileSerialization.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestDataFileSerialization.java deleted file mode 100644 index 8992cbd75187..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestDataFileSerialization.java +++ /dev/null @@ -1,203 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; -import java.io.IOException; -import java.io.ObjectInputStream; -import java.io.ObjectOutputStream; -import java.nio.ByteBuffer; -import java.nio.ByteOrder; -import java.util.Map; -import org.apache.flink.api.common.ExecutionConfig; -import org.apache.flink.api.java.typeutils.runtime.kryo.KryoSerializer; -import org.apache.flink.core.memory.DataInputDeserializer; -import org.apache.flink.core.memory.DataOutputSerializer; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.DataFiles; -import org.apache.iceberg.DeleteFile; -import org.apache.iceberg.FileMetadata; -import org.apache.iceberg.Metrics; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.SortOrder; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.types.Types; -import org.junit.jupiter.api.Test; - -public class TestDataFileSerialization { - - private static final Schema DATE_SCHEMA = - new Schema( - required(1, "id", Types.LongType.get()), - optional(2, "data", Types.StringType.get()), - required(3, "date", Types.StringType.get()), - optional(4, "double", Types.DoubleType.get())); - - private static final PartitionSpec PARTITION_SPEC = - PartitionSpec.builderFor(DATE_SCHEMA).identity("date").build(); - - private static final Map COLUMN_SIZES = Maps.newHashMap(); - private static final Map VALUE_COUNTS = Maps.newHashMap(); - private static final Map NULL_VALUE_COUNTS = Maps.newHashMap(); - private static final Map NAN_VALUE_COUNTS = Maps.newHashMap(); - private static final Map LOWER_BOUNDS = Maps.newHashMap(); - private static final Map UPPER_BOUNDS = Maps.newHashMap(); - - static { - COLUMN_SIZES.put(1, 2L); - COLUMN_SIZES.put(2, 3L); - VALUE_COUNTS.put(1, 5L); - VALUE_COUNTS.put(2, 3L); - VALUE_COUNTS.put(4, 2L); - NULL_VALUE_COUNTS.put(1, 0L); - NULL_VALUE_COUNTS.put(2, 2L); - NAN_VALUE_COUNTS.put(4, 1L); - LOWER_BOUNDS.put(1, longToBuffer(0L)); - UPPER_BOUNDS.put(1, longToBuffer(4L)); - } - - private static final Metrics METRICS = - new Metrics( - 5L, null, VALUE_COUNTS, NULL_VALUE_COUNTS, NAN_VALUE_COUNTS, LOWER_BOUNDS, UPPER_BOUNDS); - - private static final DataFile DATA_FILE = - DataFiles.builder(PARTITION_SPEC) - .withPath("/path/to/data-1.parquet") - .withFileSizeInBytes(1234) - .withPartitionPath("date=2018-06-08") - .withMetrics(METRICS) - .withSplitOffsets(ImmutableList.of(4L)) - .withEncryptionKeyMetadata(ByteBuffer.allocate(4).putInt(34)) - .withSortOrder(SortOrder.unsorted()) - .build(); - - private static final DeleteFile POS_DELETE_FILE = - FileMetadata.deleteFileBuilder(PARTITION_SPEC) - .ofPositionDeletes() - .withPath("/path/to/pos-delete.parquet") - .withFileSizeInBytes(10) - 
.withPartitionPath("date=2018-06-08") - .withMetrics(METRICS) - .withEncryptionKeyMetadata(ByteBuffer.allocate(4).putInt(35)) - .withRecordCount(23) - .build(); - - private static final DeleteFile EQ_DELETE_FILE = - FileMetadata.deleteFileBuilder(PARTITION_SPEC) - .ofEqualityDeletes(2, 3) - .withPath("/path/to/equality-delete.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("date=2018-06-08") - .withMetrics(METRICS) - .withEncryptionKeyMetadata(ByteBuffer.allocate(4).putInt(35)) - .withRecordCount(23) - .withSortOrder(SortOrder.unsorted()) - .build(); - - @Test - public void testJavaSerialization() throws Exception { - ByteArrayOutputStream bytes = new ByteArrayOutputStream(); - try (ObjectOutputStream out = new ObjectOutputStream(bytes)) { - out.writeObject(DATA_FILE); - out.writeObject(DATA_FILE.copy()); - - out.writeObject(POS_DELETE_FILE); - out.writeObject(POS_DELETE_FILE.copy()); - - out.writeObject(EQ_DELETE_FILE); - out.writeObject(EQ_DELETE_FILE.copy()); - } - - try (ObjectInputStream in = - new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) { - for (int i = 0; i < 2; i += 1) { - Object obj = in.readObject(); - assertThat(obj).as("Should be a DataFile").isInstanceOf(DataFile.class); - TestHelpers.assertEquals(DATA_FILE, (DataFile) obj); - } - - for (int i = 0; i < 2; i += 1) { - Object obj = in.readObject(); - assertThat(obj).as("Should be a position DeleteFile").isInstanceOf(DeleteFile.class); - TestHelpers.assertEquals(POS_DELETE_FILE, (DeleteFile) obj); - } - - for (int i = 0; i < 2; i += 1) { - Object obj = in.readObject(); - assertThat(obj).as("Should be a equality DeleteFile").isInstanceOf(DeleteFile.class); - TestHelpers.assertEquals(EQ_DELETE_FILE, (DeleteFile) obj); - } - } - } - - @Test - public void testDataFileKryoSerialization() throws IOException { - KryoSerializer kryo = new KryoSerializer<>(DataFile.class, new ExecutionConfig()); - - DataOutputSerializer outputView = new DataOutputSerializer(1024); - - kryo.serialize(DATA_FILE, outputView); - kryo.serialize(DATA_FILE.copy(), outputView); - - DataInputDeserializer inputView = new DataInputDeserializer(outputView.getCopyOfBuffer()); - DataFile dataFile1 = kryo.deserialize(inputView); - DataFile dataFile2 = kryo.deserialize(inputView); - - TestHelpers.assertEquals(DATA_FILE, dataFile1); - TestHelpers.assertEquals(DATA_FILE, dataFile2); - } - - @Test - public void testDeleteFileKryoSerialization() throws IOException { - KryoSerializer kryo = new KryoSerializer<>(DeleteFile.class, new ExecutionConfig()); - - DataOutputSerializer outputView = new DataOutputSerializer(1024); - - kryo.serialize(POS_DELETE_FILE, outputView); - kryo.serialize(POS_DELETE_FILE.copy(), outputView); - - kryo.serialize(EQ_DELETE_FILE, outputView); - kryo.serialize(EQ_DELETE_FILE.copy(), outputView); - - DataInputDeserializer inputView = new DataInputDeserializer(outputView.getCopyOfBuffer()); - - DeleteFile posDeleteFile1 = kryo.deserialize(inputView); - DeleteFile posDeleteFile2 = kryo.deserialize(inputView); - - TestHelpers.assertEquals(POS_DELETE_FILE, posDeleteFile1); - TestHelpers.assertEquals(POS_DELETE_FILE, posDeleteFile2); - - DeleteFile eqDeleteFile1 = kryo.deserialize(inputView); - DeleteFile eqDeleteFile2 = kryo.deserialize(inputView); - - TestHelpers.assertEquals(EQ_DELETE_FILE, eqDeleteFile1); - TestHelpers.assertEquals(EQ_DELETE_FILE, eqDeleteFile2); - } - - private static ByteBuffer longToBuffer(long value) { - return ByteBuffer.allocate(8).order(ByteOrder.LITTLE_ENDIAN).putLong(0, value); - } -} diff 
--git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestFixtures.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestFixtures.java deleted file mode 100644 index b9a7d5b1d589..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestFixtures.java +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import static org.apache.iceberg.types.Types.NestedField.required; - -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.types.Types; - -public class TestFixtures { - - private TestFixtures() {} - - public static final Schema SCHEMA = - new Schema( - required(1, "data", Types.StringType.get()), - required(2, "id", Types.LongType.get()), - required(3, "dt", Types.StringType.get())); - - public static final PartitionSpec SPEC = - PartitionSpec.builderFor(SCHEMA).identity("dt").bucket("id", 1).build(); - - public static final RowType ROW_TYPE = FlinkSchemaUtil.convert(SCHEMA); - - public static final String DATABASE = "default"; - public static final String TABLE = "t"; - public static final String SINK_TABLE = "t_sink"; - - public static final TableIdentifier TABLE_IDENTIFIER = TableIdentifier.of(DATABASE, TABLE); - public static final TableIdentifier SINK_TABLE_IDENTIFIER = - TableIdentifier.of(DATABASE, SINK_TABLE); - - public static final Schema TS_SCHEMA = - new Schema( - required(1, "ts", Types.TimestampType.withoutZone()), - required(2, "str", Types.StringType.get())); - - public static final PartitionSpec TS_SPEC = - PartitionSpec.builderFor(TS_SCHEMA).hour("ts").build(); - - public static final RowType TS_ROW_TYPE = FlinkSchemaUtil.convert(TS_SCHEMA); -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestFlinkAnonymousTable.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestFlinkAnonymousTable.java deleted file mode 100644 index 70c8043f8fbb..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestFlinkAnonymousTable.java +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.File; -import java.nio.file.Files; -import java.util.concurrent.TimeUnit; -import org.apache.flink.table.api.DataTypes; -import org.apache.flink.table.api.Schema; -import org.apache.flink.table.api.Table; -import org.apache.flink.table.api.TableDescriptor; -import org.apache.flink.table.api.TableEnvironment; -import org.awaitility.Awaitility; -import org.junit.jupiter.api.Test; - -public class TestFlinkAnonymousTable extends TestBase { - - @Test - public void testWriteAnonymousTable() throws Exception { - File warehouseDir = Files.createTempDirectory(temporaryDirectory, "junit").toFile(); - TableEnvironment tEnv = getTableEnv(); - Table table = - tEnv.from( - TableDescriptor.forConnector("datagen") - .schema(Schema.newBuilder().column("f0", DataTypes.STRING()).build()) - .option("number-of-rows", "3") - .build()); - - TableDescriptor descriptor = - TableDescriptor.forConnector("iceberg") - .schema(Schema.newBuilder().column("f0", DataTypes.STRING()).build()) - .option("catalog-name", "hadoop_test") - .option("catalog-type", "hadoop") - .option("catalog-database", "test_db") - .option("catalog-table", "test") - .option("warehouse", warehouseDir.getAbsolutePath()) - .build(); - - table.insertInto(descriptor).execute(); - Awaitility.await() - .atMost(3, TimeUnit.SECONDS) - .untilAsserted( - () -> - assertThat(warehouseDir.toPath().resolve("test_db").resolve("test").toFile()) - .exists()); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogDatabase.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogDatabase.java deleted file mode 100644 index 6850060e8fc8..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogDatabase.java +++ /dev/null @@ -1,253 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink; - -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; -import static org.assertj.core.api.Assumptions.assumeThat; - -import java.nio.file.Path; -import java.util.List; -import java.util.Map; -import java.util.Objects; -import org.apache.flink.table.catalog.exceptions.DatabaseNotEmptyException; -import org.apache.flink.types.Row; -import org.apache.iceberg.Schema; -import org.apache.iceberg.catalog.Namespace; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.types.Types; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.TestTemplate; - -public class TestFlinkCatalogDatabase extends CatalogTestBase { - - @AfterEach - @Override - public void clean() { - sql("DROP TABLE IF EXISTS %s.tl", flinkDatabase); - sql("DROP DATABASE IF EXISTS %s", flinkDatabase); - super.clean(); - } - - @TestTemplate - public void testCreateNamespace() { - assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) - .as("Database should not already exist") - .isFalse(); - - sql("CREATE DATABASE %s", flinkDatabase); - assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) - .as("Database should exist") - .isTrue(); - - sql("CREATE DATABASE IF NOT EXISTS %s", flinkDatabase); - assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) - .as("Database should still exist") - .isTrue(); - - sql("DROP DATABASE IF EXISTS %s", flinkDatabase); - assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) - .as("Database should be dropped") - .isFalse(); - - sql("CREATE DATABASE IF NOT EXISTS %s", flinkDatabase); - assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) - .as("Database should be created") - .isTrue(); - } - - @TestTemplate - public void testDropEmptyDatabase() { - assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) - .as("Namespace should not already exist") - .isFalse(); - sql("CREATE DATABASE %s", flinkDatabase); - assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) - .as("Namespace should exist") - .isTrue(); - sql("DROP DATABASE %s", flinkDatabase); - assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) - .as("Namespace should have been dropped") - .isFalse(); - } - - @TestTemplate - public void testDropNonEmptyNamespace() { - assumeThat(isHadoopCatalog) - .as("Hadoop catalog throws IOException: Directory is not empty.") - .isFalse(); - assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) - .as("Namespace should not already exist") - .isFalse(); - sql("CREATE DATABASE %s", flinkDatabase); - validationCatalog.createTable( - TableIdentifier.of(icebergNamespace, "tl"), - new Schema(Types.NestedField.optional(0, "id", Types.LongType.get()))); - assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) - .as("Namespace should exist") - .isTrue(); - assertThat(validationCatalog.tableExists(TableIdentifier.of(icebergNamespace, "tl"))) - .as("Table should exist") - .isTrue(); - assertThatThrownBy(() -> sql("DROP DATABASE %s", flinkDatabase)) - .cause() - .isInstanceOf(DatabaseNotEmptyException.class) - .hasMessage( - String.format("Database %s in catalog %s is not empty.", DATABASE, catalogName)); - sql("DROP TABLE %s.tl", flinkDatabase); - } - - @TestTemplate - public void testListTables() { - assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) - .as("Namespace 
should not already exist") - .isFalse(); - sql("CREATE DATABASE %s", flinkDatabase); - sql("USE CATALOG %s", catalogName); - sql("USE %s", DATABASE); - assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) - .as("Namespace should exist") - .isTrue(); - assertThat(sql("SHOW TABLES")).isEmpty(); - validationCatalog.createTable( - TableIdentifier.of(icebergNamespace, "tl"), - new Schema(Types.NestedField.optional(0, "id", Types.LongType.get()))); - - List tables = sql("SHOW TABLES"); - assertThat(tables).hasSize(1); - assertThat("tl").as("Table name should match").isEqualTo(tables.get(0).getField(0)); - } - - @TestTemplate - public void testListNamespace() { - assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) - .as("Namespace should not already exist") - .isFalse(); - sql("CREATE DATABASE %s", flinkDatabase); - sql("USE CATALOG %s", catalogName); - assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) - .as("Namespace should exist") - .isTrue(); - - List databases = sql("SHOW DATABASES"); - - if (isHadoopCatalog) { - assertThat(databases).hasSize(1); - assertThat(databases.get(0).getField(0)).as("Should have db database").isEqualTo("db"); - if (!baseNamespace.isEmpty()) { - // test namespace not belongs to this catalog - validationNamespaceCatalog.createNamespace( - Namespace.of(baseNamespace.level(0), "UNKNOWN_NAMESPACE")); - databases = sql("SHOW DATABASES"); - assertThat(databases).hasSize(1); - assertThat(databases.get(0).getField(0)).as("Should have db database").isEqualTo("db"); - } - } else { - // If there are multiple classes extends FlinkTestBase, TestHiveMetastore may loose the - // creation for default - // database. See HiveMetaStore.HMSHandler.init. - assertThat(databases) - .as("Should have db database") - .anyMatch(d -> Objects.equals(d.getField(0), "db")); - } - } - - @TestTemplate - public void testCreateNamespaceWithMetadata() { - assumeThat(isHadoopCatalog).as("HadoopCatalog does not support namespace metadata").isFalse(); - assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) - .as("Namespace should not already exist") - .isFalse(); - sql("CREATE DATABASE %s WITH ('prop'='value')", flinkDatabase); - assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) - .as("Namespace should exist") - .isTrue(); - Map nsMetadata = - validationNamespaceCatalog.loadNamespaceMetadata(icebergNamespace); - assertThat(nsMetadata).containsEntry("prop", "value"); - } - - @TestTemplate - public void testCreateNamespaceWithComment() { - assumeThat(isHadoopCatalog).as("HadoopCatalog does not support namespace metadata").isFalse(); - assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) - .as("Namespace should not already exist") - .isFalse(); - - sql("CREATE DATABASE %s COMMENT 'namespace doc'", flinkDatabase); - assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) - .as("Namespace should exist") - .isTrue(); - Map nsMetadata = - validationNamespaceCatalog.loadNamespaceMetadata(icebergNamespace); - assertThat(nsMetadata).containsEntry("comment", "namespace doc"); - } - - @TestTemplate - public void testCreateNamespaceWithLocation() throws Exception { - assumeThat(isHadoopCatalog).as("HadoopCatalog does not support namespace metadata").isFalse(); - assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) - .as("Namespace should not already exist") - .isFalse(); - - Path location = temporaryDirectory.getRoot(); - sql("CREATE DATABASE %s WITH 
('location'='%s')", flinkDatabase, location); - assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) - .as("Namespace should exist") - .isTrue(); - Map nsMetadata = - validationNamespaceCatalog.loadNamespaceMetadata(icebergNamespace); - assertThat(nsMetadata).containsEntry("location", "file:" + location.getRoot()); - } - - @TestTemplate - public void testSetProperties() { - assumeThat(isHadoopCatalog).as("HadoopCatalog does not support namespace metadata").isFalse(); - assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) - .as("Namespace should not already exist") - .isFalse(); - - sql("CREATE DATABASE %s", flinkDatabase); - assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) - .as("Namespace should exist") - .isTrue(); - - Map defaultMetadata = - validationNamespaceCatalog.loadNamespaceMetadata(icebergNamespace); - assertThat(defaultMetadata).doesNotContainKey("prop"); - sql("ALTER DATABASE %s SET ('prop'='value')", flinkDatabase); - Map nsMetadata = - validationNamespaceCatalog.loadNamespaceMetadata(icebergNamespace); - assertThat(nsMetadata).containsEntry("prop", "value"); - } - - @TestTemplate - public void testHadoopNotSupportMeta() { - assumeThat(isHadoopCatalog).as("HadoopCatalog does not support namespace metadata").isTrue(); - assertThat(validationNamespaceCatalog.namespaceExists(icebergNamespace)) - .as("Namespace should not already exist") - .isFalse(); - assertThatThrownBy(() -> sql("CREATE DATABASE %s WITH ('prop'='value')", flinkDatabase)) - .cause() - .isInstanceOf(UnsupportedOperationException.class) - .hasMessage( - String.format( - "Cannot create namespace %s: metadata is not supported", icebergNamespace)); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogFactory.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogFactory.java deleted file mode 100644 index 4c9e95b8fa82..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogFactory.java +++ /dev/null @@ -1,119 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink; - -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; - -import java.util.Map; -import org.apache.hadoop.conf.Configuration; -import org.apache.iceberg.CatalogProperties; -import org.apache.iceberg.catalog.Catalog; -import org.apache.iceberg.hadoop.HadoopCatalog; -import org.apache.iceberg.hive.HiveCatalog; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; - -public class TestFlinkCatalogFactory { - - private Map props; - - @BeforeEach - public void before() { - props = Maps.newHashMap(); - props.put("type", "iceberg"); - props.put(CatalogProperties.WAREHOUSE_LOCATION, "/tmp/location"); - } - - @Test - public void testCreateCatalogHive() { - String catalogName = "hiveCatalog"; - props.put( - FlinkCatalogFactory.ICEBERG_CATALOG_TYPE, FlinkCatalogFactory.ICEBERG_CATALOG_TYPE_HIVE); - - Catalog catalog = - FlinkCatalogFactory.createCatalogLoader(catalogName, props, new Configuration()) - .loadCatalog(); - - assertThat(catalog).isNotNull().isInstanceOf(HiveCatalog.class); - } - - @Test - public void testCreateCatalogHadoop() { - String catalogName = "hadoopCatalog"; - props.put( - FlinkCatalogFactory.ICEBERG_CATALOG_TYPE, FlinkCatalogFactory.ICEBERG_CATALOG_TYPE_HADOOP); - - Catalog catalog = - FlinkCatalogFactory.createCatalogLoader(catalogName, props, new Configuration()) - .loadCatalog(); - - assertThat(catalog).isNotNull().isInstanceOf(HadoopCatalog.class); - } - - @Test - public void testCreateCatalogCustom() { - String catalogName = "customCatalog"; - props.put(CatalogProperties.CATALOG_IMPL, CustomHadoopCatalog.class.getName()); - - Catalog catalog = - FlinkCatalogFactory.createCatalogLoader(catalogName, props, new Configuration()) - .loadCatalog(); - - assertThat(catalog).isNotNull().isInstanceOf(CustomHadoopCatalog.class); - } - - @Test - public void testCreateCatalogCustomWithHiveCatalogTypeSet() { - String catalogName = "customCatalog"; - props.put(CatalogProperties.CATALOG_IMPL, CustomHadoopCatalog.class.getName()); - props.put( - FlinkCatalogFactory.ICEBERG_CATALOG_TYPE, FlinkCatalogFactory.ICEBERG_CATALOG_TYPE_HIVE); - - assertThatThrownBy( - () -> FlinkCatalogFactory.createCatalogLoader(catalogName, props, new Configuration())) - .isInstanceOf(IllegalArgumentException.class) - .hasMessageStartingWith( - "Cannot create catalog customCatalog, both catalog-type and catalog-impl are set"); - } - - @Test - public void testLoadCatalogUnknown() { - String catalogName = "unknownCatalog"; - props.put(FlinkCatalogFactory.ICEBERG_CATALOG_TYPE, "fooType"); - - assertThatThrownBy( - () -> FlinkCatalogFactory.createCatalogLoader(catalogName, props, new Configuration())) - .isInstanceOf(UnsupportedOperationException.class) - .hasMessageStartingWith("Unknown catalog-type: fooType"); - } - - public static class CustomHadoopCatalog extends HadoopCatalog { - - public CustomHadoopCatalog() {} - - public CustomHadoopCatalog(Configuration conf, String warehouseLocation) { - setConf(conf); - initialize( - "custom", ImmutableMap.of(CatalogProperties.WAREHOUSE_LOCATION, warehouseLocation)); - } - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTable.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTable.java deleted file mode 100644 index 
7a364b856398..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTable.java +++ /dev/null @@ -1,669 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; -import static org.assertj.core.api.Assumptions.assumeThat; - -import java.util.Arrays; -import java.util.Collections; -import java.util.Map; -import java.util.Optional; -import java.util.Set; -import java.util.stream.Collectors; -import java.util.stream.StreamSupport; -import org.apache.flink.table.api.DataTypes; -import org.apache.flink.table.api.TableException; -import org.apache.flink.table.api.TableSchema; -import org.apache.flink.table.api.ValidationException; -import org.apache.flink.table.api.constraints.UniqueConstraint; -import org.apache.flink.table.catalog.CatalogTable; -import org.apache.flink.table.catalog.ObjectPath; -import org.apache.flink.table.catalog.exceptions.TableNotExistException; -import org.apache.iceberg.BaseTable; -import org.apache.iceberg.ContentFile; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.DataFiles; -import org.apache.iceberg.DataOperations; -import org.apache.iceberg.FileScanTask; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Snapshot; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableOperations; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.exceptions.NoSuchTableException; -import org.apache.iceberg.relocated.com.google.common.collect.Iterables; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.apache.iceberg.types.Types; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.TestTemplate; - -public class TestFlinkCatalogTable extends CatalogTestBase { - - @Override - @BeforeEach - public void before() { - super.before(); - sql("CREATE DATABASE %s", flinkDatabase); - sql("USE CATALOG %s", catalogName); - sql("USE %s", DATABASE); - } - - @AfterEach - public void cleanNamespaces() { - sql("DROP TABLE IF EXISTS %s.tl", flinkDatabase); - sql("DROP TABLE IF EXISTS %s.tl2", flinkDatabase); - sql("DROP DATABASE IF EXISTS %s", flinkDatabase); - super.clean(); - } - - @TestTemplate - public void testGetTable() { - sql("CREATE TABLE tl(id BIGINT, strV STRING)"); - - Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, "tl")); - Schema iSchema = - new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get()), - 
Types.NestedField.optional(2, "strV", Types.StringType.get())); - assertThat(table.schema().toString()) - .as("Should load the expected iceberg schema") - .isEqualTo(iSchema.toString()); - } - - @TestTemplate - public void testRenameTable() { - assumeThat(isHadoopCatalog).as("HadoopCatalog does not support rename table").isFalse(); - final Schema tableSchema = - new Schema(Types.NestedField.optional(0, "id", Types.LongType.get())); - validationCatalog.createTable(TableIdentifier.of(icebergNamespace, "tl"), tableSchema); - sql("ALTER TABLE tl RENAME TO tl2"); - - assertThatThrownBy(() -> getTableEnv().from("tl")) - .isInstanceOf(ValidationException.class) - .hasMessage("Table `tl` was not found."); - - Schema actualSchema = FlinkSchemaUtil.convert(getTableEnv().from("tl2").getSchema()); - assertThat(tableSchema.asStruct()).isEqualTo(actualSchema.asStruct()); - } - - @TestTemplate - public void testCreateTable() throws TableNotExistException { - sql("CREATE TABLE tl(id BIGINT)"); - - Table table = table("tl"); - assertThat(table.schema().asStruct()) - .isEqualTo( - new Schema(Types.NestedField.optional(1, "id", Types.LongType.get())).asStruct()); - CatalogTable catalogTable = catalogTable("tl"); - assertThat(catalogTable.getSchema()) - .isEqualTo(TableSchema.builder().field("id", DataTypes.BIGINT()).build()); - } - - @TestTemplate - public void testCreateTableWithPrimaryKey() throws Exception { - sql("CREATE TABLE tl(id BIGINT, data STRING, key STRING PRIMARY KEY NOT ENFORCED)"); - - Table table = table("tl"); - assertThat(table.schema().identifierFieldIds()) - .as("Should have the expected row key.") - .isEqualTo(Sets.newHashSet(table.schema().findField("key").fieldId())); - CatalogTable catalogTable = catalogTable("tl"); - Optional uniqueConstraintOptional = catalogTable.getSchema().getPrimaryKey(); - assertThat(uniqueConstraintOptional).isPresent(); - assertThat(uniqueConstraintOptional.get().getColumns()).containsExactly("key"); - } - - @TestTemplate - public void testCreateTableWithMultiColumnsInPrimaryKey() throws Exception { - sql( - "CREATE TABLE tl(id BIGINT, data STRING, CONSTRAINT pk_constraint PRIMARY KEY(data, id) NOT ENFORCED)"); - - Table table = table("tl"); - assertThat(table.schema().identifierFieldIds()) - .as("Should have the expected RowKey") - .isEqualTo( - Sets.newHashSet( - table.schema().findField("id").fieldId(), - table.schema().findField("data").fieldId())); - CatalogTable catalogTable = catalogTable("tl"); - Optional uniqueConstraintOptional = catalogTable.getSchema().getPrimaryKey(); - assertThat(uniqueConstraintOptional).isPresent(); - assertThat(uniqueConstraintOptional.get().getColumns()).containsExactly("id", "data"); - } - - @TestTemplate - public void testCreateTableIfNotExists() { - sql("CREATE TABLE tl(id BIGINT)"); - - // Assert that table does exist. 
- assertThat(table("tl")).isNotNull(); - - sql("DROP TABLE tl"); - assertThatThrownBy(() -> table("tl")) - .isInstanceOf(NoSuchTableException.class) - .hasMessage("Table does not exist: " + getFullQualifiedTableName("tl")); - - sql("CREATE TABLE IF NOT EXISTS tl(id BIGINT)"); - assertThat(table("tl").properties()).doesNotContainKey("key"); - - table("tl").updateProperties().set("key", "value").commit(); - assertThat(table("tl").properties()).containsEntry("key", "value"); - - sql("CREATE TABLE IF NOT EXISTS tl(id BIGINT)"); - assertThat(table("tl").properties()).containsEntry("key", "value"); - } - - @TestTemplate - public void testCreateTableLike() throws TableNotExistException { - sql("CREATE TABLE tl(id BIGINT)"); - sql("CREATE TABLE tl2 LIKE tl"); - - Table table = table("tl2"); - assertThat(table.schema().asStruct()) - .isEqualTo( - new Schema(Types.NestedField.optional(1, "id", Types.LongType.get())).asStruct()); - CatalogTable catalogTable = catalogTable("tl2"); - assertThat(catalogTable.getSchema()) - .isEqualTo(TableSchema.builder().field("id", DataTypes.BIGINT()).build()); - } - - @TestTemplate - public void testCreateTableLocation() { - assumeThat(isHadoopCatalog) - .as("HadoopCatalog does not support creating table with location") - .isFalse(); - sql("CREATE TABLE tl(id BIGINT) WITH ('location'='file:///tmp/location')"); - - Table table = table("tl"); - assertThat(table.schema().asStruct()) - .isEqualTo( - new Schema(Types.NestedField.optional(1, "id", Types.LongType.get())).asStruct()); - assertThat(table.location()).isEqualTo("file:///tmp/location"); - } - - @TestTemplate - public void testCreatePartitionTable() throws TableNotExistException { - sql("CREATE TABLE tl(id BIGINT, dt STRING) PARTITIONED BY(dt)"); - - Table table = table("tl"); - assertThat(table.schema().asStruct()) - .isEqualTo( - new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "dt", Types.StringType.get())) - .asStruct()); - assertThat(table.spec()) - .isEqualTo(PartitionSpec.builderFor(table.schema()).identity("dt").build()); - CatalogTable catalogTable = catalogTable("tl"); - assertThat(catalogTable.getSchema()) - .isEqualTo( - TableSchema.builder() - .field("id", DataTypes.BIGINT()) - .field("dt", DataTypes.STRING()) - .build()); - assertThat(catalogTable.getPartitionKeys()).isEqualTo(Collections.singletonList("dt")); - } - - @TestTemplate - public void testCreateTableWithColumnComment() { - sql("CREATE TABLE tl(id BIGINT COMMENT 'comment - id', data STRING COMMENT 'comment - data')"); - - Table table = table("tl"); - assertThat(table.schema().asStruct()) - .isEqualTo( - new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get(), "comment - id"), - Types.NestedField.optional(2, "data", Types.StringType.get(), "comment - data")) - .asStruct()); - } - - @TestTemplate - public void testCreateTableWithFormatV2ThroughTableProperty() throws Exception { - sql("CREATE TABLE tl(id BIGINT) WITH ('format-version'='2')"); - - Table table = table("tl"); - assertThat(((BaseTable) table).operations().current().formatVersion()).isEqualTo(2); - } - - @TestTemplate - public void testUpgradeTableWithFormatV2ThroughTableProperty() throws Exception { - sql("CREATE TABLE tl(id BIGINT) WITH ('format-version'='1')"); - - Table table = table("tl"); - TableOperations ops = ((BaseTable) table).operations(); - assertThat(ops.refresh().formatVersion()) - .as("should create table using format v1") - .isEqualTo(1); - sql("ALTER TABLE tl SET('format-version'='2')"); - 
assertThat(ops.refresh().formatVersion()) - .as("should update table to use format v2") - .isEqualTo(2); - } - - @TestTemplate - public void testDowngradeTableToFormatV1ThroughTablePropertyFails() throws Exception { - sql("CREATE TABLE tl(id BIGINT) WITH ('format-version'='2')"); - - Table table = table("tl"); - TableOperations ops = ((BaseTable) table).operations(); - assertThat(ops.refresh().formatVersion()) - .as("should create table using format v2") - .isEqualTo(2); - assertThatThrownBy(() -> sql("ALTER TABLE tl SET('format-version'='1')")) - .rootCause() - .isInstanceOf(IllegalArgumentException.class) - .hasMessage("Cannot downgrade v2 table to v1"); - } - - @TestTemplate - public void testLoadTransformPartitionTable() throws TableNotExistException { - Schema schema = new Schema(Types.NestedField.optional(0, "id", Types.LongType.get())); - validationCatalog.createTable( - TableIdentifier.of(icebergNamespace, "tl"), - schema, - PartitionSpec.builderFor(schema).bucket("id", 100).build()); - - CatalogTable catalogTable = catalogTable("tl"); - assertThat(catalogTable.getSchema()) - .isEqualTo(TableSchema.builder().field("id", DataTypes.BIGINT()).build()); - assertThat(catalogTable.getPartitionKeys()).isEmpty(); - } - - @TestTemplate - public void testAlterTableProperties() throws TableNotExistException { - sql("CREATE TABLE tl(id BIGINT) WITH ('oldK'='oldV')"); - Map properties = Maps.newHashMap(); - properties.put("oldK", "oldV"); - - // new - sql("ALTER TABLE tl SET('newK'='newV')"); - properties.put("newK", "newV"); - assertThat(table("tl").properties()).containsAllEntriesOf(properties); - - // update old - sql("ALTER TABLE tl SET('oldK'='oldV2')"); - properties.put("oldK", "oldV2"); - assertThat(table("tl").properties()).containsAllEntriesOf(properties); - - // remove property - sql("ALTER TABLE tl RESET('oldK')"); - properties.remove("oldK"); - assertThat(table("tl").properties()).containsAllEntriesOf(properties); - } - - @TestTemplate - public void testAlterTableAddColumn() { - sql("CREATE TABLE tl(id BIGINT)"); - Schema schemaBefore = table("tl").schema(); - assertThat(schemaBefore.asStruct()) - .isEqualTo( - new Schema(Types.NestedField.optional(1, "id", Types.LongType.get())).asStruct()); - sql("ALTER TABLE tl ADD (dt STRING)"); - Schema schemaAfter1 = table("tl").schema(); - assertThat(schemaAfter1.asStruct()) - .isEqualTo( - new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "dt", Types.StringType.get())) - .asStruct()); - // Add multiple columns - sql("ALTER TABLE tl ADD (col1 STRING COMMENT 'comment for col1', col2 BIGINT)"); - Schema schemaAfter2 = table("tl").schema(); - assertThat(schemaAfter2.asStruct()) - .isEqualTo( - new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "dt", Types.StringType.get()), - Types.NestedField.optional( - 3, "col1", Types.StringType.get(), "comment for col1"), - Types.NestedField.optional(4, "col2", Types.LongType.get())) - .asStruct()); - // Adding a required field should fail because Iceberg's SchemaUpdate does not allow - // incompatible changes. - assertThatThrownBy(() -> sql("ALTER TABLE tl ADD (pk STRING NOT NULL)")) - .hasRootCauseInstanceOf(IllegalArgumentException.class) - .hasRootCauseMessage("Incompatible change: cannot add required column: pk"); - - // Adding an existing field should fail due to Flink's internal validation. 
- assertThatThrownBy(() -> sql("ALTER TABLE tl ADD (id STRING)")) - .isInstanceOf(ValidationException.class) - .hasMessageContaining("Try to add a column `id` which already exists in the table."); - } - - @TestTemplate - public void testAlterTableDropColumn() { - sql("CREATE TABLE tl(id BIGINT, dt STRING, col1 STRING, col2 BIGINT)"); - Schema schemaBefore = table("tl").schema(); - assertThat(schemaBefore.asStruct()) - .isEqualTo( - new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "dt", Types.StringType.get()), - Types.NestedField.optional(3, "col1", Types.StringType.get()), - Types.NestedField.optional(4, "col2", Types.LongType.get())) - .asStruct()); - sql("ALTER TABLE tl DROP (dt)"); - Schema schemaAfter1 = table("tl").schema(); - assertThat(schemaAfter1.asStruct()) - .isEqualTo( - new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get()), - Types.NestedField.optional(3, "col1", Types.StringType.get()), - Types.NestedField.optional(4, "col2", Types.LongType.get())) - .asStruct()); - // Drop multiple columns - sql("ALTER TABLE tl DROP (col1, col2)"); - Schema schemaAfter2 = table("tl").schema(); - assertThat(schemaAfter2.asStruct()) - .isEqualTo( - new Schema(Types.NestedField.optional(1, "id", Types.LongType.get())).asStruct()); - // Dropping an non-existing field should fail due to Flink's internal validation. - assertThatThrownBy(() -> sql("ALTER TABLE tl DROP (foo)")) - .isInstanceOf(ValidationException.class) - .hasMessageContaining("The column `foo` does not exist in the base table."); - - // Dropping an already-deleted field should fail due to Flink's internal validation. - assertThatThrownBy(() -> sql("ALTER TABLE tl DROP (dt)")) - .isInstanceOf(ValidationException.class) - .hasMessageContaining("The column `dt` does not exist in the base table."); - } - - @TestTemplate - public void testAlterTableModifyColumnName() { - sql("CREATE TABLE tl(id BIGINT, dt STRING)"); - Schema schemaBefore = table("tl").schema(); - assertThat(schemaBefore.asStruct()) - .isEqualTo( - new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "dt", Types.StringType.get())) - .asStruct()); - sql("ALTER TABLE tl RENAME dt TO data"); - Schema schemaAfter = table("tl").schema(); - assertThat(schemaAfter.asStruct()) - .isEqualTo( - new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get())) - .asStruct()); - } - - @TestTemplate - public void testAlterTableModifyColumnType() { - sql("CREATE TABLE tl(id INTEGER, dt STRING)"); - Schema schemaBefore = table("tl").schema(); - assertThat(schemaBefore.asStruct()) - .isEqualTo( - new Schema( - Types.NestedField.optional(1, "id", Types.IntegerType.get()), - Types.NestedField.optional(2, "dt", Types.StringType.get())) - .asStruct()); - // Promote type from Integer to Long - sql("ALTER TABLE tl MODIFY (id BIGINT)"); - Schema schemaAfter = table("tl").schema(); - assertThat(schemaAfter.asStruct()) - .isEqualTo( - new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "dt", Types.StringType.get())) - .asStruct()); - // Type change that doesn't follow the type-promotion rule should fail due to Iceberg's - // validation. 
- assertThatThrownBy(() -> sql("ALTER TABLE tl MODIFY (dt INTEGER)")) - .isInstanceOf(TableException.class) - .hasRootCauseInstanceOf(IllegalArgumentException.class) - .hasRootCauseMessage("Cannot change column type: dt: string -> int"); - } - - @TestTemplate - public void testAlterTableModifyColumnNullability() { - sql("CREATE TABLE tl(id INTEGER NOT NULL, dt STRING)"); - Schema schemaBefore = table("tl").schema(); - assertThat(schemaBefore.asStruct()) - .isEqualTo( - new Schema( - Types.NestedField.required(1, "id", Types.IntegerType.get()), - Types.NestedField.optional(2, "dt", Types.StringType.get())) - .asStruct()); - // Changing nullability from optional to required should fail - // because Iceberg's SchemaUpdate does not allow incompatible changes. - assertThatThrownBy(() -> sql("ALTER TABLE tl MODIFY (dt STRING NOT NULL)")) - .isInstanceOf(TableException.class) - .hasRootCauseInstanceOf(IllegalArgumentException.class) - .hasRootCauseMessage("Cannot change column nullability: dt: optional -> required"); - - // Set nullability from required to optional - sql("ALTER TABLE tl MODIFY (id INTEGER)"); - Schema schemaAfter = table("tl").schema(); - assertThat(schemaAfter.asStruct()) - .isEqualTo( - new Schema( - Types.NestedField.optional(1, "id", Types.IntegerType.get()), - Types.NestedField.optional(2, "dt", Types.StringType.get())) - .asStruct()); - } - - @TestTemplate - public void testAlterTableModifyColumnPosition() { - sql("CREATE TABLE tl(id BIGINT, dt STRING)"); - Schema schemaBefore = table("tl").schema(); - assertThat(schemaBefore.asStruct()) - .isEqualTo( - new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "dt", Types.StringType.get())) - .asStruct()); - - sql("ALTER TABLE tl MODIFY (dt STRING FIRST)"); - Schema schemaAfter = table("tl").schema(); - assertThat(schemaAfter.asStruct()) - .isEqualTo( - new Schema( - Types.NestedField.optional(2, "dt", Types.StringType.get()), - Types.NestedField.optional(1, "id", Types.LongType.get())) - .asStruct()); - - sql("ALTER TABLE tl MODIFY (dt STRING AFTER id)"); - Schema schemaAfterAfter = table("tl").schema(); - assertThat(schemaAfterAfter.asStruct()) - .isEqualTo( - new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "dt", Types.StringType.get())) - .asStruct()); - // Modifying the position of a non-existing column should fail due to Flink's internal - // validation. - assertThatThrownBy(() -> sql("ALTER TABLE tl MODIFY (non_existing STRING FIRST)")) - .isInstanceOf(ValidationException.class) - .hasMessageContaining( - "Try to modify a column `non_existing` which does not exist in the table."); - - // Moving a column after a non-existing column should fail due to Flink's internal validation. 
- assertThatThrownBy(() -> sql("ALTER TABLE tl MODIFY (dt STRING AFTER non_existing)")) - .isInstanceOf(ValidationException.class) - .hasMessageContaining( - "Referenced column `non_existing` by 'AFTER' does not exist in the table."); - } - - @TestTemplate - public void testAlterTableModifyColumnComment() { - sql("CREATE TABLE tl(id BIGINT, dt STRING)"); - Schema schemaBefore = table("tl").schema(); - assertThat(schemaBefore.asStruct()) - .isEqualTo( - new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "dt", Types.StringType.get())) - .asStruct()); - - sql("ALTER TABLE tl MODIFY (dt STRING COMMENT 'comment for dt field')"); - Schema schemaAfter = table("tl").schema(); - assertThat(schemaAfter.asStruct()) - .isEqualTo( - new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get()), - Types.NestedField.optional( - 2, "dt", Types.StringType.get(), "comment for dt field")) - .asStruct()); - } - - @TestTemplate - public void testAlterTableConstraint() { - sql("CREATE TABLE tl(id BIGINT NOT NULL, dt STRING NOT NULL, col1 STRING)"); - Schema schemaBefore = table("tl").schema(); - assertThat(schemaBefore.asStruct()) - .isEqualTo( - new Schema( - Types.NestedField.required(1, "id", Types.LongType.get()), - Types.NestedField.required(2, "dt", Types.StringType.get()), - Types.NestedField.optional(3, "col1", Types.StringType.get())) - .asStruct()); - assertThat(schemaBefore.identifierFieldNames()).isEmpty(); - sql("ALTER TABLE tl ADD (PRIMARY KEY (id) NOT ENFORCED)"); - Schema schemaAfterAdd = table("tl").schema(); - assertThat(schemaAfterAdd.identifierFieldNames()).containsExactly("id"); - sql("ALTER TABLE tl MODIFY (PRIMARY KEY (dt) NOT ENFORCED)"); - Schema schemaAfterModify = table("tl").schema(); - assertThat(schemaAfterModify.asStruct()) - .isEqualTo( - new Schema( - Types.NestedField.required(1, "id", Types.LongType.get()), - Types.NestedField.required(2, "dt", Types.StringType.get()), - Types.NestedField.optional(3, "col1", Types.StringType.get())) - .asStruct()); - assertThat(schemaAfterModify.identifierFieldNames()).containsExactly("dt"); - // Composite primary key - sql("ALTER TABLE tl MODIFY (PRIMARY KEY (id, dt) NOT ENFORCED)"); - Schema schemaAfterComposite = table("tl").schema(); - assertThat(schemaAfterComposite.asStruct()) - .isEqualTo( - new Schema( - Types.NestedField.required(1, "id", Types.LongType.get()), - Types.NestedField.required(2, "dt", Types.StringType.get()), - Types.NestedField.optional(3, "col1", Types.StringType.get())) - .asStruct()); - assertThat(schemaAfterComposite.identifierFieldNames()).containsExactlyInAnyOrder("id", "dt"); - // Setting an optional field as primary key should fail - // because Iceberg's SchemaUpdate does not allow incompatible changes. - assertThatThrownBy(() -> sql("ALTER TABLE tl MODIFY (PRIMARY KEY (col1) NOT ENFORCED)")) - .isInstanceOf(TableException.class) - .hasRootCauseInstanceOf(IllegalArgumentException.class) - .hasRootCauseMessage("Cannot add field col1 as an identifier field: not a required field"); - - // Setting a composite key containing an optional field should fail - // because Iceberg's SchemaUpdate does not allow incompatible changes. 
- assertThatThrownBy(() -> sql("ALTER TABLE tl MODIFY (PRIMARY KEY (id, col1) NOT ENFORCED)")) - .isInstanceOf(TableException.class) - .hasRootCauseInstanceOf(IllegalArgumentException.class) - .hasRootCauseMessage("Cannot add field col1 as an identifier field: not a required field"); - - // Dropping constraints is not supported yet - assertThatThrownBy(() -> sql("ALTER TABLE tl DROP PRIMARY KEY")) - .isInstanceOf(TableException.class) - .hasRootCauseInstanceOf(UnsupportedOperationException.class) - .hasRootCauseMessage("Unsupported table change: DropConstraint."); - } - - @TestTemplate - public void testRelocateTable() { - assumeThat(isHadoopCatalog).as("HadoopCatalog does not support relocate table").isFalse(); - sql("CREATE TABLE tl(id BIGINT)"); - sql("ALTER TABLE tl SET('location'='file:///tmp/location')"); - assertThat(table("tl").location()).isEqualTo("file:///tmp/location"); - } - - @TestTemplate - public void testSetCurrentAndCherryPickSnapshotId() { - sql("CREATE TABLE tl(c1 INT, c2 STRING, c3 STRING) PARTITIONED BY (c1)"); - - Table table = table("tl"); - - DataFile fileA = - DataFiles.builder(table.spec()) - .withPath("/path/to/data-a.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("c1=0") // easy way to set partition data for now - .withRecordCount(1) - .build(); - DataFile fileB = - DataFiles.builder(table.spec()) - .withPath("/path/to/data-b.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("c1=1") // easy way to set partition data for now - .withRecordCount(1) - .build(); - DataFile replacementFile = - DataFiles.builder(table.spec()) - .withPath("/path/to/data-a-replacement.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("c1=0") // easy way to set partition data for now - .withRecordCount(1) - .build(); - - table.newAppend().appendFile(fileA).commit(); - long snapshotId = table.currentSnapshot().snapshotId(); - - // stage an overwrite that replaces FILE_A - table.newReplacePartitions().addFile(replacementFile).stageOnly().commit(); - - Snapshot staged = Iterables.getLast(table.snapshots()); - assertThat(staged.operation()) - .as("Should find the staged overwrite snapshot") - .isEqualTo(DataOperations.OVERWRITE); - // add another append so that the original commit can't be fast-forwarded - table.newAppend().appendFile(fileB).commit(); - - // test cherry pick - sql("ALTER TABLE tl SET('cherry-pick-snapshot-id'='%s')", staged.snapshotId()); - validateTableFiles(table, fileB, replacementFile); - - // test set current snapshot - sql("ALTER TABLE tl SET('current-snapshot-id'='%s')", snapshotId); - validateTableFiles(table, fileA); - } - - private void validateTableFiles(Table tbl, DataFile... 
expectedFiles) { - tbl.refresh(); - Set expectedFilePaths = - Arrays.stream(expectedFiles).map(DataFile::path).collect(Collectors.toSet()); - Set actualFilePaths = - StreamSupport.stream(tbl.newScan().planFiles().spliterator(), false) - .map(FileScanTask::file) - .map(ContentFile::path) - .collect(Collectors.toSet()); - assertThat(actualFilePaths).as("Files should match").isEqualTo(expectedFilePaths); - } - - private Table table(String name) { - return validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, name)); - } - - private CatalogTable catalogTable(String name) throws TableNotExistException { - return (CatalogTable) - getTableEnv() - .getCatalog(getTableEnv().getCurrentCatalog()) - .get() - .getTable(new ObjectPath(DATABASE, name)); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTablePartitions.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTablePartitions.java deleted file mode 100644 index e395414e925d..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTablePartitions.java +++ /dev/null @@ -1,119 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink; - -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; - -import java.util.List; -import org.apache.flink.table.catalog.CatalogPartitionSpec; -import org.apache.flink.table.catalog.ObjectPath; -import org.apache.flink.table.catalog.exceptions.TableNotExistException; -import org.apache.flink.table.catalog.exceptions.TableNotPartitionedException; -import org.apache.iceberg.CatalogProperties; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Parameter; -import org.apache.iceberg.Parameters; -import org.apache.iceberg.catalog.Namespace; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.TestTemplate; - -public class TestFlinkCatalogTablePartitions extends CatalogTestBase { - - private final String tableName = "test_table"; - - @Parameter(index = 2) - private FileFormat format; - - @Parameter(index = 3) - private Boolean cacheEnabled; - - @Parameters(name = "catalogName={0}, baseNamespace={1}, format={2}, cacheEnabled={3}") - protected static List parameters() { - List parameters = Lists.newArrayList(); - for (FileFormat format : - new FileFormat[] {FileFormat.ORC, FileFormat.AVRO, FileFormat.PARQUET}) { - for (Boolean cacheEnabled : new Boolean[] {true, false}) { - for (Object[] catalogParams : CatalogTestBase.parameters()) { - String catalogName = (String) catalogParams[0]; - Namespace baseNamespace = (Namespace) catalogParams[1]; - parameters.add(new Object[] {catalogName, baseNamespace, format, cacheEnabled}); - } - } - } - return parameters; - } - - @Override - @BeforeEach - public void before() { - super.before(); - config.put(CatalogProperties.CACHE_ENABLED, String.valueOf(cacheEnabled)); - sql("CREATE DATABASE %s", flinkDatabase); - sql("USE CATALOG %s", catalogName); - sql("USE %s", DATABASE); - } - - @AfterEach - public void cleanNamespaces() { - sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, tableName); - sql("DROP DATABASE IF EXISTS %s", flinkDatabase); - super.clean(); - } - - @TestTemplate - public void testListPartitionsWithUnpartitionedTable() { - sql( - "CREATE TABLE %s (id INT, data VARCHAR) with ('write.format.default'='%s')", - tableName, format.name()); - sql("INSERT INTO %s SELECT 1,'a'", tableName); - - ObjectPath objectPath = new ObjectPath(DATABASE, tableName); - FlinkCatalog flinkCatalog = (FlinkCatalog) getTableEnv().getCatalog(catalogName).get(); - assertThatThrownBy(() -> flinkCatalog.listPartitions(objectPath)) - .isInstanceOf(TableNotPartitionedException.class) - .hasMessageStartingWith("Table db.test_table in catalog") - .hasMessageEndingWith("is not partitioned."); - } - - @TestTemplate - public void testListPartitionsWithPartitionedTable() - throws TableNotExistException, TableNotPartitionedException { - sql( - "CREATE TABLE %s (id INT, data VARCHAR) PARTITIONED BY (data) " - + "with ('write.format.default'='%s')", - tableName, format.name()); - sql("INSERT INTO %s SELECT 1,'a'", tableName); - sql("INSERT INTO %s SELECT 2,'b'", tableName); - - ObjectPath objectPath = new ObjectPath(DATABASE, tableName); - FlinkCatalog flinkCatalog = (FlinkCatalog) getTableEnv().getCatalog(catalogName).get(); - List list = flinkCatalog.listPartitions(objectPath); - assertThat(list).hasSize(2); - List expected = Lists.newArrayList(); - 
CatalogPartitionSpec partitionSpec1 = new CatalogPartitionSpec(ImmutableMap.of("data", "a")); - CatalogPartitionSpec partitionSpec2 = new CatalogPartitionSpec(ImmutableMap.of("data", "b")); - expected.add(partitionSpec1); - expected.add(partitionSpec2); - assertThat(list).as("Should produce the expected catalog partition specs.").isEqualTo(expected); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestFlinkConfParser.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestFlinkConfParser.java deleted file mode 100644 index 4b6ac25ab8e3..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestFlinkConfParser.java +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import static org.assertj.core.api.Assertions.assertThat; -import static org.mockito.Mockito.mock; -import static org.mockito.Mockito.when; - -import java.time.Duration; -import java.util.Map; -import org.apache.flink.configuration.ConfigOption; -import org.apache.flink.configuration.ConfigOptions; -import org.apache.flink.configuration.Configuration; -import org.apache.iceberg.Table; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.junit.jupiter.api.Test; - -public class TestFlinkConfParser { - - @Test - public void testDurationConf() { - Map writeOptions = ImmutableMap.of("write-prop", "111s"); - - ConfigOption configOption = - ConfigOptions.key("conf-prop").durationType().noDefaultValue(); - Configuration flinkConf = new Configuration(); - flinkConf.setString(configOption.key(), "222s"); - - Table table = mock(Table.class); - when(table.properties()).thenReturn(ImmutableMap.of("table-prop", "333s")); - - FlinkConfParser confParser = new FlinkConfParser(table, writeOptions, flinkConf); - Duration defaultVal = Duration.ofMillis(999); - - Duration result = - confParser.durationConf().option("write-prop").defaultValue(defaultVal).parse(); - assertThat(result).isEqualTo(Duration.ofSeconds(111)); - - result = confParser.durationConf().flinkConfig(configOption).defaultValue(defaultVal).parse(); - assertThat(result).isEqualTo(Duration.ofSeconds(222)); - - result = confParser.durationConf().tableProperty("table-prop").defaultValue(defaultVal).parse(); - assertThat(result).isEqualTo(Duration.ofSeconds(333)); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestFlinkFilters.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestFlinkFilters.java deleted file mode 100644 index 838b0ea0e1a9..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestFlinkFilters.java +++ /dev/null @@ -1,462 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) 
under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.math.BigDecimal; -import java.nio.ByteBuffer; -import java.time.Instant; -import java.time.LocalDate; -import java.time.LocalDateTime; -import java.time.LocalTime; -import java.util.List; -import java.util.Optional; -import java.util.stream.Collectors; -import org.apache.flink.table.api.DataTypes; -import org.apache.flink.table.api.Expressions; -import org.apache.flink.table.api.TableColumn; -import org.apache.flink.table.api.TableSchema; -import org.apache.flink.table.expressions.ApiExpressionUtils; -import org.apache.flink.table.expressions.CallExpression; -import org.apache.flink.table.expressions.Expression; -import org.apache.flink.table.expressions.FieldReferenceExpression; -import org.apache.flink.table.expressions.ResolvedExpression; -import org.apache.flink.table.expressions.UnresolvedCallExpression; -import org.apache.flink.table.expressions.UnresolvedReferenceExpression; -import org.apache.flink.table.expressions.ValueLiteralExpression; -import org.apache.flink.table.expressions.utils.ApiExpressionDefaultVisitor; -import org.apache.flink.table.functions.BuiltInFunctionDefinitions; -import org.apache.iceberg.expressions.And; -import org.apache.iceberg.expressions.BoundLiteralPredicate; -import org.apache.iceberg.expressions.Not; -import org.apache.iceberg.expressions.Or; -import org.apache.iceberg.expressions.UnboundPredicate; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.util.DateTimeUtil; -import org.apache.iceberg.util.Pair; -import org.junit.jupiter.api.Test; - -public class TestFlinkFilters { - - private static final TableSchema TABLE_SCHEMA = - TableSchema.builder() - .field("field1", DataTypes.INT()) - .field("field2", DataTypes.BIGINT()) - .field("field3", DataTypes.FLOAT()) - .field("field4", DataTypes.DOUBLE()) - .field("field5", DataTypes.STRING()) - .field("field6", DataTypes.BOOLEAN()) - .field("field7", DataTypes.BINARY(2)) - .field("field8", DataTypes.DECIMAL(10, 2)) - .field("field9", DataTypes.DATE()) - .field("field10", DataTypes.TIME()) - .field("field11", DataTypes.TIMESTAMP()) - .field("field12", DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE()) - .build(); - - // A map list of fields and values used to verify the conversion of flink expression to iceberg - // expression - private static final List> FIELD_VALUE_LIST = - ImmutableList.of( - Pair.of("field1", 1), - Pair.of("field2", 2L), - Pair.of("field3", 3F), - Pair.of("field4", 4D), - Pair.of("field5", "iceberg"), - Pair.of("field6", true), - Pair.of("field7", new byte[] {'a', 'b'}), - Pair.of("field8", BigDecimal.valueOf(10.12)), - Pair.of("field9", DateTimeUtil.daysFromDate(LocalDate.now())), - 
Pair.of("field10", DateTimeUtil.microsFromTime(LocalTime.now())), - Pair.of("field11", DateTimeUtil.microsFromTimestamp(LocalDateTime.now())), - Pair.of("field12", DateTimeUtil.microsFromInstant(Instant.now()))); - - @Test - public void testFlinkDataTypeEqual() { - matchLiteral("field1", 1, 1); - matchLiteral("field2", 10L, 10L); - matchLiteral("field3", 1.2F, 1.2F); - matchLiteral("field4", 3.4D, 3.4D); - matchLiteral("field5", "abcd", "abcd"); - matchLiteral("field6", true, true); - matchLiteral("field7", new byte[] {'a', 'b'}, ByteBuffer.wrap(new byte[] {'a', 'b'})); - matchLiteral("field8", BigDecimal.valueOf(10.12), BigDecimal.valueOf(10.12)); - - LocalDate date = LocalDate.parse("2020-12-23"); - matchLiteral("field9", date, DateTimeUtil.daysFromDate(date)); - - LocalTime time = LocalTime.parse("12:13:14"); - matchLiteral("field10", time, DateTimeUtil.microsFromTime(time)); - - LocalDateTime dateTime = LocalDateTime.parse("2020-12-23T12:13:14"); - matchLiteral("field11", dateTime, DateTimeUtil.microsFromTimestamp(dateTime)); - - Instant instant = Instant.parse("2020-12-23T12:13:14.00Z"); - matchLiteral("field12", instant, DateTimeUtil.microsFromInstant(instant)); - } - - @Test - public void testEquals() { - for (Pair pair : FIELD_VALUE_LIST) { - UnboundPredicate expected = - org.apache.iceberg.expressions.Expressions.equal(pair.first(), pair.second()); - - Optional actual = - FlinkFilters.convert( - resolve(Expressions.$(pair.first()).isEqual(Expressions.lit(pair.second())))); - assertThat(actual).isPresent(); - assertPredicatesMatch(expected, actual.get()); - - Optional actual1 = - FlinkFilters.convert( - resolve(Expressions.lit(pair.second()).isEqual(Expressions.$(pair.first())))); - assertThat(actual1).isPresent(); - assertPredicatesMatch(expected, actual1.get()); - } - } - - @Test - public void testEqualsNaN() { - UnboundPredicate expected = org.apache.iceberg.expressions.Expressions.isNaN("field3"); - - Optional actual = - FlinkFilters.convert(resolve(Expressions.$("field3").isEqual(Expressions.lit(Float.NaN)))); - assertThat(actual).isPresent(); - assertPredicatesMatch(expected, actual.get()); - - Optional actual1 = - FlinkFilters.convert(resolve(Expressions.lit(Float.NaN).isEqual(Expressions.$("field3")))); - assertThat(actual1).isPresent(); - assertPredicatesMatch(expected, actual1.get()); - } - - @Test - public void testNotEquals() { - for (Pair pair : FIELD_VALUE_LIST) { - UnboundPredicate expected = - org.apache.iceberg.expressions.Expressions.notEqual(pair.first(), pair.second()); - - Optional actual = - FlinkFilters.convert( - resolve(Expressions.$(pair.first()).isNotEqual(Expressions.lit(pair.second())))); - assertThat(actual).isPresent(); - assertPredicatesMatch(expected, actual.get()); - - Optional actual1 = - FlinkFilters.convert( - resolve(Expressions.lit(pair.second()).isNotEqual(Expressions.$(pair.first())))); - assertThat(actual1).isPresent(); - assertPredicatesMatch(expected, actual1.get()); - } - } - - @Test - public void testNotEqualsNaN() { - UnboundPredicate expected = org.apache.iceberg.expressions.Expressions.notNaN("field3"); - - Optional actual = - FlinkFilters.convert( - resolve(Expressions.$("field3").isNotEqual(Expressions.lit(Float.NaN)))); - assertThat(actual).isPresent(); - assertPredicatesMatch(expected, actual.get()); - - Optional actual1 = - FlinkFilters.convert( - resolve(Expressions.lit(Float.NaN).isNotEqual(Expressions.$("field3")))); - assertThat(actual1).isPresent(); - assertPredicatesMatch(expected, actual1.get()); - } - - @Test - public 
void testGreaterThan() { - UnboundPredicate expected = - org.apache.iceberg.expressions.Expressions.greaterThan("field1", 1); - - Optional actual = - FlinkFilters.convert(resolve(Expressions.$("field1").isGreater(Expressions.lit(1)))); - assertThat(actual).isPresent(); - assertPredicatesMatch(expected, actual.get()); - - Optional actual1 = - FlinkFilters.convert(resolve(Expressions.lit(1).isLess(Expressions.$("field1")))); - assertThat(actual1).isPresent(); - assertPredicatesMatch(expected, actual1.get()); - } - - @Test - public void testGreaterThanEquals() { - UnboundPredicate expected = - org.apache.iceberg.expressions.Expressions.greaterThanOrEqual("field1", 1); - - Optional actual = - FlinkFilters.convert(resolve(Expressions.$("field1").isGreaterOrEqual(Expressions.lit(1)))); - assertThat(actual).isPresent(); - assertPredicatesMatch(expected, actual.get()); - - Optional actual1 = - FlinkFilters.convert(resolve(Expressions.lit(1).isLessOrEqual(Expressions.$("field1")))); - assertThat(actual1).isPresent(); - assertPredicatesMatch(expected, actual1.get()); - } - - @Test - public void testLessThan() { - UnboundPredicate expected = - org.apache.iceberg.expressions.Expressions.lessThan("field1", 1); - - Optional actual = - FlinkFilters.convert(resolve(Expressions.$("field1").isLess(Expressions.lit(1)))); - assertThat(actual).isPresent(); - assertPredicatesMatch(expected, actual.get()); - - Optional actual1 = - FlinkFilters.convert(resolve(Expressions.lit(1).isGreater(Expressions.$("field1")))); - assertThat(actual1).isPresent(); - assertPredicatesMatch(expected, actual1.get()); - } - - @Test - public void testLessThanEquals() { - UnboundPredicate expected = - org.apache.iceberg.expressions.Expressions.lessThanOrEqual("field1", 1); - - Optional actual = - FlinkFilters.convert(resolve(Expressions.$("field1").isLessOrEqual(Expressions.lit(1)))); - assertThat(actual).isPresent(); - assertPredicatesMatch(expected, actual.get()); - - Optional actual1 = - FlinkFilters.convert(resolve(Expressions.lit(1).isGreaterOrEqual(Expressions.$("field1")))); - assertThat(actual1).isPresent(); - assertPredicatesMatch(expected, actual1.get()); - } - - @Test - public void testIsNull() { - Expression expr = resolve(Expressions.$("field1").isNull()); - Optional actual = FlinkFilters.convert(expr); - assertThat(actual).isPresent(); - UnboundPredicate expected = org.apache.iceberg.expressions.Expressions.isNull("field1"); - assertPredicatesMatch(expected, actual.get()); - } - - @Test - public void testIsNotNull() { - Expression expr = resolve(Expressions.$("field1").isNotNull()); - Optional actual = FlinkFilters.convert(expr); - assertThat(actual).isPresent(); - UnboundPredicate expected = - org.apache.iceberg.expressions.Expressions.notNull("field1"); - assertPredicatesMatch(expected, actual.get()); - } - - @Test - public void testAnd() { - Expression expr = - resolve( - Expressions.$("field1") - .isEqual(Expressions.lit(1)) - .and(Expressions.$("field2").isEqual(Expressions.lit(2L)))); - Optional actual = FlinkFilters.convert(expr); - assertThat(actual).isPresent(); - And and = (And) actual.get(); - And expected = - (And) - org.apache.iceberg.expressions.Expressions.and( - org.apache.iceberg.expressions.Expressions.equal("field1", 1), - org.apache.iceberg.expressions.Expressions.equal("field2", 2L)); - - assertPredicatesMatch(expected.left(), and.left()); - assertPredicatesMatch(expected.right(), and.right()); - } - - @Test - public void testOr() { - Expression expr = - resolve( - Expressions.$("field1") - 
.isEqual(Expressions.lit(1)) - .or(Expressions.$("field2").isEqual(Expressions.lit(2L)))); - Optional actual = FlinkFilters.convert(expr); - assertThat(actual).isPresent(); - Or or = (Or) actual.get(); - Or expected = - (Or) - org.apache.iceberg.expressions.Expressions.or( - org.apache.iceberg.expressions.Expressions.equal("field1", 1), - org.apache.iceberg.expressions.Expressions.equal("field2", 2L)); - - assertPredicatesMatch(expected.left(), or.left()); - assertPredicatesMatch(expected.right(), or.right()); - } - - @Test - public void testNot() { - Expression expr = - resolve( - ApiExpressionUtils.unresolvedCall( - BuiltInFunctionDefinitions.NOT, - Expressions.$("field1").isEqual(Expressions.lit(1)))); - Optional actual = FlinkFilters.convert(expr); - assertThat(actual).isPresent(); - Not not = (Not) actual.get(); - Not expected = - (Not) - org.apache.iceberg.expressions.Expressions.not( - org.apache.iceberg.expressions.Expressions.equal("field1", 1)); - - assertThat(not.op()).as("Predicate operation should match").isEqualTo(expected.op()); - assertPredicatesMatch(expected.child(), not.child()); - } - - @Test - public void testLike() { - UnboundPredicate expected = - org.apache.iceberg.expressions.Expressions.startsWith("field5", "abc"); - Expression expr = - resolve( - ApiExpressionUtils.unresolvedCall( - BuiltInFunctionDefinitions.LIKE, Expressions.$("field5"), Expressions.lit("abc%"))); - Optional actual = FlinkFilters.convert(expr); - assertThat(actual).isPresent(); - assertPredicatesMatch(expected, actual.get()); - - expr = - resolve( - ApiExpressionUtils.unresolvedCall( - BuiltInFunctionDefinitions.LIKE, Expressions.$("field5"), Expressions.lit("%abc"))); - actual = FlinkFilters.convert(expr); - assertThat(actual).isNotPresent(); - - expr = - resolve( - ApiExpressionUtils.unresolvedCall( - BuiltInFunctionDefinitions.LIKE, - Expressions.$("field5"), - Expressions.lit("%abc%"))); - actual = FlinkFilters.convert(expr); - assertThat(actual).isNotPresent(); - - expr = - resolve( - ApiExpressionUtils.unresolvedCall( - BuiltInFunctionDefinitions.LIKE, - Expressions.$("field5"), - Expressions.lit("abc%d"))); - actual = FlinkFilters.convert(expr); - assertThat(actual).isNotPresent(); - - expr = - resolve( - ApiExpressionUtils.unresolvedCall( - BuiltInFunctionDefinitions.LIKE, Expressions.$("field5"), Expressions.lit("%"))); - actual = FlinkFilters.convert(expr); - assertThat(actual).isNotPresent(); - - expr = - resolve( - ApiExpressionUtils.unresolvedCall( - BuiltInFunctionDefinitions.LIKE, Expressions.$("field5"), Expressions.lit("a_"))); - actual = FlinkFilters.convert(expr); - assertThat(actual).isNotPresent(); - - expr = - resolve( - ApiExpressionUtils.unresolvedCall( - BuiltInFunctionDefinitions.LIKE, Expressions.$("field5"), Expressions.lit("a%b"))); - actual = FlinkFilters.convert(expr); - assertThat(actual).isNotPresent(); - } - - @SuppressWarnings("unchecked") - private void matchLiteral(String fieldName, Object flinkLiteral, T icebergLiteral) { - Expression expr = resolve(Expressions.$(fieldName).isEqual(Expressions.lit(flinkLiteral))); - Optional actual = FlinkFilters.convert(expr); - assertThat(actual).isPresent(); - org.apache.iceberg.expressions.Expression expression = actual.get(); - assertThat(expression) - .as("The expression should be a UnboundPredicate") - .isInstanceOf(UnboundPredicate.class); - UnboundPredicate unboundPredicate = (UnboundPredicate) expression; - - org.apache.iceberg.expressions.Expression expression1 = - 
unboundPredicate.bind(FlinkSchemaUtil.convert(TABLE_SCHEMA).asStruct(), false); - assertThat(expression1) - .as("The expression should be a BoundLiteralPredicate") - .isInstanceOf(BoundLiteralPredicate.class); - - BoundLiteralPredicate predicate = (BoundLiteralPredicate) expression1; - assertThat(predicate.test(icebergLiteral)).isTrue(); - } - - private static Expression resolve(Expression originalExpression) { - return originalExpression.accept( - new ApiExpressionDefaultVisitor() { - @Override - public Expression visit(UnresolvedReferenceExpression unresolvedReference) { - String name = unresolvedReference.getName(); - Optional field = TABLE_SCHEMA.getTableColumn(name); - if (field.isPresent()) { - int index = TABLE_SCHEMA.getTableColumns().indexOf(field.get()); - return new FieldReferenceExpression(name, field.get().getType(), 0, index); - } else { - return null; - } - } - - @Override - public Expression visit(UnresolvedCallExpression unresolvedCall) { - List children = - unresolvedCall.getChildren().stream() - .map(e -> (ResolvedExpression) e.accept(this)) - .collect(Collectors.toList()); - return new CallExpression( - unresolvedCall.getFunctionDefinition(), children, DataTypes.STRING()); - } - - @Override - public Expression visit(ValueLiteralExpression valueLiteral) { - return valueLiteral; - } - - @Override - protected Expression defaultMethod(Expression expression) { - throw new UnsupportedOperationException( - String.format("unsupported expression: %s", expression)); - } - }); - } - - private void assertPredicatesMatch( - org.apache.iceberg.expressions.Expression expected, - org.apache.iceberg.expressions.Expression actual) { - assertThat(expected) - .as("The expected expression should be a UnboundPredicate") - .isInstanceOf(UnboundPredicate.class); - assertThat(actual) - .as("The actual expression should be a UnboundPredicate") - .isInstanceOf(UnboundPredicate.class); - UnboundPredicate predicateExpected = (UnboundPredicate) expected; - UnboundPredicate predicateActual = (UnboundPredicate) actual; - assertThat(predicateActual.op()).isEqualTo(predicateExpected.op()); - assertThat(predicateActual.literal()).isEqualTo(predicateExpected.literal()); - assertThat(predicateActual.ref().name()).isEqualTo(predicateExpected.ref().name()); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestFlinkHiveCatalog.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestFlinkHiveCatalog.java deleted file mode 100644 index f1de267cf29b..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestFlinkHiveCatalog.java +++ /dev/null @@ -1,101 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.File; -import java.io.FileOutputStream; -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.Map; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.hive.conf.HiveConf; -import org.apache.iceberg.CatalogProperties; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.junit.jupiter.api.Test; - -public class TestFlinkHiveCatalog extends TestBase { - - @Test - public void testCreateCatalogWithWarehouseLocation() throws IOException { - Map props = Maps.newHashMap(); - props.put("type", "iceberg"); - props.put(FlinkCatalogFactory.ICEBERG_CATALOG_TYPE, "hive"); - props.put(CatalogProperties.URI, CatalogTestBase.getURI(hiveConf)); - - File warehouseDir = Files.createTempDirectory(temporaryDirectory, "junit").toFile(); - props.put(CatalogProperties.WAREHOUSE_LOCATION, "file://" + warehouseDir.getAbsolutePath()); - - checkSQLQuery(props, warehouseDir); - } - - @Test - public void testCreateCatalogWithHiveConfDir() throws IOException { - // Dump the hive conf into a local file. - File hiveConfDir = Files.createTempDirectory(temporaryDirectory, "junit").toFile(); - File hiveSiteXML = new File(hiveConfDir, "hive-site.xml"); - File warehouseDir = Files.createTempDirectory(temporaryDirectory, "junit").toFile(); - try (FileOutputStream fos = new FileOutputStream(hiveSiteXML)) { - Configuration newConf = new Configuration(hiveConf); - // Set another new directory which is different with the hive metastore's warehouse path. - newConf.set( - HiveConf.ConfVars.METASTOREWAREHOUSE.varname, "file://" + warehouseDir.getAbsolutePath()); - newConf.writeXml(fos); - } - assertThat(hiveSiteXML.toPath()).exists(); - - // Construct the catalog attributions. 
- Map props = Maps.newHashMap(); - props.put("type", "iceberg"); - props.put(FlinkCatalogFactory.ICEBERG_CATALOG_TYPE, "hive"); - props.put(CatalogProperties.URI, CatalogTestBase.getURI(hiveConf)); - // Set the 'hive-conf-dir' instead of 'warehouse' - props.put(FlinkCatalogFactory.HIVE_CONF_DIR, hiveConfDir.getAbsolutePath()); - - checkSQLQuery(props, warehouseDir); - } - - private void checkSQLQuery(Map catalogProperties, File warehouseDir) - throws IOException { - sql("CREATE CATALOG test_catalog WITH %s", CatalogTestBase.toWithClause(catalogProperties)); - sql("USE CATALOG test_catalog"); - sql("CREATE DATABASE test_db"); - sql("USE test_db"); - sql("CREATE TABLE test_table(c1 INT, c2 STRING)"); - sql("INSERT INTO test_table SELECT 1, 'a'"); - - Path databasePath = warehouseDir.toPath().resolve("test_db.db"); - assertThat(databasePath).exists(); - - Path tablePath = databasePath.resolve("test_table"); - assertThat(tablePath).exists(); - - Path dataPath = tablePath.resolve("data"); - assertThat(dataPath).exists(); - assertThat(Files.list(dataPath).count()) - .as("Should have a .crc file and a .parquet file") - .isEqualTo(2); - - sql("DROP TABLE test_table"); - sql("DROP DATABASE test_db"); - dropCatalog("test_catalog", false); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestFlinkSchemaUtil.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestFlinkSchemaUtil.java deleted file mode 100644 index eab60d886ada..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestFlinkSchemaUtil.java +++ /dev/null @@ -1,416 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink; - -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; - -import org.apache.flink.table.api.DataTypes; -import org.apache.flink.table.api.TableSchema; -import org.apache.flink.table.api.ValidationException; -import org.apache.flink.table.types.logical.BinaryType; -import org.apache.flink.table.types.logical.CharType; -import org.apache.flink.table.types.logical.LocalZonedTimestampType; -import org.apache.flink.table.types.logical.LogicalType; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.table.types.logical.TimeType; -import org.apache.flink.table.types.logical.TimestampType; -import org.apache.flink.table.types.logical.VarBinaryType; -import org.apache.flink.table.types.logical.VarCharType; -import org.apache.iceberg.Schema; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.Types; -import org.junit.jupiter.api.Test; - -public class TestFlinkSchemaUtil { - - @Test - public void testConvertFlinkSchemaToIcebergSchema() { - TableSchema flinkSchema = - TableSchema.builder() - .field("id", DataTypes.INT().notNull()) - .field("name", DataTypes.STRING()) /* optional by default */ - .field("salary", DataTypes.DOUBLE().notNull()) - .field( - "locations", - DataTypes.MAP( - DataTypes.STRING(), - DataTypes.ROW( - DataTypes.FIELD("posX", DataTypes.DOUBLE().notNull(), "X field"), - DataTypes.FIELD("posY", DataTypes.DOUBLE().notNull(), "Y field")))) - .field("strArray", DataTypes.ARRAY(DataTypes.STRING()).nullable()) - .field("intArray", DataTypes.ARRAY(DataTypes.INT()).nullable()) - .field("char", DataTypes.CHAR(10).notNull()) - .field("varchar", DataTypes.VARCHAR(10).notNull()) - .field("boolean", DataTypes.BOOLEAN().nullable()) - .field("tinyint", DataTypes.TINYINT()) - .field("smallint", DataTypes.SMALLINT()) - .field("bigint", DataTypes.BIGINT()) - .field("varbinary", DataTypes.VARBINARY(10)) - .field("binary", DataTypes.BINARY(10)) - .field("time", DataTypes.TIME()) - .field("timestampWithoutZone", DataTypes.TIMESTAMP()) - .field("timestampWithZone", DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE()) - .field("date", DataTypes.DATE()) - .field("decimal", DataTypes.DECIMAL(2, 2)) - .field("decimal2", DataTypes.DECIMAL(38, 2)) - .field("decimal3", DataTypes.DECIMAL(10, 1)) - .field("multiset", DataTypes.MULTISET(DataTypes.STRING().notNull())) - .build(); - - Schema icebergSchema = - new Schema( - Types.NestedField.required(0, "id", Types.IntegerType.get(), null), - Types.NestedField.optional(1, "name", Types.StringType.get(), null), - Types.NestedField.required(2, "salary", Types.DoubleType.get(), null), - Types.NestedField.optional( - 3, - "locations", - Types.MapType.ofOptional( - 24, - 25, - Types.StringType.get(), - Types.StructType.of( - Types.NestedField.required(22, "posX", Types.DoubleType.get(), "X field"), - Types.NestedField.required( - 23, "posY", Types.DoubleType.get(), "Y field")))), - Types.NestedField.optional( - 4, "strArray", Types.ListType.ofOptional(26, Types.StringType.get())), - Types.NestedField.optional( - 5, "intArray", Types.ListType.ofOptional(27, Types.IntegerType.get())), - Types.NestedField.required(6, "char", Types.StringType.get()), - Types.NestedField.required(7, "varchar", Types.StringType.get()), - Types.NestedField.optional(8, "boolean", Types.BooleanType.get()), - 
Types.NestedField.optional(9, "tinyint", Types.IntegerType.get()), - Types.NestedField.optional(10, "smallint", Types.IntegerType.get()), - Types.NestedField.optional(11, "bigint", Types.LongType.get()), - Types.NestedField.optional(12, "varbinary", Types.BinaryType.get()), - Types.NestedField.optional(13, "binary", Types.FixedType.ofLength(10)), - Types.NestedField.optional(14, "time", Types.TimeType.get()), - Types.NestedField.optional( - 15, "timestampWithoutZone", Types.TimestampType.withoutZone()), - Types.NestedField.optional(16, "timestampWithZone", Types.TimestampType.withZone()), - Types.NestedField.optional(17, "date", Types.DateType.get()), - Types.NestedField.optional(18, "decimal", Types.DecimalType.of(2, 2)), - Types.NestedField.optional(19, "decimal2", Types.DecimalType.of(38, 2)), - Types.NestedField.optional(20, "decimal3", Types.DecimalType.of(10, 1)), - Types.NestedField.optional( - 21, - "multiset", - Types.MapType.ofRequired(28, 29, Types.StringType.get(), Types.IntegerType.get()))); - - checkSchema(flinkSchema, icebergSchema); - } - - @Test - public void testMapField() { - TableSchema flinkSchema = - TableSchema.builder() - .field( - "map_int_long", - DataTypes.MAP(DataTypes.INT(), DataTypes.BIGINT()).notNull()) /* Required */ - .field( - "map_int_array_string", - DataTypes.MAP(DataTypes.ARRAY(DataTypes.INT()), DataTypes.STRING())) - .field( - "map_decimal_string", DataTypes.MAP(DataTypes.DECIMAL(10, 2), DataTypes.STRING())) - .field( - "map_fields_fields", - DataTypes.MAP( - DataTypes.ROW( - DataTypes.FIELD("field_int", DataTypes.INT(), "doc - int"), - DataTypes.FIELD("field_string", DataTypes.STRING(), "doc - string")) - .notNull(), /* Required */ - DataTypes.ROW( - DataTypes.FIELD( - "field_array", - DataTypes.ARRAY(DataTypes.STRING()), - "doc - array")) - .notNull() /* Required */) - .notNull() /* Required */) - .build(); - - Schema icebergSchema = - new Schema( - Types.NestedField.required( - 0, - "map_int_long", - Types.MapType.ofOptional(4, 5, Types.IntegerType.get(), Types.LongType.get()), - null), - Types.NestedField.optional( - 1, - "map_int_array_string", - Types.MapType.ofOptional( - 7, - 8, - Types.ListType.ofOptional(6, Types.IntegerType.get()), - Types.StringType.get()), - null), - Types.NestedField.optional( - 2, - "map_decimal_string", - Types.MapType.ofOptional( - 9, 10, Types.DecimalType.of(10, 2), Types.StringType.get())), - Types.NestedField.required( - 3, - "map_fields_fields", - Types.MapType.ofRequired( - 15, - 16, - Types.StructType.of( - Types.NestedField.optional( - 11, "field_int", Types.IntegerType.get(), "doc - int"), - Types.NestedField.optional( - 12, "field_string", Types.StringType.get(), "doc - string")), - Types.StructType.of( - Types.NestedField.optional( - 14, - "field_array", - Types.ListType.ofOptional(13, Types.StringType.get()), - "doc - array"))))); - - checkSchema(flinkSchema, icebergSchema); - } - - @Test - public void testStructField() { - TableSchema flinkSchema = - TableSchema.builder() - .field( - "struct_int_string_decimal", - DataTypes.ROW( - DataTypes.FIELD("field_int", DataTypes.INT()), - DataTypes.FIELD("field_string", DataTypes.STRING()), - DataTypes.FIELD("field_decimal", DataTypes.DECIMAL(19, 2)), - DataTypes.FIELD( - "field_struct", - DataTypes.ROW( - DataTypes.FIELD("inner_struct_int", DataTypes.INT()), - DataTypes.FIELD( - "inner_struct_float_array", - DataTypes.ARRAY(DataTypes.FLOAT()))) - .notNull()) /* Row is required */) - .notNull()) /* Required */ - .field( - "struct_map_int_int", - DataTypes.ROW( - 
DataTypes.FIELD( - "field_map", DataTypes.MAP(DataTypes.INT(), DataTypes.INT()))) - .nullable()) /* Optional */ - .build(); - - Schema icebergSchema = - new Schema( - Types.NestedField.required( - 0, - "struct_int_string_decimal", - Types.StructType.of( - Types.NestedField.optional(5, "field_int", Types.IntegerType.get()), - Types.NestedField.optional(6, "field_string", Types.StringType.get()), - Types.NestedField.optional(7, "field_decimal", Types.DecimalType.of(19, 2)), - Types.NestedField.required( - 8, - "field_struct", - Types.StructType.of( - Types.NestedField.optional( - 3, "inner_struct_int", Types.IntegerType.get()), - Types.NestedField.optional( - 4, - "inner_struct_float_array", - Types.ListType.ofOptional(2, Types.FloatType.get())))))), - Types.NestedField.optional( - 1, - "struct_map_int_int", - Types.StructType.of( - Types.NestedField.optional( - 11, - "field_map", - Types.MapType.ofOptional( - 9, 10, Types.IntegerType.get(), Types.IntegerType.get()))))); - - checkSchema(flinkSchema, icebergSchema); - } - - @Test - public void testListField() { - TableSchema flinkSchema = - TableSchema.builder() - .field( - "list_struct_fields", - DataTypes.ARRAY(DataTypes.ROW(DataTypes.FIELD("field_int", DataTypes.INT()))) - .notNull()) /* Required */ - .field( - "list_optional_struct_fields", - DataTypes.ARRAY( - DataTypes.ROW( - DataTypes.FIELD( - "field_timestamp_with_local_time_zone", - DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE()))) - .nullable()) /* Optional */ - .field( - "list_map_fields", - DataTypes.ARRAY( - DataTypes.MAP( - DataTypes.ARRAY( - DataTypes.INT().notNull()), /* Key of map must be required */ - DataTypes.ROW( - DataTypes.FIELD("field_0", DataTypes.INT(), "doc - int"))) - .notNull()) - .notNull()) /* Required */ - .build(); - - Schema icebergSchema = - new Schema( - Types.NestedField.required( - 0, - "list_struct_fields", - Types.ListType.ofOptional( - 4, - Types.StructType.of( - Types.NestedField.optional(3, "field_int", Types.IntegerType.get())))), - Types.NestedField.optional( - 1, - "list_optional_struct_fields", - Types.ListType.ofOptional( - 6, - Types.StructType.of( - Types.NestedField.optional( - 5, - "field_timestamp_with_local_time_zone", - Types.TimestampType.withZone())))), - Types.NestedField.required( - 2, - "list_map_fields", - Types.ListType.ofRequired( - 11, - Types.MapType.ofOptional( - 9, - 10, - Types.ListType.ofRequired(7, Types.IntegerType.get()), - Types.StructType.of( - Types.NestedField.optional( - 8, "field_0", Types.IntegerType.get(), "doc - int")))))); - - checkSchema(flinkSchema, icebergSchema); - } - - private void checkSchema(TableSchema flinkSchema, Schema icebergSchema) { - assertThat(FlinkSchemaUtil.convert(flinkSchema).asStruct()).isEqualTo(icebergSchema.asStruct()); - // The conversion is not a 1:1 mapping, so we just check iceberg types. 
- assertThat( - FlinkSchemaUtil.convert( - FlinkSchemaUtil.toSchema(FlinkSchemaUtil.convert(icebergSchema))) - .asStruct()) - .isEqualTo(icebergSchema.asStruct()); - } - - @Test - public void testInconsistentTypes() { - checkInconsistentType( - Types.UUIDType.get(), new BinaryType(16), new BinaryType(16), Types.FixedType.ofLength(16)); - checkInconsistentType( - Types.StringType.get(), - new VarCharType(VarCharType.MAX_LENGTH), - new CharType(100), - Types.StringType.get()); - checkInconsistentType( - Types.BinaryType.get(), - new VarBinaryType(VarBinaryType.MAX_LENGTH), - new VarBinaryType(100), - Types.BinaryType.get()); - checkInconsistentType( - Types.TimeType.get(), new TimeType(), new TimeType(3), Types.TimeType.get()); - checkInconsistentType( - Types.TimestampType.withoutZone(), - new TimestampType(6), - new TimestampType(3), - Types.TimestampType.withoutZone()); - checkInconsistentType( - Types.TimestampType.withZone(), - new LocalZonedTimestampType(6), - new LocalZonedTimestampType(3), - Types.TimestampType.withZone()); - } - - private void checkInconsistentType( - Type icebergType, - LogicalType flinkExpectedType, - LogicalType flinkType, - Type icebergExpectedType) { - assertThat(FlinkSchemaUtil.convert(icebergType)).isEqualTo(flinkExpectedType); - assertThat(FlinkSchemaUtil.convert(FlinkSchemaUtil.toSchema(RowType.of(flinkType))).asStruct()) - .isEqualTo(Types.StructType.of(Types.NestedField.optional(0, "f0", icebergExpectedType))); - } - - @Test - public void testConvertFlinkSchemaBaseOnIcebergSchema() { - Schema baseSchema = - new Schema( - Lists.newArrayList( - Types.NestedField.required(101, "int", Types.IntegerType.get()), - Types.NestedField.optional(102, "string", Types.StringType.get())), - Sets.newHashSet(101)); - - TableSchema flinkSchema = - TableSchema.builder() - .field("int", DataTypes.INT().notNull()) - .field("string", DataTypes.STRING().nullable()) - .primaryKey("int") - .build(); - Schema convertedSchema = FlinkSchemaUtil.convert(baseSchema, flinkSchema); - assertThat(convertedSchema.asStruct()).isEqualTo(baseSchema.asStruct()); - assertThat(convertedSchema.identifierFieldIds()).containsExactly(101); - } - - @Test - public void testConvertFlinkSchemaWithPrimaryKeys() { - Schema icebergSchema = - new Schema( - Lists.newArrayList( - Types.NestedField.required(1, "int", Types.IntegerType.get()), - Types.NestedField.required(2, "string", Types.StringType.get())), - Sets.newHashSet(1, 2)); - - TableSchema tableSchema = FlinkSchemaUtil.toSchema(icebergSchema); - assertThat(tableSchema.getPrimaryKey()) - .isPresent() - .get() - .satisfies(k -> assertThat(k.getColumns()).containsExactly("int", "string")); - } - - @Test - public void testConvertFlinkSchemaWithNestedColumnInPrimaryKeys() { - Schema icebergSchema = - new Schema( - Lists.newArrayList( - Types.NestedField.required( - 1, - "struct", - Types.StructType.of( - Types.NestedField.required(2, "inner", Types.IntegerType.get())))), - Sets.newHashSet(2)); - - assertThatThrownBy(() -> FlinkSchemaUtil.toSchema(icebergSchema)) - .isInstanceOf(ValidationException.class) - .hasMessageStartingWith("Could not create a PRIMARY KEY") - .hasMessageContaining("Column 'struct.inner' does not exist."); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSink.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSink.java deleted file mode 100644 index b73300e3f170..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSink.java 
+++ /dev/null @@ -1,358 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assumptions.assumeThat; - -import java.util.Collections; -import java.util.List; -import java.util.Map; -import java.util.stream.Collectors; -import java.util.stream.IntStream; -import org.apache.flink.api.dag.Transformation; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.table.api.EnvironmentSettings; -import org.apache.flink.table.api.Expressions; -import org.apache.flink.table.api.TableEnvironment; -import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; -import org.apache.flink.table.api.internal.TableEnvironmentImpl; -import org.apache.flink.table.operations.ModifyOperation; -import org.apache.flink.table.planner.delegation.PlannerBase; -import org.apache.flink.types.Row; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.DistributionMode; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Parameter; -import org.apache.iceberg.Parameters; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.catalog.Namespace; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.flink.source.BoundedTableFactory; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.TestTemplate; - -public class TestFlinkTableSink extends CatalogTestBase { - - private static final String SOURCE_TABLE = "default_catalog.default_database.bounded_source"; - private static final String TABLE_NAME = "test_table"; - private TableEnvironment tEnv; - private Table icebergTable; - - @Parameter(index = 2) - private FileFormat format; - - @Parameter(index = 3) - private boolean isStreamingJob; - - @Parameters(name = "catalogName={0}, baseNamespace={1}, format={2}, isStreaming={3}") - public static List parameters() { - List parameters = Lists.newArrayList(); - for (FileFormat format : - new FileFormat[] {FileFormat.ORC, FileFormat.AVRO, FileFormat.PARQUET}) { - for (Boolean isStreaming : new Boolean[] {true, false}) { - for (Object[] catalogParams : CatalogTestBase.parameters()) { - String catalogName = (String) catalogParams[0]; - Namespace baseNamespace = (Namespace) catalogParams[1]; - parameters.add(new Object[] {catalogName, baseNamespace, format, isStreaming}); - } - } - } - return parameters; - } - - 
@Override - protected TableEnvironment getTableEnv() { - if (tEnv == null) { - synchronized (this) { - EnvironmentSettings.Builder settingsBuilder = EnvironmentSettings.newInstance(); - if (isStreamingJob) { - settingsBuilder.inStreamingMode(); - StreamExecutionEnvironment env = - StreamExecutionEnvironment.getExecutionEnvironment( - MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG); - env.enableCheckpointing(400); - env.setMaxParallelism(2); - env.setParallelism(2); - tEnv = StreamTableEnvironment.create(env, settingsBuilder.build()); - } else { - settingsBuilder.inBatchMode(); - tEnv = TableEnvironment.create(settingsBuilder.build()); - } - } - } - return tEnv; - } - - @Override - @BeforeEach - public void before() { - super.before(); - sql("CREATE DATABASE %s", flinkDatabase); - sql("USE CATALOG %s", catalogName); - sql("USE %s", DATABASE); - sql( - "CREATE TABLE %s (id int, data varchar) with ('write.format.default'='%s')", - TABLE_NAME, format.name()); - icebergTable = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME)); - } - - @Override - @AfterEach - public void clean() { - sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, TABLE_NAME); - sql("DROP DATABASE IF EXISTS %s", flinkDatabase); - BoundedTableFactory.clearDataSets(); - super.clean(); - } - - @TestTemplate - public void testInsertFromSourceTable() throws Exception { - // Register the rows into a temporary table. - getTableEnv() - .createTemporaryView( - "sourceTable", - getTableEnv() - .fromValues( - SimpleDataUtil.FLINK_SCHEMA.toRowDataType(), - Expressions.row(1, "hello"), - Expressions.row(2, "world"), - Expressions.row(3, (String) null), - Expressions.row(null, "bar"))); - - // Redirect the records from source table to destination table. - sql("INSERT INTO %s SELECT id,data from sourceTable", TABLE_NAME); - - // Assert the table records as expected. 
- SimpleDataUtil.assertTableRecords( - icebergTable, - Lists.newArrayList( - SimpleDataUtil.createRecord(1, "hello"), - SimpleDataUtil.createRecord(2, "world"), - SimpleDataUtil.createRecord(3, null), - SimpleDataUtil.createRecord(null, "bar"))); - } - - @TestTemplate - public void testOverwriteTable() throws Exception { - assumeThat(isStreamingJob) - .as("Flink unbounded streaming does not support overwrite operation") - .isFalse(); - - sql("INSERT INTO %s SELECT 1, 'a'", TABLE_NAME); - SimpleDataUtil.assertTableRecords( - icebergTable, Lists.newArrayList(SimpleDataUtil.createRecord(1, "a"))); - - sql("INSERT OVERWRITE %s SELECT 2, 'b'", TABLE_NAME); - SimpleDataUtil.assertTableRecords( - icebergTable, Lists.newArrayList(SimpleDataUtil.createRecord(2, "b"))); - } - - @TestTemplate - public void testWriteParallelism() throws Exception { - List dataSet = - IntStream.range(1, 1000) - .mapToObj(i -> ImmutableList.of(Row.of(i, "aaa"), Row.of(i, "bbb"), Row.of(i, "ccc"))) - .flatMap(List::stream) - .collect(Collectors.toList()); - String dataId = BoundedTableFactory.registerDataSet(ImmutableList.of(dataSet)); - sql( - "CREATE TABLE %s(id INT NOT NULL, data STRING NOT NULL)" - + " WITH ('connector'='BoundedSource', 'data-id'='%s')", - SOURCE_TABLE, dataId); - - PlannerBase planner = (PlannerBase) ((TableEnvironmentImpl) getTableEnv()).getPlanner(); - String insertSQL = - String.format( - "INSERT INTO %s /*+ OPTIONS('write-parallelism'='1') */ SELECT * FROM %s", - TABLE_NAME, SOURCE_TABLE); - ModifyOperation operation = (ModifyOperation) planner.getParser().parse(insertSQL).get(0); - Transformation dummySink = planner.translate(Collections.singletonList(operation)).get(0); - Transformation committer = dummySink.getInputs().get(0); - Transformation writer = committer.getInputs().get(0); - - assertThat(writer.getParallelism()).as("Should have the expected 1 parallelism.").isEqualTo(1); - writer - .getInputs() - .forEach( - input -> - assertThat(input.getParallelism()) - .as("Should have the expected parallelism.") - .isEqualTo(isStreamingJob ? 
2 : 4)); - } - - @TestTemplate - public void testReplacePartitions() throws Exception { - assumeThat(isStreamingJob) - .as("Flink unbounded streaming does not support overwrite operation") - .isFalse(); - String tableName = "test_partition"; - sql( - "CREATE TABLE %s(id INT, data VARCHAR) PARTITIONED BY (data) WITH ('write.format.default'='%s')", - tableName, format.name()); - - try { - Table partitionedTable = - validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, tableName)); - - sql("INSERT INTO %s SELECT 1, 'a'", tableName); - sql("INSERT INTO %s SELECT 2, 'b'", tableName); - sql("INSERT INTO %s SELECT 3, 'c'", tableName); - - SimpleDataUtil.assertTableRecords( - partitionedTable, - Lists.newArrayList( - SimpleDataUtil.createRecord(1, "a"), - SimpleDataUtil.createRecord(2, "b"), - SimpleDataUtil.createRecord(3, "c"))); - - sql("INSERT OVERWRITE %s SELECT 4, 'b'", tableName); - sql("INSERT OVERWRITE %s SELECT 5, 'a'", tableName); - - SimpleDataUtil.assertTableRecords( - partitionedTable, - Lists.newArrayList( - SimpleDataUtil.createRecord(5, "a"), - SimpleDataUtil.createRecord(4, "b"), - SimpleDataUtil.createRecord(3, "c"))); - - sql("INSERT OVERWRITE %s PARTITION (data='a') SELECT 6", tableName); - - SimpleDataUtil.assertTableRecords( - partitionedTable, - Lists.newArrayList( - SimpleDataUtil.createRecord(6, "a"), - SimpleDataUtil.createRecord(4, "b"), - SimpleDataUtil.createRecord(3, "c"))); - } finally { - sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, tableName); - } - } - - @TestTemplate - public void testInsertIntoPartition() throws Exception { - String tableName = "test_insert_into_partition"; - sql( - "CREATE TABLE %s(id INT, data VARCHAR) PARTITIONED BY (data) WITH ('write.format.default'='%s')", - tableName, format.name()); - - try { - Table partitionedTable = - validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, tableName)); - - // Full partition. - sql("INSERT INTO %s PARTITION (data='a') SELECT 1", tableName); - sql("INSERT INTO %s PARTITION (data='a') SELECT 2", tableName); - sql("INSERT INTO %s PARTITION (data='b') SELECT 3", tableName); - - SimpleDataUtil.assertTableRecords( - partitionedTable, - Lists.newArrayList( - SimpleDataUtil.createRecord(1, "a"), - SimpleDataUtil.createRecord(2, "a"), - SimpleDataUtil.createRecord(3, "b"))); - - // Partial partition. - sql("INSERT INTO %s SELECT 4, 'c'", tableName); - sql("INSERT INTO %s SELECT 5, 'd'", tableName); - - SimpleDataUtil.assertTableRecords( - partitionedTable, - Lists.newArrayList( - SimpleDataUtil.createRecord(1, "a"), - SimpleDataUtil.createRecord(2, "a"), - SimpleDataUtil.createRecord(3, "b"), - SimpleDataUtil.createRecord(4, "c"), - SimpleDataUtil.createRecord(5, "d"))); - } finally { - sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, tableName); - } - } - - @TestTemplate - public void testHashDistributeMode() throws Exception { - String tableName = "test_hash_distribution_mode"; - Map tableProps = - ImmutableMap.of( - "write.format.default", - format.name(), - TableProperties.WRITE_DISTRIBUTION_MODE, - DistributionMode.HASH.modeName()); - - // Initialize a BoundedSource table to precisely emit those rows in only one checkpoint. 
- List dataSet = - IntStream.range(1, 1000) - .mapToObj(i -> ImmutableList.of(Row.of(i, "aaa"), Row.of(i, "bbb"), Row.of(i, "ccc"))) - .flatMap(List::stream) - .collect(Collectors.toList()); - String dataId = BoundedTableFactory.registerDataSet(ImmutableList.of(dataSet)); - sql( - "CREATE TABLE %s(id INT NOT NULL, data STRING NOT NULL)" - + " WITH ('connector'='BoundedSource', 'data-id'='%s')", - SOURCE_TABLE, dataId); - - assertThat(sql("SELECT * FROM %s", SOURCE_TABLE)) - .as("Should have the expected rows in source table.") - .containsExactlyInAnyOrderElementsOf(dataSet); - - sql( - "CREATE TABLE %s(id INT, data VARCHAR) PARTITIONED BY (data) WITH %s", - tableName, toWithClause(tableProps)); - - try { - // Insert data set. - sql("INSERT INTO %s SELECT * FROM %s", tableName, SOURCE_TABLE); - - assertThat(sql("SELECT * FROM %s", tableName)) - .as("Should have the expected rows in sink table.") - .containsExactlyInAnyOrderElementsOf(dataSet); - - // Sometimes we will have more than one checkpoint if we pass the auto checkpoint interval, - // thus producing multiple snapshots. Here we assert that each snapshot has only 1 file per - // partition. - Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, tableName)); - Map> snapshotToDataFiles = SimpleDataUtil.snapshotToDataFiles(table); - for (List dataFiles : snapshotToDataFiles.values()) { - if (dataFiles.isEmpty()) { - continue; - } - - assertThat( - SimpleDataUtil.matchingPartitions( - dataFiles, table.spec(), ImmutableMap.of("data", "aaa"))) - .hasSize(1); - assertThat( - SimpleDataUtil.matchingPartitions( - dataFiles, table.spec(), ImmutableMap.of("data", "bbb"))) - .hasSize(1); - assertThat( - SimpleDataUtil.matchingPartitions( - dataFiles, table.spec(), ImmutableMap.of("data", "ccc"))) - .hasSize(1); - } - } finally { - sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, tableName); - } - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestFlinkUpsert.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestFlinkUpsert.java deleted file mode 100644 index d52d54e159e6..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestFlinkUpsert.java +++ /dev/null @@ -1,334 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink; - -import java.time.LocalDate; -import java.util.List; -import java.util.Map; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.table.api.EnvironmentSettings; -import org.apache.flink.table.api.TableEnvironment; -import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; -import org.apache.flink.types.Row; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Parameter; -import org.apache.iceberg.Parameters; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.catalog.Namespace; -import org.apache.iceberg.relocated.com.google.common.collect.Iterables; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.TestTemplate; - -public class TestFlinkUpsert extends CatalogTestBase { - - @Parameter(index = 2) - private FileFormat format; - - @Parameter(index = 3) - private boolean isStreamingJob; - - private final Map tableUpsertProps = Maps.newHashMap(); - private TableEnvironment tEnv; - - @Parameters(name = "catalogName={0}, baseNamespace={1}, format={2}, isStreaming={3}") - public static List parameters() { - List parameters = Lists.newArrayList(); - for (FileFormat format : - new FileFormat[] {FileFormat.PARQUET, FileFormat.AVRO, FileFormat.ORC}) { - for (Boolean isStreaming : new Boolean[] {true, false}) { - // Only test with one catalog as this is a file operation concern. - // FlinkCatalogTestBase requires the catalog name start with testhadoop if using hadoop - // catalog. - String catalogName = "testhadoop"; - Namespace baseNamespace = Namespace.of("default"); - parameters.add(new Object[] {catalogName, baseNamespace, format, isStreaming}); - } - } - return parameters; - } - - @Override - protected TableEnvironment getTableEnv() { - if (tEnv == null) { - synchronized (this) { - EnvironmentSettings.Builder settingsBuilder = EnvironmentSettings.newInstance(); - if (isStreamingJob) { - settingsBuilder.inStreamingMode(); - StreamExecutionEnvironment env = - StreamExecutionEnvironment.getExecutionEnvironment( - MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG); - env.enableCheckpointing(400); - env.setMaxParallelism(2); - env.setParallelism(2); - tEnv = StreamTableEnvironment.create(env, settingsBuilder.build()); - } else { - settingsBuilder.inBatchMode(); - tEnv = TableEnvironment.create(settingsBuilder.build()); - } - } - } - return tEnv; - } - - @Override - @BeforeEach - public void before() { - super.before(); - sql("CREATE DATABASE IF NOT EXISTS %s", flinkDatabase); - sql("USE CATALOG %s", catalogName); - sql("USE %s", DATABASE); - tableUpsertProps.put(TableProperties.FORMAT_VERSION, "2"); - tableUpsertProps.put(TableProperties.UPSERT_ENABLED, "true"); - tableUpsertProps.put(TableProperties.DEFAULT_FILE_FORMAT, format.name()); - } - - @Override - @AfterEach - public void clean() { - sql("DROP DATABASE IF EXISTS %s", flinkDatabase); - super.clean(); - } - - @TestTemplate - public void testUpsertAndQuery() { - String tableName = "test_upsert_query"; - LocalDate dt20220301 = LocalDate.of(2022, 3, 1); - LocalDate dt20220302 = LocalDate.of(2022, 3, 2); - - sql( - "CREATE TABLE %s(id INT NOT NULL, name STRING NOT NULL, dt DATE, PRIMARY KEY(id,dt) NOT ENFORCED) " - + "PARTITIONED BY (dt) WITH %s", - tableName, toWithClause(tableUpsertProps)); - - try { - sql( 
- "INSERT INTO %s VALUES " - + "(1, 'Bill', DATE '2022-03-01')," - + "(1, 'Jane', DATE '2022-03-01')," - + "(2, 'Jane', DATE '2022-03-01')", - tableName); - - sql( - "INSERT INTO %s VALUES " - + "(2, 'Bill', DATE '2022-03-01')," - + "(1, 'Jane', DATE '2022-03-02')," - + "(2, 'Jane', DATE '2022-03-02')", - tableName); - - List rowsOn20220301 = - Lists.newArrayList(Row.of(1, "Jane", dt20220301), Row.of(2, "Bill", dt20220301)); - TestHelpers.assertRows( - sql("SELECT * FROM %s WHERE dt < '2022-03-02'", tableName), rowsOn20220301); - - List rowsOn20220302 = - Lists.newArrayList(Row.of(1, "Jane", dt20220302), Row.of(2, "Jane", dt20220302)); - TestHelpers.assertRows( - sql("SELECT * FROM %s WHERE dt = '2022-03-02'", tableName), rowsOn20220302); - - TestHelpers.assertRows( - sql("SELECT * FROM %s", tableName), - Lists.newArrayList(Iterables.concat(rowsOn20220301, rowsOn20220302))); - } finally { - sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, tableName); - } - } - - @TestTemplate - public void testUpsertOptions() { - String tableName = "test_upsert_options"; - LocalDate dt20220301 = LocalDate.of(2022, 3, 1); - LocalDate dt20220302 = LocalDate.of(2022, 3, 2); - - Map optionsUpsertProps = Maps.newHashMap(tableUpsertProps); - optionsUpsertProps.remove(TableProperties.UPSERT_ENABLED); - sql( - "CREATE TABLE %s(id INT NOT NULL, name STRING NOT NULL, dt DATE, PRIMARY KEY(id,dt) NOT ENFORCED) " - + "PARTITIONED BY (dt) WITH %s", - tableName, toWithClause(optionsUpsertProps)); - - try { - sql( - "INSERT INTO %s /*+ OPTIONS('upsert-enabled'='true')*/ VALUES " - + "(1, 'Bill', DATE '2022-03-01')," - + "(1, 'Jane', DATE '2022-03-01')," - + "(2, 'Jane', DATE '2022-03-01')", - tableName); - - sql( - "INSERT INTO %s /*+ OPTIONS('upsert-enabled'='true')*/ VALUES " - + "(2, 'Bill', DATE '2022-03-01')," - + "(1, 'Jane', DATE '2022-03-02')," - + "(2, 'Jane', DATE '2022-03-02')", - tableName); - - List rowsOn20220301 = - Lists.newArrayList(Row.of(1, "Jane", dt20220301), Row.of(2, "Bill", dt20220301)); - TestHelpers.assertRows( - sql("SELECT * FROM %s WHERE dt < '2022-03-02'", tableName), rowsOn20220301); - - List rowsOn20220302 = - Lists.newArrayList(Row.of(1, "Jane", dt20220302), Row.of(2, "Jane", dt20220302)); - TestHelpers.assertRows( - sql("SELECT * FROM %s WHERE dt = '2022-03-02'", tableName), rowsOn20220302); - - TestHelpers.assertRows( - sql("SELECT * FROM %s", tableName), - Lists.newArrayList(Iterables.concat(rowsOn20220301, rowsOn20220302))); - } finally { - sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, tableName); - } - } - - @TestTemplate - public void testPrimaryKeyEqualToPartitionKey() { - // This is an SQL based reproduction of TestFlinkIcebergSinkV2#testUpsertOnDataKey - String tableName = "upsert_on_id_key"; - try { - sql( - "CREATE TABLE %s(id INT NOT NULL, name STRING NOT NULL, PRIMARY KEY(id) NOT ENFORCED) " - + "PARTITIONED BY (id) WITH %s", - tableName, toWithClause(tableUpsertProps)); - - sql("INSERT INTO %s VALUES " + "(1, 'Bill')," + "(1, 'Jane')," + "(2, 'Bill')", tableName); - - TestHelpers.assertRows( - sql("SELECT * FROM %s", tableName), - Lists.newArrayList(Row.of(1, "Jane"), Row.of(2, "Bill"))); - - sql("INSERT INTO %s VALUES " + "(1, 'Bill')," + "(2, 'Jane')", tableName); - - TestHelpers.assertRows( - sql("SELECT * FROM %s", tableName), - Lists.newArrayList(Row.of(1, "Bill"), Row.of(2, "Jane"))); - - sql("INSERT INTO %s VALUES " + "(3, 'Bill')," + "(4, 'Jane')", tableName); - - TestHelpers.assertRows( - sql("SELECT * FROM %s", tableName), - Lists.newArrayList( - Row.of(1, 
"Bill"), Row.of(2, "Jane"), Row.of(3, "Bill"), Row.of(4, "Jane"))); - } finally { - sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, tableName); - } - } - - @TestTemplate - public void testPrimaryKeyFieldsAtBeginningOfSchema() { - String tableName = "upsert_on_pk_at_schema_start"; - LocalDate dt = LocalDate.of(2022, 3, 1); - try { - sql( - "CREATE TABLE %s(id INT, dt DATE NOT NULL, name STRING NOT NULL, PRIMARY KEY(id,dt) NOT ENFORCED) " - + "PARTITIONED BY (dt) WITH %s", - tableName, toWithClause(tableUpsertProps)); - - sql( - "INSERT INTO %s VALUES " - + "(1, DATE '2022-03-01', 'Andy')," - + "(1, DATE '2022-03-01', 'Bill')," - + "(2, DATE '2022-03-01', 'Jane')", - tableName); - - TestHelpers.assertRows( - sql("SELECT * FROM %s", tableName), - Lists.newArrayList(Row.of(1, dt, "Bill"), Row.of(2, dt, "Jane"))); - - sql( - "INSERT INTO %s VALUES " - + "(1, DATE '2022-03-01', 'Jane')," - + "(2, DATE '2022-03-01', 'Bill')", - tableName); - - TestHelpers.assertRows( - sql("SELECT * FROM %s", tableName), - Lists.newArrayList(Row.of(1, dt, "Jane"), Row.of(2, dt, "Bill"))); - - sql( - "INSERT INTO %s VALUES " - + "(3, DATE '2022-03-01', 'Duke')," - + "(4, DATE '2022-03-01', 'Leon')", - tableName); - - TestHelpers.assertRows( - sql("SELECT * FROM %s", tableName), - Lists.newArrayList( - Row.of(1, dt, "Jane"), - Row.of(2, dt, "Bill"), - Row.of(3, dt, "Duke"), - Row.of(4, dt, "Leon"))); - } finally { - sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, tableName); - } - } - - @TestTemplate - public void testPrimaryKeyFieldsAtEndOfTableSchema() { - // This is the same test case as testPrimaryKeyFieldsAtBeginningOfSchema, but the primary key - // fields - // are located at the end of the flink schema. - String tableName = "upsert_on_pk_at_schema_end"; - LocalDate dt = LocalDate.of(2022, 3, 1); - try { - sql( - "CREATE TABLE %s(name STRING NOT NULL, id INT, dt DATE NOT NULL, PRIMARY KEY(id,dt) NOT ENFORCED) " - + "PARTITIONED BY (dt) WITH %s", - tableName, toWithClause(tableUpsertProps)); - - sql( - "INSERT INTO %s VALUES " - + "('Andy', 1, DATE '2022-03-01')," - + "('Bill', 1, DATE '2022-03-01')," - + "('Jane', 2, DATE '2022-03-01')", - tableName); - - TestHelpers.assertRows( - sql("SELECT * FROM %s", tableName), - Lists.newArrayList(Row.of("Bill", 1, dt), Row.of("Jane", 2, dt))); - - sql( - "INSERT INTO %s VALUES " - + "('Jane', 1, DATE '2022-03-01')," - + "('Bill', 2, DATE '2022-03-01')", - tableName); - - TestHelpers.assertRows( - sql("SELECT * FROM %s", tableName), - Lists.newArrayList(Row.of("Jane", 1, dt), Row.of("Bill", 2, dt))); - - sql( - "INSERT INTO %s VALUES " - + "('Duke', 3, DATE '2022-03-01')," - + "('Leon', 4, DATE '2022-03-01')", - tableName); - - TestHelpers.assertRows( - sql("SELECT * FROM %s", tableName), - Lists.newArrayList( - Row.of("Jane", 1, dt), - Row.of("Bill", 2, dt), - Row.of("Duke", 3, dt), - Row.of("Leon", 4, dt))); - } finally { - sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, tableName); - } - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestHelpers.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestHelpers.java deleted file mode 100644 index 8cebf950c5f0..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestHelpers.java +++ /dev/null @@ -1,632 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.IOException; -import java.math.BigDecimal; -import java.nio.ByteBuffer; -import java.time.LocalDate; -import java.time.LocalDateTime; -import java.time.LocalTime; -import java.time.OffsetDateTime; -import java.util.Collection; -import java.util.List; -import java.util.Map; -import java.util.UUID; -import java.util.function.Consumer; -import java.util.stream.Collectors; -import org.apache.avro.generic.GenericData; -import org.apache.flink.api.common.ExecutionConfig; -import org.apache.flink.api.common.typeutils.TypeSerializer; -import org.apache.flink.api.java.typeutils.runtime.kryo.KryoSerializer; -import org.apache.flink.core.memory.DataInputDeserializer; -import org.apache.flink.core.memory.DataOutputSerializer; -import org.apache.flink.table.data.ArrayData; -import org.apache.flink.table.data.DecimalData; -import org.apache.flink.table.data.GenericArrayData; -import org.apache.flink.table.data.GenericMapData; -import org.apache.flink.table.data.MapData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.TimestampData; -import org.apache.flink.table.data.conversion.DataStructureConverter; -import org.apache.flink.table.data.conversion.DataStructureConverters; -import org.apache.flink.table.runtime.typeutils.InternalSerializers; -import org.apache.flink.table.types.logical.ArrayType; -import org.apache.flink.table.types.logical.LogicalType; -import org.apache.flink.table.types.logical.MapType; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.table.types.utils.TypeConversions; -import org.apache.flink.types.Row; -import org.apache.iceberg.ContentFile; -import org.apache.iceberg.ManifestFile; -import org.apache.iceberg.Schema; -import org.apache.iceberg.StructLike; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.data.RowDataUtil; -import org.apache.iceberg.flink.source.FlinkInputFormat; -import org.apache.iceberg.flink.source.FlinkInputSplit; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Streams; -import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.Types; -import org.apache.iceberg.util.DateTimeUtil; - -public class TestHelpers { - private TestHelpers() {} - - public static T roundTripKryoSerialize(Class clazz, T table) throws IOException { - KryoSerializer kryo = new KryoSerializer<>(clazz, new ExecutionConfig()); - - DataOutputSerializer outputView = new DataOutputSerializer(1024); - kryo.serialize(table, outputView); - - DataInputDeserializer inputView = new DataInputDeserializer(outputView.getCopyOfBuffer()); - return kryo.deserialize(inputView); - } - - public static RowData copyRowData(RowData from, RowType rowType) { - TypeSerializer[] fieldSerializers = - rowType.getChildren().stream() - 
.map((LogicalType type) -> InternalSerializers.create(type)) - .toArray(TypeSerializer[]::new); - RowData.FieldGetter[] fieldGetters = new RowData.FieldGetter[rowType.getFieldCount()]; - for (int i = 0; i < rowType.getFieldCount(); ++i) { - fieldGetters[i] = RowData.createFieldGetter(rowType.getTypeAt(i), i); - } - - return RowDataUtil.clone(from, null, rowType, fieldSerializers, fieldGetters); - } - - public static void readRowData(FlinkInputFormat input, Consumer visitor) - throws IOException { - for (FlinkInputSplit s : input.createInputSplits(0)) { - input.open(s); - try { - while (!input.reachedEnd()) { - RowData row = input.nextRecord(null); - visitor.accept(row); - } - } finally { - input.close(); - } - } - } - - public static List readRowData(FlinkInputFormat inputFormat, RowType rowType) - throws IOException { - List results = Lists.newArrayList(); - readRowData(inputFormat, row -> results.add(copyRowData(row, rowType))); - return results; - } - - public static List readRows(FlinkInputFormat inputFormat, RowType rowType) - throws IOException { - return convertRowDataToRow(readRowData(inputFormat, rowType), rowType); - } - - public static List convertRowDataToRow(List rowDataList, RowType rowType) { - DataStructureConverter converter = - DataStructureConverters.getConverter(TypeConversions.fromLogicalToDataType(rowType)); - return rowDataList.stream() - .map(converter::toExternal) - .map(Row.class::cast) - .collect(Collectors.toList()); - } - - private static List convertRecordToRow(List expectedRecords, Schema schema) { - List expected = Lists.newArrayList(); - @SuppressWarnings("unchecked") - DataStructureConverter converter = - (DataStructureConverter) - DataStructureConverters.getConverter( - TypeConversions.fromLogicalToDataType(FlinkSchemaUtil.convert(schema))); - expectedRecords.forEach( - r -> expected.add(converter.toExternal(RowDataConverter.convert(schema, r)))); - return expected; - } - - public static void assertRecordsWithOrder( - List results, List expectedRecords, Schema schema) { - List expected = convertRecordToRow(expectedRecords, schema); - assertRowsWithOrder(results, expected); - } - - public static void assertRecords(List results, List expectedRecords, Schema schema) { - List expected = convertRecordToRow(expectedRecords, schema); - assertRows(results, expected); - } - - public static void assertRows(List results, List expected, RowType rowType) { - assertRows(convertRowDataToRow(results, rowType), convertRowDataToRow(expected, rowType)); - } - - public static void assertRows(List results, List expected) { - assertThat(results).containsExactlyInAnyOrderElementsOf(expected); - } - - public static void assertRowsWithOrder(List results, List expected) { - assertThat(results).containsExactlyElementsOf(expected); - } - - public static void assertRowData(Schema schema, StructLike expected, RowData actual) { - assertRowData(schema.asStruct(), FlinkSchemaUtil.convert(schema), expected, actual); - } - - public static void assertRowData( - Types.StructType structType, - LogicalType rowType, - StructLike expectedRecord, - RowData actualRowData) { - if (expectedRecord == null && actualRowData == null) { - return; - } - - assertThat(expectedRecord).isNotNull(); - assertThat(actualRowData).isNotNull(); - - List types = Lists.newArrayList(); - for (Types.NestedField field : structType.fields()) { - types.add(field.type()); - } - - for (int i = 0; i < types.size(); i += 1) { - LogicalType logicalType = ((RowType) rowType).getTypeAt(i); - Object expected = 
expectedRecord.get(i, Object.class); - // The RowData.createFieldGetter won't return null for the required field. But in the - // projection case, if we are - // projecting a nested required field from an optional struct, then we should give a null for - // the projected field - // if the outer struct value is null. So we need to check the nullable for actualRowData here. - // For more details - // please see issue #2738. - Object actual = - actualRowData.isNullAt(i) - ? null - : RowData.createFieldGetter(logicalType, i).getFieldOrNull(actualRowData); - assertEquals(types.get(i), logicalType, expected, actual); - } - } - - private static void assertEquals( - Type type, LogicalType logicalType, Object expected, Object actual) { - - if (expected == null && actual == null) { - return; - } - - assertThat(expected).isNotNull(); - assertThat(actual).isNotNull(); - - switch (type.typeId()) { - case BOOLEAN: - assertThat(actual).as("boolean value should be equal").isEqualTo(expected); - break; - case INTEGER: - assertThat(actual).as("int value should be equal").isEqualTo(expected); - break; - case LONG: - assertThat(actual).as("long value should be equal").isEqualTo(expected); - break; - case FLOAT: - assertThat(actual).as("float value should be equal").isEqualTo(expected); - break; - case DOUBLE: - assertThat(actual).as("double value should be equal").isEqualTo(expected); - break; - case STRING: - assertThat(expected).as("Should expect a CharSequence").isInstanceOf(CharSequence.class); - assertThat(actual.toString()) - .as("string should be equal") - .isEqualTo(String.valueOf(expected)); - break; - case DATE: - assertThat(expected).as("Should expect a Date").isInstanceOf(LocalDate.class); - LocalDate date = DateTimeUtil.dateFromDays((int) actual); - assertThat(date).as("date should be equal").isEqualTo(expected); - break; - case TIME: - assertThat(expected).as("Should expect a LocalTime").isInstanceOf(LocalTime.class); - int milliseconds = (int) (((LocalTime) expected).toNanoOfDay() / 1000_000); - assertThat(actual).as("time millis should be equal").isEqualTo(milliseconds); - break; - case TIMESTAMP: - if (((Types.TimestampType) type).shouldAdjustToUTC()) { - assertThat(expected) - .as("Should expect a OffsetDataTime") - .isInstanceOf(OffsetDateTime.class); - OffsetDateTime ts = (OffsetDateTime) expected; - assertThat(((TimestampData) actual).toLocalDateTime()) - .as("OffsetDataTime should be equal") - .isEqualTo(ts.toLocalDateTime()); - } else { - assertThat(expected) - .as("Should expect a LocalDataTime") - .isInstanceOf(LocalDateTime.class); - LocalDateTime ts = (LocalDateTime) expected; - assertThat(((TimestampData) actual).toLocalDateTime()) - .as("LocalDataTime should be equal") - .isEqualTo(ts); - } - break; - case BINARY: - assertThat(ByteBuffer.wrap((byte[]) actual)) - .as("Should expect a ByteBuffer") - .isInstanceOf(ByteBuffer.class) - .isEqualTo(expected); - break; - case DECIMAL: - assertThat(expected).as("Should expect a BigDecimal").isInstanceOf(BigDecimal.class); - BigDecimal bd = (BigDecimal) expected; - assertThat(((DecimalData) actual).toBigDecimal()) - .as("decimal value should be equal") - .isEqualTo(bd); - break; - case LIST: - assertThat(expected).as("Should expect a Collection").isInstanceOf(Collection.class); - Collection expectedArrayData = (Collection) expected; - ArrayData actualArrayData = (ArrayData) actual; - LogicalType elementType = ((ArrayType) logicalType).getElementType(); - assertThat(actualArrayData.size()) - .as("array length should be equal") - 
.isEqualTo(expectedArrayData.size()); - assertArrayValues( - type.asListType().elementType(), elementType, expectedArrayData, actualArrayData); - break; - case MAP: - assertThat(expected).as("Should expect a Map").isInstanceOf(Map.class); - assertMapValues(type.asMapType(), logicalType, (Map) expected, (MapData) actual); - break; - case STRUCT: - assertThat(expected).as("Should expect a Record").isInstanceOf(StructLike.class); - assertRowData(type.asStructType(), logicalType, (StructLike) expected, (RowData) actual); - break; - case UUID: - assertThat(expected).as("Should expect a UUID").isInstanceOf(UUID.class); - ByteBuffer bb = ByteBuffer.wrap((byte[]) actual); - long firstLong = bb.getLong(); - long secondLong = bb.getLong(); - assertThat(new UUID(firstLong, secondLong).toString()) - .as("UUID should be equal") - .isEqualTo(expected.toString()); - break; - case FIXED: - assertThat(actual) - .as("Should expect byte[]") - .isInstanceOf(byte[].class) - .isEqualTo(expected); - break; - default: - throw new IllegalArgumentException("Not a supported type: " + type); - } - } - - public static void assertEquals(Schema schema, List records, List rows) { - Streams.forEachPair( - records.stream(), rows.stream(), (record, row) -> assertEquals(schema, record, row)); - } - - public static void assertEquals(Schema schema, GenericData.Record record, Row row) { - List fields = schema.asStruct().fields(); - assertThat(fields).hasSameSizeAs(record.getSchema().getFields()); - assertThat(fields).hasSize(row.getArity()); - - RowType rowType = FlinkSchemaUtil.convert(schema); - for (int i = 0; i < fields.size(); ++i) { - Type fieldType = fields.get(i).type(); - Object expectedValue = record.get(i); - Object actualValue = row.getField(i); - LogicalType logicalType = rowType.getTypeAt(i); - assertAvroEquals(fieldType, logicalType, expectedValue, actualValue); - } - } - - private static void assertEquals(Types.StructType struct, GenericData.Record record, Row row) { - List fields = struct.fields(); - for (int i = 0; i < fields.size(); i += 1) { - Type fieldType = fields.get(i).type(); - Object expectedValue = record.get(i); - Object actualValue = row.getField(i); - assertAvroEquals(fieldType, null, expectedValue, actualValue); - } - } - - private static void assertAvroEquals( - Type type, LogicalType logicalType, Object expected, Object actual) { - - if (expected == null && actual == null) { - return; - } - assertThat(expected).isNotNull(); - assertThat(actual).isNotNull(); - - switch (type.typeId()) { - case BOOLEAN: - case INTEGER: - case LONG: - case FLOAT: - case DOUBLE: - assertThat(expected) - .as("Should expect a " + type.typeId().javaClass()) - .isInstanceOf(type.typeId().javaClass()); - assertThat(actual) - .as("Should expect a " + type.typeId().javaClass()) - .isInstanceOf(type.typeId().javaClass()); - assertThat(actual).as(type.typeId() + " value should be equal").isEqualTo(expected); - break; - case STRING: - assertThat(expected).as("Should expect a CharSequence").isInstanceOf(CharSequence.class); - assertThat(actual).as("Should expect a CharSequence").isInstanceOf(CharSequence.class); - assertThat(actual.toString()).as("string should be equal").isEqualTo(expected.toString()); - break; - case DATE: - assertThat(expected).as("Should expect a Date").isInstanceOf(LocalDate.class); - LocalDate date = DateTimeUtil.dateFromDays((int) actual); - assertThat(date).as("date should be equal").isEqualTo(expected); - break; - case TIME: - assertThat(expected).as("Should expect a 
LocalTime").isInstanceOf(LocalTime.class); - int milliseconds = (int) (((LocalTime) expected).toNanoOfDay() / 1000_000); - assertThat(actual).as("time millis should be equal").isEqualTo(milliseconds); - break; - case TIMESTAMP: - if (((Types.TimestampType) type).shouldAdjustToUTC()) { - assertThat(expected) - .as("Should expect a OffsetDataTime") - .isInstanceOf(OffsetDateTime.class); - OffsetDateTime ts = (OffsetDateTime) expected; - assertThat(((TimestampData) actual).toLocalDateTime()) - .as("OffsetDataTime should be equal") - .isEqualTo(ts.toLocalDateTime()); - } else { - assertThat(expected) - .as("Should expect a LocalDataTime") - .isInstanceOf(LocalDateTime.class); - LocalDateTime ts = (LocalDateTime) expected; - assertThat(((TimestampData) actual).toLocalDateTime()) - .as("LocalDataTime should be equal") - .isEqualTo(ts); - } - break; - case BINARY: - assertThat(ByteBuffer.wrap((byte[]) actual)) - .as("Should expect a ByteBuffer") - .isInstanceOf(ByteBuffer.class) - .isEqualTo(expected); - break; - case DECIMAL: - assertThat(expected).as("Should expect a BigDecimal").isInstanceOf(BigDecimal.class); - BigDecimal bd = (BigDecimal) expected; - assertThat(((DecimalData) actual).toBigDecimal()) - .as("decimal value should be equal") - .isEqualTo(bd); - break; - case LIST: - assertThat(expected).as("Should expect a Collection").isInstanceOf(Collection.class); - Collection expectedArrayData = (Collection) expected; - ArrayData actualArrayData; - try { - actualArrayData = (ArrayData) actual; - } catch (ClassCastException e) { - actualArrayData = new GenericArrayData((Object[]) actual); - } - LogicalType elementType = ((ArrayType) logicalType).getElementType(); - assertThat(actualArrayData.size()) - .as("array length should be equal") - .isEqualTo(expectedArrayData.size()); - assertArrayValues( - type.asListType().elementType(), elementType, expectedArrayData, actualArrayData); - break; - case MAP: - assertThat(expected).as("Should expect a Map").isInstanceOf(Map.class); - MapData actualMap; - try { - actualMap = (MapData) actual; - } catch (ClassCastException e) { - actualMap = new GenericMapData((Map) actual); - } - assertMapValues(type.asMapType(), logicalType, (Map) expected, actualMap); - break; - case STRUCT: - assertThat(expected).as("Should expect a Record").isInstanceOf(GenericData.Record.class); - assertEquals( - type.asNestedType().asStructType(), (GenericData.Record) expected, (Row) actual); - break; - case UUID: - assertThat(expected).as("Should expect a UUID").isInstanceOf(UUID.class); - ByteBuffer bb = ByteBuffer.wrap((byte[]) actual); - long firstLong = bb.getLong(); - long secondLong = bb.getLong(); - assertThat(new UUID(firstLong, secondLong).toString()) - .as("UUID should be equal") - .isEqualTo(expected.toString()); - break; - case FIXED: - assertThat(actual) - .as("Should expect byte[]") - .isInstanceOf(byte[].class) - .isEqualTo(expected); - break; - default: - throw new IllegalArgumentException("Not a supported type: " + type); - } - } - - private static void assertArrayValues( - Type type, LogicalType logicalType, Collection expectedArray, ArrayData actualArray) { - List expectedElements = Lists.newArrayList(expectedArray); - for (int i = 0; i < expectedArray.size(); i += 1) { - if (expectedElements.get(i) == null) { - assertThat(actualArray.isNullAt(i)).isTrue(); - continue; - } - - Object expected = expectedElements.get(i); - - assertEquals( - type, - logicalType, - expected, - ArrayData.createElementGetter(logicalType).getElementOrNull(actualArray, i)); - } - } - 
- private static void assertMapValues( - Types.MapType mapType, LogicalType type, Map expected, MapData actual) { - assertThat(actual.size()).as("map size should be equal").isEqualTo(expected.size()); - - ArrayData actualKeyArrayData = actual.keyArray(); - ArrayData actualValueArrayData = actual.valueArray(); - LogicalType actualKeyType = ((MapType) type).getKeyType(); - LogicalType actualValueType = ((MapType) type).getValueType(); - Type keyType = mapType.keyType(); - Type valueType = mapType.valueType(); - - ArrayData.ElementGetter keyGetter = ArrayData.createElementGetter(actualKeyType); - ArrayData.ElementGetter valueGetter = ArrayData.createElementGetter(actualValueType); - - for (Map.Entry entry : expected.entrySet()) { - Object matchedActualKey = null; - int matchedKeyIndex = 0; - for (int i = 0; i < actual.size(); i += 1) { - try { - Object key = keyGetter.getElementOrNull(actualKeyArrayData, i); - assertEquals(keyType, actualKeyType, entry.getKey(), key); - matchedActualKey = key; - matchedKeyIndex = i; - break; - } catch (AssertionError e) { - // not found - } - } - assertThat(matchedActualKey).as("Should have a matching key").isNotNull(); - final int valueIndex = matchedKeyIndex; - assertEquals( - valueType, - actualValueType, - entry.getValue(), - valueGetter.getElementOrNull(actualValueArrayData, valueIndex)); - } - } - - public static void assertEquals(ManifestFile expected, ManifestFile actual) { - if (expected == actual) { - return; - } - assertThat(expected).isNotNull(); - assertThat(actual).isNotNull(); - assertThat(actual.path()).as("Path must match").isEqualTo(expected.path()); - assertThat(actual.length()).as("Length must match").isEqualTo(expected.length()); - assertThat(actual.partitionSpecId()) - .as("Spec id must match") - .isEqualTo(expected.partitionSpecId()); - assertThat(actual.content()).as("ManifestContent must match").isEqualTo(expected.content()); - assertThat(actual.sequenceNumber()) - .as("SequenceNumber must match") - .isEqualTo(expected.sequenceNumber()); - assertThat(actual.minSequenceNumber()) - .as("MinSequenceNumber must match") - .isEqualTo(expected.minSequenceNumber()); - assertThat(actual.snapshotId()).as("Snapshot id must match").isEqualTo(expected.snapshotId()); - assertThat(actual.hasAddedFiles()) - .as("Added files flag must match") - .isEqualTo(expected.hasAddedFiles()); - assertThat(actual.addedFilesCount()) - .as("Added files count must match") - .isEqualTo(expected.addedFilesCount()); - assertThat(actual.addedRowsCount()) - .as("Added rows count must match") - .isEqualTo(expected.addedRowsCount()); - assertThat(actual.hasExistingFiles()) - .as("Existing files flag must match") - .isEqualTo(expected.hasExistingFiles()); - assertThat(actual.existingFilesCount()) - .as("Existing files count must match") - .isEqualTo(expected.existingFilesCount()); - assertThat(actual.existingRowsCount()) - .as("Existing rows count must match") - .isEqualTo(expected.existingRowsCount()); - assertThat(actual.hasDeletedFiles()) - .as("Deleted files flag must match") - .isEqualTo(expected.hasDeletedFiles()); - assertThat(actual.deletedFilesCount()) - .as("Deleted files count must match") - .isEqualTo(expected.deletedFilesCount()); - assertThat(actual.deletedRowsCount()) - .as("Deleted rows count must match") - .isEqualTo(expected.deletedRowsCount()); - - List expectedSummaries = expected.partitions(); - List actualSummaries = actual.partitions(); - assertThat(actualSummaries) - .as("PartitionFieldSummary size does not match") - 
.hasSameSizeAs(expectedSummaries); - for (int i = 0; i < expectedSummaries.size(); i++) { - assertThat(actualSummaries.get(i).containsNull()) - .as("Null flag in partition must match") - .isEqualTo(expectedSummaries.get(i).containsNull()); - assertThat(actualSummaries.get(i).containsNaN()) - .as("NaN flag in partition must match") - .isEqualTo(expectedSummaries.get(i).containsNaN()); - assertThat(actualSummaries.get(i).lowerBound()) - .as("Lower bounds in partition must match") - .isEqualTo(expectedSummaries.get(i).lowerBound()); - assertThat(actualSummaries.get(i).upperBound()) - .as("Upper bounds in partition must match") - .isEqualTo(expectedSummaries.get(i).upperBound()); - } - } - - public static void assertEquals(ContentFile expected, ContentFile actual) { - if (expected == actual) { - return; - } - assertThat(expected).isNotNull(); - assertThat(actual).isNotNull(); - assertThat(actual.specId()).as("SpecId").isEqualTo(expected.specId()); - assertThat(actual.content()).as("Content").isEqualTo(expected.content()); - assertThat(actual.path()).as("Path").isEqualTo(expected.path()); - assertThat(actual.format()).as("Format").isEqualTo(expected.format()); - assertThat(actual.partition().size()) - .as("Partition size") - .isEqualTo(expected.partition().size()); - for (int i = 0; i < expected.partition().size(); i++) { - assertThat(actual.partition().get(i, Object.class)) - .as("Partition data at index " + i) - .isEqualTo(expected.partition().get(i, Object.class)); - } - assertThat(actual.recordCount()).as("Record count").isEqualTo(expected.recordCount()); - assertThat(actual.fileSizeInBytes()) - .as("File size in bytes") - .isEqualTo(expected.fileSizeInBytes()); - assertThat(actual.columnSizes()).as("Column sizes").isEqualTo(expected.columnSizes()); - assertThat(actual.valueCounts()).as("Value counts").isEqualTo(expected.valueCounts()); - assertThat(actual.nullValueCounts()) - .as("Null value counts") - .isEqualTo(expected.nullValueCounts()); - assertThat(actual.lowerBounds()).as("Lower bounds").isEqualTo(expected.lowerBounds()); - assertThat(actual.upperBounds()).as("Upper bounds").isEqualTo(expected.upperBounds()); - assertThat(actual.keyMetadata()).as("Key metadata").isEqualTo(expected.keyMetadata()); - assertThat(actual.splitOffsets()).as("Split offsets").isEqualTo(expected.splitOffsets()); - assertThat(actual.equalityFieldIds()) - .as("Equality field id list") - .isEqualTo(expected.equalityFieldIds()); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestIcebergConnector.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestIcebergConnector.java deleted file mode 100644 index b709c0058f7d..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestIcebergConnector.java +++ /dev/null @@ -1,343 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. 
See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; - -import java.io.IOException; -import java.io.UncheckedIOException; -import java.nio.file.Files; -import java.util.Map; -import org.apache.flink.configuration.CoreOptions; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.table.api.EnvironmentSettings; -import org.apache.flink.table.api.TableEnvironment; -import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; -import org.apache.flink.table.catalog.Catalog; -import org.apache.flink.table.catalog.ObjectPath; -import org.apache.flink.types.Row; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.hive.metastore.HiveMetaStoreClient; -import org.apache.iceberg.CatalogProperties; -import org.apache.iceberg.Parameter; -import org.apache.iceberg.ParameterizedTestExtension; -import org.apache.iceberg.Parameters; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.thrift.TException; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.TestTemplate; -import org.junit.jupiter.api.extension.ExtendWith; - -@ExtendWith(ParameterizedTestExtension.class) -public class TestIcebergConnector extends TestBase { - - private static final String TABLE_NAME = "test_table"; - - @Parameter(index = 0) - private String catalogName; - - @Parameter(index = 1) - private Map properties; - - @Parameter(index = 2) - private boolean isStreaming; - - private volatile TableEnvironment tEnv; - - @Parameters(name = "catalogName = {0}, properties = {1}, isStreaming = {2}") - public static Iterable parameters() { - return Lists.newArrayList( - // Create iceberg table in the hadoop catalog and default database. - new Object[] { - "testhadoop", - ImmutableMap.of( - "connector", "iceberg", - "catalog-type", "hadoop"), - true - }, - new Object[] { - "testhadoop", - ImmutableMap.of( - "connector", "iceberg", - "catalog-type", "hadoop", - "catalog-table", "not_existing_table"), - true - }, - new Object[] { - "testhadoop", - ImmutableMap.of( - "connector", "iceberg", - "catalog-type", "hadoop"), - false - }, - // Create iceberg table in the hadoop catalog and not_existing_db. - new Object[] { - "testhadoop", - ImmutableMap.of( - "connector", "iceberg", - "catalog-type", "hadoop", - "catalog-database", "not_existing_db"), - true - }, - new Object[] { - "testhadoop", - ImmutableMap.of( - "connector", "iceberg", - "catalog-type", "hadoop", - "catalog-database", "not_existing_db", - "catalog-table", "not_existing_table"), - true - }, - new Object[] { - "testhadoop", - ImmutableMap.of( - "connector", "iceberg", - "catalog-type", "hadoop", - "catalog-database", "not_existing_db"), - false - }, - // Create iceberg table in the hive catalog and default database. 
- new Object[] { - "testhive", - ImmutableMap.of( - "connector", "iceberg", - "catalog-type", "hive"), - true - }, - new Object[] { - "testhive", - ImmutableMap.of( - "connector", "iceberg", - "catalog-type", "hive", - "catalog-table", "not_existing_table"), - true - }, - new Object[] { - "testhive", - ImmutableMap.of( - "connector", "iceberg", - "catalog-type", "hive"), - false - }, - // Create iceberg table in the hive catalog and not_existing_db. - new Object[] { - "testhive", - ImmutableMap.of( - "connector", "iceberg", - "catalog-type", "hive", - "catalog-database", "not_existing_db"), - true - }, - new Object[] { - "testhive", - ImmutableMap.of( - "connector", "iceberg", - "catalog-type", "hive", - "catalog-database", "not_existing_db", - "catalog-table", "not_existing_table"), - true - }, - new Object[] { - "testhive", - ImmutableMap.of( - "connector", "iceberg", - "catalog-type", "hive", - "catalog-database", "not_existing_db"), - false - }); - } - - @Override - protected TableEnvironment getTableEnv() { - if (tEnv == null) { - synchronized (this) { - if (tEnv == null) { - EnvironmentSettings.Builder settingsBuilder = EnvironmentSettings.newInstance(); - if (isStreaming) { - settingsBuilder.inStreamingMode(); - StreamExecutionEnvironment env = - StreamExecutionEnvironment.getExecutionEnvironment( - MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG); - env.enableCheckpointing(400); - env.setMaxParallelism(2); - env.setParallelism(2); - tEnv = StreamTableEnvironment.create(env, settingsBuilder.build()); - } else { - settingsBuilder.inBatchMode(); - tEnv = TableEnvironment.create(settingsBuilder.build()); - } - // Set only one parallelism. - tEnv.getConfig() - .getConfiguration() - .set(CoreOptions.DEFAULT_PARALLELISM, 1) - .set(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM, false); - } - } - } - return tEnv; - } - - @AfterEach - public void after() throws TException { - sql("DROP TABLE IF EXISTS %s", TABLE_NAME); - - // Clean the created orphan databases and tables from hive-metastore. - if (isHiveCatalog()) { - HiveMetaStoreClient metaStoreClient = new HiveMetaStoreClient(hiveConf); - try { - metaStoreClient.dropTable(databaseName(), tableName()); - if (!isDefaultDatabaseName()) { - try { - metaStoreClient.dropDatabase(databaseName()); - } catch (Exception ignored) { - // Ignore - } - } - } finally { - metaStoreClient.close(); - } - } - } - - private void testCreateConnectorTable() { - Map tableProps = createTableProps(); - - // Create table under the flink's current database. - sql("CREATE TABLE %s (id BIGINT, data STRING) WITH %s", TABLE_NAME, toWithClause(tableProps)); - sql("INSERT INTO %s VALUES (1, 'AAA'), (2, 'BBB'), (3, 'CCC')", TABLE_NAME); - assertThat(sql("SELECT * FROM %s", TABLE_NAME)) - .containsExactlyInAnyOrder(Row.of(1L, "AAA"), Row.of(2L, "BBB"), Row.of(3L, "CCC")); - - FlinkCatalogFactory factory = new FlinkCatalogFactory(); - Catalog flinkCatalog = factory.createCatalog(catalogName, tableProps, new Configuration()); - assertThat(flinkCatalog.databaseExists(databaseName())).isTrue(); - assertThat(flinkCatalog.tableExists(new ObjectPath(databaseName(), tableName()))).isTrue(); - - // Drop and create it again. 
- sql("DROP TABLE %s", TABLE_NAME); - sql("CREATE TABLE %s (id BIGINT, data STRING) WITH %s", TABLE_NAME, toWithClause(tableProps)); - assertThat(sql("SELECT * FROM %s", TABLE_NAME)) - .containsExactlyInAnyOrder(Row.of(1L, "AAA"), Row.of(2L, "BBB"), Row.of(3L, "CCC")); - } - - @TestTemplate - public void testCreateTableUnderDefaultDatabase() { - testCreateConnectorTable(); - } - - @TestTemplate - public void testCatalogDatabaseConflictWithFlinkDatabase() { - sql("CREATE DATABASE IF NOT EXISTS `%s`", databaseName()); - sql("USE `%s`", databaseName()); - - try { - testCreateConnectorTable(); - // Ensure that the table was created under the specific database. - assertThatThrownBy( - () -> sql("CREATE TABLE `default_catalog`.`%s`.`%s`", databaseName(), TABLE_NAME)) - .isInstanceOf(org.apache.flink.table.api.TableException.class) - .hasMessageStartingWith("Could not execute CreateTable in path"); - } finally { - sql("DROP TABLE IF EXISTS `%s`.`%s`", databaseName(), TABLE_NAME); - if (!isDefaultDatabaseName()) { - sql("DROP DATABASE `%s`", databaseName()); - } - } - } - - @TestTemplate - public void testConnectorTableInIcebergCatalog() { - // Create the catalog properties - Map catalogProps = Maps.newHashMap(); - catalogProps.put("type", "iceberg"); - if (isHiveCatalog()) { - catalogProps.put("catalog-type", "hive"); - catalogProps.put(CatalogProperties.URI, CatalogTestBase.getURI(hiveConf)); - } else { - catalogProps.put("catalog-type", "hadoop"); - } - catalogProps.put(CatalogProperties.WAREHOUSE_LOCATION, createWarehouse()); - - // Create the table properties - Map tableProps = createTableProps(); - - // Create a connector table in an iceberg catalog. - sql("CREATE CATALOG `test_catalog` WITH %s", toWithClause(catalogProps)); - try { - assertThatThrownBy( - () -> - sql( - "CREATE TABLE `test_catalog`.`%s`.`%s` (id BIGINT, data STRING) WITH %s", - FlinkCatalogFactory.DEFAULT_DATABASE_NAME, - TABLE_NAME, - toWithClause(tableProps))) - .cause() - .isInstanceOf(IllegalArgumentException.class) - .hasMessage( - "Cannot create the table with 'connector'='iceberg' table property in an iceberg catalog, " - + "Please create table with 'connector'='iceberg' property in a non-iceberg catalog or " - + "create table without 'connector'='iceberg' related properties in an iceberg table."); - } finally { - sql("DROP CATALOG IF EXISTS `test_catalog`"); - } - } - - private Map createTableProps() { - Map tableProps = Maps.newHashMap(properties); - tableProps.put("catalog-name", catalogName); - tableProps.put(CatalogProperties.WAREHOUSE_LOCATION, createWarehouse()); - if (isHiveCatalog()) { - tableProps.put(CatalogProperties.URI, CatalogTestBase.getURI(hiveConf)); - } - return tableProps; - } - - private boolean isHiveCatalog() { - return "testhive".equalsIgnoreCase(catalogName); - } - - private boolean isDefaultDatabaseName() { - return FlinkCatalogFactory.DEFAULT_DATABASE_NAME.equalsIgnoreCase(databaseName()); - } - - private String tableName() { - return properties.getOrDefault("catalog-table", TABLE_NAME); - } - - private String databaseName() { - return properties.getOrDefault("catalog-database", "default_database"); - } - - private String toWithClause(Map props) { - return CatalogTestBase.toWithClause(props); - } - - private String createWarehouse() { - try { - return String.format( - "file://%s", - Files.createTempDirectory(temporaryDirectory, "junit").toFile().getAbsolutePath()); - } catch (IOException e) { - throw new UncheckedIOException(e); - } - } -} diff --git 
a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestManifestFileSerialization.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestManifestFileSerialization.java deleted file mode 100644 index 8f1f129e183b..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestManifestFileSerialization.java +++ /dev/null @@ -1,173 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; -import java.io.File; -import java.io.IOException; -import java.io.ObjectInputStream; -import java.io.ObjectOutputStream; -import java.nio.ByteBuffer; -import java.nio.ByteOrder; -import java.nio.file.Path; -import org.apache.flink.api.common.ExecutionConfig; -import org.apache.flink.api.java.typeutils.runtime.kryo.KryoSerializer; -import org.apache.flink.core.memory.DataInputDeserializer; -import org.apache.flink.core.memory.DataOutputSerializer; -import org.apache.hadoop.conf.Configuration; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.DataFiles; -import org.apache.iceberg.GenericManifestFile; -import org.apache.iceberg.ManifestFile; -import org.apache.iceberg.ManifestFiles; -import org.apache.iceberg.ManifestWriter; -import org.apache.iceberg.Metrics; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.hadoop.HadoopFileIO; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.io.OutputFile; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.types.Types; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.io.TempDir; - -public class TestManifestFileSerialization { - - private static final Schema SCHEMA = - new Schema( - required(1, "id", Types.LongType.get()), - optional(2, "data", Types.StringType.get()), - required(3, "date", Types.StringType.get()), - required(4, "double", Types.DoubleType.get())); - - private static final PartitionSpec SPEC = - PartitionSpec.builderFor(SCHEMA).identity("double").build(); - - private static final DataFile FILE_A = - DataFiles.builder(SPEC) - .withPath("/path/to/data-1.parquet") - .withFileSizeInBytes(0) - .withPartition(org.apache.iceberg.TestHelpers.Row.of(1D)) - .withPartitionPath("double=1") - .withMetrics( - new Metrics( - 5L, - null, // no column sizes - ImmutableMap.of(1, 5L, 2, 3L), // value count - ImmutableMap.of(1, 0L, 2, 2L), // null count - ImmutableMap.of(), // nan count - ImmutableMap.of(1, longToBuffer(0L)), // 
lower bounds - ImmutableMap.of(1, longToBuffer(4L)) // upper bounds - )) - .build(); - - private static final DataFile FILE_B = - DataFiles.builder(SPEC) - .withPath("/path/to/data-2.parquet") - .withFileSizeInBytes(0) - .withPartition(org.apache.iceberg.TestHelpers.Row.of(Double.NaN)) - .withPartitionPath("double=NaN") - .withMetrics( - new Metrics( - 1L, - null, // no column sizes - ImmutableMap.of(1, 1L, 4, 1L), // value count - ImmutableMap.of(1, 0L, 2, 0L), // null count - ImmutableMap.of(4, 1L), // nan count - ImmutableMap.of(1, longToBuffer(0L)), // lower bounds - ImmutableMap.of(1, longToBuffer(1L)) // upper bounds - )) - .build(); - - private static final FileIO FILE_IO = new HadoopFileIO(new Configuration()); - - @TempDir private Path temp; - - @Test - public void testKryoSerialization() throws IOException { - KryoSerializer kryo = - new KryoSerializer<>(ManifestFile.class, new ExecutionConfig()); - - DataOutputSerializer outputView = new DataOutputSerializer(1024); - - ManifestFile manifest = writeManifest(FILE_A, FILE_B); - - kryo.serialize(manifest, outputView); - kryo.serialize(manifest.copy(), outputView); - kryo.serialize(GenericManifestFile.copyOf(manifest).build(), outputView); - - DataInputDeserializer inputView = new DataInputDeserializer(outputView.getCopyOfBuffer()); - ManifestFile m1 = kryo.deserialize(inputView); - ManifestFile m2 = kryo.deserialize(inputView); - ManifestFile m3 = kryo.deserialize(inputView); - - TestHelpers.assertEquals(manifest, m1); - TestHelpers.assertEquals(manifest, m2); - TestHelpers.assertEquals(manifest, m3); - } - - @Test - public void testJavaSerialization() throws Exception { - ByteArrayOutputStream bytes = new ByteArrayOutputStream(); - - ManifestFile manifest = writeManifest(FILE_A, FILE_B); - - try (ObjectOutputStream out = new ObjectOutputStream(bytes)) { - out.writeObject(manifest); - out.writeObject(manifest.copy()); - out.writeObject(GenericManifestFile.copyOf(manifest).build()); - } - - try (ObjectInputStream in = - new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) { - for (int i = 0; i < 3; i += 1) { - Object obj = in.readObject(); - assertThat(obj).as("Should be a ManifestFile").isInstanceOf(ManifestFile.class); - TestHelpers.assertEquals(manifest, (ManifestFile) obj); - } - } - } - - private ManifestFile writeManifest(DataFile... files) throws IOException { - File manifestFile = File.createTempFile("input", "m0.avro", temp.toFile()); - assertThat(manifestFile.delete()).isTrue(); - OutputFile outputFile = FILE_IO.newOutputFile(manifestFile.getCanonicalPath()); - - ManifestWriter writer = ManifestFiles.write(SPEC, outputFile); - try { - for (DataFile file : files) { - writer.add(file); - } - } finally { - writer.close(); - } - - return writer.toManifestFile(); - } - - private static ByteBuffer longToBuffer(long value) { - return ByteBuffer.allocate(8).order(ByteOrder.LITTLE_ENDIAN).putLong(0, value); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestRowDataWrapper.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestRowDataWrapper.java deleted file mode 100644 index 0af49e9e2365..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestRowDataWrapper.java +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.util.Iterator; -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.RecordWrapperTest; -import org.apache.iceberg.Schema; -import org.apache.iceberg.StructLike; -import org.apache.iceberg.data.InternalRecordWrapper; -import org.apache.iceberg.data.RandomGenericData; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.data.RandomRowData; -import org.apache.iceberg.util.StructLikeWrapper; - -public class TestRowDataWrapper extends RecordWrapperTest { - - /** - * Flink's time type has been truncated to millis seconds, so we need a customized assert method - * to check the values. - */ - @Override - public void testTime() { - generateAndValidate( - new Schema(TIME.fields()), - (message, expectedWrapper, actualWrapper) -> { - for (int pos = 0; pos < TIME.fields().size(); pos++) { - Object expected = expectedWrapper.get().get(pos, Object.class); - Object actual = actualWrapper.get().get(pos, Object.class); - if (expected == actual) { - return; - } - - assertThat(actual).isNotNull(); - assertThat(expected).isNotNull(); - - int expectedMilliseconds = (int) ((long) expected / 1000_000); - int actualMilliseconds = (int) ((long) actual / 1000_000); - assertThat(actualMilliseconds).as(message).isEqualTo(expectedMilliseconds); - } - }); - } - - @Override - protected void generateAndValidate(Schema schema, RecordWrapperTest.AssertMethod assertMethod) { - int numRecords = 100; - Iterable recordList = RandomGenericData.generate(schema, numRecords, 101L); - Iterable rowDataList = RandomRowData.generate(schema, numRecords, 101L); - - InternalRecordWrapper recordWrapper = new InternalRecordWrapper(schema.asStruct()); - RowDataWrapper rowDataWrapper = - new RowDataWrapper(FlinkSchemaUtil.convert(schema), schema.asStruct()); - - Iterator actual = recordList.iterator(); - Iterator expected = rowDataList.iterator(); - - StructLikeWrapper actualWrapper = StructLikeWrapper.forType(schema.asStruct()); - StructLikeWrapper expectedWrapper = StructLikeWrapper.forType(schema.asStruct()); - for (int i = 0; i < numRecords; i++) { - assertThat(actual).hasNext(); - assertThat(expected).hasNext(); - - StructLike recordStructLike = recordWrapper.wrap(actual.next()); - StructLike rowDataStructLike = rowDataWrapper.wrap(expected.next()); - - assertMethod.assertEquals( - "Should have expected StructLike values", - expectedWrapper.set(rowDataStructLike), - actualWrapper.set(recordStructLike)); - } - - assertThat(actual).isExhausted(); - assertThat(expected).isExhausted(); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestTableLoader.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestTableLoader.java deleted file mode 100644 index a7c58e551112..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestTableLoader.java +++ 
/dev/null @@ -1,57 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink; - -import java.io.File; -import org.apache.iceberg.Table; -import org.apache.iceberg.TestTables; - -public class TestTableLoader implements TableLoader { - private final File dir; - - public static TableLoader of(String dir) { - return new TestTableLoader(dir); - } - - public TestTableLoader(String dir) { - this.dir = new File(dir); - } - - @Override - public void open() {} - - @Override - public boolean isOpen() { - return true; - } - - @Override - public Table loadTable() { - return TestTables.load(dir, "test"); - } - - @Override - @SuppressWarnings({"checkstyle:NoClone", "checkstyle:SuperClone"}) - public TableLoader clone() { - return new TestTableLoader(dir.getAbsolutePath()); - } - - @Override - public void close() {} -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestTableSerialization.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestTableSerialization.java deleted file mode 100644 index 7f0e7acaa822..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/TestTableSerialization.java +++ /dev/null @@ -1,110 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
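TestTableLoader above is about the smallest possible implementation of the TableLoader contract: open(), loadTable(), clone(), and close(). A hedged sketch of how a loader is consumed in practice, assuming the built-in fromHadoopTable factory and a made-up warehouse location:

import org.apache.iceberg.Table;
import org.apache.iceberg.flink.TableLoader;

public class TableLoaderUsageSketch {
  public static void main(String[] args) throws Exception {
    // TableLoader is Closeable, so try-with-resources releases its underlying handles.
    // The location below is a hypothetical placeholder.
    try (TableLoader loader = TableLoader.fromHadoopTable("file:///tmp/warehouse/db/tbl")) {
      loader.open();                    // acquire resources before the first load
      Table table = loader.loadTable(); // resolve the Iceberg table
      System.out.println(table.name());
    }
  }
}

The clone() override mirrors how runtime code may hand each task its own loader instance, which is why the test implementation above clones itself with the same directory.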
- */ -package org.apache.iceberg.flink; - -import static org.apache.iceberg.flink.TestHelpers.roundTripKryoSerialize; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.File; -import java.io.IOException; -import java.nio.file.Path; -import java.util.Map; -import org.apache.iceberg.HasTableOperations; -import org.apache.iceberg.MetadataTableType; -import org.apache.iceberg.MetadataTableUtils; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.SerializableTable; -import org.apache.iceberg.SortOrder; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableOperations; -import org.apache.iceberg.TestHelpers; -import org.apache.iceberg.Transaction; -import org.apache.iceberg.hadoop.HadoopTables; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.types.Types; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.io.TempDir; - -public class TestTableSerialization { - private static final HadoopTables TABLES = new HadoopTables(); - - private static final Schema SCHEMA = - new Schema( - required(1, "id", Types.LongType.get()), - optional(2, "data", Types.StringType.get()), - required(3, "date", Types.StringType.get()), - optional(4, "double", Types.DoubleType.get())); - - private static final PartitionSpec SPEC = - PartitionSpec.builderFor(SCHEMA).identity("date").build(); - - private static final SortOrder SORT_ORDER = SortOrder.builderFor(SCHEMA).asc("id").build(); - - @TempDir private Path temp; - private Table table; - - @BeforeEach - public void initTable() throws IOException { - Map props = ImmutableMap.of("k1", "v1", "k2", "v2"); - - File tableLocation = File.createTempFile("junit", null, temp.toFile()); - assertThat(tableLocation.delete()).isTrue(); - - this.table = TABLES.create(SCHEMA, SPEC, SORT_ORDER, props, tableLocation.toString()); - } - - @Test - public void testSerializableTableKryoSerialization() throws IOException { - SerializableTable serializableTable = (SerializableTable) SerializableTable.copyOf(table); - TestHelpers.assertSerializedAndLoadedMetadata( - table, roundTripKryoSerialize(SerializableTable.class, serializableTable)); - } - - @Test - public void testSerializableMetadataTableKryoSerialization() throws IOException { - for (MetadataTableType type : MetadataTableType.values()) { - TableOperations ops = ((HasTableOperations) table).operations(); - Table metadataTable = - MetadataTableUtils.createMetadataTableInstance(ops, table.name(), "meta", type); - SerializableTable serializableMetadataTable = - (SerializableTable) SerializableTable.copyOf(metadataTable); - - TestHelpers.assertSerializedAndLoadedMetadata( - metadataTable, - roundTripKryoSerialize(SerializableTable.class, serializableMetadataTable)); - } - } - - @Test - public void testSerializableTransactionTableKryoSerialization() throws IOException { - Transaction txn = table.newTransaction(); - - txn.updateProperties().set("k1", "v1").commit(); - - Table txnTable = txn.table(); - SerializableTable serializableTxnTable = (SerializableTable) SerializableTable.copyOf(txnTable); - - TestHelpers.assertSerializedMetadata( - txnTable, roundTripKryoSerialize(SerializableTable.class, serializableTxnTable)); - } -} diff --git 
a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/actions/TestRewriteDataFilesAction.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/actions/TestRewriteDataFilesAction.java deleted file mode 100644 index 800cce96edac..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/actions/TestRewriteDataFilesAction.java +++ /dev/null @@ -1,481 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.actions; - -import static org.apache.iceberg.flink.SimpleDataUtil.RECORD; -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; - -import java.io.File; -import java.io.IOException; -import java.nio.file.Path; -import java.util.Collection; -import java.util.List; -import java.util.Set; -import java.util.UUID; -import java.util.stream.Collectors; -import org.apache.commons.lang3.StringUtils; -import org.apache.flink.configuration.CoreOptions; -import org.apache.flink.table.api.TableEnvironment; -import org.apache.flink.types.Row; -import org.apache.iceberg.ContentFile; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.DataFiles; -import org.apache.iceberg.DeleteFile; -import org.apache.iceberg.FileContent; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.FileScanTask; -import org.apache.iceberg.Files; -import org.apache.iceberg.Parameter; -import org.apache.iceberg.Parameters; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.actions.RewriteDataFilesActionResult; -import org.apache.iceberg.catalog.Namespace; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.data.GenericAppenderFactory; -import org.apache.iceberg.data.GenericRecord; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.exceptions.ValidationException; -import org.apache.iceberg.expressions.Expressions; -import org.apache.iceberg.flink.CatalogTestBase; -import org.apache.iceberg.flink.SimpleDataUtil; -import org.apache.iceberg.io.CloseableIterable; -import org.apache.iceberg.io.FileAppender; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.Iterables; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Types; -import org.apache.iceberg.util.Pair; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.TestTemplate; -import org.junit.jupiter.api.io.TempDir; - -public class TestRewriteDataFilesAction extends CatalogTestBase { - - private static final String TABLE_NAME_UNPARTITIONED = "test_table_unpartitioned"; - private 
static final String TABLE_NAME_PARTITIONED = "test_table_partitioned"; - private static final String TABLE_NAME_WITH_PK = "test_table_with_pk"; - - @Parameter(index = 2) - private FileFormat format; - - private Table icebergTableUnPartitioned; - private Table icebergTablePartitioned; - private Table icebergTableWithPk; - - @Override - protected TableEnvironment getTableEnv() { - super.getTableEnv().getConfig().getConfiguration().set(CoreOptions.DEFAULT_PARALLELISM, 1); - return super.getTableEnv(); - } - - @Parameters(name = "catalogName={0}, baseNamespace={1}, format={2}") - public static List parameters() { - List parameters = Lists.newArrayList(); - for (FileFormat format : - new FileFormat[] {FileFormat.AVRO, FileFormat.ORC, FileFormat.PARQUET}) { - for (Object[] catalogParams : CatalogTestBase.parameters()) { - String catalogName = (String) catalogParams[0]; - Namespace baseNamespace = (Namespace) catalogParams[1]; - parameters.add(new Object[] {catalogName, baseNamespace, format}); - } - } - return parameters; - } - - private @TempDir Path temp; - - @Override - @BeforeEach - public void before() { - super.before(); - sql("CREATE DATABASE %s", flinkDatabase); - sql("USE CATALOG %s", catalogName); - sql("USE %s", DATABASE); - sql( - "CREATE TABLE %s (id int, data varchar) with ('write.format.default'='%s')", - TABLE_NAME_UNPARTITIONED, format.name()); - icebergTableUnPartitioned = - validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME_UNPARTITIONED)); - - sql( - "CREATE TABLE %s (id int, data varchar,spec varchar) " - + " PARTITIONED BY (data,spec) with ('write.format.default'='%s')", - TABLE_NAME_PARTITIONED, format.name()); - icebergTablePartitioned = - validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME_PARTITIONED)); - - sql( - "CREATE TABLE %s (id int, data varchar, PRIMARY KEY(`id`) NOT ENFORCED) with ('write.format.default'='%s', 'format-version'='2')", - TABLE_NAME_WITH_PK, format.name()); - icebergTableWithPk = - validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME_WITH_PK)); - } - - @Override - @AfterEach - public void clean() { - sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, TABLE_NAME_UNPARTITIONED); - sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, TABLE_NAME_PARTITIONED); - sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, TABLE_NAME_WITH_PK); - sql("DROP DATABASE IF EXISTS %s", flinkDatabase); - super.clean(); - } - - @TestTemplate - public void testRewriteDataFilesEmptyTable() throws Exception { - assertThat(icebergTableUnPartitioned.currentSnapshot()).isNull(); - Actions.forTable(icebergTableUnPartitioned).rewriteDataFiles().execute(); - assertThat(icebergTableUnPartitioned.currentSnapshot()).isNull(); - } - - @TestTemplate - public void testRewriteDataFilesUnpartitionedTable() throws Exception { - sql("INSERT INTO %s SELECT 1, 'hello'", TABLE_NAME_UNPARTITIONED); - sql("INSERT INTO %s SELECT 2, 'world'", TABLE_NAME_UNPARTITIONED); - - icebergTableUnPartitioned.refresh(); - - CloseableIterable tasks = icebergTableUnPartitioned.newScan().planFiles(); - List dataFiles = - Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); - assertThat(dataFiles).hasSize(2); - RewriteDataFilesActionResult result = - Actions.forTable(icebergTableUnPartitioned).rewriteDataFiles().execute(); - - assertThat(result.deletedDataFiles()).hasSize(2); - assertThat(result.addedDataFiles()).hasSize(1); - - icebergTableUnPartitioned.refresh(); - - CloseableIterable tasks1 = 
icebergTableUnPartitioned.newScan().planFiles(); - List dataFiles1 = - Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file)); - assertThat(dataFiles1).hasSize(1); - // Assert the table records as expected. - SimpleDataUtil.assertTableRecords( - icebergTableUnPartitioned, - Lists.newArrayList( - SimpleDataUtil.createRecord(1, "hello"), SimpleDataUtil.createRecord(2, "world"))); - } - - @TestTemplate - public void testRewriteDataFilesPartitionedTable() throws Exception { - sql("INSERT INTO %s SELECT 1, 'hello' ,'a'", TABLE_NAME_PARTITIONED); - sql("INSERT INTO %s SELECT 2, 'hello' ,'a'", TABLE_NAME_PARTITIONED); - sql("INSERT INTO %s SELECT 3, 'world' ,'b'", TABLE_NAME_PARTITIONED); - sql("INSERT INTO %s SELECT 4, 'world' ,'b'", TABLE_NAME_PARTITIONED); - - icebergTablePartitioned.refresh(); - - CloseableIterable tasks = icebergTablePartitioned.newScan().planFiles(); - List dataFiles = - Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); - assertThat(dataFiles).hasSize(4); - RewriteDataFilesActionResult result = - Actions.forTable(icebergTablePartitioned).rewriteDataFiles().execute(); - - assertThat(result.deletedDataFiles()).hasSize(4); - assertThat(result.addedDataFiles()).hasSize(2); - - icebergTablePartitioned.refresh(); - - CloseableIterable tasks1 = icebergTablePartitioned.newScan().planFiles(); - List dataFiles1 = - Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file)); - assertThat(dataFiles1).hasSize(2); - // Assert the table records as expected. - Schema schema = - new Schema( - Types.NestedField.optional(1, "id", Types.IntegerType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get()), - Types.NestedField.optional(3, "spec", Types.StringType.get())); - - Record record = GenericRecord.create(schema); - SimpleDataUtil.assertTableRecords( - icebergTablePartitioned, - Lists.newArrayList( - record.copy("id", 1, "data", "hello", "spec", "a"), - record.copy("id", 2, "data", "hello", "spec", "a"), - record.copy("id", 3, "data", "world", "spec", "b"), - record.copy("id", 4, "data", "world", "spec", "b"))); - } - - @TestTemplate - public void testRewriteDataFilesWithFilter() throws Exception { - sql("INSERT INTO %s SELECT 1, 'hello' ,'a'", TABLE_NAME_PARTITIONED); - sql("INSERT INTO %s SELECT 2, 'hello' ,'a'", TABLE_NAME_PARTITIONED); - sql("INSERT INTO %s SELECT 3, 'world' ,'a'", TABLE_NAME_PARTITIONED); - sql("INSERT INTO %s SELECT 4, 'world' ,'b'", TABLE_NAME_PARTITIONED); - sql("INSERT INTO %s SELECT 5, 'world' ,'b'", TABLE_NAME_PARTITIONED); - - icebergTablePartitioned.refresh(); - - CloseableIterable tasks = icebergTablePartitioned.newScan().planFiles(); - List dataFiles = - Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); - assertThat(dataFiles).hasSize(5); - RewriteDataFilesActionResult result = - Actions.forTable(icebergTablePartitioned) - .rewriteDataFiles() - .filter(Expressions.equal("spec", "a")) - .filter(Expressions.startsWith("data", "he")) - .execute(); - assertThat(result.deletedDataFiles()).hasSize(2); - assertThat(result.addedDataFiles()).hasSize(1); - - icebergTablePartitioned.refresh(); - - CloseableIterable tasks1 = icebergTablePartitioned.newScan().planFiles(); - List dataFiles1 = - Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file)); - assertThat(dataFiles1).hasSize(4); - // Assert the table records as expected. 
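Stripped of the test scaffolding, the rewrite call exercised in this test reduces to a small builder chain; the sketch below (where table stands in for any loaded org.apache.iceberg.Table) compacts only the files matching both filters, the same combination asserted above:

import org.apache.iceberg.Table;
import org.apache.iceberg.actions.RewriteDataFilesActionResult;
import org.apache.iceberg.expressions.Expressions;
import org.apache.iceberg.flink.actions.Actions;

class RewriteSketch {
  // Compact only data files in partition spec='a' whose 'data' column starts with "he".
  static RewriteDataFilesActionResult compactFiltered(Table table) {
    return Actions.forTable(table)
        .rewriteDataFiles()
        .filter(Expressions.equal("spec", "a"))
        .filter(Expressions.startsWith("data", "he"))
        .execute();
  }
}

The result object exposes deletedDataFiles() and addedDataFiles(), which is what the assertions in these tests count.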
- Schema schema = - new Schema( - Types.NestedField.optional(1, "id", Types.IntegerType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get()), - Types.NestedField.optional(3, "spec", Types.StringType.get())); - - Record record = GenericRecord.create(schema); - SimpleDataUtil.assertTableRecords( - icebergTablePartitioned, - Lists.newArrayList( - record.copy("id", 1, "data", "hello", "spec", "a"), - record.copy("id", 2, "data", "hello", "spec", "a"), - record.copy("id", 3, "data", "world", "spec", "a"), - record.copy("id", 4, "data", "world", "spec", "b"), - record.copy("id", 5, "data", "world", "spec", "b"))); - } - - @TestTemplate - public void testRewriteLargeTableHasResiduals() throws IOException { - // all records belong to the same partition - List records1 = Lists.newArrayList(); - List records2 = Lists.newArrayList(); - List expected = Lists.newArrayList(); - for (int i = 0; i < 100; i++) { - int id = i; - String data = String.valueOf(i % 3); - if (i % 2 == 0) { - records1.add("(" + id + ",'" + data + "')"); - } else { - records2.add("(" + id + ",'" + data + "')"); - } - Record record = RECORD.copy(); - record.setField("id", id); - record.setField("data", data); - expected.add(record); - } - - sql("INSERT INTO %s values " + StringUtils.join(records1, ","), TABLE_NAME_UNPARTITIONED); - sql("INSERT INTO %s values " + StringUtils.join(records2, ","), TABLE_NAME_UNPARTITIONED); - - icebergTableUnPartitioned.refresh(); - - CloseableIterable tasks = - icebergTableUnPartitioned - .newScan() - .ignoreResiduals() - .filter(Expressions.equal("data", "0")) - .planFiles(); - for (FileScanTask task : tasks) { - assertThat(task.residual()) - .as("Residuals must be ignored") - .isEqualTo(Expressions.alwaysTrue()); - } - List dataFiles = - Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); - assertThat(dataFiles).hasSize(2); - Actions actions = Actions.forTable(icebergTableUnPartitioned); - - RewriteDataFilesActionResult result = - actions.rewriteDataFiles().filter(Expressions.equal("data", "0")).execute(); - assertThat(result.deletedDataFiles()).hasSize(2); - assertThat(result.addedDataFiles()).hasSize(1); - // Assert the table records as expected. - SimpleDataUtil.assertTableRecords(icebergTableUnPartitioned, expected); - } - - /** - * a test case to test avoid repeate compress - * - *

    If a data file cannot be combined into a CombinedScanTask with other data files, that - * CombinedScanTask contains only a single file, so we remove such CombinedScanTasks to avoid - * compacting the same file repeatedly. - - *

    In this test case,we generated 3 data files and set targetSizeInBytes greater than the - * largest file size so that it cannot be combined a CombinedScanTask with other datafiles. The - * datafile with the largest file size will not be compressed. - * - * @throws IOException IOException - */ - @TestTemplate - public void testRewriteAvoidRepeateCompress() throws IOException { - List expected = Lists.newArrayList(); - Schema schema = icebergTableUnPartitioned.schema(); - GenericAppenderFactory genericAppenderFactory = new GenericAppenderFactory(schema); - File file = File.createTempFile("junit", null, temp.toFile()); - int count = 0; - try (FileAppender fileAppender = - genericAppenderFactory.newAppender(Files.localOutput(file), format)) { - long filesize = 20000; - for (; fileAppender.length() < filesize; count++) { - Record record = SimpleDataUtil.createRecord(count, UUID.randomUUID().toString()); - fileAppender.add(record); - expected.add(record); - } - } - - DataFile dataFile = - DataFiles.builder(icebergTableUnPartitioned.spec()) - .withPath(file.getAbsolutePath()) - .withFileSizeInBytes(file.length()) - .withFormat(format) - .withRecordCount(count) - .build(); - - icebergTableUnPartitioned.newAppend().appendFile(dataFile).commit(); - - sql("INSERT INTO %s SELECT 1,'a' ", TABLE_NAME_UNPARTITIONED); - sql("INSERT INTO %s SELECT 2,'b' ", TABLE_NAME_UNPARTITIONED); - - icebergTableUnPartitioned.refresh(); - - CloseableIterable tasks = icebergTableUnPartitioned.newScan().planFiles(); - List dataFiles = - Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); - assertThat(dataFiles).hasSize(3); - Actions actions = Actions.forTable(icebergTableUnPartitioned); - - long targetSizeInBytes = file.length() + 10; - RewriteDataFilesActionResult result = - actions - .rewriteDataFiles() - .targetSizeInBytes(targetSizeInBytes) - .splitOpenFileCost(1) - .execute(); - assertThat(result.deletedDataFiles()).hasSize(2); - assertThat(result.addedDataFiles()).hasSize(1); - icebergTableUnPartitioned.refresh(); - - CloseableIterable tasks1 = icebergTableUnPartitioned.newScan().planFiles(); - List dataFilesRewrote = - Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file)); - assertThat(dataFilesRewrote).hasSize(2); - // the biggest file do not be rewrote - List rewroteDataFileNames = - dataFilesRewrote.stream().map(ContentFile::path).collect(Collectors.toList()); - assertThat(rewroteDataFileNames).contains(file.getAbsolutePath()); - - // Assert the table records as expected. 
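To make the javadoc above concrete with the numbers used in this test (a rough reading of the intent, not the planner's exact cost accounting): the pre-built data file is roughly 20 000 bytes and targetSizeInBytes is set to its length plus 10, with splitOpenFileCost lowered to 1. The two rows added through SQL each land in a tiny file, so those two files fit together into one CombinedScanTask and are rewritten into a single new file, while the large file fills a task by itself; a task containing only one file is dropped from the rewrite, which is why its original path still appears among the table's files, as the assertion just above checks.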
- expected.add(SimpleDataUtil.createRecord(1, "a")); - expected.add(SimpleDataUtil.createRecord(2, "b")); - SimpleDataUtil.assertTableRecords(icebergTableUnPartitioned, expected); - } - - @TestTemplate - public void testRewriteNoConflictWithEqualityDeletes() throws IOException { - // Add 2 data files - sql("INSERT INTO %s SELECT 1, 'hello'", TABLE_NAME_WITH_PK); - sql("INSERT INTO %s SELECT 2, 'world'", TABLE_NAME_WITH_PK); - - // Load 2 stale tables to pass to rewrite actions - // Since the first rewrite will refresh stale1, we need another stale2 for the second rewrite - Table stale1 = - validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME_WITH_PK)); - Table stale2 = - validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME_WITH_PK)); - - // Add 1 data file and 1 equality-delete file - sql("INSERT INTO %s /*+ OPTIONS('upsert-enabled'='true')*/ SELECT 1, 'hi'", TABLE_NAME_WITH_PK); - - icebergTableWithPk.refresh(); - assertThat(icebergTableWithPk.currentSnapshot().sequenceNumber()) - .as("The latest sequence number should be greater than that of the stale snapshot") - .isEqualTo(stale1.currentSnapshot().sequenceNumber() + 1); - CloseableIterable tasks = icebergTableWithPk.newScan().planFiles(); - List dataFiles = - Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); - Set deleteFiles = - Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::deletes)).stream() - .flatMap(Collection::stream) - .collect(Collectors.toSet()); - assertThat(dataFiles).hasSize(3); - assertThat(deleteFiles).hasSize(1); - assertThat(Iterables.getOnlyElement(deleteFiles).content()) - .isEqualTo(FileContent.EQUALITY_DELETES); - shouldHaveDataAndFileSequenceNumbers( - TABLE_NAME_WITH_PK, - ImmutableList.of(Pair.of(1L, 1L), Pair.of(2L, 2L), Pair.of(3L, 3L), Pair.of(3L, 3L))); - - assertThatThrownBy( - () -> - Actions.forTable(stale1) - .rewriteDataFiles() - .useStartingSequenceNumber(false) - .execute(), - "Rewrite using new sequence number should fail") - .isInstanceOf(ValidationException.class); - - // Rewrite using the starting sequence number should succeed - RewriteDataFilesActionResult result = - Actions.forTable(stale2).rewriteDataFiles().useStartingSequenceNumber(true).execute(); - - // Should not rewrite files from the new commit - assertThat(result.deletedDataFiles()).hasSize(2); - assertThat(result.addedDataFiles()).hasSize(1); - // The 2 older files with file-sequence-number <= 2 should be rewritten into a new file. - // The new file is the one with file-sequence-number == 4. - // The new file should use rewrite's starting-sequence-number 2 as its data-sequence-number. - shouldHaveDataAndFileSequenceNumbers( - TABLE_NAME_WITH_PK, ImmutableList.of(Pair.of(3L, 3L), Pair.of(3L, 3L), Pair.of(2L, 4L))); - - // Assert the table records as expected. - SimpleDataUtil.assertTableRecords( - icebergTableWithPk, - Lists.newArrayList( - SimpleDataUtil.createRecord(1, "hi"), SimpleDataUtil.createRecord(2, "world"))); - } - - /** - * Assert that data files and delete files in the table should have expected data sequence numbers - * and file sequence numbers - * - * @param tableName table name - * @param expectedSequenceNumbers list of {@link Pair}'s. Each {@link Pair} contains - * (expectedDataSequenceNumber, expectedFileSequenceNumber) of a file. 
- */ - private void shouldHaveDataAndFileSequenceNumbers( - String tableName, List> expectedSequenceNumbers) { - // "status < 2" for added or existing entries - List liveEntries = sql("SELECT * FROM %s$entries WHERE status < 2", tableName); - - List> actualSequenceNumbers = - liveEntries.stream() - .map( - row -> - Pair.of( - row.getFieldAs("sequence_number"), row.getFieldAs("file_sequence_number"))) - .collect(Collectors.toList()); - assertThat(actualSequenceNumbers).hasSameElementsAs(expectedSequenceNumbers); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/data/RandomRowData.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/data/RandomRowData.java deleted file mode 100644 index cc58d9817ac6..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/data/RandomRowData.java +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.data; - -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.Schema; -import org.apache.iceberg.data.RandomGenericData; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.RowDataConverter; -import org.apache.iceberg.relocated.com.google.common.collect.Iterables; - -public class RandomRowData { - private RandomRowData() {} - - public static Iterable generate(Schema schema, int numRecords, long seed) { - return convert(schema, RandomGenericData.generate(schema, numRecords, seed)); - } - - public static Iterable convert(Schema schema, Iterable records) { - return Iterables.transform(records, record -> RowDataConverter.convert(schema, record)); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/data/RowDataToRowMapper.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/data/RowDataToRowMapper.java deleted file mode 100644 index 74b1da6007e6..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/data/RowDataToRowMapper.java +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. 
See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.data; - -import org.apache.flink.api.common.functions.RichMapFunction; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.conversion.DataStructureConverter; -import org.apache.flink.table.data.conversion.DataStructureConverters; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.table.types.utils.TypeConversions; -import org.apache.flink.types.Row; - -public class RowDataToRowMapper extends RichMapFunction { - - private final RowType rowType; - - private transient DataStructureConverter converter; - - public RowDataToRowMapper(RowType rowType) { - this.rowType = rowType; - } - - @Override - public void open(Configuration parameters) throws Exception { - this.converter = - DataStructureConverters.getConverter(TypeConversions.fromLogicalToDataType(rowType)); - } - - @Override - public Row map(RowData value) throws Exception { - return (Row) converter.toExternal(value); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkAvroReaderWriter.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkAvroReaderWriter.java deleted file mode 100644 index a1039d27d888..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkAvroReaderWriter.java +++ /dev/null @@ -1,185 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
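RowDataToRowMapper above is a small RichMapFunction that lazily builds a DataStructureConverter in open() and converts each internal RowData into an external Row. A hedged usage sketch (the incoming stream and Iceberg schema are placeholders) of how such a mapper is typically chained onto a source stream so assertions can work with plain Row values:

import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.table.data.RowData;
import org.apache.flink.table.types.logical.RowType;
import org.apache.flink.types.Row;
import org.apache.iceberg.Schema;
import org.apache.iceberg.flink.FlinkSchemaUtil;
import org.apache.iceberg.flink.data.RowDataToRowMapper;

class RowDataToRowSketch {
  // Convert a stream of Flink-internal RowData into external Row objects,
  // using the RowType derived from the Iceberg schema.
  static DataStream<Row> toRows(DataStream<RowData> rowDataStream, Schema icebergSchema) {
    RowType rowType = FlinkSchemaUtil.convert(icebergSchema);
    return rowDataStream.map(new RowDataToRowMapper(rowType));
  }
}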
- */ -package org.apache.iceberg.flink.data; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.File; -import java.io.IOException; -import java.math.BigDecimal; -import java.sql.Date; -import java.sql.Time; -import java.util.Iterator; -import java.util.List; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.Files; -import org.apache.iceberg.Schema; -import org.apache.iceberg.avro.Avro; -import org.apache.iceberg.data.DataTest; -import org.apache.iceberg.data.GenericRecord; -import org.apache.iceberg.data.RandomGenericData; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.data.avro.DataReader; -import org.apache.iceberg.data.avro.DataWriter; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.TestHelpers; -import org.apache.iceberg.io.CloseableIterable; -import org.apache.iceberg.io.FileAppender; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Types; -import org.apache.iceberg.util.DateTimeUtil; -import org.junit.jupiter.api.Test; - -public class TestFlinkAvroReaderWriter extends DataTest { - - private static final int NUM_RECORDS = 100; - - private static final Schema SCHEMA_NUM_TYPE = - new Schema( - Types.NestedField.optional(1, "id", Types.IntegerType.get()), - Types.NestedField.optional(2, "int", Types.IntegerType.get()), - Types.NestedField.optional(3, "float", Types.FloatType.get()), - Types.NestedField.optional(4, "double", Types.DoubleType.get()), - Types.NestedField.optional(5, "date", Types.DateType.get()), - Types.NestedField.optional(6, "time", Types.TimeType.get()), - Types.NestedField.optional(7, "timestamp", Types.TimestampType.withoutZone()), - Types.NestedField.optional(8, "bigint", Types.LongType.get()), - Types.NestedField.optional(9, "decimal", Types.DecimalType.of(4, 2))); - - @Override - protected void writeAndValidate(Schema schema) throws IOException { - List expectedRecords = RandomGenericData.generate(schema, NUM_RECORDS, 1991L); - writeAndValidate(schema, expectedRecords, NUM_RECORDS); - } - - private void writeAndValidate(Schema schema, List expectedRecords, int numRecord) - throws IOException { - RowType flinkSchema = FlinkSchemaUtil.convert(schema); - List expectedRows = Lists.newArrayList(RandomRowData.convert(schema, expectedRecords)); - - File recordsFile = File.createTempFile("junit", null, temp.toFile()); - assertThat(recordsFile.delete()).isTrue(); - - // Write the expected records into AVRO file, then read them into RowData and assert with the - // expected Record list. 
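The Avro, ORC and Parquet reader/writer tests in this patch all repeat the round-trip shape just described: write Iceberg generic Records with the corresponding data writer, then read the file back as Flink RowData and compare row by row. A condensed, hedged sketch of the Avro variant (assertions omitted; the caller is expected to close the returned iterable):

import java.io.File;
import java.io.IOException;
import java.util.List;
import org.apache.flink.table.data.RowData;
import org.apache.iceberg.Files;
import org.apache.iceberg.Schema;
import org.apache.iceberg.avro.Avro;
import org.apache.iceberg.data.Record;
import org.apache.iceberg.data.avro.DataWriter;
import org.apache.iceberg.flink.data.FlinkAvroReader;
import org.apache.iceberg.io.CloseableIterable;
import org.apache.iceberg.io.FileAppender;

class AvroRoundTripSketch {
  // Write generic Records to an Avro file, then open a reader that yields Flink RowData.
  static CloseableIterable<RowData> writeThenRead(Schema schema, List<Record> records, File file)
      throws IOException {
    try (FileAppender<Record> writer =
        Avro.write(Files.localOutput(file))
            .schema(schema)
            .createWriterFunc(DataWriter::create)
            .build()) {
      writer.addAll(records);
    }
    return Avro.read(Files.localInput(file))
        .project(schema)
        .createReaderFunc(FlinkAvroReader::new)
        .build();
  }
}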
- try (FileAppender writer = - Avro.write(Files.localOutput(recordsFile)) - .schema(schema) - .createWriterFunc(DataWriter::create) - .build()) { - writer.addAll(expectedRecords); - } - - try (CloseableIterable reader = - Avro.read(Files.localInput(recordsFile)) - .project(schema) - .createReaderFunc(FlinkAvroReader::new) - .build()) { - Iterator expected = expectedRecords.iterator(); - Iterator rows = reader.iterator(); - for (int i = 0; i < numRecord; i++) { - assertThat(rows).hasNext(); - TestHelpers.assertRowData(schema.asStruct(), flinkSchema, expected.next(), rows.next()); - } - assertThat(rows).isExhausted(); - } - - File rowDataFile = File.createTempFile("junit", null, temp.toFile()); - assertThat(rowDataFile.delete()).isTrue(); - - // Write the expected RowData into AVRO file, then read them into Record and assert with the - // expected RowData list. - try (FileAppender writer = - Avro.write(Files.localOutput(rowDataFile)) - .schema(schema) - .createWriterFunc(ignore -> new FlinkAvroWriter(flinkSchema)) - .build()) { - writer.addAll(expectedRows); - } - - try (CloseableIterable reader = - Avro.read(Files.localInput(rowDataFile)) - .project(schema) - .createReaderFunc(DataReader::create) - .build()) { - Iterator expected = expectedRows.iterator(); - Iterator records = reader.iterator(); - for (int i = 0; i < numRecord; i += 1) { - assertThat(records).hasNext(); - TestHelpers.assertRowData(schema.asStruct(), flinkSchema, records.next(), expected.next()); - } - assertThat(records).isExhausted(); - } - } - - private Record recordNumType( - int id, - int intV, - float floatV, - double doubleV, - long date, - long time, - long timestamp, - long bigint, - double decimal) { - Record record = GenericRecord.create(SCHEMA_NUM_TYPE); - record.setField("id", id); - record.setField("int", intV); - record.setField("float", floatV); - record.setField("double", doubleV); - record.setField( - "date", DateTimeUtil.dateFromDays((int) new Date(date).toLocalDate().toEpochDay())); - record.setField("time", new Time(time).toLocalTime()); - record.setField("timestamp", DateTimeUtil.timestampFromMicros(timestamp * 1000)); - record.setField("bigint", bigint); - record.setField("decimal", BigDecimal.valueOf(decimal)); - return record; - } - - @Test - public void testNumericTypes() throws IOException { - - List expected = - ImmutableList.of( - recordNumType( - 2, - Integer.MAX_VALUE, - Float.MAX_VALUE, - Double.MAX_VALUE, - Long.MAX_VALUE, - 1643811742000L, - 1643811742000L, - 1643811742000L, - 10.24d), - recordNumType( - 2, - Integer.MIN_VALUE, - Float.MIN_VALUE, - Double.MIN_VALUE, - Long.MIN_VALUE, - 1643811742000L, - 1643811742000L, - 1643811742000L, - 10.24d)); - - writeAndValidate(SCHEMA_NUM_TYPE, expected, 2); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkOrcReaderWriter.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkOrcReaderWriter.java deleted file mode 100644 index 91ee017238ac..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkOrcReaderWriter.java +++ /dev/null @@ -1,107 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.data; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.File; -import java.io.IOException; -import java.util.Iterator; -import java.util.List; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.Files; -import org.apache.iceberg.Schema; -import org.apache.iceberg.data.DataTest; -import org.apache.iceberg.data.RandomGenericData; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.data.orc.GenericOrcReader; -import org.apache.iceberg.data.orc.GenericOrcWriter; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.TestHelpers; -import org.apache.iceberg.io.CloseableIterable; -import org.apache.iceberg.io.FileAppender; -import org.apache.iceberg.orc.ORC; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; - -public class TestFlinkOrcReaderWriter extends DataTest { - private static final int NUM_RECORDS = 100; - - @Override - protected void writeAndValidate(Schema schema) throws IOException { - RowType flinkSchema = FlinkSchemaUtil.convert(schema); - List expectedRecords = RandomGenericData.generate(schema, NUM_RECORDS, 1990L); - List expectedRows = Lists.newArrayList(RandomRowData.convert(schema, expectedRecords)); - - File recordsFile = File.createTempFile("junit", null, temp.toFile()); - assertThat(recordsFile.delete()).isTrue(); - - // Write the expected records into ORC file, then read them into RowData and assert with the - // expected Record list. - try (FileAppender writer = - ORC.write(Files.localOutput(recordsFile)) - .schema(schema) - .createWriterFunc(GenericOrcWriter::buildWriter) - .build()) { - writer.addAll(expectedRecords); - } - - try (CloseableIterable reader = - ORC.read(Files.localInput(recordsFile)) - .project(schema) - .createReaderFunc(type -> new FlinkOrcReader(schema, type)) - .build()) { - Iterator expected = expectedRecords.iterator(); - Iterator rows = reader.iterator(); - for (int i = 0; i < NUM_RECORDS; i++) { - assertThat(rows.hasNext()).isTrue(); - TestHelpers.assertRowData(schema.asStruct(), flinkSchema, expected.next(), rows.next()); - } - assertThat(rows).isExhausted(); - } - - File rowDataFile = File.createTempFile("junit", null, temp.toFile()); - assertThat(rowDataFile.delete()).isTrue(); - - // Write the expected RowData into ORC file, then read them into Record and assert with the - // expected RowData list. 
- RowType rowType = FlinkSchemaUtil.convert(schema); - try (FileAppender writer = - ORC.write(Files.localOutput(rowDataFile)) - .schema(schema) - .createWriterFunc((iSchema, typeDesc) -> FlinkOrcWriter.buildWriter(rowType, iSchema)) - .build()) { - writer.addAll(expectedRows); - } - - try (CloseableIterable reader = - ORC.read(Files.localInput(rowDataFile)) - .project(schema) - .createReaderFunc(type -> GenericOrcReader.buildReader(schema, type)) - .build()) { - Iterator expected = expectedRows.iterator(); - Iterator records = reader.iterator(); - for (int i = 0; i < NUM_RECORDS; i += 1) { - assertThat(records).hasNext(); - TestHelpers.assertRowData(schema.asStruct(), flinkSchema, records.next(), expected.next()); - } - assertThat(records).isExhausted(); - } - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetReader.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetReader.java deleted file mode 100644 index 4cfb24f62921..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetReader.java +++ /dev/null @@ -1,239 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.data; - -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.parquet.schema.Types.primitive; -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.File; -import java.io.IOException; -import java.nio.ByteBuffer; -import java.util.Iterator; -import java.util.List; -import org.apache.avro.generic.GenericData; -import org.apache.avro.generic.GenericRecord; -import org.apache.avro.generic.GenericRecordBuilder; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.LogicalType; -import org.apache.hadoop.fs.Path; -import org.apache.iceberg.Files; -import org.apache.iceberg.Schema; -import org.apache.iceberg.avro.AvroSchemaUtil; -import org.apache.iceberg.data.DataTest; -import org.apache.iceberg.data.RandomGenericData; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.data.parquet.GenericParquetWriter; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.TestHelpers; -import org.apache.iceberg.io.CloseableIterable; -import org.apache.iceberg.io.FileAppender; -import org.apache.iceberg.parquet.Parquet; -import org.apache.iceberg.parquet.ParquetValueReader; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Types; -import org.apache.parquet.avro.AvroParquetWriter; -import org.apache.parquet.hadoop.ParquetWriter; -import org.apache.parquet.schema.LogicalTypeAnnotation; -import org.apache.parquet.schema.MessageType; -import org.apache.parquet.schema.PrimitiveType; -import org.apache.parquet.schema.Type; -import org.junit.jupiter.api.Test; - -public class TestFlinkParquetReader extends DataTest { - private static final int NUM_RECORDS = 100; - - @Test - public void testBuildReader() { - MessageType fileSchema = - new MessageType( - "test", - // 0: required(100, "id", LongType.get()) - primitive(PrimitiveType.PrimitiveTypeName.INT64, Type.Repetition.REQUIRED) - .id(100) - .named("id"), - // 1: optional(101, "data", Types.StringType.get()) - primitive(PrimitiveType.PrimitiveTypeName.BINARY, Type.Repetition.OPTIONAL) - .id(101) - .named("data"), - // 2: required(102, "b", Types.BooleanType.get()) - primitive(PrimitiveType.PrimitiveTypeName.BOOLEAN, Type.Repetition.REQUIRED) - .id(102) - .named("b"), - // 3: optional(103, "i", Types.IntegerType.get()) - primitive(PrimitiveType.PrimitiveTypeName.INT32, Type.Repetition.OPTIONAL) - .id(103) - .named("i"), - // 4: optional(105, "f", Types.FloatType.get()) - primitive(PrimitiveType.PrimitiveTypeName.INT64, Type.Repetition.REQUIRED) - .id(104) - .named("l"), - // 5: required(106, "d", Types.DoubleType.get()) - primitive(PrimitiveType.PrimitiveTypeName.FLOAT, Type.Repetition.OPTIONAL) - .id(105) - .named("f"), - // 6: required(106, "d", Types.DoubleType.get()) - primitive(PrimitiveType.PrimitiveTypeName.DOUBLE, Type.Repetition.REQUIRED) - .id(106) - .named("d"), - // 7: optional(107, "date", Types.DateType.get()) - primitive(PrimitiveType.PrimitiveTypeName.INT32, Type.Repetition.OPTIONAL) - .id(107) - .as(LogicalTypeAnnotation.dateType()) - .named("date"), - // 8: required(108, "ts_tz", Types.TimestampType.withZone()) - primitive(PrimitiveType.PrimitiveTypeName.INT64, Type.Repetition.REQUIRED) - .id(108) - .as( - LogicalTypeAnnotation.timestampType( - true, LogicalTypeAnnotation.TimeUnit.MICROS)) - .named("ts_tz"), - // 9: required(109, "ts", Types.TimestampType.withoutZone()) - primitive(PrimitiveType.PrimitiveTypeName.INT64, 
Type.Repetition.REQUIRED) - .id(109) - .as( - LogicalTypeAnnotation.timestampType( - false, LogicalTypeAnnotation.TimeUnit.MICROS)) - .named("ts"), - // 10: required(110, "s", Types.StringType.get()) - primitive(PrimitiveType.PrimitiveTypeName.BINARY, Type.Repetition.REQUIRED) - .id(110) - .as(LogicalTypeAnnotation.stringType()) - .named("s"), - // 11: required(112, "fixed", Types.FixedType.ofLength(7)) - primitive( - PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY, Type.Repetition.REQUIRED) - .id(112) - .length(7) - .named("f"), - // 12: optional(113, "bytes", Types.BinaryType.get()) - primitive(PrimitiveType.PrimitiveTypeName.BINARY, Type.Repetition.OPTIONAL) - .id(113) - .named("bytes"), - // 13: required(114, "dec_9_0", Types.DecimalType.of(9, 0)) - primitive(PrimitiveType.PrimitiveTypeName.INT64, Type.Repetition.REQUIRED) - .id(114) - .as(LogicalTypeAnnotation.decimalType(0, 9)) - .named("dec_9_0"), - // 14: required(115, "dec_11_2", Types.DecimalType.of(11, 2)) - primitive(PrimitiveType.PrimitiveTypeName.INT64, Type.Repetition.REQUIRED) - .id(115) - .as(LogicalTypeAnnotation.decimalType(2, 11)) - .named("dec_11_2"), - // 15: required(116, "dec_38_10", Types.DecimalType.of(38, 10)) // maximum precision - primitive( - PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY, Type.Repetition.REQUIRED) - .id(116) - .length(16) - .as(LogicalTypeAnnotation.decimalType(10, 38)) - .named("dec_38_10"), - // 16: required(117, "time", Types.TimeType.get()) - primitive(PrimitiveType.PrimitiveTypeName.INT64, Type.Repetition.OPTIONAL) - .id(117) - .as(LogicalTypeAnnotation.timeType(true, LogicalTypeAnnotation.TimeUnit.MICROS)) - .named("time")); - ParquetValueReader reader = - FlinkParquetReaders.buildReader(new Schema(SUPPORTED_PRIMITIVES.fields()), fileSchema); - - assertThat(reader.columns().size()).isEqualTo(SUPPORTED_PRIMITIVES.fields().size()); - } - - @Test - public void testTwoLevelList() throws IOException { - Schema schema = - new Schema( - optional(1, "arraybytes", Types.ListType.ofRequired(3, Types.BinaryType.get())), - optional(2, "topbytes", Types.BinaryType.get())); - org.apache.avro.Schema avroSchema = AvroSchemaUtil.convert(schema.asStruct()); - - File testFile = File.createTempFile("junit", null, temp.toFile()); - assertThat(testFile.delete()).isTrue(); - - ParquetWriter writer = - AvroParquetWriter.builder(new Path(testFile.toURI())) - .withDataModel(GenericData.get()) - .withSchema(avroSchema) - .config("parquet.avro.add-list-element-records", "true") - .config("parquet.avro.write-old-list-structure", "true") - .build(); - - GenericRecordBuilder recordBuilder = new GenericRecordBuilder(avroSchema); - List expectedByteList = Lists.newArrayList(); - byte[] expectedByte = {0x00, 0x01}; - ByteBuffer expectedBinary = ByteBuffer.wrap(expectedByte); - expectedByteList.add(expectedBinary); - recordBuilder.set("arraybytes", expectedByteList); - recordBuilder.set("topbytes", expectedBinary); - GenericData.Record expectedRecord = recordBuilder.build(); - - writer.write(expectedRecord); - writer.close(); - - try (CloseableIterable reader = - Parquet.read(Files.localInput(testFile)) - .project(schema) - .createReaderFunc(type -> FlinkParquetReaders.buildReader(schema, type)) - .build()) { - Iterator rows = reader.iterator(); - assertThat(rows).hasNext(); - RowData rowData = rows.next(); - assertThat(rowData.getArray(0).getBinary(0)).isEqualTo(expectedByte); - assertThat(rowData.getBinary(1)).isEqualTo(expectedByte); - assertThat(rows).isExhausted(); - } - } - - private void 
writeAndValidate(Iterable iterable, Schema schema) throws IOException { - File testFile = File.createTempFile("junit", null, temp.toFile()); - assertThat(testFile.delete()).isTrue(); - - try (FileAppender writer = - Parquet.write(Files.localOutput(testFile)) - .schema(schema) - .createWriterFunc(GenericParquetWriter::buildWriter) - .build()) { - writer.addAll(iterable); - } - - try (CloseableIterable reader = - Parquet.read(Files.localInput(testFile)) - .project(schema) - .createReaderFunc(type -> FlinkParquetReaders.buildReader(schema, type)) - .build()) { - Iterator expected = iterable.iterator(); - Iterator rows = reader.iterator(); - LogicalType rowType = FlinkSchemaUtil.convert(schema); - for (int i = 0; i < NUM_RECORDS; i += 1) { - assertThat(rows).hasNext(); - TestHelpers.assertRowData(schema.asStruct(), rowType, expected.next(), rows.next()); - } - assertThat(rows).isExhausted(); - } - } - - @Override - protected void writeAndValidate(Schema schema) throws IOException { - writeAndValidate(RandomGenericData.generate(schema, NUM_RECORDS, 19981), schema); - writeAndValidate( - RandomGenericData.generateDictionaryEncodableRecords(schema, NUM_RECORDS, 21124), schema); - writeAndValidate( - RandomGenericData.generateFallbackRecords(schema, NUM_RECORDS, 21124, NUM_RECORDS / 20), - schema); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetWriter.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetWriter.java deleted file mode 100644 index b1e6f5aa00ff..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetWriter.java +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.data; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.File; -import java.io.IOException; -import java.nio.file.Path; -import java.util.Iterator; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.LogicalType; -import org.apache.iceberg.Files; -import org.apache.iceberg.Schema; -import org.apache.iceberg.data.DataTest; -import org.apache.iceberg.data.RandomGenericData; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.data.parquet.GenericParquetReaders; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.TestHelpers; -import org.apache.iceberg.io.CloseableIterable; -import org.apache.iceberg.io.FileAppender; -import org.apache.iceberg.parquet.Parquet; -import org.junit.jupiter.api.io.TempDir; - -public class TestFlinkParquetWriter extends DataTest { - private static final int NUM_RECORDS = 100; - - @TempDir private Path temp; - - private void writeAndValidate(Iterable iterable, Schema schema) throws IOException { - File testFile = File.createTempFile("junit", null, temp.toFile()); - assertThat(testFile.delete()).isTrue(); - - LogicalType logicalType = FlinkSchemaUtil.convert(schema); - - try (FileAppender writer = - Parquet.write(Files.localOutput(testFile)) - .schema(schema) - .createWriterFunc(msgType -> FlinkParquetWriters.buildWriter(logicalType, msgType)) - .build()) { - writer.addAll(iterable); - } - - try (CloseableIterable reader = - Parquet.read(Files.localInput(testFile)) - .project(schema) - .createReaderFunc(msgType -> GenericParquetReaders.buildReader(schema, msgType)) - .build()) { - Iterator expected = iterable.iterator(); - Iterator actual = reader.iterator(); - LogicalType rowType = FlinkSchemaUtil.convert(schema); - for (int i = 0; i < NUM_RECORDS; i += 1) { - assertThat(actual).hasNext(); - TestHelpers.assertRowData(schema.asStruct(), rowType, actual.next(), expected.next()); - } - assertThat(actual).isExhausted(); - } - } - - @Override - protected void writeAndValidate(Schema schema) throws IOException { - writeAndValidate(RandomRowData.generate(schema, NUM_RECORDS, 19981), schema); - - writeAndValidate( - RandomRowData.convert( - schema, - RandomGenericData.generateDictionaryEncodableRecords(schema, NUM_RECORDS, 21124)), - schema); - - writeAndValidate( - RandomRowData.convert( - schema, - RandomGenericData.generateFallbackRecords( - schema, NUM_RECORDS, 21124, NUM_RECORDS / 20)), - schema); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/data/TestRowDataProjection.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/data/TestRowDataProjection.java deleted file mode 100644 index d078b2228456..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/data/TestRowDataProjection.java +++ /dev/null @@ -1,593 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.data; - -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatNoException; -import static org.assertj.core.api.Assertions.assertThatThrownBy; - -import java.util.List; -import org.apache.flink.table.data.GenericArrayData; -import org.apache.flink.table.data.GenericMapData; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.iceberg.Schema; -import org.apache.iceberg.StructLike; -import org.apache.iceberg.data.RandomGenericData; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.DataGenerator; -import org.apache.iceberg.flink.DataGenerators; -import org.apache.iceberg.flink.TestHelpers; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Types; -import org.apache.iceberg.util.StructProjection; -import org.junit.jupiter.api.Test; - -public class TestRowDataProjection { - @Test - public void testNullRootRowData() { - Schema schema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get())); - - RowDataProjection projection = RowDataProjection.create(schema, schema.select("id")); - - assertThatThrownBy(() -> projection.wrap(null)) - .isInstanceOf(IllegalArgumentException.class) - .hasMessage("Invalid row data: null"); - } - - @Test - public void testFullProjection() { - Schema schema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get())); - - generateAndValidate(schema, schema); - - GenericRowData rowData = GenericRowData.of(1L, StringData.fromString("a")); - GenericRowData copyRowData = GenericRowData.of(1L, StringData.fromString("a")); - GenericRowData otherRowData = GenericRowData.of(2L, StringData.fromString("b")); - testEqualsAndHashCode(schema, schema, rowData, copyRowData, otherRowData); - } - - @Test - public void testReorderedFullProjection() { - Schema schema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get())); - - Schema reordered = - new Schema( - Types.NestedField.optional(1, "data", Types.StringType.get()), - Types.NestedField.required(0, "id", Types.LongType.get())); - - generateAndValidate(schema, reordered); - - GenericRowData rowData = GenericRowData.of(1L, StringData.fromString("a")); - GenericRowData copyRowData = GenericRowData.of(1L, StringData.fromString("a")); - GenericRowData otherRowData = GenericRowData.of(2L, StringData.fromString("b")); - testEqualsAndHashCode(schema, reordered, rowData, copyRowData, otherRowData); - } - - @Test - public void testBasicProjection() { - Schema schema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get())); - Schema idOnly 
= new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); - Schema dataOnly = new Schema(Types.NestedField.optional(1, "data", Types.StringType.get())); - generateAndValidate(schema, idOnly); - generateAndValidate(schema, dataOnly); - - GenericRowData rowData = GenericRowData.of(1L, StringData.fromString("a")); - GenericRowData copyRowData = GenericRowData.of(1L, StringData.fromString("a")); - GenericRowData otherRowData = GenericRowData.of(2L, StringData.fromString("b")); - testEqualsAndHashCode(schema, idOnly, rowData, copyRowData, otherRowData); - testEqualsAndHashCode(schema, dataOnly, rowData, copyRowData, otherRowData); - } - - @Test - public void testEmptyProjection() { - Schema schema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get())); - generateAndValidate(schema, schema.select()); - - GenericRowData rowData = GenericRowData.of(1L, StringData.fromString("a")); - GenericRowData copyRowData = GenericRowData.of(1L, StringData.fromString("a")); - GenericRowData otherRowData = GenericRowData.of(2L, StringData.fromString("b")); - testEqualsAndHashCode(schema, schema.select(), rowData, copyRowData, otherRowData, true); - } - - @Test - public void testRename() { - Schema schema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get())); - - Schema renamed = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "renamed", Types.StringType.get())); - generateAndValidate(schema, renamed); - - GenericRowData rowData = GenericRowData.of(1L, StringData.fromString("a")); - GenericRowData copyRowData = GenericRowData.of(1L, StringData.fromString("a")); - GenericRowData otherRowData = GenericRowData.of(2L, StringData.fromString("b")); - testEqualsAndHashCode(schema, renamed, rowData, copyRowData, otherRowData); - } - - @Test - public void testNestedProjection() { - Schema schema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional( - 3, - "location", - Types.StructType.of( - Types.NestedField.required(1, "lat", Types.FloatType.get()), - Types.NestedField.required(2, "long", Types.FloatType.get())))); - - GenericRowData rowData = GenericRowData.of(1L, GenericRowData.of(1.0f, 1.0f)); - GenericRowData copyRowData = GenericRowData.of(1L, GenericRowData.of(1.0f, 1.0f)); - GenericRowData otherRowData = GenericRowData.of(2L, GenericRowData.of(2.0f, 2.0f)); - - GenericRowData rowDataNullStruct = GenericRowData.of(1L, null); - GenericRowData copyRowDataNullStruct = GenericRowData.of(1L, null); - GenericRowData otherRowDataNullStruct = GenericRowData.of(2L, null); - - // Project id only. - Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); - assertThat(idOnly.columns().size()).isGreaterThan(0); - generateAndValidate(schema, idOnly); - testEqualsAndHashCode(schema, idOnly, rowData, copyRowData, otherRowData); - testEqualsAndHashCode( - schema, idOnly, rowDataNullStruct, copyRowDataNullStruct, otherRowDataNullStruct); - - // Project lat only. 
- Schema latOnly = - new Schema( - Types.NestedField.optional( - 3, - "location", - Types.StructType.of(Types.NestedField.required(1, "lat", Types.FloatType.get())))); - assertThat(latOnly.columns().size()).isGreaterThan(0); - generateAndValidate(schema, latOnly); - testEqualsAndHashCode(schema, latOnly, rowData, copyRowData, otherRowData); - testEqualsAndHashCode( - schema, latOnly, rowDataNullStruct, copyRowDataNullStruct, otherRowDataNullStruct, true); - - // Project long only. - Schema longOnly = - new Schema( - Types.NestedField.optional( - 3, - "location", - Types.StructType.of(Types.NestedField.required(2, "long", Types.FloatType.get())))); - assertThat(longOnly.columns().size()).isGreaterThan(0); - generateAndValidate(schema, longOnly); - testEqualsAndHashCode(schema, longOnly, rowData, copyRowData, otherRowData); - testEqualsAndHashCode( - schema, longOnly, rowDataNullStruct, copyRowDataNullStruct, otherRowDataNullStruct, true); - - // Project location. - Schema locationOnly = schema.select("location"); - assertThat(locationOnly.columns().size()).isGreaterThan(0); - generateAndValidate(schema, locationOnly); - testEqualsAndHashCode(schema, locationOnly, rowData, copyRowData, otherRowData); - testEqualsAndHashCode( - schema, - locationOnly, - rowDataNullStruct, - copyRowDataNullStruct, - otherRowDataNullStruct, - true); - } - - @Test - public void testPrimitivesFullProjection() { - DataGenerator dataGenerator = new DataGenerators.Primitives(); - Schema schema = dataGenerator.icebergSchema(); - generateAndValidate(schema, schema); - - GenericRowData rowData = dataGenerator.generateFlinkRowData(); - GenericRowData copyRowData = dataGenerator.generateFlinkRowData(); - GenericRowData otherRowData = dataGenerator.generateFlinkRowData(); - // modify the string field value (position 6) - otherRowData.setField(6, StringData.fromString("foo_bar")); - testEqualsAndHashCode(schema, schema, rowData, copyRowData, otherRowData); - - GenericRowData rowDataNullOptionalFields = dataGenerator.generateFlinkRowData(); - setOptionalFieldsNullForPrimitives(rowDataNullOptionalFields); - GenericRowData copyRowDataNullOptionalFields = dataGenerator.generateFlinkRowData(); - setOptionalFieldsNullForPrimitives(copyRowDataNullOptionalFields); - GenericRowData otherRowDataNullOptionalFields = dataGenerator.generateFlinkRowData(); - // modify the string field value (position 6) - otherRowDataNullOptionalFields.setField(6, StringData.fromString("foo_bar")); - setOptionalFieldsNullForPrimitives(otherRowData); - testEqualsAndHashCode( - schema, - schema, - rowDataNullOptionalFields, - copyRowDataNullOptionalFields, - otherRowDataNullOptionalFields); - } - - private void setOptionalFieldsNullForPrimitives(GenericRowData rowData) { - // fields from [1, 5] range are optional - for (int pos = 1; pos <= 5; ++pos) { - rowData.setField(pos, null); - } - } - - @Test - public void testMapOfPrimitivesProjection() { - DataGenerator dataGenerator = new DataGenerators.MapOfPrimitives(); - Schema schema = dataGenerator.icebergSchema(); - - // Project id only. - Schema idOnly = schema.select("row_id"); - assertThat(idOnly.columns().size()).isGreaterThan(0); - generateAndValidate(schema, idOnly); - - // Project map only. - Schema mapOnly = schema.select("map_of_primitives"); - assertThat(mapOnly.columns().size()).isGreaterThan(0); - generateAndValidate(schema, mapOnly); - - // Project all. 
- generateAndValidate(schema, schema); - - GenericRowData rowData = dataGenerator.generateFlinkRowData(); - GenericRowData copyRowData = dataGenerator.generateFlinkRowData(); - // modify the map field value - GenericRowData otherRowData = - GenericRowData.of( - StringData.fromString("row_id_value"), - new GenericMapData( - ImmutableMap.of(StringData.fromString("foo"), 1, StringData.fromString("bar"), 2))); - testEqualsAndHashCode(schema, idOnly, rowData, copyRowData, otherRowData, true); - testEqualsAndHashCode(schema, mapOnly, rowData, copyRowData, otherRowData); - testEqualsAndHashCode(schema, schema, rowData, copyRowData, otherRowData); - - GenericRowData rowDataNullOptionalFields = - GenericRowData.of(StringData.fromString("row_id_value"), null); - GenericRowData copyRowDataNullOptionalFields = - GenericRowData.of(StringData.fromString("row_id_value"), null); - // modify the map field value - GenericRowData otherRowDataNullOptionalFields = - GenericRowData.of(StringData.fromString("other_row_id_value"), null); - testEqualsAndHashCode( - schema, - idOnly, - rowDataNullOptionalFields, - copyRowDataNullOptionalFields, - otherRowDataNullOptionalFields); - testEqualsAndHashCode( - schema, - mapOnly, - rowDataNullOptionalFields, - copyRowDataNullOptionalFields, - otherRowDataNullOptionalFields, - true); - testEqualsAndHashCode( - schema, - schema, - rowDataNullOptionalFields, - copyRowDataNullOptionalFields, - otherRowDataNullOptionalFields); - } - - @Test - public void testMapOfStructStructProjection() { - DataGenerator dataGenerator = new DataGenerators.MapOfStructStruct(); - Schema schema = dataGenerator.icebergSchema(); - - // Project id only. - Schema idOnly = schema.select("row_id"); - assertThat(idOnly.columns().size()).isGreaterThan(0); - generateAndValidate(schema, idOnly); - - // Project map only. - Schema mapOnly = schema.select("map"); - assertThat(mapOnly.columns().size()).isGreaterThan(0); - generateAndValidate(schema, mapOnly); - - // Project all. - generateAndValidate(schema, schema); - - // Project partial map key. - Schema partialMapKey = - new Schema( - Types.NestedField.optional( - 2, - "map", - Types.MapType.ofOptional( - 101, - 102, - Types.StructType.of( - Types.NestedField.required(201, "key", Types.LongType.get())), - Types.StructType.of( - Types.NestedField.required(203, "value", Types.LongType.get()), - Types.NestedField.required(204, "valueData", Types.StringType.get()))))); - assertThatThrownBy(() -> generateAndValidate(schema, partialMapKey)) - .isInstanceOf(IllegalArgumentException.class) - .hasMessageContaining("Cannot project a partial map key or value struct."); - - // Project partial map key. 
- Schema partialMapValue = - new Schema( - Types.NestedField.optional( - 2, - "map", - Types.MapType.ofOptional( - 101, - 102, - Types.StructType.of( - Types.NestedField.required(201, "key", Types.LongType.get()), - Types.NestedField.required(202, "keyData", Types.StringType.get())), - Types.StructType.of( - Types.NestedField.required(203, "value", Types.LongType.get()))))); - assertThatThrownBy(() -> generateAndValidate(schema, partialMapValue)) - .isInstanceOf(IllegalArgumentException.class) - .hasMessageContaining("Cannot project a partial map key or value struct."); - - GenericRowData rowData = dataGenerator.generateFlinkRowData(); - GenericRowData copyRowData = dataGenerator.generateFlinkRowData(); - // modify the map field value - GenericRowData otherRowData = - GenericRowData.of( - StringData.fromString("other_row_id_value"), - new GenericMapData( - ImmutableMap.of( - GenericRowData.of(1L, StringData.fromString("other_key_data")), - GenericRowData.of(1L, StringData.fromString("other_value_data"))))); - testEqualsAndHashCode(schema, idOnly, rowData, copyRowData, otherRowData); - testEqualsAndHashCode(schema, mapOnly, rowData, copyRowData, otherRowData); - testEqualsAndHashCode(schema, schema, rowData, copyRowData, otherRowData); - - GenericRowData rowDataNullOptionalFields = - GenericRowData.of( - StringData.fromString("row_id_value"), - new GenericMapData( - ImmutableMap.of(GenericRowData.of(1L, null), GenericRowData.of(1L, null)))); - GenericRowData copyRowDataNullOptionalFields = - GenericRowData.of( - StringData.fromString("row_id_value"), - new GenericMapData( - ImmutableMap.of(GenericRowData.of(1L, null), GenericRowData.of(1L, null)))); - // modify the map field value - GenericRowData otherRowDataNullOptionalFields = - GenericRowData.of( - StringData.fromString("other_row_id_value"), - new GenericMapData( - ImmutableMap.of(GenericRowData.of(2L, null), GenericRowData.of(2L, null)))); - testEqualsAndHashCode( - schema, - idOnly, - rowDataNullOptionalFields, - copyRowDataNullOptionalFields, - otherRowDataNullOptionalFields); - testEqualsAndHashCode( - schema, - mapOnly, - rowDataNullOptionalFields, - copyRowDataNullOptionalFields, - otherRowDataNullOptionalFields); - testEqualsAndHashCode( - schema, - schema, - rowDataNullOptionalFields, - copyRowDataNullOptionalFields, - otherRowDataNullOptionalFields); - } - - @Test - public void testArrayOfPrimitiveProjection() { - DataGenerator dataGenerator = new DataGenerators.ArrayOfPrimitive(); - Schema schema = dataGenerator.icebergSchema(); - - // Project id only. - Schema idOnly = schema.select("row_id"); - assertThat(idOnly.columns().size()).isGreaterThan(0); - generateAndValidate(schema, idOnly); - - // Project list only. - Schema arrayOnly = schema.select("array_of_int"); - assertThat(arrayOnly.columns().size()).isGreaterThan(0); - generateAndValidate(schema, arrayOnly); - - // Project all. 
- generateAndValidate(schema, schema); - - GenericRowData rowData = dataGenerator.generateFlinkRowData(); - GenericRowData copyRowData = dataGenerator.generateFlinkRowData(); - // modify the map field value - GenericRowData otherRowData = - GenericRowData.of( - StringData.fromString("other_row_id_value"), - new GenericArrayData(new Integer[] {4, 5, 6})); - testEqualsAndHashCode(schema, idOnly, rowData, copyRowData, otherRowData); - testEqualsAndHashCode(schema, arrayOnly, rowData, copyRowData, otherRowData); - testEqualsAndHashCode(schema, schema, rowData, copyRowData, otherRowData); - - GenericRowData rowDataNullOptionalFields = - GenericRowData.of( - StringData.fromString("row_id_value"), - new GenericArrayData(new Integer[] {1, null, 3})); - GenericRowData copyRowDataNullOptionalFields = - GenericRowData.of( - StringData.fromString("row_id_value"), - new GenericArrayData(new Integer[] {1, null, 3})); - // modify the map field value - GenericRowData otherRowDataNullOptionalFields = - GenericRowData.of( - StringData.fromString("other_row_id_value"), - new GenericArrayData(new Integer[] {4, null, 6})); - testEqualsAndHashCode( - schema, - idOnly, - rowDataNullOptionalFields, - copyRowDataNullOptionalFields, - otherRowDataNullOptionalFields); - testEqualsAndHashCode( - schema, - arrayOnly, - rowDataNullOptionalFields, - copyRowDataNullOptionalFields, - otherRowDataNullOptionalFields); - testEqualsAndHashCode( - schema, - schema, - rowDataNullOptionalFields, - copyRowDataNullOptionalFields, - otherRowDataNullOptionalFields); - } - - @Test - public void testArrayOfStructProjection() { - DataGenerator dataGenerator = new DataGenerators.ArrayOfStruct(); - Schema schema = dataGenerator.icebergSchema(); - - // Project id only. - Schema idOnly = schema.select("row_id"); - assertThat(idOnly.columns().size()).isGreaterThan(0); - generateAndValidate(schema, idOnly); - - // Project list only. - Schema arrayOnly = schema.select("array_of_struct"); - assertThat(arrayOnly.columns().size()).isGreaterThan(0); - generateAndValidate(schema, arrayOnly); - - // Project all. - generateAndValidate(schema, schema); - - // Project partial list value. 
- Schema partialList = - new Schema( - Types.NestedField.optional( - 2, - "array_of_struct", - Types.ListType.ofOptional( - 101, - Types.StructType.of( - Types.NestedField.required(202, "name", Types.StringType.get()))))); - - assertThatThrownBy(() -> generateAndValidate(schema, partialList)) - .isInstanceOf(IllegalArgumentException.class) - .hasMessageContaining("Cannot project a partial list element struct."); - - GenericRowData rowData = dataGenerator.generateFlinkRowData(); - GenericRowData copyRowData = dataGenerator.generateFlinkRowData(); - // modify the map field value - GenericRowData otherRowData = - GenericRowData.of( - StringData.fromString("row_id_value"), new GenericArrayData(new Integer[] {4, 5, 6})); - testEqualsAndHashCode(schema, schema, rowData, copyRowData, otherRowData); - - GenericRowData rowDataNullOptionalFields = - GenericRowData.of( - StringData.fromString("row_id_value"), - new GenericArrayData(new Integer[] {1, null, 3})); - GenericRowData copyRowDataNullOptionalFields = - GenericRowData.of( - StringData.fromString("row_id_value"), - new GenericArrayData(new Integer[] {1, null, 3})); - // modify the map field value - GenericRowData otherRowDataNullOptionalFields = - GenericRowData.of( - StringData.fromString("row_id_value"), - new GenericArrayData(new Integer[] {4, null, 6})); - testEqualsAndHashCode( - schema, - schema, - rowDataNullOptionalFields, - copyRowDataNullOptionalFields, - otherRowDataNullOptionalFields); - } - - private void generateAndValidate(Schema schema, Schema projectSchema) { - int numRecords = 100; - List recordList = RandomGenericData.generate(schema, numRecords, 102L); - List rowDataList = - Lists.newArrayList(RandomRowData.generate(schema, numRecords, 102L).iterator()); - assertThat(rowDataList).hasSize(recordList.size()); - - StructProjection structProjection = StructProjection.create(schema, projectSchema); - RowDataProjection rowDataProjection = RowDataProjection.create(schema, projectSchema); - - for (int i = 0; i < numRecords; i++) { - StructLike expected = structProjection.wrap(recordList.get(i)); - RowData projected = rowDataProjection.wrap(rowDataList.get(i)); - TestHelpers.assertRowData(projectSchema, expected, projected); - - assertThat(projected).isEqualTo(projected); - assertThat(projected).hasSameHashCodeAs(projected); - // make sure toString doesn't throw NPE for null values - assertThatNoException().isThrownBy(projected::toString); - } - } - - private void testEqualsAndHashCode( - Schema schema, - Schema projectionSchema, - RowData rowData, - RowData copyRowData, - RowData otherRowData) { - testEqualsAndHashCode(schema, projectionSchema, rowData, copyRowData, otherRowData, false); - } - - /** - * @param isOtherRowDataSameAsRowData sometimes projection on otherRowData can result in the same - * RowData, e.g. 
due to empty projection or null struct - */ - private void testEqualsAndHashCode( - Schema schema, - Schema projectionSchema, - RowData rowData, - RowData copyRowData, - RowData otherRowData, - boolean isOtherRowDataSameAsRowData) { - RowDataProjection projection = RowDataProjection.create(schema, projectionSchema); - RowDataProjection copyProjection = RowDataProjection.create(schema, projectionSchema); - RowDataProjection otherProjection = RowDataProjection.create(schema, projectionSchema); - - assertThat(projection.wrap(rowData)).isEqualTo(copyProjection.wrap(copyRowData)); - assertThat(projection.wrap(rowData)).hasSameHashCodeAs(copyProjection.wrap(copyRowData)); - - if (isOtherRowDataSameAsRowData) { - assertThat(projection.wrap(rowData)).isEqualTo(otherProjection.wrap(otherRowData)); - assertThat(projection.wrap(rowData)).hasSameHashCodeAs(otherProjection.wrap(otherRowData)); - } else { - assertThat(projection.wrap(rowData)).isNotEqualTo(otherProjection.wrap(otherRowData)); - assertThat(projection.wrap(rowData)) - .doesNotHaveSameHashCodeAs(otherProjection.wrap(otherRowData)); - } - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/data/TestRowProjection.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/data/TestRowProjection.java deleted file mode 100644 index e76452b7cea0..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/data/TestRowProjection.java +++ /dev/null @@ -1,594 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.data; - -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.withPrecision; - -import java.io.File; -import java.io.IOException; -import java.nio.file.Path; -import java.util.Map; -import org.apache.flink.table.data.ArrayData; -import org.apache.flink.table.data.GenericArrayData; -import org.apache.flink.table.data.GenericMapData; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.iceberg.Files; -import org.apache.iceberg.Schema; -import org.apache.iceberg.avro.Avro; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.io.FileAppender; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Iterables; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.types.Types; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.io.TempDir; - -public class TestRowProjection { - - @TempDir private Path temp; - - private RowData writeAndRead(String desc, Schema writeSchema, Schema readSchema, RowData row) - throws IOException { - File file = File.createTempFile("junit", desc + ".avro", temp.toFile()); - assertThat(file.delete()).isTrue(); - - try (FileAppender appender = - Avro.write(Files.localOutput(file)) - .schema(writeSchema) - .createWriterFunc(ignore -> new FlinkAvroWriter(FlinkSchemaUtil.convert(writeSchema))) - .build()) { - appender.add(row); - } - - Iterable records = - Avro.read(Files.localInput(file)) - .project(readSchema) - .createReaderFunc(FlinkAvroReader::new) - .build(); - - return Iterables.getOnlyElement(records); - } - - @Test - public void testFullProjection() throws Exception { - Schema schema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get())); - - RowData row = GenericRowData.of(34L, StringData.fromString("test")); - - RowData projected = writeAndRead("full_projection", schema, schema, row); - - assertThat(projected.getLong(0)).isEqualTo(34); - assertThat(projected.getString(1)).asString().isEqualTo("test"); - } - - @Test - public void testSpecialCharacterProjection() throws Exception { - Schema schema = - new Schema( - Types.NestedField.required(0, "user id", Types.LongType.get()), - Types.NestedField.optional(1, "data%0", Types.StringType.get())); - - RowData row = GenericRowData.of(34L, StringData.fromString("test")); - - RowData full = writeAndRead("special_chars", schema, schema, row); - - assertThat(full.getLong(0)).isEqualTo(34L); - assertThat(full.getString(1)).asString().isEqualTo("test"); - - RowData projected = writeAndRead("special_characters", schema, schema.select("data%0"), full); - - assertThat(projected.getArity()).isEqualTo(1); - assertThat(projected.getString(0)).asString().isEqualTo("test"); - } - - @Test - public void testReorderedFullProjection() throws Exception { - Schema schema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get())); - - RowData row = GenericRowData.of(34L, StringData.fromString("test")); - - Schema reordered = - new Schema( - Types.NestedField.optional(1, "data", Types.StringType.get()), - Types.NestedField.required(0, "id", Types.LongType.get())); - - RowData projected = 
writeAndRead("full_projection", schema, reordered, row); - - assertThat(projected.getString(0)).asString().isEqualTo("test"); - assertThat(projected.getLong(1)).isEqualTo(34); - } - - @Test - public void testReorderedProjection() throws Exception { - Schema schema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get())); - - RowData row = GenericRowData.of(34L, StringData.fromString("test")); - - Schema reordered = - new Schema( - Types.NestedField.optional(2, "missing_1", Types.StringType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()), - Types.NestedField.optional(3, "missing_2", Types.LongType.get())); - - RowData projected = writeAndRead("full_projection", schema, reordered, row); - - assertThat(projected.isNullAt(0)).isTrue(); - assertThat(projected.getString(1)).asString().isEqualTo("test"); - assertThat(projected.isNullAt(2)).isTrue(); - } - - @Test - public void testRenamedAddedField() throws Exception { - Schema schema = - new Schema( - Types.NestedField.required(1, "a", Types.LongType.get()), - Types.NestedField.required(2, "b", Types.LongType.get()), - Types.NestedField.required(3, "d", Types.LongType.get())); - - RowData row = GenericRowData.of(100L, 200L, 300L); - - Schema renamedAdded = - new Schema( - Types.NestedField.optional(1, "a", Types.LongType.get()), - Types.NestedField.optional(2, "b", Types.LongType.get()), - Types.NestedField.optional(3, "c", Types.LongType.get()), - Types.NestedField.optional(4, "d", Types.LongType.get())); - - RowData projected = writeAndRead("rename_and_add_column_projection", schema, renamedAdded, row); - assertThat(projected.getLong(0)) - .as("Should contain the correct value in column 1") - .isEqualTo(100L); - assertThat(projected.getLong(1)) - .as("Should contain the correct value in column 2") - .isEqualTo(200L); - assertThat(projected.getLong(2)) - .as("Should contain the correct value in column 1") - .isEqualTo(300L); - assertThat(projected.isNullAt(3)).as("Should contain empty value on new column 4").isTrue(); - } - - @Test - public void testEmptyProjection() throws Exception { - Schema schema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get())); - - RowData row = GenericRowData.of(34L, StringData.fromString("test")); - - RowData projected = writeAndRead("empty_projection", schema, schema.select(), row); - - assertThat(projected).isNotNull(); - assertThat(projected.getArity()).isEqualTo(0); - } - - @Test - public void testBasicProjection() throws Exception { - Schema writeSchema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get())); - - RowData row = GenericRowData.of(34L, StringData.fromString("test")); - - Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); - - RowData projected = writeAndRead("basic_projection_id", writeSchema, idOnly, row); - assertThat(projected.getArity()).as("Should not project data").isEqualTo(1); - assertThat(projected.getLong(0)).isEqualTo(34L); - - Schema dataOnly = new Schema(Types.NestedField.optional(1, "data", Types.StringType.get())); - - projected = writeAndRead("basic_projection_data", writeSchema, dataOnly, row); - - assertThat(projected.getArity()).as("Should not project id").isEqualTo(1); - assertThat(projected.getString(0)).asString().isEqualTo("test"); - } - - @Test - public void 
testRename() throws Exception { - Schema writeSchema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get())); - - RowData row = GenericRowData.of(34L, StringData.fromString("test")); - - Schema readSchema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "renamed", Types.StringType.get())); - - RowData projected = writeAndRead("project_and_rename", writeSchema, readSchema, row); - - assertThat(projected.getLong(0)).isEqualTo(34L); - assertThat(projected.getString(1)) - .as("Should contain the correct data/renamed value") - .asString() - .isEqualTo("test"); - } - - @Test - public void testNestedStructProjection() throws Exception { - Schema writeSchema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional( - 3, - "location", - Types.StructType.of( - Types.NestedField.required(1, "lat", Types.FloatType.get()), - Types.NestedField.required(2, "long", Types.FloatType.get())))); - - RowData location = GenericRowData.of(52.995143f, -1.539054f); - RowData record = GenericRowData.of(34L, location); - - Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); - - RowData projected = writeAndRead("id_only", writeSchema, idOnly, record); - assertThat(projected.getArity()).isEqualTo(1); - assertThat(projected.getLong(0)).as("Should contain the correct id value").isEqualTo(34L); - - Schema latOnly = - new Schema( - Types.NestedField.optional( - 3, - "location", - Types.StructType.of(Types.NestedField.required(1, "lat", Types.FloatType.get())))); - - projected = writeAndRead("latitude_only", writeSchema, latOnly, record); - RowData projectedLocation = projected.getRow(0, 1); - assertThat(projected.getArity()).as("Should not project id").isEqualTo(1); - assertThat(projected.isNullAt(0)).as("Should project location").isFalse(); - assertThat(projectedLocation.getArity()).as("Should not project longitude").isEqualTo(1); - assertThat(projectedLocation.getFloat(0)) - .as("Should project latitude") - .isEqualTo(52.995143f, withPrecision(0.000001f)); - - Schema longOnly = - new Schema( - Types.NestedField.optional( - 3, - "location", - Types.StructType.of(Types.NestedField.required(2, "long", Types.FloatType.get())))); - - projected = writeAndRead("longitude_only", writeSchema, longOnly, record); - projectedLocation = projected.getRow(0, 1); - assertThat(projected.getArity()).as("Should not project id").isEqualTo(1); - assertThat(projected.isNullAt(0)).as("Should project location").isFalse(); - assertThat(projectedLocation.getArity()).as("Should not project latitutde").isEqualTo(1); - assertThat(projectedLocation.getFloat(0)) - .as("Should project longitude") - .isEqualTo(-1.539054f, withPrecision(0.000001f)); - - Schema locationOnly = writeSchema.select("location"); - projected = writeAndRead("location_only", writeSchema, locationOnly, record); - projectedLocation = projected.getRow(0, 1); - assertThat(projected.getArity()).as("Should not project id").isEqualTo(1); - assertThat(projected.isNullAt(0)).as("Should project location").isFalse(); - assertThat(projectedLocation.getFloat(0)) - .as("Should project latitude") - .isEqualTo(52.995143f, withPrecision(0.000001f)); - assertThat(projectedLocation.getFloat(1)) - .as("Should project longitude") - .isEqualTo(-1.539054f, withPrecision(0.000001f)); - } - - @Test - public void testMapProjection() throws IOException { - Schema writeSchema = - 
new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional( - 5, - "properties", - Types.MapType.ofOptional(6, 7, Types.StringType.get(), Types.StringType.get()))); - - GenericMapData properties = - new GenericMapData( - ImmutableMap.of( - StringData.fromString("a"), - StringData.fromString("A"), - StringData.fromString("b"), - StringData.fromString("B"))); - - RowData row = GenericRowData.of(34L, properties); - - Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); - - RowData projected = writeAndRead("id_only", writeSchema, idOnly, row); - assertThat(projected.getLong(0)).isEqualTo(34L); - assertThat(projected.getArity()).as("Should not project properties map").isEqualTo(1); - - Schema keyOnly = writeSchema.select("properties.key"); - projected = writeAndRead("key_only", writeSchema, keyOnly, row); - assertThat(projected.getArity()).as("Should not project id").isEqualTo(1); - assertThat(projected.getMap(0)).isEqualTo(properties); - - Schema valueOnly = writeSchema.select("properties.value"); - projected = writeAndRead("value_only", writeSchema, valueOnly, row); - assertThat(projected.getArity()).as("Should not project id").isEqualTo(1); - assertThat(projected.getMap(0)).isEqualTo(properties); - - Schema mapOnly = writeSchema.select("properties"); - projected = writeAndRead("map_only", writeSchema, mapOnly, row); - assertThat(projected.getArity()).as("Should not project id").isEqualTo(1); - assertThat(projected.getMap(0)).isEqualTo(properties); - } - - private Map toStringMap(Map map) { - Map stringMap = Maps.newHashMap(); - for (Map.Entry entry : map.entrySet()) { - if (entry.getValue() instanceof CharSequence) { - stringMap.put(entry.getKey().toString(), entry.getValue().toString()); - } else { - stringMap.put(entry.getKey().toString(), entry.getValue()); - } - } - return stringMap; - } - - @Test - public void testMapOfStructsProjection() throws IOException { - Schema writeSchema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional( - 5, - "locations", - Types.MapType.ofOptional( - 6, - 7, - Types.StringType.get(), - Types.StructType.of( - Types.NestedField.required(1, "lat", Types.FloatType.get()), - Types.NestedField.required(2, "long", Types.FloatType.get()))))); - - RowData l1 = GenericRowData.of(53.992811f, -1.542616f); - RowData l2 = GenericRowData.of(52.995143f, -1.539054f); - GenericMapData map = - new GenericMapData( - ImmutableMap.of(StringData.fromString("L1"), l1, StringData.fromString("L2"), l2)); - RowData row = GenericRowData.of(34L, map); - - Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); - - RowData projected = writeAndRead("id_only", writeSchema, idOnly, row); - assertThat(projected.getLong(0)).isEqualTo(34L); - assertThat(projected.getArity()).as("Should not project locations map").isEqualTo(1); - - projected = writeAndRead("all_locations", writeSchema, writeSchema.select("locations"), row); - assertThat(projected.getArity()).as("Should not project id").isEqualTo(1); - assertThat(projected.getMap(0)).isEqualTo(row.getMap(1)); - - projected = writeAndRead("lat_only", writeSchema, writeSchema.select("locations.lat"), row); - GenericMapData locations = (GenericMapData) projected.getMap(0); - assertThat(locations).isNotNull(); - GenericArrayData l1l2Array = - new GenericArrayData( - new Object[] {StringData.fromString("L2"), StringData.fromString("L1")}); - 
assertThat(locations.keyArray()).isEqualTo(l1l2Array); - RowData projectedL1 = (RowData) locations.get(StringData.fromString("L1")); - assertThat(projectedL1).isNotNull(); - assertThat(projectedL1.getFloat(0)) - .as("L1 should contain lat") - .isEqualTo(53.992811f, withPrecision(0.000001f)); - assertThat(projectedL1.getArity()).as("L1 should not contain long").isEqualTo(1); - RowData projectedL2 = (RowData) locations.get(StringData.fromString("L2")); - assertThat(projectedL2).isNotNull(); - assertThat(projectedL2.getFloat(0)) - .as("L2 should contain lat") - .isEqualTo(52.995143f, withPrecision(0.000001f)); - assertThat(projectedL2.getArity()).as("L2 should not contain long").isEqualTo(1); - - projected = writeAndRead("long_only", writeSchema, writeSchema.select("locations.long"), row); - assertThat(projected.getArity()).as("Should not project id").isEqualTo(1); - locations = (GenericMapData) projected.getMap(0); - assertThat(locations).isNotNull(); - assertThat(locations.keyArray()).isEqualTo(l1l2Array); - projectedL1 = (RowData) locations.get(StringData.fromString("L1")); - assertThat(projectedL1).isNotNull(); - assertThat(projectedL1.getArity()).as("L1 should not contain lat").isEqualTo(1); - assertThat(projectedL1.getFloat(0)) - .as("L1 should contain long") - .isEqualTo(-1.542616f, withPrecision(0.000001f)); - projectedL2 = (RowData) locations.get(StringData.fromString("L2")); - assertThat(projectedL2).isNotNull(); - assertThat(projectedL2.getArity()).as("L2 should not contain lat").isEqualTo(1); - assertThat(projectedL2.getFloat(0)) - .as("L2 should contain long") - .isEqualTo(-1.539054f, withPrecision(0.000001f)); - - Schema latitiudeRenamed = - new Schema( - Types.NestedField.optional( - 5, - "locations", - Types.MapType.ofOptional( - 6, - 7, - Types.StringType.get(), - Types.StructType.of( - Types.NestedField.required(1, "latitude", Types.FloatType.get()))))); - - projected = writeAndRead("latitude_renamed", writeSchema, latitiudeRenamed, row); - assertThat(projected.getArity()).as("Should not project id").isEqualTo(1); - locations = (GenericMapData) projected.getMap(0); - assertThat(locations).isNotNull(); - assertThat(locations.keyArray()).isEqualTo(l1l2Array); - projectedL1 = (RowData) locations.get(StringData.fromString("L1")); - assertThat(projectedL1).isNotNull(); - assertThat(projectedL1.getFloat(0)) - .as("L1 should contain latitude") - .isEqualTo(53.992811f, withPrecision(0.000001f)); - projectedL2 = (RowData) locations.get(StringData.fromString("L2")); - assertThat(projectedL2).isNotNull(); - assertThat(projectedL2.getFloat(0)) - .as("L2 should contain latitude") - .isEqualTo(52.995143f, withPrecision(0.000001f)); - } - - @Test - public void testListProjection() throws IOException { - Schema writeSchema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional( - 10, "values", Types.ListType.ofOptional(11, Types.LongType.get()))); - - GenericArrayData values = new GenericArrayData(new Long[] {56L, 57L, 58L}); - - RowData row = GenericRowData.of(34L, values); - - Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); - - RowData projected = writeAndRead("id_only", writeSchema, idOnly, row); - assertThat(projected.getLong(0)).isEqualTo(34L); - assertThat(projected.getArity()).as("Should not project values list").isEqualTo(1); - - Schema elementOnly = writeSchema.select("values.element"); - projected = writeAndRead("element_only", writeSchema, elementOnly, row); - 
assertThat(projected.getArity()).as("Should not project id").isEqualTo(1); - assertThat(projected.getArray(0)).isEqualTo(values); - - Schema listOnly = writeSchema.select("values"); - projected = writeAndRead("list_only", writeSchema, listOnly, row); - assertThat(projected.getArity()).as("Should not project id").isEqualTo(1); - assertThat(projected.getArray(0)).isEqualTo(values); - } - - @Test - @SuppressWarnings("unchecked") - public void testListOfStructsProjection() throws IOException { - Schema writeSchema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional( - 22, - "points", - Types.ListType.ofOptional( - 21, - Types.StructType.of( - Types.NestedField.required(19, "x", Types.IntegerType.get()), - Types.NestedField.optional(18, "y", Types.IntegerType.get()))))); - - RowData p1 = GenericRowData.of(1, 2); - RowData p2 = GenericRowData.of(3, null); - GenericArrayData arrayData = new GenericArrayData(new RowData[] {p1, p2}); - RowData row = GenericRowData.of(34L, arrayData); - - Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); - - RowData projected = writeAndRead("id_only", writeSchema, idOnly, row); - assertThat(projected.getLong(0)).isEqualTo(34L); - assertThat(projected.getArity()).isEqualTo(1); - - projected = writeAndRead("all_points", writeSchema, writeSchema.select("points"), row); - assertThat(projected.getArity()).as("Should not project id").isEqualTo(1); - assertThat(projected.getArray(0)).isEqualTo(row.getArray(1)); - - projected = writeAndRead("x_only", writeSchema, writeSchema.select("points.x"), row); - assertThat(projected.getArity()).as("Should not project id").isEqualTo(1); - assertThat(projected.isNullAt(0)).isFalse(); - ArrayData points = projected.getArray(0); - assertThat(points.size()).isEqualTo(2); - RowData projectedP1 = points.getRow(0, 2); - assertThat(projectedP1.getInt(0)).as("Should project x").isEqualTo(1); - assertThat(projectedP1.getArity()).as("Should not project y").isEqualTo(1); - RowData projectedP2 = points.getRow(1, 2); - assertThat(projectedP2.getArity()).as("Should not project y").isEqualTo(1); - assertThat(projectedP2.getInt(0)).as("Should project x").isEqualTo(3); - - projected = writeAndRead("y_only", writeSchema, writeSchema.select("points.y"), row); - assertThat(projected.getArity()).as("Should not project id").isEqualTo(1); - assertThat(projected.isNullAt(0)).isFalse(); - points = projected.getArray(0); - assertThat(points.size()).isEqualTo(2); - projectedP1 = points.getRow(0, 2); - assertThat(projectedP1.getArity()).as("Should not project x").isEqualTo(1); - assertThat(projectedP1.getInt(0)).as("Should project y").isEqualTo(2); - projectedP2 = points.getRow(1, 2); - assertThat(projectedP2.getArity()).as("Should not project x").isEqualTo(1); - assertThat(projectedP2.isNullAt(0)).as("Should project null y").isTrue(); - - Schema yRenamed = - new Schema( - Types.NestedField.optional( - 22, - "points", - Types.ListType.ofOptional( - 21, - Types.StructType.of( - Types.NestedField.optional(18, "z", Types.IntegerType.get()))))); - - projected = writeAndRead("y_renamed", writeSchema, yRenamed, row); - assertThat(projected.getArity()).as("Should not project id").isEqualTo(1); - assertThat(projected.isNullAt(0)).isFalse(); - points = projected.getArray(0); - assertThat(points.size()).isEqualTo(2); - projectedP1 = points.getRow(0, 2); - assertThat(projectedP1.getArity()).as("Should not project x and y").isEqualTo(1); - assertThat(projectedP1.getInt(0)).as("Should 
project z").isEqualTo(2); - projectedP2 = points.getRow(1, 2); - assertThat(projectedP2.getArity()).as("Should not project x and y").isEqualTo(1); - assertThat(projectedP2.isNullAt(0)).as("Should project null z").isTrue(); - } - - @Test - public void testAddedFieldsWithRequiredChildren() throws Exception { - Schema schema = new Schema(Types.NestedField.required(1, "a", Types.LongType.get())); - - RowData row = GenericRowData.of(100L); - - Schema addedFields = - new Schema( - Types.NestedField.optional(1, "a", Types.LongType.get()), - Types.NestedField.optional( - 2, - "b", - Types.StructType.of(Types.NestedField.required(3, "c", Types.LongType.get()))), - Types.NestedField.optional(4, "d", Types.ListType.ofRequired(5, Types.LongType.get())), - Types.NestedField.optional( - 6, - "e", - Types.MapType.ofRequired(7, 8, Types.LongType.get(), Types.LongType.get()))); - - RowData projected = - writeAndRead("add_fields_with_required_children_projection", schema, addedFields, row); - assertThat(projected.getLong(0)) - .as("Should contain the correct value in column 1") - .isEqualTo(100L); - assertThat(projected.isNullAt(1)).as("Should contain empty value in new column 2").isTrue(); - assertThat(projected.isNullAt(2)).as("Should contain empty value in new column 4").isTrue(); - assertThat(projected.isNullAt(3)).as("Should contain empty value in new column 6").isTrue(); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/data/TestStructRowData.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/data/TestStructRowData.java deleted file mode 100644 index eccab20e04fc..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/data/TestStructRowData.java +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.data; - -import org.apache.iceberg.data.GenericRecord; -import org.apache.iceberg.flink.DataGenerator; -import org.apache.iceberg.flink.DataGenerators; -import org.apache.iceberg.flink.TestHelpers; -import org.junit.jupiter.api.Test; - -public class TestStructRowData { - - protected void testConverter(DataGenerator dataGenerator) { - StructRowData converter = new StructRowData(dataGenerator.icebergSchema().asStruct()); - GenericRecord expected = dataGenerator.generateIcebergGenericRecord(); - StructRowData actual = converter.setStruct(expected); - TestHelpers.assertRowData(dataGenerator.icebergSchema(), expected, actual); - } - - @Test - public void testPrimitiveTypes() { - testConverter(new DataGenerators.Primitives()); - } - - @Test - public void testStructOfPrimitive() { - testConverter(new DataGenerators.StructOfPrimitive()); - } - - @Test - public void testStructOfArray() { - testConverter(new DataGenerators.StructOfArray()); - } - - @Test - public void testStructOfMap() { - testConverter(new DataGenerators.StructOfMap()); - } - - @Test - public void testStructOfStruct() { - testConverter(new DataGenerators.StructOfStruct()); - } - - @Test - public void testArrayOfPrimitive() { - testConverter(new DataGenerators.ArrayOfPrimitive()); - } - - @Test - public void testArrayOfArray() { - testConverter(new DataGenerators.ArrayOfArray()); - } - - @Test - public void testArrayOfMap() { - testConverter(new DataGenerators.ArrayOfMap()); - } - - @Test - public void testArrayOfStruct() { - testConverter(new DataGenerators.ArrayOfStruct()); - } - - @Test - public void testMapOfPrimitives() { - testConverter(new DataGenerators.MapOfPrimitives()); - } - - @Test - public void testMapOfArray() { - testConverter(new DataGenerators.MapOfArray()); - } - - @Test - public void testMapOfMap() { - testConverter(new DataGenerators.MapOfMap()); - } - - @Test - public void testMapOfStruct() { - testConverter(new DataGenerators.MapOfStruct()); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestAvroGenericRecordToRowDataMapper.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestAvroGenericRecordToRowDataMapper.java deleted file mode 100644 index 44eb907a17aa..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestAvroGenericRecordToRowDataMapper.java +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink; - -import static org.assertj.core.api.Assertions.assertThat; - -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.flink.AvroGenericRecordConverterBase; -import org.apache.iceberg.flink.DataGenerator; - -public class TestAvroGenericRecordToRowDataMapper extends AvroGenericRecordConverterBase { - @Override - protected void testConverter(DataGenerator dataGenerator) throws Exception { - // Need to use avroSchema from DataGenerator because some primitive types have special Avro - // type handling. Hence the Avro schema converted from Iceberg schema won't work. - AvroGenericRecordToRowDataMapper mapper = - AvroGenericRecordToRowDataMapper.forAvroSchema(dataGenerator.avroSchema()); - RowData expected = dataGenerator.generateFlinkRowData(); - RowData actual = mapper.map(dataGenerator.generateAvroGenericRecord()); - assertThat(actual).isEqualTo(expected); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionKeySelector.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionKeySelector.java deleted file mode 100644 index abac605f81fd..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionKeySelector.java +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink; - -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatExceptionOfType; - -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.flink.SimpleDataUtil; -import org.apache.iceberg.flink.sink.TestBucketPartitionerUtil.TableSchemaType; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.params.ParameterizedTest; -import org.junit.jupiter.params.provider.EnumSource; - -public class TestBucketPartitionKeySelector { - - @ParameterizedTest - @EnumSource( - value = TableSchemaType.class, - names = {"ONE_BUCKET", "IDENTITY_AND_BUCKET"}) - public void testCorrectKeySelection(TableSchemaType tableSchemaType) { - int numBuckets = 60; - - PartitionSpec partitionSpec = tableSchemaType.getPartitionSpec(numBuckets); - BucketPartitionKeySelector keySelector = - new BucketPartitionKeySelector( - partitionSpec, SimpleDataUtil.SCHEMA, SimpleDataUtil.ROW_TYPE); - - TestBucketPartitionerUtil.generateRowsForBucketIdRange(2, numBuckets) - .forEach( - rowData -> { - int expectedBucketId = - TestBucketPartitionerUtil.computeBucketId( - numBuckets, rowData.getString(1).toString()); - Integer key = keySelector.getKey(rowData); - assertThat(key).isEqualTo(expectedBucketId); - }); - } - - @Test - public void testKeySelectorMultipleBucketsFail() { - PartitionSpec partitionSpec = TableSchemaType.TWO_BUCKETS.getPartitionSpec(1); - - assertThatExceptionOfType(RuntimeException.class) - .isThrownBy( - () -> - new BucketPartitionKeySelector( - partitionSpec, SimpleDataUtil.SCHEMA, SimpleDataUtil.ROW_TYPE)) - .withMessage(BucketPartitionerUtil.BAD_NUMBER_OF_BUCKETS_ERROR_MESSAGE, 2); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitioner.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitioner.java deleted file mode 100644 index 59bdba578ebb..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitioner.java +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink; - -import static org.apache.iceberg.flink.sink.BucketPartitioner.BUCKET_GREATER_THAN_UPPER_BOUND_MESSAGE; -import static org.apache.iceberg.flink.sink.BucketPartitioner.BUCKET_LESS_THAN_LOWER_BOUND_MESSAGE; -import static org.apache.iceberg.flink.sink.BucketPartitioner.BUCKET_NULL_MESSAGE; -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatExceptionOfType; - -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.flink.sink.TestBucketPartitionerUtil.TableSchemaType; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.params.ParameterizedTest; -import org.junit.jupiter.params.provider.CsvSource; - -public class TestBucketPartitioner { - - static final int DEFAULT_NUM_BUCKETS = 60; - - @ParameterizedTest - @CsvSource({"ONE_BUCKET,50", "IDENTITY_AND_BUCKET,50", "ONE_BUCKET,60", "IDENTITY_AND_BUCKET,60"}) - public void testPartitioningParallelismGreaterThanBuckets( - String schemaTypeStr, String numBucketsStr) { - int numPartitions = 500; - TableSchemaType tableSchemaType = TableSchemaType.valueOf(schemaTypeStr); - int numBuckets = Integer.parseInt(numBucketsStr); - PartitionSpec partitionSpec = tableSchemaType.getPartitionSpec(numBuckets); - BucketPartitioner bucketPartitioner = new BucketPartitioner(partitionSpec); - - int bucketId = 0; - for (int expectedIndex = 0; expectedIndex < numPartitions; expectedIndex++) { - int actualPartitionIndex = bucketPartitioner.partition(bucketId, numPartitions); - assertThat(actualPartitionIndex).isEqualTo(expectedIndex); - bucketId++; - if (bucketId == numBuckets) { - bucketId = 0; - } - } - } - - @ParameterizedTest - @CsvSource({"ONE_BUCKET,50", "IDENTITY_AND_BUCKET,50", "ONE_BUCKET,60", "IDENTITY_AND_BUCKET,60"}) - public void testPartitioningParallelismEqualLessThanBuckets( - String schemaTypeStr, String numBucketsStr) { - int numPartitions = 30; - TableSchemaType tableSchemaType = TableSchemaType.valueOf(schemaTypeStr); - int numBuckets = Integer.parseInt(numBucketsStr); - PartitionSpec partitionSpec = tableSchemaType.getPartitionSpec(numBuckets); - BucketPartitioner bucketPartitioner = new BucketPartitioner(partitionSpec); - - for (int bucketId = 0; bucketId < numBuckets; bucketId++) { - int actualPartitionIndex = bucketPartitioner.partition(bucketId, numPartitions); - assertThat(actualPartitionIndex).isEqualTo(bucketId % numPartitions); - } - } - - @Test - public void testPartitionerBucketIdNullFail() { - PartitionSpec partitionSpec = TableSchemaType.ONE_BUCKET.getPartitionSpec(DEFAULT_NUM_BUCKETS); - BucketPartitioner bucketPartitioner = new BucketPartitioner(partitionSpec); - - assertThatExceptionOfType(RuntimeException.class) - .isThrownBy(() -> bucketPartitioner.partition(null, DEFAULT_NUM_BUCKETS)) - .withMessage(BUCKET_NULL_MESSAGE); - } - - @Test - public void testPartitionerMultipleBucketsFail() { - PartitionSpec partitionSpec = TableSchemaType.TWO_BUCKETS.getPartitionSpec(DEFAULT_NUM_BUCKETS); - - assertThatExceptionOfType(RuntimeException.class) - .isThrownBy(() -> new BucketPartitioner(partitionSpec)) - .withMessage(BucketPartitionerUtil.BAD_NUMBER_OF_BUCKETS_ERROR_MESSAGE, 2); - } - - @Test - public void testPartitionerBucketIdOutOfRangeFail() { - PartitionSpec partitionSpec = TableSchemaType.ONE_BUCKET.getPartitionSpec(DEFAULT_NUM_BUCKETS); - BucketPartitioner bucketPartitioner = new BucketPartitioner(partitionSpec); - - int negativeBucketId = -1; - assertThatExceptionOfType(IllegalArgumentException.class) 
- .isThrownBy(() -> bucketPartitioner.partition(negativeBucketId, 1)) - .withMessage(BUCKET_LESS_THAN_LOWER_BOUND_MESSAGE, negativeBucketId); - - int tooBigBucketId = DEFAULT_NUM_BUCKETS; - assertThatExceptionOfType(IllegalArgumentException.class) - .isThrownBy(() -> bucketPartitioner.partition(tooBigBucketId, 1)) - .withMessage(BUCKET_GREATER_THAN_UPPER_BOUND_MESSAGE, tooBigBucketId, DEFAULT_NUM_BUCKETS); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionerFlinkIcebergSink.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionerFlinkIcebergSink.java deleted file mode 100644 index ba0ea867ffb7..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionerFlinkIcebergSink.java +++ /dev/null @@ -1,227 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink; - -import static org.apache.iceberg.flink.MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG; -import static org.apache.iceberg.flink.TestFixtures.DATABASE; -import static org.apache.iceberg.flink.TestFixtures.TABLE_IDENTIFIER; -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.IOException; -import java.util.Arrays; -import java.util.List; -import java.util.Map; -import org.apache.flink.api.common.typeinfo.TypeInformation; -import org.apache.flink.api.java.typeutils.RowTypeInfo; -import org.apache.flink.runtime.testutils.MiniClusterResourceConfiguration; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.util.DataFormatConverters; -import org.apache.flink.test.junit5.MiniClusterExtension; -import org.apache.flink.types.Row; -import org.apache.iceberg.DistributionMode; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.FileScanTask; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.flink.HadoopCatalogExtension; -import org.apache.iceberg.flink.SimpleDataUtil; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.flink.sink.TestBucketPartitionerUtil.TableSchemaType; -import org.apache.iceberg.flink.source.BoundedTestSource; -import org.apache.iceberg.flink.util.FlinkCompatibilityUtil; -import org.apache.iceberg.io.CloseableIterable; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; 
-import org.junit.jupiter.api.extension.RegisterExtension; -import org.junit.jupiter.params.ParameterizedTest; -import org.junit.jupiter.params.provider.EnumSource; - -public class TestBucketPartitionerFlinkIcebergSink { - - private static final int NUMBER_TASK_MANAGERS = 1; - private static final int SLOTS_PER_TASK_MANAGER = 8; - - @RegisterExtension - private static final MiniClusterExtension MINI_CLUSTER_EXTENSION = - new MiniClusterExtension( - new MiniClusterResourceConfiguration.Builder() - .setNumberTaskManagers(NUMBER_TASK_MANAGERS) - .setNumberSlotsPerTaskManager(SLOTS_PER_TASK_MANAGER) - .setConfiguration(DISABLE_CLASSLOADER_CHECK_CONFIG) - .build()); - - @RegisterExtension - private static final HadoopCatalogExtension CATALOG_EXTENSION = - new HadoopCatalogExtension(DATABASE, TestFixtures.TABLE); - - private static final TypeInformation ROW_TYPE_INFO = - new RowTypeInfo(SimpleDataUtil.FLINK_SCHEMA.getFieldTypes()); - - // Parallelism = 8 (parallelism > numBuckets) throughout the test suite - private final int parallelism = NUMBER_TASK_MANAGERS * SLOTS_PER_TASK_MANAGER; - private final FileFormat format = FileFormat.PARQUET; - private final int numBuckets = 4; - - private Table table; - private StreamExecutionEnvironment env; - private TableLoader tableLoader; - - private void setupEnvironment(TableSchemaType tableSchemaType) { - PartitionSpec partitionSpec = tableSchemaType.getPartitionSpec(numBuckets); - table = - CATALOG_EXTENSION - .catalog() - .createTable( - TABLE_IDENTIFIER, - SimpleDataUtil.SCHEMA, - partitionSpec, - ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name())); - env = - StreamExecutionEnvironment.getExecutionEnvironment(DISABLE_CLASSLOADER_CHECK_CONFIG) - .enableCheckpointing(100) - .setParallelism(parallelism) - .setMaxParallelism(parallelism * 2); - tableLoader = CATALOG_EXTENSION.tableLoader(); - } - - private void appendRowsToTable(List allRows) throws Exception { - DataFormatConverters.RowConverter converter = - new DataFormatConverters.RowConverter(SimpleDataUtil.FLINK_SCHEMA.getFieldDataTypes()); - - DataStream dataStream = - env.addSource( - new BoundedTestSource<>( - allRows.stream().map(converter::toExternal).toArray(Row[]::new)), - ROW_TYPE_INFO) - .map(converter::toInternal, FlinkCompatibilityUtil.toTypeInfo(SimpleDataUtil.ROW_TYPE)) - .partitionCustom( - new BucketPartitioner(table.spec()), - new BucketPartitionKeySelector( - table.spec(), - table.schema(), - FlinkSink.toFlinkRowType(table.schema(), SimpleDataUtil.FLINK_SCHEMA))); - - FlinkSink.forRowData(dataStream) - .table(table) - .tableLoader(tableLoader) - .writeParallelism(parallelism) - .distributionMode(DistributionMode.NONE) - .append(); - - env.execute("Test Iceberg DataStream"); - - SimpleDataUtil.assertTableRows(table, allRows); - } - - @ParameterizedTest - @EnumSource( - value = TableSchemaType.class, - names = {"ONE_BUCKET", "IDENTITY_AND_BUCKET"}) - public void testSendRecordsToAllBucketsEvenly(TableSchemaType tableSchemaType) throws Exception { - setupEnvironment(tableSchemaType); - List rows = generateTestDataRows(); - - appendRowsToTable(rows); - TableTestStats stats = extractPartitionResults(tableSchemaType); - - assertThat(stats.totalRowCount).isEqualTo(rows.size()); - // All 4 buckets should've been written to - assertThat(stats.writersPerBucket.size()).isEqualTo(numBuckets); - assertThat(stats.numFilesPerBucket.size()).isEqualTo(numBuckets); - // Writer expectation (2 writers per bucket): - // - Bucket0 -> Writers [0, 4] - // - Bucket1 -> Writers [1, 5] - // - 
Bucket2 -> Writers [2, 6] - // - Bucket3 -> Writers [3, 7] - for (int i = 0, j = numBuckets; i < numBuckets; i++, j++) { - assertThat(stats.writersPerBucket.get(i)).hasSameElementsAs(Arrays.asList(i, j)); - // 2 files per bucket (one file is created by each writer) - assertThat(stats.numFilesPerBucket.get(i)).isEqualTo(2); - // 2 rows per file (total of 16 rows across 8 files) - assertThat(stats.rowsPerWriter.get(i)).isEqualTo(2); - } - } - - /** - * Generating 16 rows to be sent uniformly to all writers (round-robin across 8 writers -> 4 - * buckets) - */ - private List generateTestDataRows() { - int totalNumRows = parallelism * 2; - int numRowsPerBucket = totalNumRows / numBuckets; - return TestBucketPartitionerUtil.generateRowsForBucketIdRange(numRowsPerBucket, numBuckets); - } - - private TableTestStats extractPartitionResults(TableSchemaType tableSchemaType) - throws IOException { - int totalRecordCount = 0; - Map> writersPerBucket = Maps.newHashMap(); // > - Map filesPerBucket = Maps.newHashMap(); // - Map rowsPerWriter = Maps.newHashMap(); // - - try (CloseableIterable fileScanTasks = table.newScan().planFiles()) { - for (FileScanTask scanTask : fileScanTasks) { - long recordCountInFile = scanTask.file().recordCount(); - - String[] splitFilePath = scanTask.file().path().toString().split("/"); - // Filename example: 00007-0-a7d3a29a-33e9-4740-88f4-0f494397d60c-00001.parquet - // Writer ID: .......^^^^^ - String filename = splitFilePath[splitFilePath.length - 1]; - int writerId = Integer.parseInt(filename.split("-")[0]); - - totalRecordCount += recordCountInFile; - int bucketId = - scanTask - .file() - .partition() - .get(tableSchemaType.bucketPartitionColumnPosition(), Integer.class); - writersPerBucket.computeIfAbsent(bucketId, k -> Lists.newArrayList()); - writersPerBucket.get(bucketId).add(writerId); - filesPerBucket.put(bucketId, filesPerBucket.getOrDefault(bucketId, 0) + 1); - rowsPerWriter.put(writerId, rowsPerWriter.getOrDefault(writerId, 0L) + recordCountInFile); - } - } - - return new TableTestStats(totalRecordCount, writersPerBucket, filesPerBucket, rowsPerWriter); - } - - /** DTO to hold Test Stats */ - private static class TableTestStats { - final int totalRowCount; - final Map> writersPerBucket; - final Map numFilesPerBucket; - final Map rowsPerWriter; - - TableTestStats( - int totalRecordCount, - Map> writersPerBucket, - Map numFilesPerBucket, - Map rowsPerWriter) { - this.totalRowCount = totalRecordCount; - this.writersPerBucket = writersPerBucket; - this.numFilesPerBucket = numFilesPerBucket; - this.rowsPerWriter = rowsPerWriter; - } - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionerUtil.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionerUtil.java deleted file mode 100644 index e1309bfac6d5..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestBucketPartitionerUtil.java +++ /dev/null @@ -1,126 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink; - -import java.util.List; -import java.util.UUID; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.flink.SimpleDataUtil; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.util.BucketUtil; - -final class TestBucketPartitionerUtil { - - enum TableSchemaType { - ONE_BUCKET { - @Override - public int bucketPartitionColumnPosition() { - return 0; - } - - @Override - public PartitionSpec getPartitionSpec(int numBuckets) { - return PartitionSpec.builderFor(SimpleDataUtil.SCHEMA).bucket("data", numBuckets).build(); - } - }, - IDENTITY_AND_BUCKET { - @Override - public int bucketPartitionColumnPosition() { - return 1; - } - - @Override - public PartitionSpec getPartitionSpec(int numBuckets) { - return PartitionSpec.builderFor(SimpleDataUtil.SCHEMA) - .identity("id") - .bucket("data", numBuckets) - .build(); - } - }, - TWO_BUCKETS { - @Override - public int bucketPartitionColumnPosition() { - return 1; - } - - @Override - public PartitionSpec getPartitionSpec(int numBuckets) { - return PartitionSpec.builderFor(SimpleDataUtil.SCHEMA) - .bucket("id", numBuckets) - .bucket("data", numBuckets) - .build(); - } - }; - - public abstract int bucketPartitionColumnPosition(); - - public abstract PartitionSpec getPartitionSpec(int numBuckets); - } - - private TestBucketPartitionerUtil() {} - - /** - * Utility method to generate rows whose values will "hash" to a range of bucketIds (from 0 to - * numBuckets - 1) - * - * @param numRowsPerBucket how many different rows should be generated per bucket - * @param numBuckets max number of buckets to consider - * @return the list of rows whose data "hashes" to the desired bucketId - */ - static List generateRowsForBucketIdRange(int numRowsPerBucket, int numBuckets) { - List rows = Lists.newArrayListWithCapacity(numBuckets * numRowsPerBucket); - // For some of our tests, this order of the generated rows matters - for (int i = 0; i < numRowsPerBucket; i++) { - for (int bucketId = 0; bucketId < numBuckets; bucketId++) { - String value = generateValueForBucketId(bucketId, numBuckets); - rows.add(GenericRowData.of(1, StringData.fromString(value))); - } - } - return rows; - } - - /** - * Utility method to generate a UUID string that will "hash" to a desired bucketId - * - * @param bucketId the desired bucketId - * @return the string data that "hashes" to the desired bucketId - */ - private static String generateValueForBucketId(int bucketId, int numBuckets) { - while (true) { - String uuid = UUID.randomUUID().toString(); - if (computeBucketId(numBuckets, uuid) == bucketId) { - return uuid; - } - } - } - - /** - * Utility that performs the same hashing/bucketing mechanism used by Bucket.java - * - * @param numBuckets max number of buckets to consider - * @param value the string to compute the bucketId from - * @return the computed bucketId - */ - static int computeBucketId(int numBuckets, String value) { - 
return (BucketUtil.hash(value) & Integer.MAX_VALUE) % numBuckets; - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestCachingTableSupplier.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestCachingTableSupplier.java deleted file mode 100644 index 360db658cd2f..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestCachingTableSupplier.java +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink; - -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; -import static org.mockito.Mockito.mock; -import static org.mockito.Mockito.when; - -import java.time.Duration; -import java.util.concurrent.TimeUnit; -import org.apache.iceberg.SerializableTable; -import org.apache.iceberg.Table; -import org.apache.iceberg.flink.TableLoader; -import org.awaitility.Awaitility; -import org.junit.jupiter.api.Test; - -public class TestCachingTableSupplier { - - @Test - public void testCheckArguments() { - SerializableTable initialTable = mock(SerializableTable.class); - - Table loadedTable = mock(Table.class); - TableLoader tableLoader = mock(TableLoader.class); - when(tableLoader.loadTable()).thenReturn(loadedTable); - - new CachingTableSupplier(initialTable, tableLoader, Duration.ofMillis(100)); - - assertThatThrownBy(() -> new CachingTableSupplier(initialTable, tableLoader, null)) - .isInstanceOf(IllegalArgumentException.class) - .hasMessage("tableRefreshInterval cannot be null"); - assertThatThrownBy(() -> new CachingTableSupplier(null, tableLoader, Duration.ofMillis(100))) - .isInstanceOf(IllegalArgumentException.class) - .hasMessage("initialTable cannot be null"); - assertThatThrownBy(() -> new CachingTableSupplier(initialTable, null, Duration.ofMillis(100))) - .isInstanceOf(IllegalArgumentException.class) - .hasMessage("tableLoader cannot be null"); - } - - @Test - public void testTableReload() { - SerializableTable initialTable = mock(SerializableTable.class); - - Table loadedTable = mock(Table.class); - TableLoader tableLoader = mock(TableLoader.class); - when(tableLoader.loadTable()).thenReturn(loadedTable); - - CachingTableSupplier cachingTableSupplier = - new CachingTableSupplier(initialTable, tableLoader, Duration.ofMillis(100)); - - // refresh shouldn't do anything as the min reload interval hasn't passed - cachingTableSupplier.refreshTable(); - assertThat(cachingTableSupplier.get()).isEqualTo(initialTable); - - // refresh after waiting past the min reload interval - Awaitility.await() - .atLeast(100, TimeUnit.MILLISECONDS) - .untilAsserted( - () -> { - cachingTableSupplier.refreshTable(); - 
assertThat(cachingTableSupplier.get()).isEqualTo(loadedTable); - }); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestCompressionSettings.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestCompressionSettings.java deleted file mode 100644 index 8faae1b05a4e..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestCompressionSettings.java +++ /dev/null @@ -1,257 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.File; -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.Map; -import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness; -import org.apache.flink.table.api.TableSchema; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.Parameter; -import org.apache.iceberg.ParameterizedTestExtension; -import org.apache.iceberg.Parameters; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.common.DynFields; -import org.apache.iceberg.flink.FlinkWriteConf; -import org.apache.iceberg.flink.FlinkWriteOptions; -import org.apache.iceberg.flink.SimpleDataUtil; -import org.apache.iceberg.io.BaseTaskWriter; -import org.apache.iceberg.io.TaskWriter; -import org.apache.iceberg.io.WriteResult; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.TestTemplate; -import org.junit.jupiter.api.extension.ExtendWith; -import org.junit.jupiter.api.io.TempDir; - -@ExtendWith(ParameterizedTestExtension.class) -public class TestCompressionSettings { - @TempDir protected Path temporaryFolder; - - private Table table; - - @Parameter(index = 0) - private Map initProperties; - - @Parameters(name = "tableProperties = {0}") - public static Object[][] parameters() { - return new Object[][] { - new Object[] {ImmutableMap.of()}, - new Object[] { - ImmutableMap.of( - TableProperties.AVRO_COMPRESSION, - "zstd", - TableProperties.AVRO_COMPRESSION_LEVEL, - "3", - TableProperties.PARQUET_COMPRESSION, - "zstd", - TableProperties.PARQUET_COMPRESSION_LEVEL, - "3", - TableProperties.ORC_COMPRESSION, - "zstd", - TableProperties.ORC_COMPRESSION_STRATEGY, - "compression") - } - }; - } - - @BeforeEach - public void before() throws IOException { - File folder = Files.createTempDirectory(temporaryFolder, "junit").toFile(); - table = SimpleDataUtil.createTable(folder.getAbsolutePath(), initProperties, false); - } - - @TestTemplate - public void testCompressionAvro() throws Exception { - // No 
override provided - Map resultProperties = - appenderProperties( - table, - SimpleDataUtil.FLINK_SCHEMA, - ImmutableMap.of(FlinkWriteOptions.WRITE_FORMAT.key(), "AVRO")); - - if (initProperties.get(TableProperties.AVRO_COMPRESSION) == null) { - assertThat(resultProperties) - .containsEntry(TableProperties.AVRO_COMPRESSION, TableProperties.AVRO_COMPRESSION_DEFAULT) - .doesNotContainKey(TableProperties.AVRO_COMPRESSION_LEVEL); - } else { - assertThat(resultProperties) - .containsEntry( - TableProperties.AVRO_COMPRESSION, - initProperties.get(TableProperties.AVRO_COMPRESSION)) - .containsEntry( - TableProperties.AVRO_COMPRESSION_LEVEL, - initProperties.get(TableProperties.AVRO_COMPRESSION_LEVEL)); - } - - // Override compression to snappy and some random level - resultProperties = - appenderProperties( - table, - SimpleDataUtil.FLINK_SCHEMA, - ImmutableMap.of( - FlinkWriteOptions.WRITE_FORMAT.key(), - "AVRO", - FlinkWriteOptions.COMPRESSION_CODEC.key(), - "snappy", - FlinkWriteOptions.COMPRESSION_LEVEL.key(), - "6")); - - assertThat(resultProperties) - .containsEntry(TableProperties.AVRO_COMPRESSION, "snappy") - .containsEntry(TableProperties.AVRO_COMPRESSION_LEVEL, "6"); - } - - @TestTemplate - public void testCompressionParquet() throws Exception { - // No override provided - Map resultProperties = - appenderProperties( - table, - SimpleDataUtil.FLINK_SCHEMA, - ImmutableMap.of(FlinkWriteOptions.WRITE_FORMAT.key(), "PARQUET")); - - if (initProperties.get(TableProperties.PARQUET_COMPRESSION) == null) { - assertThat(resultProperties) - .containsEntry( - TableProperties.PARQUET_COMPRESSION, - TableProperties.PARQUET_COMPRESSION_DEFAULT_SINCE_1_4_0) - .doesNotContainKey(TableProperties.PARQUET_COMPRESSION_LEVEL); - } else { - assertThat(resultProperties) - .containsEntry( - TableProperties.PARQUET_COMPRESSION, - initProperties.get(TableProperties.PARQUET_COMPRESSION)) - .containsEntry( - TableProperties.PARQUET_COMPRESSION_LEVEL, - initProperties.get(TableProperties.PARQUET_COMPRESSION_LEVEL)); - } - - // Override compression to snappy and some random level - resultProperties = - appenderProperties( - table, - SimpleDataUtil.FLINK_SCHEMA, - ImmutableMap.of( - FlinkWriteOptions.WRITE_FORMAT.key(), - "PARQUET", - FlinkWriteOptions.COMPRESSION_CODEC.key(), - "snappy", - FlinkWriteOptions.COMPRESSION_LEVEL.key(), - "6")); - - assertThat(resultProperties) - .containsEntry(TableProperties.PARQUET_COMPRESSION, "snappy") - .containsEntry(TableProperties.PARQUET_COMPRESSION_LEVEL, "6"); - } - - @TestTemplate - public void testCompressionOrc() throws Exception { - // No override provided - Map resultProperties = - appenderProperties( - table, - SimpleDataUtil.FLINK_SCHEMA, - ImmutableMap.of(FlinkWriteOptions.WRITE_FORMAT.key(), "ORC")); - - if (initProperties.get(TableProperties.ORC_COMPRESSION) == null) { - assertThat(resultProperties) - .containsEntry(TableProperties.ORC_COMPRESSION, TableProperties.ORC_COMPRESSION_DEFAULT) - .containsEntry( - TableProperties.ORC_COMPRESSION_STRATEGY, - TableProperties.ORC_COMPRESSION_STRATEGY_DEFAULT); - } else { - assertThat(resultProperties) - .containsEntry( - TableProperties.ORC_COMPRESSION, initProperties.get(TableProperties.ORC_COMPRESSION)) - .containsEntry( - TableProperties.ORC_COMPRESSION_STRATEGY, - initProperties.get(TableProperties.ORC_COMPRESSION_STRATEGY)); - } - - // Override compression to snappy and a different strategy - resultProperties = - appenderProperties( - table, - SimpleDataUtil.FLINK_SCHEMA, - ImmutableMap.of( - 
FlinkWriteOptions.WRITE_FORMAT.key(), - "ORC", - FlinkWriteOptions.COMPRESSION_CODEC.key(), - "snappy", - FlinkWriteOptions.COMPRESSION_STRATEGY.key(), - "speed")); - - assertThat(resultProperties) - .containsEntry(TableProperties.ORC_COMPRESSION, "snappy") - .containsEntry(TableProperties.ORC_COMPRESSION_STRATEGY, "speed"); - } - - private static OneInputStreamOperatorTestHarness createIcebergStreamWriter( - Table icebergTable, TableSchema flinkSchema, Map override) throws Exception { - RowType flinkRowType = FlinkSink.toFlinkRowType(icebergTable.schema(), flinkSchema); - FlinkWriteConf flinkWriteConfig = - new FlinkWriteConf( - icebergTable, override, new org.apache.flink.configuration.Configuration()); - - IcebergStreamWriter streamWriter = - FlinkSink.createStreamWriter(() -> icebergTable, flinkWriteConfig, flinkRowType, null); - OneInputStreamOperatorTestHarness harness = - new OneInputStreamOperatorTestHarness<>(streamWriter, 1, 1, 0); - - harness.setup(); - harness.open(); - - return harness; - } - - private static Map appenderProperties( - Table table, TableSchema schema, Map override) throws Exception { - try (OneInputStreamOperatorTestHarness testHarness = - createIcebergStreamWriter(table, schema, override)) { - testHarness.processElement(SimpleDataUtil.createRowData(1, "hello"), 1); - - testHarness.prepareSnapshotPreBarrier(1L); - DynFields.BoundField operatorField = - DynFields.builder() - .hiddenImpl(testHarness.getOperatorFactory().getClass(), "operator") - .build(testHarness.getOperatorFactory()); - DynFields.BoundField writerField = - DynFields.builder() - .hiddenImpl(IcebergStreamWriter.class, "writer") - .build(operatorField.get()); - DynFields.BoundField appenderField = - DynFields.builder() - .hiddenImpl(BaseTaskWriter.class, "appenderFactory") - .build(writerField.get()); - DynFields.BoundField> propsField = - DynFields.builder() - .hiddenImpl(FlinkAppenderFactory.class, "props") - .build(appenderField.get()); - return propsField.get(); - } - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestDeltaTaskWriter.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestDeltaTaskWriter.java deleted file mode 100644 index 21f3ee2c655a..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestDeltaTaskWriter.java +++ /dev/null @@ -1,429 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink; - -import static org.apache.iceberg.flink.SimpleDataUtil.createDelete; -import static org.apache.iceberg.flink.SimpleDataUtil.createInsert; -import static org.apache.iceberg.flink.SimpleDataUtil.createRecord; -import static org.apache.iceberg.flink.SimpleDataUtil.createUpdateAfter; -import static org.apache.iceberg.flink.SimpleDataUtil.createUpdateBefore; -import static org.apache.iceberg.types.Types.NestedField.required; -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.File; -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.Paths; -import java.time.OffsetDateTime; -import java.util.Arrays; -import java.util.List; -import java.util.stream.Collectors; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.TimestampData; -import org.apache.flink.table.runtime.typeutils.RowDataSerializer; -import org.apache.flink.table.types.logical.IntType; -import org.apache.flink.table.types.logical.LocalZonedTimestampType; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.types.RowKind; -import org.apache.iceberg.ContentFile; -import org.apache.iceberg.FileContent; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Parameter; -import org.apache.iceberg.ParameterizedTestExtension; -import org.apache.iceberg.Parameters; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.RowDelta; -import org.apache.iceberg.Schema; -import org.apache.iceberg.SerializableTable; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.TestBase; -import org.apache.iceberg.TestTables; -import org.apache.iceberg.data.GenericRecord; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.SimpleDataUtil; -import org.apache.iceberg.io.TaskWriter; -import org.apache.iceberg.io.WriteResult; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.apache.iceberg.types.Types; -import org.apache.iceberg.util.StructLikeSet; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.TestTemplate; -import org.junit.jupiter.api.extension.ExtendWith; - -@ExtendWith(ParameterizedTestExtension.class) -public class TestDeltaTaskWriter extends TestBase { - - @Parameter(index = 1) - private FileFormat format; - - @Parameters(name = "formatVersion = {0}, fileFormat = {1}") - protected static List parameters() { - return Arrays.asList( - new Object[] {2, FileFormat.AVRO}, - new Object[] {2, FileFormat.ORC}, - new Object[] {2, FileFormat.PARQUET}); - } - - @Override - @BeforeEach - public void setupTable() throws IOException { - this.tableDir = Files.createTempDirectory(temp, "junit").toFile(); - assertThat(tableDir.delete()).isTrue(); // created by table create - - this.metadataDir = new File(tableDir, "metadata"); - } - - private int idFieldId() { - return table.schema().findField("id").fieldId(); - } - - private int dataFieldId() { - return table.schema().findField("data").fieldId(); - } - - private void testCdcEvents(boolean partitioned) throws IOException { - List equalityFieldIds = Lists.newArrayList(idFieldId()); - TaskWriterFactory taskWriterFactory = createTaskWriterFactory(equalityFieldIds); - 
taskWriterFactory.initialize(1, 1); - - // Start the 1th transaction. - TaskWriter writer = taskWriterFactory.create(); - - writer.write(createInsert(1, "aaa")); - writer.write(createInsert(2, "bbb")); - writer.write(createInsert(3, "ccc")); - - // Update <2, 'bbb'> to <2, 'ddd'> - writer.write(createUpdateBefore(2, "bbb")); // 1 pos-delete and 1 eq-delete. - writer.write(createUpdateAfter(2, "ddd")); - - // Update <1, 'aaa'> to <1, 'eee'> - writer.write(createUpdateBefore(1, "aaa")); // 1 pos-delete and 1 eq-delete. - writer.write(createUpdateAfter(1, "eee")); - - // Insert <4, 'fff'> - writer.write(createInsert(4, "fff")); - // Insert <5, 'ggg'> - writer.write(createInsert(5, "ggg")); - - // Delete <3, 'ccc'> - writer.write(createDelete(3, "ccc")); // 1 pos-delete and 1 eq-delete. - - WriteResult result = writer.complete(); - assertThat(result.dataFiles()).hasSize(partitioned ? 7 : 1); - assertThat(result.deleteFiles()).hasSize(partitioned ? 3 : 1); - commitTransaction(result); - - assertThat(actualRowSet("*")) - .isEqualTo( - expectedRowSet( - createRecord(1, "eee"), - createRecord(2, "ddd"), - createRecord(4, "fff"), - createRecord(5, "ggg"))); - - // Start the 2nd transaction. - writer = taskWriterFactory.create(); - - // Update <2, 'ddd'> to <6, 'hhh'> - (Update both key and value) - writer.write(createUpdateBefore(2, "ddd")); // 1 eq-delete - writer.write(createUpdateAfter(6, "hhh")); - - // Update <5, 'ggg'> to <5, 'iii'> - writer.write(createUpdateBefore(5, "ggg")); // 1 eq-delete - writer.write(createUpdateAfter(5, "iii")); - - // Delete <4, 'fff'> - writer.write(createDelete(4, "fff")); // 1 eq-delete. - - result = writer.complete(); - assertThat(result.dataFiles()).hasSize(partitioned ? 2 : 1); - assertThat(result.deleteFiles()).hasSize(partitioned ? 3 : 1); - commitTransaction(result); - - assertThat(actualRowSet("*")) - .isEqualTo( - expectedRowSet(createRecord(1, "eee"), createRecord(5, "iii"), createRecord(6, "hhh"))); - } - - @TestTemplate - public void testUnpartitioned() throws IOException { - createAndInitTable(false); - testCdcEvents(false); - } - - @TestTemplate - public void testPartitioned() throws IOException { - createAndInitTable(true); - testCdcEvents(true); - } - - private void testWritePureEqDeletes(boolean partitioned) throws IOException { - createAndInitTable(partitioned); - List equalityFieldIds = Lists.newArrayList(idFieldId()); - TaskWriterFactory taskWriterFactory = createTaskWriterFactory(equalityFieldIds); - taskWriterFactory.initialize(1, 1); - - TaskWriter writer = taskWriterFactory.create(); - writer.write(createDelete(1, "aaa")); - writer.write(createDelete(2, "bbb")); - writer.write(createDelete(3, "ccc")); - - WriteResult result = writer.complete(); - assertThat(result.dataFiles()).isEmpty(); - assertThat(result.deleteFiles()).hasSize(partitioned ? 
3 : 1); - commitTransaction(result); - - assertThat(actualRowSet("*")).isEqualTo(expectedRowSet()); - } - - @TestTemplate - public void testUnpartitionedPureEqDeletes() throws IOException { - testWritePureEqDeletes(false); - } - - @TestTemplate - public void testPartitionedPureEqDeletes() throws IOException { - testWritePureEqDeletes(true); - } - - private void testAbort(boolean partitioned) throws IOException { - createAndInitTable(partitioned); - List equalityFieldIds = Lists.newArrayList(idFieldId()); - TaskWriterFactory taskWriterFactory = createTaskWriterFactory(equalityFieldIds); - taskWriterFactory.initialize(1, 1); - - TaskWriter writer = taskWriterFactory.create(); - for (int i = 0; i < 8_000; i += 2) { - writer.write(createUpdateBefore(i + 1, "aaa")); - writer.write(createUpdateAfter(i + 1, "aaa")); - - writer.write(createUpdateBefore(i + 2, "bbb")); - writer.write(createUpdateAfter(i + 2, "bbb")); - } - - // Assert the current data/delete file count. - List files = - Files.walk(Paths.get(tableDir.getPath(), "data")) - .filter(p -> p.toFile().isFile()) - .filter(p -> !p.toString().endsWith(".crc")) - .collect(Collectors.toList()); - assertThat(files).hasSize(partitioned ? 4 : 2); - - writer.abort(); - for (Path file : files) { - assertThat(file).doesNotExist(); - } - } - - @TestTemplate - public void testUnpartitionedAbort() throws IOException { - testAbort(false); - } - - @TestTemplate - public void testPartitionedAbort() throws IOException { - testAbort(true); - } - - @TestTemplate - public void testPartitionedTableWithDataAsKey() throws IOException { - createAndInitTable(true); - List equalityFieldIds = Lists.newArrayList(dataFieldId()); - TaskWriterFactory taskWriterFactory = createTaskWriterFactory(equalityFieldIds); - taskWriterFactory.initialize(1, 1); - - // Start the 1th transaction. - TaskWriter writer = taskWriterFactory.create(); - writer.write(createInsert(1, "aaa")); - writer.write(createInsert(2, "aaa")); - writer.write(createInsert(3, "bbb")); - writer.write(createInsert(4, "ccc")); - - WriteResult result = writer.complete(); - assertThat(result.dataFiles()).hasSize(3); - assertThat(result.deleteFiles()).hasSize(1); - commitTransaction(result); - - assertThat(actualRowSet("*")) - .isEqualTo( - expectedRowSet(createRecord(2, "aaa"), createRecord(3, "bbb"), createRecord(4, "ccc"))); - - // Start the 2nd transaction. - writer = taskWriterFactory.create(); - writer.write(createInsert(5, "aaa")); - writer.write(createInsert(6, "bbb")); - writer.write(createDelete(7, "ccc")); // 1 eq-delete. - - result = writer.complete(); - assertThat(result.dataFiles()).hasSize(2); - assertThat(result.deleteFiles()).hasSize(1); - commitTransaction(result); - - assertThat(actualRowSet("*")) - .isEqualTo( - expectedRowSet( - createRecord(2, "aaa"), - createRecord(5, "aaa"), - createRecord(3, "bbb"), - createRecord(6, "bbb"))); - } - - @TestTemplate - public void testPartitionedTableWithDataAndIdAsKey() throws IOException { - createAndInitTable(true); - List equalityFieldIds = Lists.newArrayList(dataFieldId(), idFieldId()); - TaskWriterFactory taskWriterFactory = createTaskWriterFactory(equalityFieldIds); - taskWriterFactory.initialize(1, 1); - - TaskWriter writer = taskWriterFactory.create(); - writer.write(createInsert(1, "aaa")); - writer.write(createInsert(2, "aaa")); - - writer.write(createDelete(2, "aaa")); // 1 pos-delete. 
- - WriteResult result = writer.complete(); - assertThat(result.dataFiles()).hasSize(1); - assertThat(result.deleteFiles()).hasSize(1); - assertThat(result.deleteFiles()[0].content()).isEqualTo(FileContent.POSITION_DELETES); - commitTransaction(result); - - assertThat(actualRowSet("*")).isEqualTo(expectedRowSet(createRecord(1, "aaa"))); - } - - @TestTemplate - public void testEqualityColumnOnCustomPrecisionTSColumn() throws IOException { - Schema tableSchema = - new Schema( - required(3, "id", Types.IntegerType.get()), - required(4, "ts", Types.TimestampType.withZone())); - RowType flinkType = - new RowType( - false, - ImmutableList.of( - new RowType.RowField("id", new IntType()), - new RowType.RowField("ts", new LocalZonedTimestampType(3)))); - - this.table = create(tableSchema, PartitionSpec.unpartitioned()); - initTable(table); - - List equalityIds = ImmutableList.of(table.schema().findField("ts").fieldId()); - TaskWriterFactory taskWriterFactory = createTaskWriterFactory(flinkType, equalityIds); - taskWriterFactory.initialize(1, 1); - - TaskWriter writer = taskWriterFactory.create(); - RowDataSerializer serializer = new RowDataSerializer(flinkType); - OffsetDateTime start = OffsetDateTime.now(); - writer.write( - serializer.toBinaryRow( - GenericRowData.ofKind( - RowKind.INSERT, 1, TimestampData.fromInstant(start.toInstant())))); - writer.write( - serializer.toBinaryRow( - GenericRowData.ofKind( - RowKind.INSERT, 2, TimestampData.fromInstant(start.plusSeconds(1).toInstant())))); - writer.write( - serializer.toBinaryRow( - GenericRowData.ofKind( - RowKind.DELETE, 2, TimestampData.fromInstant(start.plusSeconds(1).toInstant())))); - - WriteResult result = writer.complete(); - // One data file - assertThat(result.dataFiles()).hasSize(1); - // One eq delete file + one pos delete file - assertThat(result.deleteFiles()).hasSize(2); - assertThat( - Arrays.stream(result.deleteFiles()) - .map(ContentFile::content) - .collect(Collectors.toSet())) - .isEqualTo(Sets.newHashSet(FileContent.POSITION_DELETES, FileContent.EQUALITY_DELETES)); - commitTransaction(result); - - Record expectedRecord = GenericRecord.create(tableSchema); - expectedRecord.setField("id", 1); - int cutPrecisionNano = start.getNano() / 1000000 * 1000000; - expectedRecord.setField("ts", start.withNano(cutPrecisionNano)); - - assertThat(actualRowSet("*")).isEqualTo(expectedRowSet(expectedRecord)); - } - - private void commitTransaction(WriteResult result) { - RowDelta rowDelta = table.newRowDelta(); - Arrays.stream(result.dataFiles()).forEach(rowDelta::addRows); - Arrays.stream(result.deleteFiles()).forEach(rowDelta::addDeletes); - rowDelta - .validateDeletedFiles() - .validateDataFilesExist(Lists.newArrayList(result.referencedDataFiles())) - .commit(); - } - - private StructLikeSet expectedRowSet(Record... records) { - return SimpleDataUtil.expectedRowSet(table, records); - } - - private StructLikeSet actualRowSet(String... 
columns) throws IOException { - return SimpleDataUtil.actualRowSet(table, columns); - } - - private TaskWriterFactory createTaskWriterFactory(List equalityFieldIds) { - return new RowDataTaskWriterFactory( - SerializableTable.copyOf(table), - FlinkSchemaUtil.convert(table.schema()), - 128 * 1024 * 1024, - format, - table.properties(), - equalityFieldIds, - false); - } - - private TaskWriterFactory createTaskWriterFactory( - RowType flinkType, List equalityFieldIds) { - return new RowDataTaskWriterFactory( - SerializableTable.copyOf(table), - flinkType, - 128 * 1024 * 1024, - format, - table.properties(), - equalityFieldIds, - true); - } - - private void createAndInitTable(boolean partitioned) { - if (partitioned) { - this.table = create(SCHEMA, PartitionSpec.builderFor(SCHEMA).identity("data").build()); - } else { - this.table = create(SCHEMA, PartitionSpec.unpartitioned()); - } - - initTable(table); - } - - private void initTable(TestTables.TestTable testTable) { - testTable - .updateProperties() - .set(TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES, String.valueOf(8 * 1024)) - .defaultFormat(format) - .commit(); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkAppenderFactory.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkAppenderFactory.java deleted file mode 100644 index dd89f43483b0..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkAppenderFactory.java +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink; - -import java.util.List; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.Schema; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.RowDataWrapper; -import org.apache.iceberg.flink.SimpleDataUtil; -import org.apache.iceberg.io.FileAppenderFactory; -import org.apache.iceberg.io.TestAppenderFactory; -import org.apache.iceberg.util.ArrayUtil; -import org.apache.iceberg.util.StructLikeSet; - -public class TestFlinkAppenderFactory extends TestAppenderFactory { - - private final RowType rowType = FlinkSchemaUtil.convert(SCHEMA); - - @Override - protected FileAppenderFactory createAppenderFactory( - List equalityFieldIds, Schema eqDeleteSchema, Schema posDeleteRowSchema) { - return new FlinkAppenderFactory( - table, - table.schema(), - rowType, - table.properties(), - table.spec(), - ArrayUtil.toIntArray(equalityFieldIds), - eqDeleteSchema, - posDeleteRowSchema); - } - - @Override - protected RowData createRow(Integer id, String data) { - return SimpleDataUtil.createRowData(id, data); - } - - @Override - protected StructLikeSet expectedRowSet(Iterable rows) { - StructLikeSet set = StructLikeSet.create(table.schema().asStruct()); - for (RowData row : rows) { - RowDataWrapper wrapper = new RowDataWrapper(rowType, table.schema().asStruct()); - set.add(wrapper.wrap(row)); - } - return set; - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkFileWriterFactory.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkFileWriterFactory.java deleted file mode 100644 index 414ee40d1357..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkFileWriterFactory.java +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink; - -import java.util.List; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.Schema; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.RowDataWrapper; -import org.apache.iceberg.flink.SimpleDataUtil; -import org.apache.iceberg.io.FileWriterFactory; -import org.apache.iceberg.io.TestFileWriterFactory; -import org.apache.iceberg.util.ArrayUtil; -import org.apache.iceberg.util.StructLikeSet; - -public class TestFlinkFileWriterFactory extends TestFileWriterFactory { - - @Override - protected FileWriterFactory newWriterFactory( - Schema dataSchema, - List equalityFieldIds, - Schema equalityDeleteRowSchema, - Schema positionDeleteRowSchema) { - return FlinkFileWriterFactory.builderFor(table) - .dataSchema(table.schema()) - .dataFileFormat(format()) - .deleteFileFormat(format()) - .equalityFieldIds(ArrayUtil.toIntArray(equalityFieldIds)) - .equalityDeleteRowSchema(equalityDeleteRowSchema) - .positionDeleteRowSchema(positionDeleteRowSchema) - .build(); - } - - @Override - protected RowData toRow(Integer id, String data) { - return SimpleDataUtil.createRowData(id, data); - } - - @Override - protected StructLikeSet toSet(Iterable rows) { - StructLikeSet set = StructLikeSet.create(table.schema().asStruct()); - RowType flinkType = FlinkSchemaUtil.convert(table.schema()); - for (RowData row : rows) { - RowDataWrapper wrapper = new RowDataWrapper(flinkType, table.schema().asStruct()); - set.add(wrapper.wrap(row)); - } - return set; - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSink.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSink.java deleted file mode 100644 index 61ab087f2ca3..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSink.java +++ /dev/null @@ -1,385 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink; - -import static org.apache.iceberg.flink.TestFixtures.DATABASE; -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; - -import java.io.IOException; -import java.util.Collections; -import java.util.List; -import java.util.Map; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.table.api.TableSchema; -import org.apache.flink.table.data.RowData; -import org.apache.flink.test.junit5.MiniClusterExtension; -import org.apache.flink.types.Row; -import org.apache.iceberg.DistributionMode; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Parameter; -import org.apache.iceberg.ParameterizedTestExtension; -import org.apache.iceberg.Parameters; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.flink.FlinkWriteOptions; -import org.apache.iceberg.flink.HadoopCatalogExtension; -import org.apache.iceberg.flink.MiniFlinkClusterExtension; -import org.apache.iceberg.flink.SimpleDataUtil; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.flink.util.FlinkCompatibilityUtil; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.TestTemplate; -import org.junit.jupiter.api.extension.ExtendWith; -import org.junit.jupiter.api.extension.RegisterExtension; - -@ExtendWith(ParameterizedTestExtension.class) -public class TestFlinkIcebergSink extends TestFlinkIcebergSinkBase { - - @RegisterExtension - public static final MiniClusterExtension MINI_CLUSTER_EXTENSION = - MiniFlinkClusterExtension.createWithClassloaderCheckDisabled(); - - @RegisterExtension - private static final HadoopCatalogExtension CATALOG_EXTENSION = - new HadoopCatalogExtension(DATABASE, TestFixtures.TABLE); - - private TableLoader tableLoader; - - @Parameter(index = 0) - private FileFormat format; - - @Parameter(index = 1) - private int parallelism; - - @Parameter(index = 2) - private boolean partitioned; - - @Parameters(name = "format={0}, parallelism = {1}, partitioned = {2}") - public static Object[][] parameters() { - return new Object[][] { - {FileFormat.AVRO, 1, true}, - {FileFormat.AVRO, 1, false}, - {FileFormat.AVRO, 2, true}, - {FileFormat.AVRO, 2, false}, - {FileFormat.ORC, 1, true}, - {FileFormat.ORC, 1, false}, - {FileFormat.ORC, 2, true}, - {FileFormat.ORC, 2, false}, - {FileFormat.PARQUET, 1, true}, - {FileFormat.PARQUET, 1, false}, - {FileFormat.PARQUET, 2, true}, - {FileFormat.PARQUET, 2, false} - }; - } - - @BeforeEach - public void before() throws IOException { - table = - CATALOG_EXTENSION - .catalog() - .createTable( - TestFixtures.TABLE_IDENTIFIER, - SimpleDataUtil.SCHEMA, - partitioned - ? 
PartitionSpec.builderFor(SimpleDataUtil.SCHEMA).identity("data").build() - : PartitionSpec.unpartitioned(), - ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name())); - - env = - StreamExecutionEnvironment.getExecutionEnvironment( - MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG) - .enableCheckpointing(100) - .setParallelism(parallelism) - .setMaxParallelism(parallelism); - - tableLoader = CATALOG_EXTENSION.tableLoader(); - } - - @TestTemplate - public void testWriteRowData() throws Exception { - List rows = Lists.newArrayList(Row.of(1, "hello"), Row.of(2, "world"), Row.of(3, "foo")); - DataStream dataStream = - env.addSource(createBoundedSource(rows), ROW_TYPE_INFO) - .map(CONVERTER::toInternal, FlinkCompatibilityUtil.toTypeInfo(SimpleDataUtil.ROW_TYPE)); - - FlinkSink.forRowData(dataStream) - .table(table) - .tableLoader(tableLoader) - .writeParallelism(parallelism) - .append(); - - // Execute the program. - env.execute("Test Iceberg DataStream"); - - // Assert the iceberg table's records. - SimpleDataUtil.assertTableRows(table, convertToRowData(rows)); - } - - private void testWriteRow(TableSchema tableSchema, DistributionMode distributionMode) - throws Exception { - List rows = createRows(""); - DataStream dataStream = env.addSource(createBoundedSource(rows), ROW_TYPE_INFO); - - FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) - .table(table) - .tableLoader(tableLoader) - .tableSchema(tableSchema) - .writeParallelism(parallelism) - .distributionMode(distributionMode) - .append(); - - // Execute the program. - env.execute("Test Iceberg DataStream."); - - SimpleDataUtil.assertTableRows(table, convertToRowData(rows)); - } - - private int partitionFiles(String partition) throws IOException { - return SimpleDataUtil.partitionDataFiles(table, ImmutableMap.of("data", partition)).size(); - } - - @TestTemplate - public void testWriteRow() throws Exception { - testWriteRow(null, DistributionMode.NONE); - } - - @TestTemplate - public void testWriteRowWithTableSchema() throws Exception { - testWriteRow(SimpleDataUtil.FLINK_SCHEMA, DistributionMode.NONE); - } - - @TestTemplate - public void testJobNoneDistributeMode() throws Exception { - table - .updateProperties() - .set(TableProperties.WRITE_DISTRIBUTION_MODE, DistributionMode.HASH.modeName()) - .commit(); - - testWriteRow(null, DistributionMode.NONE); - - if (parallelism > 1) { - if (partitioned) { - int files = partitionFiles("aaa") + partitionFiles("bbb") + partitionFiles("ccc"); - assertThat(files).isGreaterThan(3); - } - } - } - - @TestTemplate - public void testJobHashDistributionMode() { - table - .updateProperties() - .set(TableProperties.WRITE_DISTRIBUTION_MODE, DistributionMode.HASH.modeName()) - .commit(); - - assertThatThrownBy(() -> testWriteRow(null, DistributionMode.RANGE)) - .isInstanceOf(IllegalArgumentException.class) - .hasMessage("Flink does not support 'range' write distribution mode now."); - } - - @TestTemplate - public void testJobNullDistributionMode() throws Exception { - table - .updateProperties() - .set(TableProperties.WRITE_DISTRIBUTION_MODE, DistributionMode.HASH.modeName()) - .commit(); - - testWriteRow(null, null); - - if (partitioned) { - assertThat(partitionFiles("aaa")).isEqualTo(1); - assertThat(partitionFiles("bbb")).isEqualTo(1); - assertThat(partitionFiles("ccc")).isEqualTo(1); - } - } - - @TestTemplate - public void testPartitionWriteMode() throws Exception { - testWriteRow(null, DistributionMode.HASH); - if (partitioned) { - 
assertThat(partitionFiles("aaa")).isEqualTo(1); - assertThat(partitionFiles("bbb")).isEqualTo(1); - assertThat(partitionFiles("ccc")).isEqualTo(1); - } - } - - @TestTemplate - public void testShuffleByPartitionWithSchema() throws Exception { - testWriteRow(SimpleDataUtil.FLINK_SCHEMA, DistributionMode.HASH); - if (partitioned) { - assertThat(partitionFiles("aaa")).isEqualTo(1); - assertThat(partitionFiles("bbb")).isEqualTo(1); - assertThat(partitionFiles("ccc")).isEqualTo(1); - } - } - - @TestTemplate - public void testTwoSinksInDisjointedDAG() throws Exception { - Map props = ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()); - - Table leftTable = - CATALOG_EXTENSION - .catalog() - .createTable( - TableIdentifier.of("left"), - SimpleDataUtil.SCHEMA, - partitioned - ? PartitionSpec.builderFor(SimpleDataUtil.SCHEMA).identity("data").build() - : PartitionSpec.unpartitioned(), - props); - TableLoader leftTableLoader = - TableLoader.fromCatalog(CATALOG_EXTENSION.catalogLoader(), TableIdentifier.of("left")); - - Table rightTable = - CATALOG_EXTENSION - .catalog() - .createTable( - TableIdentifier.of("right"), - SimpleDataUtil.SCHEMA, - partitioned - ? PartitionSpec.builderFor(SimpleDataUtil.SCHEMA).identity("data").build() - : PartitionSpec.unpartitioned(), - props); - TableLoader rightTableLoader = - TableLoader.fromCatalog(CATALOG_EXTENSION.catalogLoader(), TableIdentifier.of("right")); - - env = - StreamExecutionEnvironment.getExecutionEnvironment( - MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG) - .enableCheckpointing(100) - .setParallelism(parallelism) - .setMaxParallelism(parallelism); - env.getConfig().disableAutoGeneratedUIDs(); - - List leftRows = createRows("left-"); - DataStream leftStream = - env.fromCollection(leftRows, ROW_TYPE_INFO) - .name("leftCustomSource") - .uid("leftCustomSource"); - FlinkSink.forRow(leftStream, SimpleDataUtil.FLINK_SCHEMA) - .table(leftTable) - .tableLoader(leftTableLoader) - .tableSchema(SimpleDataUtil.FLINK_SCHEMA) - .distributionMode(DistributionMode.NONE) - .uidPrefix("leftIcebergSink") - .append(); - - List rightRows = createRows("right-"); - DataStream rightStream = - env.fromCollection(rightRows, ROW_TYPE_INFO) - .name("rightCustomSource") - .uid("rightCustomSource"); - FlinkSink.forRow(rightStream, SimpleDataUtil.FLINK_SCHEMA) - .table(rightTable) - .tableLoader(rightTableLoader) - .tableSchema(SimpleDataUtil.FLINK_SCHEMA) - .writeParallelism(parallelism) - .distributionMode(DistributionMode.HASH) - .uidPrefix("rightIcebergSink") - .setSnapshotProperty("flink.test", TestFlinkIcebergSink.class.getName()) - .setSnapshotProperties(Collections.singletonMap("direction", "rightTable")) - .append(); - - // Execute the program. 
- env.execute("Test Iceberg DataStream."); - - SimpleDataUtil.assertTableRows(leftTable, convertToRowData(leftRows)); - SimpleDataUtil.assertTableRows(rightTable, convertToRowData(rightRows)); - - leftTable.refresh(); - assertThat(leftTable.currentSnapshot().summary()).doesNotContainKeys("flink.test", "direction"); - rightTable.refresh(); - assertThat(rightTable.currentSnapshot().summary()) - .containsEntry("flink.test", TestFlinkIcebergSink.class.getName()) - .containsEntry("direction", "rightTable"); - } - - @TestTemplate - public void testOverrideWriteConfigWithUnknownDistributionMode() { - Map newProps = Maps.newHashMap(); - newProps.put(FlinkWriteOptions.DISTRIBUTION_MODE.key(), "UNRECOGNIZED"); - - List rows = createRows(""); - DataStream dataStream = env.addSource(createBoundedSource(rows), ROW_TYPE_INFO); - - FlinkSink.Builder builder = - FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) - .table(table) - .tableLoader(tableLoader) - .writeParallelism(parallelism) - .setAll(newProps); - - assertThatThrownBy(builder::append) - .isInstanceOf(IllegalArgumentException.class) - .hasMessage("Invalid distribution mode: UNRECOGNIZED"); - } - - @TestTemplate - public void testOverrideWriteConfigWithUnknownFileFormat() { - Map newProps = Maps.newHashMap(); - newProps.put(FlinkWriteOptions.WRITE_FORMAT.key(), "UNRECOGNIZED"); - - List rows = createRows(""); - DataStream dataStream = env.addSource(createBoundedSource(rows), ROW_TYPE_INFO); - - FlinkSink.Builder builder = - FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) - .table(table) - .tableLoader(tableLoader) - .writeParallelism(parallelism) - .setAll(newProps); - - assertThatThrownBy(builder::append) - .isInstanceOf(IllegalArgumentException.class) - .hasMessage("Invalid file format: UNRECOGNIZED"); - } - - @TestTemplate - public void testWriteRowWithTableRefreshInterval() throws Exception { - List rows = Lists.newArrayList(Row.of(1, "hello"), Row.of(2, "world"), Row.of(3, "foo")); - DataStream dataStream = - env.addSource(createBoundedSource(rows), ROW_TYPE_INFO) - .map(CONVERTER::toInternal, FlinkCompatibilityUtil.toTypeInfo(SimpleDataUtil.ROW_TYPE)); - - Configuration flinkConf = new Configuration(); - flinkConf.setString(FlinkWriteOptions.TABLE_REFRESH_INTERVAL.key(), "100ms"); - - FlinkSink.forRowData(dataStream) - .table(table) - .tableLoader(tableLoader) - .flinkConf(flinkConf) - .writeParallelism(parallelism) - .append(); - - // Execute the program. - env.execute("Test Iceberg DataStream"); - - // Assert the iceberg table's records. - SimpleDataUtil.assertTableRows(table, convertToRowData(rows)); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkBase.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkBase.java deleted file mode 100644 index b38aa6b50ce6..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkBase.java +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink; - -import java.util.List; -import java.util.stream.Collectors; -import org.apache.flink.api.common.typeinfo.TypeInformation; -import org.apache.flink.api.java.typeutils.RowTypeInfo; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.util.DataFormatConverters; -import org.apache.flink.types.Row; -import org.apache.iceberg.Table; -import org.apache.iceberg.flink.SimpleDataUtil; -import org.apache.iceberg.flink.source.BoundedTestSource; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; - -public class TestFlinkIcebergSinkBase { - - protected Table table; - protected StreamExecutionEnvironment env; - protected static final TypeInformation ROW_TYPE_INFO = - new RowTypeInfo(SimpleDataUtil.FLINK_SCHEMA.getFieldTypes()); - - protected static final DataFormatConverters.RowConverter CONVERTER = - new DataFormatConverters.RowConverter(SimpleDataUtil.FLINK_SCHEMA.getFieldDataTypes()); - - protected BoundedTestSource createBoundedSource(List rows) { - return new BoundedTestSource<>(rows.toArray(new Row[0])); - } - - protected List createRows(String prefix) { - return Lists.newArrayList( - Row.of(1, prefix + "aaa"), - Row.of(1, prefix + "bbb"), - Row.of(1, prefix + "ccc"), - Row.of(2, prefix + "aaa"), - Row.of(2, prefix + "bbb"), - Row.of(2, prefix + "ccc"), - Row.of(3, prefix + "aaa"), - Row.of(3, prefix + "bbb"), - Row.of(3, prefix + "ccc")); - } - - protected List convertToRowData(List rows) { - return rows.stream().map(CONVERTER::toInternal).collect(Collectors.toList()); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkBranch.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkBranch.java deleted file mode 100644 index 441b5ed2a4ae..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkBranch.java +++ /dev/null @@ -1,137 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink; - -import static org.apache.iceberg.flink.TestFixtures.DATABASE; -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.IOException; -import java.util.List; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.table.api.TableSchema; -import org.apache.flink.types.Row; -import org.apache.iceberg.DistributionMode; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Parameter; -import org.apache.iceberg.ParameterizedTestExtension; -import org.apache.iceberg.Parameters; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.SnapshotRef; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.flink.HadoopCatalogExtension; -import org.apache.iceberg.flink.MiniFlinkClusterExtension; -import org.apache.iceberg.flink.SimpleDataUtil; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.TestTemplate; -import org.junit.jupiter.api.extension.ExtendWith; -import org.junit.jupiter.api.extension.RegisterExtension; - -@ExtendWith(ParameterizedTestExtension.class) -public class TestFlinkIcebergSinkBranch extends TestFlinkIcebergSinkBase { - @RegisterExtension - public static final HadoopCatalogExtension CATALOG_EXTENSION = - new HadoopCatalogExtension(DATABASE, TestFixtures.TABLE); - - @Parameter(index = 0) - private String formatVersion; - - @Parameter(index = 1) - private String branch; - - private TableLoader tableLoader; - - @Parameters(name = "formatVersion = {0}, branch = {1}") - public static Object[][] parameters() { - return new Object[][] { - {"1", "main"}, - {"1", "testBranch"}, - {"2", "main"}, - {"2", "testBranch"} - }; - } - - @BeforeEach - public void before() throws IOException { - table = - CATALOG_EXTENSION - .catalog() - .createTable( - TestFixtures.TABLE_IDENTIFIER, - SimpleDataUtil.SCHEMA, - PartitionSpec.unpartitioned(), - ImmutableMap.of( - TableProperties.DEFAULT_FILE_FORMAT, - FileFormat.AVRO.name(), - TableProperties.FORMAT_VERSION, - formatVersion)); - - env = - StreamExecutionEnvironment.getExecutionEnvironment( - MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG) - .enableCheckpointing(100); - - tableLoader = CATALOG_EXTENSION.tableLoader(); - } - - @TestTemplate - public void testWriteRowWithTableSchema() throws Exception { - testWriteRow(SimpleDataUtil.FLINK_SCHEMA, DistributionMode.NONE); - verifyOtherBranchUnmodified(); - } - - private void testWriteRow(TableSchema tableSchema, DistributionMode distributionMode) - throws Exception { - List rows = createRows(""); - DataStream dataStream = env.addSource(createBoundedSource(rows), ROW_TYPE_INFO); - - FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) - .table(table) - .tableLoader(tableLoader) - .tableSchema(tableSchema) - .toBranch(branch) - .distributionMode(distributionMode) - .append(); - - // Execute the program. - env.execute("Test Iceberg DataStream."); - - SimpleDataUtil.assertTableRows(table, convertToRowData(rows), branch); - SimpleDataUtil.assertTableRows( - table, - ImmutableList.of(), - branch.equals(SnapshotRef.MAIN_BRANCH) ? 
"test-branch" : SnapshotRef.MAIN_BRANCH); - - verifyOtherBranchUnmodified(); - } - - private void verifyOtherBranchUnmodified() { - String otherBranch = - branch.equals(SnapshotRef.MAIN_BRANCH) ? "test-branch" : SnapshotRef.MAIN_BRANCH; - if (otherBranch.equals(SnapshotRef.MAIN_BRANCH)) { - assertThat(table.currentSnapshot()).isNull(); - } - - assertThat(table.snapshot(otherBranch)).isNull(); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2.java deleted file mode 100644 index 577c54976b9a..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2.java +++ /dev/null @@ -1,235 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink; - -import static org.apache.iceberg.flink.TestFixtures.DATABASE; -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; -import static org.assertj.core.api.Assumptions.assumeThat; - -import java.util.List; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.test.junit5.MiniClusterExtension; -import org.apache.flink.types.Row; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.DeleteFile; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.MetadataColumns; -import org.apache.iceberg.ParameterizedTestExtension; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.SnapshotRef; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.HadoopCatalogExtension; -import org.apache.iceberg.flink.MiniFlinkClusterExtension; -import org.apache.iceberg.flink.SimpleDataUtil; -import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.flink.source.BoundedTestSource; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Types; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.TestTemplate; -import org.junit.jupiter.api.Timeout; -import org.junit.jupiter.api.extension.ExtendWith; -import org.junit.jupiter.api.extension.RegisterExtension; - -@ExtendWith(ParameterizedTestExtension.class) -@Timeout(value = 60) -public class TestFlinkIcebergSinkV2 extends TestFlinkIcebergSinkV2Base { - @RegisterExtension - public static final MiniClusterExtension MINI_CLUSTER_EXTENSION = - 
MiniFlinkClusterExtension.createWithClassloaderCheckDisabled(); - - @RegisterExtension - private static final HadoopCatalogExtension CATALOG_EXTENSION = - new HadoopCatalogExtension(DATABASE, TestFixtures.TABLE); - - @BeforeEach - public void setupTable() { - table = - CATALOG_EXTENSION - .catalog() - .createTable( - TestFixtures.TABLE_IDENTIFIER, - SimpleDataUtil.SCHEMA, - partitioned - ? PartitionSpec.builderFor(SimpleDataUtil.SCHEMA).identity("data").build() - : PartitionSpec.unpartitioned(), - ImmutableMap.of( - TableProperties.DEFAULT_FILE_FORMAT, - format.name(), - TableProperties.FORMAT_VERSION, - String.valueOf(FORMAT_V2))); - - table - .updateProperties() - .set(TableProperties.DEFAULT_FILE_FORMAT, format.name()) - .set(TableProperties.WRITE_DISTRIBUTION_MODE, writeDistributionMode) - .commit(); - - env = - StreamExecutionEnvironment.getExecutionEnvironment( - MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG) - .enableCheckpointing(100L) - .setParallelism(parallelism) - .setMaxParallelism(parallelism); - - tableLoader = CATALOG_EXTENSION.tableLoader(); - } - - @TestTemplate - public void testCheckAndGetEqualityFieldIds() { - table - .updateSchema() - .allowIncompatibleChanges() - .addRequiredColumn("type", Types.StringType.get()) - .setIdentifierFields("type") - .commit(); - - DataStream dataStream = - env.addSource(new BoundedTestSource<>(ImmutableList.of()), ROW_TYPE_INFO); - FlinkSink.Builder builder = - FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA).table(table); - - // Use schema identifier field IDs as equality field id list by default - assertThat(builder.checkAndGetEqualityFieldIds()) - .containsExactlyInAnyOrderElementsOf(table.schema().identifierFieldIds()); - - // Use user-provided equality field column as equality field id list - builder.equalityFieldColumns(Lists.newArrayList("id")); - assertThat(builder.checkAndGetEqualityFieldIds()) - .containsExactlyInAnyOrder(table.schema().findField("id").fieldId()); - - builder.equalityFieldColumns(Lists.newArrayList("type")); - assertThat(builder.checkAndGetEqualityFieldIds()) - .containsExactlyInAnyOrder(table.schema().findField("type").fieldId()); - } - - @TestTemplate - public void testChangeLogOnIdKey() throws Exception { - testChangeLogOnIdKey(SnapshotRef.MAIN_BRANCH); - } - - @TestTemplate - public void testUpsertOnlyDeletesOnDataKey() throws Exception { - List> elementsPerCheckpoint = - ImmutableList.of( - ImmutableList.of(row("+I", 1, "aaa")), - ImmutableList.of(row("-D", 1, "aaa"), row("-D", 2, "bbb"))); - - List> expectedRecords = - ImmutableList.of(ImmutableList.of(record(1, "aaa")), ImmutableList.of()); - - testChangeLogs( - ImmutableList.of("data"), - row -> row.getField(ROW_DATA_POS), - true, - elementsPerCheckpoint, - expectedRecords, - SnapshotRef.MAIN_BRANCH); - } - - @TestTemplate - public void testChangeLogOnDataKey() throws Exception { - testChangeLogOnDataKey(SnapshotRef.MAIN_BRANCH); - } - - @TestTemplate - public void testChangeLogOnIdDataKey() throws Exception { - testChangeLogOnIdDataKey(SnapshotRef.MAIN_BRANCH); - } - - @TestTemplate - public void testChangeLogOnSameKey() throws Exception { - testChangeLogOnSameKey(SnapshotRef.MAIN_BRANCH); - } - - @TestTemplate - public void testUpsertModeCheck() throws Exception { - DataStream dataStream = - env.addSource(new BoundedTestSource<>(ImmutableList.of()), ROW_TYPE_INFO); - FlinkSink.Builder builder = - FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) - .tableLoader(tableLoader) - .tableSchema(SimpleDataUtil.FLINK_SCHEMA) - 
.writeParallelism(parallelism) - .upsert(true); - - assertThatThrownBy( - () -> - builder - .equalityFieldColumns(ImmutableList.of("id", "data")) - .overwrite(true) - .append()) - .isInstanceOf(IllegalStateException.class) - .hasMessage( - "OVERWRITE mode shouldn't be enable when configuring to use UPSERT data stream."); - - assertThatThrownBy( - () -> builder.equalityFieldColumns(ImmutableList.of()).overwrite(false).append()) - .isInstanceOf(IllegalStateException.class) - .hasMessage( - "Equality field columns shouldn't be empty when configuring to use UPSERT data stream."); - } - - @TestTemplate - public void testUpsertOnIdKey() throws Exception { - testUpsertOnIdKey(SnapshotRef.MAIN_BRANCH); - } - - @TestTemplate - public void testUpsertOnDataKey() throws Exception { - testUpsertOnDataKey(SnapshotRef.MAIN_BRANCH); - } - - @TestTemplate - public void testUpsertOnIdDataKey() throws Exception { - testUpsertOnIdDataKey(SnapshotRef.MAIN_BRANCH); - } - - @TestTemplate - public void testDeleteStats() throws Exception { - assumeThat(format).isNotEqualTo(FileFormat.AVRO); - - List> elementsPerCheckpoint = - ImmutableList.of( - // Checkpoint #1 - ImmutableList.of(row("+I", 1, "aaa"), row("-D", 1, "aaa"), row("+I", 1, "aaa"))); - - List> expectedRecords = ImmutableList.of(ImmutableList.of(record(1, "aaa"))); - - testChangeLogs( - ImmutableList.of("id", "data"), - row -> Row.of(row.getField(ROW_ID_POS), row.getField(ROW_DATA_POS)), - false, - elementsPerCheckpoint, - expectedRecords, - "main"); - - DeleteFile deleteFile = table.currentSnapshot().addedDeleteFiles(table.io()).iterator().next(); - String fromStat = - new String( - deleteFile.lowerBounds().get(MetadataColumns.DELETE_FILE_PATH.fieldId()).array()); - DataFile dataFile = table.currentSnapshot().addedDataFiles(table.io()).iterator().next(); - assumeThat(fromStat).isEqualTo(dataFile.path().toString()); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2Base.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2Base.java deleted file mode 100644 index fc33c2fea5e6..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2Base.java +++ /dev/null @@ -1,389 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink; - -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; - -import java.io.IOException; -import java.util.List; -import java.util.Map; -import org.apache.flink.api.common.typeinfo.TypeInformation; -import org.apache.flink.api.java.functions.KeySelector; -import org.apache.flink.api.java.typeutils.RowTypeInfo; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.types.Row; -import org.apache.flink.types.RowKind; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Parameter; -import org.apache.iceberg.Parameters; -import org.apache.iceberg.Snapshot; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.data.IcebergGenerics; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.SimpleDataUtil; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.flink.source.BoundedTestSource; -import org.apache.iceberg.io.CloseableIterable; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.util.StructLikeSet; - -public class TestFlinkIcebergSinkV2Base { - - protected static final int FORMAT_V2 = 2; - protected static final TypeInformation ROW_TYPE_INFO = - new RowTypeInfo(SimpleDataUtil.FLINK_SCHEMA.getFieldTypes()); - - protected static final int ROW_ID_POS = 0; - protected static final int ROW_DATA_POS = 1; - - protected TableLoader tableLoader; - protected Table table; - protected StreamExecutionEnvironment env; - - @Parameter(index = 0) - protected FileFormat format; - - @Parameter(index = 1) - protected int parallelism = 1; - - @Parameter(index = 2) - protected boolean partitioned; - - @Parameter(index = 3) - protected String writeDistributionMode; - - @Parameters(name = "FileFormat={0}, Parallelism={1}, Partitioned={2}, WriteDistributionMode={3}") - public static Object[][] parameters() { - return new Object[][] { - new Object[] {FileFormat.AVRO, 1, true, TableProperties.WRITE_DISTRIBUTION_MODE_NONE}, - new Object[] {FileFormat.AVRO, 1, false, TableProperties.WRITE_DISTRIBUTION_MODE_NONE}, - new Object[] {FileFormat.AVRO, 4, true, TableProperties.WRITE_DISTRIBUTION_MODE_NONE}, - new Object[] {FileFormat.AVRO, 4, false, TableProperties.WRITE_DISTRIBUTION_MODE_NONE}, - new Object[] {FileFormat.ORC, 1, true, TableProperties.WRITE_DISTRIBUTION_MODE_HASH}, - new Object[] {FileFormat.ORC, 1, false, TableProperties.WRITE_DISTRIBUTION_MODE_HASH}, - new Object[] {FileFormat.ORC, 4, true, TableProperties.WRITE_DISTRIBUTION_MODE_HASH}, - new Object[] {FileFormat.ORC, 4, false, TableProperties.WRITE_DISTRIBUTION_MODE_HASH}, - new Object[] {FileFormat.PARQUET, 1, true, TableProperties.WRITE_DISTRIBUTION_MODE_RANGE}, - new Object[] {FileFormat.PARQUET, 1, false, TableProperties.WRITE_DISTRIBUTION_MODE_RANGE}, - new Object[] {FileFormat.PARQUET, 4, true, TableProperties.WRITE_DISTRIBUTION_MODE_RANGE}, - new Object[] {FileFormat.PARQUET, 4, false, TableProperties.WRITE_DISTRIBUTION_MODE_RANGE} - }; - } - - protected static final Map ROW_KIND_MAP = - ImmutableMap.of( - "+I", RowKind.INSERT, - "-D", RowKind.DELETE, - "-U", RowKind.UPDATE_BEFORE, - "+U", RowKind.UPDATE_AFTER); - - protected Row 
row(String rowKind, int id, String data) { - RowKind kind = ROW_KIND_MAP.get(rowKind); - if (kind == null) { - throw new IllegalArgumentException("Unknown row kind: " + rowKind); - } - - return Row.ofKind(kind, id, data); - } - - protected void testUpsertOnIdDataKey(String branch) throws Exception { - List> elementsPerCheckpoint = - ImmutableList.of( - ImmutableList.of(row("+I", 1, "aaa"), row("+U", 1, "aaa"), row("+I", 2, "bbb")), - ImmutableList.of(row("+I", 1, "aaa"), row("-D", 2, "bbb"), row("+I", 2, "ccc")), - ImmutableList.of(row("+U", 1, "bbb"), row("-U", 1, "ccc"), row("-D", 1, "aaa"))); - - List> expectedRecords = - ImmutableList.of( - ImmutableList.of(record(1, "aaa"), record(2, "bbb")), - ImmutableList.of(record(1, "aaa"), record(2, "ccc")), - ImmutableList.of(record(1, "bbb"), record(2, "ccc"))); - testChangeLogs( - ImmutableList.of("id", "data"), - row -> Row.of(row.getField(ROW_ID_POS), row.getField(ROW_DATA_POS)), - true, - elementsPerCheckpoint, - expectedRecords, - branch); - } - - protected void testChangeLogOnIdDataKey(String branch) throws Exception { - List> elementsPerCheckpoint = - ImmutableList.of( - ImmutableList.of( - row("+I", 1, "aaa"), - row("-D", 1, "aaa"), - row("+I", 2, "bbb"), - row("+I", 1, "bbb"), - row("+I", 2, "aaa")), - ImmutableList.of(row("-U", 2, "aaa"), row("+U", 1, "ccc"), row("+I", 1, "aaa")), - ImmutableList.of(row("-D", 1, "bbb"), row("+I", 2, "aaa"))); - - List> expectedRecords = - ImmutableList.of( - ImmutableList.of(record(1, "bbb"), record(2, "aaa"), record(2, "bbb")), - ImmutableList.of( - record(1, "aaa"), record(1, "bbb"), record(1, "ccc"), record(2, "bbb")), - ImmutableList.of( - record(1, "aaa"), record(1, "ccc"), record(2, "aaa"), record(2, "bbb"))); - - testChangeLogs( - ImmutableList.of("data", "id"), - row -> Row.of(row.getField(ROW_ID_POS), row.getField(ROW_DATA_POS)), - false, - elementsPerCheckpoint, - expectedRecords, - branch); - } - - protected void testChangeLogOnSameKey(String branch) throws Exception { - List> elementsPerCheckpoint = - ImmutableList.of( - // Checkpoint #1 - ImmutableList.of(row("+I", 1, "aaa"), row("-D", 1, "aaa"), row("+I", 1, "aaa")), - // Checkpoint #2 - ImmutableList.of(row("-U", 1, "aaa"), row("+U", 1, "aaa")), - // Checkpoint #3 - ImmutableList.of(row("-D", 1, "aaa"), row("+I", 1, "aaa")), - // Checkpoint #4 - ImmutableList.of(row("-U", 1, "aaa"), row("+U", 1, "aaa"), row("+I", 1, "aaa"))); - - List> expectedRecords = - ImmutableList.of( - ImmutableList.of(record(1, "aaa")), - ImmutableList.of(record(1, "aaa")), - ImmutableList.of(record(1, "aaa")), - ImmutableList.of(record(1, "aaa"), record(1, "aaa"))); - - testChangeLogs( - ImmutableList.of("id", "data"), - row -> Row.of(row.getField(ROW_ID_POS), row.getField(ROW_DATA_POS)), - false, - elementsPerCheckpoint, - expectedRecords, - branch); - } - - protected void testChangeLogOnDataKey(String branch) throws Exception { - List> elementsPerCheckpoint = - ImmutableList.of( - ImmutableList.of( - row("+I", 1, "aaa"), - row("-D", 1, "aaa"), - row("+I", 2, "bbb"), - row("+I", 1, "bbb"), - row("+I", 2, "aaa")), - ImmutableList.of(row("-U", 2, "aaa"), row("+U", 1, "ccc"), row("+I", 1, "aaa")), - ImmutableList.of(row("-D", 1, "bbb"), row("+I", 2, "aaa"), row("+I", 2, "ccc"))); - - List> expectedRecords = - ImmutableList.of( - ImmutableList.of(record(1, "bbb"), record(2, "aaa")), - ImmutableList.of(record(1, "aaa"), record(1, "bbb"), record(1, "ccc")), - ImmutableList.of( - record(1, "aaa"), record(1, "ccc"), record(2, "aaa"), record(2, "ccc"))); - - 
testChangeLogs( - ImmutableList.of("data"), - row -> row.getField(ROW_DATA_POS), - false, - elementsPerCheckpoint, - expectedRecords, - branch); - } - - protected void testUpsertOnDataKey(String branch) throws Exception { - List> elementsPerCheckpoint = - ImmutableList.of( - ImmutableList.of(row("+I", 1, "aaa"), row("+I", 2, "aaa"), row("+I", 3, "bbb")), - ImmutableList.of(row("+U", 4, "aaa"), row("-U", 3, "bbb"), row("+U", 5, "bbb")), - ImmutableList.of(row("+I", 6, "aaa"), row("+U", 7, "bbb"))); - - List> expectedRecords = - ImmutableList.of( - ImmutableList.of(record(2, "aaa"), record(3, "bbb")), - ImmutableList.of(record(4, "aaa"), record(5, "bbb")), - ImmutableList.of(record(6, "aaa"), record(7, "bbb"))); - - testChangeLogs( - ImmutableList.of("data"), - row -> row.getField(ROW_DATA_POS), - true, - elementsPerCheckpoint, - expectedRecords, - branch); - } - - protected void testChangeLogOnIdKey(String branch) throws Exception { - List> elementsPerCheckpoint = - ImmutableList.of( - ImmutableList.of( - row("+I", 1, "aaa"), - row("-D", 1, "aaa"), - row("+I", 1, "bbb"), - row("+I", 2, "aaa"), - row("-D", 2, "aaa"), - row("+I", 2, "bbb")), - ImmutableList.of( - row("-U", 2, "bbb"), row("+U", 2, "ccc"), row("-D", 2, "ccc"), row("+I", 2, "ddd")), - ImmutableList.of( - row("-D", 1, "bbb"), - row("+I", 1, "ccc"), - row("-D", 1, "ccc"), - row("+I", 1, "ddd"))); - - List> expectedRecords = - ImmutableList.of( - ImmutableList.of(record(1, "bbb"), record(2, "bbb")), - ImmutableList.of(record(1, "bbb"), record(2, "ddd")), - ImmutableList.of(record(1, "ddd"), record(2, "ddd"))); - - if (partitioned && writeDistributionMode.equals(TableProperties.WRITE_DISTRIBUTION_MODE_HASH)) { - assertThatThrownBy( - () -> - testChangeLogs( - ImmutableList.of("id"), - row -> row.getField(ROW_ID_POS), - false, - elementsPerCheckpoint, - expectedRecords, - branch)) - .isInstanceOf(IllegalStateException.class) - .hasMessageStartingWith( - "In 'hash' distribution mode with equality fields set, partition field") - .hasMessageContaining("should be included in equality fields:"); - - } else { - testChangeLogs( - ImmutableList.of("id"), - row -> row.getField(ROW_ID_POS), - false, - elementsPerCheckpoint, - expectedRecords, - branch); - } - } - - protected void testUpsertOnIdKey(String branch) throws Exception { - List> elementsPerCheckpoint = - ImmutableList.of( - ImmutableList.of(row("+I", 1, "aaa"), row("+U", 1, "bbb")), - ImmutableList.of(row("+I", 1, "ccc")), - ImmutableList.of(row("+U", 1, "ddd"), row("+I", 1, "eee"))); - - List> expectedRecords = - ImmutableList.of( - ImmutableList.of(record(1, "bbb")), - ImmutableList.of(record(1, "ccc")), - ImmutableList.of(record(1, "eee"))); - - if (!partitioned) { - testChangeLogs( - ImmutableList.of("id"), - row -> row.getField(ROW_ID_POS), - true, - elementsPerCheckpoint, - expectedRecords, - branch); - } else { - assertThatThrownBy( - () -> - testChangeLogs( - ImmutableList.of("id"), - row -> row.getField(ROW_ID_POS), - true, - elementsPerCheckpoint, - expectedRecords, - branch)) - .isInstanceOf(IllegalStateException.class) - .hasMessageContaining("should be included in equality fields:"); - } - } - - protected void testChangeLogs( - List equalityFieldColumns, - KeySelector keySelector, - boolean insertAsUpsert, - List> elementsPerCheckpoint, - List> expectedRecordsPerCheckpoint, - String branch) - throws Exception { - DataStream dataStream = - env.addSource(new BoundedTestSource<>(elementsPerCheckpoint), ROW_TYPE_INFO); - - FlinkSink.forRow(dataStream, 
SimpleDataUtil.FLINK_SCHEMA) - .tableLoader(tableLoader) - .tableSchema(SimpleDataUtil.FLINK_SCHEMA) - .writeParallelism(parallelism) - .equalityFieldColumns(equalityFieldColumns) - .upsert(insertAsUpsert) - .toBranch(branch) - .append(); - - // Execute the program. - env.execute("Test Iceberg Change-Log DataStream."); - - table.refresh(); - List snapshots = findValidSnapshots(); - int expectedSnapshotNum = expectedRecordsPerCheckpoint.size(); - assertThat(snapshots).hasSize(expectedSnapshotNum); - - for (int i = 0; i < expectedSnapshotNum; i++) { - long snapshotId = snapshots.get(i).snapshotId(); - List expectedRecords = expectedRecordsPerCheckpoint.get(i); - assertThat(actualRowSet(snapshotId, "*")) - .as("Should have the expected records for the checkpoint#" + i) - .isEqualTo(expectedRowSet(expectedRecords.toArray(new Record[0]))); - } - } - - protected Record record(int id, String data) { - return SimpleDataUtil.createRecord(id, data); - } - - private List findValidSnapshots() { - List validSnapshots = Lists.newArrayList(); - for (Snapshot snapshot : table.snapshots()) { - if (snapshot.allManifests(table.io()).stream() - .anyMatch(m -> snapshot.snapshotId() == m.snapshotId())) { - validSnapshots.add(snapshot); - } - } - return validSnapshots; - } - - private StructLikeSet expectedRowSet(Record... records) { - return SimpleDataUtil.expectedRowSet(table, records); - } - - private StructLikeSet actualRowSet(long snapshotId, String... columns) throws IOException { - table.refresh(); - StructLikeSet set = StructLikeSet.create(table.schema().asStruct()); - try (CloseableIterable reader = - IcebergGenerics.read(table).useSnapshot(snapshotId).select(columns).build()) { - reader.forEach(set::add); - } - return set; - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2Branch.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2Branch.java deleted file mode 100644 index 0b0c55f51c32..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2Branch.java +++ /dev/null @@ -1,125 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink; - -import static org.apache.iceberg.flink.TestFixtures.DATABASE; -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.IOException; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Parameter; -import org.apache.iceberg.ParameterizedTestExtension; -import org.apache.iceberg.Parameters; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.SnapshotRef; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.flink.HadoopCatalogExtension; -import org.apache.iceberg.flink.MiniFlinkClusterExtension; -import org.apache.iceberg.flink.SimpleDataUtil; -import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.TestTemplate; -import org.junit.jupiter.api.extension.ExtendWith; -import org.junit.jupiter.api.extension.RegisterExtension; - -@ExtendWith(ParameterizedTestExtension.class) -public class TestFlinkIcebergSinkV2Branch extends TestFlinkIcebergSinkV2Base { - @RegisterExtension - private static final HadoopCatalogExtension CATALOG_EXTENSION = - new HadoopCatalogExtension(DATABASE, TestFixtures.TABLE); - - @Parameter(index = 0) - private String branch; - - @Parameters(name = "branch = {0}") - public static Object[][] parameters() { - return new Object[][] {new Object[] {"main"}, new Object[] {"testBranch"}}; - } - - @BeforeEach - public void before() throws IOException { - table = - CATALOG_EXTENSION - .catalog() - .createTable( - TestFixtures.TABLE_IDENTIFIER, - SimpleDataUtil.SCHEMA, - PartitionSpec.unpartitioned(), - ImmutableMap.of( - TableProperties.DEFAULT_FILE_FORMAT, - FileFormat.AVRO.name(), - TableProperties.FORMAT_VERSION, - "2")); - - env = - StreamExecutionEnvironment.getExecutionEnvironment( - MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG) - .enableCheckpointing(100); - - tableLoader = CATALOG_EXTENSION.tableLoader(); - } - - @TestTemplate - public void testChangeLogOnIdKey() throws Exception { - testChangeLogOnIdKey(branch); - verifyOtherBranchUnmodified(); - } - - @TestTemplate - public void testChangeLogOnDataKey() throws Exception { - testChangeLogOnDataKey(branch); - verifyOtherBranchUnmodified(); - } - - @TestTemplate - public void testChangeLogOnIdDataKey() throws Exception { - testChangeLogOnIdDataKey(branch); - verifyOtherBranchUnmodified(); - } - - @TestTemplate - public void testUpsertOnIdKey() throws Exception { - testUpsertOnIdKey(branch); - verifyOtherBranchUnmodified(); - } - - @TestTemplate - public void testUpsertOnDataKey() throws Exception { - testUpsertOnDataKey(branch); - verifyOtherBranchUnmodified(); - } - - @TestTemplate - public void testUpsertOnIdDataKey() throws Exception { - testUpsertOnIdDataKey(branch); - verifyOtherBranchUnmodified(); - } - - private void verifyOtherBranchUnmodified() { - String otherBranch = - branch.equals(SnapshotRef.MAIN_BRANCH) ? 
"test-branch" : SnapshotRef.MAIN_BRANCH; - if (otherBranch.equals(SnapshotRef.MAIN_BRANCH)) { - assertThat(table.currentSnapshot()).isNull(); - } - - assertThat(table.snapshot(otherBranch)).isNull(); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkManifest.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkManifest.java deleted file mode 100644 index 53b7c4c0cc91..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkManifest.java +++ /dev/null @@ -1,312 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink; - -import static org.apache.iceberg.flink.sink.ManifestOutputFileFactory.FLINK_MANIFEST_LOCATION; -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.File; -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.Paths; -import java.util.List; -import java.util.Map; -import java.util.UUID; -import java.util.concurrent.atomic.AtomicInteger; -import org.apache.flink.core.io.SimpleVersionedSerialization; -import org.apache.flink.core.io.SimpleVersionedSerializer; -import org.apache.flink.table.data.RowData; -import org.apache.hadoop.conf.Configuration; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.DeleteFile; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.ManifestFile; -import org.apache.iceberg.ManifestFiles; -import org.apache.iceberg.Table; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.SimpleDataUtil; -import org.apache.iceberg.flink.TestHelpers; -import org.apache.iceberg.io.FileAppenderFactory; -import org.apache.iceberg.io.WriteResult; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.util.Pair; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.io.TempDir; - -public class TestFlinkManifest { - private static final Configuration CONF = new Configuration(); - - @TempDir protected Path temporaryFolder; - - private Table table; - private FileAppenderFactory appenderFactory; - private final AtomicInteger fileCount = new AtomicInteger(0); - - @BeforeEach - public void before() throws IOException { - File folder = Files.createTempDirectory(temporaryFolder, "junit").toFile(); - String warehouse = folder.getAbsolutePath(); - - String tablePath = warehouse.concat("/test"); - assertThat(new File(tablePath).mkdir()).isTrue(); - - // Construct the iceberg table. 
- table = SimpleDataUtil.createTable(tablePath, ImmutableMap.of(), false); - - int[] equalityFieldIds = - new int[] { - table.schema().findField("id").fieldId(), table.schema().findField("data").fieldId() - }; - this.appenderFactory = - new FlinkAppenderFactory( - table, - table.schema(), - FlinkSchemaUtil.convert(table.schema()), - table.properties(), - table.spec(), - equalityFieldIds, - table.schema(), - null); - } - - @Test - public void testIO() throws IOException { - String flinkJobId = newFlinkJobId(); - String operatorId = newOperatorUniqueId(); - for (long checkpointId = 1; checkpointId <= 3; checkpointId++) { - ManifestOutputFileFactory factory = - FlinkManifestUtil.createOutputFileFactory( - () -> table, table.properties(), flinkJobId, operatorId, 1, 1); - final long curCkpId = checkpointId; - - List dataFiles = generateDataFiles(10); - List eqDeleteFiles = generateEqDeleteFiles(5); - List posDeleteFiles = generatePosDeleteFiles(5); - DeltaManifests deltaManifests = - FlinkManifestUtil.writeCompletedFiles( - WriteResult.builder() - .addDataFiles(dataFiles) - .addDeleteFiles(eqDeleteFiles) - .addDeleteFiles(posDeleteFiles) - .build(), - () -> factory.create(curCkpId), - table.spec()); - - WriteResult result = - FlinkManifestUtil.readCompletedFiles(deltaManifests, table.io(), table.specs()); - assertThat(result.deleteFiles()).hasSize(10); - for (int i = 0; i < dataFiles.size(); i++) { - TestHelpers.assertEquals(dataFiles.get(i), result.dataFiles()[i]); - } - assertThat(result.deleteFiles()).hasSize(10); - for (int i = 0; i < 5; i++) { - TestHelpers.assertEquals(eqDeleteFiles.get(i), result.deleteFiles()[i]); - } - for (int i = 0; i < 5; i++) { - TestHelpers.assertEquals(posDeleteFiles.get(i), result.deleteFiles()[5 + i]); - } - } - } - - @Test - public void testUserProvidedManifestLocation() throws IOException { - long checkpointId = 1; - String flinkJobId = newFlinkJobId(); - String operatorId = newOperatorUniqueId(); - File userProvidedFolder = Files.createTempDirectory(temporaryFolder, "junit").toFile(); - Map props = - ImmutableMap.of(FLINK_MANIFEST_LOCATION, userProvidedFolder.getAbsolutePath() + "///"); - ManifestOutputFileFactory factory = - new ManifestOutputFileFactory(() -> table, props, flinkJobId, operatorId, 1, 1); - - List dataFiles = generateDataFiles(5); - DeltaManifests deltaManifests = - FlinkManifestUtil.writeCompletedFiles( - WriteResult.builder().addDataFiles(dataFiles).build(), - () -> factory.create(checkpointId), - table.spec()); - - assertThat(deltaManifests.dataManifest()).isNotNull(); - assertThat(deltaManifests.deleteManifest()).isNull(); - assertThat(Paths.get(deltaManifests.dataManifest().path())) - .hasParent(userProvidedFolder.toPath()); - - WriteResult result = - FlinkManifestUtil.readCompletedFiles(deltaManifests, table.io(), table.specs()); - - assertThat(result.deleteFiles()).isEmpty(); - assertThat(result.dataFiles()).hasSize(5); - - assertThat(result.dataFiles()).hasSameSizeAs(dataFiles); - for (int i = 0; i < dataFiles.size(); i++) { - TestHelpers.assertEquals(dataFiles.get(i), result.dataFiles()[i]); - } - } - - @Test - public void testVersionedSerializer() throws IOException { - long checkpointId = 1; - String flinkJobId = newFlinkJobId(); - String operatorId = newOperatorUniqueId(); - ManifestOutputFileFactory factory = - FlinkManifestUtil.createOutputFileFactory( - () -> table, table.properties(), flinkJobId, operatorId, 1, 1); - - List dataFiles = generateDataFiles(10); - List eqDeleteFiles = generateEqDeleteFiles(10); - List 
posDeleteFiles = generatePosDeleteFiles(10); - DeltaManifests expected = - FlinkManifestUtil.writeCompletedFiles( - WriteResult.builder() - .addDataFiles(dataFiles) - .addDeleteFiles(eqDeleteFiles) - .addDeleteFiles(posDeleteFiles) - .build(), - () -> factory.create(checkpointId), - table.spec()); - - byte[] versionedSerializeData = - SimpleVersionedSerialization.writeVersionAndSerialize( - DeltaManifestsSerializer.INSTANCE, expected); - DeltaManifests actual = - SimpleVersionedSerialization.readVersionAndDeSerialize( - DeltaManifestsSerializer.INSTANCE, versionedSerializeData); - TestHelpers.assertEquals(expected.dataManifest(), actual.dataManifest()); - TestHelpers.assertEquals(expected.deleteManifest(), actual.deleteManifest()); - - byte[] versionedSerializeData2 = - SimpleVersionedSerialization.writeVersionAndSerialize( - DeltaManifestsSerializer.INSTANCE, actual); - assertThat(versionedSerializeData2).containsExactly(versionedSerializeData); - } - - @Test - public void testCompatibility() throws IOException { - // The v2 deserializer should be able to deserialize the v1 binary. - long checkpointId = 1; - String flinkJobId = newFlinkJobId(); - String operatorId = newOperatorUniqueId(); - ManifestOutputFileFactory factory = - FlinkManifestUtil.createOutputFileFactory( - () -> table, table.properties(), flinkJobId, operatorId, 1, 1); - - List dataFiles = generateDataFiles(10); - ManifestFile manifest = - FlinkManifestUtil.writeDataFiles(factory.create(checkpointId), table.spec(), dataFiles); - byte[] dataV1 = - SimpleVersionedSerialization.writeVersionAndSerialize(new V1Serializer(), manifest); - - DeltaManifests delta = - SimpleVersionedSerialization.readVersionAndDeSerialize( - DeltaManifestsSerializer.INSTANCE, dataV1); - assertThat(delta.deleteManifest()).isNull(); - assertThat(delta.dataManifest()).isNotNull(); - TestHelpers.assertEquals(manifest, delta.dataManifest()); - - List actualFiles = - FlinkManifestUtil.readDataFiles(delta.dataManifest(), table.io(), table.specs()); - assertThat(actualFiles).hasSize(10); - for (int i = 0; i < 10; i++) { - TestHelpers.assertEquals(dataFiles.get(i), actualFiles.get(i)); - } - } - - private static class V1Serializer implements SimpleVersionedSerializer { - - @Override - public int getVersion() { - return 1; - } - - @Override - public byte[] serialize(ManifestFile m) throws IOException { - return ManifestFiles.encode(m); - } - - @Override - public ManifestFile deserialize(int version, byte[] serialized) throws IOException { - return ManifestFiles.decode(serialized); - } - } - - private DataFile writeDataFile(String filename, List rows) throws IOException { - return SimpleDataUtil.writeFile( - table, - table.schema(), - table.spec(), - CONF, - table.location(), - FileFormat.PARQUET.addExtension(filename), - rows); - } - - private DeleteFile writeEqDeleteFile(String filename, List deletes) throws IOException { - return SimpleDataUtil.writeEqDeleteFile( - table, FileFormat.PARQUET, filename, appenderFactory, deletes); - } - - private DeleteFile writePosDeleteFile(String filename, List> positions) - throws IOException { - return SimpleDataUtil.writePosDeleteFile( - table, FileFormat.PARQUET, filename, appenderFactory, positions); - } - - private List generateDataFiles(int fileNum) throws IOException { - List rowDataList = Lists.newArrayList(); - List dataFiles = Lists.newArrayList(); - for (int i = 0; i < fileNum; i++) { - rowDataList.add(SimpleDataUtil.createRowData(i, "a" + i)); - dataFiles.add(writeDataFile("data-file-" + 
fileCount.incrementAndGet(), rowDataList)); - } - return dataFiles; - } - - private List generateEqDeleteFiles(int fileNum) throws IOException { - List rowDataList = Lists.newArrayList(); - List deleteFiles = Lists.newArrayList(); - for (int i = 0; i < fileNum; i++) { - rowDataList.add(SimpleDataUtil.createDelete(i, "a" + i)); - deleteFiles.add( - writeEqDeleteFile("eq-delete-file-" + fileCount.incrementAndGet(), rowDataList)); - } - return deleteFiles; - } - - private List generatePosDeleteFiles(int fileNum) throws IOException { - List> positions = Lists.newArrayList(); - List deleteFiles = Lists.newArrayList(); - for (int i = 0; i < fileNum; i++) { - positions.add(Pair.of("data-file-1", (long) i)); - deleteFiles.add( - writePosDeleteFile("pos-delete-file-" + fileCount.incrementAndGet(), positions)); - } - return deleteFiles; - } - - private static String newFlinkJobId() { - return UUID.randomUUID().toString(); - } - - private static String newOperatorUniqueId() { - return UUID.randomUUID().toString(); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPartitioningWriters.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPartitioningWriters.java deleted file mode 100644 index f79a3e634071..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPartitioningWriters.java +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */
-package org.apache.iceberg.flink.sink;
-
-import java.util.List;
-import org.apache.flink.table.data.RowData;
-import org.apache.flink.table.types.logical.RowType;
-import org.apache.iceberg.Schema;
-import org.apache.iceberg.flink.FlinkSchemaUtil;
-import org.apache.iceberg.flink.RowDataWrapper;
-import org.apache.iceberg.flink.SimpleDataUtil;
-import org.apache.iceberg.io.FileWriterFactory;
-import org.apache.iceberg.io.TestPartitioningWriters;
-import org.apache.iceberg.util.ArrayUtil;
-import org.apache.iceberg.util.StructLikeSet;
-
-public class TestFlinkPartitioningWriters extends TestPartitioningWriters<RowData> {
-
-  @Override
-  protected FileWriterFactory<RowData> newWriterFactory(
-      Schema dataSchema,
-      List<Integer> equalityFieldIds,
-      Schema equalityDeleteRowSchema,
-      Schema positionDeleteRowSchema) {
-    return FlinkFileWriterFactory.builderFor(table)
-        .dataSchema(table.schema())
-        .dataFileFormat(format())
-        .deleteFileFormat(format())
-        .equalityFieldIds(ArrayUtil.toIntArray(equalityFieldIds))
-        .equalityDeleteRowSchema(equalityDeleteRowSchema)
-        .positionDeleteRowSchema(positionDeleteRowSchema)
-        .build();
-  }
-
-  @Override
-  protected RowData toRow(Integer id, String data) {
-    return SimpleDataUtil.createRowData(id, data);
-  }
-
-  @Override
-  protected StructLikeSet toSet(Iterable<RowData> rows) {
-    StructLikeSet set = StructLikeSet.create(table.schema().asStruct());
-    RowType flinkType = FlinkSchemaUtil.convert(table.schema());
-    for (RowData row : rows) {
-      RowDataWrapper wrapper = new RowDataWrapper(flinkType, table.schema().asStruct());
-      set.add(wrapper.wrap(row));
-    }
-    return set;
-  }
-}
diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPositionDeltaWriters.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPositionDeltaWriters.java
deleted file mode 100644
index 3050752d1c24..000000000000
--- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPositionDeltaWriters.java
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.iceberg.flink.sink;
-
-import java.util.List;
-import org.apache.flink.table.data.RowData;
-import org.apache.flink.table.types.logical.RowType;
-import org.apache.iceberg.Schema;
-import org.apache.iceberg.flink.FlinkSchemaUtil;
-import org.apache.iceberg.flink.RowDataWrapper;
-import org.apache.iceberg.flink.SimpleDataUtil;
-import org.apache.iceberg.io.FileWriterFactory;
-import org.apache.iceberg.io.TestPositionDeltaWriters;
-import org.apache.iceberg.util.ArrayUtil;
-import org.apache.iceberg.util.StructLikeSet;
-
-public class TestFlinkPositionDeltaWriters extends TestPositionDeltaWriters<RowData> {
-
-  @Override
-  protected FileWriterFactory<RowData> newWriterFactory(
-      Schema dataSchema,
-      List<Integer> equalityFieldIds,
-      Schema equalityDeleteRowSchema,
-      Schema positionDeleteRowSchema) {
-    return FlinkFileWriterFactory.builderFor(table)
-        .dataSchema(table.schema())
-        .dataFileFormat(format())
-        .deleteFileFormat(format())
-        .equalityFieldIds(ArrayUtil.toIntArray(equalityFieldIds))
-        .equalityDeleteRowSchema(equalityDeleteRowSchema)
-        .positionDeleteRowSchema(positionDeleteRowSchema)
-        .build();
-  }
-
-  @Override
-  protected RowData toRow(Integer id, String data) {
-    return SimpleDataUtil.createRowData(id, data);
-  }
-
-  @Override
-  protected StructLikeSet toSet(Iterable<RowData> rows) {
-    StructLikeSet set = StructLikeSet.create(table.schema().asStruct());
-    RowType flinkType = FlinkSchemaUtil.convert(table.schema());
-    for (RowData row : rows) {
-      RowDataWrapper wrapper = new RowDataWrapper(flinkType, table.schema().asStruct());
-      set.add(wrapper.wrap(row));
-    }
-    return set;
-  }
-}
diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkRollingFileWriters.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkRollingFileWriters.java
deleted file mode 100644
index 03051b69cf87..000000000000
--- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkRollingFileWriters.java
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.iceberg.flink.sink;
-
-import java.util.List;
-import org.apache.flink.table.data.RowData;
-import org.apache.iceberg.Schema;
-import org.apache.iceberg.flink.SimpleDataUtil;
-import org.apache.iceberg.io.FileWriterFactory;
-import org.apache.iceberg.io.TestRollingFileWriters;
-import org.apache.iceberg.util.ArrayUtil;
-
-public class TestFlinkRollingFileWriters extends TestRollingFileWriters<RowData> {
-
-  @Override
-  protected FileWriterFactory<RowData> newWriterFactory(
-      Schema dataSchema,
-      List<Integer> equalityFieldIds,
-      Schema equalityDeleteRowSchema,
-      Schema positionDeleteRowSchema) {
-    return FlinkFileWriterFactory.builderFor(table)
-        .dataSchema(table.schema())
-        .dataFileFormat(format())
-        .deleteFileFormat(format())
-        .equalityFieldIds(ArrayUtil.toIntArray(equalityFieldIds))
-        .equalityDeleteRowSchema(equalityDeleteRowSchema)
-        .positionDeleteRowSchema(positionDeleteRowSchema)
-        .build();
-  }
-
-  @Override
-  protected RowData toRow(Integer id, String data) {
-    return SimpleDataUtil.createRowData(id, data);
-  }
-}
diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkWriterMetrics.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkWriterMetrics.java
deleted file mode 100644
index e6d64ef2c720..000000000000
--- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkWriterMetrics.java
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.iceberg.flink.sink;
-
-import org.apache.flink.table.data.GenericRowData;
-import org.apache.flink.table.data.RowData;
-import org.apache.flink.table.data.StringData;
-import org.apache.iceberg.FileFormat;
-import org.apache.iceberg.Table;
-import org.apache.iceberg.io.FileWriterFactory;
-import org.apache.iceberg.io.TestWriterMetrics;
-
-public class TestFlinkWriterMetrics extends TestWriterMetrics<RowData> {
-
-  public TestFlinkWriterMetrics(FileFormat fileFormat) {
-    super(fileFormat);
-  }
-
-  @Override
-  protected FileWriterFactory<RowData> newWriterFactory(Table sourceTable) {
-    return FlinkFileWriterFactory.builderFor(sourceTable)
-        .dataSchema(sourceTable.schema())
-        .dataFileFormat(fileFormat)
-        .deleteFileFormat(fileFormat)
-        .positionDeleteRowSchema(sourceTable.schema())
-        .build();
-  }
-
-  @Override
-  protected RowData toRow(Integer id, String data, boolean boolValue, Long longValue) {
-    GenericRowData nested = GenericRowData.of(boolValue, longValue);
-    GenericRowData row = GenericRowData.of(id, StringData.fromString(data), nested);
-    return row;
-  }
-
-  @Override
-  public RowData toGenericRow(int value, int repeated) {
-    GenericRowData row = new GenericRowData(repeated);
-    for (int i = 0; i < repeated; i++) {
-      row.setField(i, value);
-    }
-    return row;
-  }
-}
diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergFilesCommitter.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergFilesCommitter.java
deleted file mode 100644
index 948c7b31430c..000000000000
--- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergFilesCommitter.java
+++ /dev/null
@@ -1,1148 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */ -package org.apache.iceberg.flink.sink; - -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; -import static org.apache.iceberg.flink.sink.IcebergFilesCommitter.MAX_CONTINUOUS_EMPTY_COMMITS; -import static org.apache.iceberg.flink.sink.ManifestOutputFileFactory.FLINK_MANIFEST_LOCATION; -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assumptions.assumeThat; - -import java.io.File; -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; -import java.util.NavigableMap; -import java.util.SortedMap; -import java.util.stream.Collectors; -import org.apache.flink.api.common.ExecutionConfig; -import org.apache.flink.api.common.JobID; -import org.apache.flink.api.common.state.ListState; -import org.apache.flink.api.common.state.OperatorStateStore; -import org.apache.flink.core.io.SimpleVersionedSerialization; -import org.apache.flink.runtime.checkpoint.OperatorSubtaskState; -import org.apache.flink.runtime.jobgraph.OperatorID; -import org.apache.flink.runtime.operators.testutils.MockEnvironment; -import org.apache.flink.runtime.operators.testutils.MockEnvironmentBuilder; -import org.apache.flink.runtime.operators.testutils.MockInputSplitProvider; -import org.apache.flink.streaming.api.operators.AbstractStreamOperatorFactory; -import org.apache.flink.streaming.api.operators.BoundedOneInput; -import org.apache.flink.streaming.api.operators.OneInputStreamOperatorFactory; -import org.apache.flink.streaming.api.operators.StreamOperator; -import org.apache.flink.streaming.api.operators.StreamOperatorParameters; -import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness; -import org.apache.flink.table.data.RowData; -import org.apache.hadoop.conf.Configuration; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.DeleteFile; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.GenericManifestFile; -import org.apache.iceberg.ManifestContent; -import org.apache.iceberg.ManifestFile; -import org.apache.iceberg.Parameter; -import org.apache.iceberg.ParameterizedTestExtension; -import org.apache.iceberg.Parameters; -import org.apache.iceberg.PartitionData; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.StructLike; -import org.apache.iceberg.TestBase; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.SimpleDataUtil; -import org.apache.iceberg.flink.TestHelpers; -import org.apache.iceberg.flink.TestTableLoader; -import org.apache.iceberg.io.FileAppenderFactory; -import org.apache.iceberg.io.WriteResult; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.util.Pair; -import org.apache.iceberg.util.ThreadPools; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.TestTemplate; -import org.junit.jupiter.api.extension.ExtendWith; - -@ExtendWith(ParameterizedTestExtension.class) -public class TestIcebergFilesCommitter extends TestBase { - private static final Configuration CONF = new Configuration(); - - private File flinkManifestFolder; - - @Parameter(index = 1) - private FileFormat format; - - @Parameter(index = 2) - private String branch; - - @Parameters(name = "formatVersion = {0}, fileFormat = {1}, branch = {2}") - protected static List 
parameters() { - return Arrays.asList( - new Object[] {1, FileFormat.AVRO, "main"}, - new Object[] {2, FileFormat.AVRO, "test-branch"}, - new Object[] {1, FileFormat.PARQUET, "main"}, - new Object[] {2, FileFormat.PARQUET, "test-branch"}, - new Object[] {1, FileFormat.ORC, "main"}, - new Object[] {2, FileFormat.ORC, "test-branch"}); - } - - @Override - @BeforeEach - public void setupTable() throws IOException { - flinkManifestFolder = Files.createTempDirectory(temp, "flink").toFile(); - - this.tableDir = Files.createTempDirectory(temp, "junit").toFile(); - this.metadataDir = new File(tableDir, "metadata"); - assertThat(tableDir.delete()).isTrue(); - - // Construct the iceberg table. - table = create(SimpleDataUtil.SCHEMA, PartitionSpec.unpartitioned()); - - table - .updateProperties() - .set(DEFAULT_FILE_FORMAT, format.name()) - .set(FLINK_MANIFEST_LOCATION, flinkManifestFolder.getAbsolutePath()) - .set(MAX_CONTINUOUS_EMPTY_COMMITS, "1") - .commit(); - } - - @TestTemplate - public void testCommitTxnWithoutDataFiles() throws Exception { - long checkpointId = 0; - long timestamp = 0; - JobID jobId = new JobID(); - OperatorID operatorId; - try (OneInputStreamOperatorTestHarness harness = createStreamSink(jobId)) { - harness.setup(); - harness.open(); - operatorId = harness.getOperator().getOperatorID(); - - SimpleDataUtil.assertTableRows(table, Lists.newArrayList(), branch); - assertSnapshotSize(0); - assertMaxCommittedCheckpointId(jobId, operatorId, -1L); - - // It's better to advance the max-committed-checkpoint-id in iceberg snapshot, so that the - // future flink job - // failover won't fail. - for (int i = 1; i <= 3; i++) { - harness.snapshot(++checkpointId, ++timestamp); - assertFlinkManifests(0); - - harness.notifyOfCompletedCheckpoint(checkpointId); - assertFlinkManifests(0); - - assertSnapshotSize(i); - assertMaxCommittedCheckpointId(jobId, operatorId, checkpointId); - } - } - } - - @TestTemplate - public void testMaxContinuousEmptyCommits() throws Exception { - table.updateProperties().set(MAX_CONTINUOUS_EMPTY_COMMITS, "3").commit(); - - JobID jobId = new JobID(); - long checkpointId = 0; - long timestamp = 0; - try (OneInputStreamOperatorTestHarness harness = createStreamSink(jobId)) { - harness.setup(); - harness.open(); - - assertSnapshotSize(0); - - for (int i = 1; i <= 9; i++) { - harness.snapshot(++checkpointId, ++timestamp); - harness.notifyOfCompletedCheckpoint(checkpointId); - - assertSnapshotSize(i / 3); - } - } - } - - private WriteResult of(DataFile dataFile) { - return WriteResult.builder().addDataFiles(dataFile).build(); - } - - @TestTemplate - public void testCommitTxn() throws Exception { - // Test with 3 continues checkpoints: - // 1. snapshotState for checkpoint#1 - // 2. notifyCheckpointComplete for checkpoint#1 - // 3. snapshotState for checkpoint#2 - // 4. notifyCheckpointComplete for checkpoint#2 - // 5. snapshotState for checkpoint#3 - // 6. 
notifyCheckpointComplete for checkpoint#3 - long timestamp = 0; - - JobID jobID = new JobID(); - OperatorID operatorId; - try (OneInputStreamOperatorTestHarness harness = createStreamSink(jobID)) { - harness.setup(); - harness.open(); - operatorId = harness.getOperator().getOperatorID(); - - assertSnapshotSize(0); - - List rows = Lists.newArrayListWithExpectedSize(3); - for (int i = 1; i <= 3; i++) { - RowData rowData = SimpleDataUtil.createRowData(i, "hello" + i); - DataFile dataFile = writeDataFile("data-" + i, ImmutableList.of(rowData)); - harness.processElement(of(dataFile), ++timestamp); - rows.add(rowData); - - harness.snapshot(i, ++timestamp); - assertFlinkManifests(1); - - harness.notifyOfCompletedCheckpoint(i); - assertFlinkManifests(0); - - SimpleDataUtil.assertTableRows(table, ImmutableList.copyOf(rows), branch); - assertSnapshotSize(i); - assertMaxCommittedCheckpointId(jobID, operatorId, i); - assertThat(SimpleDataUtil.latestSnapshot(table, branch).summary()) - .containsEntry("flink.test", TestIcebergFilesCommitter.class.getName()); - } - } - } - - @TestTemplate - public void testOrderedEventsBetweenCheckpoints() throws Exception { - // It's possible that two checkpoints happen in the following orders: - // 1. snapshotState for checkpoint#1; - // 2. snapshotState for checkpoint#2; - // 3. notifyCheckpointComplete for checkpoint#1; - // 4. notifyCheckpointComplete for checkpoint#2; - long timestamp = 0; - - JobID jobId = new JobID(); - OperatorID operatorId; - try (OneInputStreamOperatorTestHarness harness = createStreamSink(jobId)) { - harness.setup(); - harness.open(); - operatorId = harness.getOperator().getOperatorID(); - - assertMaxCommittedCheckpointId(jobId, operatorId, -1L); - - RowData row1 = SimpleDataUtil.createRowData(1, "hello"); - DataFile dataFile1 = writeDataFile("data-1", ImmutableList.of(row1)); - - harness.processElement(of(dataFile1), ++timestamp); - assertMaxCommittedCheckpointId(jobId, operatorId, -1L); - - // 1. snapshotState for checkpoint#1 - long firstCheckpointId = 1; - harness.snapshot(firstCheckpointId, ++timestamp); - assertFlinkManifests(1); - - RowData row2 = SimpleDataUtil.createRowData(2, "world"); - DataFile dataFile2 = writeDataFile("data-2", ImmutableList.of(row2)); - harness.processElement(of(dataFile2), ++timestamp); - assertMaxCommittedCheckpointId(jobId, operatorId, -1L); - - // 2. snapshotState for checkpoint#2 - long secondCheckpointId = 2; - harness.snapshot(secondCheckpointId, ++timestamp); - assertFlinkManifests(2); - - // 3. notifyCheckpointComplete for checkpoint#1 - harness.notifyOfCompletedCheckpoint(firstCheckpointId); - SimpleDataUtil.assertTableRows(table, ImmutableList.of(row1), branch); - assertMaxCommittedCheckpointId(jobId, operatorId, firstCheckpointId); - assertFlinkManifests(1); - - // 4. notifyCheckpointComplete for checkpoint#2 - harness.notifyOfCompletedCheckpoint(secondCheckpointId); - SimpleDataUtil.assertTableRows(table, ImmutableList.of(row1, row2), branch); - assertMaxCommittedCheckpointId(jobId, operatorId, secondCheckpointId); - assertFlinkManifests(0); - } - } - - @TestTemplate - public void testDisorderedEventsBetweenCheckpoints() throws Exception { - // It's possible that the two checkpoints happen in the following orders: - // 1. snapshotState for checkpoint#1; - // 2. snapshotState for checkpoint#2; - // 3. notifyCheckpointComplete for checkpoint#2; - // 4. 
notifyCheckpointComplete for checkpoint#1; - long timestamp = 0; - - JobID jobId = new JobID(); - OperatorID operatorId; - try (OneInputStreamOperatorTestHarness harness = createStreamSink(jobId)) { - harness.setup(); - harness.open(); - operatorId = harness.getOperator().getOperatorID(); - - assertMaxCommittedCheckpointId(jobId, operatorId, -1L); - - RowData row1 = SimpleDataUtil.createRowData(1, "hello"); - DataFile dataFile1 = writeDataFile("data-1", ImmutableList.of(row1)); - - harness.processElement(of(dataFile1), ++timestamp); - assertMaxCommittedCheckpointId(jobId, operatorId, -1L); - - // 1. snapshotState for checkpoint#1 - long firstCheckpointId = 1; - harness.snapshot(firstCheckpointId, ++timestamp); - assertFlinkManifests(1); - - RowData row2 = SimpleDataUtil.createRowData(2, "world"); - DataFile dataFile2 = writeDataFile("data-2", ImmutableList.of(row2)); - harness.processElement(of(dataFile2), ++timestamp); - assertMaxCommittedCheckpointId(jobId, operatorId, -1L); - - // 2. snapshotState for checkpoint#2 - long secondCheckpointId = 2; - harness.snapshot(secondCheckpointId, ++timestamp); - assertFlinkManifests(2); - - // 3. notifyCheckpointComplete for checkpoint#2 - harness.notifyOfCompletedCheckpoint(secondCheckpointId); - SimpleDataUtil.assertTableRows(table, ImmutableList.of(row1, row2), branch); - assertMaxCommittedCheckpointId(jobId, operatorId, secondCheckpointId); - assertFlinkManifests(0); - - // 4. notifyCheckpointComplete for checkpoint#1 - harness.notifyOfCompletedCheckpoint(firstCheckpointId); - SimpleDataUtil.assertTableRows(table, ImmutableList.of(row1, row2), branch); - assertMaxCommittedCheckpointId(jobId, operatorId, secondCheckpointId); - assertFlinkManifests(0); - } - } - - @TestTemplate - public void testRecoveryFromValidSnapshot() throws Exception { - long checkpointId = 0; - long timestamp = 0; - List expectedRows = Lists.newArrayList(); - OperatorSubtaskState snapshot; - - JobID jobId = new JobID(); - OperatorID operatorId; - try (OneInputStreamOperatorTestHarness harness = createStreamSink(jobId)) { - harness.setup(); - harness.open(); - operatorId = harness.getOperator().getOperatorID(); - - assertSnapshotSize(0); - assertMaxCommittedCheckpointId(jobId, operatorId, -1L); - - RowData row = SimpleDataUtil.createRowData(1, "hello"); - expectedRows.add(row); - DataFile dataFile1 = writeDataFile("data-1", ImmutableList.of(row)); - - harness.processElement(of(dataFile1), ++timestamp); - snapshot = harness.snapshot(++checkpointId, ++timestamp); - assertFlinkManifests(1); - - harness.notifyOfCompletedCheckpoint(checkpointId); - assertFlinkManifests(0); - - SimpleDataUtil.assertTableRows(table, ImmutableList.of(row), branch); - assertSnapshotSize(1); - assertMaxCommittedCheckpointId(jobId, operatorId, checkpointId); - } - - // Restore from the given snapshot - try (OneInputStreamOperatorTestHarness harness = createStreamSink(jobId)) { - harness.getStreamConfig().setOperatorID(operatorId); - harness.setup(); - harness.initializeState(snapshot); - harness.open(); - - SimpleDataUtil.assertTableRows(table, expectedRows, branch); - assertSnapshotSize(1); - assertMaxCommittedCheckpointId(jobId, operatorId, checkpointId); - - RowData row = SimpleDataUtil.createRowData(2, "world"); - expectedRows.add(row); - DataFile dataFile = writeDataFile("data-2", ImmutableList.of(row)); - harness.processElement(of(dataFile), ++timestamp); - - harness.snapshot(++checkpointId, ++timestamp); - assertFlinkManifests(1); - - harness.notifyOfCompletedCheckpoint(checkpointId); - 
assertFlinkManifests(0); - - SimpleDataUtil.assertTableRows(table, expectedRows, branch); - assertSnapshotSize(2); - assertMaxCommittedCheckpointId(jobId, operatorId, checkpointId); - } - } - - @TestTemplate - public void testRecoveryFromSnapshotWithoutCompletedNotification() throws Exception { - // We've two steps in checkpoint: 1. snapshotState(ckp); 2. notifyCheckpointComplete(ckp). It's - // possible that we - // flink job will restore from a checkpoint with only step#1 finished. - long checkpointId = 0; - long timestamp = 0; - OperatorSubtaskState snapshot; - List expectedRows = Lists.newArrayList(); - JobID jobId = new JobID(); - OperatorID operatorId; - try (OneInputStreamOperatorTestHarness harness = createStreamSink(jobId)) { - harness.setup(); - harness.open(); - operatorId = harness.getOperator().getOperatorID(); - - assertSnapshotSize(0); - assertMaxCommittedCheckpointId(jobId, operatorId, -1L); - - RowData row = SimpleDataUtil.createRowData(1, "hello"); - expectedRows.add(row); - DataFile dataFile = writeDataFile("data-1", ImmutableList.of(row)); - harness.processElement(of(dataFile), ++timestamp); - - snapshot = harness.snapshot(++checkpointId, ++timestamp); - SimpleDataUtil.assertTableRows(table, ImmutableList.of(), branch); - assertMaxCommittedCheckpointId(jobId, operatorId, -1L); - assertFlinkManifests(1); - } - - try (OneInputStreamOperatorTestHarness harness = createStreamSink(jobId)) { - harness.getStreamConfig().setOperatorID(operatorId); - harness.setup(); - harness.initializeState(snapshot); - harness.open(); - - // All flink manifests should be cleaned because it has committed the unfinished iceberg - // transaction. - assertFlinkManifests(0); - - SimpleDataUtil.assertTableRows(table, expectedRows, branch); - assertMaxCommittedCheckpointId(jobId, operatorId, checkpointId); - - harness.snapshot(++checkpointId, ++timestamp); - // Did not write any new record, so it won't generate new manifest. - assertFlinkManifests(0); - - harness.notifyOfCompletedCheckpoint(checkpointId); - assertFlinkManifests(0); - - SimpleDataUtil.assertTableRows(table, expectedRows, branch); - assertSnapshotSize(2); - assertMaxCommittedCheckpointId(jobId, operatorId, checkpointId); - - RowData row = SimpleDataUtil.createRowData(2, "world"); - expectedRows.add(row); - DataFile dataFile = writeDataFile("data-2", ImmutableList.of(row)); - harness.processElement(of(dataFile), ++timestamp); - - snapshot = harness.snapshot(++checkpointId, ++timestamp); - assertFlinkManifests(1); - } - - // Redeploying flink job from external checkpoint. - JobID newJobId = new JobID(); - try (OneInputStreamOperatorTestHarness harness = - createStreamSink(newJobId)) { - harness.setup(); - harness.initializeState(snapshot); - harness.open(); - operatorId = harness.getOperator().getOperatorID(); - - // All flink manifests should be cleaned because it has committed the unfinished iceberg - // transaction. 
- assertFlinkManifests(0); - - assertMaxCommittedCheckpointId(newJobId, operatorId, -1); - assertMaxCommittedCheckpointId(jobId, operatorId, checkpointId); - SimpleDataUtil.assertTableRows(table, expectedRows, branch); - assertSnapshotSize(3); - - RowData row = SimpleDataUtil.createRowData(3, "foo"); - expectedRows.add(row); - DataFile dataFile = writeDataFile("data-3", ImmutableList.of(row)); - harness.processElement(of(dataFile), ++timestamp); - - harness.snapshot(++checkpointId, ++timestamp); - assertFlinkManifests(1); - - harness.notifyOfCompletedCheckpoint(checkpointId); - assertFlinkManifests(0); - - SimpleDataUtil.assertTableRows(table, expectedRows, branch); - assertSnapshotSize(4); - assertMaxCommittedCheckpointId(newJobId, operatorId, checkpointId); - } - } - - @TestTemplate - public void testStartAnotherJobToWriteSameTable() throws Exception { - long checkpointId = 0; - long timestamp = 0; - List rows = Lists.newArrayList(); - List tableRows = Lists.newArrayList(); - - JobID oldJobId = new JobID(); - OperatorID oldOperatorId; - try (OneInputStreamOperatorTestHarness harness = - createStreamSink(oldJobId)) { - harness.setup(); - harness.open(); - oldOperatorId = harness.getOperator().getOperatorID(); - - assertSnapshotSize(0); - assertMaxCommittedCheckpointId(oldJobId, oldOperatorId, -1L); - - for (int i = 1; i <= 3; i++) { - rows.add(SimpleDataUtil.createRowData(i, "hello" + i)); - tableRows.addAll(rows); - - DataFile dataFile = writeDataFile(String.format("data-%d", i), rows); - harness.processElement(of(dataFile), ++timestamp); - harness.snapshot(++checkpointId, ++timestamp); - assertFlinkManifests(1); - - harness.notifyOfCompletedCheckpoint(checkpointId); - assertFlinkManifests(0); - - SimpleDataUtil.assertTableRows(table, tableRows, branch); - assertSnapshotSize(i); - assertMaxCommittedCheckpointId(oldJobId, oldOperatorId, checkpointId); - } - } - - // The new started job will start with checkpoint = 1 again. 
- checkpointId = 0; - timestamp = 0; - JobID newJobId = new JobID(); - OperatorID newOperatorId; - try (OneInputStreamOperatorTestHarness harness = - createStreamSink(newJobId)) { - harness.setup(); - harness.open(); - newOperatorId = harness.getOperator().getOperatorID(); - - assertSnapshotSize(3); - assertMaxCommittedCheckpointId(oldJobId, oldOperatorId, 3); - assertMaxCommittedCheckpointId(newJobId, newOperatorId, -1); - - rows.add(SimpleDataUtil.createRowData(2, "world")); - tableRows.addAll(rows); - - DataFile dataFile = writeDataFile("data-new-1", rows); - harness.processElement(of(dataFile), ++timestamp); - harness.snapshot(++checkpointId, ++timestamp); - assertFlinkManifests(1); - - harness.notifyOfCompletedCheckpoint(checkpointId); - assertFlinkManifests(0); - SimpleDataUtil.assertTableRows(table, tableRows, branch); - assertSnapshotSize(4); - assertMaxCommittedCheckpointId(newJobId, newOperatorId, checkpointId); - } - } - - @TestTemplate - public void testMultipleJobsWriteSameTable() throws Exception { - long timestamp = 0; - List tableRows = Lists.newArrayList(); - - JobID[] jobs = new JobID[] {new JobID(), new JobID(), new JobID()}; - OperatorID[] operatorIds = - new OperatorID[] {new OperatorID(), new OperatorID(), new OperatorID()}; - for (int i = 0; i < 20; i++) { - int jobIndex = i % 3; - int checkpointId = i / 3; - JobID jobId = jobs[jobIndex]; - OperatorID operatorId = operatorIds[jobIndex]; - try (OneInputStreamOperatorTestHarness harness = createStreamSink(jobId)) { - harness.getStreamConfig().setOperatorID(operatorId); - harness.setup(); - harness.open(); - - assertSnapshotSize(i); - assertMaxCommittedCheckpointId(jobId, operatorId, checkpointId == 0 ? -1 : checkpointId); - - List rows = Lists.newArrayList(SimpleDataUtil.createRowData(i, "word-" + i)); - tableRows.addAll(rows); - - DataFile dataFile = writeDataFile(String.format("data-%d", i), rows); - harness.processElement(of(dataFile), ++timestamp); - harness.snapshot(checkpointId + 1, ++timestamp); - assertFlinkManifests(1); - - harness.notifyOfCompletedCheckpoint(checkpointId + 1); - assertFlinkManifests(0); - SimpleDataUtil.assertTableRows(table, tableRows, branch); - assertSnapshotSize(i + 1); - assertMaxCommittedCheckpointId(jobId, operatorId, checkpointId + 1); - } - } - } - - @TestTemplate - public void testMultipleSinksRecoveryFromValidSnapshot() throws Exception { - long checkpointId = 0; - long timestamp = 0; - List expectedRows = Lists.newArrayList(); - OperatorSubtaskState snapshot1; - OperatorSubtaskState snapshot2; - - JobID jobId = new JobID(); - OperatorID operatorId1 = new OperatorID(); - OperatorID operatorId2 = new OperatorID(); - try (OneInputStreamOperatorTestHarness harness1 = createStreamSink(jobId); - OneInputStreamOperatorTestHarness harness2 = createStreamSink(jobId)) { - harness1.getStreamConfig().setOperatorID(operatorId1); - harness1.setup(); - harness1.open(); - harness2.getStreamConfig().setOperatorID(operatorId2); - harness2.setup(); - harness2.open(); - - assertSnapshotSize(0); - assertMaxCommittedCheckpointId(jobId, operatorId1, -1L); - assertMaxCommittedCheckpointId(jobId, operatorId2, -1L); - - RowData row1 = SimpleDataUtil.createRowData(1, "hello1"); - expectedRows.add(row1); - DataFile dataFile1 = writeDataFile("data-1-1", ImmutableList.of(row1)); - - harness1.processElement(of(dataFile1), ++timestamp); - snapshot1 = harness1.snapshot(++checkpointId, ++timestamp); - - RowData row2 = SimpleDataUtil.createRowData(1, "hello2"); - expectedRows.add(row2); - DataFile dataFile2 = 
writeDataFile("data-1-2", ImmutableList.of(row2)); - - harness2.processElement(of(dataFile2), ++timestamp); - snapshot2 = harness2.snapshot(checkpointId, ++timestamp); - assertFlinkManifests(2); - - // Only notify one of the committers - harness1.notifyOfCompletedCheckpoint(checkpointId); - assertFlinkManifests(1); - - // Only the first row is committed at this point - SimpleDataUtil.assertTableRows(table, ImmutableList.of(row1), branch); - assertSnapshotSize(1); - assertMaxCommittedCheckpointId(jobId, operatorId1, checkpointId); - assertMaxCommittedCheckpointId(jobId, operatorId2, -1); - } - - // Restore from the given snapshot - try (OneInputStreamOperatorTestHarness harness1 = createStreamSink(jobId); - OneInputStreamOperatorTestHarness harness2 = createStreamSink(jobId)) { - harness1.getStreamConfig().setOperatorID(operatorId1); - harness1.setup(); - harness1.initializeState(snapshot1); - harness1.open(); - - harness2.getStreamConfig().setOperatorID(operatorId2); - harness2.setup(); - harness2.initializeState(snapshot2); - harness2.open(); - - // All flink manifests should be cleaned because it has committed the unfinished iceberg - // transaction. - assertFlinkManifests(0); - - SimpleDataUtil.assertTableRows(table, expectedRows, branch); - assertSnapshotSize(2); - assertMaxCommittedCheckpointId(jobId, operatorId1, checkpointId); - assertMaxCommittedCheckpointId(jobId, operatorId2, checkpointId); - - RowData row1 = SimpleDataUtil.createRowData(2, "world1"); - expectedRows.add(row1); - DataFile dataFile1 = writeDataFile("data-2-1", ImmutableList.of(row1)); - - harness1.processElement(of(dataFile1), ++timestamp); - harness1.snapshot(++checkpointId, ++timestamp); - - RowData row2 = SimpleDataUtil.createRowData(2, "world2"); - expectedRows.add(row2); - DataFile dataFile2 = writeDataFile("data-2-2", ImmutableList.of(row2)); - harness2.processElement(of(dataFile2), ++timestamp); - harness2.snapshot(checkpointId, ++timestamp); - - assertFlinkManifests(2); - - harness1.notifyOfCompletedCheckpoint(checkpointId); - harness2.notifyOfCompletedCheckpoint(checkpointId); - assertFlinkManifests(0); - - SimpleDataUtil.assertTableRows(table, expectedRows, branch); - assertSnapshotSize(4); - assertMaxCommittedCheckpointId(jobId, operatorId1, checkpointId); - assertMaxCommittedCheckpointId(jobId, operatorId2, checkpointId); - } - } - - @TestTemplate - public void testBoundedStream() throws Exception { - JobID jobId = new JobID(); - OperatorID operatorId; - try (OneInputStreamOperatorTestHarness harness = createStreamSink(jobId)) { - harness.setup(); - harness.open(); - operatorId = harness.getOperator().getOperatorID(); - - assertFlinkManifests(0); - assertSnapshotSize(0); - assertMaxCommittedCheckpointId(jobId, operatorId, -1L); - - List tableRows = Lists.newArrayList(SimpleDataUtil.createRowData(1, "word-1")); - - DataFile dataFile = writeDataFile("data-1", tableRows); - harness.processElement(of(dataFile), 1); - ((BoundedOneInput) harness.getOneInputOperator()).endInput(); - - assertFlinkManifests(0); - SimpleDataUtil.assertTableRows(table, tableRows, branch); - assertSnapshotSize(1); - assertMaxCommittedCheckpointId(jobId, operatorId, Long.MAX_VALUE); - assertThat(SimpleDataUtil.latestSnapshot(table, branch).summary()) - .containsEntry("flink.test", TestIcebergFilesCommitter.class.getName()); - } - } - - @TestTemplate - public void testFlinkManifests() throws Exception { - long timestamp = 0; - final long checkpoint = 10; - - JobID jobId = new JobID(); - OperatorID operatorId; - try 
(OneInputStreamOperatorTestHarness harness = createStreamSink(jobId)) { - harness.setup(); - harness.open(); - operatorId = harness.getOperator().getOperatorID(); - - assertMaxCommittedCheckpointId(jobId, operatorId, -1L); - - RowData row1 = SimpleDataUtil.createRowData(1, "hello"); - DataFile dataFile1 = writeDataFile("data-1", ImmutableList.of(row1)); - - harness.processElement(of(dataFile1), ++timestamp); - assertMaxCommittedCheckpointId(jobId, operatorId, -1L); - - // 1. snapshotState for checkpoint#1 - harness.snapshot(checkpoint, ++timestamp); - List manifestPaths = assertFlinkManifests(1); - Path manifestPath = manifestPaths.get(0); - assertThat(manifestPath.getFileName()) - .asString() - .isEqualTo( - String.format("%s-%s-%05d-%d-%d-%05d.avro", jobId, operatorId, 0, 0, checkpoint, 1)); - - // 2. Read the data files from manifests and assert. - List dataFiles = - FlinkManifestUtil.readDataFiles( - createTestingManifestFile(manifestPath), table.io(), table.specs()); - assertThat(dataFiles).hasSize(1); - TestHelpers.assertEquals(dataFile1, dataFiles.get(0)); - - // 3. notifyCheckpointComplete for checkpoint#1 - harness.notifyOfCompletedCheckpoint(checkpoint); - SimpleDataUtil.assertTableRows(table, ImmutableList.of(row1), branch); - assertMaxCommittedCheckpointId(jobId, operatorId, checkpoint); - assertFlinkManifests(0); - } - } - - @TestTemplate - public void testDeleteFiles() throws Exception { - assumeThat(formatVersion) - .as("Only support equality-delete in format v2 or later.") - .isGreaterThan(1); - - long timestamp = 0; - long checkpoint = 10; - - JobID jobId = new JobID(); - OperatorID operatorId; - FileAppenderFactory appenderFactory = createDeletableAppenderFactory(); - - try (OneInputStreamOperatorTestHarness harness = createStreamSink(jobId)) { - harness.setup(); - harness.open(); - operatorId = harness.getOperator().getOperatorID(); - - assertMaxCommittedCheckpointId(jobId, operatorId, -1L); - - RowData row1 = SimpleDataUtil.createInsert(1, "aaa"); - DataFile dataFile1 = writeDataFile("data-file-1", ImmutableList.of(row1)); - harness.processElement(of(dataFile1), ++timestamp); - assertMaxCommittedCheckpointId(jobId, operatorId, -1L); - - // 1. snapshotState for checkpoint#1 - harness.snapshot(checkpoint, ++timestamp); - List manifestPaths = assertFlinkManifests(1); - Path manifestPath = manifestPaths.get(0); - assertThat(manifestPath.getFileName()) - .asString() - .isEqualTo( - String.format("%s-%s-%05d-%d-%d-%05d.avro", jobId, operatorId, 0, 0, checkpoint, 1)); - - // 2. Read the data files from manifests and assert. - List dataFiles = - FlinkManifestUtil.readDataFiles( - createTestingManifestFile(manifestPath), table.io(), table.specs()); - assertThat(dataFiles).hasSize(1); - TestHelpers.assertEquals(dataFile1, dataFiles.get(0)); - - // 3. notifyCheckpointComplete for checkpoint#1 - harness.notifyOfCompletedCheckpoint(checkpoint); - SimpleDataUtil.assertTableRows(table, ImmutableList.of(row1), branch); - assertMaxCommittedCheckpointId(jobId, operatorId, checkpoint); - assertFlinkManifests(0); - - // 4. process both data files and delete files. 
- RowData row2 = SimpleDataUtil.createInsert(2, "bbb"); - DataFile dataFile2 = writeDataFile("data-file-2", ImmutableList.of(row2)); - - RowData delete1 = SimpleDataUtil.createDelete(1, "aaa"); - DeleteFile deleteFile1 = - writeEqDeleteFile(appenderFactory, "delete-file-1", ImmutableList.of(delete1)); - harness.processElement( - WriteResult.builder().addDataFiles(dataFile2).addDeleteFiles(deleteFile1).build(), - ++timestamp); - assertMaxCommittedCheckpointId(jobId, operatorId, checkpoint); - - // 5. snapshotState for checkpoint#2 - harness.snapshot(++checkpoint, ++timestamp); - assertFlinkManifests(2); - - // 6. notifyCheckpointComplete for checkpoint#2 - harness.notifyOfCompletedCheckpoint(checkpoint); - SimpleDataUtil.assertTableRows(table, ImmutableList.of(row2), branch); - assertMaxCommittedCheckpointId(jobId, operatorId, checkpoint); - assertFlinkManifests(0); - } - } - - @TestTemplate - public void testCommitTwoCheckpointsInSingleTxn() throws Exception { - assumeThat(formatVersion) - .as("Only support equality-delete in format v2 or later.") - .isGreaterThan(1); - - long timestamp = 0; - long checkpoint = 10; - - JobID jobId = new JobID(); - OperatorID operatorId; - FileAppenderFactory appenderFactory = createDeletableAppenderFactory(); - - try (OneInputStreamOperatorTestHarness harness = createStreamSink(jobId)) { - harness.setup(); - harness.open(); - operatorId = harness.getOperator().getOperatorID(); - - assertMaxCommittedCheckpointId(jobId, operatorId, -1L); - - RowData insert1 = SimpleDataUtil.createInsert(1, "aaa"); - RowData insert2 = SimpleDataUtil.createInsert(2, "bbb"); - RowData delete3 = SimpleDataUtil.createDelete(3, "ccc"); - DataFile dataFile1 = writeDataFile("data-file-1", ImmutableList.of(insert1, insert2)); - DeleteFile deleteFile1 = - writeEqDeleteFile(appenderFactory, "delete-file-1", ImmutableList.of(delete3)); - harness.processElement( - WriteResult.builder().addDataFiles(dataFile1).addDeleteFiles(deleteFile1).build(), - ++timestamp); - - // The 1th snapshotState. - harness.snapshot(checkpoint, ++timestamp); - - RowData insert4 = SimpleDataUtil.createInsert(4, "ddd"); - RowData delete2 = SimpleDataUtil.createDelete(2, "bbb"); - DataFile dataFile2 = writeDataFile("data-file-2", ImmutableList.of(insert4)); - DeleteFile deleteFile2 = - writeEqDeleteFile(appenderFactory, "delete-file-2", ImmutableList.of(delete2)); - harness.processElement( - WriteResult.builder().addDataFiles(dataFile2).addDeleteFiles(deleteFile2).build(), - ++timestamp); - - // The 2nd snapshotState. - harness.snapshot(++checkpoint, ++timestamp); - - // Notify the 2nd snapshot to complete. 
- harness.notifyOfCompletedCheckpoint(checkpoint); - SimpleDataUtil.assertTableRows(table, ImmutableList.of(insert1, insert4), branch); - assertMaxCommittedCheckpointId(jobId, operatorId, checkpoint); - assertFlinkManifests(0); - assertThat(table.snapshots()).hasSize(2); - } - } - - @TestTemplate - public void testSpecEvolution() throws Exception { - long timestamp = 0; - int checkpointId = 0; - List rows = Lists.newArrayList(); - JobID jobId = new JobID(); - - OperatorID operatorId; - OperatorSubtaskState snapshot; - DataFile dataFile; - int specId; - - try (OneInputStreamOperatorTestHarness harness = createStreamSink(jobId)) { - harness.setup(); - harness.open(); - operatorId = harness.getOperator().getOperatorID(); - - assertSnapshotSize(0); - - checkpointId++; - RowData rowData = SimpleDataUtil.createRowData(checkpointId, "hello" + checkpointId); - // table unpartitioned - dataFile = writeDataFile("data-" + checkpointId, ImmutableList.of(rowData)); - harness.processElement(of(dataFile), ++timestamp); - rows.add(rowData); - harness.snapshot(checkpointId, ++timestamp); - - specId = - getStagingManifestSpecId(harness.getOperator().getOperatorStateBackend(), checkpointId); - assertThat(specId).isEqualTo(table.spec().specId()); - - harness.notifyOfCompletedCheckpoint(checkpointId); - - // Change partition spec - table.refresh(); - PartitionSpec oldSpec = table.spec(); - table.updateSpec().addField("id").commit(); - - checkpointId++; - rowData = SimpleDataUtil.createRowData(checkpointId, "hello" + checkpointId); - // write data with old partition spec - dataFile = writeDataFile("data-" + checkpointId, ImmutableList.of(rowData), oldSpec, null); - harness.processElement(of(dataFile), ++timestamp); - rows.add(rowData); - snapshot = harness.snapshot(checkpointId, ++timestamp); - - specId = - getStagingManifestSpecId(harness.getOperator().getOperatorStateBackend(), checkpointId); - assertThat(specId).isEqualTo(oldSpec.specId()); - - harness.notifyOfCompletedCheckpoint(checkpointId); - - assertFlinkManifests(0); - - SimpleDataUtil.assertTableRows(table, ImmutableList.copyOf(rows), branch); - assertSnapshotSize(checkpointId); - assertMaxCommittedCheckpointId(jobId, operatorId, checkpointId); - } - - // Restore from the given snapshot - try (OneInputStreamOperatorTestHarness harness = createStreamSink(jobId)) { - harness.getStreamConfig().setOperatorID(operatorId); - harness.setup(); - harness.initializeState(snapshot); - harness.open(); - - SimpleDataUtil.assertTableRows(table, rows, branch); - assertSnapshotSize(checkpointId); - assertMaxCommittedCheckpointId(jobId, operatorId, checkpointId); - - checkpointId++; - RowData row = SimpleDataUtil.createRowData(checkpointId, "world" + checkpointId); - StructLike partition = new PartitionData(table.spec().partitionType()); - partition.set(0, checkpointId); - dataFile = - writeDataFile("data-" + checkpointId, ImmutableList.of(row), table.spec(), partition); - harness.processElement(of(dataFile), ++timestamp); - rows.add(row); - harness.snapshot(checkpointId, ++timestamp); - assertFlinkManifests(1); - - specId = - getStagingManifestSpecId(harness.getOperator().getOperatorStateBackend(), checkpointId); - assertThat(specId).isEqualTo(table.spec().specId()); - - harness.notifyOfCompletedCheckpoint(checkpointId); - assertFlinkManifests(0); - - SimpleDataUtil.assertTableRows(table, rows, branch); - assertSnapshotSize(checkpointId); - assertMaxCommittedCheckpointId(jobId, operatorId, checkpointId); - } - } - - private int 
getStagingManifestSpecId(OperatorStateStore operatorStateStore, long checkPointId) - throws Exception { - ListState> checkpointsState = - operatorStateStore.getListState(IcebergFilesCommitter.buildStateDescriptor()); - NavigableMap statedDataFiles = - Maps.newTreeMap(checkpointsState.get().iterator().next()); - DeltaManifests deltaManifests = - SimpleVersionedSerialization.readVersionAndDeSerialize( - DeltaManifestsSerializer.INSTANCE, statedDataFiles.get(checkPointId)); - return deltaManifests.dataManifest().partitionSpecId(); - } - - private DeleteFile writeEqDeleteFile( - FileAppenderFactory appenderFactory, String filename, List deletes) - throws IOException { - return SimpleDataUtil.writeEqDeleteFile(table, format, filename, appenderFactory, deletes); - } - - private DeleteFile writePosDeleteFile( - FileAppenderFactory appenderFactory, - String filename, - List> positions) - throws IOException { - return SimpleDataUtil.writePosDeleteFile(table, format, filename, appenderFactory, positions); - } - - private FileAppenderFactory createDeletableAppenderFactory() { - int[] equalityFieldIds = - new int[] { - table.schema().findField("id").fieldId(), table.schema().findField("data").fieldId() - }; - return new FlinkAppenderFactory( - table, - table.schema(), - FlinkSchemaUtil.convert(table.schema()), - table.properties(), - table.spec(), - equalityFieldIds, - table.schema(), - null); - } - - private ManifestFile createTestingManifestFile(Path manifestPath) { - return new GenericManifestFile( - manifestPath.toAbsolutePath().toString(), - manifestPath.toFile().length(), - 0, - ManifestContent.DATA, - 0, - 0, - 0L, - 0, - 0, - 0, - 0, - 0, - 0, - null, - null); - } - - private List assertFlinkManifests(int expectedCount) throws IOException { - List manifests = - Files.list(flinkManifestFolder.toPath()) - .filter(p -> !p.toString().endsWith(".crc")) - .collect(Collectors.toList()); - assertThat(manifests).hasSize(expectedCount); - return manifests; - } - - private DataFile writeDataFile(String filename, List rows) throws IOException { - return SimpleDataUtil.writeFile( - table, - table.schema(), - table.spec(), - CONF, - table.location(), - format.addExtension(filename), - rows); - } - - private DataFile writeDataFile( - String filename, List rows, PartitionSpec spec, StructLike partition) - throws IOException { - return SimpleDataUtil.writeFile( - table, - table.schema(), - spec, - CONF, - table.location(), - format.addExtension(filename), - rows, - partition); - } - - private void assertMaxCommittedCheckpointId(JobID jobID, OperatorID operatorID, long expectedId) { - table.refresh(); - long actualId = - IcebergFilesCommitter.getMaxCommittedCheckpointId( - table, jobID.toString(), operatorID.toHexString(), branch); - assertThat(actualId).isEqualTo(expectedId); - } - - private void assertSnapshotSize(int expectedSnapshotSize) { - table.refresh(); - assertThat(table.snapshots()).hasSize(expectedSnapshotSize); - } - - private OneInputStreamOperatorTestHarness createStreamSink(JobID jobID) - throws Exception { - TestOperatorFactory factory = TestOperatorFactory.of(table.location(), branch, table.spec()); - return new OneInputStreamOperatorTestHarness<>(factory, createEnvironment(jobID)); - } - - private static MockEnvironment createEnvironment(JobID jobID) { - return new MockEnvironmentBuilder() - .setTaskName("test task") - .setManagedMemorySize(32 * 1024) - .setInputSplitProvider(new MockInputSplitProvider()) - .setBufferSize(256) - .setTaskConfiguration(new 
org.apache.flink.configuration.Configuration()) - .setExecutionConfig(new ExecutionConfig()) - .setMaxParallelism(16) - .setJobID(jobID) - .build(); - } - - private static class TestOperatorFactory extends AbstractStreamOperatorFactory - implements OneInputStreamOperatorFactory { - private final String tablePath; - private final String branch; - private final PartitionSpec spec; - - private TestOperatorFactory(String tablePath, String branch, PartitionSpec spec) { - this.tablePath = tablePath; - this.branch = branch; - this.spec = spec; - } - - private static TestOperatorFactory of(String tablePath, String branch, PartitionSpec spec) { - return new TestOperatorFactory(tablePath, branch, spec); - } - - @Override - @SuppressWarnings("unchecked") - public > T createStreamOperator( - StreamOperatorParameters param) { - IcebergFilesCommitter committer = - new IcebergFilesCommitter( - new TestTableLoader(tablePath), - false, - Collections.singletonMap("flink.test", TestIcebergFilesCommitter.class.getName()), - ThreadPools.WORKER_THREAD_POOL_SIZE, - branch, - spec); - committer.setup(param.getContainingTask(), param.getStreamConfig(), param.getOutput()); - return (T) committer; - } - - @Override - public Class getStreamOperatorClass(ClassLoader classLoader) { - return IcebergFilesCommitter.class; - } - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergStreamWriter.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergStreamWriter.java deleted file mode 100644 index 50283f7ad215..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergStreamWriter.java +++ /dev/null @@ -1,390 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.File; -import java.io.IOException; -import java.nio.file.Files; -import java.util.Arrays; -import java.util.List; -import java.util.Locale; -import java.util.Map; -import java.util.Set; -import org.apache.flink.streaming.api.operators.BoundedOneInput; -import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness; -import org.apache.flink.table.api.DataTypes; -import org.apache.flink.table.api.TableSchema; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.LocatedFileStatus; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.fs.RemoteIterator; -import org.apache.iceberg.AppendFiles; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Parameter; -import org.apache.iceberg.ParameterizedTestExtension; -import org.apache.iceberg.Parameters; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.data.GenericRecord; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.FlinkWriteConf; -import org.apache.iceberg.flink.SimpleDataUtil; -import org.apache.iceberg.hadoop.HadoopTables; -import org.apache.iceberg.io.WriteResult; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.apache.iceberg.types.Types; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.TestTemplate; -import org.junit.jupiter.api.extension.ExtendWith; -import org.junit.jupiter.api.io.TempDir; - -@ExtendWith(ParameterizedTestExtension.class) -public class TestIcebergStreamWriter { - @TempDir protected java.nio.file.Path temporaryFolder; - - private Table table; - - @Parameter(index = 0) - private FileFormat format; - - @Parameter(index = 1) - private boolean partitioned; - - @Parameters(name = "format = {0}, partitioned = {1}") - public static Object[][] parameters() { - return new Object[][] { - {FileFormat.AVRO, true}, - {FileFormat.AVRO, false}, - {FileFormat.ORC, true}, - {FileFormat.ORC, false}, - {FileFormat.PARQUET, true}, - {FileFormat.PARQUET, false} - }; - } - - @BeforeEach - public void before() throws IOException { - File folder = Files.createTempDirectory(temporaryFolder, "junit").toFile(); - // Construct the iceberg table. 
- Map props = ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()); - table = SimpleDataUtil.createTable(folder.getAbsolutePath(), props, partitioned); - } - - @TestTemplate - public void testWritingTable() throws Exception { - long checkpointId = 1L; - try (OneInputStreamOperatorTestHarness testHarness = - createIcebergStreamWriter()) { - // The first checkpoint - testHarness.processElement(SimpleDataUtil.createRowData(1, "hello"), 1); - testHarness.processElement(SimpleDataUtil.createRowData(2, "world"), 1); - testHarness.processElement(SimpleDataUtil.createRowData(3, "hello"), 1); - - testHarness.prepareSnapshotPreBarrier(checkpointId); - int expectedDataFiles = partitioned ? 2 : 1; - WriteResult result = WriteResult.builder().addAll(testHarness.extractOutputValues()).build(); - assertThat(result.deleteFiles()).isEmpty(); - assertThat(result.dataFiles()).hasSize(expectedDataFiles); - - checkpointId = checkpointId + 1; - - // The second checkpoint - testHarness.processElement(SimpleDataUtil.createRowData(4, "foo"), 1); - testHarness.processElement(SimpleDataUtil.createRowData(5, "bar"), 2); - - testHarness.prepareSnapshotPreBarrier(checkpointId); - expectedDataFiles = partitioned ? 4 : 2; - result = WriteResult.builder().addAll(testHarness.extractOutputValues()).build(); - assertThat(result.deleteFiles()).isEmpty(); - assertThat(result.dataFiles()).hasSize(expectedDataFiles); - - // Commit the iceberg transaction. - AppendFiles appendFiles = table.newAppend(); - Arrays.stream(result.dataFiles()).forEach(appendFiles::appendFile); - appendFiles.commit(); - - // Assert the table records. - SimpleDataUtil.assertTableRecords( - table, - Lists.newArrayList( - SimpleDataUtil.createRecord(1, "hello"), - SimpleDataUtil.createRecord(2, "world"), - SimpleDataUtil.createRecord(3, "hello"), - SimpleDataUtil.createRecord(4, "foo"), - SimpleDataUtil.createRecord(5, "bar"))); - } - } - - @TestTemplate - public void testSnapshotTwice() throws Exception { - long checkpointId = 1; - long timestamp = 1; - try (OneInputStreamOperatorTestHarness testHarness = - createIcebergStreamWriter()) { - testHarness.processElement(SimpleDataUtil.createRowData(1, "hello"), timestamp++); - testHarness.processElement(SimpleDataUtil.createRowData(2, "world"), timestamp); - - testHarness.prepareSnapshotPreBarrier(checkpointId++); - int expectedDataFiles = partitioned ? 2 : 1; - WriteResult result = WriteResult.builder().addAll(testHarness.extractOutputValues()).build(); - assertThat(result.deleteFiles()).isEmpty(); - assertThat(result.dataFiles()).hasSize(expectedDataFiles); - - // snapshot again immediately. - for (int i = 0; i < 5; i++) { - testHarness.prepareSnapshotPreBarrier(checkpointId++); - - result = WriteResult.builder().addAll(testHarness.extractOutputValues()).build(); - assertThat(result.deleteFiles()).isEmpty(); - assertThat(result.dataFiles()).hasSize(expectedDataFiles); - } - } - } - - @TestTemplate - public void testTableWithoutSnapshot() throws Exception { - try (OneInputStreamOperatorTestHarness testHarness = - createIcebergStreamWriter()) { - assertThat(testHarness.extractOutputValues()).isEmpty(); - } - // Even if we closed the iceberg stream writer, there's no orphan data file. - assertThat(scanDataFiles()).isEmpty(); - - try (OneInputStreamOperatorTestHarness testHarness = - createIcebergStreamWriter()) { - testHarness.processElement(SimpleDataUtil.createRowData(1, "hello"), 1); - // Still not emit the data file yet, because there is no checkpoint. 
- assertThat(testHarness.extractOutputValues()).isEmpty(); - } - // Once we closed the iceberg stream writer, there will left an orphan data file. - assertThat(scanDataFiles()).hasSize(1); - } - - private Set scanDataFiles() throws IOException { - Path dataDir = new Path(table.location(), "data"); - FileSystem fs = FileSystem.get(new Configuration()); - if (!fs.exists(dataDir)) { - return ImmutableSet.of(); - } else { - Set paths = Sets.newHashSet(); - RemoteIterator iterators = fs.listFiles(dataDir, true); - while (iterators.hasNext()) { - LocatedFileStatus status = iterators.next(); - if (status.isFile()) { - Path path = status.getPath(); - if (path.getName().endsWith("." + format.toString().toLowerCase(Locale.ROOT))) { - paths.add(path.toString()); - } - } - } - return paths; - } - } - - @TestTemplate - public void testBoundedStreamCloseWithEmittingDataFiles() throws Exception { - try (OneInputStreamOperatorTestHarness testHarness = - createIcebergStreamWriter()) { - testHarness.processElement(SimpleDataUtil.createRowData(1, "hello"), 1); - testHarness.processElement(SimpleDataUtil.createRowData(2, "world"), 2); - - assertThat(testHarness.getOneInputOperator()).isInstanceOf(BoundedOneInput.class); - ((BoundedOneInput) testHarness.getOneInputOperator()).endInput(); - - int expectedDataFiles = partitioned ? 2 : 1; - WriteResult result = WriteResult.builder().addAll(testHarness.extractOutputValues()).build(); - assertThat(result.deleteFiles()).isEmpty(); - assertThat(result.dataFiles()).hasSize(expectedDataFiles); - - ((BoundedOneInput) testHarness.getOneInputOperator()).endInput(); - - result = WriteResult.builder().addAll(testHarness.extractOutputValues()).build(); - assertThat(result.deleteFiles()).isEmpty(); - // Datafiles should not be sent again - assertThat(result.dataFiles()).hasSize(expectedDataFiles); - } - } - - @TestTemplate - public void testBoundedStreamTriggeredEndInputBeforeTriggeringCheckpoint() throws Exception { - try (OneInputStreamOperatorTestHarness testHarness = - createIcebergStreamWriter()) { - testHarness.processElement(SimpleDataUtil.createRowData(1, "hello"), 1); - testHarness.processElement(SimpleDataUtil.createRowData(2, "world"), 2); - - testHarness.endInput(); - - int expectedDataFiles = partitioned ? 2 : 1; - WriteResult result = WriteResult.builder().addAll(testHarness.extractOutputValues()).build(); - assertThat(result.deleteFiles()).isEmpty(); - assertThat(result.dataFiles()).hasSize(expectedDataFiles); - - testHarness.prepareSnapshotPreBarrier(1L); - - result = WriteResult.builder().addAll(testHarness.extractOutputValues()).build(); - assertThat(result.deleteFiles()).isEmpty(); - // It should be ensured that after endInput is triggered, when prepareSnapshotPreBarrier - // is triggered, write should only send WriteResult once - assertThat(result.dataFiles()).hasSize(expectedDataFiles); - } - } - - @TestTemplate - public void testTableWithTargetFileSize() throws Exception { - // Adjust the target-file-size in table properties. 
- table - .updateProperties() - .set(TableProperties.WRITE_TARGET_FILE_SIZE_BYTES, "4") // ~4 bytes; low enough to trigger - .commit(); - - List rows = Lists.newArrayListWithCapacity(8000); - List records = Lists.newArrayListWithCapacity(8000); - for (int i = 0; i < 2000; i++) { - for (String data : new String[] {"a", "b", "c", "d"}) { - rows.add(SimpleDataUtil.createRowData(i, data)); - records.add(SimpleDataUtil.createRecord(i, data)); - } - } - - try (OneInputStreamOperatorTestHarness testHarness = - createIcebergStreamWriter()) { - for (RowData row : rows) { - testHarness.processElement(row, 1); - } - - // snapshot the operator. - testHarness.prepareSnapshotPreBarrier(1); - WriteResult result = WriteResult.builder().addAll(testHarness.extractOutputValues()).build(); - assertThat(result.deleteFiles()).isEmpty(); - assertThat(result.dataFiles()).hasSize(8); - - // Assert that the data file have the expected records. - for (DataFile dataFile : result.dataFiles()) { - assertThat(dataFile.recordCount()).isEqualTo(1000); - } - - // Commit the iceberg transaction. - AppendFiles appendFiles = table.newAppend(); - Arrays.stream(result.dataFiles()).forEach(appendFiles::appendFile); - appendFiles.commit(); - } - - // Assert the table records. - SimpleDataUtil.assertTableRecords(table, records); - } - - @TestTemplate - public void testPromotedFlinkDataType() throws Exception { - Schema iSchema = - new Schema( - Types.NestedField.required(1, "tinyint", Types.IntegerType.get()), - Types.NestedField.required(2, "smallint", Types.IntegerType.get()), - Types.NestedField.optional(3, "int", Types.IntegerType.get())); - TableSchema flinkSchema = - TableSchema.builder() - .field("tinyint", DataTypes.TINYINT().notNull()) - .field("smallint", DataTypes.SMALLINT().notNull()) - .field("int", DataTypes.INT().nullable()) - .build(); - - PartitionSpec spec; - if (partitioned) { - spec = - PartitionSpec.builderFor(iSchema) - .identity("smallint") - .identity("tinyint") - .identity("int") - .build(); - } else { - spec = PartitionSpec.unpartitioned(); - } - - String location = - Files.createTempDirectory(temporaryFolder, "junit").toFile().getAbsolutePath(); - Map props = ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()); - Table icebergTable = new HadoopTables().create(iSchema, spec, props, location); - - List rows = - Lists.newArrayList( - GenericRowData.of((byte) 0x01, (short) -32768, 101), - GenericRowData.of((byte) 0x02, (short) 0, 102), - GenericRowData.of((byte) 0x03, (short) 32767, 103)); - - Record record = GenericRecord.create(iSchema); - List expected = - Lists.newArrayList( - record.copy(ImmutableMap.of("tinyint", 1, "smallint", -32768, "int", 101)), - record.copy(ImmutableMap.of("tinyint", 2, "smallint", 0, "int", 102)), - record.copy(ImmutableMap.of("tinyint", 3, "smallint", 32767, "int", 103))); - - try (OneInputStreamOperatorTestHarness testHarness = - createIcebergStreamWriter(icebergTable, flinkSchema)) { - for (RowData row : rows) { - testHarness.processElement(row, 1); - } - testHarness.prepareSnapshotPreBarrier(1); - WriteResult result = WriteResult.builder().addAll(testHarness.extractOutputValues()).build(); - assertThat(result.deleteFiles()).isEmpty(); - assertThat(result.dataFiles()).hasSize(partitioned ? 3 : 1); - - // Commit the iceberg transaction. 
- AppendFiles appendFiles = icebergTable.newAppend(); - Arrays.stream(result.dataFiles()).forEach(appendFiles::appendFile); - appendFiles.commit(); - } - - SimpleDataUtil.assertTableRecords(location, expected); - } - - private OneInputStreamOperatorTestHarness createIcebergStreamWriter() - throws Exception { - return createIcebergStreamWriter(table, SimpleDataUtil.FLINK_SCHEMA); - } - - private OneInputStreamOperatorTestHarness createIcebergStreamWriter( - Table icebergTable, TableSchema flinkSchema) throws Exception { - RowType flinkRowType = FlinkSink.toFlinkRowType(icebergTable.schema(), flinkSchema); - FlinkWriteConf flinkWriteConfig = - new FlinkWriteConf( - icebergTable, Maps.newHashMap(), new org.apache.flink.configuration.Configuration()); - - IcebergStreamWriter streamWriter = - FlinkSink.createStreamWriter(() -> icebergTable, flinkWriteConfig, flinkRowType, null); - OneInputStreamOperatorTestHarness harness = - new OneInputStreamOperatorTestHarness<>(streamWriter, 1, 1, 0); - - harness.setup(); - harness.open(); - - return harness; - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestRowDataPartitionKey.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestRowDataPartitionKey.java deleted file mode 100644 index 919fef579ab0..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestRowDataPartitionKey.java +++ /dev/null @@ -1,251 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.util.List; -import java.util.stream.Collectors; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.PartitionKey; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.data.InternalRecordWrapper; -import org.apache.iceberg.data.RandomGenericData; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.RowDataWrapper; -import org.apache.iceberg.flink.data.RandomRowData; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Types; -import org.junit.jupiter.api.Test; - -public class TestRowDataPartitionKey { - private static final Schema SCHEMA = - new Schema( - Types.NestedField.required(0, "boolType", Types.BooleanType.get()), - Types.NestedField.required(1, "id", Types.IntegerType.get()), - Types.NestedField.required(2, "longType", Types.LongType.get()), - Types.NestedField.required(3, "dateType", Types.DateType.get()), - Types.NestedField.required(4, "timeType", Types.TimeType.get()), - Types.NestedField.required(5, "stringType", Types.StringType.get()), - Types.NestedField.required(6, "timestampWithoutZone", Types.TimestampType.withoutZone()), - Types.NestedField.required(7, "timestampWithZone", Types.TimestampType.withZone()), - Types.NestedField.required(8, "fixedType", Types.FixedType.ofLength(5)), - Types.NestedField.required(9, "uuidType", Types.UUIDType.get()), - Types.NestedField.required(10, "binaryType", Types.BinaryType.get()), - Types.NestedField.required(11, "decimalType1", Types.DecimalType.of(18, 3)), - Types.NestedField.required(12, "decimalType2", Types.DecimalType.of(10, 5)), - Types.NestedField.required(13, "decimalType3", Types.DecimalType.of(38, 19)), - Types.NestedField.required(14, "floatType", Types.FloatType.get()), - Types.NestedField.required(15, "doubleType", Types.DoubleType.get())); - - private static final List SUPPORTED_PRIMITIVES = - SCHEMA.asStruct().fields().stream().map(Types.NestedField::name).collect(Collectors.toList()); - - private static final Schema NESTED_SCHEMA = - new Schema( - Types.NestedField.required( - 1, - "structType", - Types.StructType.of( - Types.NestedField.optional(2, "innerStringType", Types.StringType.get()), - Types.NestedField.optional(3, "innerIntegerType", Types.IntegerType.get())))); - - @Test - public void testNullPartitionValue() { - Schema schema = - new Schema( - Types.NestedField.optional(1, "id", Types.IntegerType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get())); - - PartitionSpec spec = PartitionSpec.builderFor(schema).identity("data").build(); - - List rows = - Lists.newArrayList( - GenericRowData.of(1, StringData.fromString("a")), - GenericRowData.of(2, StringData.fromString("b")), - GenericRowData.of(3, null)); - - RowDataWrapper rowWrapper = - new RowDataWrapper(FlinkSchemaUtil.convert(schema), schema.asStruct()); - - for (RowData row : rows) { - PartitionKey partitionKey = new PartitionKey(spec, schema); - partitionKey.partition(rowWrapper.wrap(row)); - assertThat(partitionKey.size()).isEqualTo(1); - - String expectedStr = row.isNullAt(1) ? 
null : row.getString(1).toString(); - assertThat(partitionKey.get(0, String.class)).isEqualTo(expectedStr); - } - } - - @Test - public void testPartitionWithOneNestedField() { - RowDataWrapper rowWrapper = - new RowDataWrapper(FlinkSchemaUtil.convert(NESTED_SCHEMA), NESTED_SCHEMA.asStruct()); - List records = RandomGenericData.generate(NESTED_SCHEMA, 10, 1991); - List rows = Lists.newArrayList(RandomRowData.convert(NESTED_SCHEMA, records)); - - PartitionSpec spec1 = - PartitionSpec.builderFor(NESTED_SCHEMA).identity("structType.innerStringType").build(); - PartitionSpec spec2 = - PartitionSpec.builderFor(NESTED_SCHEMA).identity("structType.innerIntegerType").build(); - - for (int i = 0; i < rows.size(); i++) { - RowData row = rows.get(i); - Record record = (Record) records.get(i).get(0); - - PartitionKey partitionKey1 = new PartitionKey(spec1, NESTED_SCHEMA); - partitionKey1.partition(rowWrapper.wrap(row)); - assertThat(partitionKey1.size()).isEqualTo(1); - - assertThat(partitionKey1.get(0, String.class)).isEqualTo(record.get(0)); - - PartitionKey partitionKey2 = new PartitionKey(spec2, NESTED_SCHEMA); - partitionKey2.partition(rowWrapper.wrap(row)); - assertThat(partitionKey2.size()).isEqualTo(1); - - assertThat(partitionKey2.get(0, Integer.class)).isEqualTo(record.get(1)); - } - } - - @Test - public void testPartitionMultipleNestedField() { - RowDataWrapper rowWrapper = - new RowDataWrapper(FlinkSchemaUtil.convert(NESTED_SCHEMA), NESTED_SCHEMA.asStruct()); - List records = RandomGenericData.generate(NESTED_SCHEMA, 10, 1992); - List rows = Lists.newArrayList(RandomRowData.convert(NESTED_SCHEMA, records)); - - PartitionSpec spec1 = - PartitionSpec.builderFor(NESTED_SCHEMA) - .identity("structType.innerIntegerType") - .identity("structType.innerStringType") - .build(); - PartitionSpec spec2 = - PartitionSpec.builderFor(NESTED_SCHEMA) - .identity("structType.innerStringType") - .identity("structType.innerIntegerType") - .build(); - - PartitionKey pk1 = new PartitionKey(spec1, NESTED_SCHEMA); - PartitionKey pk2 = new PartitionKey(spec2, NESTED_SCHEMA); - - for (int i = 0; i < rows.size(); i++) { - RowData row = rows.get(i); - Record record = (Record) records.get(i).get(0); - - pk1.partition(rowWrapper.wrap(row)); - assertThat(pk1.size()).isEqualTo(2); - - assertThat(pk1.get(0, Integer.class)).isEqualTo(record.get(1)); - assertThat(pk1.get(1, String.class)).isEqualTo(record.get(0)); - - pk2.partition(rowWrapper.wrap(row)); - assertThat(pk2.size()).isEqualTo(2); - - assertThat(pk2.get(0, String.class)).isEqualTo(record.get(0)); - assertThat(pk2.get(1, Integer.class)).isEqualTo(record.get(1)); - } - } - - @Test - public void testPartitionValueTypes() { - RowType rowType = FlinkSchemaUtil.convert(SCHEMA); - RowDataWrapper rowWrapper = new RowDataWrapper(rowType, SCHEMA.asStruct()); - InternalRecordWrapper recordWrapper = new InternalRecordWrapper(SCHEMA.asStruct()); - - List records = RandomGenericData.generate(SCHEMA, 10, 1993); - List rows = Lists.newArrayList(RandomRowData.convert(SCHEMA, records)); - - for (String column : SUPPORTED_PRIMITIVES) { - PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity(column).build(); - Class[] javaClasses = spec.javaClasses(); - - PartitionKey pk = new PartitionKey(spec, SCHEMA); - PartitionKey expectedPK = new PartitionKey(spec, SCHEMA); - - for (int j = 0; j < rows.size(); j++) { - RowData row = rows.get(j); - Record record = records.get(j); - - pk.partition(rowWrapper.wrap(row)); - expectedPK.partition(recordWrapper.wrap(record)); - - 
assertThat(pk.size()) - .as("Partition with column " + column + " should have one field.") - .isEqualTo(1); - - if (column.equals("timeType")) { - assertThat(pk.get(0, Long.class) / 1000) - .as("Partition with column " + column + " should have the expected values") - .isEqualTo(expectedPK.get(0, Long.class) / 1000); - } else { - assertThat(pk.get(0, javaClasses[0])) - .as("Partition with column " + column + " should have the expected values") - .isEqualTo(expectedPK.get(0, javaClasses[0])); - } - } - } - } - - @Test - public void testNestedPartitionValues() { - Schema nestedSchema = new Schema(Types.NestedField.optional(1001, "nested", SCHEMA.asStruct())); - RowType rowType = FlinkSchemaUtil.convert(nestedSchema); - - RowDataWrapper rowWrapper = new RowDataWrapper(rowType, nestedSchema.asStruct()); - InternalRecordWrapper recordWrapper = new InternalRecordWrapper(nestedSchema.asStruct()); - - List records = RandomGenericData.generate(nestedSchema, 10, 1994); - List rows = Lists.newArrayList(RandomRowData.convert(nestedSchema, records)); - - for (String supportedPrimitive : SUPPORTED_PRIMITIVES) { - String column = String.format("nested.%s", supportedPrimitive); - - PartitionSpec spec = PartitionSpec.builderFor(nestedSchema).identity(column).build(); - Class[] javaClasses = spec.javaClasses(); - - PartitionKey pk = new PartitionKey(spec, nestedSchema); - PartitionKey expectedPK = new PartitionKey(spec, nestedSchema); - - for (int j = 0; j < rows.size(); j++) { - pk.partition(rowWrapper.wrap(rows.get(j))); - expectedPK.partition(recordWrapper.wrap(records.get(j))); - - assertThat(pk.size()) - .as("Partition with nested column " + column + " should have one field.") - .isEqualTo(1); - - if (column.equals("nested.timeType")) { - assertThat(pk.get(0, Long.class) / 1000) - .as("Partition with nested column " + column + " should have the expected values.") - .isEqualTo(expectedPK.get(0, Long.class) / 1000); - } else { - assertThat(pk.get(0, javaClasses[0])) - .as("Partition with nested column " + column + " should have the expected values.") - .isEqualTo(expectedPK.get(0, javaClasses[0])); - } - } - } - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestTaskWriters.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestTaskWriters.java deleted file mode 100644 index 8bfd6cb3d043..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/TestTaskWriters.java +++ /dev/null @@ -1,242 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.File; -import java.io.IOException; -import java.nio.file.Files; -import java.util.List; -import java.util.Map; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.iceberg.AppendFiles; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Parameter; -import org.apache.iceberg.ParameterizedTestExtension; -import org.apache.iceberg.Parameters; -import org.apache.iceberg.SerializableTable; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.SimpleDataUtil; -import org.apache.iceberg.flink.data.RandomRowData; -import org.apache.iceberg.io.TaskWriter; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.TestTemplate; -import org.junit.jupiter.api.extension.ExtendWith; -import org.junit.jupiter.api.io.TempDir; - -@ExtendWith(ParameterizedTestExtension.class) -public class TestTaskWriters { - private static final Configuration CONF = new Configuration(); - private static final long TARGET_FILE_SIZE = 128 * 1024 * 1024; - - @TempDir protected java.nio.file.Path temporaryFolder; - - @Parameters(name = "format = {0}, partitioned = {1}") - public static Object[][] parameters() { - return new Object[][] { - {FileFormat.AVRO, true}, - {FileFormat.AVRO, false}, - {FileFormat.ORC, true}, - {FileFormat.ORC, false}, - {FileFormat.PARQUET, true}, - {FileFormat.PARQUET, false} - }; - } - - @Parameter(index = 0) - private FileFormat format; - - @Parameter(index = 1) - private boolean partitioned; - - private Table table; - - @BeforeEach - public void before() throws IOException { - File folder = Files.createTempDirectory(temporaryFolder, "junit").toFile(); - // Construct the iceberg table with the specified file format. - Map props = ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()); - table = SimpleDataUtil.createTable(folder.getAbsolutePath(), props, partitioned); - } - - @TestTemplate - public void testWriteZeroRecord() throws IOException { - try (TaskWriter taskWriter = createTaskWriter(TARGET_FILE_SIZE)) { - taskWriter.close(); - - DataFile[] dataFiles = taskWriter.dataFiles(); - assertThat(dataFiles).isNotNull().isEmpty(); - - // Close again. - taskWriter.close(); - dataFiles = taskWriter.dataFiles(); - assertThat(dataFiles).isNotNull().isEmpty(); - } - } - - @TestTemplate - public void testCloseTwice() throws IOException { - try (TaskWriter taskWriter = createTaskWriter(TARGET_FILE_SIZE)) { - taskWriter.write(SimpleDataUtil.createRowData(1, "hello")); - taskWriter.write(SimpleDataUtil.createRowData(2, "world")); - taskWriter.close(); // The first close - taskWriter.close(); // The second close - - int expectedFiles = partitioned ? 
2 : 1; - DataFile[] dataFiles = taskWriter.dataFiles(); - assertThat(dataFiles).hasSize(expectedFiles); - - FileSystem fs = FileSystem.get(CONF); - for (DataFile dataFile : dataFiles) { - assertThat(fs.exists(new Path(dataFile.path().toString()))).isTrue(); - } - } - } - - @TestTemplate - public void testAbort() throws IOException { - try (TaskWriter taskWriter = createTaskWriter(TARGET_FILE_SIZE)) { - taskWriter.write(SimpleDataUtil.createRowData(1, "hello")); - taskWriter.write(SimpleDataUtil.createRowData(2, "world")); - - taskWriter.abort(); - DataFile[] dataFiles = taskWriter.dataFiles(); - - int expectedFiles = partitioned ? 2 : 1; - assertThat(dataFiles).hasSize(expectedFiles); - - FileSystem fs = FileSystem.get(CONF); - for (DataFile dataFile : dataFiles) { - assertThat(fs.exists(new Path(dataFile.path().toString()))).isFalse(); - } - } - } - - @TestTemplate - public void testCompleteFiles() throws IOException { - try (TaskWriter taskWriter = createTaskWriter(TARGET_FILE_SIZE)) { - taskWriter.write(SimpleDataUtil.createRowData(1, "a")); - taskWriter.write(SimpleDataUtil.createRowData(2, "b")); - taskWriter.write(SimpleDataUtil.createRowData(3, "c")); - taskWriter.write(SimpleDataUtil.createRowData(4, "d")); - - DataFile[] dataFiles = taskWriter.dataFiles(); - int expectedFiles = partitioned ? 4 : 1; - assertThat(dataFiles).hasSize(expectedFiles); - - dataFiles = taskWriter.dataFiles(); - assertThat(dataFiles).hasSize(expectedFiles); - - FileSystem fs = FileSystem.get(CONF); - for (DataFile dataFile : dataFiles) { - assertThat(fs.exists(new Path(dataFile.path().toString()))).isTrue(); - } - - AppendFiles appendFiles = table.newAppend(); - for (DataFile dataFile : dataFiles) { - appendFiles.appendFile(dataFile); - } - appendFiles.commit(); - - // Assert the data rows. - SimpleDataUtil.assertTableRecords( - table, - Lists.newArrayList( - SimpleDataUtil.createRecord(1, "a"), - SimpleDataUtil.createRecord(2, "b"), - SimpleDataUtil.createRecord(3, "c"), - SimpleDataUtil.createRecord(4, "d"))); - } - } - - @TestTemplate - public void testRollingWithTargetFileSize() throws IOException { - try (TaskWriter taskWriter = createTaskWriter(4)) { - List rows = Lists.newArrayListWithCapacity(8000); - List records = Lists.newArrayListWithCapacity(8000); - for (int i = 0; i < 2000; i++) { - for (String data : new String[] {"a", "b", "c", "d"}) { - rows.add(SimpleDataUtil.createRowData(i, data)); - records.add(SimpleDataUtil.createRecord(i, data)); - } - } - - for (RowData row : rows) { - taskWriter.write(row); - } - - DataFile[] dataFiles = taskWriter.dataFiles(); - assertThat(dataFiles).hasSize(8); - - AppendFiles appendFiles = table.newAppend(); - for (DataFile dataFile : dataFiles) { - appendFiles.appendFile(dataFile); - } - appendFiles.commit(); - - // Assert the data rows. - SimpleDataUtil.assertTableRecords(table, records); - } - } - - @TestTemplate - public void testRandomData() throws IOException { - try (TaskWriter taskWriter = createTaskWriter(TARGET_FILE_SIZE)) { - Iterable rows = RandomRowData.generate(SimpleDataUtil.SCHEMA, 100, 1996); - for (RowData row : rows) { - taskWriter.write(row); - } - - taskWriter.close(); - DataFile[] dataFiles = taskWriter.dataFiles(); - AppendFiles appendFiles = table.newAppend(); - for (DataFile dataFile : dataFiles) { - appendFiles.appendFile(dataFile); - } - appendFiles.commit(); - - // Assert the data rows. 
- SimpleDataUtil.assertTableRows(table, Lists.newArrayList(rows)); - } - } - - private TaskWriter createTaskWriter(long targetFileSize) { - TaskWriterFactory taskWriterFactory = - new RowDataTaskWriterFactory( - SerializableTable.copyOf(table), - (RowType) SimpleDataUtil.FLINK_SCHEMA.toRowDataType().getLogicalType(), - targetFileSize, - format, - table.properties(), - null, - false); - taskWriterFactory.initialize(1, 1); - return taskWriterFactory.create(); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/Fixtures.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/Fixtures.java deleted file mode 100644 index 5910bd685510..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/Fixtures.java +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.shuffle; - -import java.util.Comparator; -import java.util.Map; -import org.apache.flink.api.common.typeutils.TypeSerializer; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.runtime.typeutils.RowDataSerializer; -import org.apache.flink.table.types.logical.IntType; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.table.types.logical.VarCharType; -import org.apache.iceberg.Schema; -import org.apache.iceberg.SortKey; -import org.apache.iceberg.SortOrder; -import org.apache.iceberg.SortOrderComparators; -import org.apache.iceberg.StructLike; -import org.apache.iceberg.flink.RowDataWrapper; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.types.Types; - -class Fixtures { - private Fixtures() {} - - public static final int NUM_SUBTASKS = 2; - public static final Schema SCHEMA = - new Schema( - Types.NestedField.optional(1, "id", Types.StringType.get()), - Types.NestedField.optional(2, "number", Types.IntegerType.get())); - public static final RowType ROW_TYPE = RowType.of(new VarCharType(), new IntType()); - public static final TypeSerializer ROW_SERIALIZER = new RowDataSerializer(ROW_TYPE); - public static final RowDataWrapper ROW_WRAPPER = new RowDataWrapper(ROW_TYPE, SCHEMA.asStruct()); - public static final SortOrder SORT_ORDER = SortOrder.builderFor(SCHEMA).asc("id").build(); - public static final Comparator SORT_ORDER_COMPARTOR = - SortOrderComparators.forSchema(SCHEMA, SORT_ORDER); - public static final SortKeySerializer SORT_KEY_SERIALIZER = - new SortKeySerializer(SCHEMA, SORT_ORDER); - public static final DataStatisticsSerializer TASK_STATISTICS_SERIALIZER = - new DataStatisticsSerializer(SORT_KEY_SERIALIZER); - public static final GlobalStatisticsSerializer GLOBAL_STATISTICS_SERIALIZER = - new 
GlobalStatisticsSerializer(SORT_KEY_SERIALIZER); - public static final CompletedStatisticsSerializer COMPLETED_STATISTICS_SERIALIZER = - new CompletedStatisticsSerializer(SORT_KEY_SERIALIZER); - - public static final SortKey SORT_KEY = new SortKey(SCHEMA, SORT_ORDER); - public static final Map CHAR_KEYS = createCharKeys(); - - public static StatisticsEvent createStatisticsEvent( - StatisticsType type, - TypeSerializer statisticsSerializer, - long checkpointId, - SortKey... keys) { - DataStatistics statistics = createTaskStatistics(type, keys); - return StatisticsEvent.createTaskStatisticsEvent( - checkpointId, statistics, statisticsSerializer); - } - - public static DataStatistics createTaskStatistics(StatisticsType type, SortKey... keys) { - DataStatistics statistics; - if (type == StatisticsType.Sketch) { - statistics = new SketchDataStatistics(128); - } else { - statistics = new MapDataStatistics(); - } - - for (SortKey key : keys) { - statistics.add(key); - } - - return statistics; - } - - private static Map createCharKeys() { - Map keys = Maps.newHashMap(); - for (char c = 'a'; c <= 'z'; ++c) { - String key = Character.toString(c); - SortKey sortKey = SORT_KEY.copy(); - sortKey.set(0, key); - keys.put(key, sortKey); - } - - return keys; - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestAggregatedStatisticsTracker.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestAggregatedStatisticsTracker.java deleted file mode 100644 index 8322ce683768..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestAggregatedStatisticsTracker.java +++ /dev/null @@ -1,465 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink.shuffle; - -import static org.apache.iceberg.flink.sink.shuffle.Fixtures.CHAR_KEYS; -import static org.apache.iceberg.flink.sink.shuffle.Fixtures.TASK_STATISTICS_SERIALIZER; -import static org.apache.iceberg.flink.sink.shuffle.Fixtures.createStatisticsEvent; -import static org.assertj.core.api.Assertions.assertThat; - -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.params.ParameterizedTest; -import org.junit.jupiter.params.provider.EnumSource; - -public class TestAggregatedStatisticsTracker { - @ParameterizedTest - @EnumSource(StatisticsType.class) - public void receiveNewerStatisticsEvent(StatisticsType type) { - AggregatedStatisticsTracker tracker = createTracker(type); - - StatisticsEvent checkpoint1Subtask0StatisticsEvent = - createStatisticsEvent(type, TASK_STATISTICS_SERIALIZER, 1L, CHAR_KEYS.get("a")); - CompletedStatistics completedStatistics = - tracker.updateAndCheckCompletion(0, checkpoint1Subtask0StatisticsEvent); - assertThat(completedStatistics).isNull(); - assertThat(tracker.aggregationsPerCheckpoint().keySet()).containsExactlyInAnyOrder(1L); - AggregatedStatisticsTracker.Aggregation aggregation = - tracker.aggregationsPerCheckpoint().get(1L); - assertThat(aggregation.currentType()).isEqualTo(StatisticsUtil.collectType(type)); - if (StatisticsUtil.collectType(type) == StatisticsType.Map) { - assertThat(aggregation.mapStatistics()).isEqualTo(ImmutableMap.of(CHAR_KEYS.get("a"), 1L)); - } else { - assertThat(aggregation.sketchStatistics().getResult().getSamples()) - .containsExactlyInAnyOrder(CHAR_KEYS.get("a")); - } - - StatisticsEvent checkpoint2Subtask0StatisticsEvent = - createStatisticsEvent( - type, - TASK_STATISTICS_SERIALIZER, - 2L, - CHAR_KEYS.get("a"), - CHAR_KEYS.get("b"), - CHAR_KEYS.get("b")); - completedStatistics = tracker.updateAndCheckCompletion(0, checkpoint2Subtask0StatisticsEvent); - assertThat(completedStatistics).isNull(); - // both checkpoints are tracked - assertThat(tracker.aggregationsPerCheckpoint().keySet()).containsExactlyInAnyOrder(1L, 2L); - aggregation = tracker.aggregationsPerCheckpoint().get(2L); - assertThat(aggregation.currentType()).isEqualTo(StatisticsUtil.collectType(type)); - if (StatisticsUtil.collectType(type) == StatisticsType.Map) { - assertThat(aggregation.mapStatistics()) - .isEqualTo(ImmutableMap.of(CHAR_KEYS.get("a"), 1L, CHAR_KEYS.get("b"), 2L)); - } else { - assertThat(aggregation.sketchStatistics().getResult().getSamples()) - .containsExactlyInAnyOrder(CHAR_KEYS.get("a"), CHAR_KEYS.get("b"), CHAR_KEYS.get("b")); - } - - StatisticsEvent checkpoint1Subtask1StatisticsEvent = - createStatisticsEvent(type, TASK_STATISTICS_SERIALIZER, 1L, CHAR_KEYS.get("b")); - completedStatistics = tracker.updateAndCheckCompletion(1, checkpoint1Subtask1StatisticsEvent); - // checkpoint 1 is completed - assertThat(completedStatistics).isNotNull(); - assertThat(completedStatistics.type()).isEqualTo(StatisticsUtil.collectType(type)); - assertThat(completedStatistics.checkpointId()).isEqualTo(1L); - if (StatisticsUtil.collectType(type) == StatisticsType.Map) { - assertThat(completedStatistics.keyFrequency()) - .isEqualTo( - ImmutableMap.of( - CHAR_KEYS.get("a"), 1L, - CHAR_KEYS.get("b"), 1L)); - } else { - assertThat(completedStatistics.keySamples()) - .containsExactly(CHAR_KEYS.get("a"), CHAR_KEYS.get("b")); - } - - // checkpoint 2 remains - 
assertThat(tracker.aggregationsPerCheckpoint().keySet()).containsExactlyInAnyOrder(2L); - aggregation = tracker.aggregationsPerCheckpoint().get(2L); - assertThat(aggregation.currentType()).isEqualTo(StatisticsUtil.collectType(type)); - if (StatisticsUtil.collectType(type) == StatisticsType.Map) { - assertThat(aggregation.mapStatistics()) - .isEqualTo(ImmutableMap.of(CHAR_KEYS.get("a"), 1L, CHAR_KEYS.get("b"), 2L)); - } else { - assertThat(aggregation.sketchStatistics().getResult().getSamples()) - .containsExactlyInAnyOrder(CHAR_KEYS.get("a"), CHAR_KEYS.get("b"), CHAR_KEYS.get("b")); - } - } - - @ParameterizedTest - @EnumSource(StatisticsType.class) - public void receiveOlderStatisticsEventTest(StatisticsType type) { - AggregatedStatisticsTracker tracker = createTracker(type); - - StatisticsEvent checkpoint2Subtask0StatisticsEvent = - createStatisticsEvent( - type, - TASK_STATISTICS_SERIALIZER, - 2L, - CHAR_KEYS.get("a"), - CHAR_KEYS.get("b"), - CHAR_KEYS.get("b")); - CompletedStatistics completedStatistics = - tracker.updateAndCheckCompletion(0, checkpoint2Subtask0StatisticsEvent); - assertThat(completedStatistics).isNull(); - assertThat(tracker.aggregationsPerCheckpoint().keySet()).containsExactlyInAnyOrder(2L); - AggregatedStatisticsTracker.Aggregation aggregation = - tracker.aggregationsPerCheckpoint().get(2L); - assertThat(aggregation.currentType()).isEqualTo(StatisticsUtil.collectType(type)); - if (StatisticsUtil.collectType(type) == StatisticsType.Map) { - assertThat(aggregation.mapStatistics()) - .isEqualTo(ImmutableMap.of(CHAR_KEYS.get("a"), 1L, CHAR_KEYS.get("b"), 2L)); - } else { - assertThat(aggregation.sketchStatistics().getResult().getSamples()) - .containsExactlyInAnyOrder(CHAR_KEYS.get("a"), CHAR_KEYS.get("b"), CHAR_KEYS.get("b")); - } - - StatisticsEvent checkpoint1Subtask1StatisticsEvent = - createStatisticsEvent(type, TASK_STATISTICS_SERIALIZER, 1L, CHAR_KEYS.get("b")); - completedStatistics = tracker.updateAndCheckCompletion(1, checkpoint1Subtask1StatisticsEvent); - assertThat(completedStatistics).isNull(); - // both checkpoints are tracked - assertThat(tracker.aggregationsPerCheckpoint().keySet()).containsExactlyInAnyOrder(1L, 2L); - aggregation = tracker.aggregationsPerCheckpoint().get(1L); - assertThat(aggregation.currentType()).isEqualTo(StatisticsUtil.collectType(type)); - if (StatisticsUtil.collectType(type) == StatisticsType.Map) { - assertThat(aggregation.mapStatistics()).isEqualTo(ImmutableMap.of(CHAR_KEYS.get("b"), 1L)); - } else { - assertThat(aggregation.sketchStatistics().getResult().getSamples()) - .containsExactlyInAnyOrder(CHAR_KEYS.get("b")); - } - - StatisticsEvent checkpoint3Subtask0StatisticsEvent = - createStatisticsEvent(type, TASK_STATISTICS_SERIALIZER, 3L, CHAR_KEYS.get("x")); - completedStatistics = tracker.updateAndCheckCompletion(1, checkpoint3Subtask0StatisticsEvent); - assertThat(completedStatistics).isNull(); - assertThat(tracker.aggregationsPerCheckpoint().keySet()).containsExactlyInAnyOrder(1L, 2L, 3L); - aggregation = tracker.aggregationsPerCheckpoint().get(3L); - assertThat(aggregation.currentType()).isEqualTo(StatisticsUtil.collectType(type)); - if (StatisticsUtil.collectType(type) == StatisticsType.Map) { - assertThat(aggregation.mapStatistics()).isEqualTo(ImmutableMap.of(CHAR_KEYS.get("x"), 1L)); - } else { - assertThat(aggregation.sketchStatistics().getResult().getSamples()) - .containsExactlyInAnyOrder(CHAR_KEYS.get("x")); - } - - StatisticsEvent checkpoint2Subtask1StatisticsEvent = - createStatisticsEvent( - type, - 
TASK_STATISTICS_SERIALIZER, - 2L, - CHAR_KEYS.get("a"), - CHAR_KEYS.get("b"), - CHAR_KEYS.get("b")); - completedStatistics = tracker.updateAndCheckCompletion(1, checkpoint2Subtask1StatisticsEvent); - // checkpoint 1 is cleared along with checkpoint 2. checkpoint 3 remains - assertThat(tracker.aggregationsPerCheckpoint().keySet()).containsExactlyInAnyOrder(3L); - aggregation = tracker.aggregationsPerCheckpoint().get(3L); - assertThat(aggregation.currentType()).isEqualTo(StatisticsUtil.collectType(type)); - if (StatisticsUtil.collectType(type) == StatisticsType.Map) { - assertThat(aggregation.mapStatistics()).isEqualTo(ImmutableMap.of(CHAR_KEYS.get("x"), 1L)); - } else { - assertThat(aggregation.sketchStatistics().getResult().getSamples()) - .containsExactlyInAnyOrder(CHAR_KEYS.get("x")); - } - - assertThat(completedStatistics).isNotNull(); - assertThat(completedStatistics.type()).isEqualTo(StatisticsUtil.collectType(type)); - assertThat(completedStatistics.checkpointId()).isEqualTo(2L); - if (StatisticsUtil.collectType(type) == StatisticsType.Map) { - assertThat(completedStatistics.keyFrequency()) - .isEqualTo( - ImmutableMap.of( - CHAR_KEYS.get("a"), 2L, - CHAR_KEYS.get("b"), 4L)); - } else { - assertThat(completedStatistics.keySamples()) - .containsExactly( - CHAR_KEYS.get("a"), - CHAR_KEYS.get("b"), - CHAR_KEYS.get("b"), - CHAR_KEYS.get("a"), - CHAR_KEYS.get("b"), - CHAR_KEYS.get("b")); - } - } - - @ParameterizedTest - @EnumSource(StatisticsType.class) - public void receiveCompletedStatisticsEvent(StatisticsType type) { - AggregatedStatisticsTracker tracker = createTracker(type); - - StatisticsEvent checkpoint1Subtask0DataStatisticEvent = - createStatisticsEvent( - type, - TASK_STATISTICS_SERIALIZER, - 1L, - CHAR_KEYS.get("a"), - CHAR_KEYS.get("b"), - CHAR_KEYS.get("b")); - - CompletedStatistics completedStatistics = - tracker.updateAndCheckCompletion(0, checkpoint1Subtask0DataStatisticEvent); - assertThat(completedStatistics).isNull(); - assertThat(tracker.aggregationsPerCheckpoint().keySet()).containsExactlyInAnyOrder(1L); - AggregatedStatisticsTracker.Aggregation aggregation = - tracker.aggregationsPerCheckpoint().get(1L); - assertThat(aggregation.subtaskSet()).containsExactlyInAnyOrder(0); - if (StatisticsUtil.collectType(type) == StatisticsType.Map) { - assertThat(aggregation.mapStatistics()) - .isEqualTo(ImmutableMap.of(CHAR_KEYS.get("a"), 1L, CHAR_KEYS.get("b"), 2L)); - } else { - assertThat(aggregation.sketchStatistics().getResult().getSamples()) - .containsExactlyInAnyOrder(CHAR_KEYS.get("a"), CHAR_KEYS.get("b"), CHAR_KEYS.get("b")); - } - - StatisticsEvent checkpoint1Subtask1DataStatisticEvent = - createStatisticsEvent( - type, - TASK_STATISTICS_SERIALIZER, - 1L, - CHAR_KEYS.get("a"), - CHAR_KEYS.get("a"), - CHAR_KEYS.get("b")); - - // Receive data statistics from all subtasks at checkpoint 1 - completedStatistics = - tracker.updateAndCheckCompletion(1, checkpoint1Subtask1DataStatisticEvent); - assertThat(tracker.aggregationsPerCheckpoint()).isEmpty(); - - assertThat(completedStatistics).isNotNull(); - assertThat(completedStatistics.type()).isEqualTo(StatisticsUtil.collectType(type)); - assertThat(completedStatistics.checkpointId()).isEqualTo(1L); - if (StatisticsUtil.collectType(type) == StatisticsType.Map) { - assertThat(completedStatistics.keyFrequency()) - .isEqualTo( - ImmutableMap.of( - CHAR_KEYS.get("a"), 3L, - CHAR_KEYS.get("b"), 3L)); - } else { - assertThat(completedStatistics.keySamples()) - .containsExactly( - CHAR_KEYS.get("a"), - CHAR_KEYS.get("b"), - 
CHAR_KEYS.get("b"), - CHAR_KEYS.get("a"), - CHAR_KEYS.get("a"), - CHAR_KEYS.get("b")); - } - - StatisticsEvent checkpoint2Subtask0DataStatisticEvent = - createStatisticsEvent(type, TASK_STATISTICS_SERIALIZER, 2L, CHAR_KEYS.get("a")); - completedStatistics = - tracker.updateAndCheckCompletion(0, checkpoint2Subtask0DataStatisticEvent); - assertThat(completedStatistics).isNull(); - assertThat(tracker.aggregationsPerCheckpoint().keySet()).containsExactlyInAnyOrder(2L); - aggregation = tracker.aggregationsPerCheckpoint().get(2L); - assertThat(aggregation.subtaskSet()).containsExactlyInAnyOrder(0); - if (StatisticsUtil.collectType(type) == StatisticsType.Map) { - assertThat(aggregation.mapStatistics()).isEqualTo(ImmutableMap.of(CHAR_KEYS.get("a"), 1L)); - } else { - assertThat(aggregation.sketchStatistics().getResult().getSamples()) - .containsExactlyInAnyOrder(CHAR_KEYS.get("a")); - } - - StatisticsEvent checkpoint2Subtask1DataStatisticEvent = - createStatisticsEvent(type, TASK_STATISTICS_SERIALIZER, 2L, CHAR_KEYS.get("b")); - // Receive data statistics from all subtasks at checkpoint 2 - completedStatistics = - tracker.updateAndCheckCompletion(1, checkpoint2Subtask1DataStatisticEvent); - assertThat(tracker.aggregationsPerCheckpoint()).isEmpty(); - - assertThat(completedStatistics).isNotNull(); - assertThat(completedStatistics.checkpointId()).isEqualTo(2L); - if (StatisticsUtil.collectType(type) == StatisticsType.Map) { - assertThat(completedStatistics.keyFrequency()) - .isEqualTo( - ImmutableMap.of( - CHAR_KEYS.get("a"), 1L, - CHAR_KEYS.get("b"), 1L)); - } else { - assertThat(completedStatistics.keySamples()) - .containsExactly(CHAR_KEYS.get("a"), CHAR_KEYS.get("b")); - } - } - - @Test - public void coordinatorSwitchToSketchOverThreshold() { - int parallelism = 3; - int downstreamParallelism = 3; - int switchToSketchThreshold = 3; - AggregatedStatisticsTracker tracker = - new AggregatedStatisticsTracker( - "testOperator", - parallelism, - Fixtures.SCHEMA, - Fixtures.SORT_ORDER, - downstreamParallelism, - StatisticsType.Auto, - switchToSketchThreshold, - null); - - StatisticsEvent checkpoint1Subtask0StatisticsEvent = - createStatisticsEvent( - StatisticsType.Map, - TASK_STATISTICS_SERIALIZER, - 1L, - CHAR_KEYS.get("a"), - CHAR_KEYS.get("b")); - CompletedStatistics completedStatistics = - tracker.updateAndCheckCompletion(0, checkpoint1Subtask0StatisticsEvent); - assertThat(completedStatistics).isNull(); - assertThat(tracker.aggregationsPerCheckpoint().keySet()).containsExactlyInAnyOrder(1L); - AggregatedStatisticsTracker.Aggregation aggregation = - tracker.aggregationsPerCheckpoint().get(1L); - assertThat(aggregation.subtaskSet()).containsExactlyInAnyOrder(0); - assertThat(aggregation.currentType()).isEqualTo(StatisticsType.Map); - assertThat(aggregation.sketchStatistics()).isNull(); - assertThat(aggregation.mapStatistics()) - .isEqualTo(ImmutableMap.of(CHAR_KEYS.get("a"), 1L, CHAR_KEYS.get("b"), 1L)); - - StatisticsEvent checkpoint1Subtask1StatisticsEvent = - createStatisticsEvent( - StatisticsType.Map, - TASK_STATISTICS_SERIALIZER, - 1L, - CHAR_KEYS.get("c"), - CHAR_KEYS.get("d")); - completedStatistics = tracker.updateAndCheckCompletion(1, checkpoint1Subtask1StatisticsEvent); - assertThat(completedStatistics).isNull(); - assertThat(tracker.aggregationsPerCheckpoint().keySet()).containsExactlyInAnyOrder(1L); - aggregation = tracker.aggregationsPerCheckpoint().get(1L); - assertThat(aggregation.subtaskSet()).containsExactlyInAnyOrder(0, 1); - // converted to sketch statistics as map size is 4 
(over the switch threshold of 3) - assertThat(aggregation.currentType()).isEqualTo(StatisticsType.Sketch); - assertThat(aggregation.mapStatistics()).isNull(); - assertThat(aggregation.sketchStatistics().getResult().getSamples()) - .containsExactlyInAnyOrder( - CHAR_KEYS.get("a"), CHAR_KEYS.get("b"), CHAR_KEYS.get("c"), CHAR_KEYS.get("d")); - - StatisticsEvent checkpoint1Subtask2StatisticsEvent = - createStatisticsEvent( - StatisticsType.Map, - TASK_STATISTICS_SERIALIZER, - 1L, - CHAR_KEYS.get("e"), - CHAR_KEYS.get("f")); - completedStatistics = tracker.updateAndCheckCompletion(2, checkpoint1Subtask2StatisticsEvent); - assertThat(tracker.aggregationsPerCheckpoint()).isEmpty(); - assertThat(completedStatistics).isNotNull(); - assertThat(completedStatistics.type()).isEqualTo(StatisticsType.Sketch); - assertThat(completedStatistics.keySamples()) - .containsExactly( - CHAR_KEYS.get("a"), - CHAR_KEYS.get("b"), - CHAR_KEYS.get("c"), - CHAR_KEYS.get("d"), - CHAR_KEYS.get("e"), - CHAR_KEYS.get("f")); - } - - @Test - public void coordinatorMapOperatorSketch() { - int parallelism = 3; - int downstreamParallelism = 3; - AggregatedStatisticsTracker tracker = - new AggregatedStatisticsTracker( - "testOperator", - parallelism, - Fixtures.SCHEMA, - Fixtures.SORT_ORDER, - downstreamParallelism, - StatisticsType.Auto, - SketchUtil.COORDINATOR_SKETCH_SWITCH_THRESHOLD, - null); - - // first operator event has map statistics - StatisticsEvent checkpoint1Subtask0StatisticsEvent = - createStatisticsEvent( - StatisticsType.Map, - TASK_STATISTICS_SERIALIZER, - 1L, - CHAR_KEYS.get("a"), - CHAR_KEYS.get("b")); - CompletedStatistics completedStatistics = - tracker.updateAndCheckCompletion(0, checkpoint1Subtask0StatisticsEvent); - assertThat(completedStatistics).isNull(); - assertThat(tracker.aggregationsPerCheckpoint().keySet()).containsExactlyInAnyOrder(1L); - AggregatedStatisticsTracker.Aggregation aggregation = - tracker.aggregationsPerCheckpoint().get(1L); - assertThat(aggregation.subtaskSet()).containsExactlyInAnyOrder(0); - assertThat(aggregation.currentType()).isEqualTo(StatisticsType.Map); - assertThat(aggregation.sketchStatistics()).isNull(); - assertThat(aggregation.mapStatistics()) - .isEqualTo(ImmutableMap.of(CHAR_KEYS.get("a"), 1L, CHAR_KEYS.get("b"), 1L)); - - // second operator event contains sketch statistics - StatisticsEvent checkpoint1Subtask1StatisticsEvent = - createStatisticsEvent( - StatisticsType.Sketch, - TASK_STATISTICS_SERIALIZER, - 1L, - CHAR_KEYS.get("c"), - CHAR_KEYS.get("d")); - completedStatistics = tracker.updateAndCheckCompletion(1, checkpoint1Subtask1StatisticsEvent); - assertThat(completedStatistics).isNull(); - assertThat(tracker.aggregationsPerCheckpoint().keySet()).containsExactlyInAnyOrder(1L); - aggregation = tracker.aggregationsPerCheckpoint().get(1L); - assertThat(aggregation.subtaskSet()).containsExactlyInAnyOrder(0, 1); - assertThat(aggregation.currentType()).isEqualTo(StatisticsType.Sketch); - assertThat(aggregation.mapStatistics()).isNull(); - assertThat(aggregation.sketchStatistics().getResult().getSamples()) - .containsExactlyInAnyOrder( - CHAR_KEYS.get("a"), CHAR_KEYS.get("b"), CHAR_KEYS.get("c"), CHAR_KEYS.get("d")); - - // third operator event has Map statistics - StatisticsEvent checkpoint1Subtask2StatisticsEvent = - createStatisticsEvent( - StatisticsType.Map, - TASK_STATISTICS_SERIALIZER, - 1L, - CHAR_KEYS.get("e"), - CHAR_KEYS.get("f")); - completedStatistics = tracker.updateAndCheckCompletion(2, checkpoint1Subtask2StatisticsEvent); - 
assertThat(tracker.aggregationsPerCheckpoint()).isEmpty(); - assertThat(completedStatistics).isNotNull(); - assertThat(completedStatistics.type()).isEqualTo(StatisticsType.Sketch); - assertThat(completedStatistics.keySamples()) - .containsExactly( - CHAR_KEYS.get("a"), - CHAR_KEYS.get("b"), - CHAR_KEYS.get("c"), - CHAR_KEYS.get("d"), - CHAR_KEYS.get("e"), - CHAR_KEYS.get("f")); - } - - private AggregatedStatisticsTracker createTracker(StatisticsType type) { - return new AggregatedStatisticsTracker( - "testOperator", - Fixtures.NUM_SUBTASKS, - Fixtures.SCHEMA, - Fixtures.SORT_ORDER, - Fixtures.NUM_SUBTASKS, - type, - SketchUtil.COORDINATOR_SKETCH_SWITCH_THRESHOLD, - null); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestCompletedStatisticsSerializer.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestCompletedStatisticsSerializer.java deleted file mode 100644 index 4ee9888934a8..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestCompletedStatisticsSerializer.java +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.shuffle; - -import static org.apache.iceberg.flink.sink.shuffle.Fixtures.CHAR_KEYS; - -import org.apache.flink.api.common.typeutils.SerializerTestBase; -import org.apache.flink.api.common.typeutils.TypeSerializer; -import org.apache.iceberg.SortKey; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; - -public class TestCompletedStatisticsSerializer extends SerializerTestBase { - - @Override - protected TypeSerializer createSerializer() { - return Fixtures.COMPLETED_STATISTICS_SERIALIZER; - } - - @Override - protected int getLength() { - return -1; - } - - @Override - protected Class getTypeClass() { - return CompletedStatistics.class; - } - - @Override - protected CompletedStatistics[] getTestData() { - - return new CompletedStatistics[] { - CompletedStatistics.fromKeyFrequency( - 1L, ImmutableMap.of(CHAR_KEYS.get("a"), 1L, CHAR_KEYS.get("b"), 2L)), - CompletedStatistics.fromKeySamples(2L, new SortKey[] {CHAR_KEYS.get("a"), CHAR_KEYS.get("b")}) - }; - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsCoordinator.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsCoordinator.java deleted file mode 100644 index a08a8a73e80c..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsCoordinator.java +++ /dev/null @@ -1,246 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.shuffle; - -import static org.apache.iceberg.flink.sink.shuffle.Fixtures.CHAR_KEYS; -import static org.apache.iceberg.flink.sink.shuffle.Fixtures.NUM_SUBTASKS; -import static org.apache.iceberg.flink.sink.shuffle.Fixtures.SORT_ORDER_COMPARTOR; -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; - -import java.time.Duration; -import java.util.Map; -import java.util.concurrent.CompletableFuture; -import java.util.concurrent.ExecutionException; -import org.apache.flink.runtime.jobgraph.OperatorID; -import org.apache.flink.runtime.operators.coordination.EventReceivingTasks; -import org.apache.flink.runtime.operators.coordination.MockOperatorCoordinatorContext; -import org.apache.flink.util.ExceptionUtils; -import org.apache.iceberg.SortKey; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.awaitility.Awaitility; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.params.ParameterizedTest; -import org.junit.jupiter.params.provider.EnumSource; - -public class TestDataStatisticsCoordinator { - private static final String OPERATOR_NAME = "TestCoordinator"; - private static final OperatorID TEST_OPERATOR_ID = new OperatorID(1234L, 5678L); - - private EventReceivingTasks receivingTasks; - - @BeforeEach - public void before() throws Exception { - receivingTasks = EventReceivingTasks.createForRunningTasks(); - } - - private void tasksReady(DataStatisticsCoordinator coordinator) { - setAllTasksReady(NUM_SUBTASKS, coordinator, receivingTasks); - } - - @ParameterizedTest - @EnumSource(StatisticsType.class) - public void testThrowExceptionWhenNotStarted(StatisticsType type) throws Exception { - try (DataStatisticsCoordinator dataStatisticsCoordinator = createCoordinator(type)) { - String failureMessage = "The coordinator of TestCoordinator has not started yet."; - assertThatThrownBy( - () -> - dataStatisticsCoordinator.handleEventFromOperator( - 0, - 0, - StatisticsEvent.createTaskStatisticsEvent( - 0, new MapDataStatistics(), Fixtures.TASK_STATISTICS_SERIALIZER))) - .isInstanceOf(IllegalStateException.class) - .hasMessage(failureMessage); - assertThatThrownBy(() -> dataStatisticsCoordinator.executionAttemptFailed(0, 0, null)) - .isInstanceOf(IllegalStateException.class) - .hasMessage(failureMessage); - assertThatThrownBy(() -> dataStatisticsCoordinator.checkpointCoordinator(0, null)) - .isInstanceOf(IllegalStateException.class) - .hasMessage(failureMessage); - } - } - - @ParameterizedTest - @EnumSource(StatisticsType.class) - public void testDataStatisticsEventHandling(StatisticsType type) throws Exception { - try (DataStatisticsCoordinator dataStatisticsCoordinator = createCoordinator(type)) { - 
dataStatisticsCoordinator.start(); - tasksReady(dataStatisticsCoordinator); - - StatisticsEvent checkpoint1Subtask0DataStatisticEvent = - Fixtures.createStatisticsEvent( - type, - Fixtures.TASK_STATISTICS_SERIALIZER, - 1L, - CHAR_KEYS.get("a"), - CHAR_KEYS.get("b"), - CHAR_KEYS.get("b"), - CHAR_KEYS.get("c"), - CHAR_KEYS.get("c"), - CHAR_KEYS.get("c")); - StatisticsEvent checkpoint1Subtask1DataStatisticEvent = - Fixtures.createStatisticsEvent( - type, - Fixtures.TASK_STATISTICS_SERIALIZER, - 1L, - CHAR_KEYS.get("a"), - CHAR_KEYS.get("b"), - CHAR_KEYS.get("c"), - CHAR_KEYS.get("c")); - // Handle events from operators for checkpoint 1 - dataStatisticsCoordinator.handleEventFromOperator( - 0, 0, checkpoint1Subtask0DataStatisticEvent); - dataStatisticsCoordinator.handleEventFromOperator( - 1, 0, checkpoint1Subtask1DataStatisticEvent); - - waitForCoordinatorToProcessActions(dataStatisticsCoordinator); - - Map keyFrequency = - ImmutableMap.of( - CHAR_KEYS.get("a"), 2L, - CHAR_KEYS.get("b"), 3L, - CHAR_KEYS.get("c"), 5L); - MapAssignment mapAssignment = - MapAssignment.fromKeyFrequency(NUM_SUBTASKS, keyFrequency, 0.0d, SORT_ORDER_COMPARTOR); - - CompletedStatistics completedStatistics = dataStatisticsCoordinator.completedStatistics(); - assertThat(completedStatistics.checkpointId()).isEqualTo(1L); - assertThat(completedStatistics.type()).isEqualTo(StatisticsUtil.collectType(type)); - if (StatisticsUtil.collectType(type) == StatisticsType.Map) { - assertThat(completedStatistics.keyFrequency()).isEqualTo(keyFrequency); - } else { - assertThat(completedStatistics.keySamples()) - .containsExactly( - CHAR_KEYS.get("a"), - CHAR_KEYS.get("a"), - CHAR_KEYS.get("b"), - CHAR_KEYS.get("b"), - CHAR_KEYS.get("b"), - CHAR_KEYS.get("c"), - CHAR_KEYS.get("c"), - CHAR_KEYS.get("c"), - CHAR_KEYS.get("c"), - CHAR_KEYS.get("c")); - } - - GlobalStatistics globalStatistics = dataStatisticsCoordinator.globalStatistics(); - assertThat(globalStatistics.checkpointId()).isEqualTo(1L); - assertThat(globalStatistics.type()).isEqualTo(StatisticsUtil.collectType(type)); - if (StatisticsUtil.collectType(type) == StatisticsType.Map) { - assertThat(globalStatistics.mapAssignment()).isEqualTo(mapAssignment); - } else { - assertThat(globalStatistics.rangeBounds()).containsExactly(CHAR_KEYS.get("b")); - } - } - } - - @Test - public void testRequestGlobalStatisticsEventHandling() throws Exception { - try (DataStatisticsCoordinator dataStatisticsCoordinator = - createCoordinator(StatisticsType.Sketch)) { - dataStatisticsCoordinator.start(); - tasksReady(dataStatisticsCoordinator); - - // receive request before global statistics is ready - dataStatisticsCoordinator.handleEventFromOperator(0, 0, new RequestGlobalStatisticsEvent()); - assertThat(receivingTasks.getSentEventsForSubtask(0)).isEmpty(); - assertThat(receivingTasks.getSentEventsForSubtask(1)).isEmpty(); - - StatisticsEvent checkpoint1Subtask0DataStatisticEvent = - Fixtures.createStatisticsEvent( - StatisticsType.Sketch, Fixtures.TASK_STATISTICS_SERIALIZER, 1L, CHAR_KEYS.get("a")); - StatisticsEvent checkpoint1Subtask1DataStatisticEvent = - Fixtures.createStatisticsEvent( - StatisticsType.Sketch, Fixtures.TASK_STATISTICS_SERIALIZER, 1L, CHAR_KEYS.get("b")); - // Handle events from operators for checkpoint 1 - dataStatisticsCoordinator.handleEventFromOperator( - 0, 0, checkpoint1Subtask0DataStatisticEvent); - dataStatisticsCoordinator.handleEventFromOperator( - 1, 0, checkpoint1Subtask1DataStatisticEvent); - - waitForCoordinatorToProcessActions(dataStatisticsCoordinator); - 
Awaitility.await("wait for statistics event") - .pollInterval(Duration.ofMillis(10)) - .atMost(Duration.ofSeconds(10)) - .until(() -> receivingTasks.getSentEventsForSubtask(0).size() == 1); - assertThat(receivingTasks.getSentEventsForSubtask(0).get(0)) - .isInstanceOf(StatisticsEvent.class); - - Awaitility.await("wait for statistics event") - .pollInterval(Duration.ofMillis(10)) - .atMost(Duration.ofSeconds(10)) - .until(() -> receivingTasks.getSentEventsForSubtask(1).size() == 1); - assertThat(receivingTasks.getSentEventsForSubtask(1).get(0)) - .isInstanceOf(StatisticsEvent.class); - - dataStatisticsCoordinator.handleEventFromOperator(1, 0, new RequestGlobalStatisticsEvent()); - - // coordinator should send a response to subtask 1 - Awaitility.await("wait for statistics event") - .pollInterval(Duration.ofMillis(10)) - .atMost(Duration.ofSeconds(10)) - .until(() -> receivingTasks.getSentEventsForSubtask(1).size() == 2); - assertThat(receivingTasks.getSentEventsForSubtask(1).get(0)) - .isInstanceOf(StatisticsEvent.class); - assertThat(receivingTasks.getSentEventsForSubtask(1).get(1)) - .isInstanceOf(StatisticsEvent.class); - } - } - - static void setAllTasksReady( - int subtasks, - DataStatisticsCoordinator dataStatisticsCoordinator, - EventReceivingTasks receivingTasks) { - for (int i = 0; i < subtasks; i++) { - dataStatisticsCoordinator.executionAttemptReady( - i, 0, receivingTasks.createGatewayForSubtask(i, 0)); - } - } - - static void waitForCoordinatorToProcessActions(DataStatisticsCoordinator coordinator) { - CompletableFuture future = new CompletableFuture<>(); - coordinator.callInCoordinatorThread( - () -> { - future.complete(null); - return null; - }, - "Coordinator fails to process action"); - - try { - future.get(); - } catch (InterruptedException e) { - throw new AssertionError("test interrupted"); - } catch (ExecutionException e) { - ExceptionUtils.rethrow(ExceptionUtils.stripExecutionException(e)); - } - } - - private static DataStatisticsCoordinator createCoordinator(StatisticsType type) { - return new DataStatisticsCoordinator( - OPERATOR_NAME, - new MockOperatorCoordinatorContext(TEST_OPERATOR_ID, NUM_SUBTASKS), - Fixtures.SCHEMA, - Fixtures.SORT_ORDER, - NUM_SUBTASKS, - type, - 0.0d); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsCoordinatorProvider.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsCoordinatorProvider.java deleted file mode 100644 index 6317f2bfde18..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsCoordinatorProvider.java +++ /dev/null @@ -1,187 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink.shuffle; - -import static org.apache.iceberg.flink.sink.shuffle.Fixtures.CHAR_KEYS; -import static org.apache.iceberg.flink.sink.shuffle.Fixtures.SORT_ORDER_COMPARTOR; -import static org.apache.iceberg.flink.sink.shuffle.Fixtures.TASK_STATISTICS_SERIALIZER; -import static org.apache.iceberg.flink.sink.shuffle.Fixtures.createStatisticsEvent; -import static org.assertj.core.api.Assertions.assertThat; - -import java.util.Map; -import java.util.concurrent.CompletableFuture; -import java.util.concurrent.ExecutionException; -import org.apache.flink.runtime.jobgraph.OperatorID; -import org.apache.flink.runtime.operators.coordination.EventReceivingTasks; -import org.apache.flink.runtime.operators.coordination.MockOperatorCoordinatorContext; -import org.apache.flink.runtime.operators.coordination.RecreateOnResetOperatorCoordinator; -import org.apache.iceberg.SortKey; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.params.ParameterizedTest; -import org.junit.jupiter.params.provider.EnumSource; - -public class TestDataStatisticsCoordinatorProvider { - private static final OperatorID OPERATOR_ID = new OperatorID(); - - private EventReceivingTasks receivingTasks; - - @BeforeEach - public void before() { - receivingTasks = EventReceivingTasks.createForRunningTasks(); - } - - @ParameterizedTest - @EnumSource(StatisticsType.class) - public void testCheckpointAndReset(StatisticsType type) throws Exception { - DataStatisticsCoordinatorProvider provider = createProvider(type, Fixtures.NUM_SUBTASKS); - try (RecreateOnResetOperatorCoordinator coordinator = - (RecreateOnResetOperatorCoordinator) - provider.create( - new MockOperatorCoordinatorContext(OPERATOR_ID, Fixtures.NUM_SUBTASKS))) { - DataStatisticsCoordinator dataStatisticsCoordinator = - (DataStatisticsCoordinator) coordinator.getInternalCoordinator(); - - // Start the coordinator - coordinator.start(); - TestDataStatisticsCoordinator.setAllTasksReady( - Fixtures.NUM_SUBTASKS, dataStatisticsCoordinator, receivingTasks); - - // Handle events from operators for checkpoint 1 - StatisticsEvent checkpoint1Subtask0StatisticsEvent = - createStatisticsEvent(type, TASK_STATISTICS_SERIALIZER, 1L, CHAR_KEYS.get("a")); - coordinator.handleEventFromOperator(0, 0, checkpoint1Subtask0StatisticsEvent); - TestDataStatisticsCoordinator.waitForCoordinatorToProcessActions(dataStatisticsCoordinator); - - StatisticsEvent checkpoint1Subtask1StatisticsEvent = - createStatisticsEvent(type, TASK_STATISTICS_SERIALIZER, 1L, CHAR_KEYS.get("b")); - coordinator.handleEventFromOperator(1, 0, checkpoint1Subtask1StatisticsEvent); - TestDataStatisticsCoordinator.waitForCoordinatorToProcessActions(dataStatisticsCoordinator); - - // Verify checkpoint 1 global data statistics - Map checkpoint1KeyFrequency = - ImmutableMap.of(CHAR_KEYS.get("a"), 1L, CHAR_KEYS.get("b"), 1L); - MapAssignment checkpoint1MapAssignment = - MapAssignment.fromKeyFrequency( - Fixtures.NUM_SUBTASKS, checkpoint1KeyFrequency, 0.0d, SORT_ORDER_COMPARTOR); - - CompletedStatistics completedStatistics = dataStatisticsCoordinator.completedStatistics(); - assertThat(completedStatistics).isNotNull(); - assertThat(completedStatistics.type()).isEqualTo(StatisticsUtil.collectType(type)); - if (StatisticsUtil.collectType(type) == StatisticsType.Map) { - assertThat(completedStatistics.keyFrequency()).isEqualTo(checkpoint1KeyFrequency); - } else { - 
assertThat(completedStatistics.keySamples()) - .containsExactly(CHAR_KEYS.get("a"), CHAR_KEYS.get("b")); - } - - GlobalStatistics globalStatistics = dataStatisticsCoordinator.globalStatistics(); - assertThat(globalStatistics).isNotNull(); - assertThat(globalStatistics.type()).isEqualTo(StatisticsUtil.collectType(type)); - if (StatisticsUtil.collectType(type) == StatisticsType.Map) { - assertThat(globalStatistics.mapAssignment()).isEqualTo(checkpoint1MapAssignment); - } else { - assertThat(globalStatistics.rangeBounds()).containsExactly(CHAR_KEYS.get("a")); - } - - byte[] checkpoint1Bytes = waitForCheckpoint(1L, dataStatisticsCoordinator); - - StatisticsEvent checkpoint2Subtask0StatisticsEvent = - createStatisticsEvent( - type, TASK_STATISTICS_SERIALIZER, 2L, CHAR_KEYS.get("d"), CHAR_KEYS.get("e")); - coordinator.handleEventFromOperator(0, 0, checkpoint2Subtask0StatisticsEvent); - TestDataStatisticsCoordinator.waitForCoordinatorToProcessActions(dataStatisticsCoordinator); - - StatisticsEvent checkpoint2Subtask1StatisticsEvent = - createStatisticsEvent(type, TASK_STATISTICS_SERIALIZER, 2L, CHAR_KEYS.get("f")); - coordinator.handleEventFromOperator(1, 0, checkpoint2Subtask1StatisticsEvent); - TestDataStatisticsCoordinator.waitForCoordinatorToProcessActions(dataStatisticsCoordinator); - - // Verify checkpoint 2 global data statistics - Map checkpoint2KeyFrequency = - ImmutableMap.of(CHAR_KEYS.get("d"), 1L, CHAR_KEYS.get("e"), 1L, CHAR_KEYS.get("f"), 1L); - MapAssignment checkpoint2MapAssignment = - MapAssignment.fromKeyFrequency( - Fixtures.NUM_SUBTASKS, checkpoint2KeyFrequency, 0.0d, SORT_ORDER_COMPARTOR); - completedStatistics = dataStatisticsCoordinator.completedStatistics(); - assertThat(completedStatistics.type()).isEqualTo(StatisticsUtil.collectType(type)); - if (StatisticsUtil.collectType(type) == StatisticsType.Map) { - assertThat(completedStatistics.keyFrequency()).isEqualTo(checkpoint2KeyFrequency); - } else { - assertThat(completedStatistics.keySamples()) - .containsExactly(CHAR_KEYS.get("d"), CHAR_KEYS.get("e"), CHAR_KEYS.get("f")); - } - - globalStatistics = dataStatisticsCoordinator.globalStatistics(); - assertThat(globalStatistics.type()).isEqualTo(StatisticsUtil.collectType(type)); - if (StatisticsUtil.collectType(type) == StatisticsType.Map) { - assertThat(globalStatistics.mapAssignment()).isEqualTo(checkpoint2MapAssignment); - } else { - assertThat(globalStatistics.rangeBounds()).containsExactly(CHAR_KEYS.get("e")); - } - - waitForCheckpoint(2L, dataStatisticsCoordinator); - - // Reset coordinator to checkpoint 1 - coordinator.resetToCheckpoint(1L, checkpoint1Bytes); - DataStatisticsCoordinator restoredDataStatisticsCoordinator = - (DataStatisticsCoordinator) coordinator.getInternalCoordinator(); - assertThat(dataStatisticsCoordinator).isNotSameAs(restoredDataStatisticsCoordinator); - - completedStatistics = restoredDataStatisticsCoordinator.completedStatistics(); - assertThat(completedStatistics.type()).isEqualTo(StatisticsUtil.collectType(type)); - // Verify restored data statistics - if (StatisticsUtil.collectType(type) == StatisticsType.Map) { - assertThat(completedStatistics.keyFrequency()) - .isEqualTo(ImmutableMap.of(CHAR_KEYS.get("a"), 1L, CHAR_KEYS.get("b"), 1L)); - } else { - assertThat(completedStatistics.keySamples()) - .containsExactly(CHAR_KEYS.get("a"), CHAR_KEYS.get("b")); - } - - globalStatistics = restoredDataStatisticsCoordinator.globalStatistics(); - assertThat(globalStatistics).isNotNull(); - 
assertThat(globalStatistics.type()).isEqualTo(StatisticsUtil.collectType(type)); - if (StatisticsUtil.collectType(type) == StatisticsType.Map) { - assertThat(globalStatistics.mapAssignment()).isEqualTo(checkpoint1MapAssignment); - } else { - assertThat(globalStatistics.rangeBounds()).containsExactly(CHAR_KEYS.get("a")); - } - } - } - - private byte[] waitForCheckpoint(long checkpointId, DataStatisticsCoordinator coordinator) - throws InterruptedException, ExecutionException { - CompletableFuture future = new CompletableFuture<>(); - coordinator.checkpointCoordinator(checkpointId, future); - return future.get(); - } - - private static DataStatisticsCoordinatorProvider createProvider( - StatisticsType type, int downstreamParallelism) { - return new DataStatisticsCoordinatorProvider( - "DataStatisticsCoordinatorProvider", - OPERATOR_ID, - Fixtures.SCHEMA, - Fixtures.SORT_ORDER, - downstreamParallelism, - type, - 0.0); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsOperator.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsOperator.java deleted file mode 100644 index c760f1ba96d3..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsOperator.java +++ /dev/null @@ -1,350 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink.shuffle; - -import static org.apache.iceberg.flink.sink.shuffle.Fixtures.CHAR_KEYS; -import static org.apache.iceberg.flink.sink.shuffle.Fixtures.SORT_ORDER_COMPARTOR; -import static org.assertj.core.api.Assertions.assertThat; -import static org.mockito.ArgumentMatchers.any; -import static org.mockito.Mockito.verify; - -import java.util.Collections; -import java.util.List; -import java.util.Map; -import java.util.stream.Collectors; -import java.util.stream.Stream; -import org.apache.datasketches.sampling.ReservoirItemsSketch; -import org.apache.flink.api.common.ExecutionConfig; -import org.apache.flink.api.common.state.OperatorStateStore; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.core.fs.CloseableRegistry; -import org.apache.flink.runtime.checkpoint.OperatorSubtaskState; -import org.apache.flink.runtime.execution.Environment; -import org.apache.flink.runtime.operators.coordination.MockOperatorEventGateway; -import org.apache.flink.runtime.operators.testutils.MockInputSplitProvider; -import org.apache.flink.runtime.state.AbstractStateBackend; -import org.apache.flink.runtime.state.StateInitializationContext; -import org.apache.flink.runtime.state.StateInitializationContextImpl; -import org.apache.flink.runtime.state.TestTaskStateManager; -import org.apache.flink.runtime.state.hashmap.HashMapStateBackend; -import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; -import org.apache.flink.streaming.runtime.tasks.OneInputStreamTask; -import org.apache.flink.streaming.runtime.tasks.StreamMockEnvironment; -import org.apache.flink.streaming.util.MockOutput; -import org.apache.flink.streaming.util.MockStreamConfig; -import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.iceberg.SortKey; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.params.ParameterizedTest; -import org.junit.jupiter.params.provider.Arguments; -import org.junit.jupiter.params.provider.EnumSource; -import org.junit.jupiter.params.provider.MethodSource; -import org.mockito.Mockito; - -public class TestDataStatisticsOperator { - - private Environment env; - - @BeforeEach - public void before() throws Exception { - this.env = - new StreamMockEnvironment( - new Configuration(), - new Configuration(), - new ExecutionConfig(), - 1L, - new MockInputSplitProvider(), - 1, - new TestTaskStateManager()); - } - - private DataStatisticsOperator createOperator(StatisticsType type, int downstreamParallelism) - throws Exception { - MockOperatorEventGateway mockGateway = new MockOperatorEventGateway(); - return createOperator(type, downstreamParallelism, mockGateway); - } - - private DataStatisticsOperator createOperator( - StatisticsType type, int downstreamParallelism, MockOperatorEventGateway mockGateway) - throws Exception { - DataStatisticsOperator operator = - new DataStatisticsOperator( - "testOperator", - Fixtures.SCHEMA, - Fixtures.SORT_ORDER, - mockGateway, - downstreamParallelism, - type); - operator.setup( - new OneInputStreamTask(env), - new MockStreamConfig(new Configuration(), 1), - new 
MockOutput<>(Lists.newArrayList())); - return operator; - } - - @SuppressWarnings("unchecked") - @ParameterizedTest - @EnumSource(StatisticsType.class) - public void testProcessElement(StatisticsType type) throws Exception { - DataStatisticsOperator operator = createOperator(type, Fixtures.NUM_SUBTASKS); - try (OneInputStreamOperatorTestHarness testHarness = - createHarness(operator)) { - StateInitializationContext stateContext = getStateContext(); - operator.initializeState(stateContext); - operator.processElement(new StreamRecord<>(GenericRowData.of(StringData.fromString("a"), 5))); - operator.processElement(new StreamRecord<>(GenericRowData.of(StringData.fromString("a"), 3))); - operator.processElement(new StreamRecord<>(GenericRowData.of(StringData.fromString("b"), 1))); - - DataStatistics localStatistics = operator.localStatistics(); - assertThat(localStatistics.type()).isEqualTo(StatisticsUtil.collectType(type)); - if (StatisticsUtil.collectType(type) == StatisticsType.Map) { - Map keyFrequency = (Map) localStatistics.result(); - assertThat(keyFrequency) - .isEqualTo(ImmutableMap.of(CHAR_KEYS.get("a"), 2L, CHAR_KEYS.get("b"), 1L)); - } else { - ReservoirItemsSketch sketch = - (ReservoirItemsSketch) localStatistics.result(); - assertThat(sketch.getSamples()) - .containsExactly(CHAR_KEYS.get("a"), CHAR_KEYS.get("a"), CHAR_KEYS.get("b")); - } - - testHarness.endInput(); - } - } - - @ParameterizedTest - @EnumSource(StatisticsType.class) - public void testOperatorOutput(StatisticsType type) throws Exception { - DataStatisticsOperator operator = createOperator(type, Fixtures.NUM_SUBTASKS); - try (OneInputStreamOperatorTestHarness testHarness = - createHarness(operator)) { - testHarness.processElement( - new StreamRecord<>(GenericRowData.of(StringData.fromString("a"), 2))); - testHarness.processElement( - new StreamRecord<>(GenericRowData.of(StringData.fromString("b"), 3))); - testHarness.processElement( - new StreamRecord<>(GenericRowData.of(StringData.fromString("b"), 1))); - - List recordsOutput = - testHarness.extractOutputValues().stream() - .filter(StatisticsOrRecord::hasRecord) - .map(StatisticsOrRecord::record) - .collect(Collectors.toList()); - assertThat(recordsOutput) - .containsExactlyInAnyOrderElementsOf( - ImmutableList.of( - GenericRowData.of(StringData.fromString("a"), 2), - GenericRowData.of(StringData.fromString("b"), 3), - GenericRowData.of(StringData.fromString("b"), 1))); - } - } - - private static Stream provideRestoreStateParameters() { - return Stream.of( - Arguments.of(StatisticsType.Map, -1), - Arguments.of(StatisticsType.Map, 0), - Arguments.of(StatisticsType.Map, 1), - Arguments.of(StatisticsType.Sketch, -1), - Arguments.of(StatisticsType.Sketch, 0), - Arguments.of(StatisticsType.Sketch, 1)); - } - - @ParameterizedTest - @MethodSource("provideRestoreStateParameters") - public void testRestoreState(StatisticsType type, int parallelismAdjustment) throws Exception { - Map keyFrequency = - ImmutableMap.of(CHAR_KEYS.get("a"), 2L, CHAR_KEYS.get("b"), 1L, CHAR_KEYS.get("c"), 1L); - SortKey[] rangeBounds = new SortKey[] {CHAR_KEYS.get("a")}; - MapAssignment mapAssignment = - MapAssignment.fromKeyFrequency(2, keyFrequency, 0.0d, SORT_ORDER_COMPARTOR); - DataStatisticsOperator operator = createOperator(type, Fixtures.NUM_SUBTASKS); - OperatorSubtaskState snapshot; - try (OneInputStreamOperatorTestHarness testHarness1 = - createHarness(operator)) { - GlobalStatistics statistics; - if (StatisticsUtil.collectType(type) == StatisticsType.Map) { - statistics = 
GlobalStatistics.fromMapAssignment(1L, mapAssignment); - } else { - statistics = GlobalStatistics.fromRangeBounds(1L, rangeBounds); - } - - StatisticsEvent event = - StatisticsEvent.createGlobalStatisticsEvent( - statistics, Fixtures.GLOBAL_STATISTICS_SERIALIZER, false); - operator.handleOperatorEvent(event); - - GlobalStatistics globalStatistics = operator.globalStatistics(); - assertThat(globalStatistics.type()).isEqualTo(StatisticsUtil.collectType(type)); - if (StatisticsUtil.collectType(type) == StatisticsType.Map) { - assertThat(globalStatistics.mapAssignment()).isEqualTo(mapAssignment); - assertThat(globalStatistics.rangeBounds()).isNull(); - } else { - assertThat(globalStatistics.mapAssignment()).isNull(); - assertThat(globalStatistics.rangeBounds()).isEqualTo(rangeBounds); - } - - snapshot = testHarness1.snapshot(1L, 0); - } - - // Use the snapshot to initialize state for another new operator and then verify that the global - // statistics for the new operator is same as before - MockOperatorEventGateway spyGateway = Mockito.spy(new MockOperatorEventGateway()); - DataStatisticsOperator restoredOperator = - createOperator(type, Fixtures.NUM_SUBTASKS + parallelismAdjustment, spyGateway); - try (OneInputStreamOperatorTestHarness testHarness2 = - new OneInputStreamOperatorTestHarness<>(restoredOperator, 2, 2, 1)) { - testHarness2.setup(); - testHarness2.initializeState(snapshot); - - GlobalStatistics globalStatistics = restoredOperator.globalStatistics(); - // global statistics is always restored and used initially even if - // downstream parallelism changed. - assertThat(globalStatistics).isNotNull(); - // request is always sent to coordinator during initialization. - // coordinator would respond with a new global statistics that - // has range bound recomputed with new parallelism. 
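// Illustrative sketch (added for clarity, not part of the deleted file): the recomputation hinted
// at in the comment above can be pictured with SketchUtil.rangeBounds, exercised in TestSketchUtil
// further below. Given the coordinator's sampled keys and the new downstream parallelism, it
// selects numPartitions - 1 bounds from the samples (an empty array for a single subtask).
// The four-key sample here is hypothetical.
SortKey[] sampledKeys =
    new SortKey[] {
      CHAR_KEYS.get("a"), CHAR_KEYS.get("b"), CHAR_KEYS.get("c"), CHAR_KEYS.get("d")
    };
// with two downstream subtasks, a single bound splits the sampled key space into two ranges
SortKey[] recomputedBounds = SketchUtil.rangeBounds(2, SORT_ORDER_COMPARTOR, sampledKeys);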
- verify(spyGateway).sendEventToCoordinator(any(RequestGlobalStatisticsEvent.class)); - assertThat(globalStatistics.type()).isEqualTo(StatisticsUtil.collectType(type)); - if (StatisticsUtil.collectType(type) == StatisticsType.Map) { - assertThat(globalStatistics.mapAssignment()).isEqualTo(mapAssignment); - assertThat(globalStatistics.rangeBounds()).isNull(); - } else { - assertThat(globalStatistics.mapAssignment()).isNull(); - assertThat(globalStatistics.rangeBounds()).isEqualTo(rangeBounds); - } - } - } - - @SuppressWarnings("unchecked") - @Test - public void testMigrationWithLocalStatsOverThreshold() throws Exception { - DataStatisticsOperator operator = createOperator(StatisticsType.Auto, Fixtures.NUM_SUBTASKS); - try (OneInputStreamOperatorTestHarness testHarness = - createHarness(operator)) { - StateInitializationContext stateContext = getStateContext(); - operator.initializeState(stateContext); - - // add rows with unique keys - for (int i = 0; i < SketchUtil.OPERATOR_SKETCH_SWITCH_THRESHOLD; ++i) { - operator.processElement( - new StreamRecord<>(GenericRowData.of(StringData.fromString(String.valueOf(i)), i))); - assertThat(operator.localStatistics().type()).isEqualTo(StatisticsType.Map); - assertThat((Map) operator.localStatistics().result()).hasSize(i + 1); - } - - // one more item should trigger the migration to sketch stats - operator.processElement( - new StreamRecord<>(GenericRowData.of(StringData.fromString("key-trigger-migration"), 1))); - - int reservoirSize = - SketchUtil.determineOperatorReservoirSize(Fixtures.NUM_SUBTASKS, Fixtures.NUM_SUBTASKS); - - assertThat(operator.localStatistics().type()).isEqualTo(StatisticsType.Sketch); - ReservoirItemsSketch sketch = - (ReservoirItemsSketch) operator.localStatistics().result(); - assertThat(sketch.getK()).isEqualTo(reservoirSize); - assertThat(sketch.getN()).isEqualTo(SketchUtil.OPERATOR_SKETCH_SWITCH_THRESHOLD + 1); - // reservoir not full yet - assertThat(sketch.getN()).isLessThan(reservoirSize); - assertThat(sketch.getSamples()).hasSize((int) sketch.getN()); - - // add more items to saturate the reservoir - for (int i = 0; i < reservoirSize; ++i) { - operator.processElement( - new StreamRecord<>(GenericRowData.of(StringData.fromString(String.valueOf(i)), i))); - } - - assertThat(operator.localStatistics().type()).isEqualTo(StatisticsType.Sketch); - sketch = (ReservoirItemsSketch) operator.localStatistics().result(); - assertThat(sketch.getK()).isEqualTo(reservoirSize); - assertThat(sketch.getN()) - .isEqualTo(SketchUtil.OPERATOR_SKETCH_SWITCH_THRESHOLD + 1 + reservoirSize); - // reservoir is full now - assertThat(sketch.getN()).isGreaterThan(reservoirSize); - assertThat(sketch.getSamples()).hasSize(reservoirSize); - - testHarness.endInput(); - } - } - - @SuppressWarnings("unchecked") - @Test - public void testMigrationWithGlobalSketchStatistics() throws Exception { - DataStatisticsOperator operator = createOperator(StatisticsType.Auto, Fixtures.NUM_SUBTASKS); - try (OneInputStreamOperatorTestHarness testHarness = - createHarness(operator)) { - StateInitializationContext stateContext = getStateContext(); - operator.initializeState(stateContext); - - // started with Map stype - operator.processElement(new StreamRecord<>(GenericRowData.of(StringData.fromString("a"), 1))); - assertThat(operator.localStatistics().type()).isEqualTo(StatisticsType.Map); - assertThat((Map) operator.localStatistics().result()) - .isEqualTo(ImmutableMap.of(CHAR_KEYS.get("a"), 1L)); - - // received global statistics with sketch type - GlobalStatistics 
globalStatistics = - GlobalStatistics.fromRangeBounds( - 1L, new SortKey[] {CHAR_KEYS.get("c"), CHAR_KEYS.get("f")}); - operator.handleOperatorEvent( - StatisticsEvent.createGlobalStatisticsEvent( - globalStatistics, Fixtures.GLOBAL_STATISTICS_SERIALIZER, false)); - - int reservoirSize = - SketchUtil.determineOperatorReservoirSize(Fixtures.NUM_SUBTASKS, Fixtures.NUM_SUBTASKS); - - assertThat(operator.localStatistics().type()).isEqualTo(StatisticsType.Sketch); - ReservoirItemsSketch sketch = - (ReservoirItemsSketch) operator.localStatistics().result(); - assertThat(sketch.getK()).isEqualTo(reservoirSize); - assertThat(sketch.getN()).isEqualTo(1); - assertThat(sketch.getSamples()).isEqualTo(new SortKey[] {CHAR_KEYS.get("a")}); - - testHarness.endInput(); - } - } - - private StateInitializationContext getStateContext() throws Exception { - AbstractStateBackend abstractStateBackend = new HashMapStateBackend(); - CloseableRegistry cancelStreamRegistry = new CloseableRegistry(); - OperatorStateStore operatorStateStore = - abstractStateBackend.createOperatorStateBackend( - env, "test-operator", Collections.emptyList(), cancelStreamRegistry); - return new StateInitializationContextImpl(null, operatorStateStore, null, null, null); - } - - private OneInputStreamOperatorTestHarness createHarness( - DataStatisticsOperator dataStatisticsOperator) throws Exception { - OneInputStreamOperatorTestHarness harness = - new OneInputStreamOperatorTestHarness<>( - dataStatisticsOperator, Fixtures.NUM_SUBTASKS, Fixtures.NUM_SUBTASKS, 0); - harness.setup( - new StatisticsOrRecordSerializer( - Fixtures.GLOBAL_STATISTICS_SERIALIZER, Fixtures.ROW_SERIALIZER)); - harness.open(); - return harness; - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsSerializer.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsSerializer.java deleted file mode 100644 index 59ce6df05d9d..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestDataStatisticsSerializer.java +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink.shuffle; - -import static org.apache.iceberg.flink.sink.shuffle.Fixtures.CHAR_KEYS; - -import org.apache.flink.api.common.typeutils.SerializerTestBase; -import org.apache.flink.api.common.typeutils.TypeSerializer; - -public class TestDataStatisticsSerializer extends SerializerTestBase { - @Override - protected TypeSerializer createSerializer() { - return Fixtures.TASK_STATISTICS_SERIALIZER; - } - - @Override - protected int getLength() { - return -1; - } - - @Override - protected Class getTypeClass() { - return DataStatistics.class; - } - - @Override - protected DataStatistics[] getTestData() { - return new DataStatistics[] { - new MapDataStatistics(), - Fixtures.createTaskStatistics( - StatisticsType.Map, CHAR_KEYS.get("a"), CHAR_KEYS.get("a"), CHAR_KEYS.get("b")), - new SketchDataStatistics(128), - Fixtures.createTaskStatistics( - StatisticsType.Sketch, CHAR_KEYS.get("a"), CHAR_KEYS.get("a"), CHAR_KEYS.get("b")) - }; - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestGlobalStatisticsSerializer.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestGlobalStatisticsSerializer.java deleted file mode 100644 index 7afaf239c668..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestGlobalStatisticsSerializer.java +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink.shuffle; - -import static org.apache.iceberg.flink.sink.shuffle.Fixtures.CHAR_KEYS; -import static org.apache.iceberg.flink.sink.shuffle.Fixtures.SORT_ORDER_COMPARTOR; - -import org.apache.flink.api.common.typeutils.SerializerTestBase; -import org.apache.flink.api.common.typeutils.TypeSerializer; -import org.apache.iceberg.SortKey; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; - -public class TestGlobalStatisticsSerializer extends SerializerTestBase { - - @Override - protected TypeSerializer createSerializer() { - return Fixtures.GLOBAL_STATISTICS_SERIALIZER; - } - - @Override - protected int getLength() { - return -1; - } - - @Override - protected Class getTypeClass() { - return GlobalStatistics.class; - } - - @Override - protected GlobalStatistics[] getTestData() { - return new GlobalStatistics[] { - GlobalStatistics.fromMapAssignment( - 1L, - MapAssignment.fromKeyFrequency( - Fixtures.NUM_SUBTASKS, - ImmutableMap.of(CHAR_KEYS.get("a"), 1L, CHAR_KEYS.get("b"), 2L), - 0.0d, - SORT_ORDER_COMPARTOR)), - GlobalStatistics.fromRangeBounds(2L, new SortKey[] {CHAR_KEYS.get("a"), CHAR_KEYS.get("b")}) - }; - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestMapDataStatistics.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestMapDataStatistics.java deleted file mode 100644 index 8a25c7ad9898..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestMapDataStatistics.java +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink.shuffle; - -import static org.apache.iceberg.flink.sink.shuffle.Fixtures.CHAR_KEYS; -import static org.apache.iceberg.flink.sink.shuffle.Fixtures.ROW_WRAPPER; -import static org.assertj.core.api.Assertions.assertThat; - -import java.util.Map; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.StringData; -import org.apache.iceberg.SortKey; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.junit.jupiter.api.Test; - -public class TestMapDataStatistics { - @SuppressWarnings("unchecked") - @Test - public void testAddsAndGet() { - MapDataStatistics dataStatistics = new MapDataStatistics(); - - GenericRowData reusedRow = GenericRowData.of(StringData.fromString("a"), 1); - Fixtures.SORT_KEY.wrap(ROW_WRAPPER.wrap(reusedRow)); - dataStatistics.add(Fixtures.SORT_KEY); - - reusedRow.setField(0, StringData.fromString("b")); - Fixtures.SORT_KEY.wrap(ROW_WRAPPER.wrap(reusedRow)); - dataStatistics.add(Fixtures.SORT_KEY); - - reusedRow.setField(0, StringData.fromString("c")); - Fixtures.SORT_KEY.wrap(ROW_WRAPPER.wrap(reusedRow)); - dataStatistics.add(Fixtures.SORT_KEY); - - reusedRow.setField(0, StringData.fromString("b")); - Fixtures.SORT_KEY.wrap(ROW_WRAPPER.wrap(reusedRow)); - dataStatistics.add(Fixtures.SORT_KEY); - - reusedRow.setField(0, StringData.fromString("a")); - Fixtures.SORT_KEY.wrap(ROW_WRAPPER.wrap(reusedRow)); - dataStatistics.add(Fixtures.SORT_KEY); - - reusedRow.setField(0, StringData.fromString("b")); - Fixtures.SORT_KEY.wrap(ROW_WRAPPER.wrap(reusedRow)); - dataStatistics.add(Fixtures.SORT_KEY); - - Map actual = (Map) dataStatistics.result(); - Map expected = - ImmutableMap.of(CHAR_KEYS.get("a"), 2L, CHAR_KEYS.get("b"), 3L, CHAR_KEYS.get("c"), 1L); - assertThat(actual).isEqualTo(expected); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestMapRangePartitioner.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestMapRangePartitioner.java deleted file mode 100644 index d5a0bebc74e7..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestMapRangePartitioner.java +++ /dev/null @@ -1,434 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink.shuffle; - -import static org.apache.iceberg.flink.sink.shuffle.Fixtures.SORT_ORDER_COMPARTOR; -import static org.assertj.core.api.Assertions.assertThat; - -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.concurrent.atomic.AtomicLong; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.SortKey; -import org.apache.iceberg.SortOrder; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.RowDataWrapper; -import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.apache.iceberg.util.Pair; -import org.junit.jupiter.api.Test; - -public class TestMapRangePartitioner { - private static final SortOrder SORT_ORDER = - SortOrder.builderFor(TestFixtures.SCHEMA).asc("data").build(); - - private static final SortKey SORT_KEY = new SortKey(TestFixtures.SCHEMA, SORT_ORDER); - private static final RowType ROW_TYPE = FlinkSchemaUtil.convert(TestFixtures.SCHEMA); - private static final SortKey[] SORT_KEYS = initSortKeys(); - - private static SortKey[] initSortKeys() { - SortKey[] sortKeys = new SortKey[10]; - for (int i = 0; i < 10; ++i) { - RowData rowData = - GenericRowData.of(StringData.fromString("k" + i), i, StringData.fromString("2023-06-20")); - RowDataWrapper keyWrapper = new RowDataWrapper(ROW_TYPE, TestFixtures.SCHEMA.asStruct()); - keyWrapper.wrap(rowData); - SortKey sortKey = SORT_KEY.copy(); - sortKey.wrap(keyWrapper); - sortKeys[i] = sortKey; - } - return sortKeys; - } - - // Total weight is 800 - private final Map mapStatistics = - ImmutableMap.of( - SORT_KEYS[0], - 350L, - SORT_KEYS[1], - 230L, - SORT_KEYS[2], - 120L, - SORT_KEYS[3], - 40L, - SORT_KEYS[4], - 10L, - SORT_KEYS[5], - 10L, - SORT_KEYS[6], - 10L, - SORT_KEYS[7], - 10L, - SORT_KEYS[8], - 10L, - SORT_KEYS[9], - 10L); - - @Test - public void testEvenlyDividableNoClosingFileCost() { - int numPartitions = 8; - MapAssignment mapAssignment = - MapAssignment.fromKeyFrequency(numPartitions, mapStatistics, 0.0, SORT_ORDER_COMPARTOR); - - // each task should get targeted weight of 100 (=800/8) - Map expectedAssignment = - ImmutableMap.of( - SORT_KEYS[0], - new KeyAssignment( - ImmutableList.of(0, 1, 2, 3), ImmutableList.of(100L, 100L, 100L, 50L), 0L), - SORT_KEYS[1], - new KeyAssignment(ImmutableList.of(3, 4, 5), ImmutableList.of(50L, 100L, 80L), 0L), - SORT_KEYS[2], - new KeyAssignment(ImmutableList.of(5, 6), ImmutableList.of(20L, 100L), 0L), - SORT_KEYS[3], - new KeyAssignment(ImmutableList.of(7), ImmutableList.of(40L), 0L), - SORT_KEYS[4], - new KeyAssignment(ImmutableList.of(7), ImmutableList.of(10L), 0L), - SORT_KEYS[5], - new KeyAssignment(ImmutableList.of(7), ImmutableList.of(10L), 0L), - SORT_KEYS[6], - new KeyAssignment(ImmutableList.of(7), ImmutableList.of(10L), 0L), - SORT_KEYS[7], - new KeyAssignment(ImmutableList.of(7), ImmutableList.of(10L), 0L), - SORT_KEYS[8], - new KeyAssignment(ImmutableList.of(7), ImmutableList.of(10L), 0L), - SORT_KEYS[9], - new KeyAssignment(ImmutableList.of(7), 
ImmutableList.of(10L), 0L)); - assertThat(mapAssignment).isEqualTo(new MapAssignment(numPartitions, expectedAssignment)); - - // key: subtask id - // value pair: first is the assigned weight, second is the number of assigned keys - Map> expectedAssignmentInfo = - ImmutableMap.of( - 0, - Pair.of(100L, 1), - 1, - Pair.of(100L, 1), - 2, - Pair.of(100L, 1), - 3, - Pair.of(100L, 2), - 4, - Pair.of(100L, 1), - 5, - Pair.of(100L, 2), - 6, - Pair.of(100L, 1), - 7, - Pair.of(100L, 7)); - assertThat(mapAssignment.assignmentInfo()).isEqualTo(expectedAssignmentInfo); - - MapRangePartitioner partitioner = - new MapRangePartitioner(TestFixtures.SCHEMA, SORT_ORDER, mapAssignment); - Map>> partitionResults = - runPartitioner(partitioner, numPartitions, mapStatistics); - validatePartitionResults(expectedAssignmentInfo, partitionResults, 5.0); - } - - @Test - public void testEvenlyDividableWithClosingFileCost() { - int numPartitions = 8; - MapAssignment mapAssignment = - MapAssignment.fromKeyFrequency(numPartitions, mapStatistics, 5.0, SORT_ORDER_COMPARTOR); - - // target subtask weight is 100 before close file cost factored in. - // close file cost is 5 = 5% * 100. - // key weights before and after close file cost factored in - // before: 350, 230, 120, 40, 10, 10, 10, 10, 10, 10 - // close-cost: 20, 15, 10, 5, 5, 5, 5, 5, 5, 5 - // after: 370, 245, 130, 45, 15, 15, 15, 15, 15, 15 - // target subtask weight with close cost per subtask is 110 (880/8) - Map expectedAssignment = - ImmutableMap.of( - SORT_KEYS[0], - new KeyAssignment( - ImmutableList.of(0, 1, 2, 3), ImmutableList.of(110L, 110L, 110L, 40L), 5L), - SORT_KEYS[1], - new KeyAssignment(ImmutableList.of(3, 4, 5), ImmutableList.of(70L, 110L, 65L), 5L), - SORT_KEYS[2], - new KeyAssignment(ImmutableList.of(5, 6), ImmutableList.of(45L, 85L), 5L), - SORT_KEYS[3], - new KeyAssignment(ImmutableList.of(6, 7), ImmutableList.of(25L, 20L), 5L), - SORT_KEYS[4], - new KeyAssignment(ImmutableList.of(7), ImmutableList.of(15L), 5L), - SORT_KEYS[5], - new KeyAssignment(ImmutableList.of(7), ImmutableList.of(15L), 5L), - SORT_KEYS[6], - new KeyAssignment(ImmutableList.of(7), ImmutableList.of(15L), 5L), - SORT_KEYS[7], - new KeyAssignment(ImmutableList.of(7), ImmutableList.of(15L), 5L), - SORT_KEYS[8], - new KeyAssignment(ImmutableList.of(7), ImmutableList.of(15L), 5L), - SORT_KEYS[9], - new KeyAssignment(ImmutableList.of(7), ImmutableList.of(15L), 5L)); - assertThat(mapAssignment.keyAssignments()).isEqualTo(expectedAssignment); - - // key: subtask id - // value pair: first is the assigned weight (excluding close file cost) for the subtask, - // second is the number of keys assigned to the subtask - Map> expectedAssignmentInfo = - ImmutableMap.of( - 0, - Pair.of(105L, 1), - 1, - Pair.of(105L, 1), - 2, - Pair.of(105L, 1), - 3, - Pair.of(100L, 2), - 4, - Pair.of(105L, 1), - 5, - Pair.of(100L, 2), - 6, - Pair.of(100L, 2), - 7, - Pair.of(75L, 7)); - assertThat(mapAssignment.assignmentInfo()).isEqualTo(expectedAssignmentInfo); - - MapRangePartitioner partitioner = - new MapRangePartitioner(TestFixtures.SCHEMA, SORT_ORDER, mapAssignment); - Map>> partitionResults = - runPartitioner(partitioner, numPartitions, mapStatistics); - validatePartitionResults(expectedAssignmentInfo, partitionResults, 5.0); - } - - @Test - public void testNonDividableNoClosingFileCost() { - int numPartitions = 9; - MapAssignment mapAssignment = - MapAssignment.fromKeyFrequency(numPartitions, mapStatistics, 0.0, SORT_ORDER_COMPARTOR); - - // before: 350, 230, 120, 40, 10, 10, 10, 10, 10, 10 - // each 
task should get targeted weight of 89 = ceiling(800/9) - Map expectedAssignment = - ImmutableMap.of( - SORT_KEYS[0], - new KeyAssignment( - ImmutableList.of(0, 1, 2, 3), ImmutableList.of(89L, 89L, 89L, 83L), 0L), - SORT_KEYS[1], - new KeyAssignment( - ImmutableList.of(3, 4, 5, 6), ImmutableList.of(6L, 89L, 89L, 46L), 0L), - SORT_KEYS[2], - new KeyAssignment(ImmutableList.of(6, 7), ImmutableList.of(43L, 77L), 0L), - SORT_KEYS[3], - new KeyAssignment(ImmutableList.of(7, 8), ImmutableList.of(12L, 28L), 0L), - SORT_KEYS[4], - new KeyAssignment(ImmutableList.of(8), ImmutableList.of(10L), 0L), - SORT_KEYS[5], - new KeyAssignment(ImmutableList.of(8), ImmutableList.of(10L), 0L), - SORT_KEYS[6], - new KeyAssignment(ImmutableList.of(8), ImmutableList.of(10L), 0L), - SORT_KEYS[7], - new KeyAssignment(ImmutableList.of(8), ImmutableList.of(10L), 0L), - SORT_KEYS[8], - new KeyAssignment(ImmutableList.of(8), ImmutableList.of(10L), 0L), - SORT_KEYS[9], - new KeyAssignment(ImmutableList.of(8), ImmutableList.of(10L), 0L)); - assertThat(mapAssignment.keyAssignments()).isEqualTo(expectedAssignment); - - // key: subtask id - // value pair: first is the assigned weight, second is the number of assigned keys - Map> expectedAssignmentInfo = - ImmutableMap.of( - 0, - Pair.of(89L, 1), - 1, - Pair.of(89L, 1), - 2, - Pair.of(89L, 1), - 3, - Pair.of(89L, 2), - 4, - Pair.of(89L, 1), - 5, - Pair.of(89L, 1), - 6, - Pair.of(89L, 2), - 7, - Pair.of(89L, 2), - 8, - Pair.of(88L, 7)); - assertThat(mapAssignment.assignmentInfo()).isEqualTo(expectedAssignmentInfo); - - MapRangePartitioner partitioner = - new MapRangePartitioner(TestFixtures.SCHEMA, SORT_ORDER, mapAssignment); - Map>> partitionResults = - runPartitioner(partitioner, numPartitions, mapStatistics); - validatePartitionResults(expectedAssignmentInfo, partitionResults, 5.0); - } - - @Test - public void testNonDividableWithClosingFileCost() { - int numPartitions = 9; - MapAssignment mapAssignment = - MapAssignment.fromKeyFrequency(numPartitions, mapStatistics, 5.0, SORT_ORDER_COMPARTOR); - - // target subtask weight is 89 before close file cost factored in. - // close file cost is 5 (= 5% * 89) per file. 
- // key weights before and after close file cost factored in - // before: 350, 230, 120, 40, 10, 10, 10, 10, 10, 10 - // close-cost: 20, 15, 10, 5, 5, 5, 5, 5, 5, 5 - // after: 370, 245, 130, 45, 15, 15, 15, 15, 15, 15 - // target subtask weight per subtask is 98 ceiling(880/9) - Map expectedAssignment = - ImmutableMap.of( - SORT_KEYS[0], - new KeyAssignment( - ImmutableList.of(0, 1, 2, 3), ImmutableList.of(98L, 98L, 98L, 76L), 5L), - SORT_KEYS[1], - new KeyAssignment( - ImmutableList.of(3, 4, 5, 6), ImmutableList.of(22L, 98L, 98L, 27L), 5L), - SORT_KEYS[2], - new KeyAssignment(ImmutableList.of(6, 7), ImmutableList.of(71L, 59L), 5L), - SORT_KEYS[3], - new KeyAssignment(ImmutableList.of(7, 8), ImmutableList.of(39L, 6L), 5L), - SORT_KEYS[4], - new KeyAssignment(ImmutableList.of(8), ImmutableList.of(15L), 5L), - SORT_KEYS[5], - new KeyAssignment(ImmutableList.of(8), ImmutableList.of(15L), 5L), - SORT_KEYS[6], - new KeyAssignment(ImmutableList.of(8), ImmutableList.of(15L), 5L), - SORT_KEYS[7], - new KeyAssignment(ImmutableList.of(8), ImmutableList.of(15L), 5L), - SORT_KEYS[8], - new KeyAssignment(ImmutableList.of(8), ImmutableList.of(15L), 5L), - SORT_KEYS[9], - new KeyAssignment(ImmutableList.of(8), ImmutableList.of(15L), 5L)); - assertThat(mapAssignment.keyAssignments()).isEqualTo(expectedAssignment); - - // key: subtask id - // value pair: first is the assigned weight for the subtask, second is the number of keys - // assigned to the subtask - Map> expectedAssignmentInfo = - ImmutableMap.of( - 0, - Pair.of(93L, 1), - 1, - Pair.of(93L, 1), - 2, - Pair.of(93L, 1), - 3, - Pair.of(88L, 2), - 4, - Pair.of(93L, 1), - 5, - Pair.of(93L, 1), - 6, - Pair.of(88L, 2), - 7, - Pair.of(88L, 2), - 8, - Pair.of(61L, 7)); - assertThat(mapAssignment.assignmentInfo()).isEqualTo(expectedAssignmentInfo); - - MapRangePartitioner partitioner = - new MapRangePartitioner(TestFixtures.SCHEMA, SORT_ORDER, mapAssignment); - Map>> partitionResults = - runPartitioner(partitioner, numPartitions, mapStatistics); - // drift threshold is high for non-dividable scenario with close cost - validatePartitionResults(expectedAssignmentInfo, partitionResults, 10.0); - } - - private static Map>> runPartitioner( - MapRangePartitioner partitioner, int numPartitions, Map mapStatistics) { - // The Map key is the subtaskId. - // For the map value pair, the first element is the count of assigned and - // the second element of Set is for the set of assigned keys. 
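// NOTE (editor addition): the generic parameters of the declaration below are not visible in this
// rendering of the patch. Judging from the usage (an AtomicLong record counter plus a Set of the
// RowData routed to each subtask), the intended shape is presumably:
//   Map<Integer, Pair<AtomicLong, Set<RowData>>> partitionResults = Maps.newHashMap();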
- Map>> partitionResults = Maps.newHashMap(); - mapStatistics.forEach( - (sortKey, weight) -> { - String key = sortKey.get(0, String.class); - // run 100x times of the weight - long iterations = weight * 100; - for (int i = 0; i < iterations; ++i) { - RowData rowData = - GenericRowData.of( - StringData.fromString(key), 1, StringData.fromString("2023-06-20")); - int subtaskId = partitioner.partition(rowData, numPartitions); - partitionResults.computeIfAbsent( - subtaskId, k -> Pair.of(new AtomicLong(0), Sets.newHashSet())); - Pair> pair = partitionResults.get(subtaskId); - pair.first().incrementAndGet(); - pair.second().add(rowData); - } - }); - return partitionResults; - } - - /** @param expectedAssignmentInfo excluding closing cost */ - private void validatePartitionResults( - Map> expectedAssignmentInfo, - Map>> partitionResults, - double maxDriftPercentage) { - - assertThat(partitionResults.size()).isEqualTo(expectedAssignmentInfo.size()); - - List expectedAssignedKeyCounts = - Lists.newArrayListWithExpectedSize(expectedAssignmentInfo.size()); - List actualAssignedKeyCounts = - Lists.newArrayListWithExpectedSize(partitionResults.size()); - List expectedNormalizedWeights = - Lists.newArrayListWithExpectedSize(expectedAssignmentInfo.size()); - List actualNormalizedWeights = - Lists.newArrayListWithExpectedSize(partitionResults.size()); - - long expectedTotalWeight = - expectedAssignmentInfo.values().stream().mapToLong(Pair::first).sum(); - expectedAssignmentInfo.forEach( - (subtaskId, pair) -> { - expectedAssignedKeyCounts.add(pair.second()); - expectedNormalizedWeights.add(pair.first().doubleValue() / expectedTotalWeight); - }); - - long actualTotalWeight = - partitionResults.values().stream().mapToLong(pair -> pair.first().longValue()).sum(); - partitionResults.forEach( - (subtaskId, pair) -> { - actualAssignedKeyCounts.add(pair.second().size()); - actualNormalizedWeights.add(pair.first().doubleValue() / actualTotalWeight); - }); - - // number of assigned keys should match exactly - assertThat(actualAssignedKeyCounts) - .as("the number of assigned keys should match for every subtask") - .isEqualTo(expectedAssignedKeyCounts); - - // weight for every subtask shouldn't differ for more than some threshold relative to the - // expected weight - for (int subtaskId = 0; subtaskId < expectedNormalizedWeights.size(); ++subtaskId) { - double expectedWeight = expectedNormalizedWeights.get(subtaskId); - double min = expectedWeight * (1 - maxDriftPercentage / 100); - double max = expectedWeight * (1 + maxDriftPercentage / 100); - assertThat(actualNormalizedWeights.get(subtaskId)) - .as( - "Subtask %d weight should within %.1f percent of the expected range %s", - subtaskId, maxDriftPercentage, expectedWeight) - .isBetween(min, max); - } - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestRangePartitioner.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestRangePartitioner.java deleted file mode 100644 index 0485fdb7fa04..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestRangePartitioner.java +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.shuffle; - -import static org.apache.iceberg.flink.sink.shuffle.Fixtures.CHAR_KEYS; -import static org.apache.iceberg.flink.sink.shuffle.Fixtures.SCHEMA; -import static org.apache.iceberg.flink.sink.shuffle.Fixtures.SORT_ORDER; -import static org.assertj.core.api.Assertions.assertThat; - -import java.util.Set; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.StringData; -import org.apache.iceberg.SortKey; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.junit.jupiter.api.Test; - -public class TestRangePartitioner { - private final int numPartitions = 4; - - @Test - public void testRoundRobinRecordsBeforeStatisticsAvailable() { - RangePartitioner partitioner = new RangePartitioner(SCHEMA, SORT_ORDER); - Set results = Sets.newHashSetWithExpectedSize(numPartitions); - for (int i = 0; i < numPartitions; ++i) { - results.add( - partitioner.partition( - StatisticsOrRecord.fromRecord(GenericRowData.of(StringData.fromString("a"), 1)), - numPartitions)); - } - - // round-robin. every partition should get an assignment - assertThat(results).containsExactlyInAnyOrder(0, 1, 2, 3); - } - - @Test - public void testRoundRobinStatisticsWrapper() { - RangePartitioner partitioner = new RangePartitioner(SCHEMA, SORT_ORDER); - Set results = Sets.newHashSetWithExpectedSize(numPartitions); - for (int i = 0; i < numPartitions; ++i) { - GlobalStatistics statistics = - GlobalStatistics.fromRangeBounds(1L, new SortKey[] {CHAR_KEYS.get("a")}); - results.add( - partitioner.partition(StatisticsOrRecord.fromStatistics(statistics), numPartitions)); - } - - // round-robin. every partition should get an assignment - assertThat(results).containsExactlyInAnyOrder(0, 1, 2, 3); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchDataStatistics.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchDataStatistics.java deleted file mode 100644 index 396bfae2f13c..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchDataStatistics.java +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. 
See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.shuffle; - -import static org.apache.iceberg.flink.sink.shuffle.Fixtures.CHAR_KEYS; -import static org.apache.iceberg.flink.sink.shuffle.Fixtures.ROW_WRAPPER; -import static org.assertj.core.api.Assertions.assertThat; - -import org.apache.datasketches.sampling.ReservoirItemsSketch; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.StringData; -import org.apache.iceberg.SortKey; -import org.junit.jupiter.api.Test; - -public class TestSketchDataStatistics { - @SuppressWarnings("unchecked") - @Test - public void testAddsAndGet() { - SketchDataStatistics dataStatistics = new SketchDataStatistics(128); - - GenericRowData reusedRow = GenericRowData.of(StringData.fromString("a"), 1); - Fixtures.SORT_KEY.wrap(ROW_WRAPPER.wrap(reusedRow)); - dataStatistics.add(Fixtures.SORT_KEY); - - reusedRow.setField(0, StringData.fromString("b")); - Fixtures.SORT_KEY.wrap(ROW_WRAPPER.wrap(reusedRow)); - dataStatistics.add(Fixtures.SORT_KEY); - - reusedRow.setField(0, StringData.fromString("c")); - Fixtures.SORT_KEY.wrap(ROW_WRAPPER.wrap(reusedRow)); - dataStatistics.add(Fixtures.SORT_KEY); - - reusedRow.setField(0, StringData.fromString("b")); - Fixtures.SORT_KEY.wrap(ROW_WRAPPER.wrap(reusedRow)); - dataStatistics.add(Fixtures.SORT_KEY); - - ReservoirItemsSketch actual = (ReservoirItemsSketch) dataStatistics.result(); - assertThat(actual.getSamples()) - .isEqualTo( - new SortKey[] { - CHAR_KEYS.get("a"), CHAR_KEYS.get("b"), CHAR_KEYS.get("c"), CHAR_KEYS.get("b") - }); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchRangePartitioner.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchRangePartitioner.java deleted file mode 100644 index 378c6afff077..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchRangePartitioner.java +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink.shuffle; - -import static org.assertj.core.api.Assertions.assertThat; - -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.SortKey; -import org.apache.iceberg.SortOrder; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.RowDataWrapper; -import org.apache.iceberg.flink.TestFixtures; -import org.junit.jupiter.api.Test; - -public class TestSketchRangePartitioner { - // sort on the long id field - private static final SortOrder SORT_ORDER = - SortOrder.builderFor(TestFixtures.SCHEMA).asc("id").build(); - private static final SortKey SORT_KEY = new SortKey(TestFixtures.SCHEMA, SORT_ORDER); - private static final RowType ROW_TYPE = FlinkSchemaUtil.convert(TestFixtures.SCHEMA); - private static final int NUM_PARTITIONS = 16; - private static final long RANGE_STEP = 1_000; - private static final long MAX_ID = RANGE_STEP * NUM_PARTITIONS; - private static final SortKey[] RANGE_BOUNDS = createRangeBounds(); - - /** - * To understand how range bounds are used in range partitioning, here is an example for human - * ages with 4 partitions: [15, 32, 60]. The 4 ranges would be - * - *
- *   • age <= 15
- *   • age > 15 && age <= 32
- *   • age > 32 && age <= 60
- *   • age > 60
- *
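 *   (Worked illustration added for clarity, not part of the original file: with these bounds an
 *   age of 40 is greater than 32 and at most 60, so it falls into the third range, i.e. partition
 *   index 2, while an age of 70 exceeds the last bound and goes to the final partition, index 3.)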
    - */ - private static SortKey[] createRangeBounds() { - SortKey[] rangeBounds = new SortKey[NUM_PARTITIONS - 1]; - for (int i = 0; i < NUM_PARTITIONS - 1; ++i) { - RowData rowData = - GenericRowData.of( - StringData.fromString("data"), - RANGE_STEP * (i + 1), - StringData.fromString("2023-06-20")); - RowDataWrapper keyWrapper = new RowDataWrapper(ROW_TYPE, TestFixtures.SCHEMA.asStruct()); - keyWrapper.wrap(rowData); - SortKey sortKey = new SortKey(TestFixtures.SCHEMA, SORT_ORDER); - sortKey.wrap(keyWrapper); - rangeBounds[i] = sortKey; - } - - return rangeBounds; - } - - @Test - public void testRangePartitioningWithRangeBounds() { - SketchRangePartitioner partitioner = - new SketchRangePartitioner(TestFixtures.SCHEMA, SORT_ORDER, RANGE_BOUNDS); - GenericRowData row = - GenericRowData.of(StringData.fromString("data"), 0L, StringData.fromString("2023-06-20")); - for (long id = 0; id < MAX_ID; ++id) { - row.setField(1, id); - int partition = partitioner.partition(row, NUM_PARTITIONS); - assertThat(partition).isGreaterThanOrEqualTo(0).isLessThan(NUM_PARTITIONS); - int expectedPartition = id == 0L ? 0 : (int) ((id - 1) / RANGE_STEP); - assertThat(partition).isEqualTo(expectedPartition); - } - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchUtil.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchUtil.java deleted file mode 100644 index 16202c075ea0..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSketchUtil.java +++ /dev/null @@ -1,189 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink.shuffle; - -import static org.apache.iceberg.flink.sink.shuffle.Fixtures.CHAR_KEYS; -import static org.apache.iceberg.flink.sink.shuffle.Fixtures.SORT_ORDER_COMPARTOR; -import static org.assertj.core.api.Assertions.assertThat; - -import org.apache.iceberg.SortKey; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.params.ParameterizedTest; -import org.junit.jupiter.params.provider.ValueSource; - -public class TestSketchUtil { - @Test - public void testCoordinatorReservoirSize() { - // adjusted to over min threshold of 10_000 and is divisible by number of partitions (3) - assertThat(SketchUtil.determineCoordinatorReservoirSize(3)).isEqualTo(10_002); - // adjust to multiplier of 100 - assertThat(SketchUtil.determineCoordinatorReservoirSize(123)).isEqualTo(123_00); - // adjusted to below max threshold of 1_000_000 and is divisible by number of partitions (3) - assertThat(SketchUtil.determineCoordinatorReservoirSize(10_123)) - .isEqualTo(1_000_000 - (1_000_000 % 10_123)); - } - - @Test - public void testOperatorReservoirSize() { - assertThat(SketchUtil.determineOperatorReservoirSize(5, 3)) - .isEqualTo((10_002 * SketchUtil.OPERATOR_OVER_SAMPLE_RATIO) / 5); - assertThat(SketchUtil.determineOperatorReservoirSize(123, 123)) - .isEqualTo((123_00 * SketchUtil.OPERATOR_OVER_SAMPLE_RATIO) / 123); - assertThat(SketchUtil.determineOperatorReservoirSize(256, 123)) - .isEqualTo( - (int) Math.ceil((double) (123_00 * SketchUtil.OPERATOR_OVER_SAMPLE_RATIO) / 256)); - assertThat(SketchUtil.determineOperatorReservoirSize(5_120, 10_123)) - .isEqualTo( - (int) Math.ceil((double) (992_054 * SketchUtil.OPERATOR_OVER_SAMPLE_RATIO) / 5_120)); - } - - @Test - public void testRangeBoundsOneChannel() { - assertThat( - SketchUtil.rangeBounds( - 1, - SORT_ORDER_COMPARTOR, - new SortKey[] { - CHAR_KEYS.get("a"), - CHAR_KEYS.get("b"), - CHAR_KEYS.get("c"), - CHAR_KEYS.get("d"), - CHAR_KEYS.get("e"), - CHAR_KEYS.get("f") - })) - .isEmpty(); - } - - @Test - public void testRangeBoundsDivisible() { - assertThat( - SketchUtil.rangeBounds( - 3, - SORT_ORDER_COMPARTOR, - new SortKey[] { - CHAR_KEYS.get("a"), - CHAR_KEYS.get("b"), - CHAR_KEYS.get("c"), - CHAR_KEYS.get("d"), - CHAR_KEYS.get("e"), - CHAR_KEYS.get("f") - })) - .containsExactly(CHAR_KEYS.get("b"), CHAR_KEYS.get("d")); - } - - @Test - public void testRangeBoundsNonDivisible() { - // step is 3 = ceiling(11/4) - assertThat( - SketchUtil.rangeBounds( - 4, - SORT_ORDER_COMPARTOR, - new SortKey[] { - CHAR_KEYS.get("a"), - CHAR_KEYS.get("b"), - CHAR_KEYS.get("c"), - CHAR_KEYS.get("d"), - CHAR_KEYS.get("e"), - CHAR_KEYS.get("f"), - CHAR_KEYS.get("g"), - CHAR_KEYS.get("h"), - CHAR_KEYS.get("i"), - CHAR_KEYS.get("j"), - CHAR_KEYS.get("k"), - })) - .containsExactly(CHAR_KEYS.get("c"), CHAR_KEYS.get("f"), CHAR_KEYS.get("i")); - } - - @Test - public void testRangeBoundsSkipDuplicates() { - // step is 3 = ceiling(11/4) - assertThat( - SketchUtil.rangeBounds( - 4, - SORT_ORDER_COMPARTOR, - new SortKey[] { - CHAR_KEYS.get("a"), - CHAR_KEYS.get("b"), - CHAR_KEYS.get("c"), - CHAR_KEYS.get("c"), - CHAR_KEYS.get("c"), - CHAR_KEYS.get("c"), - CHAR_KEYS.get("g"), - CHAR_KEYS.get("h"), - CHAR_KEYS.get("i"), - CHAR_KEYS.get("j"), - CHAR_KEYS.get("k"), - })) - // skipped duplicate c's - .containsExactly(CHAR_KEYS.get("c"), CHAR_KEYS.get("g"), CHAR_KEYS.get("j")); - } - - @ParameterizedTest - @ValueSource(ints = {4, 6}) - public void testPartitioningAndScaleUp(int numPartitions) { - // Range bounds are calculated based on 4 partitions - 
SortKey[] rangeBounds = - new SortKey[] {CHAR_KEYS.get("c"), CHAR_KEYS.get("j"), CHAR_KEYS.get("m")}; - - // <= c - assertPartition(0, CHAR_KEYS.get("a"), numPartitions, rangeBounds); - assertPartition(0, CHAR_KEYS.get("c"), numPartitions, rangeBounds); - // > c && <= j - assertPartition(1, CHAR_KEYS.get("d"), numPartitions, rangeBounds); - assertPartition(1, CHAR_KEYS.get("i"), numPartitions, rangeBounds); - assertPartition(1, CHAR_KEYS.get("j"), numPartitions, rangeBounds); - // > j && <= m - assertPartition(2, CHAR_KEYS.get("k"), numPartitions, rangeBounds); - assertPartition(2, CHAR_KEYS.get("l"), numPartitions, rangeBounds); - assertPartition(2, CHAR_KEYS.get("m"), numPartitions, rangeBounds); - // > m - assertPartition(3, CHAR_KEYS.get("n"), numPartitions, rangeBounds); - assertPartition(3, CHAR_KEYS.get("z"), numPartitions, rangeBounds); - } - - @Test - public void testPartitionScaleDown() { - // Range bounds are calculated based on 4 partitions - SortKey[] rangeBounds = - new SortKey[] {CHAR_KEYS.get("c"), CHAR_KEYS.get("j"), CHAR_KEYS.get("m")}; - int numPartitions = 3; - - // <= c - assertPartition(0, CHAR_KEYS.get("a"), numPartitions, rangeBounds); - assertPartition(0, CHAR_KEYS.get("c"), numPartitions, rangeBounds); - // > c && <= j - assertPartition(1, CHAR_KEYS.get("d"), numPartitions, rangeBounds); - assertPartition(1, CHAR_KEYS.get("i"), numPartitions, rangeBounds); - assertPartition(1, CHAR_KEYS.get("j"), numPartitions, rangeBounds); - // > j && <= m - assertPartition(2, CHAR_KEYS.get("k"), numPartitions, rangeBounds); - assertPartition(2, CHAR_KEYS.get("l"), numPartitions, rangeBounds); - assertPartition(2, CHAR_KEYS.get("m"), numPartitions, rangeBounds); - // > m - // reassigns out-of-range partitions via mod (% 3 in this case) - assertPartition(0, CHAR_KEYS.get("n"), numPartitions, rangeBounds); - assertPartition(0, CHAR_KEYS.get("z"), numPartitions, rangeBounds); - } - - private static void assertPartition( - int expectedPartition, SortKey key, int numPartitions, SortKey[] rangeBounds) { - assertThat(SketchUtil.partition(key, numPartitions, rangeBounds, SORT_ORDER_COMPARTOR)) - .isEqualTo(expectedPartition); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerBase.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerBase.java deleted file mode 100644 index c7fea015142c..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerBase.java +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink.shuffle; - -import org.apache.flink.api.common.typeutils.SerializerTestBase; -import org.apache.flink.api.common.typeutils.TypeSerializer; -import org.apache.flink.table.data.GenericRowData; -import org.apache.iceberg.Schema; -import org.apache.iceberg.SortKey; -import org.apache.iceberg.SortOrder; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.RowDataWrapper; - -public abstract class TestSortKeySerializerBase extends SerializerTestBase { - - protected abstract Schema schema(); - - protected abstract SortOrder sortOrder(); - - protected abstract GenericRowData rowData(); - - @Override - protected TypeSerializer createSerializer() { - return new SortKeySerializer(schema(), sortOrder()); - } - - @Override - protected int getLength() { - return -1; - } - - @Override - protected Class getTypeClass() { - return SortKey.class; - } - - @Override - protected SortKey[] getTestData() { - return new SortKey[] {sortKey()}; - } - - private SortKey sortKey() { - RowDataWrapper rowDataWrapper = - new RowDataWrapper(FlinkSchemaUtil.convert(schema()), schema().asStruct()); - SortKey sortKey = new SortKey(schema(), sortOrder()); - sortKey.wrap(rowDataWrapper.wrap(rowData())); - return sortKey; - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerNestedStruct.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerNestedStruct.java deleted file mode 100644 index 0000688a8b55..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerNestedStruct.java +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink.shuffle; - -import org.apache.flink.table.data.GenericRowData; -import org.apache.iceberg.NullOrder; -import org.apache.iceberg.Schema; -import org.apache.iceberg.SortDirection; -import org.apache.iceberg.SortOrder; -import org.apache.iceberg.expressions.Expressions; -import org.apache.iceberg.flink.DataGenerator; -import org.apache.iceberg.flink.DataGenerators; - -public class TestSortKeySerializerNestedStruct extends TestSortKeySerializerBase { - private final DataGenerator generator = new DataGenerators.StructOfStruct(); - - @Override - protected Schema schema() { - return generator.icebergSchema(); - } - - @Override - protected SortOrder sortOrder() { - return SortOrder.builderFor(schema()) - .asc("row_id") - .sortBy( - Expressions.bucket("struct_of_struct.id", 4), SortDirection.DESC, NullOrder.NULLS_LAST) - .sortBy( - Expressions.truncate("struct_of_struct.person_struct.name", 16), - SortDirection.ASC, - NullOrder.NULLS_FIRST) - .build(); - } - - @Override - protected GenericRowData rowData() { - return generator.generateFlinkRowData(); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerPrimitives.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerPrimitives.java deleted file mode 100644 index 54cceae6e55b..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerPrimitives.java +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.sink.shuffle; - -import static org.assertj.core.api.AssertionsForClassTypes.assertThat; - -import org.apache.flink.core.memory.DataInputDeserializer; -import org.apache.flink.core.memory.DataOutputSerializer; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.iceberg.NullOrder; -import org.apache.iceberg.Schema; -import org.apache.iceberg.SortDirection; -import org.apache.iceberg.SortKey; -import org.apache.iceberg.SortOrder; -import org.apache.iceberg.StructLike; -import org.apache.iceberg.expressions.Expressions; -import org.apache.iceberg.flink.DataGenerator; -import org.apache.iceberg.flink.DataGenerators; -import org.apache.iceberg.flink.RowDataWrapper; -import org.junit.jupiter.api.Test; - -public class TestSortKeySerializerPrimitives extends TestSortKeySerializerBase { - private final DataGenerator generator = new DataGenerators.Primitives(); - - @Override - protected Schema schema() { - return generator.icebergSchema(); - } - - @Override - protected SortOrder sortOrder() { - return SortOrder.builderFor(schema()) - .asc("boolean_field") - .sortBy(Expressions.bucket("int_field", 4), SortDirection.DESC, NullOrder.NULLS_LAST) - .sortBy(Expressions.truncate("string_field", 2), SortDirection.ASC, NullOrder.NULLS_FIRST) - .sortBy(Expressions.bucket("uuid_field", 16), SortDirection.ASC, NullOrder.NULLS_FIRST) - .sortBy(Expressions.hour("ts_with_zone_field"), SortDirection.ASC, NullOrder.NULLS_FIRST) - .sortBy(Expressions.day("ts_without_zone_field"), SortDirection.ASC, NullOrder.NULLS_FIRST) - // can not test HeapByteBuffer due to equality test inside SerializerTestBase - // .sortBy(Expressions.truncate("binary_field", 2), SortDirection.ASC, - // NullOrder.NULLS_FIRST) - .build(); - } - - @Override - protected GenericRowData rowData() { - return generator.generateFlinkRowData(); - } - - @Test - public void testSerializationSize() throws Exception { - RowData rowData = - GenericRowData.of(StringData.fromString("550e8400-e29b-41d4-a716-446655440000"), 1L); - RowDataWrapper rowDataWrapper = - new RowDataWrapper(Fixtures.ROW_TYPE, Fixtures.SCHEMA.asStruct()); - StructLike struct = rowDataWrapper.wrap(rowData); - SortKey sortKey = Fixtures.SORT_KEY.copy(); - sortKey.wrap(struct); - SortKeySerializer serializer = new SortKeySerializer(Fixtures.SCHEMA, Fixtures.SORT_ORDER); - DataOutputSerializer output = new DataOutputSerializer(1024); - serializer.serialize(sortKey, output); - byte[] serializedBytes = output.getCopyOfBuffer(); - assertThat(serializedBytes.length) - .as( - "Serialized bytes for sort key should be 38 bytes (34 UUID text + 4 byte integer of string length") - .isEqualTo(38); - - DataInputDeserializer input = new DataInputDeserializer(serializedBytes); - SortKey deserialized = serializer.deserialize(input); - assertThat(deserialized).isEqualTo(sortKey); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerSnapshot.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerSnapshot.java deleted file mode 100644 index 012654603b04..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeySerializerSnapshot.java +++ /dev/null @@ -1,213 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.shuffle; - -import static org.apache.iceberg.flink.sink.shuffle.Fixtures.ROW_TYPE; -import static org.apache.iceberg.flink.sink.shuffle.Fixtures.SCHEMA; -import static org.apache.iceberg.flink.sink.shuffle.Fixtures.SORT_KEY; -import static org.apache.iceberg.flink.sink.shuffle.Fixtures.SORT_ORDER; -import static org.assertj.core.api.AssertionsForClassTypes.assertThat; - -import java.io.IOException; -import org.apache.flink.api.common.typeutils.TypeSerializer; -import org.apache.flink.api.common.typeutils.TypeSerializerSchemaCompatibility; -import org.apache.flink.api.common.typeutils.TypeSerializerSnapshot; -import org.apache.flink.core.memory.DataInputDeserializer; -import org.apache.flink.core.memory.DataInputView; -import org.apache.flink.core.memory.DataOutputSerializer; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.iceberg.Schema; -import org.apache.iceberg.SortKey; -import org.apache.iceberg.SortOrder; -import org.apache.iceberg.StructLike; -import org.apache.iceberg.flink.RowDataWrapper; -import org.apache.iceberg.types.Types; -import org.junit.jupiter.api.Test; - -public class TestSortKeySerializerSnapshot { - private final Schema schema = - new Schema( - Types.NestedField.optional(1, "ts", Types.TimestampType.withoutZone()), - Types.NestedField.optional(2, "str", Types.StringType.get()), - Types.NestedField.optional(3, "int", Types.IntegerType.get()), - Types.NestedField.optional(4, "boolean", Types.BooleanType.get())); - private final SortOrder sortOrder = SortOrder.builderFor(schema).asc("str").asc("int").build(); - - @Test - public void testRestoredSerializer() throws Exception { - RowData rowData = GenericRowData.of(StringData.fromString("str"), 1); - RowDataWrapper rowDataWrapper = new RowDataWrapper(ROW_TYPE, SCHEMA.asStruct()); - StructLike struct = rowDataWrapper.wrap(rowData); - SortKey sortKey = SORT_KEY.copy(); - sortKey.wrap(struct); - - SortKeySerializer originalSerializer = new SortKeySerializer(SCHEMA, SORT_ORDER); - TypeSerializerSnapshot snapshot = - roundTrip(originalSerializer.snapshotConfiguration()); - TypeSerializer restoredSerializer = snapshot.restoreSerializer(); - - DataOutputSerializer output = new DataOutputSerializer(1024); - originalSerializer.serialize(sortKey, output); - byte[] serializedBytes = output.getCopyOfBuffer(); - - DataInputDeserializer input = new DataInputDeserializer(serializedBytes); - SortKey deserialized = restoredSerializer.deserialize(input); - assertThat(deserialized).isEqualTo(sortKey); - } - - @Test - public void testSnapshotIsCompatibleWithSameSortOrder() throws Exception { - SortKeySerializer oldSerializer = new SortKeySerializer(schema, sortOrder); - 
SortKeySerializer.SortKeySerializerSnapshot oldSnapshot = - roundTrip(oldSerializer.snapshotConfiguration()); - - SortKeySerializer newSerializer = new SortKeySerializer(schema, sortOrder); - - TypeSerializerSchemaCompatibility resultCompatibility = - oldSnapshot.resolveSchemaCompatibility(newSerializer); - assertThat(resultCompatibility.isCompatibleAsIs()).isTrue(); - } - - @Test - public void testSnapshotIsCompatibleWithRemoveNonSortField() throws Exception { - SortKeySerializer oldSerializer = new SortKeySerializer(schema, sortOrder); - SortKeySerializer.SortKeySerializerSnapshot oldSnapshot = - roundTrip(oldSerializer.snapshotConfiguration()); - - // removed non-sort boolean field - Schema newSchema = - new Schema( - Types.NestedField.optional(1, "ts", Types.TimestampType.withoutZone()), - Types.NestedField.optional(2, "str", Types.StringType.get()), - Types.NestedField.optional(3, "int", Types.IntegerType.get())); - SortOrder newSortOrder = SortOrder.builderFor(newSchema).asc("str").asc("int").build(); - SortKeySerializer newSerializer = new SortKeySerializer(newSchema, newSortOrder); - - TypeSerializerSchemaCompatibility resultCompatibility = - oldSnapshot.resolveSchemaCompatibility(newSerializer); - assertThat(resultCompatibility.isCompatibleAsIs()).isTrue(); - } - - @Test - public void testSnapshotIsCompatibleWithAddNonSortField() throws Exception { - SortKeySerializer oldSerializer = new SortKeySerializer(schema, sortOrder); - SortKeySerializer.SortKeySerializerSnapshot oldSnapshot = - roundTrip(oldSerializer.snapshotConfiguration()); - - // add a new non-sort float field - Schema newSchema = - new Schema( - Types.NestedField.optional(1, "ts", Types.TimestampType.withoutZone()), - Types.NestedField.optional(2, "str", Types.StringType.get()), - Types.NestedField.optional(3, "int", Types.IntegerType.get()), - Types.NestedField.optional(4, "boolean", Types.BooleanType.get()), - Types.NestedField.required(5, "float", Types.FloatType.get())); - SortOrder newSortOrder = SortOrder.builderFor(newSchema).asc("str").asc("int").build(); - SortKeySerializer newSerializer = new SortKeySerializer(newSchema, newSortOrder); - - TypeSerializerSchemaCompatibility resultCompatibility = - oldSnapshot.resolveSchemaCompatibility(newSerializer); - assertThat(resultCompatibility.isCompatibleAsIs()).isTrue(); - } - - @Test - public void testSnapshotIsIncompatibleWithIncompatibleSchema() throws Exception { - SortKeySerializer oldSerializer = new SortKeySerializer(schema, sortOrder); - SortKeySerializer.SortKeySerializerSnapshot oldSnapshot = - roundTrip(oldSerializer.snapshotConfiguration()); - - // change str field to a long type - Schema newSchema = - new Schema( - Types.NestedField.optional(1, "ts", Types.TimestampType.withoutZone()), - Types.NestedField.optional(2, "str", Types.LongType.get()), - Types.NestedField.optional(3, "int", Types.IntegerType.get()), - Types.NestedField.optional(4, "boolean", Types.BooleanType.get())); - SortOrder newSortOrder = SortOrder.builderFor(newSchema).asc("str").asc("int").build(); - // switch sort field order - SortKeySerializer newSerializer = new SortKeySerializer(newSchema, newSortOrder); - - TypeSerializerSchemaCompatibility resultCompatibility = - oldSnapshot.resolveSchemaCompatibility(newSerializer); - assertThat(resultCompatibility.isIncompatible()).isTrue(); - } - - @Test - public void testSnapshotIsIncompatibleWithAddSortField() throws Exception { - SortKeySerializer oldSerializer = new SortKeySerializer(schema, sortOrder); - 
SortKeySerializer.SortKeySerializerSnapshot oldSnapshot = - roundTrip(oldSerializer.snapshotConfiguration()); - - // removed str field from sort order - SortOrder newSortOrder = - SortOrder.builderFor(schema).asc("str").asc("int").desc("boolean").build(); - SortKeySerializer newSerializer = new SortKeySerializer(schema, newSortOrder); - - TypeSerializerSchemaCompatibility resultCompatibility = - oldSnapshot.resolveSchemaCompatibility(newSerializer); - assertThat(resultCompatibility.isIncompatible()).isTrue(); - } - - @Test - public void testSnapshotIsIncompatibleWithRemoveSortField() throws Exception { - SortKeySerializer oldSerializer = new SortKeySerializer(schema, sortOrder); - SortKeySerializer.SortKeySerializerSnapshot oldSnapshot = - roundTrip(oldSerializer.snapshotConfiguration()); - - // remove str field from sort order - SortOrder newSortOrder = SortOrder.builderFor(schema).asc("int").build(); - SortKeySerializer newSerializer = new SortKeySerializer(schema, newSortOrder); - - TypeSerializerSchemaCompatibility resultCompatibility = - oldSnapshot.resolveSchemaCompatibility(newSerializer); - assertThat(resultCompatibility.isIncompatible()).isTrue(); - } - - @Test - public void testSnapshotIsIncompatibleWithSortFieldsOrderChange() throws Exception { - SortKeySerializer oldSerializer = new SortKeySerializer(schema, sortOrder); - SortKeySerializer.SortKeySerializerSnapshot oldSnapshot = - roundTrip(oldSerializer.snapshotConfiguration()); - - // switch sort field order - SortOrder newSortOrder = SortOrder.builderFor(schema).asc("int").asc("str").build(); - SortKeySerializer newSerializer = new SortKeySerializer(schema, newSortOrder); - - TypeSerializerSchemaCompatibility resultCompatibility = - oldSnapshot.resolveSchemaCompatibility(newSerializer); - assertThat(resultCompatibility.isIncompatible()).isTrue(); - } - - /** Copied from Flink {@code AvroSerializerSnapshotTest} */ - private static SortKeySerializer.SortKeySerializerSnapshot roundTrip( - TypeSerializerSnapshot original) throws IOException { - // writeSnapshot(); - DataOutputSerializer out = new DataOutputSerializer(1024); - original.writeSnapshot(out); - // init - SortKeySerializer.SortKeySerializerSnapshot restored = - new SortKeySerializer.SortKeySerializerSnapshot(); - // readSnapshot(); - DataInputView in = new DataInputDeserializer(out.wrapAsByteBuffer()); - restored.readSnapshot(restored.getCurrentVersion(), in, original.getClass().getClassLoader()); - return restored; - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeyUtil.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeyUtil.java deleted file mode 100644 index 1be7e27f2c01..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/sink/shuffle/TestSortKeyUtil.java +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.sink.shuffle; - -import static org.assertj.core.api.Assertions.assertThat; - -import org.apache.iceberg.NullOrder; -import org.apache.iceberg.Schema; -import org.apache.iceberg.SortDirection; -import org.apache.iceberg.SortOrder; -import org.apache.iceberg.expressions.Expressions; -import org.apache.iceberg.types.Types; -import org.junit.jupiter.api.Test; - -public class TestSortKeyUtil { - @Test - public void testResultSchema() { - Schema schema = - new Schema( - Types.NestedField.required(1, "id", Types.StringType.get()), - Types.NestedField.required(2, "ratio", Types.DoubleType.get()), - Types.NestedField.optional( - 3, - "user", - Types.StructType.of( - Types.NestedField.required(11, "name", Types.StringType.get()), - Types.NestedField.required(12, "ts", Types.TimestampType.withoutZone()), - Types.NestedField.optional(13, "device_id", Types.UUIDType.get()), - Types.NestedField.optional( - 14, - "location", - Types.StructType.of( - Types.NestedField.required(101, "lat", Types.FloatType.get()), - Types.NestedField.required(102, "long", Types.FloatType.get()), - Types.NestedField.required(103, "blob", Types.BinaryType.get())))))); - - SortOrder sortOrder = - SortOrder.builderFor(schema) - .asc("ratio") - .sortBy(Expressions.hour("user.ts"), SortDirection.ASC, NullOrder.NULLS_FIRST) - .sortBy( - Expressions.bucket("user.device_id", 16), SortDirection.ASC, NullOrder.NULLS_FIRST) - .sortBy( - Expressions.truncate("user.location.blob", 16), - SortDirection.ASC, - NullOrder.NULLS_FIRST) - .build(); - - assertThat(SortKeyUtil.sortKeySchema(schema, sortOrder).asStruct()) - .isEqualTo( - Types.StructType.of( - Types.NestedField.required(0, "ratio_0", Types.DoubleType.get()), - Types.NestedField.required(1, "ts_1", Types.IntegerType.get()), - Types.NestedField.optional(2, "device_id_2", Types.IntegerType.get()), - Types.NestedField.required(3, "blob_3", Types.BinaryType.get()))); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTableFactory.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTableFactory.java deleted file mode 100644 index a08578a4c106..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTableFactory.java +++ /dev/null @@ -1,170 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. 
See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.concurrent.atomic.AtomicInteger; -import java.util.function.Supplier; -import java.util.stream.Stream; -import org.apache.flink.api.java.typeutils.RowTypeInfo; -import org.apache.flink.configuration.ConfigOption; -import org.apache.flink.configuration.ConfigOptions; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.streaming.api.functions.source.SourceFunction; -import org.apache.flink.table.api.TableSchema; -import org.apache.flink.table.connector.ChangelogMode; -import org.apache.flink.table.connector.ProviderContext; -import org.apache.flink.table.connector.source.DataStreamScanProvider; -import org.apache.flink.table.connector.source.DynamicTableSource; -import org.apache.flink.table.connector.source.ScanTableSource; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.util.DataFormatConverters; -import org.apache.flink.table.factories.DynamicTableSourceFactory; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.table.utils.TableSchemaUtils; -import org.apache.flink.types.Row; -import org.apache.flink.types.RowKind; -import org.apache.iceberg.flink.util.FlinkCompatibilityUtil; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; - -public class BoundedTableFactory implements DynamicTableSourceFactory { - private static final AtomicInteger DATA_SET_ID = new AtomicInteger(0); - private static final Map>> DATA_SETS = Maps.newHashMap(); - - private static final ConfigOption DATA_ID = - ConfigOptions.key("data-id").stringType().noDefaultValue(); - - public static String registerDataSet(List> dataSet) { - String dataSetId = String.valueOf(DATA_SET_ID.incrementAndGet()); - DATA_SETS.put(dataSetId, dataSet); - return dataSetId; - } - - public static void clearDataSets() { - DATA_SETS.clear(); - } - - @Override - public DynamicTableSource createDynamicTableSource(Context context) { - TableSchema tableSchema = - TableSchemaUtils.getPhysicalSchema(context.getCatalogTable().getSchema()); - - Configuration configuration = Configuration.fromMap(context.getCatalogTable().getOptions()); - String dataId = configuration.getString(DATA_ID); - Preconditions.checkArgument( - DATA_SETS.containsKey(dataId), "data-id %s does not found in registered data set.", dataId); - - return new BoundedTableSource(DATA_SETS.get(dataId), tableSchema); - } - - @Override - public String factoryIdentifier() { - return "BoundedSource"; - } - - @Override - public Set> requiredOptions() { - return ImmutableSet.of(); - } - - @Override - public Set> optionalOptions() { - return ImmutableSet.of(DATA_ID); - } - - private static class BoundedTableSource implements ScanTableSource { - - private final List> elementsPerCheckpoint; - private final TableSchema tableSchema; - - private BoundedTableSource(List> elementsPerCheckpoint, TableSchema tableSchema) { - this.elementsPerCheckpoint = elementsPerCheckpoint; - this.tableSchema = tableSchema; - } - - private BoundedTableSource(BoundedTableSource toCopy) { - 
this.elementsPerCheckpoint = toCopy.elementsPerCheckpoint; - this.tableSchema = toCopy.tableSchema; - } - - @Override - public ChangelogMode getChangelogMode() { - Supplier> supplier = () -> elementsPerCheckpoint.stream().flatMap(List::stream); - - // Add the INSERT row kind by default. - ChangelogMode.Builder builder = ChangelogMode.newBuilder().addContainedKind(RowKind.INSERT); - - if (supplier.get().anyMatch(r -> r.getKind() == RowKind.DELETE)) { - builder.addContainedKind(RowKind.DELETE); - } - - if (supplier.get().anyMatch(r -> r.getKind() == RowKind.UPDATE_BEFORE)) { - builder.addContainedKind(RowKind.UPDATE_BEFORE); - } - - if (supplier.get().anyMatch(r -> r.getKind() == RowKind.UPDATE_AFTER)) { - builder.addContainedKind(RowKind.UPDATE_AFTER); - } - - return builder.build(); - } - - @Override - public ScanRuntimeProvider getScanRuntimeProvider(ScanContext runtimeProviderContext) { - return new DataStreamScanProvider() { - @Override - public DataStream produceDataStream( - ProviderContext providerContext, StreamExecutionEnvironment env) { - boolean checkpointEnabled = env.getCheckpointConfig().isCheckpointingEnabled(); - SourceFunction source = - new BoundedTestSource<>(elementsPerCheckpoint, checkpointEnabled); - - RowType rowType = (RowType) tableSchema.toRowDataType().getLogicalType(); - // Converter to convert the Row to RowData. - DataFormatConverters.RowConverter rowConverter = - new DataFormatConverters.RowConverter(tableSchema.getFieldDataTypes()); - - return env.addSource(source, new RowTypeInfo(tableSchema.getFieldTypes())) - .map(rowConverter::toInternal, FlinkCompatibilityUtil.toTypeInfo(rowType)); - } - - @Override - public boolean isBounded() { - return true; - } - }; - } - - @Override - public DynamicTableSource copy() { - return new BoundedTableSource(this); - } - - @Override - public String asSummaryString() { - return "Bounded test table source"; - } - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTestSource.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTestSource.java deleted file mode 100644 index 7b435d059845..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTestSource.java +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source; - -import java.util.Arrays; -import java.util.Collections; -import java.util.List; -import java.util.concurrent.atomic.AtomicInteger; -import org.apache.flink.api.common.state.CheckpointListener; -import org.apache.flink.streaming.api.functions.source.SourceFunction; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; - -/** - * A stream source that: 1) emits the elements from elementsPerCheckpoint.get(0) without allowing - * checkpoints. 2) then waits for the checkpoint to complete. 3) emits the elements from - * elementsPerCheckpoint.get(1) without allowing checkpoints. 4) then waits for the checkpoint to - * complete. 5) ... - * - *
    Util all the list from elementsPerCheckpoint are exhausted. - */ -public final class BoundedTestSource implements SourceFunction, CheckpointListener { - - private final List> elementsPerCheckpoint; - private final boolean checkpointEnabled; - private volatile boolean running = true; - - private final AtomicInteger numCheckpointsComplete = new AtomicInteger(0); - - /** Emits all those elements in several checkpoints. */ - public BoundedTestSource(List> elementsPerCheckpoint, boolean checkpointEnabled) { - this.elementsPerCheckpoint = elementsPerCheckpoint; - this.checkpointEnabled = checkpointEnabled; - } - - public BoundedTestSource(List> elementsPerCheckpoint) { - this(elementsPerCheckpoint, true); - } - - /** Emits all those elements in a single checkpoint. */ - public BoundedTestSource(T... elements) { - this(Collections.singletonList(Arrays.asList(elements))); - } - - @Override - public void run(SourceContext ctx) throws Exception { - if (!checkpointEnabled) { - Preconditions.checkArgument( - elementsPerCheckpoint.size() <= 1, - "There should be at most one list in the elementsPerCheckpoint when checkpoint is disabled."); - elementsPerCheckpoint.stream().flatMap(List::stream).forEach(ctx::collect); - return; - } - - for (List elements : elementsPerCheckpoint) { - - final int checkpointToAwait; - synchronized (ctx.getCheckpointLock()) { - // Let's say checkpointToAwait = numCheckpointsComplete.get() + delta, in fact the value of - // delta should not - // affect the final table records because we only need to make sure that there will be - // exactly - // elementsPerCheckpoint.size() checkpoints to emit each records buffer from the original - // elementsPerCheckpoint. - // Even if the checkpoints that emitted results are not continuous, the correctness of the - // data should not be - // affected in the end. Setting the delta to be 2 is introducing the variable that produce - // un-continuous - // checkpoints that emit the records buffer from elementsPerCheckpoints. - checkpointToAwait = numCheckpointsComplete.get() + 2; - for (T element : elements) { - ctx.collect(element); - } - } - - synchronized (ctx.getCheckpointLock()) { - while (running && numCheckpointsComplete.get() < checkpointToAwait) { - ctx.getCheckpointLock().wait(1); - } - } - } - } - - @Override - public void notifyCheckpointComplete(long checkpointId) throws Exception { - numCheckpointsComplete.incrementAndGet(); - } - - @Override - public void cancel() { - running = false; - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/ChangeLogTableTestBase.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/ChangeLogTableTestBase.java deleted file mode 100644 index 5dfbbe3abe73..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/ChangeLogTableTestBase.java +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.util.List; -import java.util.stream.Collectors; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.table.api.EnvironmentSettings; -import org.apache.flink.table.api.TableEnvironment; -import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; -import org.apache.flink.types.Row; -import org.apache.flink.types.RowKind; -import org.apache.iceberg.flink.MiniFlinkClusterExtension; -import org.apache.iceberg.flink.TestBase; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.TestInfo; - -public class ChangeLogTableTestBase extends TestBase { - private volatile TableEnvironment tEnv = null; - - protected String tableName; - - @BeforeEach - public void setup(TestInfo testInfo) { - assertThat(testInfo.getTestMethod()).isPresent(); - this.tableName = testInfo.getTestMethod().get().getName(); - } - - @AfterEach - public void clean() { - sql("DROP TABLE IF EXISTS %s", tableName); - BoundedTableFactory.clearDataSets(); - } - - @Override - protected TableEnvironment getTableEnv() { - if (tEnv == null) { - synchronized (this) { - if (tEnv == null) { - EnvironmentSettings settings = - EnvironmentSettings.newInstance().inStreamingMode().build(); - - StreamExecutionEnvironment env = - StreamExecutionEnvironment.getExecutionEnvironment( - MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG) - .enableCheckpointing(400) - .setMaxParallelism(1) - .setParallelism(1); - - tEnv = StreamTableEnvironment.create(env, settings); - } - } - } - return tEnv; - } - - protected static Row insertRow(Object... values) { - return Row.ofKind(RowKind.INSERT, values); - } - - protected static Row deleteRow(Object... values) { - return Row.ofKind(RowKind.DELETE, values); - } - - protected static Row updateBeforeRow(Object... values) { - return Row.ofKind(RowKind.UPDATE_BEFORE, values); - } - - protected static Row updateAfterRow(Object... values) { - return Row.ofKind(RowKind.UPDATE_AFTER, values); - } - - protected static List listJoin(List> lists) { - return lists.stream().flatMap(List::stream).collect(Collectors.toList()); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/SplitHelpers.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/SplitHelpers.java deleted file mode 100644 index 540902f3cea5..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/SplitHelpers.java +++ /dev/null @@ -1,200 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import static org.assertj.core.api.Assertions.assertThat; -import static org.mockito.Mockito.doReturn; -import static org.mockito.Mockito.spy; - -import java.io.File; -import java.io.IOException; -import java.nio.file.Path; -import java.util.List; -import java.util.stream.Collectors; -import org.apache.hadoop.conf.Configuration; -import org.apache.iceberg.BaseCombinedScanTask; -import org.apache.iceberg.BaseFileScanTask; -import org.apache.iceberg.CombinedScanTask; -import org.apache.iceberg.DeleteFile; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.FileMetadata; -import org.apache.iceberg.FileScanTask; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.PartitionSpecParser; -import org.apache.iceberg.SchemaParser; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.data.GenericAppenderHelper; -import org.apache.iceberg.data.RandomGenericData; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.expressions.ResidualEvaluator; -import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.flink.source.split.IcebergSourceSplit; -import org.apache.iceberg.hadoop.HadoopCatalog; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.util.ThreadPools; - -public class SplitHelpers { - - private SplitHelpers() {} - - /** - * This create a list of IcebergSourceSplit from real files - *
  • Create a new Hadoop table under the {@code temporaryFolder} - *
  • write {@code fileCount} number of files to the new Iceberg table - *
  • Discover the splits from the table and partition the splits by the {@code filePerSplit} - * limit - *
  • Delete the Hadoop table - * - *
     Since the table and data files are deleted before this method return, caller shouldn't - * attempt to read the data files. - * - *
    By default, v1 Iceberg table is created. For v2 table use {@link - * SplitHelpers#createSplitsFromTransientHadoopTable(Path, int, int, String)} - * - * @param temporaryFolder Folder to place the data to - * @param fileCount The number of files to create and add to the table - * @param filesPerSplit The number of files used for a split - */ - public static List createSplitsFromTransientHadoopTable( - Path temporaryFolder, int fileCount, int filesPerSplit) throws Exception { - return createSplitsFromTransientHadoopTable(temporaryFolder, fileCount, filesPerSplit, "1"); - } - - /** - * This create a list of IcebergSourceSplit from real files - *
  • Create a new Hadoop table under the {@code temporaryFolder} - *
  • write {@code fileCount} number of files to the new Iceberg table - *
  • Discover the splits from the table and partition the splits by the {@code filePerSplit} - * limit - *
  • Delete the Hadoop table - * - *
    Since the table and data files are deleted before this method return, caller shouldn't - * attempt to read the data files. - * - * @param temporaryFolder Folder to place the data to - * @param fileCount The number of files to create and add to the table - * @param filesPerSplit The number of files used for a split - * @param version The table version to create - */ - public static List createSplitsFromTransientHadoopTable( - Path temporaryFolder, int fileCount, int filesPerSplit, String version) throws Exception { - final File warehouseFile = File.createTempFile("junit", null, temporaryFolder.toFile()); - assertThat(warehouseFile.delete()).isTrue(); - final String warehouse = "file:" + warehouseFile; - Configuration hadoopConf = new Configuration(); - final HadoopCatalog catalog = new HadoopCatalog(hadoopConf, warehouse); - ImmutableMap properties = - ImmutableMap.of(TableProperties.FORMAT_VERSION, version); - try { - final Table table = - catalog.createTable( - TestFixtures.TABLE_IDENTIFIER, - TestFixtures.SCHEMA, - PartitionSpec.unpartitioned(), - null, - properties); - final GenericAppenderHelper dataAppender = - new GenericAppenderHelper(table, FileFormat.PARQUET, temporaryFolder); - for (int i = 0; i < fileCount; ++i) { - List records = RandomGenericData.generate(TestFixtures.SCHEMA, 2, i); - dataAppender.appendToTable(records); - } - - final ScanContext scanContext = ScanContext.builder().build(); - final List splits = - FlinkSplitPlanner.planIcebergSourceSplits( - table, scanContext, ThreadPools.getWorkerPool()); - return splits.stream() - .flatMap( - split -> { - List> filesList = - Lists.partition(Lists.newArrayList(split.task().files()), filesPerSplit); - return filesList.stream() - .map(files -> new BaseCombinedScanTask(files)) - .map( - combinedScanTask -> - IcebergSourceSplit.fromCombinedScanTask(combinedScanTask)); - }) - .collect(Collectors.toList()); - } finally { - catalog.dropTable(TestFixtures.TABLE_IDENTIFIER); - catalog.close(); - } - } - - /** - * This method will equip the {@code icebergSourceSplits} with mock delete files. - *
  • For each split, create {@code deleteFilesPerSplit} number of delete files - *
  • Replace the original {@code FileScanTask} with the new {@code FileScanTask} with mock - *
  • Caller should not attempt to read the deleted files since they are created as mock, and - * they are not real files - * - * @param icebergSourceSplits The real splits to equip with mock delete files - * @param temporaryFolder The temporary folder to create the mock delete files with - * @param deleteFilesPerSplit The number of delete files to create for each split - * @return The list of re-created splits with mock delete files - * @throws IOException If there is any error creating the mock delete files - */ - public static List equipSplitsWithMockDeleteFiles( - List icebergSourceSplits, Path temporaryFolder, int deleteFilesPerSplit) - throws IOException { - List icebergSourceSplitsWithMockDeleteFiles = Lists.newArrayList(); - for (IcebergSourceSplit split : icebergSourceSplits) { - final CombinedScanTask combinedScanTask = spy(split.task()); - - final List deleteFiles = Lists.newArrayList(); - final PartitionSpec spec = - PartitionSpec.builderFor(TestFixtures.SCHEMA).withSpecId(0).build(); - - for (int i = 0; i < deleteFilesPerSplit; ++i) { - final DeleteFile deleteFile = - FileMetadata.deleteFileBuilder(spec) - .withFormat(FileFormat.PARQUET) - .withPath(File.createTempFile("junit", null, temporaryFolder.toFile()).getPath()) - .ofPositionDeletes() - .withFileSizeInBytes(1000) - .withRecordCount(1000) - .build(); - deleteFiles.add(deleteFile); - } - - List newFileScanTasks = Lists.newArrayList(); - for (FileScanTask task : combinedScanTask.tasks()) { - String schemaString = SchemaParser.toJson(task.schema()); - String specString = PartitionSpecParser.toJson(task.spec()); - - BaseFileScanTask baseFileScanTask = - new BaseFileScanTask( - task.file(), - deleteFiles.toArray(new DeleteFile[] {}), - schemaString, - specString, - ResidualEvaluator.unpartitioned(task.residual())); - newFileScanTasks.add(baseFileScanTask); - } - doReturn(newFileScanTasks).when(combinedScanTask).tasks(); - icebergSourceSplitsWithMockDeleteFiles.add( - IcebergSourceSplit.fromCombinedScanTask( - combinedScanTask, split.fileOffset(), split.recordOffset())); - } - return icebergSourceSplitsWithMockDeleteFiles; - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/SqlHelpers.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/SqlHelpers.java deleted file mode 100644 index e4e48ca67f66..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/SqlHelpers.java +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source; - -import java.util.List; -import java.util.Map; -import org.apache.flink.table.api.TableEnvironment; -import org.apache.flink.table.api.TableResult; -import org.apache.flink.types.Row; -import org.apache.flink.util.CloseableIterator; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; - -public class SqlHelpers { - private SqlHelpers() {} - - public static List sql(TableEnvironment tableEnv, String query, Object... args) { - TableResult tableResult = tableEnv.executeSql(String.format(query, args)); - try (CloseableIterator iter = tableResult.collect()) { - List results = Lists.newArrayList(iter); - return results; - } catch (Exception e) { - throw new RuntimeException("Failed to collect table result", e); - } - } - - public static String sqlOptionsToString(Map sqlOptions) { - StringBuilder builder = new StringBuilder(); - sqlOptions.forEach((key, value) -> builder.append(optionToKv(key, value)).append(",")); - String optionStr = builder.toString(); - if (optionStr.endsWith(",")) { - optionStr = optionStr.substring(0, optionStr.length() - 1); - } - - if (!optionStr.isEmpty()) { - optionStr = String.format("/*+ OPTIONS(%s)*/", optionStr); - } - - return optionStr; - } - - private static String optionToKv(String key, Object value) { - return "'" + key + "'='" + value + "'"; - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TableSourceTestBase.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TableSourceTestBase.java deleted file mode 100644 index 32c81d9465a4..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TableSourceTestBase.java +++ /dev/null @@ -1,104 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.File; -import java.io.IOException; -import org.apache.flink.configuration.CoreOptions; -import org.apache.flink.table.api.TableEnvironment; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Parameter; -import org.apache.iceberg.ParameterizedTestExtension; -import org.apache.iceberg.Parameters; -import org.apache.iceberg.events.Listeners; -import org.apache.iceberg.events.ScanEvent; -import org.apache.iceberg.flink.FlinkConfigOptions; -import org.apache.iceberg.flink.TestBase; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.extension.ExtendWith; - -@ExtendWith(ParameterizedTestExtension.class) -public abstract class TableSourceTestBase extends TestBase { - @Parameters(name = "useFlip27Source = {0}") - protected static Object[][] parameters() { - return new Object[][] { - {false}, {true}, - }; - } - - @Parameter(index = 0) - protected boolean useFlip27Source; - - protected static final String CATALOG_NAME = "test_catalog"; - protected static final String DATABASE_NAME = "test_db"; - protected static final String TABLE_NAME = "test_table"; - protected final FileFormat format = FileFormat.AVRO; - protected int scanEventCount = 0; - protected ScanEvent lastScanEvent = null; - - @Override - protected TableEnvironment getTableEnv() { - super.getTableEnv().getConfig().getConfiguration().set(CoreOptions.DEFAULT_PARALLELISM, 1); - super.getTableEnv() - .getConfig() - .getConfiguration() - .setBoolean(FlinkConfigOptions.TABLE_EXEC_ICEBERG_USE_FLIP27_SOURCE.key(), useFlip27Source); - return super.getTableEnv(); - } - - @BeforeEach - public void before() throws IOException { - // register a scan event listener to validate pushdown - Listeners.register( - event -> { - scanEventCount += 1; - lastScanEvent = event; - }, - ScanEvent.class); - - File warehouseFile = File.createTempFile("junit", null, temporaryDirectory.toFile()); - assertThat(warehouseFile.delete()).isTrue(); - String warehouse = String.format("file:%s", warehouseFile); - - sql( - "CREATE CATALOG %s WITH ('type'='iceberg', 'catalog-type'='hadoop', 'warehouse'='%s')", - CATALOG_NAME, warehouse); - sql("USE CATALOG %s", CATALOG_NAME); - sql("CREATE DATABASE %s", DATABASE_NAME); - sql("USE %s", DATABASE_NAME); - sql( - "CREATE TABLE %s (id INT, data VARCHAR,d DOUBLE) WITH ('write.format.default'='%s')", - TABLE_NAME, format.name()); - sql( - "INSERT INTO %s VALUES (1,'iceberg',10),(2,'b',20),(3,CAST(NULL AS VARCHAR),30)", - TABLE_NAME); - - this.scanEventCount = 0; - this.lastScanEvent = null; - } - - @AfterEach - public void clean() { - sql("DROP TABLE IF EXISTS %s.%s", DATABASE_NAME, TABLE_NAME); - dropCatalog(CATALOG_NAME, true); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestBoundedTableFactory.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestBoundedTableFactory.java deleted file mode 100644 index bde751e1f87f..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestBoundedTableFactory.java +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.util.List; -import java.util.Objects; -import java.util.stream.Collectors; -import org.apache.flink.types.Row; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.Streams; -import org.junit.jupiter.api.Test; - -public class TestBoundedTableFactory extends ChangeLogTableTestBase { - - @Test - public void testEmptyDataSet() { - List> emptyDataSet = ImmutableList.of(); - - String dataId = BoundedTableFactory.registerDataSet(emptyDataSet); - sql( - "CREATE TABLE %s(id INT, data STRING) WITH ('connector'='BoundedSource', 'data-id'='%s')", - tableName, dataId); - - assertThat(sql("SELECT * FROM %s", tableName)).isEmpty(); - } - - @Test - public void testBoundedTableFactory() { - List> dataSet = - ImmutableList.of( - ImmutableList.of( - insertRow(1, "aaa"), - deleteRow(1, "aaa"), - insertRow(1, "bbb"), - insertRow(2, "aaa"), - deleteRow(2, "aaa"), - insertRow(2, "bbb")), - ImmutableList.of( - updateBeforeRow(2, "bbb"), - updateAfterRow(2, "ccc"), - deleteRow(2, "ccc"), - insertRow(2, "ddd")), - ImmutableList.of( - deleteRow(1, "bbb"), - insertRow(1, "ccc"), - deleteRow(1, "ccc"), - insertRow(1, "ddd"))); - - String dataId = BoundedTableFactory.registerDataSet(dataSet); - sql( - "CREATE TABLE %s(id INT, data STRING) WITH ('connector'='BoundedSource', 'data-id'='%s')", - tableName, dataId); - - List rowSet = dataSet.stream().flatMap(Streams::stream).collect(Collectors.toList()); - assertThat(sql("SELECT * FROM %s", tableName)).isEqualTo(rowSet); - - assertThat(sql("SELECT * FROM %s WHERE data='aaa'", tableName)) - .isEqualTo( - rowSet.stream() - .filter(r -> Objects.equals(r.getField(1), "aaa")) - .collect(Collectors.toList())); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormat.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormat.java deleted file mode 100644 index c8b65e131c33..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormat.java +++ /dev/null @@ -1,211 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. 
See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import static org.apache.iceberg.flink.SimpleDataUtil.SCHEMA; -import static org.apache.iceberg.types.Types.NestedField.required; -import static org.assertj.core.api.Assertions.assertThatThrownBy; -import static org.assertj.core.api.Assumptions.assumeThat; - -import java.io.IOException; -import java.util.Collections; -import java.util.List; -import java.util.Map; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.table.api.DataTypes; -import org.apache.flink.table.api.TableSchema; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.types.Row; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.data.GenericAppenderHelper; -import org.apache.iceberg.data.RandomGenericData; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.flink.TestHelpers; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Types; -import org.junit.jupiter.api.TestTemplate; - -/** Test {@link FlinkInputFormat}. */ -public class TestFlinkInputFormat extends TestFlinkSource { - - @Override - protected List run( - FlinkSource.Builder formatBuilder, - Map sqlOptions, - String sqlFilter, - String... sqlSelectedFields) - throws Exception { - return runFormat(formatBuilder.tableLoader(tableLoader()).buildFormat()); - } - - @TestTemplate - public void testNestedProjection() throws Exception { - Schema schema = - new Schema( - required(1, "data", Types.StringType.get()), - required( - 2, - "nested", - Types.StructType.of( - Types.NestedField.required(3, "f1", Types.StringType.get()), - Types.NestedField.required(4, "f2", Types.StringType.get()), - Types.NestedField.required(5, "f3", Types.LongType.get()))), - required(6, "id", Types.LongType.get())); - - Table table = - CATALOG_EXTENSION.catalog().createTable(TableIdentifier.of("default", "t"), schema); - - List writeRecords = RandomGenericData.generate(schema, 2, 0L); - new GenericAppenderHelper(table, fileFormat, temporaryDirectory).appendToTable(writeRecords); - - // Schema: [data, nested[f1, f2, f3], id] - // Projection: [nested.f2, data] - // The Flink SQL output: [f2, data] - // The FlinkInputFormat output: [nested[f2], data] - - TableSchema projectedSchema = - TableSchema.builder() - .field("nested", DataTypes.ROW(DataTypes.FIELD("f2", DataTypes.STRING()))) - .field("data", DataTypes.STRING()) - .build(); - List result = - runFormat( - FlinkSource.forRowData() - .tableLoader(tableLoader()) - .project(projectedSchema) - .buildFormat()); - - List expected = Lists.newArrayList(); - for (Record record : writeRecords) { - Row nested = Row.of(((Record) record.get(1)).get(1)); - expected.add(Row.of(nested, record.get(0))); - } - - TestHelpers.assertRows(result, expected); - } - - @TestTemplate - public void testBasicProjection() throws IOException { - Schema writeSchema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()), - Types.NestedField.optional(2, "time", Types.TimestampType.withZone())); - - Table table = - 
CATALOG_EXTENSION.catalog().createTable(TableIdentifier.of("default", "t"), writeSchema); - - List writeRecords = RandomGenericData.generate(writeSchema, 2, 0L); - new GenericAppenderHelper(table, fileFormat, temporaryDirectory).appendToTable(writeRecords); - - TableSchema projectedSchema = - TableSchema.builder() - .field("id", DataTypes.BIGINT()) - .field("data", DataTypes.STRING()) - .build(); - List result = - runFormat( - FlinkSource.forRowData() - .tableLoader(tableLoader()) - .project(projectedSchema) - .buildFormat()); - - List expected = Lists.newArrayList(); - for (Record record : writeRecords) { - expected.add(Row.of(record.get(0), record.get(1))); - } - - TestHelpers.assertRows(result, expected); - } - - @TestTemplate - public void testReadPartitionColumn() throws Exception { - assumeThat(fileFormat).as("Temporary skip ORC").isNotEqualTo(FileFormat.ORC); - - Schema nestedSchema = - new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get()), - Types.NestedField.optional( - 2, - "struct", - Types.StructType.of( - Types.NestedField.optional(3, "innerId", Types.LongType.get()), - Types.NestedField.optional(4, "innerName", Types.StringType.get())))); - PartitionSpec spec = - PartitionSpec.builderFor(nestedSchema).identity("struct.innerName").build(); - - Table table = - CATALOG_EXTENSION.catalog().createTable(TestFixtures.TABLE_IDENTIFIER, nestedSchema, spec); - List records = RandomGenericData.generate(nestedSchema, 10, 0L); - GenericAppenderHelper appender = - new GenericAppenderHelper(table, fileFormat, temporaryDirectory); - for (Record record : records) { - org.apache.iceberg.TestHelpers.Row partition = - org.apache.iceberg.TestHelpers.Row.of(record.get(1, Record.class).get(1)); - appender.appendToTable(partition, Collections.singletonList(record)); - } - - TableSchema projectedSchema = - TableSchema.builder() - .field("struct", DataTypes.ROW(DataTypes.FIELD("innerName", DataTypes.STRING()))) - .build(); - List result = - runFormat( - FlinkSource.forRowData() - .tableLoader(tableLoader()) - .project(projectedSchema) - .buildFormat()); - - List expected = Lists.newArrayList(); - for (Record record : records) { - Row nested = Row.of(((Record) record.get(1)).get(1)); - expected.add(Row.of(nested)); - } - - TestHelpers.assertRows(result, expected); - } - - @TestTemplate - public void testValidation() { - CATALOG_EXTENSION.catalog().createTable(TestFixtures.TABLE_IDENTIFIER, SCHEMA); - - assertThatThrownBy( - () -> - FlinkSource.forRowData() - .env(StreamExecutionEnvironment.getExecutionEnvironment()) - .tableLoader(tableLoader()) - .streaming(false) - .endTag("tag") - .endSnapshotId(1L) - .build()) - .hasMessage("END_SNAPSHOT_ID and END_TAG cannot both be set.") - .isInstanceOf(IllegalArgumentException.class); - } - - private List runFormat(FlinkInputFormat inputFormat) throws IOException { - RowType rowType = FlinkSchemaUtil.convert(inputFormat.projectedSchema()); - return TestHelpers.readRows(inputFormat, rowType); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormatReaderDeletes.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormatReaderDeletes.java deleted file mode 100644 index 226da5813ad8..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormatReaderDeletes.java +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import java.io.IOException; -import java.util.Map; -import org.apache.flink.table.types.logical.RowType; -import org.apache.hadoop.hive.conf.HiveConf; -import org.apache.iceberg.CatalogProperties; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.flink.CatalogLoader; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.RowDataWrapper; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.flink.TestHelpers; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.util.StructLikeSet; - -public class TestFlinkInputFormatReaderDeletes extends TestFlinkReaderDeletesBase { - @Override - protected StructLikeSet rowSet(String tableName, Table testTable, String... columns) - throws IOException { - Schema projected = testTable.schema().select(columns); - RowType rowType = FlinkSchemaUtil.convert(projected); - Map properties = Maps.newHashMap(); - properties.put( - CatalogProperties.WAREHOUSE_LOCATION, - hiveConf.get(HiveConf.ConfVars.METASTOREWAREHOUSE.varname)); - properties.put(CatalogProperties.URI, hiveConf.get(HiveConf.ConfVars.METASTOREURIS.varname)); - properties.put( - CatalogProperties.CLIENT_POOL_SIZE, - Integer.toString(hiveConf.getInt("iceberg.hive.client-pool-size", 5))); - CatalogLoader hiveCatalogLoader = CatalogLoader.hive(catalog.name(), hiveConf, properties); - FlinkInputFormat inputFormat = - FlinkSource.forRowData() - .tableLoader( - TableLoader.fromCatalog( - hiveCatalogLoader, TableIdentifier.of("default", tableName))) - .project(FlinkSchemaUtil.toSchema(rowType)) - .buildFormat(); - - StructLikeSet set = StructLikeSet.create(projected.asStruct()); - TestHelpers.readRowData(inputFormat, rowType) - .forEach( - rowData -> { - RowDataWrapper wrapper = new RowDataWrapper(rowType, projected.asStruct()); - set.add(wrapper.wrap(rowData)); - }); - - return set; - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkMergingMetrics.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkMergingMetrics.java deleted file mode 100644 index 5be4a31b4ac8..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkMergingMetrics.java +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import java.io.File; -import java.io.IOException; -import java.util.List; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Table; -import org.apache.iceberg.TestMergingMetrics; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.HadoopCatalogExtension; -import org.apache.iceberg.flink.RowDataConverter; -import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.flink.sink.FlinkAppenderFactory; -import org.apache.iceberg.io.FileAppender; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.junit.jupiter.api.extension.RegisterExtension; - -public class TestFlinkMergingMetrics extends TestMergingMetrics { - - @RegisterExtension - private static final HadoopCatalogExtension CATALOG_EXTENSION = - new HadoopCatalogExtension("test_db", "test_table"); - - @Override - protected FileAppender writeAndGetAppender(List records) throws IOException { - Table table = CATALOG_EXTENSION.catalog().createTable(TestFixtures.TABLE_IDENTIFIER, SCHEMA); - RowType flinkSchema = FlinkSchemaUtil.convert(SCHEMA); - FileAppender appender = - new FlinkAppenderFactory( - table, - SCHEMA, - flinkSchema, - ImmutableMap.of(), - PartitionSpec.unpartitioned(), - null, - null, - null) - .newAppender( - org.apache.iceberg.Files.localOutput(File.createTempFile("junit", null, tempDir)), - fileFormat); - try (FileAppender fileAppender = appender) { - records.stream().map(r -> RowDataConverter.convert(SCHEMA, r)).forEach(fileAppender::add); - } - return appender; - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkMetaDataTable.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkMetaDataTable.java deleted file mode 100644 index f58cc87c6a29..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkMetaDataTable.java +++ /dev/null @@ -1,813 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source; - -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assumptions.assumeThat; - -import java.io.File; -import java.io.IOException; -import java.nio.file.Path; -import java.time.Instant; -import java.util.Comparator; -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.stream.Collectors; -import java.util.stream.Stream; -import org.apache.avro.generic.GenericData; -import org.apache.commons.collections.ListUtils; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.configuration.CoreOptions; -import org.apache.flink.table.api.TableEnvironment; -import org.apache.flink.types.Row; -import org.apache.iceberg.DeleteFile; -import org.apache.iceberg.FileContent; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Files; -import org.apache.iceberg.HasTableOperations; -import org.apache.iceberg.ManifestFile; -import org.apache.iceberg.MetadataTableType; -import org.apache.iceberg.MetadataTableUtils; -import org.apache.iceberg.MetricsUtil; -import org.apache.iceberg.Parameter; -import org.apache.iceberg.Parameters; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Snapshot; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableMetadata; -import org.apache.iceberg.avro.Avro; -import org.apache.iceberg.catalog.Namespace; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.data.FileHelpers; -import org.apache.iceberg.data.GenericRecord; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.expressions.Expressions; -import org.apache.iceberg.flink.CatalogTestBase; -import org.apache.iceberg.flink.TestHelpers; -import org.apache.iceberg.io.CloseableIterable; -import org.apache.iceberg.io.InputFile; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.types.Types; -import org.apache.iceberg.util.SnapshotUtil; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.TestTemplate; -import org.junit.jupiter.api.io.TempDir; - -public class TestFlinkMetaDataTable extends CatalogTestBase { - private static final String TABLE_NAME = "test_table"; - private final FileFormat format = FileFormat.AVRO; - private @TempDir Path temp; - - @Parameter(index = 2) - private Boolean isPartition; - - @Parameters(name = "catalogName={0}, baseNamespace={1}, isPartition={2}") - protected static List parameters() { - List parameters = Lists.newArrayList(); - - for (Boolean isPartition : new Boolean[] {true, false}) { - String catalogName = "testhadoop"; - Namespace baseNamespace = Namespace.of("default"); - parameters.add(new Object[] {catalogName, baseNamespace, isPartition}); - } - return parameters; - } - - @Override - protected TableEnvironment getTableEnv() { - Configuration configuration = super.getTableEnv().getConfig().getConfiguration(); - configuration.set(CoreOptions.DEFAULT_PARALLELISM, 1); - return super.getTableEnv(); - } - - @BeforeEach - public void before() { - super.before(); - sql("USE CATALOG %s", catalogName); - sql("CREATE DATABASE %s", flinkDatabase); - sql("USE %s", DATABASE); - if (isPartition) { - sql( - "CREATE TABLE %s (id INT, data VARCHAR,d DOUBLE) PARTITIONED BY (data) WITH ('format-version'='2', 'write.format.default'='%s')", - TABLE_NAME, format.name()); - sql("INSERT INTO %s VALUES 
(1,'a',10),(2,'a',20)", TABLE_NAME); - sql("INSERT INTO %s VALUES (1,'b',10),(2,'b',20)", TABLE_NAME); - } else { - sql( - "CREATE TABLE %s (id INT, data VARCHAR,d DOUBLE) WITH ('format-version'='2', 'write.format.default'='%s')", - TABLE_NAME, format.name()); - sql( - "INSERT INTO %s VALUES (1,'iceberg',10),(2,'b',20),(3,CAST(NULL AS VARCHAR),30)", - TABLE_NAME); - sql("INSERT INTO %s VALUES (4,'iceberg',10)", TABLE_NAME); - } - } - - @Override - @AfterEach - public void clean() { - sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, TABLE_NAME); - sql("DROP DATABASE IF EXISTS %s", flinkDatabase); - super.clean(); - } - - @TestTemplate - public void testSnapshots() { - String sql = String.format("SELECT * FROM %s$snapshots ", TABLE_NAME); - List result = sql(sql); - - Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME)); - Iterator snapshots = table.snapshots().iterator(); - for (Row row : result) { - Snapshot next = snapshots.next(); - assertThat(((Instant) row.getField(0)).toEpochMilli()) - .as("Should have expected timestamp") - .isEqualTo(next.timestampMillis()); - assertThat(next.snapshotId()) - .as("Should have expected snapshot id") - .isEqualTo(next.snapshotId()); - assertThat(row.getField(2)).as("Should have expected parent id").isEqualTo(next.parentId()); - assertThat(row.getField(3)).as("Should have expected operation").isEqualTo(next.operation()); - assertThat(row.getField(4)) - .as("Should have expected manifest list location") - .isEqualTo(next.manifestListLocation()); - assertThat(row.getField(5)).as("Should have expected summary").isEqualTo(next.summary()); - } - } - - @TestTemplate - public void testHistory() { - String sql = String.format("SELECT * FROM %s$history ", TABLE_NAME); - List result = sql(sql); - - Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME)); - Iterator snapshots = table.snapshots().iterator(); - for (Row row : result) { - Snapshot next = snapshots.next(); - assertThat(((Instant) row.getField(0)).toEpochMilli()) - .as("Should have expected made_current_at") - .isEqualTo(next.timestampMillis()); - assertThat(row.getField(1)) - .as("Should have expected snapshot id") - .isEqualTo(next.snapshotId()); - assertThat(row.getField(2)).as("Should have expected parent id").isEqualTo(next.parentId()); - assertThat(row.getField(3)) - .as("Should have expected is current ancestor") - .isEqualTo( - SnapshotUtil.isAncestorOf( - table, table.currentSnapshot().snapshotId(), next.snapshotId())); - } - } - - @TestTemplate - public void testManifests() { - String sql = String.format("SELECT * FROM %s$manifests ", TABLE_NAME); - List result = sql(sql); - - Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME)); - List expectedDataManifests = dataManifests(table); - - for (int i = 0; i < result.size(); i++) { - Row row = result.get(i); - ManifestFile manifestFile = expectedDataManifests.get(i); - assertThat(row.getField(0)) - .as("Should have expected content") - .isEqualTo(manifestFile.content().id()); - assertThat(row.getField(1)).as("Should have expected path").isEqualTo(manifestFile.path()); - assertThat(row.getField(2)) - .as("Should have expected length") - .isEqualTo(manifestFile.length()); - assertThat(row.getField(3)) - .as("Should have expected partition_spec_id") - .isEqualTo(manifestFile.partitionSpecId()); - assertThat(row.getField(4)) - .as("Should have expected added_snapshot_id") - .isEqualTo(manifestFile.snapshotId()); - 
assertThat(row.getField(5)) - .as("Should have expected added_data_files_count") - .isEqualTo(manifestFile.addedFilesCount()); - assertThat(row.getField(6)) - .as("Should have expected existing_data_files_count") - .isEqualTo(manifestFile.existingFilesCount()); - assertThat(row.getField(7)) - .as("Should have expected deleted_data_files_count") - .isEqualTo(manifestFile.deletedFilesCount()); - } - } - - @TestTemplate - public void testAllManifests() { - Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME)); - - String sql = String.format("SELECT * FROM %s$all_manifests ", TABLE_NAME); - List result = sql(sql); - - List expectedDataManifests = allDataManifests(table); - - assertThat(expectedDataManifests).hasSize(result.size()); - for (int i = 0; i < result.size(); i++) { - Row row = result.get(i); - ManifestFile manifestFile = expectedDataManifests.get(i); - assertThat(row.getField(0)) - .as("Should have expected content") - .isEqualTo(manifestFile.content().id()); - assertThat(row.getField(1)).as("Should have expected path").isEqualTo(manifestFile.path()); - assertThat(row.getField(2)) - .as("Should have expected length") - .isEqualTo(manifestFile.length()); - assertThat(row.getField(3)) - .as("Should have expected partition_spec_id") - .isEqualTo(manifestFile.partitionSpecId()); - assertThat(row.getField(4)) - .as("Should have expected added_snapshot_id") - .isEqualTo(manifestFile.snapshotId()); - assertThat(row.getField(5)) - .as("Should have expected added_data_files_count") - .isEqualTo(manifestFile.addedFilesCount()); - assertThat(row.getField(6)) - .as("Should have expected existing_data_files_count") - .isEqualTo(manifestFile.existingFilesCount()); - assertThat(row.getField(7)) - .as("Should have expected deleted_data_files_count") - .isEqualTo(manifestFile.deletedFilesCount()); - } - } - - @TestTemplate - public void testUnPartitionedTable() throws IOException { - assumeThat(isPartition).isFalse(); - Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME)); - - Schema deleteRowSchema = table.schema().select("id"); - Record dataDelete = GenericRecord.create(deleteRowSchema); - List dataDeletes = Lists.newArrayList(dataDelete.copy("id", 1)); - File testFile = File.createTempFile("junit", null, temp.toFile()); - DeleteFile eqDeletes = - FileHelpers.writeDeleteFile( - table, Files.localOutput(testFile), dataDeletes, deleteRowSchema); - table.newRowDelta().addDeletes(eqDeletes).commit(); - - List expectedDataManifests = dataManifests(table); - List expectedDeleteManifests = deleteManifests(table); - - assertThat(expectedDataManifests).hasSize(2); - assertThat(expectedDeleteManifests).hasSize(1); - - Schema entriesTableSchema = - MetadataTableUtils.createMetadataTableInstance(table, MetadataTableType.from("entries")) - .schema(); - - // check delete files table - Schema deleteFilesTableSchema = - MetadataTableUtils.createMetadataTableInstance( - table, MetadataTableType.from("delete_files")) - .schema(); - - List deleteColumns = - deleteFilesTableSchema.columns().stream() - .map(Types.NestedField::name) - .filter(c -> !c.equals(MetricsUtil.READABLE_METRICS)) - .collect(Collectors.toList()); - String deleteNames = - deleteColumns.stream().map(n -> "`" + n + "`").collect(Collectors.joining(",")); - - deleteFilesTableSchema = deleteFilesTableSchema.select(deleteColumns); - - List actualDeleteFiles = sql("SELECT %s FROM %s$delete_files", deleteNames, TABLE_NAME); - assertThat(actualDeleteFiles).hasSize(1); - 
assertThat(expectedDeleteManifests).as("Should have 1 delete manifest").hasSize(1); - - List expectedDeleteFiles = - expectedEntries( - table, FileContent.EQUALITY_DELETES, entriesTableSchema, expectedDeleteManifests, null); - assertThat(expectedDeleteFiles).as("Should be 1 delete file manifest entry").hasSize(1); - TestHelpers.assertEquals( - deleteFilesTableSchema, expectedDeleteFiles.get(0), actualDeleteFiles.get(0)); - - // Check data files table - Schema filesTableSchema = - MetadataTableUtils.createMetadataTableInstance(table, MetadataTableType.from("files")) - .schema(); - - List columns = - filesTableSchema.columns().stream() - .map(Types.NestedField::name) - .filter(c -> !c.equals(MetricsUtil.READABLE_METRICS)) - .collect(Collectors.toList()); - String names = columns.stream().map(n -> "`" + n + "`").collect(Collectors.joining(",")); - - filesTableSchema = filesTableSchema.select(columns); - - List actualDataFiles = sql("SELECT %s FROM %s$data_files", names, TABLE_NAME); - assertThat(actualDataFiles).as("Metadata table should return 2 data file").hasSize(2); - List expectedDataFiles = - expectedEntries(table, FileContent.DATA, entriesTableSchema, expectedDataManifests, null); - assertThat(expectedDataFiles).as("Should be 2 data file manifest entry").hasSize(2); - TestHelpers.assertEquals(filesTableSchema, expectedDataFiles.get(0), actualDataFiles.get(0)); - - // check all files table - List actualFiles = sql("SELECT %s FROM %s$files ORDER BY content", names, TABLE_NAME); - assertThat(actualFiles).as("Metadata table should return 3 files").hasSize(3); - List expectedFiles = - Stream.concat(expectedDataFiles.stream(), expectedDeleteFiles.stream()) - .collect(Collectors.toList()); - assertThat(expectedFiles).as("Should have 3 files manifest entriess").hasSize(3); - TestHelpers.assertEquals(filesTableSchema, expectedFiles.get(0), actualFiles.get(0)); - TestHelpers.assertEquals(filesTableSchema, expectedFiles.get(1), actualFiles.get(1)); - } - - @TestTemplate - public void testPartitionedTable() throws Exception { - assumeThat(isPartition).isTrue(); - Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME)); - - Schema deleteRowSchema = table.schema().select("id", "data"); - Record dataDelete = GenericRecord.create(deleteRowSchema); - - Map deleteRow = Maps.newHashMap(); - deleteRow.put("id", 1); - deleteRow.put("data", "a"); - File testFile = File.createTempFile("junit", null, temp.toFile()); - DeleteFile eqDeletes = - FileHelpers.writeDeleteFile( - table, - Files.localOutput(testFile), - org.apache.iceberg.TestHelpers.Row.of("a"), - Lists.newArrayList(dataDelete.copy(deleteRow)), - deleteRowSchema); - table.newRowDelta().addDeletes(eqDeletes).commit(); - - deleteRow.put("data", "b"); - File testFile2 = File.createTempFile("junit", null, temp.toFile()); - DeleteFile eqDeletes2 = - FileHelpers.writeDeleteFile( - table, - Files.localOutput(testFile2), - org.apache.iceberg.TestHelpers.Row.of("b"), - Lists.newArrayList(dataDelete.copy(deleteRow)), - deleteRowSchema); - table.newRowDelta().addDeletes(eqDeletes2).commit(); - - Schema entriesTableSchema = - MetadataTableUtils.createMetadataTableInstance(table, MetadataTableType.from("entries")) - .schema(); - - List expectedDataManifests = dataManifests(table); - List expectedDeleteManifests = deleteManifests(table); - - assertThat(expectedDataManifests).hasSize(2); - assertThat(expectedDeleteManifests).hasSize(2); - Table deleteFilesTable = - MetadataTableUtils.createMetadataTableInstance( - table, 
MetadataTableType.from("delete_files")); - Schema filesTableSchema = deleteFilesTable.schema(); - - List columns = - filesTableSchema.columns().stream() - .map(Types.NestedField::name) - .filter(c -> !c.equals(MetricsUtil.READABLE_METRICS)) - .collect(Collectors.toList()); - String names = columns.stream().map(n -> "`" + n + "`").collect(Collectors.joining(",")); - - filesTableSchema = filesTableSchema.select(columns); - - // Check delete files table - List expectedDeleteFiles = - expectedEntries( - table, FileContent.EQUALITY_DELETES, entriesTableSchema, expectedDeleteManifests, "a"); - assertThat(expectedDeleteFiles).hasSize(1); - List actualDeleteFiles = - sql("SELECT %s FROM %s$delete_files WHERE `partition`.`data`='a'", names, TABLE_NAME); - - assertThat(actualDeleteFiles).hasSize(1); - TestHelpers.assertEquals( - filesTableSchema, expectedDeleteFiles.get(0), actualDeleteFiles.get(0)); - - // Check data files table - List expectedDataFiles = - expectedEntries(table, FileContent.DATA, entriesTableSchema, expectedDataManifests, "a"); - assertThat(expectedDataFiles).hasSize(1); - List actualDataFiles = - sql("SELECT %s FROM %s$data_files WHERE `partition`.`data`='a'", names, TABLE_NAME); - assertThat(actualDataFiles).hasSize(1); - TestHelpers.assertEquals(filesTableSchema, expectedDataFiles.get(0), actualDataFiles.get(0)); - - List actualPartitionsWithProjection = - sql("SELECT file_count FROM %s$partitions ", TABLE_NAME); - assertThat(actualPartitionsWithProjection).hasSize(2); - for (int i = 0; i < 2; ++i) { - assertThat(actualPartitionsWithProjection.get(i).getField(0)).isEqualTo(1); - } - - // Check files table - List expectedFiles = - Stream.concat(expectedDataFiles.stream(), expectedDeleteFiles.stream()) - .collect(Collectors.toList()); - assertThat(expectedFiles).hasSize(2); - List actualFiles = - sql( - "SELECT %s FROM %s$files WHERE `partition`.`data`='a' ORDER BY content", - names, TABLE_NAME); - assertThat(actualFiles).hasSize(2); - TestHelpers.assertEquals(filesTableSchema, expectedFiles.get(0), actualFiles.get(0)); - TestHelpers.assertEquals(filesTableSchema, expectedFiles.get(1), actualFiles.get(1)); - } - - @TestTemplate - public void testAllFilesUnpartitioned() throws Exception { - assumeThat(isPartition).isFalse(); - Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME)); - - Schema deleteRowSchema = table.schema().select("id", "data"); - Record dataDelete = GenericRecord.create(deleteRowSchema); - - Map deleteRow = Maps.newHashMap(); - deleteRow.put("id", 1); - File testFile = File.createTempFile("junit", null, temp.toFile()); - DeleteFile eqDeletes = - FileHelpers.writeDeleteFile( - table, - Files.localOutput(testFile), - Lists.newArrayList(dataDelete.copy(deleteRow)), - deleteRowSchema); - table.newRowDelta().addDeletes(eqDeletes).commit(); - - List expectedDataManifests = dataManifests(table); - assertThat(expectedDataManifests).hasSize(2); - List expectedDeleteManifests = deleteManifests(table); - assertThat(expectedDeleteManifests).hasSize(1); - - // Clear table to test whether 'all_files' can read past files - table.newDelete().deleteFromRowFilter(Expressions.alwaysTrue()).commit(); - - Schema entriesTableSchema = - MetadataTableUtils.createMetadataTableInstance(table, MetadataTableType.from("entries")) - .schema(); - Schema filesTableSchema = - MetadataTableUtils.createMetadataTableInstance( - table, MetadataTableType.from("all_data_files")) - .schema(); - - List columns = - filesTableSchema.columns().stream() - 
.map(Types.NestedField::name) - .filter(c -> !c.equals(MetricsUtil.READABLE_METRICS)) - .collect(Collectors.toList()); - String names = columns.stream().map(n -> "`" + n + "`").collect(Collectors.joining(",")); - - filesTableSchema = filesTableSchema.select(columns); - - // Check all data files table - List actualDataFiles = - sql("SELECT %s FROM %s$all_data_files order by record_count ", names, TABLE_NAME); - - List expectedDataFiles = - expectedEntries(table, FileContent.DATA, entriesTableSchema, expectedDataManifests, null); - assertThat(expectedDataFiles).hasSize(2); - assertThat(actualDataFiles).hasSize(2); - TestHelpers.assertEquals(filesTableSchema, expectedDataFiles, actualDataFiles); - - // Check all delete files table - List actualDeleteFiles = sql("SELECT %s FROM %s$all_delete_files", names, TABLE_NAME); - List expectedDeleteFiles = - expectedEntries( - table, FileContent.EQUALITY_DELETES, entriesTableSchema, expectedDeleteManifests, null); - assertThat(expectedDeleteFiles).hasSize(1); - assertThat(actualDeleteFiles).hasSize(1); - TestHelpers.assertEquals( - filesTableSchema, expectedDeleteFiles.get(0), actualDeleteFiles.get(0)); - - // Check all files table - List actualFiles = - sql("SELECT %s FROM %s$all_files ORDER BY content, record_count asc", names, TABLE_NAME); - List expectedFiles = - ListUtils.union(expectedDataFiles, expectedDeleteFiles); - expectedFiles.sort(Comparator.comparing(r -> ((Integer) r.get("content")))); - assertThat(actualFiles).hasSize(3); - TestHelpers.assertEquals(filesTableSchema, expectedFiles, actualFiles); - } - - @TestTemplate - public void testAllFilesPartitioned() throws Exception { - assumeThat(!isPartition).isFalse(); - Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME)); - - // Create delete file - Schema deleteRowSchema = table.schema().select("id"); - Record dataDelete = GenericRecord.create(deleteRowSchema); - - Map deleteRow = Maps.newHashMap(); - deleteRow.put("id", 1); - File testFile = File.createTempFile("junit", null, temp.toFile()); - DeleteFile eqDeletes = - FileHelpers.writeDeleteFile( - table, - Files.localOutput(testFile), - org.apache.iceberg.TestHelpers.Row.of("a"), - Lists.newArrayList(dataDelete.copy(deleteRow)), - deleteRowSchema); - File testFile2 = File.createTempFile("junit", null, temp.toFile()); - DeleteFile eqDeletes2 = - FileHelpers.writeDeleteFile( - table, - Files.localOutput(testFile2), - org.apache.iceberg.TestHelpers.Row.of("b"), - Lists.newArrayList(dataDelete.copy(deleteRow)), - deleteRowSchema); - table.newRowDelta().addDeletes(eqDeletes).addDeletes(eqDeletes2).commit(); - - List expectedDataManifests = dataManifests(table); - assertThat(expectedDataManifests).hasSize(2); - List expectedDeleteManifests = deleteManifests(table); - assertThat(expectedDeleteManifests).hasSize(1); - // Clear table to test whether 'all_files' can read past files - table.newDelete().deleteFromRowFilter(Expressions.alwaysTrue()).commit(); - - Schema entriesTableSchema = - MetadataTableUtils.createMetadataTableInstance(table, MetadataTableType.from("entries")) - .schema(); - Schema filesTableSchema = - MetadataTableUtils.createMetadataTableInstance( - table, MetadataTableType.from("all_data_files")) - .schema(); - - List columns = - filesTableSchema.columns().stream() - .map(Types.NestedField::name) - .filter(c -> !c.equals(MetricsUtil.READABLE_METRICS)) - .collect(Collectors.toList()); - String names = columns.stream().map(n -> "`" + n + "`").collect(Collectors.joining(",")); - - 
filesTableSchema = filesTableSchema.select(columns); - - // Check all data files table - List actualDataFiles = - sql("SELECT %s FROM %s$all_data_files WHERE `partition`.`data`='a'", names, TABLE_NAME); - List expectedDataFiles = - expectedEntries(table, FileContent.DATA, entriesTableSchema, expectedDataManifests, "a"); - assertThat(expectedDataFiles).hasSize(1); - assertThat(actualDataFiles).hasSize(1); - TestHelpers.assertEquals(filesTableSchema, expectedDataFiles.get(0), actualDataFiles.get(0)); - - // Check all delete files table - List actualDeleteFiles = - sql("SELECT %s FROM %s$all_delete_files WHERE `partition`.`data`='a'", names, TABLE_NAME); - List expectedDeleteFiles = - expectedEntries( - table, FileContent.EQUALITY_DELETES, entriesTableSchema, expectedDeleteManifests, "a"); - assertThat(expectedDeleteFiles).hasSize(1); - assertThat(actualDeleteFiles).hasSize(1); - TestHelpers.assertEquals( - filesTableSchema, expectedDeleteFiles.get(0), actualDeleteFiles.get(0)); - - // Check all files table - List actualFiles = - sql( - "SELECT %s FROM %s$all_files WHERE `partition`.`data`='a' ORDER BY content", - names, TABLE_NAME); - List expectedFiles = - ListUtils.union(expectedDataFiles, expectedDeleteFiles); - expectedFiles.sort(Comparator.comparing(r -> ((Integer) r.get("content")))); - assertThat(actualFiles).hasSize(2); - TestHelpers.assertEquals(filesTableSchema, expectedFiles, actualFiles); - } - - @TestTemplate - public void testMetadataLogEntries() { - Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME)); - - Long currentSnapshotId = table.currentSnapshot().snapshotId(); - TableMetadata tableMetadata = ((HasTableOperations) table).operations().current(); - Snapshot currentSnapshot = tableMetadata.currentSnapshot(); - Snapshot parentSnapshot = table.snapshot(currentSnapshot.parentId()); - List metadataLogEntries = - Lists.newArrayList(tableMetadata.previousFiles()); - - // Check metadataLog table - List metadataLogs = sql("SELECT * FROM %s$metadata_log_entries", TABLE_NAME); - - assertThat(metadataLogs).hasSize(3); - Row metadataLog = metadataLogs.get(0); - assertThat(metadataLog.getField("timestamp")) - .isEqualTo(Instant.ofEpochMilli(metadataLogEntries.get(0).timestampMillis())); - assertThat(metadataLog.getField("file")).isEqualTo(metadataLogEntries.get(0).file()); - assertThat(metadataLog.getField("latest_snapshot_id")).isNull(); - assertThat(metadataLog.getField("latest_schema_id")).isNull(); - assertThat(metadataLog.getField("latest_sequence_number")).isNull(); - - metadataLog = metadataLogs.get(1); - assertThat(metadataLog.getField("timestamp")) - .isEqualTo(Instant.ofEpochMilli(metadataLogEntries.get(1).timestampMillis())); - assertThat(metadataLog.getField("file")).isEqualTo(metadataLogEntries.get(1).file()); - assertThat(metadataLog.getField("latest_snapshot_id")).isEqualTo(parentSnapshot.snapshotId()); - assertThat(metadataLog.getField("latest_schema_id")).isEqualTo(parentSnapshot.schemaId()); - assertThat(metadataLog.getField("latest_sequence_number")) - .isEqualTo(parentSnapshot.sequenceNumber()); - assertThat(metadataLog.getField("latest_snapshot_id")).isEqualTo(parentSnapshot.snapshotId()); - - metadataLog = metadataLogs.get(2); - assertThat(metadataLog.getField("timestamp")) - .isEqualTo(Instant.ofEpochMilli(currentSnapshot.timestampMillis())); - assertThat(metadataLog.getField("file")).isEqualTo(tableMetadata.metadataFileLocation()); - 
assertThat(metadataLog.getField("latest_snapshot_id")).isEqualTo(currentSnapshot.snapshotId()); - assertThat(metadataLog.getField("latest_schema_id")).isEqualTo(currentSnapshot.schemaId()); - assertThat(metadataLog.getField("latest_sequence_number")) - .isEqualTo(currentSnapshot.sequenceNumber()); - - // test filtering - List metadataLogWithFilters = - sql( - "SELECT * FROM %s$metadata_log_entries WHERE latest_snapshot_id = %s", - TABLE_NAME, currentSnapshotId); - assertThat(metadataLogWithFilters).hasSize(1); - metadataLog = metadataLogWithFilters.get(0); - assertThat(Instant.ofEpochMilli(tableMetadata.currentSnapshot().timestampMillis())) - .isEqualTo(metadataLog.getField("timestamp")); - - assertThat(metadataLog.getField("file")).isEqualTo(tableMetadata.metadataFileLocation()); - assertThat(metadataLog.getField("latest_snapshot_id")) - .isEqualTo(tableMetadata.currentSnapshot().snapshotId()); - assertThat(metadataLog.getField("latest_schema_id")) - .isEqualTo(tableMetadata.currentSnapshot().schemaId()); - assertThat(metadataLog.getField("latest_sequence_number")) - .isEqualTo(tableMetadata.currentSnapshot().sequenceNumber()); - - // test projection - List metadataFiles = - metadataLogEntries.stream() - .map(TableMetadata.MetadataLogEntry::file) - .collect(Collectors.toList()); - metadataFiles.add(tableMetadata.metadataFileLocation()); - List metadataLogWithProjection = - sql("SELECT file FROM %s$metadata_log_entries", TABLE_NAME); - assertThat(metadataLogWithProjection).hasSize(3); - for (int i = 0; i < metadataFiles.size(); i++) { - assertThat(metadataLogWithProjection.get(i).getField("file")).isEqualTo(metadataFiles.get(i)); - } - } - - @TestTemplate - public void testSnapshotReferencesMetatable() { - Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME)); - - Long currentSnapshotId = table.currentSnapshot().snapshotId(); - - // Create branch - table - .manageSnapshots() - .createBranch("testBranch", currentSnapshotId) - .setMaxRefAgeMs("testBranch", 10) - .setMinSnapshotsToKeep("testBranch", 20) - .setMaxSnapshotAgeMs("testBranch", 30) - .commit(); - // Create Tag - table - .manageSnapshots() - .createTag("testTag", currentSnapshotId) - .setMaxRefAgeMs("testTag", 50) - .commit(); - // Check refs table - List references = sql("SELECT * FROM %s$refs", TABLE_NAME); - List branches = sql("SELECT * FROM %s$refs WHERE type='BRANCH'", TABLE_NAME); - assertThat(references).hasSize(3); - assertThat(branches).hasSize(2); - List tags = sql("SELECT * FROM %s$refs WHERE type='TAG'", TABLE_NAME); - assertThat(tags).hasSize(1); - // Check branch entries in refs table - List mainBranch = - sql("SELECT * FROM %s$refs WHERE name='main' AND type='BRANCH'", TABLE_NAME); - assertThat((String) mainBranch.get(0).getFieldAs("name")).isEqualTo("main"); - assertThat((String) mainBranch.get(0).getFieldAs("type")).isEqualTo("BRANCH"); - assertThat((Long) mainBranch.get(0).getFieldAs("snapshot_id")).isEqualTo(currentSnapshotId); - List testBranch = - sql("SELECT * FROM %s$refs WHERE name='testBranch' AND type='BRANCH'", TABLE_NAME); - assertThat((String) testBranch.get(0).getFieldAs("name")).isEqualTo("testBranch"); - assertThat((String) testBranch.get(0).getFieldAs("type")).isEqualTo("BRANCH"); - assertThat((Long) testBranch.get(0).getFieldAs("snapshot_id")).isEqualTo(currentSnapshotId); - assertThat((Long) testBranch.get(0).getFieldAs("max_reference_age_in_ms")) - .isEqualTo(Long.valueOf(10)); - assertThat((Integer) testBranch.get(0).getFieldAs("min_snapshots_to_keep")) - 
.isEqualTo(Integer.valueOf(20)); - assertThat((Long) testBranch.get(0).getFieldAs("max_snapshot_age_in_ms")) - .isEqualTo(Long.valueOf(30)); - - // Check tag entries in refs table - List testTag = - sql("SELECT * FROM %s$refs WHERE name='testTag' AND type='TAG'", TABLE_NAME); - assertThat((String) testTag.get(0).getFieldAs("name")).isEqualTo("testTag"); - assertThat((String) testTag.get(0).getFieldAs("type")).isEqualTo("TAG"); - assertThat((Long) testTag.get(0).getFieldAs("snapshot_id")).isEqualTo(currentSnapshotId); - assertThat((Long) testTag.get(0).getFieldAs("max_reference_age_in_ms")) - .isEqualTo(Long.valueOf(50)); - // Check projection in refs table - List testTagProjection = - sql( - "SELECT name,type,snapshot_id,max_reference_age_in_ms,min_snapshots_to_keep FROM %s$refs where type='TAG'", - TABLE_NAME); - assertThat((String) testTagProjection.get(0).getFieldAs("name")).isEqualTo("testTag"); - assertThat((String) testTagProjection.get(0).getFieldAs("type")).isEqualTo("TAG"); - assertThat((Long) testTagProjection.get(0).getFieldAs("snapshot_id")) - .isEqualTo(currentSnapshotId); - assertThat((Long) testTagProjection.get(0).getFieldAs("max_reference_age_in_ms")) - .isEqualTo(Long.valueOf(50)); - assertThat((String) testTagProjection.get(0).getFieldAs("min_snapshots_to_keep")).isNull(); - List mainBranchProjection = - sql("SELECT name, type FROM %s$refs WHERE name='main' AND type = 'BRANCH'", TABLE_NAME); - assertThat((String) mainBranchProjection.get(0).getFieldAs("name")).isEqualTo("main"); - assertThat((String) mainBranchProjection.get(0).getFieldAs("type")).isEqualTo("BRANCH"); - List testBranchProjection = - sql( - "SELECT type, name, max_reference_age_in_ms, snapshot_id FROM %s$refs WHERE name='testBranch' AND type = 'BRANCH'", - TABLE_NAME); - assertThat((String) testBranchProjection.get(0).getFieldAs("name")).isEqualTo("testBranch"); - assertThat((String) testBranchProjection.get(0).getFieldAs("type")).isEqualTo("BRANCH"); - assertThat((Long) testBranchProjection.get(0).getFieldAs("snapshot_id")) - .isEqualTo(currentSnapshotId); - assertThat((Long) testBranchProjection.get(0).getFieldAs("max_reference_age_in_ms")) - .isEqualTo(Long.valueOf(10)); - } - - /** - * Find matching manifest entries of an Iceberg table - * - * @param table iceberg table - * @param expectedContent file content to populate on entries - * @param entriesTableSchema schema of Manifest entries - * @param manifestsToExplore manifests to explore of the table - * @param partValue partition value that manifest entries must match, or null to skip filtering - */ - private List expectedEntries( - Table table, - FileContent expectedContent, - Schema entriesTableSchema, - List manifestsToExplore, - String partValue) - throws IOException { - List expected = Lists.newArrayList(); - for (ManifestFile manifest : manifestsToExplore) { - InputFile in = table.io().newInputFile(manifest.path()); - try (CloseableIterable rows = - Avro.read(in).project(entriesTableSchema).build()) { - for (GenericData.Record record : rows) { - if ((Integer) record.get("status") < 2 /* added or existing */) { - GenericData.Record file = (GenericData.Record) record.get("data_file"); - if (partitionMatch(file, partValue)) { - asMetadataRecord(file, expectedContent); - expected.add(file); - } - } - } - } - } - return expected; - } - - // Populate certain fields derived in the metadata tables - private void asMetadataRecord(GenericData.Record file, FileContent content) { - file.put(0, content.id()); - file.put(3, 0); // specId - } - - private 
boolean partitionMatch(GenericData.Record file, String partValue) { - if (partValue == null) { - return true; - } - GenericData.Record partition = (GenericData.Record) file.get(4); - return partValue.equals(partition.get(0).toString()); - } - - private List dataManifests(Table table) { - return table.currentSnapshot().dataManifests(table.io()); - } - - private List allDataManifests(Table table) { - List manifests = Lists.newArrayList(); - for (Snapshot snapshot : table.snapshots()) { - manifests.addAll(snapshot.dataManifests(table.io())); - } - return manifests; - } - - private List deleteManifests(Table table) { - return table.currentSnapshot().deleteManifests(table.io()); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkReaderDeletesBase.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkReaderDeletesBase.java deleted file mode 100644 index 0b5a8011ad3f..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkReaderDeletesBase.java +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source; - -import java.util.Map; -import org.apache.hadoop.hive.conf.HiveConf; -import org.apache.iceberg.BaseTable; -import org.apache.iceberg.CatalogUtil; -import org.apache.iceberg.ParameterizedTestExtension; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableMetadata; -import org.apache.iceberg.TableOperations; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.data.DeleteReadTests; -import org.apache.iceberg.hive.HiveCatalog; -import org.apache.iceberg.hive.TestHiveMetastore; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.junit.jupiter.api.AfterAll; -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.extension.ExtendWith; - -@ExtendWith(ParameterizedTestExtension.class) -public abstract class TestFlinkReaderDeletesBase extends DeleteReadTests { - - protected static String databaseName = "default"; - - protected static HiveConf hiveConf = null; - protected static HiveCatalog catalog = null; - private static TestHiveMetastore metastore = null; - - @BeforeAll - public static void startMetastore() { - metastore = new TestHiveMetastore(); - metastore.start(); - hiveConf = metastore.hiveConf(); - catalog = - (HiveCatalog) - CatalogUtil.loadCatalog( - HiveCatalog.class.getName(), "hive", ImmutableMap.of(), hiveConf); - } - - @AfterAll - public static void stopMetastore() throws Exception { - metastore.stop(); - catalog = null; - } - - @Override - protected Table createTable(String name, Schema schema, PartitionSpec spec) { - Map props = Maps.newHashMap(); - props.put(TableProperties.DEFAULT_FILE_FORMAT, format.name()); - - Table table = catalog.createTable(TableIdentifier.of(databaseName, name), schema, spec, props); - TableOperations ops = ((BaseTable) table).operations(); - TableMetadata meta = ops.current(); - ops.commit(meta, meta.upgradeToFormatVersion(2)); - - return table; - } - - @Override - protected void dropTable(String name) { - catalog.dropTable(TableIdentifier.of(databaseName, name)); - } - - @Override - protected boolean expectPruned() { - return false; - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScan.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScan.java deleted file mode 100644 index cf6b233dcec6..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScan.java +++ /dev/null @@ -1,540 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source; - -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; - -import java.nio.file.Path; -import java.time.LocalDate; -import java.time.LocalDateTime; -import java.time.LocalTime; -import java.util.Arrays; -import java.util.Collection; -import java.util.Collections; -import java.util.List; -import java.util.Map; -import org.apache.flink.test.junit5.MiniClusterExtension; -import org.apache.flink.types.Row; -import org.apache.iceberg.AppendFiles; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Parameter; -import org.apache.iceberg.ParameterizedTestExtension; -import org.apache.iceberg.Parameters; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.data.GenericAppenderHelper; -import org.apache.iceberg.data.RandomGenericData; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.expressions.Expression; -import org.apache.iceberg.expressions.Expressions; -import org.apache.iceberg.flink.HadoopCatalogExtension; -import org.apache.iceberg.flink.MiniFlinkClusterExtension; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.flink.TestHelpers; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Types; -import org.apache.iceberg.util.DateTimeUtil; -import org.junit.jupiter.api.TestTemplate; -import org.junit.jupiter.api.extension.ExtendWith; -import org.junit.jupiter.api.extension.RegisterExtension; -import org.junit.jupiter.api.io.TempDir; - -@ExtendWith(ParameterizedTestExtension.class) -public abstract class TestFlinkScan { - @RegisterExtension - protected static MiniClusterExtension miniClusterExtension = - MiniFlinkClusterExtension.createWithClassloaderCheckDisabled(); - - @TempDir protected Path temporaryDirectory; - - @RegisterExtension - protected static final HadoopCatalogExtension CATALOG_EXTENSION = - new HadoopCatalogExtension(TestFixtures.DATABASE, TestFixtures.TABLE); - - @Parameter protected FileFormat fileFormat; - - @Parameters(name = "format={0}") - public static Collection fileFormat() { - return Arrays.asList(FileFormat.AVRO, FileFormat.PARQUET, FileFormat.ORC); - } - - protected TableLoader tableLoader() { - return CATALOG_EXTENSION.tableLoader(); - } - - protected abstract List runWithProjection(String... 
projected) throws Exception; - - protected abstract List runWithFilter( - Expression filter, String sqlFilter, boolean caseSensitive) throws Exception; - - protected List runWithFilter(Expression filter, String sqlFilter) throws Exception { - return runWithFilter(filter, sqlFilter, true); - } - - protected abstract List runWithOptions(Map options) throws Exception; - - protected abstract List run() throws Exception; - - @TestTemplate - public void testUnpartitionedTable() throws Exception { - Table table = - CATALOG_EXTENSION.catalog().createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA); - List expectedRecords = RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L); - new GenericAppenderHelper(table, fileFormat, temporaryDirectory).appendToTable(expectedRecords); - TestHelpers.assertRecords(run(), expectedRecords, TestFixtures.SCHEMA); - } - - @TestTemplate - public void testPartitionedTable() throws Exception { - Table table = - CATALOG_EXTENSION - .catalog() - .createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA, TestFixtures.SPEC); - List expectedRecords = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 0L); - expectedRecords.get(0).set(2, "2020-03-20"); - new GenericAppenderHelper(table, fileFormat, temporaryDirectory) - .appendToTable(org.apache.iceberg.TestHelpers.Row.of("2020-03-20", 0), expectedRecords); - TestHelpers.assertRecords(run(), expectedRecords, TestFixtures.SCHEMA); - } - - @TestTemplate - public void testProjection() throws Exception { - Table table = - CATALOG_EXTENSION - .catalog() - .createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA, TestFixtures.SPEC); - List inputRecords = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 0L); - new GenericAppenderHelper(table, fileFormat, temporaryDirectory) - .appendToTable(org.apache.iceberg.TestHelpers.Row.of("2020-03-20", 0), inputRecords); - assertRows(runWithProjection("data"), Row.of(inputRecords.get(0).get(0))); - } - - @TestTemplate - public void testIdentityPartitionProjections() throws Exception { - Schema logSchema = - new Schema( - Types.NestedField.optional(1, "id", Types.IntegerType.get()), - Types.NestedField.optional(2, "dt", Types.StringType.get()), - Types.NestedField.optional(3, "level", Types.StringType.get()), - Types.NestedField.optional(4, "message", Types.StringType.get())); - PartitionSpec spec = - PartitionSpec.builderFor(logSchema).identity("dt").identity("level").build(); - - Table table = - CATALOG_EXTENSION.catalog().createTable(TestFixtures.TABLE_IDENTIFIER, logSchema, spec); - List inputRecords = RandomGenericData.generate(logSchema, 10, 0L); - - int idx = 0; - AppendFiles append = table.newAppend(); - for (Record record : inputRecords) { - record.set(1, "2020-03-2" + idx); - record.set(2, Integer.toString(idx)); - append.appendFile( - new GenericAppenderHelper(table, fileFormat, temporaryDirectory) - .writeFile( - org.apache.iceberg.TestHelpers.Row.of("2020-03-2" + idx, Integer.toString(idx)), - ImmutableList.of(record))); - idx += 1; - } - append.commit(); - - // individual fields - validateIdentityPartitionProjections(table, Collections.singletonList("dt"), inputRecords); - validateIdentityPartitionProjections(table, Collections.singletonList("level"), inputRecords); - validateIdentityPartitionProjections(table, Collections.singletonList("message"), inputRecords); - validateIdentityPartitionProjections(table, Collections.singletonList("id"), inputRecords); - // field pairs - validateIdentityPartitionProjections(table, Arrays.asList("dt", "message"), 
inputRecords); - validateIdentityPartitionProjections(table, Arrays.asList("level", "message"), inputRecords); - validateIdentityPartitionProjections(table, Arrays.asList("dt", "level"), inputRecords); - // out-of-order pairs - validateIdentityPartitionProjections(table, Arrays.asList("message", "dt"), inputRecords); - validateIdentityPartitionProjections(table, Arrays.asList("message", "level"), inputRecords); - validateIdentityPartitionProjections(table, Arrays.asList("level", "dt"), inputRecords); - // out-of-order triplets - validateIdentityPartitionProjections( - table, Arrays.asList("dt", "level", "message"), inputRecords); - validateIdentityPartitionProjections( - table, Arrays.asList("level", "dt", "message"), inputRecords); - validateIdentityPartitionProjections( - table, Arrays.asList("dt", "message", "level"), inputRecords); - validateIdentityPartitionProjections( - table, Arrays.asList("level", "message", "dt"), inputRecords); - validateIdentityPartitionProjections( - table, Arrays.asList("message", "dt", "level"), inputRecords); - validateIdentityPartitionProjections( - table, Arrays.asList("message", "level", "dt"), inputRecords); - } - - private void validateIdentityPartitionProjections( - Table table, List projectedFields, List inputRecords) throws Exception { - List rows = runWithProjection(projectedFields.toArray(new String[0])); - - for (int pos = 0; pos < inputRecords.size(); pos++) { - Record inputRecord = inputRecords.get(pos); - Row actualRecord = rows.get(pos); - - for (int i = 0; i < projectedFields.size(); i++) { - String name = projectedFields.get(i); - assertThat(inputRecord.getField(name)) - .as("Projected field " + name + " should match") - .isEqualTo(actualRecord.getField(i)); - } - } - } - - @TestTemplate - public void testSnapshotReads() throws Exception { - Table table = - CATALOG_EXTENSION.catalog().createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA); - - GenericAppenderHelper helper = new GenericAppenderHelper(table, fileFormat, temporaryDirectory); - - List expectedRecords = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 0L); - helper.appendToTable(expectedRecords); - long snapshotId = table.currentSnapshot().snapshotId(); - - long timestampMillis = table.currentSnapshot().timestampMillis(); - - // produce another timestamp - waitUntilAfter(timestampMillis); - helper.appendToTable(RandomGenericData.generate(TestFixtures.SCHEMA, 1, 0L)); - - TestHelpers.assertRecords( - runWithOptions(ImmutableMap.of("snapshot-id", Long.toString(snapshotId))), - expectedRecords, - TestFixtures.SCHEMA); - TestHelpers.assertRecords( - runWithOptions(ImmutableMap.of("as-of-timestamp", Long.toString(timestampMillis))), - expectedRecords, - TestFixtures.SCHEMA); - } - - @TestTemplate - public void testTagReads() throws Exception { - Table table = - CATALOG_EXTENSION.catalog().createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA); - - GenericAppenderHelper helper = new GenericAppenderHelper(table, fileFormat, temporaryDirectory); - - List expectedRecords1 = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 0L); - helper.appendToTable(expectedRecords1); - long snapshotId = table.currentSnapshot().snapshotId(); - - table.manageSnapshots().createTag("t1", snapshotId).commit(); - - TestHelpers.assertRecords( - runWithOptions(ImmutableMap.of("tag", "t1")), expectedRecords1, TestFixtures.SCHEMA); - - List expectedRecords2 = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 0L); - helper.appendToTable(expectedRecords2); - snapshotId = 
table.currentSnapshot().snapshotId(); - - table.manageSnapshots().replaceTag("t1", snapshotId).commit(); - - List expectedRecords = Lists.newArrayList(); - expectedRecords.addAll(expectedRecords1); - expectedRecords.addAll(expectedRecords2); - TestHelpers.assertRecords( - runWithOptions(ImmutableMap.of("tag", "t1")), expectedRecords, TestFixtures.SCHEMA); - } - - @TestTemplate - public void testBranchReads() throws Exception { - Table table = - CATALOG_EXTENSION.catalog().createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA); - - GenericAppenderHelper helper = new GenericAppenderHelper(table, fileFormat, temporaryDirectory); - - List expectedRecordsBase = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 0L); - helper.appendToTable(expectedRecordsBase); - long snapshotId = table.currentSnapshot().snapshotId(); - - String branchName = "b1"; - table.manageSnapshots().createBranch(branchName, snapshotId).commit(); - - List expectedRecordsForBranch = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 0L); - helper.appendToTable(branchName, expectedRecordsForBranch); - - List expectedRecordsForMain = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 0L); - helper.appendToTable(expectedRecordsForMain); - - List branchExpectedRecords = Lists.newArrayList(); - branchExpectedRecords.addAll(expectedRecordsBase); - branchExpectedRecords.addAll(expectedRecordsForBranch); - - TestHelpers.assertRecords( - runWithOptions(ImmutableMap.of("branch", branchName)), - branchExpectedRecords, - TestFixtures.SCHEMA); - - List mainExpectedRecords = Lists.newArrayList(); - mainExpectedRecords.addAll(expectedRecordsBase); - mainExpectedRecords.addAll(expectedRecordsForMain); - - TestHelpers.assertRecords(run(), mainExpectedRecords, TestFixtures.SCHEMA); - } - - @TestTemplate - public void testIncrementalReadViaTag() throws Exception { - Table table = - CATALOG_EXTENSION.catalog().createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA); - - GenericAppenderHelper helper = new GenericAppenderHelper(table, fileFormat, temporaryDirectory); - - List records1 = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 0L); - helper.appendToTable(records1); - long snapshotId1 = table.currentSnapshot().snapshotId(); - String startTag = "t1"; - table.manageSnapshots().createTag(startTag, snapshotId1).commit(); - - List records2 = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 1L); - helper.appendToTable(records2); - - List records3 = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 2L); - helper.appendToTable(records3); - long snapshotId3 = table.currentSnapshot().snapshotId(); - String endTag = "t2"; - table.manageSnapshots().createTag(endTag, snapshotId3).commit(); - - helper.appendToTable(RandomGenericData.generate(TestFixtures.SCHEMA, 1, 3L)); - - List expected = Lists.newArrayList(); - expected.addAll(records2); - expected.addAll(records3); - - TestHelpers.assertRecords( - runWithOptions( - ImmutableMap.builder() - .put("start-tag", startTag) - .put("end-tag", endTag) - .buildOrThrow()), - expected, - TestFixtures.SCHEMA); - - TestHelpers.assertRecords( - runWithOptions( - ImmutableMap.builder() - .put("start-snapshot-id", Long.toString(snapshotId1)) - .put("end-tag", endTag) - .buildOrThrow()), - expected, - TestFixtures.SCHEMA); - - TestHelpers.assertRecords( - runWithOptions( - ImmutableMap.builder() - .put("start-tag", startTag) - .put("end-snapshot-id", Long.toString(snapshotId3)) - .buildOrThrow()), - expected, - TestFixtures.SCHEMA); - - assertThatThrownBy( - () -> - runWithOptions( - 
ImmutableMap.builder() - .put("start-tag", startTag) - .put("end-tag", endTag) - .put("start-snapshot-id", Long.toString(snapshotId1)) - .buildOrThrow())) - .isInstanceOf(Exception.class) - .hasMessage("START_SNAPSHOT_ID and START_TAG cannot both be set."); - - assertThatThrownBy( - () -> - runWithOptions( - ImmutableMap.builder() - .put("start-tag", startTag) - .put("end-tag", endTag) - .put("end-snapshot-id", Long.toString(snapshotId3)) - .buildOrThrow())) - .isInstanceOf(Exception.class) - .hasMessage("END_SNAPSHOT_ID and END_TAG cannot both be set."); - } - - @TestTemplate - public void testIncrementalRead() throws Exception { - Table table = - CATALOG_EXTENSION.catalog().createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA); - - GenericAppenderHelper helper = new GenericAppenderHelper(table, fileFormat, temporaryDirectory); - - List records1 = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 0L); - helper.appendToTable(records1); - long snapshotId1 = table.currentSnapshot().snapshotId(); - - // snapshot 2 - List records2 = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 1L); - helper.appendToTable(records2); - - List records3 = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 2L); - helper.appendToTable(records3); - long snapshotId3 = table.currentSnapshot().snapshotId(); - - // snapshot 4 - helper.appendToTable(RandomGenericData.generate(TestFixtures.SCHEMA, 1, 3L)); - - List expected2 = Lists.newArrayList(); - expected2.addAll(records2); - expected2.addAll(records3); - TestHelpers.assertRecords( - runWithOptions( - ImmutableMap.builder() - .put("start-snapshot-id", Long.toString(snapshotId1)) - .put("end-snapshot-id", Long.toString(snapshotId3)) - .buildOrThrow()), - expected2, - TestFixtures.SCHEMA); - } - - @TestTemplate - public void testFilterExpPartition() throws Exception { - Table table = - CATALOG_EXTENSION - .catalog() - .createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA, TestFixtures.SPEC); - - List expectedRecords = RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L); - expectedRecords.get(0).set(2, "2020-03-20"); - expectedRecords.get(1).set(2, "2020-03-20"); - - GenericAppenderHelper helper = new GenericAppenderHelper(table, fileFormat, temporaryDirectory); - DataFile dataFile1 = - helper.writeFile(org.apache.iceberg.TestHelpers.Row.of("2020-03-20", 0), expectedRecords); - DataFile dataFile2 = - helper.writeFile( - org.apache.iceberg.TestHelpers.Row.of("2020-03-21", 0), - RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L)); - helper.appendToTable(dataFile1, dataFile2); - TestHelpers.assertRecords( - runWithFilter(Expressions.equal("dt", "2020-03-20"), "where dt='2020-03-20'", true), - expectedRecords, - TestFixtures.SCHEMA); - } - - private void testFilterExp(Expression filter, String sqlFilter, boolean caseSensitive) - throws Exception { - Table table = - CATALOG_EXTENSION.catalog().createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA); - - List expectedRecords = RandomGenericData.generate(TestFixtures.SCHEMA, 3, 0L); - expectedRecords.get(0).set(0, "a"); - expectedRecords.get(1).set(0, "b"); - expectedRecords.get(2).set(0, "c"); - - GenericAppenderHelper helper = new GenericAppenderHelper(table, fileFormat, temporaryDirectory); - DataFile dataFile = helper.writeFile(expectedRecords); - helper.appendToTable(dataFile); - - List actual = - runWithFilter(Expressions.greaterThanOrEqual("data", "b"), "where data>='b'", true); - - TestHelpers.assertRecords(actual, expectedRecords.subList(1, 3), TestFixtures.SCHEMA); - } - - 
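The incremental-read tests above drive the scan entirely through string options ("start-snapshot-id"/"end-snapshot-id", "start-tag"/"end-tag") and assert that a snapshot-id bound and a tag bound cannot both be set for the same side of the range; the expected rows also show that the start bound is exclusive and the end bound inclusive. A minimal sketch of assembling such option maps, using only the relocated Guava ImmutableMap already used by these tests (the class and method names below are illustrative, not part of the Iceberg API):

import java.util.Map;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;

class IncrementalReadOptionsSketch {
  // Range addressed by tags, as in testIncrementalReadViaTag: the snapshot tagged by
  // startTag is excluded, the snapshot tagged by endTag is included.
  static Map<String, String> betweenTags(String startTag, String endTag) {
    return ImmutableMap.<String, String>builder()
        .put("start-tag", startTag)
        .put("end-tag", endTag)
        .buildOrThrow();
  }

  // The same range addressed by snapshot ids, as in testIncrementalRead. Setting both
  // "start-tag" and "start-snapshot-id" (or both end variants) is rejected by the source.
  static Map<String, String> betweenSnapshotIds(long startSnapshotId, long endSnapshotId) {
    return ImmutableMap.<String, String>builder()
        .put("start-snapshot-id", Long.toString(startSnapshotId))
        .put("end-snapshot-id", Long.toString(endSnapshotId))
        .buildOrThrow();
  }
}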
@TestTemplate - public void testFilterExp() throws Exception { - testFilterExp(Expressions.greaterThanOrEqual("data", "b"), "where data>='b'", true); - } - - @TestTemplate - public void testFilterExpCaseInsensitive() throws Exception { - // sqlFilter does not support case-insensitive filtering: - // https://issues.apache.org/jira/browse/FLINK-16175 - testFilterExp(Expressions.greaterThanOrEqual("DATA", "b"), "where data>='b'", false); - } - - @TestTemplate - public void testPartitionTypes() throws Exception { - Schema typesSchema = - new Schema( - Types.NestedField.optional(1, "id", Types.IntegerType.get()), - Types.NestedField.optional(2, "decimal", Types.DecimalType.of(38, 18)), - Types.NestedField.optional(3, "str", Types.StringType.get()), - Types.NestedField.optional(4, "binary", Types.BinaryType.get()), - Types.NestedField.optional(5, "date", Types.DateType.get()), - Types.NestedField.optional(6, "time", Types.TimeType.get()), - Types.NestedField.optional(7, "timestamp", Types.TimestampType.withoutZone())); - PartitionSpec spec = - PartitionSpec.builderFor(typesSchema) - .identity("decimal") - .identity("str") - .identity("binary") - .identity("date") - .identity("time") - .identity("timestamp") - .build(); - - Table table = - CATALOG_EXTENSION.catalog().createTable(TestFixtures.TABLE_IDENTIFIER, typesSchema, spec); - List records = RandomGenericData.generate(typesSchema, 10, 0L); - GenericAppenderHelper appender = - new GenericAppenderHelper(table, fileFormat, temporaryDirectory); - for (Record record : records) { - org.apache.iceberg.TestHelpers.Row partition = - org.apache.iceberg.TestHelpers.Row.of( - record.get(1), - record.get(2), - record.get(3), - record.get(4) == null ? null : DateTimeUtil.daysFromDate((LocalDate) record.get(4)), - record.get(5) == null ? null : DateTimeUtil.microsFromTime((LocalTime) record.get(5)), - record.get(6) == null - ? null - : DateTimeUtil.microsFromTimestamp((LocalDateTime) record.get(6))); - appender.appendToTable(partition, Collections.singletonList(record)); - } - - TestHelpers.assertRecords(run(), records, typesSchema); - } - - @TestTemplate - public void testCustomizedFlinkDataTypes() throws Exception { - Schema schema = - new Schema( - Types.NestedField.required( - 1, - "map", - Types.MapType.ofRequired(2, 3, Types.StringType.get(), Types.StringType.get())), - Types.NestedField.required( - 4, "arr", Types.ListType.ofRequired(5, Types.StringType.get()))); - Table table = CATALOG_EXTENSION.catalog().createTable(TestFixtures.TABLE_IDENTIFIER, schema); - List records = RandomGenericData.generate(schema, 10, 0L); - GenericAppenderHelper helper = new GenericAppenderHelper(table, fileFormat, temporaryDirectory); - helper.appendToTable(records); - TestHelpers.assertRecords(run(), records, schema); - } - - private static void assertRows(List results, Row... 
expected) { - TestHelpers.assertRows(results, Arrays.asList(expected)); - } - - private static void waitUntilAfter(long timestampMillis) { - long current = System.currentTimeMillis(); - while (current <= timestampMillis) { - current = System.currentTimeMillis(); - } - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScanSql.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScanSql.java deleted file mode 100644 index 1493c0932044..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScanSql.java +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import java.io.IOException; -import java.util.List; -import java.util.Map; -import org.apache.flink.table.api.EnvironmentSettings; -import org.apache.flink.table.api.TableEnvironment; -import org.apache.flink.table.api.config.TableConfigOptions; -import org.apache.flink.types.Row; -import org.junit.jupiter.api.BeforeEach; - -/** Test Flink SELECT SQLs. */ -public class TestFlinkScanSql extends TestFlinkSource { - private volatile TableEnvironment tEnv; - - @BeforeEach - public void before() throws IOException { - SqlHelpers.sql( - getTableEnv(), - "create catalog iceberg_catalog with ('type'='iceberg', 'catalog-type'='hadoop', 'warehouse'='%s')", - CATALOG_EXTENSION.warehouse()); - SqlHelpers.sql(getTableEnv(), "use catalog iceberg_catalog"); - getTableEnv() - .getConfig() - .getConfiguration() - .set(TableConfigOptions.TABLE_DYNAMIC_TABLE_OPTIONS_ENABLED, true); - } - - private TableEnvironment getTableEnv() { - if (tEnv == null) { - synchronized (this) { - if (tEnv == null) { - this.tEnv = - TableEnvironment.create(EnvironmentSettings.newInstance().inBatchMode().build()); - } - } - } - return tEnv; - } - - @Override - protected List run( - FlinkSource.Builder formatBuilder, - Map sqlOptions, - String sqlFilter, - String... sqlSelectedFields) { - String select = String.join(",", sqlSelectedFields); - String optionStr = SqlHelpers.sqlOptionsToString(sqlOptions); - return SqlHelpers.sql(getTableEnv(), "select %s from t %s %s", select, optionStr, sqlFilter); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSource.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSource.java deleted file mode 100644 index dd50170f0fd7..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSource.java +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import java.util.Collections; -import java.util.List; -import java.util.Map; -import java.util.Optional; -import org.apache.flink.table.api.TableColumn; -import org.apache.flink.table.api.TableSchema; -import org.apache.flink.types.Row; -import org.apache.iceberg.expressions.Expression; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; - -public abstract class TestFlinkSource extends TestFlinkScan { - - @Override - protected List runWithProjection(String... projected) throws Exception { - TableSchema.Builder builder = TableSchema.builder(); - TableSchema schema = - FlinkSchemaUtil.toSchema( - FlinkSchemaUtil.convert( - CATALOG_EXTENSION.catalog().loadTable(TestFixtures.TABLE_IDENTIFIER).schema())); - for (String field : projected) { - TableColumn column = schema.getTableColumn(field).get(); - builder.field(column.getName(), column.getType()); - } - return run(FlinkSource.forRowData().project(builder.build()), Maps.newHashMap(), "", projected); - } - - @Override - protected List runWithFilter(Expression filter, String sqlFilter, boolean caseSensitive) - throws Exception { - FlinkSource.Builder builder = - FlinkSource.forRowData().filters(Collections.singletonList(filter)); - Map options = Maps.newHashMap(); - options.put("case-sensitive", Boolean.toString(caseSensitive)); - return run(builder, options, sqlFilter, "*"); - } - - @Override - protected List runWithOptions(Map options) throws Exception { - FlinkSource.Builder builder = FlinkSource.forRowData(); - Optional.ofNullable(options.get("case-sensitive")) - .ifPresent(value -> builder.caseSensitive(Boolean.parseBoolean(value))); - Optional.ofNullable(options.get("snapshot-id")) - .ifPresent(value -> builder.snapshotId(Long.parseLong(value))); - Optional.ofNullable(options.get("tag")).ifPresent(value -> builder.tag(value)); - Optional.ofNullable(options.get("branch")).ifPresent(value -> builder.branch(value)); - Optional.ofNullable(options.get("start-tag")).ifPresent(value -> builder.startTag(value)); - Optional.ofNullable(options.get("end-tag")).ifPresent(value -> builder.endTag(value)); - Optional.ofNullable(options.get("start-snapshot-id")) - .ifPresent(value -> builder.startSnapshotId(Long.parseLong(value))); - Optional.ofNullable(options.get("end-snapshot-id")) - .ifPresent(value -> builder.endSnapshotId(Long.parseLong(value))); - Optional.ofNullable(options.get("as-of-timestamp")) - .ifPresent(value -> builder.asOfTimestamp(Long.parseLong(value))); - return run(builder, options, "", "*"); - } - - @Override - protected List run() throws Exception { - return run(FlinkSource.forRowData(), Maps.newHashMap(), "", "*"); - } - - protected abstract List run( - FlinkSource.Builder 
formatBuilder, - Map sqlOptions, - String sqlFilter, - String... sqlSelectedFields) - throws Exception; -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSourceConfig.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSourceConfig.java deleted file mode 100644 index 14131d9e96d5..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSourceConfig.java +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; - -import java.util.List; -import org.apache.flink.types.Row; -import org.apache.iceberg.flink.FlinkReadOptions; -import org.junit.jupiter.api.TestTemplate; - -public class TestFlinkSourceConfig extends TableSourceTestBase { - private static final String TABLE = "test_table"; - - @TestTemplate - public void testFlinkSessionConfig() { - getTableEnv().getConfig().set(FlinkReadOptions.STREAMING_OPTION, true); - assertThatThrownBy(() -> sql("SELECT * FROM %s /*+ OPTIONS('as-of-timestamp'='1')*/", TABLE)) - .isInstanceOf(IllegalArgumentException.class) - .hasMessage("Cannot set as-of-timestamp option for streaming reader"); - } - - @TestTemplate - public void testFlinkHintConfig() { - List result = - sql( - "SELECT * FROM %s /*+ OPTIONS('as-of-timestamp'='%d','streaming'='false')*/", - TABLE, System.currentTimeMillis()); - assertThat(result).hasSize(3); - } - - @TestTemplate - public void testReadOptionHierarchy() { - getTableEnv().getConfig().set(FlinkReadOptions.LIMIT_OPTION, 1L); - List result = sql("SELECT * FROM %s", TABLE); - // Note that this query doesn't have the limit clause in the SQL. - // This assertions works because limit is pushed down to the reader and - // reader parallelism is 1. - assertThat(result).hasSize(1); - - result = sql("SELECT * FROM %s /*+ OPTIONS('limit'='3')*/", TABLE); - assertThat(result).hasSize(3); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSourceSql.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSourceSql.java deleted file mode 100644 index e1162c3225b1..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSourceSql.java +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import java.io.IOException; -import java.util.List; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.configuration.PipelineOptions; -import org.apache.flink.table.api.config.TableConfigOptions; -import org.apache.flink.types.Row; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Table; -import org.apache.iceberg.data.GenericAppenderHelper; -import org.apache.iceberg.data.RandomGenericData; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.junit.jupiter.api.Test; - -/** Use the FlinkSource */ -public class TestFlinkSourceSql extends TestSqlBase { - @Override - public void before() throws IOException { - SqlHelpers.sql( - getTableEnv(), - "create catalog iceberg_catalog with ('type'='iceberg', 'catalog-type'='hadoop', 'warehouse'='%s')", - CATALOG_EXTENSION.warehouse()); - SqlHelpers.sql(getTableEnv(), "use catalog iceberg_catalog"); - getTableEnv() - .getConfig() - .getConfiguration() - .set(TableConfigOptions.TABLE_DYNAMIC_TABLE_OPTIONS_ENABLED, true); - } - - @Test - public void testInferParallelismWithGlobalSetting() throws IOException { - Configuration cfg = getTableEnv().getConfig().getConfiguration(); - cfg.set(PipelineOptions.MAX_PARALLELISM, 1); - - Table table = - CATALOG_EXTENSION - .catalog() - .createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA, null); - - GenericAppenderHelper helper = - new GenericAppenderHelper(table, FileFormat.PARQUET, temporaryFolder); - List expectedRecords = Lists.newArrayList(); - long maxFileLen = 0; - for (int i = 0; i < 5; i++) { - List records = RandomGenericData.generate(TestFixtures.SCHEMA, 2, i); - DataFile dataFile = helper.writeFile(null, records); - helper.appendToTable(dataFile); - expectedRecords.addAll(records); - maxFileLen = Math.max(dataFile.fileSizeInBytes(), maxFileLen); - } - - // Make sure to generate multiple CombinedScanTasks - SqlHelpers.sql( - getTableEnv(), - "ALTER TABLE t SET ('read.split.open-file-cost'='1', 'read.split.target-size'='%s')", - maxFileLen); - - List results = run(Maps.newHashMap(), "", "*"); - org.apache.iceberg.flink.TestHelpers.assertRecords( - results, expectedRecords, TestFixtures.SCHEMA); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkTableSource.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkTableSource.java deleted file mode 100644 index 18528c789114..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkTableSource.java +++ /dev/null @@ -1,561 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; - -import java.util.List; -import org.apache.flink.table.api.SqlParserException; -import org.apache.flink.types.Row; -import org.apache.iceberg.expressions.Expressions; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.junit.jupiter.api.TestTemplate; - -public class TestFlinkTableSource extends TableSourceTestBase { - - @TestTemplate - public void testLimitPushDown() { - - assertThatThrownBy(() -> sql("SELECT * FROM %s LIMIT -1", TABLE_NAME)) - .isInstanceOf(SqlParserException.class) - .hasMessageStartingWith("SQL parse failed."); - - assertThat(sql("SELECT * FROM %s LIMIT 0", TABLE_NAME)).isEmpty(); - - String sqlLimitExceed = String.format("SELECT * FROM %s LIMIT 4", TABLE_NAME); - List resultExceed = sql(sqlLimitExceed); - assertThat(resultExceed).hasSize(3); - List expectedList = - Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0), Row.of(3, null, 30.0)); - assertSameElements(expectedList, resultExceed); - - String querySql = String.format("SELECT * FROM %s LIMIT 1", TABLE_NAME); - String explain = getTableEnv().explainSql(querySql); - String expectedExplain = "limit=[1]"; - assertThat(explain).as("Explain should contain LimitPushDown").contains(expectedExplain); - List result = sql(querySql); - assertThat(result).hasSize(1); - assertThat(result).containsAnyElementsOf(expectedList); - - String sqlMixed = String.format("SELECT * FROM %s WHERE id = 1 LIMIT 2", TABLE_NAME); - List mixedResult = sql(sqlMixed); - assertThat(mixedResult).hasSize(1).first().isEqualTo(Row.of(1, "iceberg", 10.0)); - } - - @TestTemplate - public void testNoFilterPushDown() { - String sql = String.format("SELECT * FROM %s ", TABLE_NAME); - List result = sql(sql); - List expectedRecords = - Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0), Row.of(3, null, 30.0)); - assertSameElements(expectedRecords, result); - assertThat(lastScanEvent.filter()) - .as("Should not push down a filter") - .isEqualTo(Expressions.alwaysTrue()); - } - - @TestTemplate - public void testFilterPushDownEqual() { - String sqlLiteralRight = String.format("SELECT * FROM %s WHERE id = 1 ", TABLE_NAME); - String expectedFilter = "ref(name=\"id\") == 1"; - - List result = sql(sqlLiteralRight); - assertThat(result).hasSize(1).first().isEqualTo(Row.of(1, "iceberg", 10.0)); - assertThat(scanEventCount).isEqualTo(1); - assertThat(lastScanEvent.filter()) - .as("Should contain the push down filter") - .asString() - .isEqualTo(expectedFilter); - } - - @TestTemplate - public void testFilterPushDownEqualNull() { - String sqlEqualNull = String.format("SELECT * FROM %s WHERE data = NULL ", TABLE_NAME); - - List result = 
sql(sqlEqualNull); - assertThat(result).isEmpty(); - assertThat(lastScanEvent).as("Should not push down a filter").isNull(); - } - - @TestTemplate - public void testFilterPushDownEqualLiteralOnLeft() { - String sqlLiteralLeft = String.format("SELECT * FROM %s WHERE 1 = id ", TABLE_NAME); - String expectedFilter = "ref(name=\"id\") == 1"; - - List resultLeft = sql(sqlLiteralLeft); - assertThat(resultLeft).hasSize(1).first().isEqualTo(Row.of(1, "iceberg", 10.0)); - assertThat(scanEventCount).isEqualTo(1); - assertThat(lastScanEvent.filter()) - .as("Should contain the push down filter") - .asString() - .isEqualTo(expectedFilter); - } - - @TestTemplate - public void testFilterPushDownNoEqual() { - String sqlNE = String.format("SELECT * FROM %s WHERE id <> 1 ", TABLE_NAME); - String expectedFilter = "ref(name=\"id\") != 1"; - - List resultNE = sql(sqlNE); - assertThat(resultNE).hasSize(2); - - List expectedNE = Lists.newArrayList(Row.of(2, "b", 20.0), Row.of(3, null, 30.0)); - assertSameElements(expectedNE, resultNE); - assertThat(scanEventCount).isEqualTo(1); - assertThat(lastScanEvent.filter()) - .as("Should contain the push down filter") - .asString() - .isEqualTo(expectedFilter); - } - - @TestTemplate - public void testFilterPushDownNoEqualNull() { - String sqlNotEqualNull = String.format("SELECT * FROM %s WHERE data <> NULL ", TABLE_NAME); - - List resultNE = sql(sqlNotEqualNull); - assertThat(resultNE).isEmpty(); - assertThat(lastScanEvent).as("Should not push down a filter").isNull(); - } - - @TestTemplate - public void testFilterPushDownAnd() { - String sqlAnd = - String.format("SELECT * FROM %s WHERE id = 1 AND data = 'iceberg' ", TABLE_NAME); - - List resultAnd = sql(sqlAnd); - assertThat(resultAnd).hasSize(1).first().isEqualTo(Row.of(1, "iceberg", 10.0)); - assertThat(scanEventCount).isEqualTo(1); - String expected = "(ref(name=\"id\") == 1 and ref(name=\"data\") == \"iceberg\")"; - assertThat(lastScanEvent.filter()) - .as("Should contain the push down filter") - .asString() - .isEqualTo(expected); - } - - @TestTemplate - public void testFilterPushDownOr() { - String sqlOr = String.format("SELECT * FROM %s WHERE id = 1 OR data = 'b' ", TABLE_NAME); - String expectedFilter = "(ref(name=\"id\") == 1 or ref(name=\"data\") == \"b\")"; - - List resultOr = sql(sqlOr); - assertThat(resultOr).hasSize(2); - - List expectedOR = Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0)); - assertSameElements(expectedOR, resultOr); - - assertThat(scanEventCount).isEqualTo(1); - assertThat(lastScanEvent.filter()) - .as("Should contain the push down filter") - .asString() - .isEqualTo(expectedFilter); - } - - @TestTemplate - public void testFilterPushDownGreaterThan() { - String sqlGT = String.format("SELECT * FROM %s WHERE id > 1 ", TABLE_NAME); - String expectedFilter = "ref(name=\"id\") > 1"; - - List resultGT = sql(sqlGT); - assertThat(resultGT).hasSize(2); - - List expectedGT = Lists.newArrayList(Row.of(2, "b", 20.0), Row.of(3, null, 30.0)); - assertSameElements(expectedGT, resultGT); - - assertThat(scanEventCount).isEqualTo(1); - assertThat(lastScanEvent.filter()) - .as("Should contain the push down filter") - .asString() - .isEqualTo(expectedFilter); - } - - @TestTemplate - public void testFilterPushDownGreaterThanNull() { - String sqlGT = String.format("SELECT * FROM %s WHERE data > null ", TABLE_NAME); - - List resultGT = sql(sqlGT); - assertThat(resultGT).isEmpty(); - assertThat(lastScanEvent).as("Should not push down a filter").isNull(); - } - - @TestTemplate - public void 
testFilterPushDownGreaterThanLiteralOnLeft() { - String sqlGT = String.format("SELECT * FROM %s WHERE 3 > id ", TABLE_NAME); - String expectedFilter = "ref(name=\"id\") < 3"; - - List resultGT = sql(sqlGT); - assertThat(resultGT).hasSize(2); - - List expectedGT = Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0)); - assertSameElements(expectedGT, resultGT); - - assertThat(scanEventCount).isEqualTo(1); - assertThat(lastScanEvent.filter()) - .as("Should contain the push down filter") - .asString() - .isEqualTo(expectedFilter); - } - - @TestTemplate - public void testFilterPushDownGreaterThanEqual() { - String sqlGTE = String.format("SELECT * FROM %s WHERE id >= 2 ", TABLE_NAME); - String expectedFilter = "ref(name=\"id\") >= 2"; - - List resultGTE = sql(sqlGTE); - assertThat(resultGTE).hasSize(2); - - List expectedGTE = Lists.newArrayList(Row.of(2, "b", 20.0), Row.of(3, null, 30.0)); - assertSameElements(expectedGTE, resultGTE); - - assertThat(scanEventCount).isEqualTo(1); - assertThat(lastScanEvent.filter()) - .as("Should contain the push down filter") - .asString() - .isEqualTo(expectedFilter); - } - - @TestTemplate - public void testFilterPushDownGreaterThanEqualNull() { - String sqlGTE = String.format("SELECT * FROM %s WHERE data >= null ", TABLE_NAME); - - List resultGT = sql(sqlGTE); - assertThat(resultGT).isEmpty(); - assertThat(lastScanEvent).as("Should not push down a filter").isNull(); - } - - @TestTemplate - public void testFilterPushDownGreaterThanEqualLiteralOnLeft() { - String sqlGTE = String.format("SELECT * FROM %s WHERE 2 >= id ", TABLE_NAME); - String expectedFilter = "ref(name=\"id\") <= 2"; - - List resultGTE = sql(sqlGTE); - assertThat(resultGTE).hasSize(2); - - List expectedGTE = Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0)); - assertSameElements(expectedGTE, resultGTE); - - assertThat(scanEventCount).isEqualTo(1); - assertThat(lastScanEvent.filter()) - .as("Should contain the push down filter") - .asString() - .isEqualTo(expectedFilter); - } - - @TestTemplate - public void testFilterPushDownLessThan() { - String sqlLT = String.format("SELECT * FROM %s WHERE id < 2 ", TABLE_NAME); - String expectedFilter = "ref(name=\"id\") < 2"; - - List resultLT = sql(sqlLT); - assertThat(resultLT).hasSize(1).first().isEqualTo(Row.of(1, "iceberg", 10.0)); - - assertThat(scanEventCount).isEqualTo(1); - assertThat(lastScanEvent.filter()) - .as("Should contain the push down filter") - .asString() - .isEqualTo(expectedFilter); - } - - @TestTemplate - public void testFilterPushDownLessThanNull() { - String sqlLT = String.format("SELECT * FROM %s WHERE data < null ", TABLE_NAME); - - List resultGT = sql(sqlLT); - assertThat(resultGT).isEmpty(); - assertThat(lastScanEvent).as("Should not push down a filter").isNull(); - } - - @TestTemplate - public void testFilterPushDownLessThanLiteralOnLeft() { - String sqlLT = String.format("SELECT * FROM %s WHERE 2 < id ", TABLE_NAME); - String expectedFilter = "ref(name=\"id\") > 2"; - - List resultLT = sql(sqlLT); - assertThat(resultLT).hasSize(1).first().isEqualTo(Row.of(3, null, 30.0)); - - assertThat(scanEventCount).isEqualTo(1); - assertThat(lastScanEvent.filter()) - .as("Should contain the push down filter") - .asString() - .isEqualTo(expectedFilter); - } - - @TestTemplate - public void testFilterPushDownLessThanEqual() { - String sqlLTE = String.format("SELECT * FROM %s WHERE id <= 1 ", TABLE_NAME); - String expectedFilter = "ref(name=\"id\") <= 1"; - - List resultLTE = sql(sqlLTE); - 
assertThat(resultLTE).hasSize(1).first().isEqualTo(Row.of(1, "iceberg", 10.0)); - - assertThat(scanEventCount).isEqualTo(1); - assertThat(lastScanEvent.filter()) - .as("Should contain the push down filter") - .asString() - .isEqualTo(expectedFilter); - } - - @TestTemplate - public void testFilterPushDownLessThanEqualNull() { - String sqlLTE = String.format("SELECT * FROM %s WHERE data <= null ", TABLE_NAME); - - List resultGT = sql(sqlLTE); - assertThat(resultGT).isEmpty(); - assertThat(lastScanEvent).as("Should not push down a filter").isNull(); - } - - @TestTemplate - public void testFilterPushDownLessThanEqualLiteralOnLeft() { - String sqlLTE = String.format("SELECT * FROM %s WHERE 3 <= id ", TABLE_NAME); - String expectedFilter = "ref(name=\"id\") >= 3"; - - List resultLTE = sql(sqlLTE); - assertThat(resultLTE).hasSize(1).first().isEqualTo(Row.of(3, null, 30.0)); - - assertThat(scanEventCount).isEqualTo(1); - assertThat(lastScanEvent.filter()) - .as("Should contain the push down filter") - .asString() - .isEqualTo(expectedFilter); - } - - @TestTemplate - public void testFilterPushDownIn() { - String sqlIN = String.format("SELECT * FROM %s WHERE id IN (1,2) ", TABLE_NAME); - String expectedFilter = "(ref(name=\"id\") == 1 or ref(name=\"id\") == 2)"; - List resultIN = sql(sqlIN); - assertThat(resultIN).hasSize(2); - - List expectedIN = Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0)); - assertSameElements(expectedIN, resultIN); - assertThat(scanEventCount).isEqualTo(1); - assertThat(lastScanEvent.filter()) - .as("Should contain the push down filter") - .asString() - .isEqualTo(expectedFilter); - } - - @TestTemplate - public void testFilterPushDownInNull() { - String sqlInNull = - String.format("SELECT * FROM %s WHERE data IN ('iceberg',NULL) ", TABLE_NAME); - - List result = sql(sqlInNull); - assertThat(result).hasSize(1).first().isEqualTo(Row.of(1, "iceberg", 10.0)); - - // In SQL, null check can only be done as IS NULL or IS NOT NULL, so it's correct to ignore it - // and push the rest down. 
- String expectedScan = "ref(name=\"data\") == \"iceberg\""; - assertThat(lastScanEvent.filter()) - .as("Should contain the push down filter") - .asString() - .isEqualTo(expectedScan); - } - - @TestTemplate - public void testFilterPushDownNotIn() { - String sqlNotIn = String.format("SELECT * FROM %s WHERE id NOT IN (3,2) ", TABLE_NAME); - - List resultNotIn = sql(sqlNotIn); - assertThat(resultNotIn).hasSize(1).first().isEqualTo(Row.of(1, "iceberg", 10.0)); - assertThat(scanEventCount).isEqualTo(1); - String expectedScan = "(ref(name=\"id\") != 2 and ref(name=\"id\") != 3)"; - assertThat(lastScanEvent.filter()) - .as("Should contain the push down filter") - .asString() - .isEqualTo(expectedScan); - } - - @TestTemplate - public void testFilterPushDownNotInNull() { - String sqlNotInNull = String.format("SELECT * FROM %s WHERE id NOT IN (1,2,NULL) ", TABLE_NAME); - List resultGT = sql(sqlNotInNull); - assertThat(resultGT).isEmpty(); - assertThat(lastScanEvent) - .as( - "As the predicate pushdown filter out all rows, Flink did not create scan plan, so it doesn't publish any ScanEvent.") - .isNull(); - } - - @TestTemplate - public void testFilterPushDownIsNotNull() { - String sqlNotNull = String.format("SELECT * FROM %s WHERE data IS NOT NULL", TABLE_NAME); - String expectedFilter = "not_null(ref(name=\"data\"))"; - - List resultNotNull = sql(sqlNotNull); - assertThat(resultNotNull).hasSize(2); - - List expected = Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0)); - assertSameElements(expected, resultNotNull); - - assertThat(scanEventCount).isEqualTo(1); - assertThat(lastScanEvent.filter()) - .as("Should contain the push down filter") - .asString() - .isEqualTo(expectedFilter); - } - - @TestTemplate - public void testFilterPushDownIsNull() { - String sqlNull = String.format("SELECT * FROM %s WHERE data IS NULL", TABLE_NAME); - String expectedFilter = "is_null(ref(name=\"data\"))"; - - List resultNull = sql(sqlNull); - assertThat(resultNull).hasSize(1).first().isEqualTo(Row.of(3, null, 30.0)); - - assertThat(scanEventCount).isEqualTo(1); - assertThat(lastScanEvent.filter()) - .as("Should contain the push down filter") - .asString() - .isEqualTo(expectedFilter); - } - - @TestTemplate - public void testFilterPushDownNot() { - String sqlNot = String.format("SELECT * FROM %s WHERE NOT (id = 1 OR id = 2 ) ", TABLE_NAME); - - List resultNot = sql(sqlNot); - assertThat(resultNot).hasSize(1).first().isEqualTo(Row.of(3, null, 30.0)); - - assertThat(scanEventCount).isEqualTo(1); - String expectedFilter = "(ref(name=\"id\") != 1 and ref(name=\"id\") != 2)"; - assertThat(lastScanEvent.filter()) - .as("Should contain the push down filter") - .asString() - .isEqualTo(expectedFilter); - } - - @TestTemplate - public void testFilterPushDownBetween() { - String sqlBetween = String.format("SELECT * FROM %s WHERE id BETWEEN 1 AND 2 ", TABLE_NAME); - - List resultBetween = sql(sqlBetween); - assertThat(resultBetween).hasSize(2); - - List expectedBetween = - Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0)); - assertSameElements(expectedBetween, resultBetween); - - assertThat(scanEventCount).isEqualTo(1); - String expected = "(ref(name=\"id\") >= 1 and ref(name=\"id\") <= 2)"; - assertThat(lastScanEvent.filter()) - .as("Should contain the push down filter") - .asString() - .isEqualTo(expected); - } - - @TestTemplate - public void testFilterPushDownNotBetween() { - String sqlNotBetween = - String.format("SELECT * FROM %s WHERE id NOT BETWEEN 2 AND 3 ", TABLE_NAME); - String 
expectedFilter = "(ref(name=\"id\") < 2 or ref(name=\"id\") > 3)"; - - List resultNotBetween = sql(sqlNotBetween); - assertThat(resultNotBetween).hasSize(1).first().isEqualTo(Row.of(1, "iceberg", 10.0)); - - assertThat(scanEventCount).isEqualTo(1); - assertThat(lastScanEvent.filter()) - .as("Should contain the push down filter") - .asString() - .isEqualTo(expectedFilter); - } - - @TestTemplate - public void testFilterPushDownLike() { - String expectedFilter = "ref(name=\"data\") startsWith \"\"ice\"\""; - - String sqlLike = "SELECT * FROM " + TABLE_NAME + " WHERE data LIKE 'ice%%' "; - List resultLike = sql(sqlLike); - assertThat(resultLike).hasSize(1).first().isEqualTo(Row.of(1, "iceberg", 10.0)); - assertThat(scanEventCount).isEqualTo(1); - assertThat(lastScanEvent.filter()) - .as("Should contain the push down filter") - .asString() - .isEqualTo(expectedFilter); - - // %% won't match the row with null value - sqlLike = "SELECT * FROM " + TABLE_NAME + " WHERE data LIKE '%%' "; - resultLike = sql(sqlLike); - assertThat(resultLike).hasSize(2); - List expectedRecords = - Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0)); - assertSameElements(expectedRecords, resultLike); - String expectedScan = "not_null(ref(name=\"data\"))"; - assertThat(lastScanEvent.filter()) - .as("Should contain the push down filter") - .asString() - .isEqualTo(expectedScan); - } - - @TestTemplate - public void testFilterNotPushDownLike() { - Row expectRecord = Row.of(1, "iceberg", 10.0); - String sqlNoPushDown = "SELECT * FROM " + TABLE_NAME + " WHERE data LIKE '%%i' "; - List resultLike = sql(sqlNoPushDown); - assertThat(resultLike).isEmpty(); - assertThat(lastScanEvent.filter()) - .as("Should not push down a filter") - .isEqualTo(Expressions.alwaysTrue()); - - sqlNoPushDown = "SELECT * FROM " + TABLE_NAME + " WHERE data LIKE '%%i%%' "; - resultLike = sql(sqlNoPushDown); - assertThat(resultLike).hasSize(1).first().isEqualTo(expectRecord); - assertThat(lastScanEvent.filter()) - .as("Should not push down a filter") - .isEqualTo(Expressions.alwaysTrue()); - - sqlNoPushDown = "SELECT * FROM " + TABLE_NAME + " WHERE data LIKE '%%ice%%g' "; - resultLike = sql(sqlNoPushDown); - assertThat(resultLike).hasSize(1).first().isEqualTo(expectRecord); - assertThat(lastScanEvent.filter()) - .as("Should not push down a filter") - .isEqualTo(Expressions.alwaysTrue()); - - sqlNoPushDown = "SELECT * FROM " + TABLE_NAME + " WHERE data LIKE 'iceber_' "; - resultLike = sql(sqlNoPushDown); - assertThat(resultLike).hasSize(1).first().isEqualTo(expectRecord); - assertThat(lastScanEvent.filter()) - .as("Should not push down a filter") - .isEqualTo(Expressions.alwaysTrue()); - - sqlNoPushDown = "SELECT * FROM " + TABLE_NAME + " WHERE data LIKE 'i%%g' "; - resultLike = sql(sqlNoPushDown); - assertThat(resultLike).hasSize(1).first().isEqualTo(expectRecord); - assertThat(lastScanEvent.filter()) - .as("Should not push down a filter") - .isEqualTo(Expressions.alwaysTrue()); - } - - @TestTemplate - public void testFilterPushDown2Literal() { - String sql2Literal = String.format("SELECT * FROM %s WHERE 1 > 0 ", TABLE_NAME); - List result = sql(sql2Literal); - List expectedRecords = - Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0), Row.of(3, null, 30.0)); - assertSameElements(expectedRecords, result); - assertThat(lastScanEvent.filter()) - .as("Should not push down a filter") - .isEqualTo(Expressions.alwaysTrue()); - } - - @TestTemplate - public void testSqlParseNaN() { - // todo add some test case to test NaN - } 
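The pushdown tests above assert against the string form of the Iceberg expression reported by lastScanEvent.filter(), for example ref(name="id") == 1. A minimal sketch of the same predicates built directly with the public org.apache.iceberg.expressions.Expressions factory; the wrapper class and method names are illustrative only:

import org.apache.iceberg.expressions.Expression;
import org.apache.iceberg.expressions.Expressions;

class PushdownExpectationSketch {
  // WHERE id = 1 is expected to reach the scan as ref(name="id") == 1.
  static Expression idEqualsOne() {
    return Expressions.equal("id", 1);
  }

  // WHERE id = 1 AND data = 'iceberg' combines two predicates with and().
  static Expression idAndData() {
    return Expressions.and(Expressions.equal("id", 1), Expressions.equal("data", "iceberg"));
  }

  // Only a prefix LIKE such as 'ice%' can be pushed down, as a startsWith predicate;
  // patterns like '%i', '%i%' or 'iceber_' fall back to Expressions.alwaysTrue() above.
  static Expression dataStartsWithIce() {
    return Expressions.startsWith("data", "ice");
  }
}

Asserting on the rendered expression keeps these tests independent of Flink's internal filter representation and checks only what actually reaches the Iceberg scan.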
-} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBounded.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBounded.java deleted file mode 100644 index b7447d15c05a..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBounded.java +++ /dev/null @@ -1,147 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import static org.apache.iceberg.flink.SimpleDataUtil.SCHEMA; -import static org.assertj.core.api.Assertions.assertThatThrownBy; - -import java.util.Collections; -import java.util.List; -import java.util.Map; -import org.apache.flink.api.common.eventtime.WatermarkStrategy; -import org.apache.flink.api.common.typeinfo.TypeInformation; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.table.api.TableColumn; -import org.apache.flink.table.api.TableSchema; -import org.apache.flink.table.data.RowData; -import org.apache.flink.types.Row; -import org.apache.flink.util.CloseableIterator; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.expressions.Expression; -import org.apache.iceberg.flink.FlinkConfigOptions; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.flink.data.RowDataToRowMapper; -import org.apache.iceberg.flink.source.assigner.SimpleSplitAssignerFactory; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.junit.jupiter.api.TestTemplate; - -public class TestIcebergSourceBounded extends TestFlinkScan { - @TestTemplate - public void testValidation() { - CATALOG_EXTENSION.catalog().createTable(TestFixtures.TABLE_IDENTIFIER, SCHEMA); - - assertThatThrownBy( - () -> - IcebergSource.forRowData() - .tableLoader(tableLoader()) - .assignerFactory(new SimpleSplitAssignerFactory()) - .streaming(false) - .endTag("tag") - .endSnapshotId(1L) - .build()) - .hasMessage("END_SNAPSHOT_ID and END_TAG cannot both be set.") - .isInstanceOf(IllegalArgumentException.class); - } - - @Override - protected List runWithProjection(String... 
projected) throws Exception { - Schema icebergTableSchema = - CATALOG_EXTENSION.catalog().loadTable(TestFixtures.TABLE_IDENTIFIER).schema(); - TableSchema.Builder builder = TableSchema.builder(); - TableSchema schema = FlinkSchemaUtil.toSchema(FlinkSchemaUtil.convert(icebergTableSchema)); - for (String field : projected) { - TableColumn column = schema.getTableColumn(field).get(); - builder.field(column.getName(), column.getType()); - } - TableSchema flinkSchema = builder.build(); - Schema projectedSchema = FlinkSchemaUtil.convert(icebergTableSchema, flinkSchema); - return run(projectedSchema, Lists.newArrayList(), Maps.newHashMap(), "", projected); - } - - @Override - protected List runWithFilter(Expression filter, String sqlFilter, boolean caseSensitive) - throws Exception { - Map options = Maps.newHashMap(); - options.put("case-sensitive", Boolean.toString(caseSensitive)); - return run(null, Collections.singletonList(filter), options, sqlFilter, "*"); - } - - @Override - protected List runWithOptions(Map options) throws Exception { - return run(null, Lists.newArrayList(), options, "", "*"); - } - - @Override - protected List run() throws Exception { - return run(null, Lists.newArrayList(), Maps.newHashMap(), "", "*"); - } - - protected List run( - Schema projectedSchema, - List filters, - Map options, - String sqlFilter, - String... sqlSelectedFields) - throws Exception { - - StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); - env.setParallelism(1); - Configuration config = new Configuration(); - config.setInteger(FlinkConfigOptions.SOURCE_READER_FETCH_BATCH_RECORD_COUNT, 128); - Table table; - try (TableLoader tableLoader = tableLoader()) { - tableLoader.open(); - table = tableLoader.loadTable(); - } - - IcebergSource.Builder sourceBuilder = - IcebergSource.forRowData() - .tableLoader(tableLoader()) - .table(table) - .assignerFactory(new SimpleSplitAssignerFactory()) - .flinkConfig(config); - if (projectedSchema != null) { - sourceBuilder.project(projectedSchema); - } - - sourceBuilder.filters(filters); - sourceBuilder.properties(options); - - DataStream stream = - env.fromSource( - sourceBuilder.build(), - WatermarkStrategy.noWatermarks(), - "testBasicRead", - TypeInformation.of(RowData.class)) - .map( - new RowDataToRowMapper( - FlinkSchemaUtil.convert( - projectedSchema == null ? table.schema() : projectedSchema))); - - try (CloseableIterator iter = stream.executeAndCollect()) { - return Lists.newArrayList(iter); - } - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedGenericRecord.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedGenericRecord.java deleted file mode 100644 index 7bfed00a9eb4..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedGenericRecord.java +++ /dev/null @@ -1,196 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import java.nio.file.Path; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; -import java.util.Map; -import org.apache.avro.generic.GenericRecord; -import org.apache.flink.api.common.eventtime.WatermarkStrategy; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.formats.avro.typeutils.GenericRecordAvroTypeInfo; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.types.Row; -import org.apache.flink.util.CloseableIterator; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Parameter; -import org.apache.iceberg.ParameterizedTestExtension; -import org.apache.iceberg.Parameters; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.avro.AvroSchemaUtil; -import org.apache.iceberg.data.GenericAppenderHelper; -import org.apache.iceberg.data.RandomGenericData; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.expressions.Expression; -import org.apache.iceberg.flink.FlinkConfigOptions; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.HadoopCatalogExtension; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.flink.TestHelpers; -import org.apache.iceberg.flink.data.RowDataToRowMapper; -import org.apache.iceberg.flink.sink.AvroGenericRecordToRowDataMapper; -import org.apache.iceberg.flink.source.assigner.SimpleSplitAssignerFactory; -import org.apache.iceberg.flink.source.reader.AvroGenericRecordReaderFunction; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.apache.iceberg.types.TypeUtil; -import org.junit.jupiter.api.TestTemplate; -import org.junit.jupiter.api.extension.ExtendWith; -import org.junit.jupiter.api.extension.RegisterExtension; -import org.junit.jupiter.api.io.TempDir; - -@ExtendWith(ParameterizedTestExtension.class) -public class TestIcebergSourceBoundedGenericRecord { - @TempDir protected Path temporaryFolder; - - @RegisterExtension - private static final HadoopCatalogExtension CATALOG_EXTENSION = - new HadoopCatalogExtension(TestFixtures.DATABASE, TestFixtures.TABLE); - - @Parameters(name = "format={0}, parallelism = {1}") - public static Object[][] parameters() { - return new Object[][] { - {FileFormat.AVRO, 2}, - {FileFormat.PARQUET, 2}, - {FileFormat.ORC, 2} - }; - } - - @Parameter(index = 0) - private FileFormat fileFormat; - - @Parameter(index = 1) - private int parallelism; - - @TestTemplate - public void testUnpartitionedTable() throws Exception { - Table table = - CATALOG_EXTENSION.catalog().createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA); - List expectedRecords = RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L); - new GenericAppenderHelper(table, fileFormat, 
temporaryFolder).appendToTable(expectedRecords); - TestHelpers.assertRecords(run(), expectedRecords, TestFixtures.SCHEMA); - } - - @TestTemplate - public void testPartitionedTable() throws Exception { - String dateStr = "2020-03-20"; - Table table = - CATALOG_EXTENSION - .catalog() - .createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA, TestFixtures.SPEC); - List expectedRecords = RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L); - for (int i = 0; i < expectedRecords.size(); ++i) { - expectedRecords.get(i).setField("dt", dateStr); - } - - new GenericAppenderHelper(table, fileFormat, temporaryFolder) - .appendToTable(org.apache.iceberg.TestHelpers.Row.of(dateStr, 0), expectedRecords); - TestHelpers.assertRecords(run(), expectedRecords, TestFixtures.SCHEMA); - } - - @TestTemplate - public void testProjection() throws Exception { - Table table = - CATALOG_EXTENSION - .catalog() - .createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA, TestFixtures.SPEC); - List expectedRecords = RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L); - new GenericAppenderHelper(table, fileFormat, temporaryFolder) - .appendToTable(org.apache.iceberg.TestHelpers.Row.of("2020-03-20", 0), expectedRecords); - // select the "data" field (fieldId == 1) - Schema projectedSchema = TypeUtil.select(TestFixtures.SCHEMA, Sets.newHashSet(1)); - List expectedRows = - Arrays.asList(Row.of(expectedRecords.get(0).get(0)), Row.of(expectedRecords.get(1).get(0))); - TestHelpers.assertRows( - run(projectedSchema, Collections.emptyList(), Collections.emptyMap()), expectedRows); - } - - private List run() throws Exception { - return run(null, Collections.emptyList(), Collections.emptyMap()); - } - - private List run( - Schema projectedSchema, List filters, Map options) - throws Exception { - - StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); - env.setParallelism(parallelism); - env.getConfig().enableObjectReuse(); - - Configuration config = new Configuration(); - config.setInteger(FlinkConfigOptions.SOURCE_READER_FETCH_BATCH_RECORD_COUNT, 128); - Table table; - try (TableLoader tableLoader = CATALOG_EXTENSION.tableLoader()) { - tableLoader.open(); - table = tableLoader.loadTable(); - } - - AvroGenericRecordReaderFunction readerFunction = - new AvroGenericRecordReaderFunction( - TestFixtures.TABLE_IDENTIFIER.name(), - new Configuration(), - table.schema(), - null, - null, - false, - table.io(), - table.encryption(), - filters); - - IcebergSource.Builder sourceBuilder = - IcebergSource.builder() - .tableLoader(CATALOG_EXTENSION.tableLoader()) - .readerFunction(readerFunction) - .assignerFactory(new SimpleSplitAssignerFactory()) - .flinkConfig(config); - if (projectedSchema != null) { - sourceBuilder.project(projectedSchema); - } - - sourceBuilder.filters(filters); - sourceBuilder.setAll(options); - - Schema readSchema = projectedSchema != null ? projectedSchema : table.schema(); - RowType rowType = FlinkSchemaUtil.convert(readSchema); - org.apache.avro.Schema avroSchema = - AvroSchemaUtil.convert(readSchema, TestFixtures.TABLE_IDENTIFIER.name()); - - DataStream stream = - env.fromSource( - sourceBuilder.build(), - WatermarkStrategy.noWatermarks(), - "testBasicRead", - new GenericRecordAvroTypeInfo(avroSchema)) - // There are two reasons for converting GenericRecord back to Row. - // 1. Avro GenericRecord/Schema is not serializable. - // 2. leverage the TestHelpers.assertRecords for validation. 
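The two-step mapping the comment above describes is the heart of this test's read path: the source emits Avro GenericRecord, which is not Flink-serializable, so each record is mapped to RowData and then to Row before being collected. A compact sketch of that chain, assuming an already-built IcebergSource producing GenericRecord named source, an Iceberg readSchema, a StreamExecutionEnvironment env, and the same imports as this file:

  org.apache.avro.Schema avroSchema =
      AvroSchemaUtil.convert(readSchema, "test_table");            // Avro view of the Iceberg schema
  RowType rowType = FlinkSchemaUtil.convert(readSchema);           // Flink row type for the same schema

  DataStream<Row> rows =
      env.fromSource(
              source,
              WatermarkStrategy.noWatermarks(),
              "boundedGenericRecordRead",
              new GenericRecordAvroTypeInfo(avroSchema))           // serializable type info for GenericRecord
          .map(AvroGenericRecordToRowDataMapper.forAvroSchema(avroSchema))  // GenericRecord -> RowData
          .map(new RowDataToRowMapper(rowType));                   // RowData -> Row, so TestHelpers can assert

  try (CloseableIterator<Row> iter = rows.executeAndCollect()) {
    List<Row> results = Lists.newArrayList(iter);                  // collected rows for the assertions
  }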
- .map(AvroGenericRecordToRowDataMapper.forAvroSchema(avroSchema)) - .map(new RowDataToRowMapper(rowType)); - - try (CloseableIterator iter = stream.executeAndCollect()) { - return Lists.newArrayList(iter); - } - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedSql.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedSql.java deleted file mode 100644 index 0f41c5af4c95..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBoundedSql.java +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import java.io.IOException; -import java.util.List; -import java.util.Map; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.table.api.EnvironmentSettings; -import org.apache.flink.table.api.TableEnvironment; -import org.apache.flink.table.api.config.TableConfigOptions; -import org.apache.flink.types.Row; -import org.apache.iceberg.Schema; -import org.apache.iceberg.expressions.Expression; -import org.apache.iceberg.flink.FlinkConfigOptions; -import org.junit.jupiter.api.BeforeEach; - -public class TestIcebergSourceBoundedSql extends TestIcebergSourceBounded { - private volatile TableEnvironment tEnv; - - @BeforeEach - public void before() throws IOException { - Configuration tableConf = getTableEnv().getConfig().getConfiguration(); - tableConf.setBoolean(FlinkConfigOptions.TABLE_EXEC_ICEBERG_USE_FLIP27_SOURCE.key(), true); - SqlHelpers.sql( - getTableEnv(), - "create catalog iceberg_catalog with ('type'='iceberg', 'catalog-type'='hadoop', 'warehouse'='%s')", - CATALOG_EXTENSION.warehouse()); - SqlHelpers.sql(getTableEnv(), "use catalog iceberg_catalog"); - getTableEnv() - .getConfig() - .getConfiguration() - .set(TableConfigOptions.TABLE_DYNAMIC_TABLE_OPTIONS_ENABLED, true); - } - - private TableEnvironment getTableEnv() { - if (tEnv == null) { - synchronized (this) { - if (tEnv == null) { - this.tEnv = - TableEnvironment.create(EnvironmentSettings.newInstance().inBatchMode().build()); - } - } - } - return tEnv; - } - - @Override - protected List run( - Schema projectedSchema, - List filters, - Map options, - String sqlFilter, - String... 
sqlSelectedFields) - throws Exception { - String select = String.join(",", sqlSelectedFields); - String optionStr = SqlHelpers.sqlOptionsToString(options); - return SqlHelpers.sql(getTableEnv(), "select %s from t %s %s", select, optionStr, sqlFilter); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceContinuous.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceContinuous.java deleted file mode 100644 index 9c7006e16b8e..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceContinuous.java +++ /dev/null @@ -1,538 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; - -import java.nio.file.Path; -import java.time.Duration; -import java.util.Collection; -import java.util.List; -import java.util.concurrent.atomic.AtomicLong; -import java.util.stream.Collectors; -import org.apache.flink.api.common.JobID; -import org.apache.flink.api.common.JobStatus; -import org.apache.flink.api.common.eventtime.WatermarkStrategy; -import org.apache.flink.api.common.typeinfo.TypeInformation; -import org.apache.flink.client.program.ClusterClient; -import org.apache.flink.runtime.client.JobStatusMessage; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.table.data.RowData; -import org.apache.flink.test.junit5.InjectClusterClient; -import org.apache.flink.test.junit5.MiniClusterExtension; -import org.apache.flink.types.Row; -import org.apache.flink.util.CloseableIterator; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.data.GenericAppenderHelper; -import org.apache.iceberg.data.RandomGenericData; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.HadoopTableExtension; -import org.apache.iceberg.flink.MiniFlinkClusterExtension; -import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.flink.TestHelpers; -import org.apache.iceberg.flink.data.RowDataToRowMapper; -import org.apache.iceberg.flink.source.assigner.SimpleSplitAssignerFactory; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.awaitility.Awaitility; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.extension.RegisterExtension; -import org.junit.jupiter.api.io.TempDir; - -public class TestIcebergSourceContinuous { - - @TempDir protected Path temporaryFolder; - - @RegisterExtension - public static final MiniClusterExtension 
MINI_CLUSTER_EXTENSION = - MiniFlinkClusterExtension.createWithClassloaderCheckDisabled(); - - @RegisterExtension - private static final HadoopTableExtension TABLE_EXTENSION = - new HadoopTableExtension(TestFixtures.DATABASE, TestFixtures.TABLE, TestFixtures.SCHEMA); - - private final AtomicLong randomSeed = new AtomicLong(0L); - - @Test - public void testTableScanThenIncremental() throws Exception { - GenericAppenderHelper dataAppender = - new GenericAppenderHelper(TABLE_EXTENSION.table(), FileFormat.PARQUET, temporaryFolder); - - // snapshot1 - List batch1 = - RandomGenericData.generate( - TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); - dataAppender.appendToTable(batch1); - - ScanContext scanContext = - ScanContext.builder() - .streaming(true) - .monitorInterval(Duration.ofMillis(10L)) - .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) - .build(); - - try (CloseableIterator iter = - createStream(scanContext).executeAndCollect(getClass().getSimpleName())) { - List result1 = waitForResult(iter, 2); - TestHelpers.assertRecords(result1, batch1, TABLE_EXTENSION.table().schema()); - - // snapshot2 - List batch2 = - RandomGenericData.generate( - TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); - dataAppender.appendToTable(batch2); - TABLE_EXTENSION.table().currentSnapshot().snapshotId(); - - List result2 = waitForResult(iter, 2); - TestHelpers.assertRecords(result2, batch2, TABLE_EXTENSION.table().schema()); - - // snapshot3 - List batch3 = - RandomGenericData.generate( - TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); - dataAppender.appendToTable(batch3); - TABLE_EXTENSION.table().currentSnapshot().snapshotId(); - - List result3 = waitForResult(iter, 2); - TestHelpers.assertRecords(result3, batch3, TABLE_EXTENSION.table().schema()); - } - } - - @Test - public void testTableScanThenIncrementalAfterExpiration() throws Exception { - GenericAppenderHelper dataAppender = - new GenericAppenderHelper(TABLE_EXTENSION.table(), FileFormat.PARQUET, temporaryFolder); - - // snapshot1 - List batch1 = - RandomGenericData.generate( - TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); - dataAppender.appendToTable(batch1); - long snapshotId = TABLE_EXTENSION.table().currentSnapshot().snapshotId(); - - // snapshot2 - List batch2 = - RandomGenericData.generate( - TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); - dataAppender.appendToTable(batch2); - - TABLE_EXTENSION.table().expireSnapshots().expireSnapshotId(snapshotId).commit(); - - assertThat(TABLE_EXTENSION.table().history()).hasSize(1); - - ScanContext scanContext = - ScanContext.builder() - .streaming(true) - .monitorInterval(Duration.ofMillis(10L)) - .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) - .build(); - - assertThat(FlinkSplitPlanner.checkScanMode(scanContext)) - .isEqualTo(FlinkSplitPlanner.ScanMode.BATCH); - - try (CloseableIterator iter = - createStream(scanContext).executeAndCollect(getClass().getSimpleName())) { - List result1 = waitForResult(iter, 4); - List initialRecords = Lists.newArrayList(); - initialRecords.addAll(batch1); - initialRecords.addAll(batch2); - TestHelpers.assertRecords(result1, initialRecords, TABLE_EXTENSION.table().schema()); - - // snapshot3 - List batch3 = - RandomGenericData.generate( - TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); - dataAppender.appendToTable(batch3); - TABLE_EXTENSION.table().currentSnapshot().snapshotId(); - - List result3 = 
waitForResult(iter, 2); - TestHelpers.assertRecords(result3, batch3, TABLE_EXTENSION.table().schema()); - } - } - - @Test - public void testEarliestSnapshot() throws Exception { - GenericAppenderHelper dataAppender = - new GenericAppenderHelper(TABLE_EXTENSION.table(), FileFormat.PARQUET, temporaryFolder); - - // snapshot0 - List batch0 = - RandomGenericData.generate( - TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); - dataAppender.appendToTable(batch0); - - // snapshot1 - List batch1 = - RandomGenericData.generate( - TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); - dataAppender.appendToTable(batch1); - - ScanContext scanContext = - ScanContext.builder() - .streaming(true) - .monitorInterval(Duration.ofMillis(10L)) - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_EARLIEST_SNAPSHOT) - .build(); - - try (CloseableIterator iter = - createStream(scanContext).executeAndCollect(getClass().getSimpleName())) { - List result1 = waitForResult(iter, 4); - List combinedBatch0AndBatch1 = Lists.newArrayList(batch0); - combinedBatch0AndBatch1.addAll(batch1); - TestHelpers.assertRecords(result1, combinedBatch0AndBatch1, TABLE_EXTENSION.table().schema()); - - // snapshot2 - List batch2 = - RandomGenericData.generate( - TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); - dataAppender.appendToTable(batch2); - - List result2 = waitForResult(iter, 2); - TestHelpers.assertRecords(result2, batch2, TABLE_EXTENSION.table().schema()); - - // snapshot3 - List batch3 = - RandomGenericData.generate( - TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); - dataAppender.appendToTable(batch3); - - List result3 = waitForResult(iter, 2); - TestHelpers.assertRecords(result3, batch3, TABLE_EXTENSION.table().schema()); - } - } - - @Test - public void testLatestSnapshot(@InjectClusterClient ClusterClient clusterClient) - throws Exception { - GenericAppenderHelper dataAppender = - new GenericAppenderHelper(TABLE_EXTENSION.table(), FileFormat.PARQUET, temporaryFolder); - - // snapshot0 - List batch0 = - RandomGenericData.generate( - TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); - dataAppender.appendToTable(batch0); - - // snapshot1 - List batch1 = - RandomGenericData.generate( - TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); - dataAppender.appendToTable(batch1); - - ScanContext scanContext = - ScanContext.builder() - .streaming(true) - .monitorInterval(Duration.ofMillis(10L)) - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_LATEST_SNAPSHOT) - .build(); - - try (CloseableIterator iter = - createStream(scanContext).executeAndCollect(getClass().getSimpleName())) { - // we want to make sure job is running first so that enumerator can - // start from the latest snapshot before inserting the next batch2 below. 
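The ordering note above is what makes the latest-snapshot test deterministic: with INCREMENTAL_FROM_LATEST_SNAPSHOT the enumerator pins its starting point when the job comes up, so new batches are only appended once the job is RUNNING. A minimal sketch of that sequencing, reusing the createStream, waitUntilJobIsRunning and waitForResult helpers defined later in this class, with the injected clusterClient, the test's dataAppender, and a batch2 assumed to be the first data committed after startup:

  ScanContext latestOnly =
      ScanContext.builder()
          .streaming(true)                                   // keep monitoring for new snapshots
          .monitorInterval(Duration.ofMillis(10L))
          .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_LATEST_SNAPSHOT)
          .build();

  try (CloseableIterator<Row> iter =
      createStream(latestOnly).executeAndCollect("latestSnapshotOrdering")) {
    waitUntilJobIsRunning(clusterClient);     // the starting snapshot is now fixed
    dataAppender.appendToTable(batch2);       // appended after startup, so it must show up downstream
    List<Row> result = waitForResult(iter, 2); // under this sketch's assumptions these are batch2's rows
  }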
- waitUntilJobIsRunning(clusterClient); - - // inclusive behavior for starting snapshot - List result1 = waitForResult(iter, 2); - TestHelpers.assertRecords(result1, batch1, TABLE_EXTENSION.table().schema()); - - // snapshot2 - List batch2 = - RandomGenericData.generate( - TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); - dataAppender.appendToTable(batch2); - - List result2 = waitForResult(iter, 2); - TestHelpers.assertRecords(result2, batch2, TABLE_EXTENSION.table().schema()); - - // snapshot3 - List batch3 = - RandomGenericData.generate( - TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); - dataAppender.appendToTable(batch3); - - List result3 = waitForResult(iter, 2); - TestHelpers.assertRecords(result3, batch3, TABLE_EXTENSION.table().schema()); - } - } - - @Test - public void testSpecificSnapshotId() throws Exception { - GenericAppenderHelper dataAppender = - new GenericAppenderHelper(TABLE_EXTENSION.table(), FileFormat.PARQUET, temporaryFolder); - - // snapshot0 - List batch0 = - RandomGenericData.generate( - TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); - dataAppender.appendToTable(batch0); - long snapshot0 = TABLE_EXTENSION.table().currentSnapshot().snapshotId(); - - // snapshot1 - List batch1 = - RandomGenericData.generate( - TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); - dataAppender.appendToTable(batch1); - long snapshot1 = TABLE_EXTENSION.table().currentSnapshot().snapshotId(); - - ScanContext scanContext = - ScanContext.builder() - .streaming(true) - .monitorInterval(Duration.ofMillis(10L)) - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) - .startSnapshotId(snapshot1) - .build(); - - try (CloseableIterator iter = - createStream(scanContext).executeAndCollect(getClass().getSimpleName())) { - List result1 = waitForResult(iter, 2); - TestHelpers.assertRecords(result1, batch1, TABLE_EXTENSION.table().schema()); - - // snapshot2 - List batch2 = - RandomGenericData.generate( - TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); - dataAppender.appendToTable(batch2); - - List result2 = waitForResult(iter, 2); - TestHelpers.assertRecords(result2, batch2, TABLE_EXTENSION.table().schema()); - - // snapshot3 - List batch3 = - RandomGenericData.generate( - TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); - dataAppender.appendToTable(batch3); - - List result3 = waitForResult(iter, 2); - TestHelpers.assertRecords(result3, batch3, TABLE_EXTENSION.table().schema()); - } - } - - @Test - public void testSpecificSnapshotTimestamp() throws Exception { - GenericAppenderHelper dataAppender = - new GenericAppenderHelper(TABLE_EXTENSION.table(), FileFormat.PARQUET, temporaryFolder); - - // snapshot0 - List batch0 = - RandomGenericData.generate( - TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); - dataAppender.appendToTable(batch0); - long snapshot0Timestamp = TABLE_EXTENSION.table().currentSnapshot().timestampMillis(); - - // sleep for 2 ms to make sure snapshot1 has a higher timestamp value - Thread.sleep(2); - - // snapshot1 - List batch1 = - RandomGenericData.generate( - TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); - dataAppender.appendToTable(batch1); - long snapshot1Timestamp = TABLE_EXTENSION.table().currentSnapshot().timestampMillis(); - - ScanContext scanContext = - ScanContext.builder() - .streaming(true) - .monitorInterval(Duration.ofMillis(10L)) - 
.startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) - .startSnapshotTimestamp(snapshot1Timestamp) - .build(); - - try (CloseableIterator iter = - createStream(scanContext).executeAndCollect(getClass().getSimpleName())) { - // consume data from snapshot1 - List result1 = waitForResult(iter, 2); - TestHelpers.assertRecords(result1, batch1, TABLE_EXTENSION.table().schema()); - - // snapshot2 - List batch2 = - RandomGenericData.generate( - TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); - dataAppender.appendToTable(batch2); - - List result2 = waitForResult(iter, 2); - TestHelpers.assertRecords(result2, batch2, TABLE_EXTENSION.table().schema()); - - // snapshot3 - List batch3 = - RandomGenericData.generate( - TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); - dataAppender.appendToTable(batch3); - - List result3 = waitForResult(iter, 2); - TestHelpers.assertRecords(result3, batch3, TABLE_EXTENSION.table().schema()); - } - } - - @Test - public void testReadingFromBranch() throws Exception { - String branch = "b1"; - GenericAppenderHelper dataAppender = - new GenericAppenderHelper(TABLE_EXTENSION.table(), FileFormat.PARQUET, temporaryFolder); - - List batchBase = - RandomGenericData.generate( - TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); - dataAppender.appendToTable(batchBase); - - // create branch - TABLE_EXTENSION - .table() - .manageSnapshots() - .createBranch(branch, TABLE_EXTENSION.table().currentSnapshot().snapshotId()) - .commit(); - - // snapshot1 to branch - List batch1 = - RandomGenericData.generate( - TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); - dataAppender.appendToTable(branch, batch1); - - // snapshot2 to branch - List batch2 = - RandomGenericData.generate( - TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); - dataAppender.appendToTable(branch, batch2); - - List branchExpectedRecords = Lists.newArrayList(); - branchExpectedRecords.addAll(batchBase); - branchExpectedRecords.addAll(batch1); - branchExpectedRecords.addAll(batch2); - // reads from branch: it should contain the first snapshot (before the branch creation) followed - // by the next 2 snapshots added - ScanContext scanContext = - ScanContext.builder() - .streaming(true) - .monitorInterval(Duration.ofMillis(10L)) - .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) - .useBranch(branch) - .build(); - - try (CloseableIterator iter = - createStream(scanContext).executeAndCollect(getClass().getSimpleName())) { - List resultMain = waitForResult(iter, 6); - TestHelpers.assertRecords( - resultMain, branchExpectedRecords, TABLE_EXTENSION.table().schema()); - - // snapshot3 to branch - List batch3 = - RandomGenericData.generate( - TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); - dataAppender.appendToTable(branch, batch3); - - List result3 = waitForResult(iter, 2); - TestHelpers.assertRecords(result3, batch3, TABLE_EXTENSION.table().schema()); - - // snapshot4 to branch - List batch4 = - RandomGenericData.generate( - TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); - dataAppender.appendToTable(branch, batch4); - - List result4 = waitForResult(iter, 2); - TestHelpers.assertRecords(result4, batch4, TABLE_EXTENSION.table().schema()); - } - - // read only from main branch. 
Should contain only the first snapshot - scanContext = - ScanContext.builder() - .streaming(true) - .monitorInterval(Duration.ofMillis(10L)) - .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) - .build(); - try (CloseableIterator iter = - createStream(scanContext).executeAndCollect(getClass().getSimpleName())) { - List resultMain = waitForResult(iter, 2); - TestHelpers.assertRecords(resultMain, batchBase, TABLE_EXTENSION.table().schema()); - - List batchMain2 = - RandomGenericData.generate( - TABLE_EXTENSION.table().schema(), 2, randomSeed.incrementAndGet()); - dataAppender.appendToTable(batchMain2); - resultMain = waitForResult(iter, 2); - TestHelpers.assertRecords(resultMain, batchMain2, TABLE_EXTENSION.table().schema()); - } - } - - @Test - public void testValidation() { - assertThatThrownBy( - () -> - IcebergSource.forRowData() - .tableLoader(TABLE_EXTENSION.tableLoader()) - .assignerFactory(new SimpleSplitAssignerFactory()) - .streaming(true) - .endTag("tag") - .build()) - .hasMessage("Cannot set end-tag option for streaming reader") - .isInstanceOf(IllegalArgumentException.class); - } - - private DataStream createStream(ScanContext scanContext) throws Exception { - // start the source and collect output - StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); - env.setParallelism(1); - DataStream stream = - env.fromSource( - IcebergSource.forRowData() - .tableLoader(TABLE_EXTENSION.tableLoader()) - .assignerFactory(new SimpleSplitAssignerFactory()) - .streaming(scanContext.isStreaming()) - .streamingStartingStrategy(scanContext.streamingStartingStrategy()) - .startSnapshotTimestamp(scanContext.startSnapshotTimestamp()) - .startSnapshotId(scanContext.startSnapshotId()) - .monitorInterval(Duration.ofMillis(10L)) - .branch(scanContext.branch()) - .build(), - WatermarkStrategy.noWatermarks(), - "icebergSource", - TypeInformation.of(RowData.class)) - .map(new RowDataToRowMapper(FlinkSchemaUtil.convert(TABLE_EXTENSION.table().schema()))); - return stream; - } - - public static List waitForResult(CloseableIterator iter, int limit) { - List results = Lists.newArrayListWithCapacity(limit); - while (results.size() < limit) { - if (iter.hasNext()) { - results.add(iter.next()); - } else { - break; - } - } - return results; - } - - public static void waitUntilJobIsRunning(ClusterClient client) { - Awaitility.await("job should be running") - .atMost(Duration.ofSeconds(30)) - .pollInterval(Duration.ofMillis(10)) - .untilAsserted(() -> assertThat(getRunningJobs(client)).isNotEmpty()); - } - - public static List getRunningJobs(ClusterClient client) throws Exception { - Collection statusMessages = client.listJobs().get(); - return statusMessages.stream() - .filter(status -> status.getJobState() == JobStatus.RUNNING) - .map(JobStatusMessage::getJobId) - .collect(Collectors.toList()); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceFailover.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceFailover.java deleted file mode 100644 index 938ae4d9bb0a..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceFailover.java +++ /dev/null @@ -1,394 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import static org.apache.iceberg.flink.SimpleDataUtil.tableRecords; -import static org.assertj.core.api.Assertions.assertThat; - -import java.nio.file.Path; -import java.time.Duration; -import java.util.List; -import java.util.concurrent.CompletableFuture; -import java.util.concurrent.CountDownLatch; -import java.util.concurrent.atomic.AtomicInteger; -import org.apache.flink.api.common.JobID; -import org.apache.flink.api.common.eventtime.WatermarkStrategy; -import org.apache.flink.api.common.restartstrategy.RestartStrategies; -import org.apache.flink.api.common.typeinfo.TypeInformation; -import org.apache.flink.client.program.ClusterClient; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.core.execution.JobClient; -import org.apache.flink.core.execution.SavepointFormatType; -import org.apache.flink.runtime.highavailability.nonha.embedded.HaLeadershipControl; -import org.apache.flink.runtime.jobgraph.SavepointConfigOptions; -import org.apache.flink.runtime.minicluster.MiniCluster; -import org.apache.flink.runtime.minicluster.RpcServiceSharing; -import org.apache.flink.runtime.testutils.MiniClusterResourceConfiguration; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.table.data.RowData; -import org.apache.flink.test.junit5.InjectClusterClient; -import org.apache.flink.test.junit5.MiniClusterExtension; -import org.apache.flink.test.util.MiniClusterWithClientResource; -import org.apache.flink.util.function.ThrowingConsumer; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.data.GenericAppenderHelper; -import org.apache.iceberg.data.RandomGenericData; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.FlinkConfigOptions; -import org.apache.iceberg.flink.FlinkReadOptions; -import org.apache.iceberg.flink.HadoopCatalogExtension; -import org.apache.iceberg.flink.SimpleDataUtil; -import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.flink.sink.FlinkSink; -import org.apache.iceberg.flink.source.assigner.SimpleSplitAssignerFactory; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Disabled; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.Timeout; -import org.junit.jupiter.api.extension.RegisterExtension; -import org.junit.jupiter.api.io.TempDir; - -@Timeout(value = 120) -public class TestIcebergSourceFailover { - - // Parallelism higher than 1, but lower than the number of splits used by some of our tests - // The goal is to allow some splits to remain in the enumerator when restoring the state - private static final int PARALLELISM = 2; - private 
static final int DO_NOT_FAIL = Integer.MAX_VALUE; - protected static final MiniClusterResourceConfiguration MINI_CLUSTER_RESOURCE_CONFIG = - new MiniClusterResourceConfiguration.Builder() - .setNumberTaskManagers(1) - .setNumberSlotsPerTaskManager(PARALLELISM) - .setRpcServiceSharing(RpcServiceSharing.DEDICATED) - .withHaLeadershipControl() - .build(); - - @RegisterExtension - public static final MiniClusterExtension MINI_CLUSTER_EXTENSION = - new MiniClusterExtension(MINI_CLUSTER_RESOURCE_CONFIG); - - @TempDir protected Path temporaryFolder; - - @RegisterExtension - protected static final HadoopCatalogExtension SOURCE_CATALOG_EXTENSION = - new HadoopCatalogExtension(TestFixtures.DATABASE, TestFixtures.TABLE); - - @RegisterExtension - protected static final HadoopCatalogExtension SINK_CATALOG_EXTENSION = - new HadoopCatalogExtension(TestFixtures.DATABASE, TestFixtures.SINK_TABLE); - - protected Table sourceTable; - protected Table sinkTable; - - @BeforeEach - protected void setupTable() { - this.sourceTable = - SOURCE_CATALOG_EXTENSION - .catalog() - .createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA); - this.sinkTable = - SINK_CATALOG_EXTENSION - .catalog() - .createTable(TestFixtures.SINK_TABLE_IDENTIFIER, TestFixtures.SCHEMA); - } - - protected IcebergSource.Builder sourceBuilder() { - Configuration config = new Configuration(); - return IcebergSource.forRowData() - .tableLoader(SOURCE_CATALOG_EXTENSION.tableLoader()) - .assignerFactory(new SimpleSplitAssignerFactory()) - // Prevent combining splits - .set( - FlinkReadOptions.SPLIT_FILE_OPEN_COST, - Long.toString(TableProperties.SPLIT_SIZE_DEFAULT)) - .flinkConfig(config); - } - - protected Schema schema() { - return TestFixtures.SCHEMA; - } - - protected List generateRecords(int numRecords, long seed) { - return RandomGenericData.generate(schema(), numRecords, seed); - } - - protected void assertRecords(Table table, List expectedRecords, Duration timeout) - throws Exception { - SimpleDataUtil.assertTableRecords(table, expectedRecords, timeout); - } - - @Disabled("Disabled for now as it is flaky on CI") - @Test - public void testBoundedWithSavepoint(@InjectClusterClient ClusterClient clusterClient) - throws Exception { - List expectedRecords = Lists.newArrayList(); - GenericAppenderHelper dataAppender = - new GenericAppenderHelper(sourceTable, FileFormat.PARQUET, temporaryFolder); - for (int i = 0; i < 4; ++i) { - List records = generateRecords(2, i); - expectedRecords.addAll(records); - dataAppender.appendToTable(records); - } - - StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); - createBoundedStreams(env, 2); - - JobClient jobClient = env.executeAsync("Bounded Iceberg Source Savepoint Test"); - JobID jobId = jobClient.getJobID(); - - // Write something, but do not finish before checkpoint is created - RecordCounterToWait.waitForCondition(); - CompletableFuture savepoint = - clusterClient.stopWithSavepoint( - jobId, false, temporaryFolder.toString(), SavepointFormatType.CANONICAL); - RecordCounterToWait.continueProcessing(); - - // Wait for the job to stop with the savepoint - String savepointPath = savepoint.get(); - - // We expect that at least a few records has written - assertThat(tableRecords(sinkTable)).hasSizeGreaterThan(0); - - // New env from the savepoint - Configuration conf = new Configuration(); - conf.set(SavepointConfigOptions.SAVEPOINT_PATH, savepointPath); - env = StreamExecutionEnvironment.getExecutionEnvironment(conf); - createBoundedStreams(env, DO_NOT_FAIL); - - 
env.execute("Bounded Iceberg Source Savepoint Test"); - - // We expect no duplications - assertRecords(sinkTable, expectedRecords, Duration.ofSeconds(120)); - } - - @Test - public void testBoundedWithTaskManagerFailover() throws Exception { - runTestWithNewMiniCluster( - miniCluster -> testBoundedIcebergSource(FailoverType.TM, miniCluster)); - } - - @Test - public void testBoundedWithJobManagerFailover() throws Exception { - runTestWithNewMiniCluster( - miniCluster -> testBoundedIcebergSource(FailoverType.JM, miniCluster)); - } - - private void testBoundedIcebergSource(FailoverType failoverType, MiniCluster miniCluster) - throws Exception { - List expectedRecords = Lists.newArrayList(); - GenericAppenderHelper dataAppender = - new GenericAppenderHelper(sourceTable, FileFormat.PARQUET, temporaryFolder); - for (int i = 0; i < 4; ++i) { - List records = generateRecords(2, i); - expectedRecords.addAll(records); - dataAppender.appendToTable(records); - } - - StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); - env.setRestartStrategy(RestartStrategies.fixedDelayRestart(1, 0)); - createBoundedStreams(env, 2); - - JobClient jobClient = env.executeAsync("Bounded Iceberg Source Failover Test"); - JobID jobId = jobClient.getJobID(); - - RecordCounterToWait.waitForCondition(); - triggerFailover(failoverType, jobId, RecordCounterToWait::continueProcessing, miniCluster); - - assertRecords(sinkTable, expectedRecords, Duration.ofSeconds(120)); - } - - @Test - public void testContinuousWithTaskManagerFailover() throws Exception { - runTestWithNewMiniCluster( - miniCluster -> testContinuousIcebergSource(FailoverType.TM, miniCluster)); - } - - @Test - public void testContinuousWithJobManagerFailover() throws Exception { - runTestWithNewMiniCluster( - miniCluster -> testContinuousIcebergSource(FailoverType.JM, miniCluster)); - } - - private void testContinuousIcebergSource(FailoverType failoverType, MiniCluster miniCluster) - throws Exception { - GenericAppenderHelper dataAppender = - new GenericAppenderHelper(sourceTable, FileFormat.PARQUET, temporaryFolder); - List expectedRecords = Lists.newArrayList(); - - List batch = generateRecords(2, 0); - expectedRecords.addAll(batch); - dataAppender.appendToTable(batch); - - StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); - env.setParallelism(PARALLELISM); - env.enableCheckpointing(10L); - Configuration config = new Configuration(); - config.setInteger(FlinkConfigOptions.SOURCE_READER_FETCH_BATCH_RECORD_COUNT, 128); - - DataStream stream = - env.fromSource( - sourceBuilder() - .streaming(true) - .monitorInterval(Duration.ofMillis(10)) - .streamingStartingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) - .build(), - WatermarkStrategy.noWatermarks(), - "IcebergSource", - TypeInformation.of(RowData.class)); - - // CollectStreamSink from DataStream#executeAndCollect() doesn't guarantee - // exactly-once behavior. When Iceberg sink, we can verify end-to-end - // exactly-once. Here we mainly about source exactly-once behavior. 
- FlinkSink.forRowData(stream) - .table(sinkTable) - .tableLoader(SINK_CATALOG_EXTENSION.tableLoader()) - .append(); - - JobClient jobClient = env.executeAsync("Continuous Iceberg Source Failover Test"); - JobID jobId = jobClient.getJobID(); - - for (int i = 1; i < 5; i++) { - Thread.sleep(10); - List records = generateRecords(2, i); - expectedRecords.addAll(records); - dataAppender.appendToTable(records); - if (i == 2) { - triggerFailover(failoverType, jobId, () -> {}, miniCluster); - } - } - - // wait longer for continuous source to reduce flakiness - // because CI servers tend to be overloaded. - assertRecords(sinkTable, expectedRecords, Duration.ofSeconds(120)); - } - - private void createBoundedStreams(StreamExecutionEnvironment env, int failAfter) { - env.setParallelism(PARALLELISM); - - DataStream stream = - env.fromSource( - sourceBuilder().build(), - WatermarkStrategy.noWatermarks(), - "IcebergSource", - TypeInformation.of(RowData.class)); - - DataStream streamFailingInTheMiddleOfReading = - RecordCounterToWait.wrapWithFailureAfter(stream, failAfter); - - // CollectStreamSink from DataStream#executeAndCollect() doesn't guarantee - // exactly-once behavior. When Iceberg sink, we can verify end-to-end - // exactly-once. Here we mainly about source exactly-once behavior. - FlinkSink.forRowData(streamFailingInTheMiddleOfReading) - .table(sinkTable) - .tableLoader(SINK_CATALOG_EXTENSION.tableLoader()) - .append(); - } - - // ------------------------------------------------------------------------ - // test utilities copied from Flink's FileSourceTextLinesITCase - // ------------------------------------------------------------------------ - - private static void runTestWithNewMiniCluster(ThrowingConsumer testMethod) - throws Exception { - MiniClusterWithClientResource miniCluster = null; - try { - miniCluster = new MiniClusterWithClientResource(MINI_CLUSTER_RESOURCE_CONFIG); - miniCluster.before(); - testMethod.accept(miniCluster.getMiniCluster()); - } finally { - if (miniCluster != null) { - miniCluster.after(); - } - } - } - - private enum FailoverType { - NONE, - TM, - JM - } - - private static void triggerFailover( - FailoverType type, JobID jobId, Runnable afterFailAction, MiniCluster miniCluster) - throws Exception { - switch (type) { - case NONE: - afterFailAction.run(); - break; - case TM: - restartTaskManager(afterFailAction, miniCluster); - break; - case JM: - triggerJobManagerFailover(jobId, afterFailAction, miniCluster); - break; - } - } - - private static void triggerJobManagerFailover( - JobID jobId, Runnable afterFailAction, MiniCluster miniCluster) throws Exception { - HaLeadershipControl haLeadershipControl = miniCluster.getHaLeadershipControl().get(); - haLeadershipControl.revokeJobMasterLeadership(jobId).get(); - afterFailAction.run(); - haLeadershipControl.grantJobMasterLeadership(jobId).get(); - } - - private static void restartTaskManager(Runnable afterFailAction, MiniCluster miniCluster) - throws Exception { - miniCluster.terminateTaskManager(0).get(); - afterFailAction.run(); - miniCluster.startTaskManager(); - } - - private static class RecordCounterToWait { - - private static AtomicInteger records; - private static CountDownLatch countDownLatch; - private static CompletableFuture continueProcessing; - - private static DataStream wrapWithFailureAfter(DataStream stream, int condition) { - - records = new AtomicInteger(); - continueProcessing = new CompletableFuture<>(); - countDownLatch = new CountDownLatch(stream.getParallelism()); - return stream.map( - record 
-> { - boolean reachedFailPoint = records.incrementAndGet() > condition; - boolean notFailedYet = countDownLatch.getCount() != 0; - if (notFailedYet && reachedFailPoint) { - countDownLatch.countDown(); - continueProcessing.get(); - } - return record; - }); - } - - private static void waitForCondition() throws InterruptedException { - countDownLatch.await(); - } - - private static void continueProcessing() { - continueProcessing.complete(null); - } - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceFailoverWithWatermarkExtractor.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceFailoverWithWatermarkExtractor.java deleted file mode 100644 index 4f61d2f7308a..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceFailoverWithWatermarkExtractor.java +++ /dev/null @@ -1,130 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import java.time.Duration; -import java.time.Instant; -import java.time.LocalDateTime; -import java.time.ZoneId; -import java.time.ZoneOffset; -import java.util.List; -import java.util.concurrent.TimeUnit; -import java.util.concurrent.atomic.AtomicLong; -import java.util.stream.Collectors; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.data.RandomGenericData; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.FlinkReadOptions; -import org.apache.iceberg.flink.SimpleDataUtil; -import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.types.Comparators; -import org.apache.iceberg.util.StructLikeWrapper; -import org.awaitility.Awaitility; -import org.junit.jupiter.api.BeforeEach; - -public class TestIcebergSourceFailoverWithWatermarkExtractor extends TestIcebergSourceFailover { - // Increment ts by 15 minutes for each generateRecords batch - private static final long RECORD_BATCH_TS_INCREMENT_MILLI = TimeUnit.MINUTES.toMillis(15); - // Within a batch, increment ts by 1 second - private static final long RECORD_TS_INCREMENT_MILLI = TimeUnit.SECONDS.toMillis(1); - - private final AtomicLong tsMilli = new AtomicLong(System.currentTimeMillis()); - - @Override - @BeforeEach - protected void setupTable() { - this.sourceTable = - SOURCE_CATALOG_EXTENSION - .catalog() - .createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.TS_SCHEMA); - this.sinkTable = - SINK_CATALOG_EXTENSION - .catalog() - .createTable(TestFixtures.SINK_TABLE_IDENTIFIER, TestFixtures.TS_SCHEMA); - } - - @Override - protected IcebergSource.Builder 
sourceBuilder() { - Configuration config = new Configuration(); - return IcebergSource.forRowData() - .tableLoader(SOURCE_CATALOG_EXTENSION.tableLoader()) - .watermarkColumn("ts") - .project(TestFixtures.TS_SCHEMA) - // Prevent combining splits - .set( - FlinkReadOptions.SPLIT_FILE_OPEN_COST, - Long.toString(TableProperties.SPLIT_SIZE_DEFAULT)) - .flinkConfig(config); - } - - @Override - protected Schema schema() { - return TestFixtures.TS_SCHEMA; - } - - @Override - protected List generateRecords(int numRecords, long seed) { - // Override the ts field to create a more realistic situation for event time alignment - tsMilli.addAndGet(RECORD_BATCH_TS_INCREMENT_MILLI); - return RandomGenericData.generate(schema(), numRecords, seed).stream() - .peek( - record -> { - LocalDateTime ts = - LocalDateTime.ofInstant( - Instant.ofEpochMilli(tsMilli.addAndGet(RECORD_TS_INCREMENT_MILLI)), - ZoneId.of("Z")); - record.setField("ts", ts); - }) - .collect(Collectors.toList()); - } - - /** - * This override is needed because {@link Comparators} used by {@link StructLikeWrapper} retrieves - * Timestamp type using Long type as inner class, while the {@link RandomGenericData} generates - * {@link LocalDateTime} for {@code TimestampType.withoutZone()}. This method normalizes the - * {@link LocalDateTime} to a Long type so that Comparators can continue to work. - */ - @Override - protected void assertRecords(Table table, List expectedRecords, Duration timeout) - throws Exception { - List expectedNormalized = convertLocalDateTimeToMilli(expectedRecords); - Awaitility.await("expected list of records should be produced") - .atMost(timeout) - .untilAsserted( - () -> - SimpleDataUtil.assertRecordsEqual( - expectedNormalized, - convertLocalDateTimeToMilli(SimpleDataUtil.tableRecords(table)), - table.schema())); - } - - private List convertLocalDateTimeToMilli(List records) { - return records.stream() - .peek( - r -> { - LocalDateTime localDateTime = ((LocalDateTime) r.getField("ts")); - r.setField("ts", localDateTime.atZone(ZoneOffset.UTC).toInstant().toEpochMilli()); - }) - .collect(Collectors.toList()); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceReaderDeletes.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceReaderDeletes.java deleted file mode 100644 index df148c212ebd..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceReaderDeletes.java +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source; - -import java.io.IOException; -import java.util.List; -import java.util.Map; -import org.apache.flink.api.common.eventtime.WatermarkStrategy; -import org.apache.flink.api.common.typeinfo.TypeInformation; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.test.junit5.MiniClusterExtension; -import org.apache.flink.util.CloseableIterator; -import org.apache.hadoop.hive.conf.HiveConf; -import org.apache.iceberg.CatalogProperties; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.flink.CatalogLoader; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.MiniFlinkClusterExtension; -import org.apache.iceberg.flink.RowDataWrapper; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.flink.source.assigner.SimpleSplitAssignerFactory; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.util.StructLikeSet; -import org.junit.jupiter.api.extension.RegisterExtension; - -public class TestIcebergSourceReaderDeletes extends TestFlinkReaderDeletesBase { - - private static final int PARALLELISM = 4; - - @RegisterExtension - private static final MiniClusterExtension MINI_CLUSTER = - MiniFlinkClusterExtension.createWithClassloaderCheckDisabled(); - - @Override - protected StructLikeSet rowSet(String tableName, Table testTable, String... columns) - throws IOException { - Schema projected = testTable.schema().select(columns); - RowType rowType = FlinkSchemaUtil.convert(projected); - - Map properties = Maps.newHashMap(); - properties.put( - CatalogProperties.WAREHOUSE_LOCATION, - hiveConf.get(HiveConf.ConfVars.METASTOREWAREHOUSE.varname)); - properties.put(CatalogProperties.URI, hiveConf.get(HiveConf.ConfVars.METASTOREURIS.varname)); - properties.put( - CatalogProperties.CLIENT_POOL_SIZE, - Integer.toString(hiveConf.getInt("iceberg.hive.client-pool-size", 5))); - CatalogLoader hiveCatalogLoader = CatalogLoader.hive(catalog.name(), hiveConf, properties); - TableLoader hiveTableLoader = - TableLoader.fromCatalog(hiveCatalogLoader, TableIdentifier.of("default", tableName)); - hiveTableLoader.open(); - try (TableLoader tableLoader = hiveTableLoader) { - StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); - env.setParallelism(1); - DataStream stream = - env.fromSource( - IcebergSource.builder() - .tableLoader(tableLoader) - .assignerFactory(new SimpleSplitAssignerFactory()) - .project(projected) - .build(), - WatermarkStrategy.noWatermarks(), - "testBasicRead", - TypeInformation.of(RowData.class)); - - try (CloseableIterator iter = stream.executeAndCollect()) { - List rowDataList = Lists.newArrayList(iter); - StructLikeSet set = StructLikeSet.create(projected.asStruct()); - rowDataList.forEach( - rowData -> { - RowDataWrapper wrapper = new RowDataWrapper(rowType, projected.asStruct()); - set.add(wrapper.wrap(rowData)); - }); - return set; - } catch (Exception e) { - throw new IOException("Failed to collect result", e); - } - } - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceSql.java 
b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceSql.java deleted file mode 100644 index 75f0a785a8c5..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceSql.java +++ /dev/null @@ -1,158 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import static org.apache.iceberg.types.Types.NestedField.required; - -import java.io.IOException; -import java.time.Instant; -import java.time.ZoneId; -import java.util.List; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.table.api.TableEnvironment; -import org.apache.flink.table.api.config.TableConfigOptions; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.data.GenericAppenderHelper; -import org.apache.iceberg.data.GenericRecord; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.FlinkConfigOptions; -import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.flink.TestHelpers; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Types; -import org.junit.jupiter.api.Test; - -/** Use the IcebergSource (FLIP-27) */ -public class TestIcebergSourceSql extends TestSqlBase { - private static final Schema SCHEMA_TS = - new Schema( - required(1, "t1", Types.TimestampType.withoutZone()), - required(2, "t2", Types.LongType.get())); - - @Override - public void before() throws IOException { - TableEnvironment tableEnvironment = getTableEnv(); - Configuration tableConf = tableEnvironment.getConfig().getConfiguration(); - tableConf.setBoolean(FlinkConfigOptions.TABLE_EXEC_ICEBERG_USE_FLIP27_SOURCE.key(), true); - - tableEnvironment.getConfig().set("table.exec.resource.default-parallelism", "1"); - SqlHelpers.sql( - tableEnvironment, - "create catalog iceberg_catalog with ('type'='iceberg', 'catalog-type'='hadoop', 'warehouse'='%s')", - CATALOG_EXTENSION.warehouse()); - SqlHelpers.sql(tableEnvironment, "use catalog iceberg_catalog"); - - tableConf.set(TableConfigOptions.TABLE_DYNAMIC_TABLE_OPTIONS_ENABLED, true); - } - - private Record generateRecord(Instant t1, long t2) { - Record record = GenericRecord.create(SCHEMA_TS); - record.setField("t1", t1.atZone(ZoneId.systemDefault()).toLocalDateTime()); - record.setField("t2", t2); - return record; - } - - /** Generates the records in the expected order, with respect to their datafile */ - private List generateExpectedRecords(boolean ascending) throws Exception { - Table table = 
CATALOG_EXTENSION.catalog().createTable(TestFixtures.TABLE_IDENTIFIER, SCHEMA_TS); - long baseTime = 1702382109000L; - - GenericAppenderHelper helper = - new GenericAppenderHelper(table, FileFormat.PARQUET, temporaryFolder); - - Record file1Record1 = - generateRecord(Instant.ofEpochMilli(baseTime), baseTime + (1000 * 60 * 60 * 24 * 30L)); - Record file1Record2 = - generateRecord( - Instant.ofEpochMilli(baseTime - 10 * 1000L), baseTime + (1000 * 60 * 60 * 24 * 35L)); - - List recordsDataFile1 = Lists.newArrayList(); - recordsDataFile1.add(file1Record1); - recordsDataFile1.add(file1Record2); - DataFile dataFile1 = helper.writeFile(recordsDataFile1); - - Record file2Record1 = - generateRecord( - Instant.ofEpochMilli(baseTime + 14 * 1000L), baseTime - (1000 * 60 * 60 * 24 * 30L)); - Record file2Record2 = - generateRecord( - Instant.ofEpochMilli(baseTime + 12 * 1000L), baseTime - (1000 * 60 * 61 * 24 * 35L)); - - List recordsDataFile2 = Lists.newArrayList(); - recordsDataFile2.add(file2Record1); - recordsDataFile2.add(file2Record2); - - DataFile dataFile2 = helper.writeFile(recordsDataFile2); - helper.appendToTable(dataFile1, dataFile2); - - // Expected records if the splits are ordered - // - ascending (watermark from t1) - records from the split with early timestamps, then - // records from the split with late timestamps - // - descending (watermark from t2) - records from the split with old longs, then records - // from the split with new longs - List expected = Lists.newArrayList(); - if (ascending) { - expected.addAll(recordsDataFile1); - expected.addAll(recordsDataFile2); - } else { - expected.addAll(recordsDataFile2); - expected.addAll(recordsDataFile1); - } - return expected; - } - - /** Tests the order of splits returned when setting the watermark-column options */ - @Test - public void testWatermarkOptionsAscending() throws Exception { - List expected = generateExpectedRecords(true); - TestHelpers.assertRecordsWithOrder( - run( - ImmutableMap.of("watermark-column", "t1", "split-file-open-cost", "128000000"), - "", - "*"), - expected, - SCHEMA_TS); - } - - /** - * Tests the order of splits returned when setting the watermark-column and - * watermark-column-time-unit" options - */ - @Test - public void testWatermarkOptionsDescending() throws Exception { - List expected = generateExpectedRecords(false); - TestHelpers.assertRecordsWithOrder( - run( - ImmutableMap.of( - "watermark-column", - "t2", - "watermark-column-time-unit", - "MILLISECONDS", - "split-file-open-cost", - "128000000"), - "", - "*"), - expected, - SCHEMA_TS); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceWithWatermarkExtractor.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceWithWatermarkExtractor.java deleted file mode 100644 index 70889f4f76aa..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceWithWatermarkExtractor.java +++ /dev/null @@ -1,408 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import static org.apache.iceberg.flink.MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG; - -import java.io.Serializable; -import java.nio.file.Path; -import java.time.Duration; -import java.time.Instant; -import java.time.LocalDateTime; -import java.time.ZoneId; -import java.util.List; -import java.util.Optional; -import java.util.concurrent.ConcurrentMap; -import java.util.concurrent.TimeUnit; -import java.util.concurrent.atomic.AtomicInteger; -import org.apache.flink.api.common.JobID; -import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner; -import org.apache.flink.api.common.eventtime.WatermarkStrategy; -import org.apache.flink.api.common.typeinfo.TypeInformation; -import org.apache.flink.core.execution.JobClient; -import org.apache.flink.metrics.Gauge; -import org.apache.flink.runtime.metrics.MetricNames; -import org.apache.flink.runtime.minicluster.MiniCluster; -import org.apache.flink.runtime.minicluster.RpcServiceSharing; -import org.apache.flink.runtime.testutils.CommonTestUtils; -import org.apache.flink.runtime.testutils.InMemoryReporter; -import org.apache.flink.runtime.testutils.MiniClusterResourceConfiguration; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.streaming.api.functions.windowing.AllWindowFunction; -import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows; -import org.apache.flink.streaming.api.windowing.time.Time; -import org.apache.flink.streaming.api.windowing.windows.TimeWindow; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.test.junit5.InjectMiniCluster; -import org.apache.flink.test.junit5.MiniClusterExtension; -import org.apache.flink.util.CloseableIterator; -import org.apache.flink.util.Collector; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.data.GenericAppenderHelper; -import org.apache.iceberg.data.GenericRecord; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.HadoopTableExtension; -import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.awaitility.Awaitility; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.extension.RegisterExtension; -import org.junit.jupiter.api.io.TempDir; - -public class TestIcebergSourceWithWatermarkExtractor implements Serializable { - private static final int PARALLELISM = 4; - private static final String SOURCE_NAME = "IcebergSource"; - private static final int RECORD_NUM_FOR_2_SPLITS = 200; - private static final ConcurrentMap WINDOWS = Maps.newConcurrentMap(); - - @TempDir protected Path temporaryFolder; - - private static final InMemoryReporter REPORTER = 
InMemoryReporter.createWithRetainedMetrics(); - - @RegisterExtension - public static final MiniClusterExtension MINI_CLUSTER_EXTENSION = - new MiniClusterExtension( - new MiniClusterResourceConfiguration.Builder() - .setNumberTaskManagers(1) - .setNumberSlotsPerTaskManager(PARALLELISM) - .setRpcServiceSharing(RpcServiceSharing.DEDICATED) - .setConfiguration(REPORTER.addToConfiguration(DISABLE_CLASSLOADER_CHECK_CONFIG)) - .withHaLeadershipControl() - .build()); - - @RegisterExtension - private static final HadoopTableExtension TABLE_EXTENSION = - new HadoopTableExtension(TestFixtures.DATABASE, TestFixtures.TABLE, TestFixtures.TS_SCHEMA); - - /** - * This is an integration test for watermark handling and windowing. Integration testing the - * following features: - * - *
- * <ul>
- *   <li>- Ordering of the splits
- *   <li>- Emitting of watermarks
- *   <li>- Firing windows based on watermarks
- * </ul>
- *
- * <p>The test generates 4 splits
- *
- * <ul>
- *   <li>- Split 1 - Watermark 100 min
- *   <li>- Split 2, 3 - Watermark 0 min
- *   <li>- Split 4 - Watermark 6 min
- * </ul>
- *
- * <p>Creates a source with 5 minutes tumbling window with parallelism 1 (to prevent concurrency
- * issues).
- *
- * <p>Checks that windows are handled correctly based on the emitted watermarks, and splits are
- * read in the following order:
- *
- * <ul>
- *   <li>- Split 2, 3
- *   <li>- Split 4
- *   <li>- Split 1
- * </ul>
- *
- * <p>As a result the window aggregator emits the records based on in Split 2-3, and Split 4 data.
- *
- * <p>
    Add 2 more splits, so the task manager close the windows for the original 4 splits and emit - * the appropriate aggregated records. - */ - @Test - public void testWindowing() throws Exception { - GenericAppenderHelper dataAppender = appender(); - List expectedRecords = Lists.newArrayList(); - - // Generate records with the following pattern: - // - File 1 - Later records (Watermark 6000000) - // - Split 1 - 2 records (100, "file_1-recordTs_100"), (103, "file_1-recordTs_103") - // - File 2 - First records (Watermark 0) - // - Split 1 - 100 records (0, "file_2-recordTs_0"), (1, "file_2-recordTs_1"),... - // - Split 2 - 100 records (0, "file_2-recordTs_0"), (1, "file_2-recordTs_1"),... - // - File 3 - Parallel write for the first records (Watermark 360000) - // - Split 1 - 2 records (6, "file_3-recordTs_6"), (7, "file_3-recordTs_7") - List batch = - ImmutableList.of( - generateRecord(100, "file_1-recordTs_100"), - generateRecord(101, "file_1-recordTs_101"), - generateRecord(103, "file_1-recordTs_103")); - expectedRecords.addAll(batch); - dataAppender.appendToTable(batch); - - batch = Lists.newArrayListWithCapacity(100); - for (int i = 0; i < RECORD_NUM_FOR_2_SPLITS; ++i) { - // Generate records where the timestamps are out of order, but still between 0-5 minutes - batch.add(generateRecord(4 - i % 5, "file_2-recordTs_" + i)); - } - expectedRecords.addAll(batch); - dataAppender.appendToTable(batch); - - batch = - ImmutableList.of( - generateRecord(6, "file_3-recordTs_6"), generateRecord(7, "file_3-recordTs_7")); - expectedRecords.addAll(batch); - dataAppender.appendToTable(batch); - - StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); - env.setParallelism(1); - - DataStream stream = - env.fromSource( - source(), - WatermarkStrategy.noWatermarks() - .withTimestampAssigner(new RowDataTimestampAssigner()), - SOURCE_NAME, - TypeInformation.of(RowData.class)); - - stream - .windowAll(TumblingEventTimeWindows.of(Time.minutes(5))) - .apply( - new AllWindowFunction() { - @Override - public void apply( - TimeWindow window, Iterable values, Collector out) { - // Emit RowData which contains the window start time, and the record count in - // that window - AtomicInteger count = new AtomicInteger(0); - values.forEach(a -> count.incrementAndGet()); - out.collect(row(window.getStart(), count.get())); - WINDOWS.put(window.getStart(), count.get()); - } - }); - - // Use static variable to collect the windows, since other solutions were flaky - WINDOWS.clear(); - env.executeAsync("Iceberg Source Windowing Test"); - - // Wait for the 2 first windows from File 2 and File 3 - Awaitility.await() - .pollInterval(Duration.ofMillis(10)) - .atMost(30, TimeUnit.SECONDS) - .until( - () -> - WINDOWS.equals( - ImmutableMap.of(0L, RECORD_NUM_FOR_2_SPLITS, TimeUnit.MINUTES.toMillis(5), 2))); - - // Write data so the windows containing test data are closed - dataAppender.appendToTable( - dataAppender.writeFile(ImmutableList.of(generateRecord(1500, "last-record")))); - - // Wait for last test record window from File 1 - Awaitility.await() - .pollInterval(Duration.ofMillis(10)) - .atMost(30, TimeUnit.SECONDS) - .until( - () -> - WINDOWS.equals( - ImmutableMap.of( - 0L, - RECORD_NUM_FOR_2_SPLITS, - TimeUnit.MINUTES.toMillis(5), - 2, - TimeUnit.MINUTES.toMillis(100), - 3))); - } - - /** - * This is an integration test for watermark handling and throttling. Integration testing the - * following: - * - *
- * <ul>
- *   <li>- Emitting of watermarks
- *   <li>- Watermark alignment
- * </ul>
- *
- * <p>The test generates 3 splits
- *
- * <ul>
- *   <li>- Split 1 - Watermark 100 min
- *   <li>- Split 2, 3 - Watermark 0 min
- * </ul>
- *
- * The splits are read in the following order:
- *
- * <ul>
- *   <li>- Split 2, 3 (Task Manager 1, Task Manager 2)
- *   <li>- Split 1 (Task Manager 1 or Task Manager 2 depending on scheduling)
- * </ul>
- *
- * Reading split 1 will cause the watermark alignment to pause reading for the given task manager.
- *
- * <p>The status of the watermark alignment is checked by the alignment related metrics.
- *
- * <p>Adding new records with old timestamps to the table will enable the running reader to
- * continue reading the files, but the watermark alignment will still prevent the paused reader to
- * continue.
- *
- * <p>
    After adding some records with new timestamps the blocked reader is un-paused, and both ot - * the readers continue reading. - */ - @Test - public void testThrottling(@InjectMiniCluster MiniCluster miniCluster) throws Exception { - GenericAppenderHelper dataAppender = appender(); - - // Generate records in advance - - // File 1 - Later records (Watermark 6.000.000 - 100 min) - // - Split 1 - 2 records (100, "file_1-recordTs_100"), (103, "file_1-recordTs_103") - List batch1 = - ImmutableList.of( - generateRecord(100, "file_1-recordTs_100"), generateRecord(103, "file_1-recordTs_103")); - - // File 2 - First records (Watermark 0 - 0 min) - // - Split 1 - 100 records (0, "file_2-recordTs_0"), (1, "file_2-recordTs_1"),... - // - Split 2 - 100 records (0, "file_2-recordTs_0"), (1, "file_2-recordTs_1"),... - List batch2 = Lists.newArrayListWithCapacity(100); - for (int i = 0; i < RECORD_NUM_FOR_2_SPLITS; ++i) { - batch2.add(generateRecord(4 - i % 5, "file_2-recordTs_" + i)); - } - - // File 3 - Some records will be blocked (Watermark 900.000 - 15 min) - List batch3 = - ImmutableList.of( - generateRecord(15, "file_3-recordTs_15"), - generateRecord(16, "file_3-recordTs_16"), - generateRecord(17, "file_3-recordTs_17")); - - // File 4 - Some records will be blocked (Watermark 900.000 - 15 min) - List batch4 = - ImmutableList.of( - generateRecord(15, "file_4-recordTs_15"), - generateRecord(16, "file_4-recordTs_16"), - generateRecord(17, "file_4-recordTs_17")); - - // File 5 - Records which will remove the block (Watermark 5.400.000 - 90 min) - List batch5 = - ImmutableList.of( - generateRecord(90, "file_5-recordTs_90"), generateRecord(91, "file_5-recordTs_91")); - - StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); - env.setParallelism(2); - - DataStream stream = - env.fromSource( - source(), - WatermarkStrategy.noWatermarks() - .withWatermarkAlignment("iceberg", Duration.ofMinutes(20), Duration.ofMillis(10)), - SOURCE_NAME, - TypeInformation.of(RowData.class)); - - try (CloseableIterator resultIterator = stream.collectAsync()) { - JobClient jobClient = env.executeAsync("Iceberg Source Throttling Test"); - CommonTestUtils.waitForAllTaskRunning(miniCluster, jobClient.getJobID(), false); - - // Insert the first data into the table - dataAppender.appendToTable(dataAppender.writeFile(batch1), dataAppender.writeFile(batch2)); - - // Get the drift metric, wait for it to be created and reach the expected state - // (100 min - 20 min - 0 min) - // Also this validates that the WatermarkAlignment is working - Awaitility.await() - .pollInterval(Duration.ofMillis(10)) - .atMost(30, TimeUnit.SECONDS) - .until( - () -> - findAlignmentDriftMetric(jobClient.getJobID(), TimeUnit.MINUTES.toMillis(80)) - .isPresent()); - Gauge drift = - findAlignmentDriftMetric(jobClient.getJobID(), TimeUnit.MINUTES.toMillis(80)).get(); - - // Add some old records with 2 splits, so even if the blocked gets one split, the other reader - // one gets one as well - dataAppender.appendToTable(dataAppender.writeFile(batch3), dataAppender.writeFile(batch4)); - - // Get the drift metric, wait for it to be created and reach the expected state (100 min - 20 - // min - 15 min) - Awaitility.await() - .pollInterval(Duration.ofMillis(10)) - .atMost(30, TimeUnit.SECONDS) - .until(() -> drift.getValue() == TimeUnit.MINUTES.toMillis(65)); - - // Add some new records which should unblock the throttled reader - dataAppender.appendToTable(batch5); - - // Wait for the new drift to decrease below the allowed drift to 
signal the normal state - Awaitility.await() - .pollInterval(Duration.ofMillis(10)) - .atMost(30, TimeUnit.SECONDS) - .until(() -> drift.getValue() < TimeUnit.MINUTES.toMillis(20)); - } - } - - protected IcebergSource source() { - return IcebergSource.builder() - .tableLoader(TABLE_EXTENSION.tableLoader()) - .watermarkColumn("ts") - .project(TestFixtures.TS_SCHEMA) - .splitSize(100L) - .streaming(true) - .monitorInterval(Duration.ofMillis(10)) - .streamingStartingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) - .build(); - } - - protected Record generateRecord(int minutes, String str) { - // Override the ts field to create a more realistic situation for event time alignment - Record record = GenericRecord.create(TestFixtures.TS_SCHEMA); - LocalDateTime ts = - LocalDateTime.ofInstant( - Instant.ofEpochMilli(Time.of(minutes, TimeUnit.MINUTES).toMilliseconds()), - ZoneId.of("Z")); - record.setField("ts", ts); - record.setField("str", str); - return record; - } - - private Optional> findAlignmentDriftMetric(JobID jobID, long withValue) { - String metricsName = SOURCE_NAME + ".*" + MetricNames.WATERMARK_ALIGNMENT_DRIFT; - return REPORTER.findMetrics(jobID, metricsName).values().stream() - .map(m -> (Gauge) m) - .filter(m -> m.getValue() == withValue) - .findFirst(); - } - - private GenericAppenderHelper appender() { - // We need to create multiple splits, so we need to generate parquet files with multiple offsets - org.apache.hadoop.conf.Configuration hadoopConf = new org.apache.hadoop.conf.Configuration(); - hadoopConf.set("write.parquet.page-size-bytes", "64"); - hadoopConf.set("write.parquet.row-group-size-bytes", "64"); - return new GenericAppenderHelper( - TABLE_EXTENSION.table(), FileFormat.PARQUET, temporaryFolder, hadoopConf); - } - - private static RowData row(long time, long count) { - GenericRowData result = new GenericRowData(2); - result.setField(0, time); - result.setField(1, String.valueOf(count)); - return result; - } - - private static class RowDataTimestampAssigner implements SerializableTimestampAssigner { - @Override - public long extractTimestamp(RowData element, long recordTimestamp) { - return element.getTimestamp(0, 0).getMillisecond(); - } - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSpeculativeExecutionSupport.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSpeculativeExecutionSupport.java deleted file mode 100644 index 95d0b90b6ca9..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSpeculativeExecutionSupport.java +++ /dev/null @@ -1,184 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.IOException; -import java.nio.file.Files; -import java.time.Duration; -import java.util.Arrays; -import java.util.List; -import java.util.stream.Collectors; -import org.apache.flink.api.common.RuntimeExecutionMode; -import org.apache.flink.api.common.functions.RichMapFunction; -import org.apache.flink.api.common.typeinfo.Types; -import org.apache.flink.configuration.BatchExecutionOptions; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.configuration.CoreOptions; -import org.apache.flink.configuration.JobManagerOptions; -import org.apache.flink.configuration.RestOptions; -import org.apache.flink.configuration.SlowTaskDetectorOptions; -import org.apache.flink.runtime.testutils.MiniClusterResourceConfiguration; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.table.api.Table; -import org.apache.flink.table.api.TableEnvironment; -import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; -import org.apache.flink.test.junit5.MiniClusterExtension; -import org.apache.flink.types.Row; -import org.apache.iceberg.flink.FlinkConfigOptions; -import org.apache.iceberg.flink.TestBase; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.extension.RegisterExtension; - -public class TestIcebergSpeculativeExecutionSupport extends TestBase { - private static final int NUM_TASK_MANAGERS = 1; - private static final int NUM_TASK_SLOTS = 3; - - @RegisterExtension - public static MiniClusterExtension miniClusterResource = - new MiniClusterExtension( - new MiniClusterResourceConfiguration.Builder() - .setNumberTaskManagers(NUM_TASK_MANAGERS) - .setNumberSlotsPerTaskManager(NUM_TASK_SLOTS) - .setConfiguration(configure()) - .build()); - - private StreamTableEnvironment tEnv; - private static final String CATALOG_NAME = "test_catalog"; - private static final String DATABASE_NAME = "test_db"; - private static final String INPUT_TABLE_NAME = "test_table"; - private static final String OUTPUT_TABLE_NAME = "sink_table"; - - @Override - protected TableEnvironment getTableEnv() { - if (tEnv == null) { - synchronized (this) { - StreamExecutionEnvironment env = - StreamExecutionEnvironment.getExecutionEnvironment(configure()); - env.setRuntimeMode(RuntimeExecutionMode.BATCH); - tEnv = StreamTableEnvironment.create(env); - } - } - - return tEnv; - } - - @BeforeEach - public void before() throws IOException { - String warehouse = - String.format("file:%s", Files.createTempDirectory(temporaryDirectory, "junit").toString()); - sql( - "CREATE CATALOG %s WITH ('type'='iceberg', 'catalog-type'='hadoop', 'warehouse'='%s')", - CATALOG_NAME, warehouse); - sql("USE CATALOG %s", CATALOG_NAME); - sql("CREATE DATABASE %s", DATABASE_NAME); - sql("USE %s", DATABASE_NAME); - - sql("CREATE TABLE %s (i INT, j INT)", INPUT_TABLE_NAME); - sql("INSERT INTO %s VALUES (1, -1),(2, -1),(3, -1)", INPUT_TABLE_NAME); - sql("CREATE TABLE %s (i INT, j INT, subTask INT, attempt INT)", OUTPUT_TABLE_NAME); - } - - @AfterEach - public void after() { - sql("DROP TABLE IF EXISTS %s.%s", DATABASE_NAME, INPUT_TABLE_NAME); - sql("DROP TABLE IF EXISTS %s.%s", DATABASE_NAME, OUTPUT_TABLE_NAME); - sql("DROP DATABASE %s", DATABASE_NAME); - dropCatalog(CATALOG_NAME, true); - } - - @Test - 
public void testSpeculativeExecution() throws Exception { - Table table = - tEnv.sqlQuery(String.format("SELECT * FROM %s.%s", DATABASE_NAME, INPUT_TABLE_NAME)); - DataStream slowStream = - tEnv.toDataStream(table, Row.class) - .map(new TestingMap()) - .name("test_map") - .returns( - Types.ROW_NAMED( - new String[] {"i", "j", "subTask", "attempt"}, - Types.INT, - Types.INT, - Types.INT, - Types.INT)) - .setParallelism(NUM_TASK_SLOTS); - - tEnv.fromDataStream(slowStream) - .executeInsert(String.format("%s.%s", DATABASE_NAME, OUTPUT_TABLE_NAME)) - .await(); - - List output = sql(String.format("SELECT * FROM %s.%s", DATABASE_NAME, OUTPUT_TABLE_NAME)); - - // Ensure that all subTasks has attemptNum > 0 - assertThat(output.stream().map(x -> x.getField(3)).collect(Collectors.toSet())).contains(1); - - // Ensure the test_table rows are returned exactly the same after the slow map task from the - // sink_table - assertSameElements( - output.stream().map(x -> Row.of(x.getField(0), x.getField(1))).collect(Collectors.toList()), - Arrays.asList(Row.of(1, -1), Row.of(2, -1), Row.of(3, -1))); - } - - /** A testing map function that simulates the slow task. */ - private static class TestingMap extends RichMapFunction { - @Override - public Row map(Row row) throws Exception { - // Put the subtasks with the first attempt to sleep to trigger speculative - // execution - if (getRuntimeContext().getAttemptNumber() <= 0) { - Thread.sleep(Integer.MAX_VALUE); - } - - Row output = - Row.of( - row.getField(0), - row.getField(1), - getRuntimeContext().getIndexOfThisSubtask(), - getRuntimeContext().getAttemptNumber()); - - return output; - } - } - - private static Configuration configure() { - Configuration configuration = new Configuration(); - configuration.set(CoreOptions.CHECK_LEAKED_CLASSLOADER, false); - configuration.set(RestOptions.BIND_PORT, "0"); - configuration.set(JobManagerOptions.SLOT_REQUEST_TIMEOUT, 5000L); - - // Use FLIP-27 source - configuration.set(FlinkConfigOptions.TABLE_EXEC_ICEBERG_USE_FLIP27_SOURCE, true); - - // for speculative execution - configuration.set(BatchExecutionOptions.SPECULATIVE_ENABLED, true); - - configuration.set(SlowTaskDetectorOptions.EXECUTION_TIME_BASELINE_MULTIPLIER, 1.0); - configuration.set(SlowTaskDetectorOptions.EXECUTION_TIME_BASELINE_RATIO, 0.2); - configuration.set( - SlowTaskDetectorOptions.EXECUTION_TIME_BASELINE_LOWER_BOUND, Duration.ofMillis(0)); - configuration.set(BatchExecutionOptions.BLOCK_SLOW_NODE_DURATION, Duration.ofMillis(0)); - - return configuration; - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestMetadataTableReadableMetrics.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestMetadataTableReadableMetrics.java deleted file mode 100644 index 40dfda723749..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestMetadataTableReadableMetrics.java +++ /dev/null @@ -1,299 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - -import java.io.File; -import java.io.IOException; -import java.math.BigDecimal; -import java.nio.ByteBuffer; -import java.nio.file.Path; -import java.util.Base64; -import java.util.List; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.configuration.CoreOptions; -import org.apache.flink.table.api.TableEnvironment; -import org.apache.flink.types.Row; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.Files; -import org.apache.iceberg.Parameters; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.catalog.Namespace; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.data.FileHelpers; -import org.apache.iceberg.data.GenericRecord; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.CatalogTestBase; -import org.apache.iceberg.flink.TestHelpers; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Types; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.TestTemplate; -import org.junit.jupiter.api.io.TempDir; - -public class TestMetadataTableReadableMetrics extends CatalogTestBase { - private static final String TABLE_NAME = "test_table"; - - @Parameters(name = "catalogName={0}, baseNamespace={1}") - protected static List parameters() { - List parameters = Lists.newArrayList(); - String catalogName = "testhive"; - Namespace baseNamespace = Namespace.empty(); - parameters.add(new Object[] {catalogName, baseNamespace}); - return parameters; - } - - @Override - protected TableEnvironment getTableEnv() { - Configuration configuration = super.getTableEnv().getConfig().getConfiguration(); - configuration.set(CoreOptions.DEFAULT_PARALLELISM, 1); - return super.getTableEnv(); - } - - private @TempDir Path temp; - - private static final Types.StructType LEAF_STRUCT_TYPE = - Types.StructType.of( - optional(1, "leafLongCol", Types.LongType.get()), - optional(2, "leafDoubleCol", Types.DoubleType.get())); - - private static final Types.StructType NESTED_STRUCT_TYPE = - Types.StructType.of(required(3, "leafStructCol", LEAF_STRUCT_TYPE)); - - private static final Schema NESTED_SCHEMA = - new Schema(required(4, "nestedStructCol", NESTED_STRUCT_TYPE)); - - private static final Schema PRIMITIVE_SCHEMA = - new Schema( - required(1, "booleanCol", Types.BooleanType.get()), - required(2, "intCol", Types.IntegerType.get()), - required(3, "longCol", Types.LongType.get()), - required(4, "floatCol", Types.FloatType.get()), - required(5, "doubleCol", Types.DoubleType.get()), - optional(6, "decimalCol", Types.DecimalType.of(10, 2)), - optional(7, "stringCol", Types.StringType.get()), 
- optional(8, "fixedCol", Types.FixedType.ofLength(3)), - optional(9, "binaryCol", Types.BinaryType.get())); - - private Table createPrimitiveTable() throws IOException { - Table table = - catalog.createTable( - TableIdentifier.of(DATABASE, TABLE_NAME), - PRIMITIVE_SCHEMA, - PartitionSpec.unpartitioned(), - ImmutableMap.of()); - List records = - Lists.newArrayList( - createPrimitiveRecord( - false, - 1, - 1L, - 0, - 1.0D, - new BigDecimal("1.00"), - "1", - Base64.getDecoder().decode("1111"), - ByteBuffer.wrap(Base64.getDecoder().decode("1111"))), - createPrimitiveRecord( - true, - 2, - 2L, - 0, - 2.0D, - new BigDecimal("2.00"), - "2", - Base64.getDecoder().decode("2222"), - ByteBuffer.wrap(Base64.getDecoder().decode("2222"))), - createPrimitiveRecord(false, 1, 1, Float.NaN, Double.NaN, null, "1", null, null), - createPrimitiveRecord( - false, 2, 2L, Float.NaN, 2.0D, new BigDecimal("2.00"), "2", null, null)); - - File testFile = File.createTempFile("junit", null, temp.toFile()); - DataFile dataFile = FileHelpers.writeDataFile(table, Files.localOutput(testFile), records); - table.newAppend().appendFile(dataFile).commit(); - return table; - } - - private void createNestedTable() throws IOException { - Table table = - validationCatalog.createTable( - TableIdentifier.of(DATABASE, TABLE_NAME), - NESTED_SCHEMA, - PartitionSpec.unpartitioned(), - ImmutableMap.of()); - - List records = - Lists.newArrayList( - createNestedRecord(0L, 0.0), - createNestedRecord(1L, Double.NaN), - createNestedRecord(null, null)); - - File testFile = File.createTempFile("junit", null, temp.toFile()); - DataFile dataFile = FileHelpers.writeDataFile(table, Files.localOutput(testFile), records); - table.newAppend().appendFile(dataFile).commit(); - } - - @BeforeEach - public void before() { - super.before(); - sql("USE CATALOG %s", catalogName); - sql("CREATE DATABASE %s", DATABASE); - sql("USE %s", DATABASE); - } - - @Override - @AfterEach - public void clean() { - sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, TABLE_NAME); - sql("DROP DATABASE IF EXISTS %s", flinkDatabase); - super.clean(); - } - - protected GenericRecord createPrimitiveRecord( - boolean booleanCol, - int intCol, - long longCol, - float floatCol, - double doubleCol, - BigDecimal decimalCol, - String stringCol, - byte[] fixedCol, - ByteBuffer binaryCol) { - GenericRecord record = GenericRecord.create(PRIMITIVE_SCHEMA); - record.set(0, booleanCol); - record.set(1, intCol); - record.set(2, longCol); - record.set(3, floatCol); - record.set(4, doubleCol); - record.set(5, decimalCol); - record.set(6, stringCol); - record.set(7, fixedCol); - record.set(8, binaryCol); - return record; - } - - private GenericRecord createNestedRecord(Long longCol, Double doubleCol) { - GenericRecord record = GenericRecord.create(NESTED_SCHEMA); - GenericRecord nested = GenericRecord.create(NESTED_STRUCT_TYPE); - GenericRecord leaf = GenericRecord.create(LEAF_STRUCT_TYPE); - leaf.set(0, longCol); - leaf.set(1, doubleCol); - nested.set(0, leaf); - record.set(0, nested); - return record; - } - - protected Object[] row(Object... 
values) { - return values; - } - - @TestTemplate - public void testPrimitiveColumns() throws Exception { - createPrimitiveTable(); - List result = sql("SELECT readable_metrics FROM %s$files", TABLE_NAME); - - Row binaryCol = - Row.of( - 52L, - 4L, - 2L, - null, - Base64.getDecoder().decode("1111"), - Base64.getDecoder().decode("2222")); - Row booleanCol = Row.of(32L, 4L, 0L, null, false, true); - Row decimalCol = Row.of(85L, 4L, 1L, null, new BigDecimal("1.00"), new BigDecimal("2.00")); - Row doubleCol = Row.of(85L, 4L, 0L, 1L, 1.0D, 2.0D); - Row fixedCol = - Row.of( - 44L, - 4L, - 2L, - null, - Base64.getDecoder().decode("1111"), - Base64.getDecoder().decode("2222")); - Row floatCol = Row.of(71L, 4L, 0L, 2L, 0f, 0f); - Row intCol = Row.of(71L, 4L, 0L, null, 1, 2); - Row longCol = Row.of(79L, 4L, 0L, null, 1L, 2L); - Row stringCol = Row.of(79L, 4L, 0L, null, "1", "2"); - - List expected = - Lists.newArrayList( - Row.of( - Row.of( - binaryCol, - booleanCol, - decimalCol, - doubleCol, - fixedCol, - floatCol, - intCol, - longCol, - stringCol))); - TestHelpers.assertRows(result, expected); - } - - @TestTemplate - public void testSelectPrimitiveValues() throws Exception { - createPrimitiveTable(); - - TestHelpers.assertRows( - sql( - "SELECT readable_metrics.intCol.lower_bound, readable_metrics.booleanCol.upper_bound FROM %s$files", - TABLE_NAME), - ImmutableList.of(Row.of(1, true))); - - TestHelpers.assertRows( - sql("SELECT content, readable_metrics.longCol.value_count FROM %s$files", TABLE_NAME), - ImmutableList.of(Row.of(0, 4L))); - - TestHelpers.assertRows( - sql("SELECT readable_metrics.longCol.value_count, content FROM %s$files", TABLE_NAME), - ImmutableList.of(Row.of(4L, 0))); - } - - @TestTemplate - public void testSelectNestedValues() throws Exception { - createNestedTable(); - TestHelpers.assertRows( - sql( - "SELECT readable_metrics.`nestedStructCol.leafStructCol.leafLongCol`.lower_bound, " - + "readable_metrics.`nestedStructCol.leafStructCol.leafDoubleCol`.value_count FROM %s$files", - TABLE_NAME), - ImmutableList.of(Row.of(0L, 3L))); - } - - @TestTemplate - public void testNestedValues() throws Exception { - createNestedTable(); - - Row leafDoubleCol = Row.of(46L, 3L, 1L, 1L, 0.0D, 0.0D); - Row leafLongCol = Row.of(54L, 3L, 1L, null, 0L, 1L); - Row metrics = Row.of(Row.of(leafDoubleCol, leafLongCol)); - - TestHelpers.assertRows( - sql("SELECT readable_metrics FROM %s$files", TABLE_NAME), ImmutableList.of(metrics)); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestProjectMetaColumn.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestProjectMetaColumn.java deleted file mode 100644 index ce9054ad49b6..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestProjectMetaColumn.java +++ /dev/null @@ -1,188 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.List; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.DeleteFile; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Parameter; -import org.apache.iceberg.ParameterizedTestExtension; -import org.apache.iceberg.Parameters; -import org.apache.iceberg.RowDelta; -import org.apache.iceberg.SerializableTable; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.flink.SimpleDataUtil; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.flink.TestHelpers; -import org.apache.iceberg.flink.data.RowDataProjection; -import org.apache.iceberg.flink.sink.RowDataTaskWriterFactory; -import org.apache.iceberg.flink.sink.TaskWriterFactory; -import org.apache.iceberg.io.TaskWriter; -import org.apache.iceberg.io.WriteResult; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.junit.jupiter.api.TestTemplate; -import org.junit.jupiter.api.extension.ExtendWith; -import org.junit.jupiter.api.io.TempDir; - -@ExtendWith(ParameterizedTestExtension.class) -public class TestProjectMetaColumn { - - @TempDir protected Path temporaryFolder; - - @Parameter(index = 0) - private FileFormat format; - - @Parameters(name = "fileFormat={0}") - public static Iterable parameters() { - return Lists.newArrayList( - new Object[] {FileFormat.PARQUET}, - new Object[] {FileFormat.ORC}, - new Object[] {FileFormat.AVRO}); - } - - private void testSkipToRemoveMetaColumn(int formatVersion) throws IOException { - // Create the table with given format version. - String location = Files.createTempDirectory(temporaryFolder, "junit").toFile().toString(); - Table table = - SimpleDataUtil.createTable( - location, - ImmutableMap.of(TableProperties.FORMAT_VERSION, String.valueOf(formatVersion)), - false); - - List rows = - Lists.newArrayList( - SimpleDataUtil.createInsert(1, "AAA"), - SimpleDataUtil.createInsert(2, "BBB"), - SimpleDataUtil.createInsert(3, "CCC")); - writeAndCommit(table, ImmutableList.of(), false, rows); - - FlinkInputFormat input = - FlinkSource.forRowData().tableLoader(TableLoader.fromHadoopTable(location)).buildFormat(); - - List results = Lists.newArrayList(); - TestHelpers.readRowData( - input, - rowData -> { - // If project to remove the meta columns, it will get a RowDataProjection. - assertThat(rowData).isInstanceOf(GenericRowData.class); - results.add(TestHelpers.copyRowData(rowData, SimpleDataUtil.ROW_TYPE)); - }); - - // Assert the results. 
- TestHelpers.assertRows(rows, results, SimpleDataUtil.ROW_TYPE); - } - - @TestTemplate - public void testV1SkipToRemoveMetaColumn() throws IOException { - testSkipToRemoveMetaColumn(1); - } - - @TestTemplate - public void testV2SkipToRemoveMetaColumn() throws IOException { - testSkipToRemoveMetaColumn(2); - } - - @TestTemplate - public void testV2RemoveMetaColumn() throws Exception { - // Create the v2 table. - String location = Files.createTempDirectory(temporaryFolder, "junit").toFile().toString(); - Table table = - SimpleDataUtil.createTable( - location, ImmutableMap.of(TableProperties.FORMAT_VERSION, "2"), false); - - List rows = - Lists.newArrayList( - SimpleDataUtil.createInsert(1, "AAA"), - SimpleDataUtil.createDelete(1, "AAA"), - SimpleDataUtil.createInsert(2, "AAA"), - SimpleDataUtil.createInsert(2, "BBB")); - int eqFieldId = table.schema().findField("data").fieldId(); - writeAndCommit(table, ImmutableList.of(eqFieldId), true, rows); - - FlinkInputFormat input = - FlinkSource.forRowData().tableLoader(TableLoader.fromHadoopTable(location)).buildFormat(); - - List results = Lists.newArrayList(); - TestHelpers.readRowData( - input, - rowData -> { - // If project to remove the meta columns, it will get a RowDataProjection. - assertThat(rowData).isInstanceOf(RowDataProjection.class); - results.add(TestHelpers.copyRowData(rowData, SimpleDataUtil.ROW_TYPE)); - }); - - // Assert the results. - TestHelpers.assertRows( - ImmutableList.of( - SimpleDataUtil.createInsert(2, "AAA"), SimpleDataUtil.createInsert(2, "BBB")), - results, - SimpleDataUtil.ROW_TYPE); - } - - private void writeAndCommit( - Table table, List eqFieldIds, boolean upsert, List rows) - throws IOException { - TaskWriter writer = createTaskWriter(table, eqFieldIds, upsert); - try (TaskWriter io = writer) { - for (RowData row : rows) { - io.write(row); - } - } - - RowDelta delta = table.newRowDelta(); - WriteResult result = writer.complete(); - - for (DataFile dataFile : result.dataFiles()) { - delta.addRows(dataFile); - } - - for (DeleteFile deleteFile : result.deleteFiles()) { - delta.addDeletes(deleteFile); - } - - delta.commit(); - } - - private TaskWriter createTaskWriter( - Table table, List equalityFieldIds, boolean upsert) { - TaskWriterFactory taskWriterFactory = - new RowDataTaskWriterFactory( - SerializableTable.copyOf(table), - SimpleDataUtil.ROW_TYPE, - TableProperties.WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT, - format, - table.properties(), - equalityFieldIds, - upsert); - - taskWriterFactory.initialize(1, 1); - return taskWriterFactory.create(); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestRowDataToAvroGenericRecordConverter.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestRowDataToAvroGenericRecordConverter.java deleted file mode 100644 index 6ef40693827e..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestRowDataToAvroGenericRecordConverter.java +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import static org.assertj.core.api.Assertions.assertThat; - -import org.apache.avro.generic.GenericRecord; -import org.apache.iceberg.flink.AvroGenericRecordConverterBase; -import org.apache.iceberg.flink.DataGenerator; - -public class TestRowDataToAvroGenericRecordConverter extends AvroGenericRecordConverterBase { - @Override - protected void testConverter(DataGenerator dataGenerator) { - RowDataToAvroGenericRecordConverter converter = - RowDataToAvroGenericRecordConverter.fromAvroSchema(dataGenerator.avroSchema()); - GenericRecord expected = dataGenerator.generateAvroGenericRecord(); - GenericRecord actual = converter.apply(dataGenerator.generateFlinkRowData()); - assertThat(actual).isEqualTo(expected); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestScanContext.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestScanContext.java deleted file mode 100644 index 5dd7de545e11..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestScanContext.java +++ /dev/null @@ -1,112 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source; - -import static org.assertj.core.api.Assertions.assertThatThrownBy; - -import org.junit.jupiter.api.Test; - -class TestScanContext { - @Test - void testIncrementalFromSnapshotId() { - ScanContext context = - ScanContext.builder() - .streaming(true) - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) - .build(); - assertException( - context, "Invalid starting snapshot id for SPECIFIC_START_SNAPSHOT_ID strategy: null"); - - context = - ScanContext.builder() - .streaming(true) - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) - .startSnapshotId(1L) - .startSnapshotTimestamp(1L) - .build(); - assertException( - context, - "Invalid starting snapshot timestamp for SPECIFIC_START_SNAPSHOT_ID strategy: not null"); - } - - @Test - void testIncrementalFromSnapshotTimestamp() { - ScanContext context = - ScanContext.builder() - .streaming(true) - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) - .build(); - assertException( - context, - "Invalid starting snapshot timestamp for SPECIFIC_START_SNAPSHOT_TIMESTAMP strategy: null"); - - context = - ScanContext.builder() - .streaming(true) - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) - .startSnapshotId(1L) - .startSnapshotTimestamp(1L) - .build(); - assertException( - context, "Invalid starting snapshot id for SPECIFIC_START_SNAPSHOT_ID strategy: not null"); - } - - @Test - void testStreaming() { - ScanContext context = ScanContext.builder().streaming(true).useTag("tag").build(); - assertException(context, "Cannot scan table using ref tag configured for streaming reader"); - - context = ScanContext.builder().streaming(true).useSnapshotId(1L).build(); - assertException(context, "Cannot set snapshot-id option for streaming reader"); - - context = ScanContext.builder().streaming(true).asOfTimestamp(1L).build(); - assertException(context, "Cannot set as-of-timestamp option for streaming reader"); - - context = ScanContext.builder().streaming(true).endSnapshotId(1L).build(); - assertException(context, "Cannot set end-snapshot-id option for streaming reader"); - - context = ScanContext.builder().streaming(true).endTag("tag").build(); - assertException(context, "Cannot set end-tag option for streaming reader"); - } - - @Test - void testStartConflict() { - ScanContext context = ScanContext.builder().startTag("tag").startSnapshotId(1L).build(); - assertException(context, "START_SNAPSHOT_ID and START_TAG cannot both be set."); - } - - @Test - void testEndConflict() { - ScanContext context = ScanContext.builder().endTag("tag").endSnapshotId(1L).build(); - assertException(context, "END_SNAPSHOT_ID and END_TAG cannot both be set."); - } - - @Test - void testMaxAllowedPlanningFailures() { - ScanContext context = ScanContext.builder().maxAllowedPlanningFailures(-2).build(); - assertException( - context, "Cannot set maxAllowedPlanningFailures to a negative number other than -1."); - } - - private void assertException(ScanContext context, String message) { - assertThatThrownBy(() -> context.validate()) - .hasMessage(message) - .isInstanceOf(IllegalArgumentException.class); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestSourceUtil.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestSourceUtil.java deleted file mode 100644 index b701419a7499..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestSourceUtil.java +++ 
/dev/null @@ -1,61 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.IOException; -import org.apache.flink.configuration.Configuration; -import org.apache.iceberg.flink.FlinkConfigOptions; -import org.junit.jupiter.api.Test; - -public class TestSourceUtil { - @Test - public void testInferedParallelism() throws IOException { - Configuration configuration = new Configuration(); - // Empty table, infer parallelism should be at least 1 - int parallelism = SourceUtil.inferParallelism(configuration, -1L, () -> 0); - assertThat(parallelism).isEqualTo(1); - - // 2 splits (max infer is the default value 100 , max > splits num), the parallelism is splits - // num : 2 - parallelism = SourceUtil.inferParallelism(configuration, -1L, () -> 2); - assertThat(parallelism).isEqualTo(2); - - // 2 splits and limit is 1 , max infer parallelism is default 100, - // which is greater than splits num and limit, the parallelism is the limit value : 1 - parallelism = SourceUtil.inferParallelism(configuration, 1, () -> 2); - assertThat(parallelism).isEqualTo(1); - - // 2 splits and max infer parallelism is 1 (max < splits num), the parallelism is 1 - configuration.setInteger(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM_MAX, 1); - parallelism = SourceUtil.inferParallelism(configuration, -1L, () -> 2); - assertThat(parallelism).isEqualTo(1); - - // 2 splits, max infer parallelism is 1, limit is 3, the parallelism is max infer parallelism : - // 1 - parallelism = SourceUtil.inferParallelism(configuration, 3, () -> 2); - assertThat(parallelism).isEqualTo(1); - - // 2 splits, infer parallelism is disabled, the parallelism is flink default parallelism 1 - configuration.setBoolean(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM, false); - parallelism = SourceUtil.inferParallelism(configuration, 3, () -> 2); - assertThat(parallelism).isEqualTo(1); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestSqlBase.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestSqlBase.java deleted file mode 100644 index f9b776397cfc..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestSqlBase.java +++ /dev/null @@ -1,160 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import static org.apache.iceberg.flink.TestFixtures.DATABASE; -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.IOException; -import java.nio.file.Path; -import java.util.List; -import java.util.Map; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.table.api.EnvironmentSettings; -import org.apache.flink.table.api.TableEnvironment; -import org.apache.flink.test.junit5.MiniClusterExtension; -import org.apache.flink.types.Row; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Table; -import org.apache.iceberg.TestHelpers; -import org.apache.iceberg.data.GenericAppenderHelper; -import org.apache.iceberg.data.RandomGenericData; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.FlinkConfigOptions; -import org.apache.iceberg.flink.HadoopCatalogExtension; -import org.apache.iceberg.flink.MiniFlinkClusterExtension; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.extension.RegisterExtension; -import org.junit.jupiter.api.io.TempDir; - -/** Test other more advanced usage of SQL. They don't need to run for every file format. 
*/ -public abstract class TestSqlBase { - @RegisterExtension - public static MiniClusterExtension miniClusterExtension = - MiniFlinkClusterExtension.createWithClassloaderCheckDisabled(); - - @RegisterExtension - public static final HadoopCatalogExtension CATALOG_EXTENSION = - new HadoopCatalogExtension(DATABASE, TestFixtures.TABLE); - - @TempDir protected Path temporaryFolder; - - private volatile TableEnvironment tEnv; - - protected TableEnvironment getTableEnv() { - if (tEnv == null) { - synchronized (this) { - if (tEnv == null) { - this.tEnv = - TableEnvironment.create(EnvironmentSettings.newInstance().inBatchMode().build()); - } - } - } - return tEnv; - } - - @BeforeEach - public abstract void before() throws IOException; - - @Test - public void testResiduals() throws Exception { - Table table = - CATALOG_EXTENSION - .catalog() - .createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA, TestFixtures.SPEC); - - List writeRecords = RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L); - writeRecords.get(0).set(1, 123L); - writeRecords.get(0).set(2, "2020-03-20"); - writeRecords.get(1).set(1, 456L); - writeRecords.get(1).set(2, "2020-03-20"); - - GenericAppenderHelper helper = - new GenericAppenderHelper(table, FileFormat.PARQUET, temporaryFolder); - - List expectedRecords = Lists.newArrayList(); - expectedRecords.add(writeRecords.get(0)); - - DataFile dataFile1 = helper.writeFile(TestHelpers.Row.of("2020-03-20", 0), writeRecords); - DataFile dataFile2 = - helper.writeFile( - TestHelpers.Row.of("2020-03-21", 0), - RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L)); - helper.appendToTable(dataFile1, dataFile2); - - org.apache.iceberg.flink.TestHelpers.assertRecords( - run(Maps.newHashMap(), "where dt='2020-03-20' and id=123", "*"), - expectedRecords, - TestFixtures.SCHEMA); - } - - @Test - public void testExposeLocality() throws Exception { - Table table = - CATALOG_EXTENSION - .catalog() - .createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA, TestFixtures.SPEC); - - TableLoader tableLoader = TableLoader.fromHadoopTable(table.location()); - List expectedRecords = RandomGenericData.generate(TestFixtures.SCHEMA, 10, 0L); - expectedRecords.forEach(expectedRecord -> expectedRecord.set(2, "2020-03-20")); - - GenericAppenderHelper helper = - new GenericAppenderHelper(table, FileFormat.PARQUET, temporaryFolder); - DataFile dataFile = helper.writeFile(TestHelpers.Row.of("2020-03-20", 0), expectedRecords); - helper.appendToTable(dataFile); - - // test sql api - Configuration tableConf = getTableEnv().getConfig().getConfiguration(); - tableConf.setBoolean( - FlinkConfigOptions.TABLE_EXEC_ICEBERG_EXPOSE_SPLIT_LOCALITY_INFO.key(), false); - - List results = SqlHelpers.sql(getTableEnv(), "select * from t"); - org.apache.iceberg.flink.TestHelpers.assertRecords( - results, expectedRecords, TestFixtures.SCHEMA); - - // test table api - tableConf.setBoolean( - FlinkConfigOptions.TABLE_EXEC_ICEBERG_EXPOSE_SPLIT_LOCALITY_INFO.key(), true); - FlinkSource.Builder builder = FlinkSource.forRowData().tableLoader(tableLoader).table(table); - - // When running with CI or local, `localityEnabled` will be false even if this configuration is - // enabled - assertThat(SourceUtil.isLocalityEnabled(table, tableConf, true)) - .as("Expose split locality info should be false.") - .isFalse(); - - results = run(Maps.newHashMap(), "where dt='2020-03-20'", "*"); - org.apache.iceberg.flink.TestHelpers.assertRecords( - results, expectedRecords, TestFixtures.SCHEMA); - } - - protected List run( - Map 
options, String sqlFilter, String... sqlSelectedFields) { - String select = String.join(",", sqlSelectedFields); - String optionStr = SqlHelpers.sqlOptionsToString(options); - return SqlHelpers.sql(getTableEnv(), "select %s from t %s %s", select, optionStr, sqlFilter); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamScanSql.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamScanSql.java deleted file mode 100644 index 57ee7baf202c..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamScanSql.java +++ /dev/null @@ -1,434 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; - -import java.io.IOException; -import java.util.Iterator; -import java.util.List; -import org.apache.flink.core.execution.JobClient; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.table.api.EnvironmentSettings; -import org.apache.flink.table.api.TableEnvironment; -import org.apache.flink.table.api.TableResult; -import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; -import org.apache.flink.table.api.config.TableConfigOptions; -import org.apache.flink.types.Row; -import org.apache.flink.util.CloseableIterator; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.SnapshotRef; -import org.apache.iceberg.Table; -import org.apache.iceberg.TestHelpers; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.data.GenericAppenderHelper; -import org.apache.iceberg.data.GenericRecord; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.CatalogTestBase; -import org.apache.iceberg.flink.MiniFlinkClusterExtension; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.TestTemplate; -import org.junit.jupiter.api.Timeout; - -@Timeout(60) -public class TestStreamScanSql extends CatalogTestBase { - private static final String TABLE = "test_table"; - private static final FileFormat FORMAT = FileFormat.PARQUET; - - private TableEnvironment tEnv; - - @Override - protected TableEnvironment getTableEnv() { - TableEnvironment tableEnv = tEnv; - if (tableEnv != null) { - return tableEnv; - } - synchronized (this) { - if (tEnv == null) { - EnvironmentSettings.Builder settingsBuilder = - EnvironmentSettings.newInstance().inStreamingMode(); - - StreamExecutionEnvironment env = - 
StreamExecutionEnvironment.getExecutionEnvironment( - MiniFlinkClusterExtension.DISABLE_CLASSLOADER_CHECK_CONFIG); - env.enableCheckpointing(400); - - StreamTableEnvironment streamTableEnv = - StreamTableEnvironment.create(env, settingsBuilder.build()); - streamTableEnv - .getConfig() - .getConfiguration() - .set(TableConfigOptions.TABLE_DYNAMIC_TABLE_OPTIONS_ENABLED, true); - tEnv = streamTableEnv; - } - } - return tEnv; - } - - @Override - @BeforeEach - public void before() { - super.before(); - sql("CREATE DATABASE %s", flinkDatabase); - sql("USE CATALOG %s", catalogName); - sql("USE %s", DATABASE); - } - - @Override - @AfterEach - public void clean() { - sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, TABLE); - sql("DROP DATABASE IF EXISTS %s", flinkDatabase); - super.clean(); - } - - private void insertRows(String partition, Table table, Row... rows) throws IOException { - insertRows(partition, SnapshotRef.MAIN_BRANCH, table, rows); - } - - private void insertRows(String partition, String branch, Table table, Row... rows) - throws IOException { - GenericAppenderHelper appender = new GenericAppenderHelper(table, FORMAT, temporaryDirectory); - - GenericRecord gRecord = GenericRecord.create(table.schema()); - List records = Lists.newArrayList(); - for (Row row : rows) { - records.add( - gRecord.copy( - "id", row.getField(0), - "data", row.getField(1), - "dt", row.getField(2))); - } - - if (partition != null) { - appender.appendToTable(TestHelpers.Row.of(partition, 0), branch, records); - } else { - appender.appendToTable(branch, records); - } - } - - private void insertRowsInBranch(String branch, Table table, Row... rows) throws IOException { - insertRows(null, branch, table, rows); - } - - private void insertRows(Table table, Row... rows) throws IOException { - insertRows(null, table, rows); - } - - private void assertRows(List expectedRows, Iterator iterator) { - for (Row expectedRow : expectedRows) { - assertThat(iterator).hasNext(); - Row actualRow = iterator.next(); - assertThat(actualRow.getArity()).isEqualTo(3); - assertThat(actualRow.getField(0)).isEqualTo(expectedRow.getField(0)); - assertThat(actualRow.getField(1)).isEqualTo(expectedRow.getField(1)); - assertThat(actualRow.getField(2)).isEqualTo(expectedRow.getField(2)); - } - } - - @TestTemplate - public void testUnPartitionedTable() throws Exception { - sql("CREATE TABLE %s (id INT, data VARCHAR, dt VARCHAR)", TABLE); - Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE)); - - TableResult result = - exec("SELECT * FROM %s /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s')*/", TABLE); - try (CloseableIterator iterator = result.collect()) { - - Row row1 = Row.of(1, "aaa", "2021-01-01"); - insertRows(table, row1); - assertRows(ImmutableList.of(row1), iterator); - - Row row2 = Row.of(2, "bbb", "2021-01-01"); - insertRows(table, row2); - assertRows(ImmutableList.of(row2), iterator); - } - result.getJobClient().ifPresent(JobClient::cancel); - } - - @TestTemplate - public void testPartitionedTable() throws Exception { - sql("CREATE TABLE %s (id INT, data VARCHAR, dt VARCHAR) PARTITIONED BY (dt)", TABLE); - Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE)); - - TableResult result = - exec("SELECT * FROM %s /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s')*/", TABLE); - try (CloseableIterator iterator = result.collect()) { - Row row1 = Row.of(1, "aaa", "2021-01-01"); - insertRows("2021-01-01", table, row1); - assertRows(ImmutableList.of(row1), 
iterator); - - Row row2 = Row.of(2, "bbb", "2021-01-02"); - insertRows("2021-01-02", table, row2); - assertRows(ImmutableList.of(row2), iterator); - - Row row3 = Row.of(1, "aaa", "2021-01-02"); - insertRows("2021-01-02", table, row3); - assertRows(ImmutableList.of(row3), iterator); - - Row row4 = Row.of(2, "bbb", "2021-01-01"); - insertRows("2021-01-01", table, row4); - assertRows(ImmutableList.of(row4), iterator); - } - result.getJobClient().ifPresent(JobClient::cancel); - } - - @TestTemplate - public void testConsumeFromBeginning() throws Exception { - sql("CREATE TABLE %s (id INT, data VARCHAR, dt VARCHAR)", TABLE); - Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE)); - - Row row1 = Row.of(1, "aaa", "2021-01-01"); - Row row2 = Row.of(2, "bbb", "2021-01-01"); - insertRows(table, row1, row2); - - TableResult result = - exec("SELECT * FROM %s /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s')*/", TABLE); - try (CloseableIterator iterator = result.collect()) { - assertRows(ImmutableList.of(row1, row2), iterator); - - Row row3 = Row.of(3, "ccc", "2021-01-01"); - insertRows(table, row3); - assertRows(ImmutableList.of(row3), iterator); - - Row row4 = Row.of(4, "ddd", "2021-01-01"); - insertRows(table, row4); - assertRows(ImmutableList.of(row4), iterator); - } - result.getJobClient().ifPresent(JobClient::cancel); - } - - @TestTemplate - /** - * Insert records on the main branch. Then, insert in a named branch. Reads from the main branch - * and assert that the only records from main are returned - */ - public void testConsumeFilesFromMainBranch() throws Exception { - sql("CREATE TABLE %s (id INT, data VARCHAR, dt VARCHAR)", TABLE); - Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE)); - - // Produce two snapshots on main branch - Row row1 = Row.of(1, "aaa", "2021-01-01"); - Row row2 = Row.of(2, "bbb", "2021-01-01"); - - insertRows(table, row1, row2); - String branchName = "b1"; - table.manageSnapshots().createBranch(branchName).commit(); - - // insert on the 'b1' branch - Row row3 = Row.of(3, "ccc", "2021-01-01"); - Row row4 = Row.of(4, "ddd", "2021-01-01"); - - insertRowsInBranch(branchName, table, row3, row4); - - // read from main - TableResult result = - exec("SELECT * FROM %s /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s')*/", TABLE); - - try (CloseableIterator iterator = result.collect()) { - // the start snapshot(row2) is exclusive. - assertRows(ImmutableList.of(row1, row2), iterator); - - Row row5 = Row.of(5, "eee", "2021-01-01"); - Row row6 = Row.of(6, "fff", "2021-01-01"); - insertRows(table, row5, row6); - assertRows(ImmutableList.of(row5, row6), iterator); - - Row row7 = Row.of(7, "ggg", "2021-01-01"); - insertRows(table, row7); - assertRows(ImmutableList.of(row7), iterator); - } - result.getJobClient().ifPresent(JobClient::cancel); - } - - @TestTemplate - /** - * Insert records on the main branch. Creates a named branch. Insert record on named branch. Then - * select from the named branch and assert all the records are returned. 
- */ - public void testConsumeFilesFromBranch() throws Exception { - sql("CREATE TABLE %s (id INT, data VARCHAR, dt VARCHAR)", TABLE); - Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE)); - - // Produce two snapshots on main branch - Row row1 = Row.of(1, "aaa", "2021-01-01"); - Row row2 = Row.of(2, "bbb", "2021-01-01"); - - insertRows(table, row1, row2); - String branchName = "b1"; - table.manageSnapshots().createBranch(branchName).commit(); - - TableResult result = - exec( - "SELECT * FROM %s /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s', 'branch'='%s')*/ ", - TABLE, branchName); - - try (CloseableIterator iterator = result.collect()) { - assertRows(ImmutableList.of(row1, row2), iterator); - // insert on the 'b1' branch - Row row3 = Row.of(3, "ccc", "2021-01-01"); - Row row4 = Row.of(4, "ddd", "2021-01-01"); - insertRowsInBranch(branchName, table, row3, row4); - assertRows(ImmutableList.of(row3, row4), iterator); - } - result.getJobClient().ifPresent(JobClient::cancel); - } - - @TestTemplate - /** - * Insert records on branch b1. Then insert record on b2. Then select from each branch and assert - * the correct records are returned - */ - public void testConsumeFilesFromTwoBranches() throws Exception { - sql("CREATE TABLE %s (id INT, data VARCHAR, dt VARCHAR)", TABLE); - Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE)); - - String branch1 = "b1"; - String branch2 = "b2"; - table.manageSnapshots().createBranch(branch1).commit(); - table.manageSnapshots().createBranch(branch2).commit(); - - // Produce two snapshots on main branch - Row row1Branch1 = Row.of(1, "b1", "2021-01-01"); - Row row2Branch1 = Row.of(2, "b1", "2021-01-01"); - - Row row1Branch2 = Row.of(2, "b2", "2021-01-01"); - Row row2Branch2 = Row.of(3, "b3", "2021-01-01"); - - insertRowsInBranch(branch1, table, row1Branch1, row2Branch1); - insertRowsInBranch(branch2, table, row1Branch2, row2Branch2); - - TableResult resultBranch1 = - exec( - "SELECT * FROM %s /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s', 'branch'='%s')*/ ", - TABLE, branch1); - - try (CloseableIterator iterator = resultBranch1.collect()) { - assertRows(ImmutableList.of(row1Branch1, row2Branch1), iterator); - Row another = Row.of(4, "ccc", "2021-01-01"); - insertRowsInBranch(branch1, table, another); - assertRows(ImmutableList.of(another), iterator); - } - - TableResult resultBranch2 = - exec( - "SELECT * FROM %s /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s', 'branch'='%s')*/ ", - TABLE, branch2); - try (CloseableIterator iterator = resultBranch2.collect()) { - assertRows(ImmutableList.of(row1Branch2, row2Branch2), iterator); - Row another = Row.of(4, "ccc", "2021-01-01"); - insertRowsInBranch(branch2, table, another); - assertRows(ImmutableList.of(another), iterator); - } - - resultBranch1.getJobClient().ifPresent(JobClient::cancel); - resultBranch2.getJobClient().ifPresent(JobClient::cancel); - } - - @TestTemplate - public void testConsumeFromStartSnapshotId() throws Exception { - sql("CREATE TABLE %s (id INT, data VARCHAR, dt VARCHAR)", TABLE); - Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE)); - - // Produce two snapshots. 
- Row row1 = Row.of(1, "aaa", "2021-01-01"); - Row row2 = Row.of(2, "bbb", "2021-01-01"); - insertRows(table, row1); - insertRows(table, row2); - - long startSnapshotId = table.currentSnapshot().snapshotId(); - - Row row3 = Row.of(3, "ccc", "2021-01-01"); - Row row4 = Row.of(4, "ddd", "2021-01-01"); - insertRows(table, row3, row4); - - TableResult result = - exec( - "SELECT * FROM %s /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s', " - + "'start-snapshot-id'='%d')*/", - TABLE, startSnapshotId); - try (CloseableIterator iterator = result.collect()) { - // the start snapshot(row2) is exclusive. - assertRows(ImmutableList.of(row3, row4), iterator); - - Row row5 = Row.of(5, "eee", "2021-01-01"); - Row row6 = Row.of(6, "fff", "2021-01-01"); - insertRows(table, row5, row6); - assertRows(ImmutableList.of(row5, row6), iterator); - - Row row7 = Row.of(7, "ggg", "2021-01-01"); - insertRows(table, row7); - assertRows(ImmutableList.of(row7), iterator); - } - result.getJobClient().ifPresent(JobClient::cancel); - } - - @TestTemplate - public void testConsumeFromStartTag() throws Exception { - sql("CREATE TABLE %s (id INT, data VARCHAR, dt VARCHAR)", TABLE); - Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE)); - - // Produce two snapshots. - Row row1 = Row.of(1, "aaa", "2021-01-01"); - Row row2 = Row.of(2, "bbb", "2021-01-01"); - insertRows(table, row1); - insertRows(table, row2); - - String tagName = "t1"; - long startSnapshotId = table.currentSnapshot().snapshotId(); - table.manageSnapshots().createTag(tagName, startSnapshotId).commit(); - - Row row3 = Row.of(3, "ccc", "2021-01-01"); - Row row4 = Row.of(4, "ddd", "2021-01-01"); - insertRows(table, row3, row4); - - TableResult result = - exec( - "SELECT * FROM %s /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s', " - + "'start-tag'='%s')*/", - TABLE, tagName); - try (CloseableIterator iterator = result.collect()) { - // the start snapshot(row2) is exclusive. - assertRows(ImmutableList.of(row3, row4), iterator); - - Row row5 = Row.of(5, "eee", "2021-01-01"); - Row row6 = Row.of(6, "fff", "2021-01-01"); - insertRows(table, row5, row6); - assertRows(ImmutableList.of(row5, row6), iterator); - - Row row7 = Row.of(7, "ggg", "2021-01-01"); - insertRows(table, row7); - assertRows(ImmutableList.of(row7), iterator); - } - result.getJobClient().ifPresent(JobClient::cancel); - - assertThatThrownBy( - () -> - exec( - "SELECT * FROM %s /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s', 'start-tag'='%s', " - + "'start-snapshot-id'='%d' )*/", - TABLE, tagName, startSnapshotId)) - .isInstanceOf(IllegalArgumentException.class) - .hasMessage("START_SNAPSHOT_ID and START_TAG cannot both be set."); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingMonitorFunction.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingMonitorFunction.java deleted file mode 100644 index 9c4f476b02b4..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingMonitorFunction.java +++ /dev/null @@ -1,402 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; - -import java.io.File; -import java.io.IOException; -import java.nio.file.Files; -import java.time.Duration; -import java.util.Arrays; -import java.util.List; -import java.util.concurrent.CountDownLatch; -import org.apache.flink.runtime.checkpoint.OperatorSubtaskState; -import org.apache.flink.streaming.api.functions.source.SourceFunction; -import org.apache.flink.streaming.api.operators.StreamSource; -import org.apache.flink.streaming.api.watermark.Watermark; -import org.apache.flink.streaming.util.AbstractStreamOperatorTestHarness; -import org.apache.flink.table.data.RowData; -import org.apache.flink.types.Row; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.ParameterizedTestExtension; -import org.apache.iceberg.Parameters; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.TestBase; -import org.apache.iceberg.data.GenericAppenderHelper; -import org.apache.iceberg.data.RandomGenericData; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.TestHelpers; -import org.apache.iceberg.flink.TestTableLoader; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.Iterables; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Types; -import org.apache.iceberg.util.SnapshotUtil; -import org.apache.iceberg.util.ThreadPools; -import org.awaitility.Awaitility; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.TestTemplate; -import org.junit.jupiter.api.extension.ExtendWith; - -@ExtendWith(ParameterizedTestExtension.class) -public class TestStreamingMonitorFunction extends TestBase { - - private static final Schema SCHEMA = - new Schema( - Types.NestedField.required(1, "id", Types.IntegerType.get()), - Types.NestedField.required(2, "data", Types.StringType.get())); - private static final FileFormat DEFAULT_FORMAT = FileFormat.PARQUET; - private static final long WAIT_TIME_MILLIS = 10 * 1000L; - - @Parameters(name = "formatVersion = {0}") - protected static List parameters() { - return Arrays.asList(1, 2); - } - - @BeforeEach - @Override - public void setupTable() throws IOException { - this.tableDir = Files.createTempDirectory(temp, "junit").toFile(); - this.metadataDir = new File(tableDir, "metadata"); - assertThat(tableDir.delete()).isTrue(); - - // Construct the iceberg table. 
- table = create(SCHEMA, PartitionSpec.unpartitioned()); - } - - private void runSourceFunctionInTask( - TestSourceContext sourceContext, StreamingMonitorFunction function) { - Thread task = - new Thread( - () -> { - try { - function.run(sourceContext); - } catch (Exception e) { - throw new RuntimeException(e); - } - }); - task.start(); - } - - @TestTemplate - public void testConsumeWithoutStartSnapshotId() throws Exception { - List> recordsList = generateRecordsAndCommitTxn(10); - ScanContext scanContext = ScanContext.builder().monitorInterval(Duration.ofMillis(100)).build(); - - StreamingMonitorFunction function = createFunction(scanContext); - try (AbstractStreamOperatorTestHarness harness = createHarness(function)) { - harness.setup(); - harness.open(); - - TestSourceContext sourceContext = new TestSourceContext(new CountDownLatch(1)); - runSourceFunctionInTask(sourceContext, function); - - awaitExpectedSplits(sourceContext); - - // Stop the stream task. - function.close(); - - TestHelpers.assertRecords( - sourceContext.toRows(), Lists.newArrayList(Iterables.concat(recordsList)), SCHEMA); - } - } - - @TestTemplate - public void testConsumeFromStartSnapshotId() throws Exception { - // Commit the first five transactions. - generateRecordsAndCommitTxn(5); - long startSnapshotId = table.currentSnapshot().snapshotId(); - - // Commit the next five transactions. - List> recordsList = generateRecordsAndCommitTxn(5); - - ScanContext scanContext = - ScanContext.builder() - .monitorInterval(Duration.ofMillis(100)) - .startSnapshotId(startSnapshotId) - .build(); - - StreamingMonitorFunction function = createFunction(scanContext); - try (AbstractStreamOperatorTestHarness harness = createHarness(function)) { - harness.setup(); - harness.open(); - - TestSourceContext sourceContext = new TestSourceContext(new CountDownLatch(1)); - runSourceFunctionInTask(sourceContext, function); - - awaitExpectedSplits(sourceContext); - - // Stop the stream task. - function.close(); - - TestHelpers.assertRecords( - sourceContext.toRows(), Lists.newArrayList(Iterables.concat(recordsList)), SCHEMA); - } - } - - @TestTemplate - public void testConsumeFromStartTag() throws Exception { - // Commit the first five transactions. - generateRecordsAndCommitTxn(5); - long startSnapshotId = table.currentSnapshot().snapshotId(); - String tagName = "t1"; - table.manageSnapshots().createTag(tagName, startSnapshotId).commit(); - - // Commit the next five transactions. - List> recordsList = generateRecordsAndCommitTxn(5); - - ScanContext scanContext = - ScanContext.builder().monitorInterval(Duration.ofMillis(100)).startTag(tagName).build(); - - StreamingMonitorFunction function = createFunction(scanContext); - try (AbstractStreamOperatorTestHarness harness = createHarness(function)) { - harness.setup(); - harness.open(); - - TestSourceContext sourceContext = new TestSourceContext(new CountDownLatch(1)); - runSourceFunctionInTask(sourceContext, function); - - awaitExpectedSplits(sourceContext); - - // Stop the stream task. 
- function.close(); - - TestHelpers.assertRecords( - sourceContext.toRows(), Lists.newArrayList(Iterables.concat(recordsList)), SCHEMA); - } - } - - @TestTemplate - public void testCheckpointRestore() throws Exception { - List> recordsList = generateRecordsAndCommitTxn(10); - ScanContext scanContext = ScanContext.builder().monitorInterval(Duration.ofMillis(100)).build(); - - StreamingMonitorFunction func = createFunction(scanContext); - OperatorSubtaskState state; - try (AbstractStreamOperatorTestHarness harness = createHarness(func)) { - harness.setup(); - harness.open(); - - TestSourceContext sourceContext = new TestSourceContext(new CountDownLatch(1)); - runSourceFunctionInTask(sourceContext, func); - - awaitExpectedSplits(sourceContext); - - state = harness.snapshot(1, 1); - - // Stop the stream task. - func.close(); - - TestHelpers.assertRecords( - sourceContext.toRows(), Lists.newArrayList(Iterables.concat(recordsList)), SCHEMA); - } - - List> newRecordsList = generateRecordsAndCommitTxn(10); - StreamingMonitorFunction newFunc = createFunction(scanContext); - try (AbstractStreamOperatorTestHarness harness = createHarness(newFunc)) { - harness.setup(); - // Recover to process the remaining snapshots. - harness.initializeState(state); - harness.open(); - - TestSourceContext sourceContext = new TestSourceContext(new CountDownLatch(1)); - runSourceFunctionInTask(sourceContext, newFunc); - - awaitExpectedSplits(sourceContext); - - // Stop the stream task. - newFunc.close(); - - TestHelpers.assertRecords( - sourceContext.toRows(), Lists.newArrayList(Iterables.concat(newRecordsList)), SCHEMA); - } - } - - private void awaitExpectedSplits(TestSourceContext sourceContext) { - Awaitility.await("expected splits should be produced") - .atMost(Duration.ofMillis(WAIT_TIME_MILLIS)) - .untilAsserted( - () -> { - assertThat(sourceContext.latch.getCount()).isEqualTo(0); - assertThat(sourceContext.splits).as("Should produce the expected splits").hasSize(1); - }); - } - - @TestTemplate - public void testInvalidMaxPlanningSnapshotCount() { - ScanContext scanContext1 = - ScanContext.builder() - .monitorInterval(Duration.ofMillis(100)) - .maxPlanningSnapshotCount(0) - .build(); - - assertThatThrownBy(() -> createFunction(scanContext1)) - .isInstanceOf(IllegalArgumentException.class) - .hasMessage("The max-planning-snapshot-count must be greater than zero"); - - ScanContext scanContext2 = - ScanContext.builder() - .monitorInterval(Duration.ofMillis(100)) - .maxPlanningSnapshotCount(-10) - .build(); - - assertThatThrownBy(() -> createFunction(scanContext2)) - .isInstanceOf(IllegalArgumentException.class) - .hasMessage("The max-planning-snapshot-count must be greater than zero"); - } - - @TestTemplate - public void testConsumeWithMaxPlanningSnapshotCount() throws Exception { - generateRecordsAndCommitTxn(10); - - // Use the oldest snapshot as starting to avoid the initial case. 
- long oldestSnapshotId = SnapshotUtil.oldestAncestor(table).snapshotId(); - - ScanContext scanContext = - ScanContext.builder() - .monitorInterval(Duration.ofMillis(100)) - .splitSize(1000L) - .startSnapshotId(oldestSnapshotId) - .maxPlanningSnapshotCount(Integer.MAX_VALUE) - .build(); - - FlinkInputSplit[] expectedSplits = - FlinkSplitPlanner.planInputSplits(table, scanContext, ThreadPools.getWorkerPool()); - - assertThat(expectedSplits).hasSize(9); - - // This covers three cases that maxPlanningSnapshotCount is less than, equal or greater than the - // total splits number - for (int maxPlanningSnapshotCount : ImmutableList.of(1, 9, 15)) { - scanContext = - ScanContext.builder() - .monitorInterval(Duration.ofMillis(500)) - .startSnapshotId(oldestSnapshotId) - .splitSize(1000L) - .maxPlanningSnapshotCount(maxPlanningSnapshotCount) - .build(); - - StreamingMonitorFunction function = createFunction(scanContext); - try (AbstractStreamOperatorTestHarness harness = createHarness(function)) { - harness.setup(); - harness.open(); - - CountDownLatch latch = new CountDownLatch(1); - TestSourceContext sourceContext = new TestSourceContext(latch); - function.sourceContext(sourceContext); - function.monitorAndForwardSplits(); - - if (maxPlanningSnapshotCount < 10) { - assertThat(sourceContext.splits).hasSize(maxPlanningSnapshotCount); - } - } - } - } - - private List> generateRecordsAndCommitTxn(int commitTimes) throws IOException { - List> expectedRecords = Lists.newArrayList(); - for (int i = 0; i < commitTimes; i++) { - List records = RandomGenericData.generate(SCHEMA, 100, 0L); - expectedRecords.add(records); - - // Commit those records to iceberg table. - writeRecords(records); - } - return expectedRecords; - } - - private void writeRecords(List records) throws IOException { - GenericAppenderHelper appender = new GenericAppenderHelper(table, DEFAULT_FORMAT, temp); - appender.appendToTable(records); - } - - private StreamingMonitorFunction createFunction(ScanContext scanContext) { - return new StreamingMonitorFunction( - TestTableLoader.of(tableDir.getAbsolutePath()), scanContext); - } - - private AbstractStreamOperatorTestHarness createHarness( - StreamingMonitorFunction function) throws Exception { - StreamSource streamSource = - new StreamSource<>(function); - return new AbstractStreamOperatorTestHarness<>(streamSource, 1, 1, 0); - } - - private class TestSourceContext implements SourceFunction.SourceContext { - private final List splits = Lists.newArrayList(); - private final Object checkpointLock = new Object(); - private final CountDownLatch latch; - - TestSourceContext(CountDownLatch latch) { - this.latch = latch; - } - - @Override - public void collect(FlinkInputSplit element) { - splits.add(element); - latch.countDown(); - } - - @Override - public void collectWithTimestamp(FlinkInputSplit element, long timestamp) { - collect(element); - } - - @Override - public void emitWatermark(Watermark mark) {} - - @Override - public void markAsTemporarilyIdle() {} - - @Override - public Object getCheckpointLock() { - return checkpointLock; - } - - @Override - public void close() {} - - private List toRows() throws IOException { - FlinkInputFormat format = - FlinkSource.forRowData() - .tableLoader(TestTableLoader.of(tableDir.getAbsolutePath())) - .buildFormat(); - - List rows = Lists.newArrayList(); - for (FlinkInputSplit split : splits) { - format.open(split); - - RowData element = null; - try { - while (!format.reachedEnd()) { - element = format.nextRecord(element); - 
rows.add(Row.of(element.getInt(0), element.getString(1).toString())); - } - } finally { - format.close(); - } - } - - return rows; - } - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingReaderOperator.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingReaderOperator.java deleted file mode 100644 index 1606ee9f9648..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingReaderOperator.java +++ /dev/null @@ -1,293 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.File; -import java.io.IOException; -import java.nio.file.Files; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; -import org.apache.flink.runtime.checkpoint.OperatorSubtaskState; -import org.apache.flink.streaming.api.TimeCharacteristic; -import org.apache.flink.streaming.api.operators.OneInputStreamOperatorFactory; -import org.apache.flink.streaming.runtime.tasks.StreamTaskActionExecutor; -import org.apache.flink.streaming.runtime.tasks.mailbox.MailboxDefaultAction; -import org.apache.flink.streaming.runtime.tasks.mailbox.SteppingMailboxProcessor; -import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness; -import org.apache.flink.table.data.RowData; -import org.apache.flink.types.Row; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.ParameterizedTestExtension; -import org.apache.iceberg.Parameters; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.TestBase; -import org.apache.iceberg.data.GenericAppenderHelper; -import org.apache.iceberg.data.RandomGenericData; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.TestHelpers; -import org.apache.iceberg.flink.TestTableLoader; -import org.apache.iceberg.relocated.com.google.common.collect.Iterables; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Types; -import org.apache.iceberg.util.SnapshotUtil; -import org.apache.iceberg.util.ThreadPools; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.TestTemplate; -import org.junit.jupiter.api.extension.ExtendWith; - -@ExtendWith(ParameterizedTestExtension.class) -public class TestStreamingReaderOperator extends TestBase { - - private static final Schema SCHEMA = - new Schema( - Types.NestedField.required(1, "id", Types.IntegerType.get()), - Types.NestedField.required(2, "data", Types.StringType.get())); - private static final FileFormat DEFAULT_FORMAT = FileFormat.PARQUET; - - @Parameters(name = "formatVersion = {0}") - protected 
static List parameters() { - return Arrays.asList(1, 2); - } - - @BeforeEach - @Override - public void setupTable() throws IOException { - this.tableDir = Files.createTempDirectory(temp, "junit").toFile(); - this.metadataDir = new File(tableDir, "metadata"); - assertThat(tableDir.delete()).isTrue(); - - // Construct the iceberg table. - table = create(SCHEMA, PartitionSpec.unpartitioned()); - } - - @TestTemplate - public void testProcessAllRecords() throws Exception { - List> expectedRecords = generateRecordsAndCommitTxn(10); - - List splits = generateSplits(); - assertThat(splits).hasSize(10); - - try (OneInputStreamOperatorTestHarness harness = createReader()) { - harness.setup(); - harness.open(); - - SteppingMailboxProcessor processor = createLocalMailbox(harness); - - List expected = Lists.newArrayList(); - for (int i = 0; i < splits.size(); i++) { - // Process this element to enqueue to mail-box. - harness.processElement(splits.get(i), -1); - - // Run the mail-box once to read all records from the given split. - assertThat(processor.runMailboxStep()).as("Should processed 1 split").isTrue(); - - // Assert the output has expected elements. - expected.addAll(expectedRecords.get(i)); - TestHelpers.assertRecords(readOutputValues(harness), expected, SCHEMA); - } - } - } - - @TestTemplate - public void testTriggerCheckpoint() throws Exception { - // Received emitted splits: split1, split2, split3, checkpoint request is triggered when reading - // records from - // split1. - List> expectedRecords = generateRecordsAndCommitTxn(3); - - List splits = generateSplits(); - assertThat(splits).hasSize(3); - - long timestamp = 0; - try (OneInputStreamOperatorTestHarness harness = createReader()) { - harness.setup(); - harness.open(); - - SteppingMailboxProcessor processor = createLocalMailbox(harness); - - harness.processElement(splits.get(0), ++timestamp); - harness.processElement(splits.get(1), ++timestamp); - harness.processElement(splits.get(2), ++timestamp); - - // Trigger snapshot state, it will start to work once all records from split0 are read. - processor.getMainMailboxExecutor().execute(() -> harness.snapshot(1, 3), "Trigger snapshot"); - - assertThat(processor.runMailboxStep()).as("Should have processed the split0").isTrue(); - assertThat(processor.runMailboxStep()) - .as("Should have processed the snapshot state action") - .isTrue(); - - TestHelpers.assertRecords(readOutputValues(harness), expectedRecords.get(0), SCHEMA); - - // Read records from split1. - assertThat(processor.runMailboxStep()).as("Should have processed the split1").isTrue(); - - // Read records from split2. - assertThat(processor.runMailboxStep()).as("Should have processed the split2").isTrue(); - - TestHelpers.assertRecords( - readOutputValues(harness), Lists.newArrayList(Iterables.concat(expectedRecords)), SCHEMA); - } - } - - @TestTemplate - public void testCheckpointRestore() throws Exception { - List> expectedRecords = generateRecordsAndCommitTxn(15); - - List splits = generateSplits(); - assertThat(splits).hasSize(15); - - OperatorSubtaskState state; - List expected = Lists.newArrayList(); - try (OneInputStreamOperatorTestHarness harness = createReader()) { - harness.setup(); - harness.open(); - - // Enqueue all the splits. - for (FlinkInputSplit split : splits) { - harness.processElement(split, -1); - } - - // Read all records from the first five splits. 
- SteppingMailboxProcessor localMailbox = createLocalMailbox(harness); - for (int i = 0; i < 5; i++) { - expected.addAll(expectedRecords.get(i)); - assertThat(localMailbox.runMailboxStep()) - .as("Should have processed the split#" + i) - .isTrue(); - - TestHelpers.assertRecords(readOutputValues(harness), expected, SCHEMA); - } - - // Snapshot state now, there're 10 splits left in the state. - state = harness.snapshot(1, 1); - } - - expected.clear(); - try (OneInputStreamOperatorTestHarness harness = createReader()) { - harness.setup(); - // Recover to process the remaining splits. - harness.initializeState(state); - harness.open(); - - SteppingMailboxProcessor localMailbox = createLocalMailbox(harness); - - for (int i = 5; i < 10; i++) { - expected.addAll(expectedRecords.get(i)); - assertThat(localMailbox.runMailboxStep()) - .as("Should have processed the split#" + i) - .isTrue(); - - TestHelpers.assertRecords(readOutputValues(harness), expected, SCHEMA); - } - - // Let's process the final 5 splits now. - for (int i = 10; i < 15; i++) { - expected.addAll(expectedRecords.get(i)); - harness.processElement(splits.get(i), 1); - - assertThat(localMailbox.runMailboxStep()) - .as("Should have processed the split#" + i) - .isTrue(); - TestHelpers.assertRecords(readOutputValues(harness), expected, SCHEMA); - } - } - } - - private List readOutputValues( - OneInputStreamOperatorTestHarness harness) { - List results = Lists.newArrayList(); - for (RowData rowData : harness.extractOutputValues()) { - results.add(Row.of(rowData.getInt(0), rowData.getString(1).toString())); - } - return results; - } - - private List> generateRecordsAndCommitTxn(int commitTimes) throws IOException { - List> expectedRecords = Lists.newArrayList(); - for (int i = 0; i < commitTimes; i++) { - List records = RandomGenericData.generate(SCHEMA, 100, 0L); - expectedRecords.add(records); - - // Commit those records to iceberg table. - writeRecords(records); - } - return expectedRecords; - } - - private void writeRecords(List records) throws IOException { - GenericAppenderHelper appender = new GenericAppenderHelper(table, DEFAULT_FORMAT, temp); - appender.appendToTable(records); - } - - private List generateSplits() { - List inputSplits = Lists.newArrayList(); - - List snapshotIds = SnapshotUtil.currentAncestorIds(table); - for (int i = snapshotIds.size() - 1; i >= 0; i--) { - ScanContext scanContext; - if (i == snapshotIds.size() - 1) { - // Generate the splits from the first snapshot. - scanContext = ScanContext.builder().useSnapshotId(snapshotIds.get(i)).build(); - } else { - // Generate the splits between the previous snapshot and current snapshot. - scanContext = - ScanContext.builder() - .startSnapshotId(snapshotIds.get(i + 1)) - .endSnapshotId(snapshotIds.get(i)) - .build(); - } - - Collections.addAll( - inputSplits, - FlinkSplitPlanner.planInputSplits(table, scanContext, ThreadPools.getWorkerPool())); - } - - return inputSplits; - } - - private OneInputStreamOperatorTestHarness createReader() - throws Exception { - // This input format is used to opening the emitted split. 
- FlinkInputFormat inputFormat = - FlinkSource.forRowData() - .tableLoader(TestTableLoader.of(tableDir.getAbsolutePath())) - .buildFormat(); - - OneInputStreamOperatorFactory factory = - StreamingReaderOperator.factory(inputFormat); - OneInputStreamOperatorTestHarness harness = - new OneInputStreamOperatorTestHarness<>(factory, 1, 1, 0); - harness.getStreamConfig().setTimeCharacteristic(TimeCharacteristic.ProcessingTime); - - return harness; - } - - private SteppingMailboxProcessor createLocalMailbox( - OneInputStreamOperatorTestHarness harness) { - return new SteppingMailboxProcessor( - MailboxDefaultAction.Controller::suspendDefaultAction, - harness.getTaskMailbox(), - StreamTaskActionExecutor.IMMEDIATE); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/assigner/SplitAssignerTestBase.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/assigner/SplitAssignerTestBase.java deleted file mode 100644 index 1e612b0a2b2a..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/assigner/SplitAssignerTestBase.java +++ /dev/null @@ -1,132 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source.assigner; - -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.fail; - -import java.nio.file.Path; -import java.util.Collection; -import java.util.List; -import java.util.concurrent.CompletableFuture; -import java.util.concurrent.atomic.AtomicBoolean; -import org.apache.iceberg.flink.source.SplitHelpers; -import org.apache.iceberg.flink.source.split.IcebergSourceSplit; -import org.apache.iceberg.flink.source.split.IcebergSourceSplitState; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.io.TempDir; - -public abstract class SplitAssignerTestBase { - @TempDir protected Path temporaryFolder; - - @Test - public void testEmptyInitialization() { - SplitAssigner assigner = splitAssigner(); - assertGetNext(assigner, GetSplitResult.Status.UNAVAILABLE); - } - - /** Test a sequence of interactions for StaticEnumerator */ - @Test - public void testStaticEnumeratorSequence() throws Exception { - SplitAssigner assigner = splitAssigner(); - assigner.onDiscoveredSplits(createSplits(4, 1, "1")); - - assertGetNext(assigner, GetSplitResult.Status.AVAILABLE); - assertGetNext(assigner, GetSplitResult.Status.AVAILABLE); - assertGetNext(assigner, GetSplitResult.Status.AVAILABLE); - assertSnapshot(assigner, 1); - assigner.onUnassignedSplits(createSplits(1, 1, "1")); - assertSnapshot(assigner, 2); - - assertGetNext(assigner, GetSplitResult.Status.AVAILABLE); - assertGetNext(assigner, GetSplitResult.Status.AVAILABLE); - assertGetNext(assigner, GetSplitResult.Status.UNAVAILABLE); - assertSnapshot(assigner, 0); - } - - /** Test a sequence of interactions for ContinuousEnumerator */ - @Test - public void testContinuousEnumeratorSequence() throws Exception { - SplitAssigner assigner = splitAssigner(); - assertGetNext(assigner, GetSplitResult.Status.UNAVAILABLE); - - List splits1 = createSplits(1, 1, "1"); - assertAvailableFuture(assigner, 1, () -> assigner.onDiscoveredSplits(splits1)); - List splits2 = createSplits(1, 1, "1"); - assertAvailableFuture(assigner, 1, () -> assigner.onUnassignedSplits(splits2)); - - assigner.onDiscoveredSplits(createSplits(2, 1, "1")); - assertSnapshot(assigner, 2); - assertGetNext(assigner, GetSplitResult.Status.AVAILABLE); - assertGetNext(assigner, GetSplitResult.Status.AVAILABLE); - assertGetNext(assigner, GetSplitResult.Status.UNAVAILABLE); - assertSnapshot(assigner, 0); - } - - private void assertAvailableFuture( - SplitAssigner assigner, int splitCount, Runnable addSplitsRunnable) { - // register callback - AtomicBoolean futureCompleted = new AtomicBoolean(); - CompletableFuture future = assigner.isAvailable(); - future.thenAccept(ignored -> futureCompleted.set(true)); - // calling isAvailable again should return the same object reference - // note that thenAccept will return a new future. 
- // we want to assert the same instance on the assigner returned future - assertThat(assigner.isAvailable()).isSameAs(future); - - // now add some splits - addSplitsRunnable.run(); - assertThat(futureCompleted.get()).isTrue(); - - for (int i = 0; i < splitCount; ++i) { - assertGetNext(assigner, GetSplitResult.Status.AVAILABLE); - } - assertGetNext(assigner, GetSplitResult.Status.UNAVAILABLE); - assertSnapshot(assigner, 0); - } - - protected void assertGetNext(SplitAssigner assigner, GetSplitResult.Status expectedStatus) { - GetSplitResult result = assigner.getNext(null); - assertThat(result.status()).isEqualTo(expectedStatus); - switch (expectedStatus) { - case AVAILABLE: - assertThat(result.split()).isNotNull(); - break; - case CONSTRAINED: - case UNAVAILABLE: - assertThat(result.split()).isNull(); - break; - default: - fail("Unknown status: %s", expectedStatus); - } - } - - protected void assertSnapshot(SplitAssigner assigner, int splitCount) { - Collection stateBeforeGet = assigner.state(); - assertThat(stateBeforeGet).hasSize(splitCount); - } - - protected List createSplits(int fileCount, int filesPerSplit, String version) - throws Exception { - return SplitHelpers.createSplitsFromTransientHadoopTable( - temporaryFolder, fileCount, filesPerSplit, version); - } - - protected abstract SplitAssigner splitAssigner(); -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestDefaultSplitAssigner.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestDefaultSplitAssigner.java deleted file mode 100644 index 17e64bbf0594..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestDefaultSplitAssigner.java +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source.assigner; - -import org.apache.iceberg.flink.source.SplitHelpers; -import org.junit.jupiter.api.Test; - -public class TestDefaultSplitAssigner extends SplitAssignerTestBase { - @Override - protected SplitAssigner splitAssigner() { - return new DefaultSplitAssigner(null); - } - - /** Test the assigner when multiple files are in a single split */ - @Test - public void testMultipleFilesInASplit() throws Exception { - SplitAssigner assigner = splitAssigner(); - assigner.onDiscoveredSplits( - SplitHelpers.createSplitsFromTransientHadoopTable(temporaryFolder, 4, 2)); - - assertGetNext(assigner, GetSplitResult.Status.AVAILABLE); - assertSnapshot(assigner, 1); - assertGetNext(assigner, GetSplitResult.Status.AVAILABLE); - assertGetNext(assigner, GetSplitResult.Status.UNAVAILABLE); - assertSnapshot(assigner, 0); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestFileSequenceNumberBasedSplitAssigner.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestFileSequenceNumberBasedSplitAssigner.java deleted file mode 100644 index ff63ba8e58a0..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestFileSequenceNumberBasedSplitAssigner.java +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source.assigner; - -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; - -import java.util.List; -import org.apache.iceberg.ContentFile; -import org.apache.iceberg.flink.source.split.IcebergSourceSplit; -import org.apache.iceberg.flink.source.split.SerializableComparator; -import org.apache.iceberg.flink.source.split.SplitComparators; -import org.apache.iceberg.util.SerializationUtil; -import org.junit.jupiter.api.Test; - -public class TestFileSequenceNumberBasedSplitAssigner extends SplitAssignerTestBase { - @Override - protected SplitAssigner splitAssigner() { - return new OrderedSplitAssignerFactory(SplitComparators.fileSequenceNumber()).createAssigner(); - } - - /** Test the assigner when multiple files are in a single split */ - @Test - public void testMultipleFilesInAnIcebergSplit() { - SplitAssigner assigner = splitAssigner(); - assertThatThrownBy( - () -> assigner.onDiscoveredSplits(createSplits(4, 2, "2")), - "Multiple files in a split is not allowed") - .isInstanceOf(IllegalArgumentException.class) - .hasMessageContaining("Please use 'split-open-file-cost'"); - } - - /** Test sorted splits */ - @Test - public void testSplitSort() throws Exception { - SplitAssigner assigner = splitAssigner(); - List splits = createSplits(5, 1, "2"); - - assigner.onDiscoveredSplits(splits.subList(3, 5)); - assigner.onDiscoveredSplits(splits.subList(0, 1)); - assigner.onDiscoveredSplits(splits.subList(1, 3)); - - assertGetNext(assigner, 1L); - assertGetNext(assigner, 2L); - assertGetNext(assigner, 3L); - assertGetNext(assigner, 4L); - assertGetNext(assigner, 5L); - - assertGetNext(assigner, GetSplitResult.Status.UNAVAILABLE); - } - - @Test - public void testSerializable() { - byte[] bytes = SerializationUtil.serializeToBytes(SplitComparators.fileSequenceNumber()); - SerializableComparator comparator = - SerializationUtil.deserializeFromBytes(bytes); - assertThat(comparator).isNotNull(); - } - - private void assertGetNext(SplitAssigner assigner, Long expectedSequenceNumber) { - GetSplitResult result = assigner.getNext(null); - ContentFile file = result.split().task().files().iterator().next().file(); - assertThat(file.fileSequenceNumber()).isEqualTo(expectedSequenceNumber); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestWatermarkBasedSplitAssigner.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestWatermarkBasedSplitAssigner.java deleted file mode 100644 index 84f04d5a530a..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestWatermarkBasedSplitAssigner.java +++ /dev/null @@ -1,146 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. 
See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.assigner; - -import static org.apache.iceberg.types.Types.NestedField.required; -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.IOException; -import java.time.Instant; -import java.time.LocalDateTime; -import java.time.ZoneOffset; -import java.time.temporal.ChronoUnit; -import java.util.List; -import java.util.concurrent.TimeUnit; -import java.util.stream.Collectors; -import java.util.stream.IntStream; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Schema; -import org.apache.iceberg.data.GenericAppenderFactory; -import org.apache.iceberg.data.GenericRecord; -import org.apache.iceberg.data.RandomGenericData; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.flink.source.reader.ColumnStatsWatermarkExtractor; -import org.apache.iceberg.flink.source.reader.ReaderUtil; -import org.apache.iceberg.flink.source.split.IcebergSourceSplit; -import org.apache.iceberg.flink.source.split.SerializableComparator; -import org.apache.iceberg.flink.source.split.SplitComparators; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.types.Types; -import org.apache.iceberg.util.SerializationUtil; -import org.junit.jupiter.api.Test; - -public class TestWatermarkBasedSplitAssigner extends SplitAssignerTestBase { - public static final Schema SCHEMA = - new Schema(required(1, "timestamp_column", Types.TimestampType.withoutZone())); - private static final GenericAppenderFactory APPENDER_FACTORY = new GenericAppenderFactory(SCHEMA); - - @Override - protected SplitAssigner splitAssigner() { - return new OrderedSplitAssignerFactory( - SplitComparators.watermark( - new ColumnStatsWatermarkExtractor(SCHEMA, "timestamp_column", null))) - .createAssigner(); - } - - /** Test the assigner when multiple files are in a single split */ - @Test - public void testMultipleFilesInAnIcebergSplit() { - SplitAssigner assigner = splitAssigner(); - assigner.onDiscoveredSplits(createSplits(4, 2, "2")); - - assertGetNext(assigner, GetSplitResult.Status.AVAILABLE); - assertGetNext(assigner, GetSplitResult.Status.AVAILABLE); - assertGetNext(assigner, GetSplitResult.Status.UNAVAILABLE); - } - - /** Test sorted splits */ - @Test - public void testSplitSort() { - SplitAssigner assigner = splitAssigner(); - - Instant now = Instant.now(); - List splits = - IntStream.range(0, 5) - .mapToObj(i -> splitFromInstant(now.plus(i, ChronoUnit.MINUTES))) - .collect(Collectors.toList()); - - assigner.onDiscoveredSplits(splits.subList(3, 5)); - assigner.onDiscoveredSplits(splits.subList(0, 1)); - assigner.onDiscoveredSplits(splits.subList(1, 3)); - - assertGetNext(assigner, splits.get(0)); - assertGetNext(assigner, splits.get(1)); - assertGetNext(assigner, splits.get(2)); - assertGetNext(assigner, splits.get(3)); - assertGetNext(assigner, splits.get(4)); - - assertGetNext(assigner, GetSplitResult.Status.UNAVAILABLE); - } - - @Test - public void testSerializable() { - byte[] bytes = - SerializationUtil.serializeToBytes( - SplitComparators.watermark( - new ColumnStatsWatermarkExtractor( - TestFixtures.SCHEMA, "id", TimeUnit.MILLISECONDS))); - SerializableComparator comparator = - SerializationUtil.deserializeFromBytes(bytes); - assertThat(comparator).isNotNull(); - } - - private void assertGetNext(SplitAssigner assigner, IcebergSourceSplit split) { 
- GetSplitResult result = assigner.getNext(null); - assertThat(split).isEqualTo(result.split()); - } - - @Override - protected List createSplits( - int fileCount, int filesPerSplit, String version) { - return IntStream.range(0, fileCount / filesPerSplit) - .mapToObj( - splitNum -> - splitFromRecords( - IntStream.range(0, filesPerSplit) - .mapToObj( - fileNum -> - RandomGenericData.generate( - SCHEMA, 2, (long) splitNum * filesPerSplit + fileNum)) - .collect(Collectors.toList()))) - .collect(Collectors.toList()); - } - - private IcebergSourceSplit splitFromInstant(Instant instant) { - Record record = GenericRecord.create(SCHEMA); - record.set(0, LocalDateTime.ofInstant(instant, ZoneOffset.UTC)); - return splitFromRecords(ImmutableList.of(ImmutableList.of(record))); - } - - private IcebergSourceSplit splitFromRecords(List> records) { - try { - return IcebergSourceSplit.fromCombinedScanTask( - ReaderUtil.createCombinedScanTask( - records, temporaryFolder, FileFormat.PARQUET, APPENDER_FACTORY)); - } catch (IOException e) { - throw new RuntimeException("Split creation exception", e); - } - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/ManualContinuousSplitPlanner.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/ManualContinuousSplitPlanner.java deleted file mode 100644 index ebc92df02360..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/ManualContinuousSplitPlanner.java +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source.enumerator; - -import java.io.IOException; -import java.util.List; -import java.util.NavigableMap; -import java.util.TreeMap; -import org.apache.iceberg.flink.source.ScanContext; -import org.apache.iceberg.flink.source.split.IcebergSourceSplit; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; - -class ManualContinuousSplitPlanner implements ContinuousSplitPlanner { - private final int maxPlanningSnapshotCount; - // track splits per snapshot - private final NavigableMap> splits; - private long latestSnapshotId; - private int remainingFailures; - - ManualContinuousSplitPlanner(ScanContext scanContext, int expectedFailures) { - this.maxPlanningSnapshotCount = scanContext.maxPlanningSnapshotCount(); - this.splits = new TreeMap<>(); - this.latestSnapshotId = 0L; - this.remainingFailures = expectedFailures; - } - - @Override - public synchronized ContinuousEnumerationResult planSplits( - IcebergEnumeratorPosition lastPosition) { - if (remainingFailures > 0) { - remainingFailures--; - throw new RuntimeException("Expected failure at planning"); - } - - long fromSnapshotIdExclusive = 0; - if (lastPosition != null && lastPosition.snapshotId() != null) { - fromSnapshotIdExclusive = lastPosition.snapshotId(); - } - - Preconditions.checkArgument( - fromSnapshotIdExclusive <= latestSnapshotId, - "last enumerated snapshotId is greater than the latestSnapshotId"); - if (fromSnapshotIdExclusive == latestSnapshotId) { - // already discovered everything. - return new ContinuousEnumerationResult(Lists.newArrayList(), lastPosition, lastPosition); - } - - // find the subset of snapshots to return discovered splits - long toSnapshotIdInclusive; - if (latestSnapshotId - fromSnapshotIdExclusive > maxPlanningSnapshotCount) { - toSnapshotIdInclusive = fromSnapshotIdExclusive + maxPlanningSnapshotCount; - } else { - toSnapshotIdInclusive = latestSnapshotId; - } - - List discoveredSplits = Lists.newArrayList(); - NavigableMap> discoveredView = - splits.subMap(fromSnapshotIdExclusive, false, toSnapshotIdInclusive, true); - discoveredView.forEach((snapshotId, snapshotSplits) -> discoveredSplits.addAll(snapshotSplits)); - ContinuousEnumerationResult result = - new ContinuousEnumerationResult( - discoveredSplits, - lastPosition, - // use the snapshot Id as snapshot timestamp. - IcebergEnumeratorPosition.of(toSnapshotIdInclusive, toSnapshotIdInclusive)); - return result; - } - - /** - * Add a collection of new splits. A monotonically increased snapshotId is assigned to each batch - * of splits added by this method. - */ - public synchronized void addSplits(List newSplits) { - latestSnapshotId += 1; - splits.put(latestSnapshotId, newSplits); - } - - @Override - public void close() throws IOException {} -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousIcebergEnumerator.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousIcebergEnumerator.java deleted file mode 100644 index 41a787762fda..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousIcebergEnumerator.java +++ /dev/null @@ -1,352 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.enumerator; - -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; - -import java.nio.file.Path; -import java.util.Arrays; -import java.util.Collection; -import java.util.Collections; -import java.util.List; -import java.util.stream.Collectors; -import org.apache.flink.api.connector.source.SplitEnumeratorContext; -import org.apache.flink.connector.testutils.source.reader.TestingSplitEnumeratorContext; -import org.apache.iceberg.flink.source.ScanContext; -import org.apache.iceberg.flink.source.SplitHelpers; -import org.apache.iceberg.flink.source.StreamingStartingStrategy; -import org.apache.iceberg.flink.source.assigner.DefaultSplitAssigner; -import org.apache.iceberg.flink.source.split.IcebergSourceSplit; -import org.apache.iceberg.flink.source.split.IcebergSourceSplitState; -import org.apache.iceberg.flink.source.split.IcebergSourceSplitStatus; -import org.apache.iceberg.flink.source.split.SplitRequestEvent; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.io.TempDir; - -public class TestContinuousIcebergEnumerator { - @TempDir protected Path temporaryFolder; - - @Test - public void testDiscoverSplitWhenNoReaderRegistered() throws Exception { - TestingSplitEnumeratorContext enumeratorContext = - new TestingSplitEnumeratorContext<>(4); - ScanContext scanContext = - ScanContext.builder() - .streaming(true) - .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) - .build(); - ManualContinuousSplitPlanner splitPlanner = new ManualContinuousSplitPlanner(scanContext, 0); - ContinuousIcebergEnumerator enumerator = - createEnumerator(enumeratorContext, scanContext, splitPlanner); - - Collection pendingSplitsEmpty = - enumerator.snapshotState(1).pendingSplits(); - assertThat(pendingSplitsEmpty).isEmpty(); - - // make one split available and trigger the periodic discovery - List splits = - SplitHelpers.createSplitsFromTransientHadoopTable(temporaryFolder, 1, 1); - splitPlanner.addSplits(splits); - enumeratorContext.triggerAllActions(); - - Collection pendingSplits = enumerator.snapshotState(2).pendingSplits(); - assertThat(pendingSplits).hasSize(1); - IcebergSourceSplitState pendingSplit = pendingSplits.iterator().next(); - assertThat(pendingSplit.split().splitId()).isEqualTo(splits.get(0).splitId()); - assertThat(pendingSplit.status()).isEqualTo(IcebergSourceSplitStatus.UNASSIGNED); - } - - @Test - public void testDiscoverWhenReaderRegistered() throws Exception { - TestingSplitEnumeratorContext enumeratorContext = - new TestingSplitEnumeratorContext<>(4); - ScanContext scanContext = - ScanContext.builder() - .streaming(true) - .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) - .build(); - ManualContinuousSplitPlanner splitPlanner = new ManualContinuousSplitPlanner(scanContext, 0); - ContinuousIcebergEnumerator enumerator = - 
createEnumerator(enumeratorContext, scanContext, splitPlanner); - - // register one reader, and let it request a split - enumeratorContext.registerReader(2, "localhost"); - enumerator.addReader(2); - enumerator.handleSourceEvent(2, new SplitRequestEvent()); - - // make one split available and trigger the periodic discovery - List splits = - SplitHelpers.createSplitsFromTransientHadoopTable(temporaryFolder, 1, 1); - splitPlanner.addSplits(splits); - enumeratorContext.triggerAllActions(); - - assertThat(enumerator.snapshotState(1).pendingSplits()).isEmpty(); - assertThat(enumeratorContext.getSplitAssignments().get(2).getAssignedSplits()) - .contains(splits.get(0)); - } - - @Test - public void testRequestingReaderUnavailableWhenSplitDiscovered() throws Exception { - TestingSplitEnumeratorContext enumeratorContext = - new TestingSplitEnumeratorContext<>(4); - ScanContext scanContext = - ScanContext.builder() - .streaming(true) - .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) - .build(); - ManualContinuousSplitPlanner splitPlanner = new ManualContinuousSplitPlanner(scanContext, 0); - ContinuousIcebergEnumerator enumerator = - createEnumerator(enumeratorContext, scanContext, splitPlanner); - - // register one reader, and let it request a split - enumeratorContext.registerReader(2, "localhost"); - enumerator.addReader(2); - enumerator.handleSourceEvent(2, new SplitRequestEvent()); - - // remove the reader (like in a failure) - enumeratorContext.registeredReaders().remove(2); - - // make one split available and trigger the periodic discovery - List splits = - SplitHelpers.createSplitsFromTransientHadoopTable(temporaryFolder, 1, 1); - assertThat(splits).hasSize(1); - splitPlanner.addSplits(splits); - enumeratorContext.triggerAllActions(); - - assertThat(enumeratorContext.getSplitAssignments()).doesNotContainKey(2); - List pendingSplitIds = - enumerator.snapshotState(1).pendingSplits().stream() - .map(IcebergSourceSplitState::split) - .map(IcebergSourceSplit::splitId) - .collect(Collectors.toList()); - assertThat(pendingSplitIds).hasSameSizeAs(splits).first().isEqualTo(splits.get(0).splitId()); - - // register the reader again, and let it request a split - enumeratorContext.registerReader(2, "localhost"); - enumerator.addReader(2); - enumerator.handleSourceEvent(2, new SplitRequestEvent()); - - assertThat(enumerator.snapshotState(2).pendingSplits()).isEmpty(); - assertThat(enumeratorContext.getSplitAssignments().get(2).getAssignedSplits()) - .contains(splits.get(0)); - } - - @Test - public void testThrottlingDiscovery() throws Exception { - // create 10 splits - List splits = - SplitHelpers.createSplitsFromTransientHadoopTable(temporaryFolder, 10, 1); - - TestingSplitEnumeratorContext enumeratorContext = - new TestingSplitEnumeratorContext<>(4); - ScanContext scanContext = - ScanContext.builder() - .streaming(true) - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_EARLIEST_SNAPSHOT) - // discover one snapshot at a time - .maxPlanningSnapshotCount(1) - .build(); - ManualContinuousSplitPlanner splitPlanner = new ManualContinuousSplitPlanner(scanContext, 0); - ContinuousIcebergEnumerator enumerator = - createEnumerator(enumeratorContext, scanContext, splitPlanner); - - // register reader-2, and let it request a split - enumeratorContext.registerReader(2, "localhost"); - enumerator.addReader(2); - enumerator.handleSourceEvent(2, new SplitRequestEvent()); - - // add splits[0] to the planner for next discovery - splitPlanner.addSplits(Arrays.asList(splits.get(0))); - 
enumeratorContext.triggerAllActions(); - - // because discovered split was assigned to reader, pending splits should be empty - assertThat(enumerator.snapshotState(1).pendingSplits()).isEmpty(); - // split assignment to reader-2 should contain splits[0, 1) - assertThat(enumeratorContext.getSplitAssignments().get(2).getAssignedSplits()) - .containsExactlyElementsOf(splits.subList(0, 1)); - - // add the remaining 9 splits (one for every snapshot) - // run discovery cycles while reader-2 still processing the splits[0] - for (int i = 1; i < 10; ++i) { - splitPlanner.addSplits(Arrays.asList(splits.get(i))); - enumeratorContext.triggerAllActions(); - } - - // can only discover up to 3 snapshots/splits - assertThat(enumerator.snapshotState(2).pendingSplits()).hasSize(3); - // split assignment to reader-2 should be splits[0, 1) - assertThat(enumeratorContext.getSplitAssignments().get(2).getAssignedSplits()) - .containsExactlyElementsOf(splits.subList(0, 1)); - - // now reader-2 finished splits[0] - enumerator.handleSourceEvent(2, new SplitRequestEvent(Arrays.asList(splits.get(0).splitId()))); - enumeratorContext.triggerAllActions(); - // still have 3 pending splits. After assigned splits[1] to reader-2, one more split was - // discovered and added. - assertThat(enumerator.snapshotState(3).pendingSplits()).hasSize(3); - // split assignment to reader-2 should be splits[0, 2) - assertThat(enumeratorContext.getSplitAssignments().get(2).getAssignedSplits()) - .containsExactlyElementsOf(splits.subList(0, 2)); - - // run 3 more split discovery cycles - for (int i = 0; i < 3; ++i) { - enumeratorContext.triggerAllActions(); - } - - // no more splits are discovered due to throttling - assertThat(enumerator.snapshotState(4).pendingSplits()).hasSize(3); - // split assignment to reader-2 should still be splits[0, 2) - assertThat(enumeratorContext.getSplitAssignments().get(2).getAssignedSplits()) - .containsExactlyElementsOf(splits.subList(0, 2)); - - // now reader-2 finished splits[1] - enumerator.handleSourceEvent(2, new SplitRequestEvent(Arrays.asList(splits.get(1).splitId()))); - enumeratorContext.triggerAllActions(); - // still have 3 pending splits. After assigned new splits[2] to reader-2, one more split was - // discovered and added. 
- assertThat(enumerator.snapshotState(5).pendingSplits()).hasSize(3); - // split assignment to reader-2 should be splits[0, 3) - assertThat(enumeratorContext.getSplitAssignments().get(2).getAssignedSplits()) - .containsExactlyElementsOf(splits.subList(0, 3)); - } - - @Test - public void testTransientPlanningErrorsWithSuccessfulRetry() throws Exception { - TestingSplitEnumeratorContext enumeratorContext = - new TestingSplitEnumeratorContext<>(4); - ScanContext scanContext = - ScanContext.builder() - .streaming(true) - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_EARLIEST_SNAPSHOT) - .maxPlanningSnapshotCount(1) - .maxAllowedPlanningFailures(2) - .build(); - ManualContinuousSplitPlanner splitPlanner = new ManualContinuousSplitPlanner(scanContext, 1); - ContinuousIcebergEnumerator enumerator = - createEnumerator(enumeratorContext, scanContext, splitPlanner); - - // Make one split available and trigger the periodic discovery - List splits = - SplitHelpers.createSplitsFromTransientHadoopTable(temporaryFolder, 1, 1); - splitPlanner.addSplits(splits); - - // Trigger a planning and check that no splits returned due to the planning error - enumeratorContext.triggerAllActions(); - assertThat(enumerator.snapshotState(2).pendingSplits()).isEmpty(); - - // Second scan planning should succeed and discover the expected splits - enumeratorContext.triggerAllActions(); - Collection pendingSplits = enumerator.snapshotState(3).pendingSplits(); - assertThat(pendingSplits).hasSize(1); - IcebergSourceSplitState pendingSplit = pendingSplits.iterator().next(); - assertThat(pendingSplit.split().splitId()).isEqualTo(splits.get(0).splitId()); - assertThat(pendingSplit.status()).isEqualTo(IcebergSourceSplitStatus.UNASSIGNED); - } - - @Test - public void testOverMaxAllowedPlanningErrors() throws Exception { - TestingSplitEnumeratorContext enumeratorContext = - new TestingSplitEnumeratorContext<>(4); - ScanContext scanContext = - ScanContext.builder() - .streaming(true) - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_EARLIEST_SNAPSHOT) - .maxPlanningSnapshotCount(1) - .maxAllowedPlanningFailures(1) - .build(); - ManualContinuousSplitPlanner splitPlanner = new ManualContinuousSplitPlanner(scanContext, 2); - createEnumerator(enumeratorContext, scanContext, splitPlanner); - - // Make one split available and trigger the periodic discovery - List splits = - SplitHelpers.createSplitsFromTransientHadoopTable(temporaryFolder, 1, 1); - splitPlanner.addSplits(splits); - - // Check that the scheduler response ignores the current error and continues to run until the - // failure limit is reached - enumeratorContext.triggerAllActions(); - assertThat(enumeratorContext.getExecutorService().getAllScheduledTasks().get(0).isDone()) - .isFalse(); - - // Check that the task has failed with the expected exception after the failure limit is reached - enumeratorContext.triggerAllActions(); - assertThat(enumeratorContext.getExecutorService().getAllScheduledTasks().get(0).isDone()) - .isTrue(); - assertThatThrownBy( - () -> enumeratorContext.getExecutorService().getAllScheduledTasks().get(0).get()) - .hasCauseInstanceOf(RuntimeException.class) - .hasMessageContaining("Failed to discover new split"); - } - - @Test - public void testPlanningIgnoringErrors() throws Exception { - int expectedFailures = 3; - TestingSplitEnumeratorContext enumeratorContext = - new TestingSplitEnumeratorContext<>(4); - ScanContext scanContext = - ScanContext.builder() - .streaming(true) - 
.startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_EARLIEST_SNAPSHOT) - .maxPlanningSnapshotCount(1) - .maxAllowedPlanningFailures(-1) - .build(); - ManualContinuousSplitPlanner splitPlanner = - new ManualContinuousSplitPlanner(scanContext, expectedFailures); - ContinuousIcebergEnumerator enumerator = - createEnumerator(enumeratorContext, scanContext, splitPlanner); - - // Make one split available and trigger the periodic discovery - List splits = - SplitHelpers.createSplitsFromTransientHadoopTable(temporaryFolder, 1, 1); - splitPlanner.addSplits(splits); - - Collection pendingSplits; - // Can not discover the new split with planning failures - for (int i = 0; i < expectedFailures; ++i) { - enumeratorContext.triggerAllActions(); - pendingSplits = enumerator.snapshotState(i).pendingSplits(); - assertThat(pendingSplits).isEmpty(); - } - - // Discovered the new split after a successful scan planning - enumeratorContext.triggerAllActions(); - pendingSplits = enumerator.snapshotState(expectedFailures + 1).pendingSplits(); - assertThat(pendingSplits).hasSize(1); - IcebergSourceSplitState pendingSplit = pendingSplits.iterator().next(); - assertThat(pendingSplit.split().splitId()).isEqualTo(splits.get(0).splitId()); - assertThat(pendingSplit.status()).isEqualTo(IcebergSourceSplitStatus.UNASSIGNED); - } - - private static ContinuousIcebergEnumerator createEnumerator( - SplitEnumeratorContext context, - ScanContext scanContext, - ContinuousSplitPlanner splitPlanner) { - - ContinuousIcebergEnumerator enumerator = - new ContinuousIcebergEnumerator( - context, - new DefaultSplitAssigner(null, Collections.emptyList()), - scanContext, - splitPlanner, - null); - enumerator.start(); - return enumerator; - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousSplitPlannerImpl.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousSplitPlannerImpl.java deleted file mode 100644 index 0690b456e033..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousSplitPlannerImpl.java +++ /dev/null @@ -1,692 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source.enumerator; - -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; - -import java.io.IOException; -import java.nio.file.Path; -import java.util.List; -import java.util.Set; -import java.util.concurrent.atomic.AtomicLong; -import java.util.stream.Collectors; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Snapshot; -import org.apache.iceberg.data.GenericAppenderHelper; -import org.apache.iceberg.data.RandomGenericData; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.HadoopTableExtension; -import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.flink.source.ScanContext; -import org.apache.iceberg.flink.source.StreamingStartingStrategy; -import org.apache.iceberg.flink.source.split.IcebergSourceSplit; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; -import org.apache.iceberg.relocated.com.google.common.collect.Iterables; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.extension.RegisterExtension; -import org.junit.jupiter.api.io.TempDir; - -public class TestContinuousSplitPlannerImpl { - @TempDir protected Path temporaryFolder; - - private static final FileFormat FILE_FORMAT = FileFormat.PARQUET; - private static final AtomicLong RANDOM_SEED = new AtomicLong(); - - @RegisterExtension - private static final HadoopTableExtension TABLE_RESOURCE = - new HadoopTableExtension(TestFixtures.DATABASE, TestFixtures.TABLE, TestFixtures.SCHEMA); - - private GenericAppenderHelper dataAppender; - private DataFile dataFile1; - private Snapshot snapshot1; - private DataFile dataFile2; - private Snapshot snapshot2; - - @BeforeEach - public void before() throws IOException { - dataAppender = new GenericAppenderHelper(TABLE_RESOURCE.table(), FILE_FORMAT, temporaryFolder); - } - - private void appendTwoSnapshots() throws IOException { - // snapshot1 - List batch1 = RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L); - dataFile1 = dataAppender.writeFile(null, batch1); - dataAppender.appendToTable(dataFile1); - snapshot1 = TABLE_RESOURCE.table().currentSnapshot(); - - // snapshot2 - List batch2 = RandomGenericData.generate(TestFixtures.SCHEMA, 2, 1L); - dataFile2 = dataAppender.writeFile(null, batch2); - dataAppender.appendToTable(dataFile2); - snapshot2 = TABLE_RESOURCE.table().currentSnapshot(); - } - - /** @return the last enumerated snapshot id */ - private CycleResult verifyOneCycle( - ContinuousSplitPlannerImpl splitPlanner, IcebergEnumeratorPosition lastPosition) - throws Exception { - List batch = - RandomGenericData.generate(TestFixtures.SCHEMA, 2, RANDOM_SEED.incrementAndGet()); - DataFile dataFile = dataAppender.writeFile(null, batch); - dataAppender.appendToTable(dataFile); - Snapshot snapshot = TABLE_RESOURCE.table().currentSnapshot(); - - ContinuousEnumerationResult result = splitPlanner.planSplits(lastPosition); - assertThat(result.fromPosition().snapshotId()).isEqualTo(lastPosition.snapshotId()); - assertThat(result.fromPosition().snapshotTimestampMs()) - .isEqualTo(lastPosition.snapshotTimestampMs()); - assertThat(result.toPosition().snapshotId().longValue()).isEqualTo(snapshot.snapshotId()); - assertThat(result.toPosition().snapshotTimestampMs().longValue()) - .isEqualTo(snapshot.timestampMillis()); - assertThat(result.splits()).hasSize(1); - IcebergSourceSplit split = 
Iterables.getOnlyElement(result.splits()); - assertThat(split.task().files()) - .hasSize(1) - .first() - .satisfies( - fileScanTask -> assertThat(fileScanTask.file().path()).isEqualTo(dataFile.path())); - return new CycleResult(result.toPosition(), split); - } - - @Test - public void testTableScanThenIncrementalWithEmptyTable() throws Exception { - ScanContext scanContext = - ScanContext.builder() - .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) - .build(); - ContinuousSplitPlannerImpl splitPlanner = - new ContinuousSplitPlannerImpl(TABLE_RESOURCE.tableLoader().clone(), scanContext, null); - - ContinuousEnumerationResult emptyTableInitialDiscoveryResult = splitPlanner.planSplits(null); - assertThat(emptyTableInitialDiscoveryResult.splits()).isEmpty(); - assertThat(emptyTableInitialDiscoveryResult.fromPosition()).isNull(); - assertThat(emptyTableInitialDiscoveryResult.toPosition().isEmpty()).isTrue(); - assertThat(emptyTableInitialDiscoveryResult.toPosition().snapshotTimestampMs()).isNull(); - - ContinuousEnumerationResult emptyTableSecondDiscoveryResult = - splitPlanner.planSplits(emptyTableInitialDiscoveryResult.toPosition()); - assertThat(emptyTableSecondDiscoveryResult.splits()).isEmpty(); - assertThat(emptyTableSecondDiscoveryResult.fromPosition().isEmpty()).isTrue(); - assertThat(emptyTableSecondDiscoveryResult.fromPosition().snapshotTimestampMs()).isNull(); - assertThat(emptyTableSecondDiscoveryResult.toPosition().isEmpty()).isTrue(); - assertThat(emptyTableSecondDiscoveryResult.toPosition().snapshotTimestampMs()).isNull(); - - // next 3 snapshots - IcebergEnumeratorPosition lastPosition = emptyTableSecondDiscoveryResult.toPosition(); - for (int i = 0; i < 3; ++i) { - lastPosition = verifyOneCycle(splitPlanner, lastPosition).lastPosition; - } - } - - @Test - public void testTableScanThenIncrementalWithNonEmptyTable() throws Exception { - appendTwoSnapshots(); - - ScanContext scanContext = - ScanContext.builder() - .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) - .build(); - ContinuousSplitPlannerImpl splitPlanner = - new ContinuousSplitPlannerImpl(TABLE_RESOURCE.tableLoader().clone(), scanContext, null); - - ContinuousEnumerationResult initialResult = splitPlanner.planSplits(null); - assertThat(initialResult.fromPosition()).isNull(); - assertThat(initialResult.toPosition().snapshotId().longValue()) - .isEqualTo(snapshot2.snapshotId()); - assertThat(initialResult.toPosition().snapshotTimestampMs().longValue()) - .isEqualTo(snapshot2.timestampMillis()); - assertThat(initialResult.splits()).hasSize(1); - IcebergSourceSplit split = Iterables.getOnlyElement(initialResult.splits()); - assertThat(split.task().files()).hasSize(2); - Set discoveredFiles = - split.task().files().stream() - .map(fileScanTask -> fileScanTask.file().path().toString()) - .collect(Collectors.toSet()); - Set expectedFiles = - ImmutableSet.of(dataFile1.path().toString(), dataFile2.path().toString()); - assertThat(discoveredFiles).containsExactlyInAnyOrderElementsOf(expectedFiles); - - IcebergEnumeratorPosition lastPosition = initialResult.toPosition(); - for (int i = 0; i < 3; ++i) { - lastPosition = verifyOneCycle(splitPlanner, lastPosition).lastPosition; - } - } - - @Test - public void testIncrementalFromLatestSnapshotWithEmptyTable() throws Exception { - ScanContext scanContext = - ScanContext.builder() - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_LATEST_SNAPSHOT) - .splitSize(1L) - .build(); - ContinuousSplitPlannerImpl splitPlanner = - new 
ContinuousSplitPlannerImpl(TABLE_RESOURCE.tableLoader().clone(), scanContext, null); - - ContinuousEnumerationResult emptyTableInitialDiscoveryResult = splitPlanner.planSplits(null); - assertThat(emptyTableInitialDiscoveryResult.splits()).isEmpty(); - assertThat(emptyTableInitialDiscoveryResult.fromPosition()).isNull(); - assertThat(emptyTableInitialDiscoveryResult.toPosition().isEmpty()).isTrue(); - assertThat(emptyTableInitialDiscoveryResult.toPosition().snapshotTimestampMs()).isNull(); - - ContinuousEnumerationResult emptyTableSecondDiscoveryResult = - splitPlanner.planSplits(emptyTableInitialDiscoveryResult.toPosition()); - assertThat(emptyTableSecondDiscoveryResult.splits()).isEmpty(); - assertThat(emptyTableSecondDiscoveryResult.fromPosition().isEmpty()).isTrue(); - assertThat(emptyTableSecondDiscoveryResult.fromPosition().snapshotTimestampMs()).isNull(); - assertThat(emptyTableSecondDiscoveryResult.toPosition().isEmpty()).isTrue(); - assertThat(emptyTableSecondDiscoveryResult.toPosition().snapshotTimestampMs()).isNull(); - - // latest mode should discover both snapshots, as latest position is marked by when job starts - appendTwoSnapshots(); - ContinuousEnumerationResult afterTwoSnapshotsAppended = - splitPlanner.planSplits(emptyTableSecondDiscoveryResult.toPosition()); - assertThat(afterTwoSnapshotsAppended.splits()).hasSize(2); - - // next 3 snapshots - IcebergEnumeratorPosition lastPosition = afterTwoSnapshotsAppended.toPosition(); - for (int i = 0; i < 3; ++i) { - lastPosition = verifyOneCycle(splitPlanner, lastPosition).lastPosition; - } - } - - @Test - public void testIncrementalFromLatestSnapshotWithNonEmptyTable() throws Exception { - appendTwoSnapshots(); - - ScanContext scanContext = - ScanContext.builder() - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_LATEST_SNAPSHOT) - .build(); - ContinuousSplitPlannerImpl splitPlanner = - new ContinuousSplitPlannerImpl(TABLE_RESOURCE.tableLoader().clone(), scanContext, null); - - ContinuousEnumerationResult initialResult = splitPlanner.planSplits(null); - assertThat(initialResult.fromPosition()).isNull(); - // For inclusive behavior, the initial result should point to snapshot1 - // Then the next incremental scan shall discover files from latest snapshot2 (inclusive) - assertThat(initialResult.toPosition().snapshotId().longValue()) - .isEqualTo(snapshot1.snapshotId()); - assertThat(initialResult.toPosition().snapshotTimestampMs().longValue()) - .isEqualTo(snapshot1.timestampMillis()); - assertThat(initialResult.splits()).isEmpty(); - - ContinuousEnumerationResult secondResult = splitPlanner.planSplits(initialResult.toPosition()); - assertThat(secondResult.fromPosition().snapshotId().longValue()) - .isEqualTo(snapshot1.snapshotId()); - assertThat(secondResult.fromPosition().snapshotTimestampMs().longValue()) - .isEqualTo(snapshot1.timestampMillis()); - assertThat(secondResult.toPosition().snapshotId().longValue()) - .isEqualTo(snapshot2.snapshotId()); - assertThat(secondResult.toPosition().snapshotTimestampMs().longValue()) - .isEqualTo(snapshot2.timestampMillis()); - IcebergSourceSplit split = Iterables.getOnlyElement(secondResult.splits()); - assertThat(split.task().files()).hasSize(1); - Set discoveredFiles = - split.task().files().stream() - .map(fileScanTask -> fileScanTask.file().path().toString()) - .collect(Collectors.toSet()); - // should discover dataFile2 appended in snapshot2 - Set expectedFiles = ImmutableSet.of(dataFile2.path().toString()); - 
assertThat(discoveredFiles).containsExactlyElementsOf(expectedFiles); - - IcebergEnumeratorPosition lastPosition = secondResult.toPosition(); - for (int i = 0; i < 3; ++i) { - lastPosition = verifyOneCycle(splitPlanner, lastPosition).lastPosition; - } - } - - @Test - public void testIncrementalFromEarliestSnapshotWithEmptyTable() throws Exception { - ScanContext scanContext = - ScanContext.builder() - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_EARLIEST_SNAPSHOT) - .build(); - ContinuousSplitPlannerImpl splitPlanner = - new ContinuousSplitPlannerImpl(TABLE_RESOURCE.tableLoader().clone(), scanContext, null); - - ContinuousEnumerationResult emptyTableInitialDiscoveryResult = splitPlanner.planSplits(null); - assertThat(emptyTableInitialDiscoveryResult.splits()).isEmpty(); - assertThat(emptyTableInitialDiscoveryResult.fromPosition()).isNull(); - assertThat(emptyTableInitialDiscoveryResult.toPosition().snapshotId()).isNull(); - assertThat(emptyTableInitialDiscoveryResult.toPosition().snapshotTimestampMs()).isNull(); - - ContinuousEnumerationResult emptyTableSecondDiscoveryResult = - splitPlanner.planSplits(emptyTableInitialDiscoveryResult.toPosition()); - assertThat(emptyTableSecondDiscoveryResult.splits()).isEmpty(); - assertThat(emptyTableSecondDiscoveryResult.fromPosition().snapshotId()).isNull(); - assertThat(emptyTableSecondDiscoveryResult.fromPosition().snapshotTimestampMs()).isNull(); - assertThat(emptyTableSecondDiscoveryResult.toPosition().snapshotId()).isNull(); - assertThat(emptyTableSecondDiscoveryResult.toPosition().snapshotTimestampMs()).isNull(); - - // next 3 snapshots - IcebergEnumeratorPosition lastPosition = emptyTableSecondDiscoveryResult.toPosition(); - for (int i = 0; i < 3; ++i) { - lastPosition = verifyOneCycle(splitPlanner, lastPosition).lastPosition; - } - } - - @Test - public void testIncrementalFromEarliestSnapshotWithNonEmptyTable() throws Exception { - appendTwoSnapshots(); - - ScanContext scanContext = - ScanContext.builder() - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_EARLIEST_SNAPSHOT) - .build(); - ContinuousSplitPlannerImpl splitPlanner = - new ContinuousSplitPlannerImpl(TABLE_RESOURCE.tableLoader().clone(), scanContext, null); - - ContinuousEnumerationResult initialResult = splitPlanner.planSplits(null); - assertThat(initialResult.fromPosition()).isNull(); - // For inclusive behavior, the initial result should point to snapshot1's parent, - // which leads to null snapshotId and snapshotTimestampMs. 
- assertThat(initialResult.toPosition().snapshotId()).isNull(); - assertThat(initialResult.toPosition().snapshotTimestampMs()).isNull(); - assertThat(initialResult.splits()).isEmpty(); - - ContinuousEnumerationResult secondResult = splitPlanner.planSplits(initialResult.toPosition()); - assertThat(secondResult.fromPosition().snapshotId()).isNull(); - assertThat(secondResult.fromPosition().snapshotTimestampMs()).isNull(); - assertThat(secondResult.toPosition().snapshotId().longValue()) - .isEqualTo(snapshot2.snapshotId()); - assertThat(secondResult.toPosition().snapshotTimestampMs().longValue()) - .isEqualTo(snapshot2.timestampMillis()); - IcebergSourceSplit split = Iterables.getOnlyElement(secondResult.splits()); - assertThat(split.task().files()).hasSize(2); - Set discoveredFiles = - split.task().files().stream() - .map(fileScanTask -> fileScanTask.file().path().toString()) - .collect(Collectors.toSet()); - // should discover files appended in both snapshot1 and snapshot2 - Set expectedFiles = - ImmutableSet.of(dataFile1.path().toString(), dataFile2.path().toString()); - assertThat(discoveredFiles).containsExactlyInAnyOrderElementsOf(expectedFiles); - - IcebergEnumeratorPosition lastPosition = secondResult.toPosition(); - for (int i = 0; i < 3; ++i) { - lastPosition = verifyOneCycle(splitPlanner, lastPosition).lastPosition; - } - } - - @Test - public void testIncrementalFromSnapshotIdWithEmptyTable() { - ScanContext scanContextWithInvalidSnapshotId = - ScanContext.builder() - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) - .startSnapshotId(1L) - .build(); - ContinuousSplitPlannerImpl splitPlanner = - new ContinuousSplitPlannerImpl( - TABLE_RESOURCE.tableLoader().clone(), scanContextWithInvalidSnapshotId, null); - - assertThatThrownBy(() -> splitPlanner.planSplits(null)) - .isInstanceOf(IllegalArgumentException.class) - .hasMessage("Start snapshot id not found in history: 1"); - } - - @Test - public void testIncrementalFromSnapshotIdWithInvalidIds() throws Exception { - appendTwoSnapshots(); - - // find an invalid snapshotId - long invalidSnapshotId = 0L; - while (invalidSnapshotId == snapshot1.snapshotId() - || invalidSnapshotId == snapshot2.snapshotId()) { - invalidSnapshotId++; - } - - ScanContext scanContextWithInvalidSnapshotId = - ScanContext.builder() - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) - .startSnapshotId(invalidSnapshotId) - .build(); - - ContinuousSplitPlannerImpl splitPlanner = - new ContinuousSplitPlannerImpl( - TABLE_RESOURCE.tableLoader().clone(), scanContextWithInvalidSnapshotId, null); - - assertThatThrownBy(() -> splitPlanner.planSplits(null)) - .isInstanceOf(IllegalArgumentException.class) - .hasMessage("Start snapshot id not found in history: " + invalidSnapshotId); - } - - @Test - public void testIncrementalFromSnapshotId() throws Exception { - appendTwoSnapshots(); - - ScanContext scanContext = - ScanContext.builder() - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) - .startSnapshotId(snapshot2.snapshotId()) - .build(); - ContinuousSplitPlannerImpl splitPlanner = - new ContinuousSplitPlannerImpl(TABLE_RESOURCE.tableLoader().clone(), scanContext, null); - - ContinuousEnumerationResult initialResult = splitPlanner.planSplits(null); - assertThat(initialResult.fromPosition()).isNull(); - // For inclusive behavior of snapshot2, the initial result should point to snapshot1 (as - // snapshot2's parent) - assertThat(initialResult.toPosition().snapshotId().longValue()) - 
.isEqualTo(snapshot1.snapshotId()); - assertThat(initialResult.toPosition().snapshotTimestampMs().longValue()) - .isEqualTo(snapshot1.timestampMillis()); - assertThat(initialResult.splits()).isEmpty(); - - ContinuousEnumerationResult secondResult = splitPlanner.planSplits(initialResult.toPosition()); - assertThat(secondResult.fromPosition().snapshotId().longValue()) - .isEqualTo(snapshot1.snapshotId()); - assertThat(secondResult.fromPosition().snapshotTimestampMs().longValue()) - .isEqualTo(snapshot1.timestampMillis()); - assertThat(secondResult.toPosition().snapshotId().longValue()) - .isEqualTo(snapshot2.snapshotId()); - assertThat(secondResult.toPosition().snapshotTimestampMs().longValue()) - .isEqualTo(snapshot2.timestampMillis()); - IcebergSourceSplit split = Iterables.getOnlyElement(secondResult.splits()); - assertThat(split.task().files()).hasSize(1); - Set discoveredFiles = - split.task().files().stream() - .map(fileScanTask -> fileScanTask.file().path().toString()) - .collect(Collectors.toSet()); - // should discover dataFile2 appended in snapshot2 - Set expectedFiles = ImmutableSet.of(dataFile2.path().toString()); - assertThat(discoveredFiles).containsExactlyElementsOf(expectedFiles); - - IcebergEnumeratorPosition lastPosition = secondResult.toPosition(); - for (int i = 0; i < 3; ++i) { - lastPosition = verifyOneCycle(splitPlanner, lastPosition).lastPosition; - } - } - - @Test - public void testIncrementalFromSnapshotTimestampWithEmptyTable() { - ScanContext scanContextWithInvalidSnapshotId = - ScanContext.builder() - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) - .startSnapshotTimestamp(1L) - .build(); - ContinuousSplitPlannerImpl splitPlanner = - new ContinuousSplitPlannerImpl( - TABLE_RESOURCE.tableLoader().clone(), scanContextWithInvalidSnapshotId, null); - - assertThatThrownBy(() -> splitPlanner.planSplits(null)) - .isInstanceOf(IllegalArgumentException.class) - .hasMessage("Cannot find a snapshot after: 1"); - } - - @Test - public void testIncrementalFromSnapshotTimestampWithInvalidIds() throws Exception { - appendTwoSnapshots(); - - long invalidSnapshotTimestampMs = snapshot2.timestampMillis() + 1000L; - - ScanContext scanContextWithInvalidSnapshotId = - ScanContext.builder() - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) - .startSnapshotTimestamp(invalidSnapshotTimestampMs) - .build(); - - ContinuousSplitPlannerImpl splitPlanner = - new ContinuousSplitPlannerImpl( - TABLE_RESOURCE.tableLoader().clone(), scanContextWithInvalidSnapshotId, null); - - assertThatThrownBy(() -> splitPlanner.planSplits(null)) - .isInstanceOf(IllegalArgumentException.class) - .hasMessageStartingWith("Cannot find a snapshot after:"); - } - - @Test - public void testIncrementalFromSnapshotTimestamp() throws Exception { - appendTwoSnapshots(); - - ScanContext scanContext = - ScanContext.builder() - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) - .startSnapshotTimestamp(snapshot2.timestampMillis()) - .build(); - ContinuousSplitPlannerImpl splitPlanner = - new ContinuousSplitPlannerImpl(TABLE_RESOURCE.tableLoader().clone(), scanContext, null); - - ContinuousEnumerationResult initialResult = splitPlanner.planSplits(null); - assertThat(initialResult.fromPosition()).isNull(); - // For inclusive behavior, the initial result should point to snapshot1 (as snapshot2's parent). 
- assertThat(initialResult.toPosition().snapshotId().longValue()) - .isEqualTo(snapshot1.snapshotId()); - assertThat(initialResult.toPosition().snapshotTimestampMs().longValue()) - .isEqualTo(snapshot1.timestampMillis()); - assertThat(initialResult.splits()).isEmpty(); - - ContinuousEnumerationResult secondResult = splitPlanner.planSplits(initialResult.toPosition()); - assertThat(secondResult.fromPosition().snapshotId().longValue()) - .isEqualTo(snapshot1.snapshotId()); - assertThat(secondResult.fromPosition().snapshotTimestampMs().longValue()) - .isEqualTo(snapshot1.timestampMillis()); - assertThat(secondResult.toPosition().snapshotId().longValue()) - .isEqualTo(snapshot2.snapshotId()); - assertThat(secondResult.toPosition().snapshotTimestampMs().longValue()) - .isEqualTo(snapshot2.timestampMillis()); - IcebergSourceSplit split = Iterables.getOnlyElement(secondResult.splits()); - assertThat(split.task().files()).hasSize(1); - Set discoveredFiles = - split.task().files().stream() - .map(fileScanTask -> fileScanTask.file().path().toString()) - .collect(Collectors.toSet()); - // should discover dataFile2 appended in snapshot2 - Set expectedFiles = ImmutableSet.of(dataFile2.path().toString()); - assertThat(discoveredFiles).containsExactlyElementsOf(expectedFiles); - - IcebergEnumeratorPosition lastPosition = secondResult.toPosition(); - for (int i = 0; i < 3; ++i) { - lastPosition = verifyOneCycle(splitPlanner, lastPosition).lastPosition; - } - } - - @Test - public void testMaxPlanningSnapshotCount() throws Exception { - appendTwoSnapshots(); - // append 3 more snapshots - for (int i = 2; i < 5; ++i) { - appendSnapshot(i, 2); - } - - ScanContext scanContext = - ScanContext.builder() - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_EARLIEST_SNAPSHOT) - // limit to 1 snapshot per discovery - .maxPlanningSnapshotCount(1) - .build(); - ContinuousSplitPlannerImpl splitPlanner = - new ContinuousSplitPlannerImpl(TABLE_RESOURCE.tableLoader().clone(), scanContext, null); - - ContinuousEnumerationResult initialResult = splitPlanner.planSplits(null); - assertThat(initialResult.fromPosition()).isNull(); - // For inclusive behavior, the initial result should point to snapshot1's parent, - // which leads to null snapshotId and snapshotTimestampMs. 
- assertThat(initialResult.toPosition().snapshotId()).isNull(); - assertThat(initialResult.toPosition().snapshotTimestampMs()).isNull(); - assertThat(initialResult.splits()).isEmpty(); - - ContinuousEnumerationResult secondResult = splitPlanner.planSplits(initialResult.toPosition()); - // should discover dataFile1 appended in snapshot1 - verifyMaxPlanningSnapshotCountResult( - secondResult, null, snapshot1, ImmutableSet.of(dataFile1.path().toString())); - - ContinuousEnumerationResult thirdResult = splitPlanner.planSplits(secondResult.toPosition()); - // should discover dataFile2 appended in snapshot2 - verifyMaxPlanningSnapshotCountResult( - thirdResult, snapshot1, snapshot2, ImmutableSet.of(dataFile2.path().toString())); - } - - @Test - public void testTableScanNoStats() throws Exception { - appendTwoSnapshots(); - - ScanContext scanContext = - ScanContext.builder() - .includeColumnStats(false) - .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) - .build(); - ContinuousSplitPlannerImpl splitPlanner = - new ContinuousSplitPlannerImpl(TABLE_RESOURCE.tableLoader().clone(), scanContext, null); - - ContinuousEnumerationResult initialResult = splitPlanner.planSplits(null); - assertThat(initialResult.splits()).hasSize(1); - IcebergSourceSplit split = Iterables.getOnlyElement(initialResult.splits()); - assertThat(split.task().files()).hasSize(2); - verifyStatCount(split, 0); - - IcebergEnumeratorPosition lastPosition = initialResult.toPosition(); - for (int i = 0; i < 3; ++i) { - CycleResult result = verifyOneCycle(splitPlanner, lastPosition); - verifyStatCount(result.split, 0); - lastPosition = result.lastPosition; - } - } - - @Test - public void testTableScanAllStats() throws Exception { - appendTwoSnapshots(); - - ScanContext scanContext = - ScanContext.builder() - .includeColumnStats(true) - .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) - .build(); - ContinuousSplitPlannerImpl splitPlanner = - new ContinuousSplitPlannerImpl(TABLE_RESOURCE.tableLoader().clone(), scanContext, null); - - ContinuousEnumerationResult initialResult = splitPlanner.planSplits(null); - assertThat(initialResult.splits()).hasSize(1); - IcebergSourceSplit split = Iterables.getOnlyElement(initialResult.splits()); - assertThat(split.task().files()).hasSize(2); - verifyStatCount(split, 3); - - IcebergEnumeratorPosition lastPosition = initialResult.toPosition(); - for (int i = 0; i < 3; ++i) { - CycleResult result = verifyOneCycle(splitPlanner, lastPosition); - verifyStatCount(result.split, 3); - lastPosition = result.lastPosition; - } - } - - @Test - public void testTableScanSingleStat() throws Exception { - appendTwoSnapshots(); - - ScanContext scanContext = - ScanContext.builder() - .includeColumnStats(ImmutableSet.of("data")) - .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) - .build(); - ContinuousSplitPlannerImpl splitPlanner = - new ContinuousSplitPlannerImpl(TABLE_RESOURCE.tableLoader().clone(), scanContext, null); - - ContinuousEnumerationResult initialResult = splitPlanner.planSplits(null); - assertThat(initialResult.splits()).hasSize(1); - IcebergSourceSplit split = Iterables.getOnlyElement(initialResult.splits()); - assertThat(split.task().files()).hasSize(2); - verifyStatCount(split, 1); - - IcebergEnumeratorPosition lastPosition = initialResult.toPosition(); - for (int i = 0; i < 3; ++i) { - CycleResult result = verifyOneCycle(splitPlanner, lastPosition); - verifyStatCount(result.split, 1); - lastPosition = result.lastPosition; - } - } 
- - private void verifyStatCount(IcebergSourceSplit split, int expected) { - if (expected == 0) { - split - .task() - .files() - .forEach( - f -> { - assertThat(f.file().valueCounts()).isNull(); - assertThat(f.file().columnSizes()).isNull(); - assertThat(f.file().lowerBounds()).isNull(); - assertThat(f.file().upperBounds()).isNull(); - assertThat(f.file().nanValueCounts()).isNull(); - assertThat(f.file().nullValueCounts()).isNull(); - }); - } else { - split - .task() - .files() - .forEach( - f -> { - assertThat(f.file().valueCounts()).hasSize(expected); - assertThat(f.file().columnSizes()).hasSize(expected); - assertThat(f.file().lowerBounds()).hasSize(expected); - assertThat(f.file().upperBounds()).hasSize(expected); - assertThat(f.file().nullValueCounts()).hasSize(expected); - // The nanValue is not counted for long and string fields - assertThat(f.file().nanValueCounts()).isEmpty(); - }); - } - } - - private void verifyMaxPlanningSnapshotCountResult( - ContinuousEnumerationResult result, - Snapshot fromSnapshotExclusive, - Snapshot toSnapshotInclusive, - Set expectedFiles) { - if (fromSnapshotExclusive == null) { - assertThat(result.fromPosition().snapshotId()).isNull(); - assertThat(result.fromPosition().snapshotTimestampMs()).isNull(); - } else { - assertThat(result.fromPosition().snapshotId().longValue()) - .isEqualTo(fromSnapshotExclusive.snapshotId()); - assertThat(result.fromPosition().snapshotTimestampMs().longValue()) - .isEqualTo(fromSnapshotExclusive.timestampMillis()); - } - assertThat(result.toPosition().snapshotId().longValue()) - .isEqualTo(toSnapshotInclusive.snapshotId()); - assertThat(result.toPosition().snapshotTimestampMs().longValue()) - .isEqualTo(toSnapshotInclusive.timestampMillis()); - // should only have one split with one data file, because split discover is limited to - // one snapshot and each snapshot has only one data file appended. - IcebergSourceSplit split = Iterables.getOnlyElement(result.splits()); - assertThat(split.task().files()).hasSize(1); - Set discoveredFiles = - split.task().files().stream() - .map(fileScanTask -> fileScanTask.file().path().toString()) - .collect(Collectors.toSet()); - assertThat(discoveredFiles).containsExactlyElementsOf(expectedFiles); - } - - private Snapshot appendSnapshot(long seed, int numRecords) throws Exception { - List batch = RandomGenericData.generate(TestFixtures.SCHEMA, numRecords, seed); - DataFile dataFile = dataAppender.writeFile(null, batch); - dataAppender.appendToTable(dataFile); - return TABLE_RESOURCE.table().currentSnapshot(); - } - - private static class CycleResult { - IcebergEnumeratorPosition lastPosition; - IcebergSourceSplit split; - - CycleResult(IcebergEnumeratorPosition lastPosition, IcebergSourceSplit split) { - this.lastPosition = lastPosition; - this.split = split; - } - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousSplitPlannerImplStartStrategy.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousSplitPlannerImplStartStrategy.java deleted file mode 100644 index b2185675340f..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousSplitPlannerImplStartStrategy.java +++ /dev/null @@ -1,200 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.enumerator; - -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; - -import java.io.IOException; -import java.nio.file.Path; -import java.util.List; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Snapshot; -import org.apache.iceberg.data.GenericAppenderHelper; -import org.apache.iceberg.data.RandomGenericData; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.HadoopTableExtension; -import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.flink.source.ScanContext; -import org.apache.iceberg.flink.source.StreamingStartingStrategy; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.extension.RegisterExtension; -import org.junit.jupiter.api.io.TempDir; - -public class TestContinuousSplitPlannerImplStartStrategy { - private static final FileFormat FILE_FORMAT = FileFormat.PARQUET; - - @TempDir protected Path temporaryFolder; - - @RegisterExtension - private static final HadoopTableExtension TABLE_RESOURCE = - new HadoopTableExtension(TestFixtures.DATABASE, TestFixtures.TABLE, TestFixtures.SCHEMA); - - private GenericAppenderHelper dataAppender; - private Snapshot snapshot1; - private Snapshot snapshot2; - private Snapshot snapshot3; - - @BeforeEach - public void before() throws IOException { - dataAppender = new GenericAppenderHelper(TABLE_RESOURCE.table(), FILE_FORMAT, temporaryFolder); - } - - private void appendThreeSnapshots() throws IOException { - List batch1 = RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L); - dataAppender.appendToTable(batch1); - snapshot1 = TABLE_RESOURCE.table().currentSnapshot(); - - List batch2 = RandomGenericData.generate(TestFixtures.SCHEMA, 2, 1L); - dataAppender.appendToTable(batch2); - snapshot2 = TABLE_RESOURCE.table().currentSnapshot(); - - List batch3 = RandomGenericData.generate(TestFixtures.SCHEMA, 2, 2L); - dataAppender.appendToTable(batch3); - snapshot3 = TABLE_RESOURCE.table().currentSnapshot(); - } - - @Test - public void testTableScanThenIncrementalStrategy() throws IOException { - ScanContext scanContext = - ScanContext.builder() - .streaming(true) - .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) - .build(); - - assertThat(ContinuousSplitPlannerImpl.startSnapshot(TABLE_RESOURCE.table(), scanContext)) - .isNotPresent(); - - appendThreeSnapshots(); - Snapshot startSnapshot = - ContinuousSplitPlannerImpl.startSnapshot(TABLE_RESOURCE.table(), scanContext).get(); - assertThat(startSnapshot.snapshotId()).isEqualTo(snapshot3.snapshotId()); - } - - @Test - public void testForLatestSnapshotStrategy() throws IOException { - ScanContext scanContext = - ScanContext.builder() - .streaming(true) - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_LATEST_SNAPSHOT) - .build(); - - 
assertThat(ContinuousSplitPlannerImpl.startSnapshot(TABLE_RESOURCE.table(), scanContext)) - .isNotPresent(); - - appendThreeSnapshots(); - Snapshot startSnapshot = - ContinuousSplitPlannerImpl.startSnapshot(TABLE_RESOURCE.table(), scanContext).get(); - assertThat(startSnapshot.snapshotId()).isEqualTo(snapshot3.snapshotId()); - } - - @Test - public void testForEarliestSnapshotStrategy() throws IOException { - ScanContext scanContext = - ScanContext.builder() - .streaming(true) - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_EARLIEST_SNAPSHOT) - .build(); - - assertThat(ContinuousSplitPlannerImpl.startSnapshot(TABLE_RESOURCE.table(), scanContext)) - .isNotPresent(); - - appendThreeSnapshots(); - Snapshot startSnapshot = - ContinuousSplitPlannerImpl.startSnapshot(TABLE_RESOURCE.table(), scanContext).get(); - assertThat(startSnapshot.snapshotId()).isEqualTo(snapshot1.snapshotId()); - } - - @Test - public void testForSpecificSnapshotIdStrategy() throws IOException { - ScanContext scanContextInvalidSnapshotId = - ScanContext.builder() - .streaming(true) - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) - .startSnapshotId(1L) - .build(); - - assertThatThrownBy( - () -> - ContinuousSplitPlannerImpl.startSnapshot( - TABLE_RESOURCE.table(), scanContextInvalidSnapshotId)) - .isInstanceOf(IllegalArgumentException.class) - .hasMessage("Start snapshot id not found in history: 1"); - - appendThreeSnapshots(); - - ScanContext scanContext = - ScanContext.builder() - .streaming(true) - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) - .startSnapshotId(snapshot2.snapshotId()) - .build(); - - Snapshot startSnapshot = - ContinuousSplitPlannerImpl.startSnapshot(TABLE_RESOURCE.table(), scanContext).get(); - assertThat(startSnapshot.snapshotId()).isEqualTo(snapshot2.snapshotId()); - } - - @Test - public void testForSpecificSnapshotTimestampStrategySnapshot2() throws IOException { - ScanContext scanContextInvalidSnapshotTimestamp = - ScanContext.builder() - .streaming(true) - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) - .startSnapshotTimestamp(1L) - .build(); - - assertThatThrownBy( - () -> - ContinuousSplitPlannerImpl.startSnapshot( - TABLE_RESOURCE.table(), scanContextInvalidSnapshotTimestamp)) - .isInstanceOf(IllegalArgumentException.class) - .hasMessageStartingWith("Cannot find a snapshot after: "); - - appendThreeSnapshots(); - - ScanContext scanContext = - ScanContext.builder() - .streaming(true) - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) - .startSnapshotTimestamp(snapshot2.timestampMillis()) - .build(); - - Snapshot startSnapshot = - ContinuousSplitPlannerImpl.startSnapshot(TABLE_RESOURCE.table(), scanContext).get(); - assertThat(startSnapshot.snapshotId()).isEqualTo(snapshot2.snapshotId()); - } - - @Test - public void testForSpecificSnapshotTimestampStrategySnapshot2Minus1() throws IOException { - appendThreeSnapshots(); - - ScanContext config = - ScanContext.builder() - .streaming(true) - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) - .startSnapshotTimestamp(snapshot2.timestampMillis() - 1L) - .build(); - - Snapshot startSnapshot = - ContinuousSplitPlannerImpl.startSnapshot(TABLE_RESOURCE.table(), config).get(); - assertThat(startSnapshot.snapshotId()).isEqualTo(snapshot2.snapshotId()); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestEnumerationHistory.java 
b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestEnumerationHistory.java deleted file mode 100644 index feefcb98646b..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestEnumerationHistory.java +++ /dev/null @@ -1,135 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.enumerator; - -import static org.assertj.core.api.Assertions.assertThat; - -import org.junit.jupiter.api.Test; - -public class TestEnumerationHistory { - private static final int MAX_HISTORY_SIZE = 3; - private static final int FEW_PENDING_SPLITS = 2; - private static final int TOO_MANY_PENDING_SPLITS = 100; - - @Test - public void testEmptyHistory() { - EnumerationHistory history = new EnumerationHistory(MAX_HISTORY_SIZE); - int[] expectedHistorySnapshot = new int[0]; - testHistory(history, expectedHistorySnapshot); - } - - @Test - public void testNotFullHistory() { - EnumerationHistory history = new EnumerationHistory(3); - history.add(1); - history.add(2); - int[] expectedHistorySnapshot = {1, 2}; - testHistory(history, expectedHistorySnapshot); - } - - @Test - public void testExactFullHistory() { - EnumerationHistory history = new EnumerationHistory(3); - history.add(1); - history.add(2); - history.add(3); - int[] expectedHistorySnapshot = {1, 2, 3}; - testHistory(history, expectedHistorySnapshot); - } - - @Test - public void testOneMoreThanFullHistory() { - EnumerationHistory history = new EnumerationHistory(3); - history.add(1); - history.add(2); - history.add(3); - history.add(4); - int[] expectedHistorySnapshot = {2, 3, 4}; - testHistory(history, expectedHistorySnapshot); - } - - @Test - public void testTwoMoreThanFullHistory() { - EnumerationHistory history = new EnumerationHistory(3); - history.add(1); - history.add(2); - history.add(3); - history.add(4); - history.add(5); - int[] expectedHistorySnapshot = {3, 4, 5}; - testHistory(history, expectedHistorySnapshot); - } - - @Test - public void testThreeMoreThanFullHistory() { - EnumerationHistory history = new EnumerationHistory(3); - history.add(1); - history.add(2); - history.add(3); - history.add(4); - history.add(5); - history.add(6); - int[] expectedHistorySnapshot = {4, 5, 6}; - testHistory(history, expectedHistorySnapshot); - } - - private void testHistory(EnumerationHistory history, int[] expectedHistorySnapshot) { - assertThat(history.shouldPauseSplitDiscovery(FEW_PENDING_SPLITS)).isFalse(); - if (history.hasFullHistory()) { - // throttle because pending split count is more than the sum of enumeration history - assertThat(history.shouldPauseSplitDiscovery(TOO_MANY_PENDING_SPLITS)).isTrue(); - } else { - // skipped throttling check because there is not enough history - 
assertThat(history.shouldPauseSplitDiscovery(TOO_MANY_PENDING_SPLITS)).isFalse(); - } - - int[] historySnapshot = history.snapshot(); - assertThat(historySnapshot).containsExactly(expectedHistorySnapshot); - - EnumerationHistory restoredHistory = new EnumerationHistory(MAX_HISTORY_SIZE); - restoredHistory.restore(historySnapshot); - - assertThat(history.shouldPauseSplitDiscovery(FEW_PENDING_SPLITS)).isFalse(); - if (history.hasFullHistory()) { - // throttle because pending split count is more than the sum of enumeration history - assertThat(history.shouldPauseSplitDiscovery(TOO_MANY_PENDING_SPLITS)).isTrue(); - } else { - // skipped throttling check because there is not enough history - assertThat(history.shouldPauseSplitDiscovery(30)).isFalse(); - } - } - - @Test - public void testRestoreDifferentSize() { - EnumerationHistory history = new EnumerationHistory(3); - history.add(1); - history.add(2); - history.add(3); - int[] historySnapshot = history.snapshot(); - - EnumerationHistory smallerHistory = new EnumerationHistory(2); - smallerHistory.restore(historySnapshot); - int[] expectedRestoredHistorySnapshot = {2, 3}; - assertThat(smallerHistory.snapshot()).containsExactly(expectedRestoredHistorySnapshot); - - EnumerationHistory largerHisotry = new EnumerationHistory(4); - largerHisotry.restore(historySnapshot); - assertThat(largerHisotry.snapshot()).containsExactly(historySnapshot); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestIcebergEnumeratorStateSerializer.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestIcebergEnumeratorStateSerializer.java deleted file mode 100644 index 2520a6b763e4..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestIcebergEnumeratorStateSerializer.java +++ /dev/null @@ -1,146 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source.enumerator; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.IOException; -import java.nio.file.Path; -import java.util.Collection; -import java.util.Collections; -import java.util.Iterator; -import java.util.List; -import org.apache.iceberg.Parameter; -import org.apache.iceberg.ParameterizedTestExtension; -import org.apache.iceberg.Parameters; -import org.apache.iceberg.flink.source.SplitHelpers; -import org.apache.iceberg.flink.source.split.IcebergSourceSplit; -import org.apache.iceberg.flink.source.split.IcebergSourceSplitState; -import org.apache.iceberg.flink.source.split.IcebergSourceSplitStatus; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.junit.jupiter.api.TestTemplate; -import org.junit.jupiter.api.extension.ExtendWith; -import org.junit.jupiter.api.io.TempDir; - -@ExtendWith(ParameterizedTestExtension.class) -public class TestIcebergEnumeratorStateSerializer { - @TempDir protected Path temporaryFolder; - - private final IcebergEnumeratorStateSerializer serializer = - new IcebergEnumeratorStateSerializer(true); - - @Parameter(index = 0) - protected int version; - - @Parameters(name = "version={0}") - public static Object[][] parameters() { - return new Object[][] {new Object[] {1}, new Object[] {2}}; - } - - @TestTemplate - public void testEmptySnapshotIdAndPendingSplits() throws Exception { - IcebergEnumeratorState enumeratorState = new IcebergEnumeratorState(Collections.emptyList()); - testSerializer(enumeratorState); - } - - @TestTemplate - public void testSomeSnapshotIdAndEmptyPendingSplits() throws Exception { - IcebergEnumeratorPosition position = - IcebergEnumeratorPosition.of(1L, System.currentTimeMillis()); - - IcebergEnumeratorState enumeratorState = - new IcebergEnumeratorState(position, Collections.emptyList()); - testSerializer(enumeratorState); - } - - @TestTemplate - public void testSomeSnapshotIdAndPendingSplits() throws Exception { - IcebergEnumeratorPosition position = - IcebergEnumeratorPosition.of(2L, System.currentTimeMillis()); - List splits = - SplitHelpers.createSplitsFromTransientHadoopTable(temporaryFolder, 3, 1); - Collection pendingSplits = Lists.newArrayList(); - pendingSplits.add( - new IcebergSourceSplitState(splits.get(0), IcebergSourceSplitStatus.UNASSIGNED)); - pendingSplits.add( - new IcebergSourceSplitState(splits.get(1), IcebergSourceSplitStatus.ASSIGNED)); - pendingSplits.add( - new IcebergSourceSplitState(splits.get(2), IcebergSourceSplitStatus.COMPLETED)); - - IcebergEnumeratorState enumeratorState = new IcebergEnumeratorState(position, pendingSplits); - testSerializer(enumeratorState); - } - - @TestTemplate - public void testEnumerationSplitCountHistory() throws Exception { - if (version == 2) { - IcebergEnumeratorPosition position = - IcebergEnumeratorPosition.of(2L, System.currentTimeMillis()); - List splits = - SplitHelpers.createSplitsFromTransientHadoopTable(temporaryFolder, 3, 1); - Collection pendingSplits = Lists.newArrayList(); - pendingSplits.add( - new IcebergSourceSplitState(splits.get(0), IcebergSourceSplitStatus.UNASSIGNED)); - pendingSplits.add( - new IcebergSourceSplitState(splits.get(1), IcebergSourceSplitStatus.ASSIGNED)); - pendingSplits.add( - new IcebergSourceSplitState(splits.get(2), IcebergSourceSplitStatus.COMPLETED)); - int[] enumerationSplitCountHistory = {1, 2, 3}; - - IcebergEnumeratorState enumeratorState = - new IcebergEnumeratorState(position, pendingSplits, enumerationSplitCountHistory); - 
testSerializer(enumeratorState); - } - } - - private void testSerializer(IcebergEnumeratorState enumeratorState) throws IOException { - byte[] result; - if (version == 1) { - result = serializer.serializeV1(enumeratorState); - } else { - result = serializer.serialize(enumeratorState); - } - - IcebergEnumeratorState deserialized = serializer.deserialize(version, result); - assertEnumeratorStateEquals(enumeratorState, deserialized); - } - - private void assertEnumeratorStateEquals( - IcebergEnumeratorState expected, IcebergEnumeratorState actual) { - assertThat(actual.lastEnumeratedPosition()).isEqualTo(expected.lastEnumeratedPosition()); - - assertThat(actual.pendingSplits()).hasSameSizeAs(expected.pendingSplits()); - Iterator expectedIterator = expected.pendingSplits().iterator(); - Iterator actualIterator = actual.pendingSplits().iterator(); - for (int i = 0; i < expected.pendingSplits().size(); ++i) { - IcebergSourceSplitState expectedSplitState = expectedIterator.next(); - IcebergSourceSplitState actualSplitState = actualIterator.next(); - assertThat(actualSplitState.split().splitId()) - .isEqualTo(expectedSplitState.split().splitId()); - assertThat(actualSplitState.split().fileOffset()) - .isEqualTo(expectedSplitState.split().fileOffset()); - assertThat(actualSplitState.split().recordOffset()) - .isEqualTo(expectedSplitState.split().recordOffset()); - assertThat(actualSplitState.status()).isEqualTo(expectedSplitState.status()); - } - - assertThat(actual.enumerationSplitCountHistory()) - .containsExactly(expected.enumerationSplitCountHistory()); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderFunctionTestBase.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderFunctionTestBase.java deleted file mode 100644 index 0d1d0ce3217c..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderFunctionTestBase.java +++ /dev/null @@ -1,218 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source.reader; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.IOException; -import java.nio.file.Path; -import java.util.List; -import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; -import org.apache.iceberg.CombinedScanTask; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Parameter; -import org.apache.iceberg.ParameterizedTestExtension; -import org.apache.iceberg.Parameters; -import org.apache.iceberg.Schema; -import org.apache.iceberg.data.GenericAppenderFactory; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.flink.source.split.IcebergSourceSplit; -import org.apache.iceberg.io.CloseableIterator; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.junit.jupiter.api.TestTemplate; -import org.junit.jupiter.api.extension.ExtendWith; -import org.junit.jupiter.api.io.TempDir; - -@ExtendWith(ParameterizedTestExtension.class) -public abstract class ReaderFunctionTestBase { - - @Parameters(name = "fileFormat={0}") - public static Object[][] parameters() { - return new Object[][] { - new Object[] {FileFormat.AVRO}, - new Object[] {FileFormat.ORC}, - new Object[] {FileFormat.PARQUET} - }; - } - - @TempDir protected Path temporaryFolder; - - protected abstract ReaderFunction readerFunction(); - - protected abstract void assertRecords(List expected, List actual, Schema schema); - - @Parameter(index = 0) - private FileFormat fileFormat; - - private final GenericAppenderFactory appenderFactory = - new GenericAppenderFactory(TestFixtures.SCHEMA); - - private void assertRecordsAndPosition( - List expectedRecords, - int expectedFileOffset, - long startRecordOffset, - RecordsWithSplitIds> batch) { - batch.nextSplit(); - List actualRecords = Lists.newArrayList(); - long recordOffset = startRecordOffset; - RecordAndPosition recordAndPosition; - while ((recordAndPosition = batch.nextRecordFromSplit()) != null) { - actualRecords.add(recordAndPosition.record()); - assertThat(recordAndPosition.fileOffset()).isEqualTo(expectedFileOffset); - assertThat(recordAndPosition.recordOffset() - 1).isEqualTo(recordOffset); - recordOffset++; - } - - assertThat(actualRecords).hasSameSizeAs(expectedRecords); - assertRecords(expectedRecords, actualRecords, TestFixtures.SCHEMA); - } - - @TestTemplate - public void testNoCheckpointedPosition() throws IOException { - List> recordBatchList = - ReaderUtil.createRecordBatchList(TestFixtures.SCHEMA, 3, 2); - CombinedScanTask combinedScanTask = - ReaderUtil.createCombinedScanTask( - recordBatchList, temporaryFolder, fileFormat, appenderFactory); - IcebergSourceSplit split = IcebergSourceSplit.fromCombinedScanTask(combinedScanTask); - CloseableIterator>> reader = - readerFunction().apply(split); - - RecordsWithSplitIds> batch0 = reader.next(); - assertRecordsAndPosition(recordBatchList.get(0), 0, 0L, batch0); - batch0.recycle(); - - RecordsWithSplitIds> batch1 = reader.next(); - assertRecordsAndPosition(recordBatchList.get(1), 1, 0L, batch1); - batch1.recycle(); - - RecordsWithSplitIds> batch2 = reader.next(); - assertRecordsAndPosition(recordBatchList.get(2), 2, 0L, batch2); - batch2.recycle(); - } - - @TestTemplate - public void testCheckpointedPositionBeforeFirstFile() throws IOException { - List> recordBatchList = - ReaderUtil.createRecordBatchList(TestFixtures.SCHEMA, 3, 2); - CombinedScanTask combinedScanTask = - ReaderUtil.createCombinedScanTask( - recordBatchList, 
temporaryFolder, fileFormat, appenderFactory); - IcebergSourceSplit split = IcebergSourceSplit.fromCombinedScanTask(combinedScanTask, 0, 0L); - CloseableIterator>> reader = - readerFunction().apply(split); - - RecordsWithSplitIds> batch0 = reader.next(); - assertRecordsAndPosition(recordBatchList.get(0), 0, 0L, batch0); - batch0.recycle(); - - RecordsWithSplitIds> batch1 = reader.next(); - assertRecordsAndPosition(recordBatchList.get(1), 1, 0L, batch1); - batch1.recycle(); - - RecordsWithSplitIds> batch2 = reader.next(); - assertRecordsAndPosition(recordBatchList.get(2), 2, 0L, batch2); - batch2.recycle(); - } - - @TestTemplate - public void testCheckpointedPositionMiddleFirstFile() throws IOException { - List> recordBatchList = - ReaderUtil.createRecordBatchList(TestFixtures.SCHEMA, 3, 2); - CombinedScanTask combinedScanTask = - ReaderUtil.createCombinedScanTask( - recordBatchList, temporaryFolder, fileFormat, appenderFactory); - IcebergSourceSplit split = IcebergSourceSplit.fromCombinedScanTask(combinedScanTask, 0, 1L); - CloseableIterator>> reader = - readerFunction().apply(split); - - RecordsWithSplitIds> batch0 = reader.next(); - assertRecordsAndPosition(recordBatchList.get(0).subList(1, 2), 0, 1L, batch0); - batch0.recycle(); - - RecordsWithSplitIds> batch1 = reader.next(); - assertRecordsAndPosition(recordBatchList.get(1), 1, 0L, batch1); - batch1.recycle(); - - RecordsWithSplitIds> batch2 = reader.next(); - assertRecordsAndPosition(recordBatchList.get(2), 2, 0L, batch2); - batch2.recycle(); - } - - @TestTemplate - public void testCheckpointedPositionAfterFirstFile() throws IOException { - List> recordBatchList = - ReaderUtil.createRecordBatchList(TestFixtures.SCHEMA, 3, 2); - CombinedScanTask combinedScanTask = - ReaderUtil.createCombinedScanTask( - recordBatchList, temporaryFolder, fileFormat, appenderFactory); - IcebergSourceSplit split = IcebergSourceSplit.fromCombinedScanTask(combinedScanTask, 0, 2L); - CloseableIterator>> reader = - readerFunction().apply(split); - - RecordsWithSplitIds> batch1 = reader.next(); - assertRecordsAndPosition(recordBatchList.get(1), 1, 0L, batch1); - batch1.recycle(); - - RecordsWithSplitIds> batch2 = reader.next(); - assertRecordsAndPosition(recordBatchList.get(2), 2, 0L, batch2); - batch2.recycle(); - } - - @TestTemplate - public void testCheckpointedPositionBeforeSecondFile() throws IOException { - List> recordBatchList = - ReaderUtil.createRecordBatchList(TestFixtures.SCHEMA, 3, 2); - CombinedScanTask combinedScanTask = - ReaderUtil.createCombinedScanTask( - recordBatchList, temporaryFolder, fileFormat, appenderFactory); - IcebergSourceSplit split = IcebergSourceSplit.fromCombinedScanTask(combinedScanTask, 1, 0L); - CloseableIterator>> reader = - readerFunction().apply(split); - - RecordsWithSplitIds> batch1 = reader.next(); - assertRecordsAndPosition(recordBatchList.get(1), 1, 0L, batch1); - batch1.recycle(); - - RecordsWithSplitIds> batch2 = reader.next(); - assertRecordsAndPosition(recordBatchList.get(2), 2, 0L, batch2); - batch2.recycle(); - } - - @TestTemplate - public void testCheckpointedPositionMidSecondFile() throws IOException { - List> recordBatchList = - ReaderUtil.createRecordBatchList(TestFixtures.SCHEMA, 3, 2); - CombinedScanTask combinedScanTask = - ReaderUtil.createCombinedScanTask( - recordBatchList, temporaryFolder, fileFormat, appenderFactory); - IcebergSourceSplit split = IcebergSourceSplit.fromCombinedScanTask(combinedScanTask, 1, 1L); - CloseableIterator>> reader = - readerFunction().apply(split); - - RecordsWithSplitIds> 
batch1 = reader.next(); - assertRecordsAndPosition(recordBatchList.get(1).subList(1, 2), 1, 1L, batch1); - batch1.recycle(); - - RecordsWithSplitIds> batch2 = reader.next(); - assertRecordsAndPosition(recordBatchList.get(2), 2, 0L, batch2); - batch2.recycle(); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderUtil.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderUtil.java deleted file mode 100644 index 0edf8ae009fe..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderUtil.java +++ /dev/null @@ -1,128 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.reader; - -import java.io.File; -import java.io.IOException; -import java.nio.file.Path; -import java.util.Collections; -import java.util.List; -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.BaseCombinedScanTask; -import org.apache.iceberg.BaseFileScanTask; -import org.apache.iceberg.CombinedScanTask; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.DataFiles; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.FileScanTask; -import org.apache.iceberg.Files; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.PartitionSpecParser; -import org.apache.iceberg.Schema; -import org.apache.iceberg.SchemaParser; -import org.apache.iceberg.data.GenericAppenderFactory; -import org.apache.iceberg.data.RandomGenericData; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.encryption.PlaintextEncryptionManager; -import org.apache.iceberg.expressions.Expressions; -import org.apache.iceberg.expressions.ResidualEvaluator; -import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.flink.source.DataIterator; -import org.apache.iceberg.flink.source.RowDataFileScanTaskReader; -import org.apache.iceberg.hadoop.HadoopFileIO; -import org.apache.iceberg.io.FileAppender; -import org.apache.iceberg.io.FileAppenderFactory; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; - -public class ReaderUtil { - - private ReaderUtil() {} - - public static FileScanTask createFileTask( - List records, - File file, - FileFormat fileFormat, - FileAppenderFactory appenderFactory) - throws IOException { - FileAppender appender = - appenderFactory.newAppender(Files.localOutput(file), fileFormat); - try { - appender.addAll(records); - } finally { - appender.close(); - } - - DataFile dataFile = - DataFiles.builder(PartitionSpec.unpartitioned()) - .withRecordCount(records.size()) - .withFileSizeInBytes(file.length()) - .withPath(file.toString()) - .withFormat(fileFormat) - .withMetrics(appender.metrics()) - .build(); - - ResidualEvaluator 
residuals = ResidualEvaluator.unpartitioned(Expressions.alwaysTrue()); - return new BaseFileScanTask( - dataFile, - null, - SchemaParser.toJson(TestFixtures.SCHEMA), - PartitionSpecParser.toJson(PartitionSpec.unpartitioned()), - residuals); - } - - public static DataIterator createDataIterator(CombinedScanTask combinedTask) { - return new DataIterator<>( - new RowDataFileScanTaskReader( - TestFixtures.SCHEMA, TestFixtures.SCHEMA, null, true, Collections.emptyList()), - combinedTask, - new HadoopFileIO(new org.apache.hadoop.conf.Configuration()), - PlaintextEncryptionManager.instance()); - } - - public static List> createRecordBatchList( - Schema schema, int listSize, int batchCount) { - return createRecordBatchList(0L, schema, listSize, batchCount); - } - - public static List> createRecordBatchList( - long seed, Schema schema, int listSize, int batchCount) { - List records = RandomGenericData.generate(schema, listSize * batchCount, seed); - return Lists.partition(records, batchCount); - } - - public static CombinedScanTask createCombinedScanTask( - List> recordBatchList, - Path temporaryFolder, - FileFormat fileFormat, - GenericAppenderFactory appenderFactory) - throws IOException { - List fileTasks = Lists.newArrayListWithCapacity(recordBatchList.size()); - for (List recordBatch : recordBatchList) { - FileScanTask fileTask = - ReaderUtil.createFileTask( - recordBatch, - File.createTempFile("junit", null, temporaryFolder.toFile()), - fileFormat, - appenderFactory); - fileTasks.add(fileTask); - } - - return new BaseCombinedScanTask(fileTasks); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestArrayBatchRecords.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestArrayBatchRecords.java deleted file mode 100644 index 6f09bd9a56d6..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestArrayBatchRecords.java +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source.reader; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.util.concurrent.atomic.AtomicBoolean; -import org.junit.jupiter.api.Test; - -public class TestArrayBatchRecords { - - @Test - public void testFullRange() { - String[] elements = new String[] {"0", "1", "2", "3"}; - testArray(elements, elements.length, 2, 119); - } - - @Test - public void testSubRange() { - String[] elements = new String[] {"0", "1", "2", "3"}; - testArray(elements, 2, 0, 0); - } - - private void testArray( - String[] elements, int numberOfRecords, int fileOffset, long startingRecordOffset) { - String splitId = "iceberg_split_1"; - AtomicBoolean recycled = new AtomicBoolean(); - - ArrayBatchRecords recordsWithSplitIds = - ArrayBatchRecords.forRecords( - splitId, - ignored -> recycled.set(true), - elements, - numberOfRecords, - fileOffset, - startingRecordOffset); - - assertThat(recordsWithSplitIds.nextSplit()).isEqualTo(splitId); - - for (int i = 0; i < numberOfRecords; i++) { - RecordAndPosition recAndPos = recordsWithSplitIds.nextRecordFromSplit(); - assertThat(recAndPos.record()).isEqualTo(elements[i]); - assertThat(recAndPos.fileOffset()).isEqualTo(fileOffset); - // recordOffset points to the position after this one - assertThat(recAndPos.recordOffset()).isEqualTo(startingRecordOffset + i + 1); - } - - assertThat(recordsWithSplitIds.nextRecordFromSplit()).isNull(); - assertThat(recordsWithSplitIds.nextSplit()).isNull(); - recordsWithSplitIds.recycle(); - assertThat(recycled.get()).isTrue(); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestArrayPoolDataIteratorBatcherRowData.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestArrayPoolDataIteratorBatcherRowData.java deleted file mode 100644 index 1a78bb1b0010..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestArrayPoolDataIteratorBatcherRowData.java +++ /dev/null @@ -1,360 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source.reader; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.File; -import java.nio.file.Path; -import java.util.Arrays; -import java.util.List; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; -import org.apache.flink.connector.base.source.reader.SourceReaderOptions; -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.BaseCombinedScanTask; -import org.apache.iceberg.CombinedScanTask; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.FileScanTask; -import org.apache.iceberg.data.GenericAppenderFactory; -import org.apache.iceberg.data.RandomGenericData; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.FlinkConfigOptions; -import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.flink.TestHelpers; -import org.apache.iceberg.flink.source.DataIterator; -import org.apache.iceberg.io.CloseableIterator; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.io.TempDir; - -public class TestArrayPoolDataIteratorBatcherRowData { - - @TempDir protected Path temporaryFolder; - private static final FileFormat FILE_FORMAT = FileFormat.PARQUET; - private final Configuration config = - new Configuration() - .set(SourceReaderOptions.ELEMENT_QUEUE_CAPACITY, 1) - .set(FlinkConfigOptions.SOURCE_READER_FETCH_BATCH_RECORD_COUNT, 2); - - private final GenericAppenderFactory appenderFactory = - new GenericAppenderFactory(TestFixtures.SCHEMA); - private final DataIteratorBatcher batcher = - new ArrayPoolDataIteratorBatcher<>(config, new RowDataRecordFactory(TestFixtures.ROW_TYPE)); - - /** Read a CombinedScanTask that contains a single file with less than a full batch of records */ - @Test - public void testSingleFileLessThanOneFullBatch() throws Exception { - List records = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 1); - FileScanTask fileTask = - ReaderUtil.createFileTask( - records, - File.createTempFile("junit", null, temporaryFolder.toFile()), - FILE_FORMAT, - appenderFactory); - CombinedScanTask combinedTask = new BaseCombinedScanTask(fileTask); - DataIterator dataIterator = ReaderUtil.createDataIterator(combinedTask); - String splitId = "someSplitId"; - CloseableIterator>> recordBatchIterator = - batcher.batch(splitId, dataIterator); - - ArrayBatchRecords batch = (ArrayBatchRecords) recordBatchIterator.next(); - assertThat(batch.finishedSplits()).isEmpty(); - assertThat(batch.nextSplit()).isEqualTo(splitId); - assertThat(batch.records()).hasSize(2); - assertThat(batch.numberOfRecords()).isEqualTo(1); - - RecordAndPosition recordAndPosition = batch.nextRecordFromSplit(); - - /////////////////////////////// - // assert first record - - assertThat(recordAndPosition.fileOffset()).isEqualTo(0); - assertThat(recordAndPosition.recordOffset()) - .as("The position points to where the reader should resume after this record is processed.") - .isEqualTo(1); - TestHelpers.assertRowData(TestFixtures.SCHEMA, records.get(0), recordAndPosition.record()); - - assertThat(batch.nextRecordFromSplit()).isNull(); - assertThat(batch.nextSplit()).isNull(); - batch.recycle(); - - assertThat(recordBatchIterator).isExhausted(); - } - - /** - * Read a CombinedScanTask that contains a single file with multiple batches. - * - *
    Insert 5 records in a single file that should result in 3 batches - */ - @Test - public void testSingleFileWithMultipleBatches() throws Exception { - List records = RandomGenericData.generate(TestFixtures.SCHEMA, 5, 1); - FileScanTask fileTask = - ReaderUtil.createFileTask( - records, - File.createTempFile("junit", null, temporaryFolder.toFile()), - FILE_FORMAT, - appenderFactory); - CombinedScanTask combinedTask = new BaseCombinedScanTask(fileTask); - DataIterator dataIterator = ReaderUtil.createDataIterator(combinedTask); - String splitId = "someSplitId"; - CloseableIterator>> recordBatchIterator = - batcher.batch(splitId, dataIterator); - - /////////////////////////////// - // assert first batch with full batch of 2 records - - ArrayBatchRecords batch0 = (ArrayBatchRecords) recordBatchIterator.next(); - assertThat(batch0.finishedSplits()).isEmpty(); - assertThat(batch0.nextSplit()).isEqualTo(splitId); - assertThat(batch0.records()).hasSize(2); - assertThat(batch0.numberOfRecords()).isEqualTo(2); - - RecordAndPosition recordAndPosition; - - // assert first record - recordAndPosition = batch0.nextRecordFromSplit(); - assertThat(recordAndPosition.fileOffset()).isEqualTo(0); - assertThat(recordAndPosition.recordOffset()) - .as("The position points to where the reader should resume after this record is processed.") - .isEqualTo(1); - TestHelpers.assertRowData(TestFixtures.SCHEMA, records.get(0), recordAndPosition.record()); - - // assert second record - recordAndPosition = batch0.nextRecordFromSplit(); - assertThat(recordAndPosition.fileOffset()).isEqualTo(0); - assertThat(recordAndPosition.recordOffset()) - .as("The position points to where the reader should resume after this record is processed.") - .isEqualTo(2); - TestHelpers.assertRowData(TestFixtures.SCHEMA, records.get(1), recordAndPosition.record()); - - assertThat(batch0.nextRecordFromSplit()).isNull(); - assertThat(batch0.nextSplit()).isNull(); - batch0.recycle(); - - /////////////////////////////// - // assert second batch with full batch of 2 records - - ArrayBatchRecords batch1 = (ArrayBatchRecords) recordBatchIterator.next(); - assertThat(batch1.records()).containsExactlyInAnyOrder(batch0.records()); - assertThat(batch1.finishedSplits()).isEmpty(); - assertThat(batch1.nextSplit()).isEqualTo(splitId); - assertThat(batch1.records()).hasSize(2); - assertThat(batch1.numberOfRecords()).isEqualTo(2); - - // assert third record - recordAndPosition = batch1.nextRecordFromSplit(); - assertThat(recordAndPosition.fileOffset()).isEqualTo(0); - assertThat(recordAndPosition.recordOffset()) - .as("The position points to where the reader should resume after this record is processed.") - .isEqualTo(3); - TestHelpers.assertRowData(TestFixtures.SCHEMA, records.get(2), recordAndPosition.record()); - - // assert fourth record - recordAndPosition = batch1.nextRecordFromSplit(); - assertThat(recordAndPosition.fileOffset()).isEqualTo(0); - assertThat(recordAndPosition.recordOffset()) - .as("The position points to where the reader should resume after this record is processed.") - .isEqualTo(4); - TestHelpers.assertRowData(TestFixtures.SCHEMA, records.get(3), recordAndPosition.record()); - - assertThat(batch1.nextRecordFromSplit()).isNull(); - assertThat(batch1.nextSplit()).isNull(); - batch1.recycle(); - - /////////////////////////////// - // assert third batch with partial batch of 1 record - - ArrayBatchRecords batch2 = (ArrayBatchRecords) recordBatchIterator.next(); - assertThat(batch2.records()).containsExactlyInAnyOrder(batch0.records()); - 
assertThat(batch2.finishedSplits()).isEmpty(); - assertThat(batch2.nextSplit()).isEqualTo(splitId); - assertThat(batch2.records()).hasSize(2); - assertThat(batch2.numberOfRecords()).isEqualTo(1); - - // assert fifth record - recordAndPosition = batch2.nextRecordFromSplit(); - assertThat(recordAndPosition.fileOffset()).isEqualTo(0); - assertThat(recordAndPosition.recordOffset()) - .as("The position points to where the reader should resume after this record is processed.") - .isEqualTo(5); - TestHelpers.assertRowData(TestFixtures.SCHEMA, records.get(4), recordAndPosition.record()); - - assertThat(batch2.nextRecordFromSplit()).isNull(); - assertThat(batch2.nextSplit()).isNull(); - batch2.recycle(); - - assertThat(recordBatchIterator).isExhausted(); - } - - /** - * Read a CombinedScanTask that contains with multiple files. - * - *
    In this test, we also seek the iterator to starting position (1, 1). - */ - @Test - public void testMultipleFilesWithSeekPosition() throws Exception { - List records0 = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 1); - FileScanTask fileTask0 = - ReaderUtil.createFileTask( - records0, - File.createTempFile("junit", null, temporaryFolder.toFile()), - FILE_FORMAT, - appenderFactory); - List records1 = RandomGenericData.generate(TestFixtures.SCHEMA, 4, 2); - FileScanTask fileTask1 = - ReaderUtil.createFileTask( - records1, - File.createTempFile("junit", null, temporaryFolder.toFile()), - FILE_FORMAT, - appenderFactory); - List records2 = RandomGenericData.generate(TestFixtures.SCHEMA, 3, 3); - FileScanTask fileTask2 = - ReaderUtil.createFileTask( - records2, - File.createTempFile("junit", null, temporaryFolder.toFile()), - FILE_FORMAT, - appenderFactory); - CombinedScanTask combinedTask = - new BaseCombinedScanTask(Arrays.asList(fileTask0, fileTask1, fileTask2)); - - DataIterator dataIterator = ReaderUtil.createDataIterator(combinedTask); - dataIterator.seek(1, 1); - - String splitId = "someSplitId"; - CloseableIterator>> recordBatchIterator = - batcher.batch(splitId, dataIterator); - - /////////////////////////////// - // file0 is skipped by seek - - /////////////////////////////// - // file1 has 4 records. because the seek position, first record is skipped. - // we should read 3 remaining records in 2 batches: - // batch10 with 2 records and batch11 with 1 records. - - // assert first batch from file1 with full batch of 2 records - - // variable naming convention: batch - ArrayBatchRecords batch10 = (ArrayBatchRecords) recordBatchIterator.next(); - assertThat(batch10.finishedSplits()).isEmpty(); - assertThat(batch10.nextSplit()).isEqualTo(splitId); - assertThat(batch10.records()).hasSize(2); - assertThat(batch10.numberOfRecords()).isEqualTo(2); - - RecordAndPosition recordAndPosition; - - recordAndPosition = batch10.nextRecordFromSplit(); - assertThat(recordAndPosition.fileOffset()).isEqualTo(1); - assertThat(recordAndPosition.recordOffset()) - .as("seek should skip the first record in file1. 
starting from the second record") - .isEqualTo(2); - TestHelpers.assertRowData(TestFixtures.SCHEMA, records1.get(1), recordAndPosition.record()); - - recordAndPosition = batch10.nextRecordFromSplit(); - assertThat(recordAndPosition.fileOffset()).isEqualTo(1); - assertThat(recordAndPosition.recordOffset()) - .as("The position points to where the reader should resume after this record is processed.") - .isEqualTo(3); - TestHelpers.assertRowData(TestFixtures.SCHEMA, records1.get(2), recordAndPosition.record()); - - assertThat(batch10.nextRecordFromSplit()).isNull(); - assertThat(batch10.nextSplit()).isNull(); - batch10.recycle(); - - // assert second batch from file1 with partial batch of 1 record - - // variable naming convention: batch__ - ArrayBatchRecords batch11 = (ArrayBatchRecords) recordBatchIterator.next(); - assertThat(batch11.records()).containsExactlyInAnyOrder(batch10.records()); - assertThat(batch11.finishedSplits()).isEmpty(); - assertThat(batch11.nextSplit()).isEqualTo(splitId); - assertThat(batch11.records()).hasSize(2); - assertThat(batch11.numberOfRecords()).isEqualTo(1); - - recordAndPosition = batch11.nextRecordFromSplit(); - assertThat(recordAndPosition.fileOffset()).isEqualTo(1); - assertThat(recordAndPosition.recordOffset()) - .as("The position points to where the reader should resume after this record is processed.") - .isEqualTo(4); - TestHelpers.assertRowData(TestFixtures.SCHEMA, records1.get(3), recordAndPosition.record()); - - assertThat(batch11.nextRecordFromSplit()).isNull(); - assertThat(batch11.nextSplit()).isNull(); - batch11.recycle(); - - /////////////////////////////// - // file2 has 3 records. - // we should read 3 records in 2 batches: - // batch20 with 2 records and batch21 with 1 records - - // assert first batch from file2 with full batch of 2 records - - // variable naming convention: batch__ - ArrayBatchRecords batch20 = (ArrayBatchRecords) recordBatchIterator.next(); - assertThat(batch20.records()).containsExactlyInAnyOrder(batch10.records()); - assertThat(batch20.finishedSplits()).isEmpty(); - assertThat(batch20.nextSplit()).isEqualTo(splitId); - assertThat(batch20.records()).hasSize(2); - assertThat(batch20.numberOfRecords()).isEqualTo(2); - - recordAndPosition = batch20.nextRecordFromSplit(); - assertThat(recordAndPosition.fileOffset()).isEqualTo(2); - assertThat(recordAndPosition.recordOffset()) - .as("The position points to where the reader should resume after this record is processed.") - .isEqualTo(1); - TestHelpers.assertRowData(TestFixtures.SCHEMA, records2.get(0), recordAndPosition.record()); - - recordAndPosition = batch20.nextRecordFromSplit(); - assertThat(recordAndPosition.fileOffset()).isEqualTo(2); - assertThat(recordAndPosition.recordOffset()) - .as("The position points to where the reader should resume after this record is processed.") - .isEqualTo(2); - TestHelpers.assertRowData(TestFixtures.SCHEMA, records2.get(1), recordAndPosition.record()); - - assertThat(batch20.nextRecordFromSplit()).isNull(); - assertThat(batch20.nextSplit()).isNull(); - batch20.recycle(); - - /////////////////////////////// - // assert second batch from file2 with partial batch of 1 record - - // variable naming convention: batch__ - ArrayBatchRecords batch21 = (ArrayBatchRecords) recordBatchIterator.next(); - assertThat(batch21.records()).containsExactlyInAnyOrder(batch10.records()); - assertThat(batch21.finishedSplits()).isEmpty(); - assertThat(batch21.nextSplit()).isEqualTo(splitId); - assertThat(batch21.records()).hasSize(2); - 
assertThat(batch21.numberOfRecords()).isEqualTo(1); - - recordAndPosition = batch21.nextRecordFromSplit(); - assertThat(recordAndPosition.fileOffset()).isEqualTo(2); - - assertThat(recordAndPosition.recordOffset()) - .as("The position points to where the reader should resume after this record is processed.") - .isEqualTo(3); - TestHelpers.assertRowData(TestFixtures.SCHEMA, records2.get(2), recordAndPosition.record()); - - assertThat(batch21.nextRecordFromSplit()).isNull(); - assertThat(batch21.nextSplit()).isNull(); - batch21.recycle(); - - assertThat(recordBatchIterator).isExhausted(); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestColumnStatsWatermarkExtractor.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestColumnStatsWatermarkExtractor.java deleted file mode 100644 index 8bd1214bd960..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestColumnStatsWatermarkExtractor.java +++ /dev/null @@ -1,176 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.flink.source.reader; - -import static org.apache.iceberg.flink.TestFixtures.DATABASE; -import static org.apache.iceberg.types.Types.NestedField.required; -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; -import static org.assertj.core.api.Assumptions.assumeThat; - -import java.io.IOException; -import java.nio.file.Path; -import java.time.LocalDateTime; -import java.time.OffsetDateTime; -import java.time.ZoneOffset; -import java.util.Collection; -import java.util.List; -import java.util.Map; -import java.util.concurrent.TimeUnit; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Parameter; -import org.apache.iceberg.ParameterizedTestExtension; -import org.apache.iceberg.Parameters; -import org.apache.iceberg.Schema; -import org.apache.iceberg.data.GenericAppenderFactory; -import org.apache.iceberg.data.RandomGenericData; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.HadoopTableExtension; -import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.flink.source.split.IcebergSourceSplit; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.types.Types; -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.TestTemplate; -import org.junit.jupiter.api.extension.ExtendWith; -import org.junit.jupiter.api.extension.RegisterExtension; -import org.junit.jupiter.api.io.TempDir; - -@ExtendWith(ParameterizedTestExtension.class) -public class TestColumnStatsWatermarkExtractor { - public static final Schema SCHEMA = - new Schema( - required(1, "timestamp_column", Types.TimestampType.withoutZone()), - required(2, "timestamptz_column", Types.TimestampType.withZone()), - required(3, "long_column", Types.LongType.get()), - required(4, "string_column", Types.StringType.get())); - - private static final GenericAppenderFactory APPENDER_FACTORY = new GenericAppenderFactory(SCHEMA); - - private static final List> TEST_RECORDS = - ImmutableList.of( - RandomGenericData.generate(SCHEMA, 3, 2L), RandomGenericData.generate(SCHEMA, 3, 19L)); - - private static final List> MIN_VALUES = - ImmutableList.of(Maps.newHashMapWithExpectedSize(3), Maps.newHashMapWithExpectedSize(3)); - - @TempDir protected Path temporaryFolder; - - @RegisterExtension - private static final HadoopTableExtension SOURCE_TABLE_RESOURCE = - new HadoopTableExtension(DATABASE, TestFixtures.TABLE, SCHEMA); - - @Parameter(index = 0) - private String columnName; - - @BeforeAll - public static void updateMinValue() { - for (int i = 0; i < TEST_RECORDS.size(); ++i) { - for (Record r : TEST_RECORDS.get(i)) { - Map minValues = MIN_VALUES.get(i); - - LocalDateTime localDateTime = (LocalDateTime) r.get(0); - minValues.merge( - "timestamp_column", localDateTime.toInstant(ZoneOffset.UTC).toEpochMilli(), Math::min); - - OffsetDateTime offsetDateTime = (OffsetDateTime) r.get(1); - minValues.merge("timestamptz_column", offsetDateTime.toInstant().toEpochMilli(), Math::min); - - minValues.merge("long_column", (Long) r.get(2), Math::min); - } - } - } - - @Parameters(name = "columnName = {0}") - public static Collection data() { - return ImmutableList.of( - new Object[] {"timestamp_column"}, - new Object[] {"timestamptz_column"}, - new Object[] {"long_column"}); - } - - @TestTemplate - public void testSingle() throws IOException { - ColumnStatsWatermarkExtractor extractor 
= - new ColumnStatsWatermarkExtractor(SCHEMA, columnName, TimeUnit.MILLISECONDS); - - assertThat(extractor.extractWatermark(split(0))) - .isEqualTo(MIN_VALUES.get(0).get(columnName).longValue()); - } - - @TestTemplate - public void testTimeUnit() throws IOException { - assumeThat(columnName).isEqualTo("long_column"); - ColumnStatsWatermarkExtractor extractor = - new ColumnStatsWatermarkExtractor(SCHEMA, columnName, TimeUnit.MICROSECONDS); - - assertThat(extractor.extractWatermark(split(0))) - .isEqualTo(MIN_VALUES.get(0).get(columnName) / 1000L); - } - - @TestTemplate - public void testMultipleFiles() throws IOException { - assumeThat(columnName).isEqualTo("timestamp_column"); - IcebergSourceSplit combinedSplit = - IcebergSourceSplit.fromCombinedScanTask( - ReaderUtil.createCombinedScanTask( - TEST_RECORDS, temporaryFolder, FileFormat.PARQUET, APPENDER_FACTORY)); - - ColumnStatsWatermarkExtractor extractor = - new ColumnStatsWatermarkExtractor(SCHEMA, columnName, null); - - assertThat(extractor.extractWatermark(split(0))) - .isEqualTo(MIN_VALUES.get(0).get(columnName).longValue()); - assertThat(extractor.extractWatermark(split(1))) - .isEqualTo(MIN_VALUES.get(1).get(columnName).longValue()); - assertThat(extractor.extractWatermark(combinedSplit)) - .isEqualTo(Math.min(MIN_VALUES.get(0).get(columnName), MIN_VALUES.get(1).get(columnName))); - } - - @TestTemplate - public void testWrongColumn() { - assumeThat(columnName).isEqualTo("string_column"); - assertThatThrownBy(() -> new ColumnStatsWatermarkExtractor(SCHEMA, columnName, null)) - .isInstanceOf(IllegalArgumentException.class) - .hasMessageContaining( - "Found STRING, expected a LONG or TIMESTAMP column for watermark generation."); - } - - @TestTemplate - public void testEmptyStatistics() throws IOException { - assumeThat(columnName).isEqualTo("timestamp_column"); - - // Create an extractor for a column we do not have statistics - ColumnStatsWatermarkExtractor extractor = - new ColumnStatsWatermarkExtractor(10, "missing_field"); - assertThatThrownBy(() -> extractor.extractWatermark(split(0))) - .isInstanceOf(IllegalArgumentException.class) - .hasMessageContaining("Missing statistics for column"); - } - - private IcebergSourceSplit split(int id) throws IOException { - return IcebergSourceSplit.fromCombinedScanTask( - ReaderUtil.createCombinedScanTask( - ImmutableList.of(TEST_RECORDS.get(id)), - temporaryFolder, - FileFormat.PARQUET, - APPENDER_FACTORY)); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestIcebergSourceReader.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestIcebergSourceReader.java deleted file mode 100644 index 8d6782586676..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestIcebergSourceReader.java +++ /dev/null @@ -1,181 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.reader; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.nio.file.Path; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; -import org.apache.flink.api.connector.source.SourceReaderContext; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.connector.testutils.source.reader.TestingReaderContext; -import org.apache.flink.connector.testutils.source.reader.TestingReaderOutput; -import org.apache.flink.metrics.MetricGroup; -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.CombinedScanTask; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.data.GenericAppenderFactory; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.encryption.PlaintextEncryptionManager; -import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.flink.TestHelpers; -import org.apache.iceberg.flink.source.split.IcebergSourceSplit; -import org.apache.iceberg.flink.source.split.SerializableComparator; -import org.apache.iceberg.hadoop.HadoopFileIO; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.io.TempDir; - -public class TestIcebergSourceReader { - @TempDir protected Path temporaryFolder; - - private final GenericAppenderFactory appenderFactory = - new GenericAppenderFactory(TestFixtures.SCHEMA); - - @Test - public void testReaderMetrics() throws Exception { - TestingReaderOutput readerOutput = new TestingReaderOutput<>(); - TestingMetricGroup metricGroup = new TestingMetricGroup(); - TestingReaderContext readerContext = new TestingReaderContext(new Configuration(), metricGroup); - IcebergSourceReader reader = createReader(metricGroup, readerContext, null); - reader.start(); - - testOneSplitFetcher(reader, readerOutput, metricGroup, 1); - testOneSplitFetcher(reader, readerOutput, metricGroup, 2); - } - - @Test - public void testReaderOrder() throws Exception { - // Create 2 splits - List> recordBatchList1 = - ReaderUtil.createRecordBatchList(0L, TestFixtures.SCHEMA, 1, 1); - CombinedScanTask task1 = - ReaderUtil.createCombinedScanTask( - recordBatchList1, temporaryFolder, FileFormat.PARQUET, appenderFactory); - - List> recordBatchList2 = - ReaderUtil.createRecordBatchList(1L, TestFixtures.SCHEMA, 1, 1); - CombinedScanTask task2 = - ReaderUtil.createCombinedScanTask( - recordBatchList2, temporaryFolder, FileFormat.PARQUET, appenderFactory); - - // Sort the splits in one way - List rowDataList1 = - read( - Arrays.asList( - IcebergSourceSplit.fromCombinedScanTask(task1), - IcebergSourceSplit.fromCombinedScanTask(task2)), - 2); - - // Reverse the splits - List rowDataList2 = - read( - Arrays.asList( - IcebergSourceSplit.fromCombinedScanTask(task2), - IcebergSourceSplit.fromCombinedScanTask(task1)), - 2); - - // Check that the order of the elements is not changed - assertThat(rowDataList1).containsExactlyElementsOf(rowDataList2); - } - - private List read(List splits, long expected) throws Exception { - TestingMetricGroup metricGroup = new TestingMetricGroup(); - TestingReaderContext readerContext = 
new TestingReaderContext(new Configuration(), metricGroup); - // Using IdBasedComparator, so we can have a deterministic order of the splits - IcebergSourceReader reader = createReader(metricGroup, readerContext, new IdBasedComparator()); - reader.start(); - - reader.addSplits(splits); - TestingReaderOutput readerOutput = new TestingReaderOutput<>(); - while (readerOutput.getEmittedRecords().size() < expected) { - reader.pollNext(readerOutput); - } - - reader.pollNext(readerOutput); - - assertThat(readerOutput.getEmittedRecords()).hasSize((int) expected); - return readerOutput.getEmittedRecords(); - } - - private void testOneSplitFetcher( - IcebergSourceReader reader, - TestingReaderOutput readerOutput, - TestingMetricGroup metricGroup, - int expectedCount) - throws Exception { - long seed = expectedCount; - // Each split should contain only one file with one record - List> recordBatchList = - ReaderUtil.createRecordBatchList(seed, TestFixtures.SCHEMA, 1, 1); - CombinedScanTask task = - ReaderUtil.createCombinedScanTask( - recordBatchList, temporaryFolder, FileFormat.PARQUET, appenderFactory); - IcebergSourceSplit split = IcebergSourceSplit.fromCombinedScanTask(task); - reader.addSplits(Collections.singletonList(split)); - - while (readerOutput.getEmittedRecords().size() < expectedCount) { - reader.pollNext(readerOutput); - } - - assertThat(readerOutput.getEmittedRecords()).hasSize(expectedCount); - TestHelpers.assertRowData( - TestFixtures.SCHEMA, - recordBatchList.get(0).get(0), - readerOutput.getEmittedRecords().get(expectedCount - 1)); - assertThat(metricGroup.counters().get("assignedSplits").getCount()).isEqualTo(expectedCount); - - // One more poll will get null record batch. - // That will finish the split and cause split fetcher to be closed due to idleness. - // Then next split will create a new split reader. - reader.pollNext(readerOutput); - } - - private IcebergSourceReader createReader( - MetricGroup metricGroup, - SourceReaderContext readerContext, - SerializableComparator splitComparator) { - IcebergSourceReaderMetrics readerMetrics = - new IcebergSourceReaderMetrics(metricGroup, "db.tbl"); - RowDataReaderFunction readerFunction = - new RowDataReaderFunction( - new Configuration(), - TestFixtures.SCHEMA, - TestFixtures.SCHEMA, - null, - true, - new HadoopFileIO(new org.apache.hadoop.conf.Configuration()), - PlaintextEncryptionManager.instance(), - Collections.emptyList()); - return new IcebergSourceReader<>( - SerializableRecordEmitter.defaultEmitter(), - readerMetrics, - readerFunction, - splitComparator, - readerContext); - } - - private static class IdBasedComparator implements SerializableComparator { - @Override - public int compare(IcebergSourceSplit o1, IcebergSourceSplit o2) { - return o1.splitId().compareTo(o2.splitId()); - } - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestLimitableDataIterator.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestLimitableDataIterator.java deleted file mode 100644 index 36749d3ec2dc..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestLimitableDataIterator.java +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.reader; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.nio.file.Path; -import java.util.Collections; -import java.util.List; -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.CombinedScanTask; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.data.GenericAppenderFactory; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.encryption.EncryptionManager; -import org.apache.iceberg.encryption.PlaintextEncryptionManager; -import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.flink.source.RowDataFileScanTaskReader; -import org.apache.iceberg.hadoop.HadoopFileIO; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.io.TempDir; -import org.junit.jupiter.params.ParameterizedTest; -import org.junit.jupiter.params.provider.ValueSource; - -public class TestLimitableDataIterator { - @TempDir private static Path temporaryFolder; - - private final RowDataFileScanTaskReader reader = - new RowDataFileScanTaskReader( - TestFixtures.SCHEMA, TestFixtures.SCHEMA, null, true, Collections.emptyList()); - private final HadoopFileIO fileIO = new HadoopFileIO(new org.apache.hadoop.conf.Configuration()); - private final EncryptionManager encryptionManager = PlaintextEncryptionManager.instance(); - - private static CombinedScanTask combinedScanTask; - private static int totalRecords; - - @BeforeAll - public static void beforeClass() throws Exception { - GenericAppenderFactory appenderFactory = new GenericAppenderFactory(TestFixtures.SCHEMA); - List> recordBatchList = - ReaderUtil.createRecordBatchList(TestFixtures.SCHEMA, 3, 2); - combinedScanTask = - ReaderUtil.createCombinedScanTask( - recordBatchList, temporaryFolder, FileFormat.PARQUET, appenderFactory); - totalRecords = 3 * 2; - } - - @ParameterizedTest - @ValueSource(longs = {-1L, 0L, 1L, 6L, 7L}) - public void testUnlimited(long limit) { - LimitableDataIterator dataIterator = - new LimitableDataIterator<>( - reader, combinedScanTask, fileIO, encryptionManager, RecordLimiter.create(limit)); - - List result = Lists.newArrayList(); - while (dataIterator.hasNext()) { - result.add(dataIterator.next()); - } - - if (limit <= 0 || limit > totalRecords) { - // read all records - assertThat(result).hasSize(totalRecords); - } else { - assertThat(result).hasSize((int) limit); - } - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestRowDataReaderFunction.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestRowDataReaderFunction.java deleted file mode 100644 index 55f9c0af3a29..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestRowDataReaderFunction.java +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - 
* or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.reader; - -import java.util.Collections; -import java.util.List; -import java.util.stream.Collectors; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.conversion.DataStructureConverter; -import org.apache.flink.table.data.conversion.DataStructureConverters; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.table.types.utils.TypeConversions; -import org.apache.flink.types.Row; -import org.apache.iceberg.Schema; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.encryption.PlaintextEncryptionManager; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.flink.TestHelpers; -import org.apache.iceberg.hadoop.HadoopFileIO; - -public class TestRowDataReaderFunction extends ReaderFunctionTestBase { - - protected static final RowType ROW_TYPE = FlinkSchemaUtil.convert(TestFixtures.SCHEMA); - private static final DataStructureConverter ROW_DATA_CONVERTER = - DataStructureConverters.getConverter(TypeConversions.fromLogicalToDataType(ROW_TYPE)); - - @Override - protected ReaderFunction readerFunction() { - return new RowDataReaderFunction( - new Configuration(), - TestFixtures.SCHEMA, - TestFixtures.SCHEMA, - null, - true, - new HadoopFileIO(new org.apache.hadoop.conf.Configuration()), - PlaintextEncryptionManager.instance(), - Collections.emptyList()); - } - - @Override - protected void assertRecords(List expected, List actual, Schema schema) { - List rows = toRows(actual); - TestHelpers.assertRecords(rows, expected, TestFixtures.SCHEMA); - } - - private List toRows(List actual) { - return actual.stream() - .map(rowData -> (Row) ROW_DATA_CONVERTER.toExternal(rowData)) - .collect(Collectors.toList()); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestingMetricGroup.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestingMetricGroup.java deleted file mode 100644 index 290628c5fc90..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestingMetricGroup.java +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.reader; - -import java.util.Map; -import org.apache.flink.metrics.Counter; -import org.apache.flink.metrics.Gauge; -import org.apache.flink.metrics.MetricGroup; -import org.apache.flink.metrics.SimpleCounter; -import org.apache.flink.metrics.groups.OperatorIOMetricGroup; -import org.apache.flink.metrics.groups.SourceReaderMetricGroup; -import org.apache.flink.metrics.groups.UnregisteredMetricsGroup; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; - -class TestingMetricGroup extends UnregisteredMetricsGroup implements SourceReaderMetricGroup { - private final Map counters; - - TestingMetricGroup() { - this.counters = Maps.newHashMap(); - } - - /** Pass along the reference to share the map for child metric groups. */ - private TestingMetricGroup(Map counters) { - this.counters = counters; - } - - Map counters() { - return counters; - } - - @Override - public Counter counter(String name) { - Counter counter = new SimpleCounter(); - counters.put(name, counter); - return counter; - } - - @Override - public MetricGroup addGroup(String name) { - return new TestingMetricGroup(counters); - } - - @Override - public MetricGroup addGroup(String key, String value) { - return new TestingMetricGroup(counters); - } - - @Override - public OperatorIOMetricGroup getIOMetricGroup() { - return new TestingOperatorIOMetricGroup(); - } - - @Override - public Counter getNumRecordsInErrorsCounter() { - return new SimpleCounter(); - } - - @Override - public void setPendingBytesGauge(Gauge pendingBytesGauge) {} - - @Override - public void setPendingRecordsGauge(Gauge pendingRecordsGauge) {} - - private static class TestingOperatorIOMetricGroup extends UnregisteredMetricsGroup - implements OperatorIOMetricGroup { - @Override - public Counter getNumRecordsInCounter() { - return new SimpleCounter(); - } - - @Override - public Counter getNumRecordsOutCounter() { - return new SimpleCounter(); - } - - @Override - public Counter getNumBytesInCounter() { - return new SimpleCounter(); - } - - @Override - public Counter getNumBytesOutCounter() { - return new SimpleCounter(); - } - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/split/TestIcebergSourceSplitSerializer.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/split/TestIcebergSourceSplitSerializer.java deleted file mode 100644 index 12bacdcd074d..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/source/split/TestIcebergSourceSplitSerializer.java +++ /dev/null @@ -1,183 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.source.split; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.nio.file.Path; -import java.util.List; -import java.util.concurrent.atomic.AtomicInteger; -import java.util.stream.Collectors; -import org.apache.iceberg.FileScanTask; -import org.apache.iceberg.flink.source.SplitHelpers; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.io.TempDir; - -public class TestIcebergSourceSplitSerializer { - - @TempDir protected Path temporaryFolder; - - private final IcebergSourceSplitSerializer serializer = new IcebergSourceSplitSerializer(true); - - @Test - public void testLatestVersion() throws Exception { - serializeAndDeserialize(1, 1); - serializeAndDeserialize(10, 2); - } - - private void serializeAndDeserialize(int splitCount, int filesPerSplit) throws Exception { - final List splits = - SplitHelpers.createSplitsFromTransientHadoopTable( - temporaryFolder, splitCount, filesPerSplit); - for (IcebergSourceSplit split : splits) { - byte[] result = serializer.serialize(split); - IcebergSourceSplit deserialized = serializer.deserialize(serializer.getVersion(), result); - assertSplitEquals(split, deserialized); - - byte[] cachedResult = serializer.serialize(split); - assertThat(cachedResult).isSameAs(result); - IcebergSourceSplit deserialized2 = - serializer.deserialize(serializer.getVersion(), cachedResult); - assertSplitEquals(split, deserialized2); - - split.updatePosition(0, 100); - byte[] resultAfterUpdatePosition = serializer.serialize(split); - // after position change, serialized bytes should have changed - assertThat(resultAfterUpdatePosition).isNotSameAs(cachedResult); - IcebergSourceSplit deserialized3 = - serializer.deserialize(serializer.getVersion(), resultAfterUpdatePosition); - assertSplitEquals(split, deserialized3); - } - } - - @Test - public void testV1() throws Exception { - serializeAndDeserializeV1(1, 1); - serializeAndDeserializeV1(10, 2); - } - - private void serializeAndDeserializeV1(int splitCount, int filesPerSplit) throws Exception { - final List splits = - SplitHelpers.createSplitsFromTransientHadoopTable( - temporaryFolder, splitCount, filesPerSplit); - for (IcebergSourceSplit split : splits) { - byte[] result = split.serializeV1(); - IcebergSourceSplit deserialized = IcebergSourceSplit.deserializeV1(result); - assertSplitEquals(split, deserialized); - } - } - - @Test - public void testV2() throws Exception { - serializeAndDeserializeV2(1, 1); - serializeAndDeserializeV2(10, 2); - } - - private void serializeAndDeserializeV2(int splitCount, int filesPerSplit) throws Exception { - final List splits = - SplitHelpers.createSplitsFromTransientHadoopTable( - temporaryFolder, splitCount, filesPerSplit); - for (IcebergSourceSplit split : splits) { - byte[] result = split.serializeV2(); - IcebergSourceSplit deserialized = IcebergSourceSplit.deserializeV2(result, true); - assertSplitEquals(split, deserialized); - } - } - - @Test - public void testV3WithTooManyDeleteFiles() throws Exception { - 
serializeAndDeserializeV3(1, 1, 5000); - } - - private void serializeAndDeserializeV3(int splitCount, int filesPerSplit, int mockDeletesPerSplit) - throws Exception { - final List splits = - SplitHelpers.createSplitsFromTransientHadoopTable( - temporaryFolder, splitCount, filesPerSplit); - final List splitsWithMockDeleteFiles = - SplitHelpers.equipSplitsWithMockDeleteFiles(splits, temporaryFolder, mockDeletesPerSplit); - - for (IcebergSourceSplit split : splitsWithMockDeleteFiles) { - byte[] result = split.serializeV3(); - IcebergSourceSplit deserialized = IcebergSourceSplit.deserializeV3(result, true); - assertSplitEquals(split, deserialized); - } - } - - @Test - public void testDeserializeV1() throws Exception { - final List splits = - SplitHelpers.createSplitsFromTransientHadoopTable(temporaryFolder, 1, 1); - for (IcebergSourceSplit split : splits) { - byte[] result = split.serializeV1(); - IcebergSourceSplit deserialized = serializer.deserialize(1, result); - assertSplitEquals(split, deserialized); - } - } - - @Test - public void testCheckpointedPosition() throws Exception { - final AtomicInteger index = new AtomicInteger(); - final List splits = - SplitHelpers.createSplitsFromTransientHadoopTable(temporaryFolder, 10, 2).stream() - .map( - split -> { - IcebergSourceSplit result; - if (index.get() % 2 == 0) { - result = IcebergSourceSplit.fromCombinedScanTask(split.task(), 1, 1); - } else { - result = split; - } - index.incrementAndGet(); - return result; - }) - .collect(Collectors.toList()); - - for (IcebergSourceSplit split : splits) { - byte[] result = serializer.serialize(split); - IcebergSourceSplit deserialized = serializer.deserialize(serializer.getVersion(), result); - assertSplitEquals(split, deserialized); - - byte[] cachedResult = serializer.serialize(split); - assertThat(cachedResult).isSameAs(result); - IcebergSourceSplit deserialized2 = - serializer.deserialize(serializer.getVersion(), cachedResult); - assertSplitEquals(split, deserialized2); - } - } - - private void assertSplitEquals(IcebergSourceSplit expected, IcebergSourceSplit actual) { - List expectedTasks = Lists.newArrayList(expected.task().tasks().iterator()); - List actualTasks = Lists.newArrayList(actual.task().tasks().iterator()); - assertThat(actualTasks).hasSameSizeAs(expectedTasks); - for (int i = 0; i < expectedTasks.size(); ++i) { - FileScanTask expectedTask = expectedTasks.get(i); - FileScanTask actualTask = actualTasks.get(i); - assertThat(actualTask.file().path()).isEqualTo(expectedTask.file().path()); - assertThat(actualTask.sizeBytes()).isEqualTo(expectedTask.sizeBytes()); - assertThat(actualTask.filesCount()).isEqualTo(expectedTask.filesCount()); - assertThat(actualTask.start()).isEqualTo(expectedTask.start()); - assertThat(actualTask.length()).isEqualTo(expectedTask.length()); - } - - assertThat(actual.fileOffset()).isEqualTo(expected.fileOffset()); - assertThat(actual.recordOffset()).isEqualTo(expected.recordOffset()); - } -} diff --git a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/util/TestFlinkPackage.java b/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/util/TestFlinkPackage.java deleted file mode 100644 index 079c70bae070..000000000000 --- a/flink/v1.17/flink/src/test/java/org/apache/iceberg/flink/util/TestFlinkPackage.java +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.flink.util; - -import static org.assertj.core.api.Assertions.assertThat; - -import org.junit.jupiter.api.Test; -import org.mockito.MockedStatic; -import org.mockito.Mockito; - -public class TestFlinkPackage { - - /** This unit test would need to be adjusted as new Flink version is supported. */ - @Test - public void testVersion() { - assertThat(FlinkPackage.version()).isEqualTo("1.17.2"); - } - - @Test - public void testDefaultVersion() { - // It's difficult to reproduce a reflection error in a unit test, so we just inject a mocked - // fault to test the default logic - - // First make sure we're not caching a version result from a previous test - FlinkPackage.setVersion(null); - try (MockedStatic mockedStatic = Mockito.mockStatic(FlinkPackage.class)) { - mockedStatic.when(FlinkPackage::versionFromJar).thenThrow(RuntimeException.class); - mockedStatic.when(FlinkPackage::version).thenCallRealMethod(); - assertThat(FlinkPackage.version()).isEqualTo(FlinkPackage.FLINK_UNKNOWN_VERSION); - } - FlinkPackage.setVersion(null); - try (MockedStatic mockedStatic = Mockito.mockStatic(FlinkPackage.class)) { - mockedStatic.when(FlinkPackage::versionFromJar).thenReturn(null); - mockedStatic.when(FlinkPackage::version).thenCallRealMethod(); - FlinkPackage.setVersion(null); - assertThat(FlinkPackage.version()).isEqualTo(FlinkPackage.FLINK_UNKNOWN_VERSION); - } - } -} diff --git a/flink/v1.17/flink/src/test/resources/META-INF/services/org.apache.flink.table.factories.Factory b/flink/v1.17/flink/src/test/resources/META-INF/services/org.apache.flink.table.factories.Factory deleted file mode 100644 index 47a3c94aa991..000000000000 --- a/flink/v1.17/flink/src/test/resources/META-INF/services/org.apache.flink.table.factories.Factory +++ /dev/null @@ -1,16 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -org.apache.iceberg.flink.source.BoundedTableFactory From 38733f8fe63751826b3d99e5ab79f7e795e5c166 Mon Sep 17 00:00:00 2001 From: Steven Wu Date: Mon, 5 Aug 2024 10:09:36 -0700 Subject: [PATCH 42/55] Flink: adjust code for the new 1.20 module. 
also fixed the bug of missing jmh in the 1.19 module. --- .github/workflows/flink-ci.yml | 9 +---- flink/build.gradle | 9 +++-- .../shuffle/MapRangePartitionerBenchmark.java | 10 ++++-- flink/v1.20/build.gradle | 36 +++++++++---------- .../shuffle/MapRangePartitionerBenchmark.java | 10 ++++-- ...estIcebergSpeculativeExecutionSupport.java | 2 +- .../iceberg/flink/util/TestFlinkPackage.java | 2 +- gradle.properties | 4 +-- gradle/libs.versions.toml | 24 ++++++------- jmh.gradle | 12 +++---- settings.gradle | 18 +++++----- 11 files changed, 70 insertions(+), 66 deletions(-) diff --git a/.github/workflows/flink-ci.yml b/.github/workflows/flink-ci.yml index 3df36e2be86a..370375783cc2 100644 --- a/.github/workflows/flink-ci.yml +++ b/.github/workflows/flink-ci.yml @@ -74,14 +74,7 @@ jobs: strategy: matrix: jvm: [11, 17, 21] - flink: ['1.17', '1.18', '1.19'] - exclude: - # Flink 1.17 does not support Java 17. - - jvm: 17 - flink: '1.17' - # Flink 1.17 does not support Java 21. - - jvm: 21 - flink: '1.17' + flink: ['1.18', '1.19', '1.20'] env: SPARK_LOCAL_IP: localhost steps: diff --git a/flink/build.gradle b/flink/build.gradle index f049ff69b059..17ed630cc235 100644 --- a/flink/build.gradle +++ b/flink/build.gradle @@ -19,11 +19,6 @@ def flinkVersions = (System.getProperty("flinkVersions") != null ? System.getProperty("flinkVersions") : System.getProperty("defaultFlinkVersions")).split(",") - -if (flinkVersions.contains("1.17")) { - apply from: file("$projectDir/v1.17/build.gradle") -} - if (flinkVersions.contains("1.18")) { apply from: file("$projectDir/v1.18/build.gradle") } @@ -31,3 +26,7 @@ if (flinkVersions.contains("1.18")) { if (flinkVersions.contains("1.19")) { apply from: file("$projectDir/v1.19/build.gradle") } + +if (flinkVersions.contains("1.20")) { + apply from: file("$projectDir/v1.20/build.gradle") +} \ No newline at end of file diff --git a/flink/v1.19/flink/src/jmh/java/org/apache/iceberg/flink/sink/shuffle/MapRangePartitionerBenchmark.java b/flink/v1.19/flink/src/jmh/java/org/apache/iceberg/flink/sink/shuffle/MapRangePartitionerBenchmark.java index c3917165753d..007b423e592a 100644 --- a/flink/v1.19/flink/src/jmh/java/org/apache/iceberg/flink/sink/shuffle/MapRangePartitionerBenchmark.java +++ b/flink/v1.19/flink/src/jmh/java/org/apache/iceberg/flink/sink/shuffle/MapRangePartitionerBenchmark.java @@ -18,6 +18,7 @@ */ package org.apache.iceberg.flink.sink.shuffle; +import java.util.Comparator; import java.util.List; import java.util.Map; import java.util.NavigableMap; @@ -27,6 +28,8 @@ import org.apache.iceberg.Schema; import org.apache.iceberg.SortKey; import org.apache.iceberg.SortOrder; +import org.apache.iceberg.SortOrderComparators; +import org.apache.iceberg.StructLike; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.relocated.com.google.common.collect.Maps; @@ -66,6 +69,8 @@ public class MapRangePartitionerBenchmark { Types.NestedField.required(9, "name9", Types.StringType.get())); private static final SortOrder SORT_ORDER = SortOrder.builderFor(SCHEMA).asc("id").build(); + private static final Comparator SORT_ORDER_COMPARTOR = + SortOrderComparators.forSchema(SCHEMA, SORT_ORDER); private static final SortKey SORT_KEY = new SortKey(SCHEMA, SORT_ORDER); private MapRangePartitioner partitioner; @@ -82,10 +87,11 @@ public void setupBenchmark() { mapStatistics.put(sortKey, weight); }); - MapDataStatistics dataStatistics = new MapDataStatistics(mapStatistics); 
+ MapAssignment mapAssignment = + MapAssignment.fromKeyFrequency(2, mapStatistics, 0.0, SORT_ORDER_COMPARTOR); this.partitioner = new MapRangePartitioner( - SCHEMA, SortOrder.builderFor(SCHEMA).asc("id").build(), dataStatistics, 2); + SCHEMA, SortOrder.builderFor(SCHEMA).asc("id").build(), mapAssignment); List keys = Lists.newArrayList(weights.keySet().iterator()); long[] weightsCDF = new long[keys.size()]; diff --git a/flink/v1.20/build.gradle b/flink/v1.20/build.gradle index 392a1cb124f0..f2e1fb51a1f4 100644 --- a/flink/v1.20/build.gradle +++ b/flink/v1.20/build.gradle @@ -17,7 +17,7 @@ * under the License. */ -String flinkMajorVersion = '1.19' +String flinkMajorVersion = '1.20' String scalaVersion = System.getProperty("scalaVersion") != null ? System.getProperty("scalaVersion") : System.getProperty("defaultScalaVersion") project(":iceberg-flink:iceberg-flink-${flinkMajorVersion}") { @@ -32,15 +32,15 @@ project(":iceberg-flink:iceberg-flink-${flinkMajorVersion}") { implementation project(':iceberg-parquet') implementation project(':iceberg-hive-metastore') - compileOnly libs.flink119.avro + compileOnly libs.flink120.avro // for dropwizard histogram metrics implementation - compileOnly libs.flink119.metrics.dropwizard - compileOnly libs.flink119.streaming.java - compileOnly "${libs.flink119.streaming.java.get().module}:${libs.flink119.streaming.java.get().getVersion()}:tests" - compileOnly libs.flink119.table.api.java.bridge - compileOnly "org.apache.flink:flink-table-planner_${scalaVersion}:${libs.versions.flink119.get()}" - compileOnly libs.flink119.connector.base - compileOnly libs.flink119.connector.files + compileOnly libs.flink120.metrics.dropwizard + compileOnly libs.flink120.streaming.java + compileOnly "${libs.flink120.streaming.java.get().module}:${libs.flink120.streaming.java.get().getVersion()}:tests" + compileOnly libs.flink120.table.api.java.bridge + compileOnly "org.apache.flink:flink-table-planner_${scalaVersion}:${libs.versions.flink120.get()}" + compileOnly libs.flink120.connector.base + compileOnly libs.flink120.connector.files compileOnly libs.hadoop2.hdfs compileOnly libs.hadoop2.common @@ -68,13 +68,13 @@ project(":iceberg-flink:iceberg-flink-${flinkMajorVersion}") { implementation libs.datasketches - testImplementation libs.flink119.connector.test.utils - testImplementation libs.flink119.core - testImplementation libs.flink119.runtime - testImplementation(libs.flink119.test.utilsjunit) { + testImplementation libs.flink120.connector.test.utils + testImplementation libs.flink120.core + testImplementation libs.flink120.runtime + testImplementation(libs.flink120.test.utilsjunit) { exclude group: 'junit' } - testImplementation(libs.flink119.test.utils) { + testImplementation(libs.flink120.test.utils) { exclude group: "org.apache.curator", module: 'curator-test' exclude group: 'junit' } @@ -168,7 +168,7 @@ project(":iceberg-flink:iceberg-flink-runtime-${flinkMajorVersion}") { } // for dropwizard histogram metrics implementation - implementation libs.flink119.metrics.dropwizard + implementation libs.flink120.metrics.dropwizard // for integration testing with the flink-runtime-jar // all of those dependencies are required because the integration test extends FlinkTestBase @@ -178,13 +178,13 @@ project(":iceberg-flink:iceberg-flink-runtime-${flinkMajorVersion}") { integrationImplementation project(path: ":iceberg-flink:iceberg-flink-${flinkMajorVersion}", configuration: "testArtifacts") integrationImplementation project(path: ':iceberg-api', configuration: 'testArtifacts') 
integrationImplementation project(path: ':iceberg-hive-metastore', configuration: 'testArtifacts') - integrationImplementation(libs.flink119.test.utils) { + integrationImplementation(libs.flink120.test.utils) { exclude group: "org.apache.curator", module: 'curator-test' exclude group: 'junit' } - integrationImplementation libs.flink119.table.api.java.bridge - integrationImplementation "org.apache.flink:flink-table-planner_${scalaVersion}:${libs.versions.flink119.get()}" + integrationImplementation libs.flink120.table.api.java.bridge + integrationImplementation "org.apache.flink:flink-table-planner_${scalaVersion}:${libs.versions.flink120.get()}" integrationImplementation libs.hadoop2.common integrationImplementation libs.hadoop2.hdfs diff --git a/flink/v1.20/flink/src/jmh/java/org/apache/iceberg/flink/sink/shuffle/MapRangePartitionerBenchmark.java b/flink/v1.20/flink/src/jmh/java/org/apache/iceberg/flink/sink/shuffle/MapRangePartitionerBenchmark.java index c3917165753d..007b423e592a 100644 --- a/flink/v1.20/flink/src/jmh/java/org/apache/iceberg/flink/sink/shuffle/MapRangePartitionerBenchmark.java +++ b/flink/v1.20/flink/src/jmh/java/org/apache/iceberg/flink/sink/shuffle/MapRangePartitionerBenchmark.java @@ -18,6 +18,7 @@ */ package org.apache.iceberg.flink.sink.shuffle; +import java.util.Comparator; import java.util.List; import java.util.Map; import java.util.NavigableMap; @@ -27,6 +28,8 @@ import org.apache.iceberg.Schema; import org.apache.iceberg.SortKey; import org.apache.iceberg.SortOrder; +import org.apache.iceberg.SortOrderComparators; +import org.apache.iceberg.StructLike; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.relocated.com.google.common.collect.Maps; @@ -66,6 +69,8 @@ public class MapRangePartitionerBenchmark { Types.NestedField.required(9, "name9", Types.StringType.get())); private static final SortOrder SORT_ORDER = SortOrder.builderFor(SCHEMA).asc("id").build(); + private static final Comparator SORT_ORDER_COMPARTOR = + SortOrderComparators.forSchema(SCHEMA, SORT_ORDER); private static final SortKey SORT_KEY = new SortKey(SCHEMA, SORT_ORDER); private MapRangePartitioner partitioner; @@ -82,10 +87,11 @@ public void setupBenchmark() { mapStatistics.put(sortKey, weight); }); - MapDataStatistics dataStatistics = new MapDataStatistics(mapStatistics); + MapAssignment mapAssignment = + MapAssignment.fromKeyFrequency(2, mapStatistics, 0.0, SORT_ORDER_COMPARTOR); this.partitioner = new MapRangePartitioner( - SCHEMA, SortOrder.builderFor(SCHEMA).asc("id").build(), dataStatistics, 2); + SCHEMA, SortOrder.builderFor(SCHEMA).asc("id").build(), mapAssignment); List keys = Lists.newArrayList(weights.keySet().iterator()); long[] weightsCDF = new long[keys.size()]; diff --git a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSpeculativeExecutionSupport.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSpeculativeExecutionSupport.java index 41b023b93617..992b712d9d69 100644 --- a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSpeculativeExecutionSupport.java +++ b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSpeculativeExecutionSupport.java @@ -165,7 +165,7 @@ private static Configuration configure() { Configuration configuration = new Configuration(); configuration.set(CoreOptions.CHECK_LEAKED_CLASSLOADER, false); 
configuration.set(RestOptions.BIND_PORT, "0"); - configuration.set(JobManagerOptions.SLOT_REQUEST_TIMEOUT, 5000L); + configuration.set(JobManagerOptions.SLOT_REQUEST_TIMEOUT, Duration.ofSeconds(5)); // Use FLIP-27 source configuration.set(FlinkConfigOptions.TABLE_EXEC_ICEBERG_USE_FLIP27_SOURCE, true); diff --git a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/util/TestFlinkPackage.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/util/TestFlinkPackage.java index 4ba4f9d983dc..65f21f7d050c 100644 --- a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/util/TestFlinkPackage.java +++ b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/util/TestFlinkPackage.java @@ -29,7 +29,7 @@ public class TestFlinkPackage { /** This unit test would need to be adjusted as new Flink version is supported. */ @Test public void testVersion() { - assertThat(FlinkPackage.version()).isEqualTo("1.19.0"); + assertThat(FlinkPackage.version()).isEqualTo("1.20.0"); } @Test diff --git a/gradle.properties b/gradle.properties index c6b8dec17bc5..fcbe7d8de012 100644 --- a/gradle.properties +++ b/gradle.properties @@ -16,8 +16,8 @@ jmhOutputPath=build/reports/jmh/human-readable-output.txt jmhJsonOutputPath=build/reports/jmh/results.json jmhIncludeRegex=.* -systemProp.defaultFlinkVersions=1.19 -systemProp.knownFlinkVersions=1.17,1.18,1.19 +systemProp.defaultFlinkVersions=1.20 +systemProp.knownFlinkVersions=1.18,1.19,1.20 systemProp.defaultHiveVersions=2 systemProp.knownHiveVersions=2,3 systemProp.defaultSparkVersions=3.5 diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index 12caeda95407..77e610e885f6 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -39,9 +39,9 @@ delta-spark = "3.2.0" esotericsoftware-kryo = "4.0.3" errorprone-annotations = "2.29.2" findbugs-jsr305 = "3.0.2" -flink117 = { strictly = "1.17.2"} flink118 = { strictly = "1.18.1"} flink119 = { strictly = "1.19.0"} +flink120 = { strictly = "1.20.0"} google-libraries-bom = "26.43.0" guava = "33.2.1-jre" hadoop2 = "2.7.3" @@ -108,12 +108,6 @@ datasketches = { module = "org.apache.datasketches:datasketches-java", version.r delta-standalone = { module = "io.delta:delta-standalone_2.12", version.ref = "delta-standalone" } errorprone-annotations = { module = "com.google.errorprone:error_prone_annotations", version.ref = "errorprone-annotations" } findbugs-jsr305 = { module = "com.google.code.findbugs:jsr305", version.ref = "findbugs-jsr305" } -flink117-avro = { module = "org.apache.flink:flink-avro", version.ref = "flink117" } -flink117-connector-base = { module = "org.apache.flink:flink-connector-base", version.ref = "flink117" } -flink117-connector-files = { module = "org.apache.flink:flink-connector-files", version.ref = "flink117" } -flink117-metrics-dropwizard = { module = "org.apache.flink:flink-metrics-dropwizard", version.ref = "flink117" } -flink117-streaming-java = { module = "org.apache.flink:flink-streaming-java", version.ref = "flink117" } -flink117-table-api-java-bridge = { module = "org.apache.flink:flink-table-api-java-bridge", version.ref = "flink117" } flink118-avro = { module = "org.apache.flink:flink-avro", version.ref = "flink118" } flink118-connector-base = { module = "org.apache.flink:flink-connector-base", version.ref = "flink118" } flink118-connector-files = { module = "org.apache.flink:flink-connector-files", version.ref = "flink118" } @@ -126,6 +120,12 @@ flink119-connector-files = { module = "org.apache.flink:flink-connector-files", flink119-metrics-dropwizard 
= { module = "org.apache.flink:flink-metrics-dropwizard", version.ref = "flink119" } flink119-streaming-java = { module = "org.apache.flink:flink-streaming-java", version.ref = "flink119" } flink119-table-api-java-bridge = { module = "org.apache.flink:flink-table-api-java-bridge", version.ref = "flink119" } +flink120-avro = { module = "org.apache.flink:flink-avro", version.ref = "flink120" } +flink120-connector-base = { module = "org.apache.flink:flink-connector-base", version.ref = "flink120" } +flink120-connector-files = { module = "org.apache.flink:flink-connector-files", version.ref = "flink120" } +flink120-metrics-dropwizard = { module = "org.apache.flink:flink-metrics-dropwizard", version.ref = "flink120" } +flink120-streaming-java = { module = "org.apache.flink:flink-streaming-java", version.ref = "flink120" } +flink120-table-api-java-bridge = { module = "org.apache.flink:flink-table-api-java-bridge", version.ref = "flink120" } google-libraries-bom = { module = "com.google.cloud:libraries-bom", version.ref = "google-libraries-bom" } guava-guava = { module = "com.google.guava:guava", version.ref = "guava" } hadoop2-client = { module = "org.apache.hadoop:hadoop-client", version.ref = "hadoop2" } @@ -180,11 +180,6 @@ assertj-core = { module = "org.assertj:assertj-core", version.ref = "assertj-cor awaitility = { module = "org.awaitility:awaitility", version.ref = "awaitility" } delta-spark = { module = "io.delta:delta-spark_2.12", version.ref = "delta-spark" } esotericsoftware-kryo = { module = "com.esotericsoftware:kryo", version.ref = "esotericsoftware-kryo" } -flink117-connector-test-utils = { module = "org.apache.flink:flink-connector-test-utils", version.ref = "flink117" } -flink117-core = { module = "org.apache.flink:flink-core", version.ref = "flink117" } -flink117-runtime = { module = "org.apache.flink:flink-runtime", version.ref = "flink117" } -flink117-test-utils = { module = "org.apache.flink:flink-test-utils", version.ref = "flink117" } -flink117-test-utilsjunit = { module = "org.apache.flink:flink-test-utils-junit", version.ref = "flink117" } flink118-connector-test-utils = { module = "org.apache.flink:flink-connector-test-utils", version.ref = "flink118" } flink118-core = { module = "org.apache.flink:flink-core", version.ref = "flink118" } flink118-runtime = { module = "org.apache.flink:flink-runtime", version.ref = "flink118" } @@ -195,6 +190,11 @@ flink119-core = { module = "org.apache.flink:flink-core", version.ref = "flink11 flink119-runtime = { module = "org.apache.flink:flink-runtime", version.ref = "flink119" } flink119-test-utils = { module = "org.apache.flink:flink-test-utils", version.ref = "flink119" } flink119-test-utilsjunit = { module = "org.apache.flink:flink-test-utils-junit", version.ref = "flink119" } +flink120-connector-test-utils = { module = "org.apache.flink:flink-connector-test-utils", version.ref = "flink120" } +flink120-core = { module = "org.apache.flink:flink-core", version.ref = "flink120" } +flink120-runtime = { module = "org.apache.flink:flink-runtime", version.ref = "flink120" } +flink120-test-utils = { module = "org.apache.flink:flink-test-utils", version.ref = "flink120" } +flink120-test-utilsjunit = { module = "org.apache.flink:flink-test-utils-junit", version.ref = "flink120" } guava-testlib = { module = "com.google.guava:guava-testlib", version.ref = "guava" } jakarta-el-api = { module = "jakarta.el:jakarta.el-api", version.ref = "jakarta-el-api" } jakarta-servlet = {module = "jakarta.servlet:jakarta.servlet-api", version.ref = 
"jakarta-servlet-api"} diff --git a/jmh.gradle b/jmh.gradle index 5e5e0151219f..a5d8d624270d 100644 --- a/jmh.gradle +++ b/jmh.gradle @@ -26,16 +26,16 @@ def sparkVersions = (System.getProperty("sparkVersions") != null ? System.getPro def scalaVersion = System.getProperty("scalaVersion") != null ? System.getProperty("scalaVersion") : System.getProperty("defaultScalaVersion") def jmhProjects = [project(":iceberg-core"), project(":iceberg-data")] -if (flinkVersions.contains("1.16")) { - jmhProjects.add(project(":iceberg-flink:iceberg-flink-1.16")) +if (flinkVersions.contains("1.18")) { + jmhProjects.add(project(":iceberg-flink:iceberg-flink-1.18")) } -if (flinkVersions.contains("1.17")) { - jmhProjects.add(project(":iceberg-flink:iceberg-flink-1.17")) +if (flinkVersions.contains("1.19")) { + jmhProjects.add(project(":iceberg-flink:iceberg-flink-1.19")) } -if (flinkVersions.contains("1.18")) { - jmhProjects.add(project(":iceberg-flink:iceberg-flink-1.18")) +if (flinkVersions.contains("1.20")) { + jmhProjects.add(project(":iceberg-flink:iceberg-flink-1.20")) } if (sparkVersions.contains("3.3")) { diff --git a/settings.gradle b/settings.gradle index cdc69b0e2071..1e6d92bf1e1f 100644 --- a/settings.gradle +++ b/settings.gradle @@ -112,15 +112,6 @@ if (!flinkVersions.isEmpty()) { project(':flink').name = 'iceberg-flink' } -if (flinkVersions.contains("1.17")) { - include ":iceberg-flink:flink-1.17" - include ":iceberg-flink:flink-runtime-1.17" - project(":iceberg-flink:flink-1.17").projectDir = file('flink/v1.17/flink') - project(":iceberg-flink:flink-1.17").name = "iceberg-flink-1.17" - project(":iceberg-flink:flink-runtime-1.17").projectDir = file('flink/v1.17/flink-runtime') - project(":iceberg-flink:flink-runtime-1.17").name = "iceberg-flink-runtime-1.17" -} - if (flinkVersions.contains("1.18")) { include ":iceberg-flink:flink-1.18" include ":iceberg-flink:flink-runtime-1.18" @@ -139,6 +130,15 @@ if (flinkVersions.contains("1.19")) { project(":iceberg-flink:flink-runtime-1.19").name = "iceberg-flink-runtime-1.19" } +if (flinkVersions.contains("1.20")) { + include ":iceberg-flink:flink-1.20" + include ":iceberg-flink:flink-runtime-1.20" + project(":iceberg-flink:flink-1.20").projectDir = file('flink/v1.20/flink') + project(":iceberg-flink:flink-1.20").name = "iceberg-flink-1.20" + project(":iceberg-flink:flink-runtime-1.20").projectDir = file('flink/v1.20/flink-runtime') + project(":iceberg-flink:flink-runtime-1.20").name = "iceberg-flink-runtime-1.20" +} + if (sparkVersions.contains("3.3")) { include ":iceberg-spark:spark-3.3_${scalaVersion}" include ":iceberg-spark:spark-extensions-3.3_${scalaVersion}" From 257b1d7b18f638b5925de32bcd9bbcbe5a4416c2 Mon Sep 17 00:00:00 2001 From: Eduard Tudenhoefner Date: Tue, 6 Aug 2024 18:19:12 +0200 Subject: [PATCH 43/55] Build: Add checkstyle rule to ban assert usage (#10886) --- .baseline/checkstyle/checkstyle.xml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.baseline/checkstyle/checkstyle.xml b/.baseline/checkstyle/checkstyle.xml index 1491a35db194..570fb84d70a4 100644 --- a/.baseline/checkstyle/checkstyle.xml +++ b/.baseline/checkstyle/checkstyle.xml @@ -414,6 +414,9 @@ + + + From 86611d94dbc6b28f2f7b89addc5886d2d4ea96d8 Mon Sep 17 00:00:00 2001 From: Fokko Driesprong Date: Wed, 7 Aug 2024 11:11:27 +0200 Subject: [PATCH 44/55] Build: Bump Apache Avro to 1.12.0 (#10879) --- gradle/libs.versions.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index 
77e610e885f6..b2946163f38b 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -25,7 +25,7 @@ aliyun-sdk-oss = "3.10.2" antlr = "4.9.3" aircompressor = "0.27" arrow = "15.0.2" -avro = "1.11.3" +avro = "1.12.0" assertj-core = "3.26.3" awaitility = "4.2.1" awssdk-bom = "2.26.29" From 8ec65abdc5916ed19fffc6af6b58a2ee92d71c28 Mon Sep 17 00:00:00 2001 From: Piotr Findeisen Date: Wed, 7 Aug 2024 13:05:44 +0200 Subject: [PATCH 45/55] Spec: Fix rendering of unified partition struct (#10896) The angle brackets were without any escapes so docs renderer treated them as HTML. The resulting text on the website looked like an unfinished sentence: The unified partition type looks like Struct. Putting the angle brackets in backticks prevent them from being interpreted as HTML. Surrounding names like spec#0, field#1 are also put inside backticks for consistence. --- format/spec.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/format/spec.md b/format/spec.md index c3321fa6991c..c322f8174fe2 100644 --- a/format/spec.md +++ b/format/spec.md @@ -770,13 +770,13 @@ The unified partition type is a struct containing all fields that have ever been and sorted by the field ids in ascending order. In other words, the struct fields represent a union of all known partition fields sorted in ascending order by the field ids. For example, -1) spec#0 has two fields {field#1, field#2} -and then the table has evolved into spec#1 which has three fields {field#1, field#2, field#3}. -The unified partition type looks like Struct. +1) `spec#0` has two fields `{field#1, field#2}` +and then the table has evolved into `spec#1` which has three fields `{field#1, field#2, field#3}`. +The unified partition type looks like `Struct`. -2) spec#0 has two fields {field#1, field#2} -and then the table has evolved into spec#1 which has just one field {field#2}. -The unified partition type looks like Struct. +2) `spec#0` has two fields `{field#1, field#2}` +and then the table has evolved into `spec#1` which has just one field `{field#2}`. +The unified partition type looks like `Struct`. 
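As a worked illustration of examples 1) and 2) above: the unified partition type can be computed programmatically with the `Partitioning.partitionType` utility in iceberg-core, which unions the partition fields of every spec the table has ever had and sorts them by ascending field id. This is a minimal sketch; the table is assumed to be loaded from a catalog beforehand, and the field names follow the spec examples.

```java
// Sketch only: prints the unified partition type described in the spec examples above.
// Partitioning.partitionType(table) unions the partition fields of all specs (past and
// current) and orders them by ascending field id, matching the spec text.
import org.apache.iceberg.Partitioning;
import org.apache.iceberg.Table;
import org.apache.iceberg.types.Types;

class UnifiedPartitionTypeSketch {
  static void printUnifiedPartitionType(Table table) {
    Types.StructType unified = Partitioning.partitionType(table);
    // For example 1) this would list field#1, field#2 and field#3, each keeping the
    // partition field id assigned by the spec that introduced it.
    for (Types.NestedField field : unified.fields()) {
      System.out.println(field.fieldId() + ": " + field.name() + ": " + field.type());
    }
  }
}
```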
#### Commit Conflict Resolution and Retry From 71b64399dd5c74a63f97022cdb42d0cdcf615862 Mon Sep 17 00:00:00 2001 From: Tom Tanaka <43331405+tomtongue@users.noreply.github.com> Date: Wed, 7 Aug 2024 20:08:23 +0900 Subject: [PATCH 46/55] Docs: Fix catalog name for S3 MRAP example (#10897) --- docs/docs/aws.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/docs/aws.md b/docs/docs/aws.md index bba968fa5586..43e540c20673 100644 --- a/docs/docs/aws.md +++ b/docs/docs/aws.md @@ -468,8 +468,8 @@ spark-sql --conf spark.sql.catalog.my_catalog=org.apache.iceberg.spark.SparkCata --conf spark.sql.catalog.my_catalog.type=glue \ --conf spark.sql.catalog.my_catalog.io-impl=org.apache.iceberg.aws.s3.S3FileIO \ --conf spark.sql.catalog.my_catalog.s3.use-arn-region-enabled=false \ - --conf spark.sql.catalog.test.s3.access-points.my-bucket1=arn:aws:s3::123456789012:accesspoint:mfzwi23gnjvgw.mrap \ - --conf spark.sql.catalog.test.s3.access-points.my-bucket2=arn:aws:s3::123456789012:accesspoint:mfzwi23gnjvgw.mrap + --conf spark.sql.catalog.my_catalog.s3.access-points.my-bucket1=arn:aws:s3::123456789012:accesspoint:mfzwi23gnjvgw.mrap \ + --conf spark.sql.catalog.my_catalog.s3.access-points.my-bucket2=arn:aws:s3::123456789012:accesspoint:mfzwi23gnjvgw.mrap ``` For the above example, the objects in S3 on `my-bucket1` and `my-bucket2` buckets will use `arn:aws:s3::123456789012:accesspoint:mfzwi23gnjvgw.mrap` access-point for all S3 operations. From a3cbdcbae6627b42a52eb8e028ce4f83f30421f9 Mon Sep 17 00:00:00 2001 From: Robert Stupp Date: Wed, 7 Aug 2024 16:07:38 +0200 Subject: [PATCH 47/55] Add Flink 1.20 & remove Flink 1.17 in stage-binaries.sh and docs (#10888) This is a follow-up to #10881 --- dev/stage-binaries.sh | 2 +- site/docs/multi-engine-support.md | 3 ++- site/docs/releases.md | 4 ++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/dev/stage-binaries.sh b/dev/stage-binaries.sh index 05bf3c4253f2..29cf31e5f423 100755 --- a/dev/stage-binaries.sh +++ b/dev/stage-binaries.sh @@ -19,7 +19,7 @@ # SCALA_VERSION=2.12 -FLINK_VERSIONS=1.17,1.18,1.19 +FLINK_VERSIONS=1.18,1.19,1.20 SPARK_VERSIONS=3.3,3.4,3.5 HIVE_VERSIONS=2,3 diff --git a/site/docs/multi-engine-support.md b/site/docs/multi-engine-support.md index e4123cc579b3..a3c63276bfdb 100644 --- a/site/docs/multi-engine-support.md +++ b/site/docs/multi-engine-support.md @@ -90,9 +90,10 @@ Users should continuously upgrade their Flink version to stay up-to-date. 
| 1.14 | End of Life | 0.13.0 | 1.2.0 | [iceberg-flink-runtime-1.14](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-flink-runtime-1.14/1.2.0/iceberg-flink-runtime-1.14-1.2.0.jar) | | 1.15 | End of Life | 0.14.0 | 1.4.3 | [iceberg-flink-runtime-1.15](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-flink-runtime-1.15/1.4.3/iceberg-flink-runtime-1.15-1.4.3.jar) | | 1.16 | End of Life | 1.1.0 | 1.5.0 | [iceberg-flink-runtime-1.16](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-flink-runtime-1.16/1.5.0/iceberg-flink-runtime-1.16-1.5.0.jar) | -| 1.17 | Deprecated | 1.3.0 | {{ icebergVersion }} | [iceberg-flink-runtime-1.17](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-flink-runtime-1.17/{{ icebergVersion }}/iceberg-flink-runtime-1.17-{{ icebergVersion }}.jar) | +| 1.17 | Deprecated | 1.3.0 | 1.6.0 | [iceberg-flink-runtime-1.17](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-flink-runtime-1.17/{{ icebergVersion }}/iceberg-flink-runtime-1.17-{{ icebergVersion }}.jar) | | 1.18 | Maintained | 1.5.0 | {{ icebergVersion }} | [iceberg-flink-runtime-1.18](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-flink-runtime-1.18/{{ icebergVersion }}/iceberg-flink-runtime-1.18-{{ icebergVersion }}.jar) | | 1.19 | Maintained | 1.6.0 | {{ icebergVersion }} | [iceberg-flink-runtime-1.19](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-flink-runtime-1.19/{{ icebergVersion }}/iceberg-flink-runtime-1.19-{{ icebergVersion }}.jar) | +| 1.20 | Maintained | 1.7.0 | {{ icebergVersion }} | [iceberg-flink-runtime-1.20](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-flink-runtime-1.20/{{ icebergVersion }}/iceberg-flink-runtime-1.20-{{ icebergVersion }}.jar) | diff --git a/site/docs/releases.md b/site/docs/releases.md index cc29857ed802..6b48e31a0728 100644 --- a/site/docs/releases.md +++ b/site/docs/releases.md @@ -31,9 +31,9 @@ The latest version of Iceberg is [{{ icebergVersion }}](https://github.com/apach * [{{ icebergVersion }} Spark 3.4\_with Scala 2.13 runtime Jar](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark-runtime-3.4_2.13/{{ icebergVersion }}/iceberg-spark-runtime-3.4_2.13-{{ icebergVersion }}.jar) * [{{ icebergVersion }} Spark 3.3\_with Scala 2.12 runtime Jar](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark-runtime-3.3_2.12/{{ icebergVersion }}/iceberg-spark-runtime-3.3_2.12-{{ icebergVersion }}.jar) * [{{ icebergVersion }} Spark 3.3\_with Scala 2.13 runtime Jar](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark-runtime-3.3_2.13/{{ icebergVersion }}/iceberg-spark-runtime-3.3_2.13-{{ icebergVersion }}.jar) +* [{{ icebergVersion }} Flink 1.20 runtime Jar](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-flink-runtime-1.20/{{ icebergVersion }}/iceberg-flink-runtime-1.20-{{ icebergVersion }}.jar) +* [{{ icebergVersion }} Flink 1.19 runtime Jar](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-flink-runtime-1.19/{{ icebergVersion }}/iceberg-flink-runtime-1.19-{{ icebergVersion }}.jar) * [{{ icebergVersion }} Flink 1.18 runtime Jar](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-flink-runtime-1.18/{{ icebergVersion }}/iceberg-flink-runtime-1.18-{{ icebergVersion }}.jar) -* [{{ icebergVersion }} Flink 1.17 runtime 
Jar](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-flink-runtime-1.17/{{ icebergVersion }}/iceberg-flink-runtime-1.17-{{ icebergVersion }}.jar) -* [{{ icebergVersion }} Flink 1.16 runtime Jar](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-flink-runtime-1.16/{{ icebergVersion }}/iceberg-flink-runtime-1.16-{{ icebergVersion }}.jar) * [{{ icebergVersion }} Hive runtime Jar](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-hive-runtime/{{ icebergVersion }}/iceberg-hive-runtime-{{ icebergVersion }}.jar) * [{{ icebergVersion }} aws-bundle Jar](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-aws-bundle/{{ icebergVersion }}/iceberg-aws-bundle-{{ icebergVersion }}.jar) * [{{ icebergVersion }} gcp-bundle Jar](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-gcp-bundle/{{ icebergVersion }}/iceberg-gcp-bundle-{{ icebergVersion }}.jar) From 97e034b2cec9408a6f792c410a8eb8dddb452e14 Mon Sep 17 00:00:00 2001 From: Piotr Findeisen Date: Wed, 7 Aug 2024 16:08:16 +0200 Subject: [PATCH 48/55] Flink: Remove deprecated RowDataUtil.clone method (#10902) Scheduled for removal in 1.7.0. --- .../apache/iceberg/flink/data/RowDataUtil.java | 18 ------------------ .../apache/iceberg/flink/data/RowDataUtil.java | 18 ------------------ .../apache/iceberg/flink/data/RowDataUtil.java | 18 ------------------ 3 files changed, 54 deletions(-) diff --git a/flink/v1.18/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java b/flink/v1.18/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java index 3a8f5ccc6c03..4bd85bbd97b4 100644 --- a/flink/v1.18/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java +++ b/flink/v1.18/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java @@ -102,22 +102,4 @@ public static RowData clone( return ret; } - - /** - * @deprecated will be removed in 1.7.0; Not reusing FieldGetter in this method could lead to - * performance degradation, use {@link #clone(RowData, RowData, RowType, TypeSerializer[], - * RowData.FieldGetter[])} instead. - */ - @Deprecated - public static RowData clone( - RowData from, RowData reuse, RowType rowType, TypeSerializer[] fieldSerializers) { - RowData.FieldGetter[] fieldGetters = new RowData.FieldGetter[rowType.getFieldCount()]; - for (int i = 0; i < rowType.getFieldCount(); ++i) { - if (!from.isNullAt(i)) { - fieldGetters[i] = RowData.createFieldGetter(rowType.getTypeAt(i), i); - } - } - - return clone(from, reuse, rowType, fieldSerializers, fieldGetters); - } } diff --git a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java index 3a8f5ccc6c03..4bd85bbd97b4 100644 --- a/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java +++ b/flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java @@ -102,22 +102,4 @@ public static RowData clone( return ret; } - - /** - * @deprecated will be removed in 1.7.0; Not reusing FieldGetter in this method could lead to - * performance degradation, use {@link #clone(RowData, RowData, RowType, TypeSerializer[], - * RowData.FieldGetter[])} instead. 
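The deprecation note above points to the remaining overload that takes a pre-built `RowData.FieldGetter[]`. A minimal sketch of that pattern follows, with the field getters and serializers created once per `RowType` and reused for every row; the serializer wiring through `InternalSerializers` is an assumption made for illustration, not code from this patch.

```java
// Sketch of the surviving RowDataUtil.clone overload referenced in the deprecation note:
// build FieldGetter[] (and serializers) once per RowType and reuse them across rows,
// which is exactly the reuse the removed overload could not provide.
import org.apache.flink.api.common.typeutils.TypeSerializer;
import org.apache.flink.table.data.GenericRowData;
import org.apache.flink.table.data.RowData;
import org.apache.flink.table.runtime.typeutils.InternalSerializers;
import org.apache.flink.table.types.logical.RowType;
import org.apache.iceberg.flink.data.RowDataUtil;

class RowDataCloneSketch {
  static RowData[] cloneAll(RowData[] rows, RowType rowType) {
    TypeSerializer[] serializers = new TypeSerializer[rowType.getFieldCount()];
    RowData.FieldGetter[] getters = new RowData.FieldGetter[rowType.getFieldCount()];
    for (int i = 0; i < rowType.getFieldCount(); i++) {
      serializers[i] = InternalSerializers.create(rowType.getTypeAt(i));
      getters[i] = RowData.createFieldGetter(rowType.getTypeAt(i), i);
    }

    RowData[] copies = new RowData[rows.length];
    for (int i = 0; i < rows.length; i++) {
      // the same getters and serializers are reused for every cloned row
      copies[i] =
          RowDataUtil.clone(
              rows[i], new GenericRowData(rowType.getFieldCount()), rowType, serializers, getters);
    }

    return copies;
  }
}
```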
- */ - @Deprecated - public static RowData clone( - RowData from, RowData reuse, RowType rowType, TypeSerializer[] fieldSerializers) { - RowData.FieldGetter[] fieldGetters = new RowData.FieldGetter[rowType.getFieldCount()]; - for (int i = 0; i < rowType.getFieldCount(); ++i) { - if (!from.isNullAt(i)) { - fieldGetters[i] = RowData.createFieldGetter(rowType.getTypeAt(i), i); - } - } - - return clone(from, reuse, rowType, fieldSerializers, fieldGetters); - } } diff --git a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java index 3a8f5ccc6c03..4bd85bbd97b4 100644 --- a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java +++ b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java @@ -102,22 +102,4 @@ public static RowData clone( return ret; } - - /** - * @deprecated will be removed in 1.7.0; Not reusing FieldGetter in this method could lead to - * performance degradation, use {@link #clone(RowData, RowData, RowType, TypeSerializer[], - * RowData.FieldGetter[])} instead. - */ - @Deprecated - public static RowData clone( - RowData from, RowData reuse, RowType rowType, TypeSerializer[] fieldSerializers) { - RowData.FieldGetter[] fieldGetters = new RowData.FieldGetter[rowType.getFieldCount()]; - for (int i = 0; i < rowType.getFieldCount(); ++i) { - if (!from.isNullAt(i)) { - fieldGetters[i] = RowData.createFieldGetter(rowType.getTypeAt(i), i); - } - } - - return clone(from, reuse, rowType, fieldSerializers, fieldGetters); - } } From 3bee806d0ca5ba356b3da698b3bcadb8a20d8923 Mon Sep 17 00:00:00 2001 From: Eduard Tudenhoefner Date: Thu, 8 Aug 2024 09:59:12 +0200 Subject: [PATCH 49/55] AWS: Fix flaky TestS3RestSigner (#10898) --- .../org/apache/iceberg/aws/s3/signer/S3SignerServlet.java | 4 ++-- .../apache/iceberg/aws/s3/signer/TestS3RestSigner.java | 8 +++++++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/aws/src/test/java/org/apache/iceberg/aws/s3/signer/S3SignerServlet.java b/aws/src/test/java/org/apache/iceberg/aws/s3/signer/S3SignerServlet.java index 06c099e3be5e..ce7527af765c 100644 --- a/aws/src/test/java/org/apache/iceberg/aws/s3/signer/S3SignerServlet.java +++ b/aws/src/test/java/org/apache/iceberg/aws/s3/signer/S3SignerServlet.java @@ -148,7 +148,7 @@ private OAuthTokenResponse handleOAuth(Map requestMap) { .withToken("client-credentials-token:sub=" + requestMap.get("client_id")) .withIssuedTokenType("urn:ietf:params:oauth:token-type:access_token") .withTokenType("Bearer") - .setExpirationInSeconds(100) + .setExpirationInSeconds(10000) .build()); case "urn:ietf:params:oauth:grant-type:token-exchange": @@ -163,7 +163,7 @@ private OAuthTokenResponse handleOAuth(Map requestMap) { .withToken(token) .withIssuedTokenType("urn:ietf:params:oauth:token-type:access_token") .withTokenType("Bearer") - .setExpirationInSeconds(100) + .setExpirationInSeconds(10000) .build()); default: diff --git a/aws/src/test/java/org/apache/iceberg/aws/s3/signer/TestS3RestSigner.java b/aws/src/test/java/org/apache/iceberg/aws/s3/signer/TestS3RestSigner.java index 5e20b71e438c..08f356ca7ab1 100644 --- a/aws/src/test/java/org/apache/iceberg/aws/s3/signer/TestS3RestSigner.java +++ b/aws/src/test/java/org/apache/iceberg/aws/s3/signer/TestS3RestSigner.java @@ -107,7 +107,13 @@ public static void afterClass() throws Exception { ScheduledThreadPoolExecutor executor = ((ScheduledThreadPoolExecutor) validatingSigner.icebergSigner.tokenRefreshExecutor()); 
- // token expiration is set to 100s so there should be exactly one token scheduled for refresh + // token expiration is set to 10000s by the S3SignerServlet so there should be exactly one token + // scheduled for refresh. Such a high token expiration value is explicitly selected to be much + // larger than TestS3RestSigner would need to execute all tests. + // The reason why this check is done here with a high token expiration is to make sure that + // there aren't other token refreshes being scheduled after every sign request and after + // TestS3RestSigner completes all tests, there should be only this single token in the queue + // that is scheduled for refresh assertThat(executor.getPoolSize()).isEqualTo(1); assertThat(executor.getQueue()) .as("should only have a single token scheduled for refresh") From 70c506ebad2dfc6d61b99c05efd59e884282bfa6 Mon Sep 17 00:00:00 2001 From: Amogh Jahagirdar Date: Thu, 8 Aug 2024 14:35:17 -0700 Subject: [PATCH 50/55] AWS: Implement SupportsRecoveryOperations mixin for S3FileIO (#10721) --- .../apache/iceberg/aws/AwsIntegTestUtil.java | 62 ++++++++++++------- .../aws/s3/TestS3FileIOIntegration.java | 38 ++++++++++++ .../org/apache/iceberg/aws/s3/S3FileIO.java | 50 ++++++++++++++- 3 files changed, 128 insertions(+), 22 deletions(-) diff --git a/aws/src/integration/java/org/apache/iceberg/aws/AwsIntegTestUtil.java b/aws/src/integration/java/org/apache/iceberg/aws/AwsIntegTestUtil.java index bbe062d5db48..7e0ca6ed10b2 100644 --- a/aws/src/integration/java/org/apache/iceberg/aws/AwsIntegTestUtil.java +++ b/aws/src/integration/java/org/apache/iceberg/aws/AwsIntegTestUtil.java @@ -21,6 +21,7 @@ import java.util.List; import java.util.stream.Collectors; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import software.amazon.awssdk.http.urlconnection.UrlConnectionHttpClient; @@ -30,9 +31,10 @@ import software.amazon.awssdk.services.s3.S3Client; import software.amazon.awssdk.services.s3.model.Delete; import software.amazon.awssdk.services.s3.model.DeleteObjectsRequest; -import software.amazon.awssdk.services.s3.model.ListObjectsV2Request; -import software.amazon.awssdk.services.s3.model.ListObjectsV2Response; +import software.amazon.awssdk.services.s3.model.ListObjectVersionsRequest; import software.amazon.awssdk.services.s3.model.ObjectIdentifier; +import software.amazon.awssdk.services.s3.model.ObjectVersion; +import software.amazon.awssdk.services.s3.paginators.ListObjectVersionsIterable; import software.amazon.awssdk.services.s3control.S3ControlClient; import software.amazon.awssdk.services.s3control.model.CreateAccessPointRequest; import software.amazon.awssdk.services.s3control.model.DeleteAccessPointRequest; @@ -94,28 +96,46 @@ public static String testAccountId() { } public static void cleanS3Bucket(S3Client s3, String bucketName, String prefix) { - boolean hasContent = true; - while (hasContent) { - ListObjectsV2Response response = - s3.listObjectsV2( - ListObjectsV2Request.builder().bucket(bucketName).prefix(prefix).build()); - hasContent = response.hasContents(); - if (hasContent) { - s3.deleteObjects( - DeleteObjectsRequest.builder() - .bucket(bucketName) - .delete( - Delete.builder() - .objects( - response.contents().stream() - .map(obj -> ObjectIdentifier.builder().key(obj.key()).build()) - .collect(Collectors.toList())) - .build()) - .build()); - } + ListObjectVersionsIterable response = + 
s3.listObjectVersionsPaginator( + ListObjectVersionsRequest.builder().bucket(bucketName).prefix(prefix).build()); + List versionsToDelete = Lists.newArrayList(); + int batchDeletionSize = 1000; + response.versions().stream() + .forEach( + version -> { + versionsToDelete.add(version); + if (versionsToDelete.size() == batchDeletionSize) { + deleteObjectVersions(s3, bucketName, versionsToDelete); + versionsToDelete.clear(); + } + }); + + if (!versionsToDelete.isEmpty()) { + deleteObjectVersions(s3, bucketName, versionsToDelete); } } + private static void deleteObjectVersions( + S3Client s3, String bucket, List objectVersions) { + s3.deleteObjects( + DeleteObjectsRequest.builder() + .bucket(bucket) + .delete( + Delete.builder() + .objects( + objectVersions.stream() + .map( + obj -> + ObjectIdentifier.builder() + .key(obj.key()) + .versionId(obj.versionId()) + .build()) + .collect(Collectors.toList())) + .build()) + .build()); + } + public static void cleanGlueCatalog(GlueClient glue, List namespaces) { for (String namespace : namespaces) { try { diff --git a/aws/src/integration/java/org/apache/iceberg/aws/s3/TestS3FileIOIntegration.java b/aws/src/integration/java/org/apache/iceberg/aws/s3/TestS3FileIOIntegration.java index 18abb82ce74a..cacf04891896 100644 --- a/aws/src/integration/java/org/apache/iceberg/aws/s3/TestS3FileIOIntegration.java +++ b/aws/src/integration/java/org/apache/iceberg/aws/s3/TestS3FileIOIntegration.java @@ -53,14 +53,17 @@ import software.amazon.awssdk.services.kms.model.ListAliasesResponse; import software.amazon.awssdk.services.kms.model.ScheduleKeyDeletionRequest; import software.amazon.awssdk.services.s3.S3Client; +import software.amazon.awssdk.services.s3.model.BucketVersioningStatus; import software.amazon.awssdk.services.s3.model.GetObjectAclRequest; import software.amazon.awssdk.services.s3.model.GetObjectAclResponse; import software.amazon.awssdk.services.s3.model.GetObjectRequest; import software.amazon.awssdk.services.s3.model.GetObjectResponse; import software.amazon.awssdk.services.s3.model.ObjectCannedACL; import software.amazon.awssdk.services.s3.model.Permission; +import software.amazon.awssdk.services.s3.model.PutBucketVersioningRequest; import software.amazon.awssdk.services.s3.model.PutObjectRequest; import software.amazon.awssdk.services.s3.model.ServerSideEncryption; +import software.amazon.awssdk.services.s3.model.VersioningConfiguration; import software.amazon.awssdk.services.s3control.S3ControlClient; import software.amazon.awssdk.utils.ImmutableMap; import software.amazon.awssdk.utils.IoUtils; @@ -106,6 +109,12 @@ public static void beforeClass() { AwsIntegTestUtil.createAccessPoint(s3Control, accessPointName, bucketName); AwsIntegTestUtil.createAccessPoint( crossRegionS3Control, crossRegionAccessPointName, crossRegionBucketName); + s3.putBucketVersioning( + PutBucketVersioningRequest.builder() + .bucket(bucketName) + .versioningConfiguration( + VersioningConfiguration.builder().status(BucketVersioningStatus.ENABLED).build()) + .build()); } @AfterAll @@ -445,6 +454,35 @@ public void testPrefixDelete() { }); } + @Test + public void testFileRecoveryHappyPath() throws Exception { + S3FileIO s3FileIO = new S3FileIO(clientFactory::s3, new S3FileIOProperties()); + String filePath = String.format("s3://%s/%s/%s", bucketName, prefix, "someFile.parquet"); + write(s3FileIO, filePath); + s3FileIO.deleteFile(filePath); + assertThat(s3FileIO.newInputFile(filePath).exists()).isFalse(); + + assertThat(s3FileIO.recoverFile(filePath)).isTrue(); + 
assertThat(s3FileIO.newInputFile(filePath).exists()).isTrue(); + } + + @Test + public void testFileRecoveryFailsToRecover() throws Exception { + S3FileIO s3FileIO = new S3FileIO(clientFactory::s3, new S3FileIOProperties()); + s3.putBucketVersioning( + PutBucketVersioningRequest.builder() + .bucket(bucketName) + .versioningConfiguration( + VersioningConfiguration.builder().status(BucketVersioningStatus.SUSPENDED).build()) + .build()); + String filePath = String.format("s3://%s/%s/%s", bucketName, prefix, "unversionedFile.parquet"); + write(s3FileIO, filePath); + s3FileIO.deleteFile(filePath); + assertThat(s3FileIO.newInputFile(filePath).exists()).isFalse(); + + assertThat(s3FileIO.recoverFile(filePath)).isFalse(); + } + private S3FileIOProperties getDeletionTestProperties() { S3FileIOProperties properties = new S3FileIOProperties(); properties.setDeleteBatchSize(deletionBatchSize); diff --git a/aws/src/main/java/org/apache/iceberg/aws/s3/S3FileIO.java b/aws/src/main/java/org/apache/iceberg/aws/s3/S3FileIO.java index dd13e13f01a6..f7d2da5eb907 100644 --- a/aws/src/main/java/org/apache/iceberg/aws/s3/S3FileIO.java +++ b/aws/src/main/java/org/apache/iceberg/aws/s3/S3FileIO.java @@ -20,8 +20,10 @@ import java.util.Arrays; import java.util.Collection; +import java.util.Comparator; import java.util.List; import java.util.Map; +import java.util.Optional; import java.util.Set; import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutorService; @@ -37,6 +39,7 @@ import org.apache.iceberg.io.FileInfo; import org.apache.iceberg.io.InputFile; import org.apache.iceberg.io.OutputFile; +import org.apache.iceberg.io.SupportsRecoveryOperations; import org.apache.iceberg.metrics.MetricsContext; import org.apache.iceberg.relocated.com.google.common.base.Joiner; import org.apache.iceberg.relocated.com.google.common.collect.Lists; @@ -52,6 +55,7 @@ import org.apache.iceberg.util.ThreadPools; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import software.amazon.awssdk.core.exception.SdkException; import software.amazon.awssdk.services.s3.S3Client; import software.amazon.awssdk.services.s3.model.Delete; import software.amazon.awssdk.services.s3.model.DeleteObjectRequest; @@ -61,10 +65,12 @@ import software.amazon.awssdk.services.s3.model.GetObjectTaggingResponse; import software.amazon.awssdk.services.s3.model.ListObjectsV2Request; import software.amazon.awssdk.services.s3.model.ObjectIdentifier; +import software.amazon.awssdk.services.s3.model.ObjectVersion; import software.amazon.awssdk.services.s3.model.PutObjectTaggingRequest; import software.amazon.awssdk.services.s3.model.S3Exception; import software.amazon.awssdk.services.s3.model.Tag; import software.amazon.awssdk.services.s3.model.Tagging; +import software.amazon.awssdk.services.s3.paginators.ListObjectVersionsIterable; /** * FileIO implementation backed by S3. @@ -73,7 +79,7 @@ * schemes s3a, s3n, https are also treated as s3 file paths. Using this FileIO with other schemes * will result in {@link org.apache.iceberg.exceptions.ValidationException}. 
*/ -public class S3FileIO implements CredentialSupplier, DelegateFileIO { +public class S3FileIO implements CredentialSupplier, DelegateFileIO, SupportsRecoveryOperations { private static final Logger LOG = LoggerFactory.getLogger(S3FileIO.class); private static final String DEFAULT_METRICS_IMPL = "org.apache.iceberg.hadoop.HadoopMetricsContext"; @@ -420,4 +426,46 @@ protected void finalize() throws Throwable { } } } + + @Override + public boolean recoverFile(String path) { + S3URI location = new S3URI(path, s3FileIOProperties.bucketToAccessPointMapping()); + ListObjectVersionsIterable response = + client() + .listObjectVersionsPaginator( + builder -> builder.bucket(location.bucket()).prefix(location.key())); + + // Recover to the last modified version, not isLatest, + // since isLatest is true for deletion markers. + Optional recoverVersion = + response.versions().stream().max(Comparator.comparing(ObjectVersion::lastModified)); + + return recoverVersion.map(version -> recoverObject(version, location.bucket())).orElse(false); + } + + private boolean recoverObject(ObjectVersion version, String bucket) { + if (version.isLatest()) { + return true; + } + + LOG.info("Attempting to recover object {}", version.key()); + try { + // Perform a copy instead of deleting the delete marker + // so that recovery does not rely on delete permissions + client() + .copyObject( + builder -> + builder + .sourceBucket(bucket) + .sourceKey(version.key()) + .sourceVersionId(version.versionId()) + .destinationBucket(bucket) + .destinationKey(version.key())); + } catch (SdkException e) { + LOG.warn("Failed to recover object {}", version.key(), e); + return false; + } + + return true; + } } From d17a7f189afa25c6be37df1415f4e2f8594effbe Mon Sep 17 00:00:00 2001 From: Naveen Kumar Date: Fri, 9 Aug 2024 15:04:36 +0530 Subject: [PATCH 51/55] Core: Remove deprecated APIs for 1.7.0 (#10818) --- .palantir/revapi.yml | 75 +++++++++++++++++++ .../iceberg/common/DynConstructors.java | 18 +---- .../org/apache/iceberg/common/DynFields.java | 15 ---- .../org/apache/iceberg/common/DynMethods.java | 38 +--------- .../iceberg/BaseMetastoreTableOperations.java | 11 --- .../apache/iceberg/FileScanTaskParser.java | 29 ------- .../org/apache/iceberg/SnapshotProducer.java | 12 --- .../org/apache/iceberg/io/ContentCache.java | 25 +------ .../apache/iceberg/rest/auth/OAuth2Util.java | 20 ----- .../iceberg/TestFileScanTaskParser.java | 30 -------- .../iceberg/hive/HiveOperationsBase.java | 10 --- 11 files changed, 81 insertions(+), 202 deletions(-) diff --git a/.palantir/revapi.yml b/.palantir/revapi.yml index 3018840b4513..e58ce70ded7a 100644 --- a/.palantir/revapi.yml +++ b/.palantir/revapi.yml @@ -1056,6 +1056,81 @@ acceptedBreaks: - code: "java.method.removed" old: "method org.apache.iceberg.DataFiles.Builder org.apache.iceberg.DataFiles.Builder::withEqualityFieldIds(java.util.List)" justification: "Deprecations for 1.6.0 release" + "1.6.0": + org.apache.iceberg:iceberg-common: + - code: "java.method.removed" + old: "method org.apache.iceberg.common.DynFields.StaticField org.apache.iceberg.common.DynFields.Builder::buildStaticChecked()\ + \ throws java.lang.NoSuchFieldException" + justification: "Removing deprecated code" + - code: "java.method.removed" + old: "method java.lang.Class org.apache.iceberg.common.DynConstructors.Ctor::getConstructedClass()" + justification: "Removing deprecated code" + - code: "java.method.removed" + old: "method org.apache.iceberg.common.DynConstructors.Builder 
org.apache.iceberg.common.DynConstructors.Builder::hiddenImpl(java.lang.Class[])" + justification: "Removing deprecated code" + - code: "java.method.removed" + old: "method org.apache.iceberg.common.DynMethods.Builder org.apache.iceberg.common.DynMethods.Builder::ctorImpl(java.lang.Class,\ + \ java.lang.Class[])" + justification: "Removing deprecated code" + - code: "java.method.removed" + old: "method org.apache.iceberg.common.DynMethods.Builder org.apache.iceberg.common.DynMethods.Builder::ctorImpl(java.lang.String,\ + \ java.lang.Class[])" + justification: "Removing deprecated code" + - code: "java.method.visibilityReduced" + old: "method R org.apache.iceberg.common.DynMethods.UnboundMethod::invokeChecked(java.lang.Object,\ + \ java.lang.Object[]) throws java.lang.Exception" + new: "method R org.apache.iceberg.common.DynMethods.UnboundMethod::invokeChecked(java.lang.Object,\ + \ java.lang.Object[]) throws java.lang.Exception" + justification: "Reduced visibility and scoped to package" + org.apache.iceberg:iceberg-core: + - code: "java.class.removed" + old: "enum org.apache.iceberg.BaseMetastoreTableOperations.CommitStatus" + justification: "Removing deprecated code" + - code: "java.method.removed" + old: "method java.lang.String org.apache.iceberg.FileScanTaskParser::toJson(org.apache.iceberg.FileScanTask)" + justification: "Removing deprecated code" + - code: "java.method.removed" + old: "method org.apache.iceberg.FileScanTask org.apache.iceberg.FileScanTaskParser::fromJson(java.lang.String,\ + \ boolean)" + justification: "Removing deprecated code" + - code: "java.method.removed" + old: "method org.apache.iceberg.io.ContentCache.CacheEntry org.apache.iceberg.io.ContentCache::get(java.lang.String,\ + \ java.util.function.Function)" + justification: "Removing deprecated code" + - code: "java.method.removed" + old: "method org.apache.iceberg.io.ContentCache.CacheEntry org.apache.iceberg.io.ContentCache::getIfPresent(java.lang.String)" + justification: "Removing deprecated code" + - code: "java.method.removed" + old: "method org.apache.iceberg.io.InputFile org.apache.iceberg.io.ContentCache::tryCache(org.apache.iceberg.io.FileIO,\ + \ java.lang.String, long)" + justification: "Removing deprecated code" + - code: "java.method.removed" + old: "method org.apache.iceberg.io.OutputFile org.apache.iceberg.SnapshotProducer::newManifestOutput()\ + \ @ org.apache.iceberg.BaseOverwriteFiles" + justification: "Removing deprecated code" + - code: "java.method.removed" + old: "method org.apache.iceberg.io.OutputFile org.apache.iceberg.SnapshotProducer::newManifestOutput()\ + \ @ org.apache.iceberg.BaseReplacePartitions" + justification: "Removing deprecated code" + - code: "java.method.removed" + old: "method org.apache.iceberg.io.OutputFile org.apache.iceberg.SnapshotProducer::newManifestOutput()\ + \ @ org.apache.iceberg.BaseRewriteManifests" + justification: "Removing deprecated code" + - code: "java.method.removed" + old: "method org.apache.iceberg.io.OutputFile org.apache.iceberg.SnapshotProducer::newManifestOutput()\ + \ @ org.apache.iceberg.StreamingDelete" + justification: "Removing deprecated code" + - code: "java.method.removed" + old: "method void org.apache.iceberg.rest.auth.OAuth2Util.AuthSession::(java.util.Map, java.lang.String, java.lang.String, java.lang.String,\ + \ java.lang.String, java.lang.String)" + justification: "Removing deprecated code" + - code: "java.method.returnTypeChanged" + old: "method org.apache.iceberg.BaseMetastoreTableOperations.CommitStatus 
org.apache.iceberg.BaseMetastoreTableOperations::checkCommitStatus(java.lang.String,\ + \ org.apache.iceberg.TableMetadata)" + new: "method org.apache.iceberg.BaseMetastoreOperations.CommitStatus org.apache.iceberg.BaseMetastoreTableOperations::checkCommitStatus(java.lang.String,\ + \ org.apache.iceberg.TableMetadata)" + justification: "Removing deprecated code" apache-iceberg-0.14.0: org.apache.iceberg:iceberg-api: - code: "java.class.defaultSerializationChanged" diff --git a/common/src/main/java/org/apache/iceberg/common/DynConstructors.java b/common/src/main/java/org/apache/iceberg/common/DynConstructors.java index 7c777112871a..7ec8716c86a4 100644 --- a/common/src/main/java/org/apache/iceberg/common/DynConstructors.java +++ b/common/src/main/java/org/apache/iceberg/common/DynConstructors.java @@ -43,12 +43,6 @@ private Ctor(Constructor constructor, Class constructed) { this.constructed = constructed; } - /** @deprecated since 1.6.0, will be removed in 1.7.0 */ - @Deprecated - public Class getConstructedClass() { - return constructed; - } - public C newInstanceChecked(Object... args) throws Exception { try { if (args.length > ctor.getParameterCount()) { @@ -82,6 +76,8 @@ public R invoke(Object target, Object... args) { return (R) newInstance(args); } + /** @deprecated since 1.7.0, visibility will be reduced in 1.8.0 */ + @Deprecated // will become package-private @Override @SuppressWarnings("unchecked") public R invokeChecked(Object target, Object... args) throws Exception { @@ -172,16 +168,6 @@ public Builder impl(Class targetClass, Class... types) { return this; } - /** - * @deprecated since 1.6.0, will be removed in 1.7.0; This varargs method conflicts with {@link - * #hiddenImpl(Class, Class...)}. Use {@link #builder(Class)} instead. - */ - @Deprecated - public Builder hiddenImpl(Class... types) { - hiddenImpl(baseClass, types); - return this; - } - public Builder hiddenImpl(String className, Class... types) { // don't do any work if an implementation has been found if (ctor != null) { diff --git a/common/src/main/java/org/apache/iceberg/common/DynFields.java b/common/src/main/java/org/apache/iceberg/common/DynFields.java index e88affa0cbdd..cc397d329e94 100644 --- a/common/src/main/java/org/apache/iceberg/common/DynFields.java +++ b/common/src/main/java/org/apache/iceberg/common/DynFields.java @@ -386,21 +386,6 @@ public BoundField build(Object target) { return this.build().bind(target); } - /** - * Returns the first valid implementation as a StaticField or throws a NoSuchFieldException if - * there is none. - * - * @param Java class stored in the field - * @return a {@link StaticField} with a valid implementation - * @throws IllegalStateException if the method is not static - * @throws NoSuchFieldException if no implementation was found - * @deprecated since 1.6.0, will be removed in 1.7.0 - */ - @Deprecated - public StaticField buildStaticChecked() throws NoSuchFieldException { - return this.buildChecked().asStatic(); - } - /** * Returns the first valid implementation as a StaticField or throws a RuntimeException if there * is none. diff --git a/common/src/main/java/org/apache/iceberg/common/DynMethods.java b/common/src/main/java/org/apache/iceberg/common/DynMethods.java index fc0e578c7dd1..65a69bd0e12c 100644 --- a/common/src/main/java/org/apache/iceberg/common/DynMethods.java +++ b/common/src/main/java/org/apache/iceberg/common/DynMethods.java @@ -51,10 +51,8 @@ public static class UnboundMethod { (method == null || method.isVarArgs()) ? 
-1 : method.getParameterTypes().length; } - /** @deprecated since 1.6.0, will be removed in 1.7.0 */ - @Deprecated // will become private @SuppressWarnings("unchecked") - public R invokeChecked(Object target, Object... args) throws Exception { + R invokeChecked(Object target, Object... args) throws Exception { try { if (argLength < 0) { return (R) method.invoke(target, args); @@ -127,6 +125,8 @@ public String toString() { /** Singleton {@link UnboundMethod}, performs no operation and returns null. */ private static final UnboundMethod NOOP = new UnboundMethod(null, "NOOP") { + /** @deprecated since 1.7.0, visibility will be reduced in 1.8.0 */ + @Deprecated // will become package-private @Override public R invokeChecked(Object target, Object... args) { return null; @@ -315,38 +315,6 @@ public Builder impl(Class targetClass, Class... argClasses) { return this; } - /** @deprecated since 1.6.0, will be removed in 1.7.0 */ - @Deprecated - public Builder ctorImpl(Class targetClass, Class... argClasses) { - // don't do any work if an implementation has been found - if (method != null) { - return this; - } - - try { - this.method = new DynConstructors.Builder().impl(targetClass, argClasses).buildChecked(); - } catch (NoSuchMethodException e) { - // not the right implementation - } - return this; - } - - /** @deprecated since 1.6.0, will be removed in 1.7.0 */ - @Deprecated - public Builder ctorImpl(String className, Class... argClasses) { - // don't do any work if an implementation has been found - if (method != null) { - return this; - } - - try { - this.method = new DynConstructors.Builder().impl(className, argClasses).buildChecked(); - } catch (NoSuchMethodException e) { - // not the right implementation - } - return this; - } - /** * Checks for an implementation, first finding the given class by name. * diff --git a/core/src/main/java/org/apache/iceberg/BaseMetastoreTableOperations.java b/core/src/main/java/org/apache/iceberg/BaseMetastoreTableOperations.java index 90d435811020..5c82bc877a15 100644 --- a/core/src/main/java/org/apache/iceberg/BaseMetastoreTableOperations.java +++ b/core/src/main/java/org/apache/iceberg/BaseMetastoreTableOperations.java @@ -284,17 +284,6 @@ public long newSnapshotId() { }; } - /** - * @deprecated since 1.6.0, will be removed in 1.7.0; Use {@link - * BaseMetastoreOperations.CommitStatus} instead - */ - @Deprecated - protected enum CommitStatus { - FAILURE, - SUCCESS, - UNKNOWN - } - /** * Attempt to load the table and see if any current or past metadata location matches the one we * were attempting to set. 
This is used as a last resort when we are dealing with exceptions that diff --git a/core/src/main/java/org/apache/iceberg/FileScanTaskParser.java b/core/src/main/java/org/apache/iceberg/FileScanTaskParser.java index a6ea41319f4e..7ae7dc74a72e 100644 --- a/core/src/main/java/org/apache/iceberg/FileScanTaskParser.java +++ b/core/src/main/java/org/apache/iceberg/FileScanTaskParser.java @@ -40,35 +40,6 @@ public class FileScanTaskParser { private FileScanTaskParser() {} - /** - * Serialize file scan task to JSON string - * - * @deprecated will be removed in 1.7.0; use {@link ScanTaskParser#toJson(FileScanTask)} instead - */ - @Deprecated - public static String toJson(FileScanTask fileScanTask) { - Preconditions.checkArgument(fileScanTask != null, "Invalid file scan task: null"); - return JsonUtil.generate( - generator -> { - generator.writeStartObject(); - toJson(fileScanTask, generator); - generator.writeEndObject(); - }, - false); - } - - /** - * Deserialize file scan task from JSON string - * - * @deprecated will be removed in 1.7.0; use {@link ScanTaskParser#fromJson(String, boolean)} - * instead - */ - @Deprecated - public static FileScanTask fromJson(String json, boolean caseSensitive) { - Preconditions.checkArgument(json != null, "Invalid JSON string for file scan task: null"); - return JsonUtil.parse(json, node -> fromJson(node, caseSensitive)); - } - static void toJson(FileScanTask fileScanTask, JsonGenerator generator) throws IOException { Preconditions.checkArgument(fileScanTask != null, "Invalid file scan task: null"); Preconditions.checkArgument(generator != null, "Invalid JSON generator: null"); diff --git a/core/src/main/java/org/apache/iceberg/SnapshotProducer.java b/core/src/main/java/org/apache/iceberg/SnapshotProducer.java index 0a040fe34471..74997cc89849 100644 --- a/core/src/main/java/org/apache/iceberg/SnapshotProducer.java +++ b/core/src/main/java/org/apache/iceberg/SnapshotProducer.java @@ -500,18 +500,6 @@ protected OutputFile manifestListPath() { "snap-%d-%d-%s", snapshotId(), attempt.incrementAndGet(), commitUUID)))); } - /** - * @deprecated will be removed in 1.7.0; Use {@link SnapshotProducer#newManifestOutputFile} - * instead - */ - @Deprecated - protected OutputFile newManifestOutput() { - return ops.io() - .newOutputFile( - ops.metadataFileLocation( - FileFormat.AVRO.addExtension(commitUUID + "-m" + manifestCount.getAndIncrement()))); - } - protected EncryptedOutputFile newManifestOutputFile() { String manifestFileLocation = ops.metadataFileLocation( diff --git a/core/src/main/java/org/apache/iceberg/io/ContentCache.java b/core/src/main/java/org/apache/iceberg/io/ContentCache.java index 7942c69d5d77..ce37cfb08934 100644 --- a/core/src/main/java/org/apache/iceberg/io/ContentCache.java +++ b/core/src/main/java/org/apache/iceberg/io/ContentCache.java @@ -28,7 +28,6 @@ import java.nio.ByteBuffer; import java.time.Duration; import java.util.List; -import java.util.function.Function; import org.apache.iceberg.exceptions.NotFoundException; import org.apache.iceberg.exceptions.ValidationException; import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; @@ -111,24 +110,6 @@ public CacheStats stats() { return cache.stats(); } - /** @deprecated will be removed in 1.7; use {@link #tryCache(InputFile)} instead */ - @Deprecated - public CacheEntry get(String key, Function mappingFunction) { - return cache.get(key, mappingFunction); - } - - /** @deprecated will be removed in 1.7; use {@link #tryCache(InputFile)} instead */ - @Deprecated - public CacheEntry 
getIfPresent(String location) { - return cache.getIfPresent(location); - } - - /** @deprecated will be removed in 1.7; use {@link #tryCache(InputFile)} instead */ - @Deprecated - public InputFile tryCache(FileIO io, String location, long length) { - return tryCache(io.newInputFile(location, length)); - } - /** * Try cache the file-content of file in the given location upon stream reading. * @@ -173,11 +154,7 @@ public String toString() { .toString(); } - /** @deprecated will be removed in 1.7; use {@link FileContent} instead. */ - @Deprecated - private static class CacheEntry {} - - private static class FileContent extends CacheEntry { + private static class FileContent { private final long length; private final List buffers; diff --git a/core/src/main/java/org/apache/iceberg/rest/auth/OAuth2Util.java b/core/src/main/java/org/apache/iceberg/rest/auth/OAuth2Util.java index 189e5fde2cad..52c89af9d474 100644 --- a/core/src/main/java/org/apache/iceberg/rest/auth/OAuth2Util.java +++ b/core/src/main/java/org/apache/iceberg/rest/auth/OAuth2Util.java @@ -465,26 +465,6 @@ public AuthSession(Map baseHeaders, AuthConfig config) { this.config = config; } - /** @deprecated since 1.6.0, will be removed in 1.7.0 */ - @Deprecated - public AuthSession( - Map baseHeaders, - String token, - String tokenType, - String credential, - String scope, - String oauth2ServerUri) { - this( - baseHeaders, - AuthConfig.builder() - .token(token) - .tokenType(tokenType) - .credential(credential) - .scope(scope) - .oauth2ServerUri(oauth2ServerUri) - .build()); - } - public Map headers() { return headers; } diff --git a/core/src/test/java/org/apache/iceberg/TestFileScanTaskParser.java b/core/src/test/java/org/apache/iceberg/TestFileScanTaskParser.java index 137e7897385b..c4a9fdf2340a 100644 --- a/core/src/test/java/org/apache/iceberg/TestFileScanTaskParser.java +++ b/core/src/test/java/org/apache/iceberg/TestFileScanTaskParser.java @@ -31,14 +31,6 @@ public class TestFileScanTaskParser { @Test public void testNullArguments() { - assertThatThrownBy(() -> FileScanTaskParser.toJson(null)) - .isInstanceOf(IllegalArgumentException.class) - .hasMessage("Invalid file scan task: null"); - - assertThatThrownBy(() -> FileScanTaskParser.fromJson((String) null, true)) - .isInstanceOf(IllegalArgumentException.class) - .hasMessage("Invalid JSON string for file scan task: null"); - assertThatThrownBy(() -> ScanTaskParser.toJson(null)) .isInstanceOf(IllegalArgumentException.class) .hasMessage("Invalid scan task: null"); @@ -48,28 +40,6 @@ public void testNullArguments() { .hasMessage("Invalid JSON string for scan task: null"); } - @ParameterizedTest - @ValueSource(booleans = {true, false}) - public void testFileScanTaskParser(boolean caseSensitive) { - PartitionSpec spec = TestBase.SPEC; - FileScanTask fileScanTask = createFileScanTask(spec, caseSensitive); - String jsonStr = FileScanTaskParser.toJson(fileScanTask); - assertThat(jsonStr).isEqualTo(fileScanTaskJsonWithoutTaskType()); - FileScanTask deserializedTask = FileScanTaskParser.fromJson(jsonStr, caseSensitive); - assertFileScanTaskEquals(fileScanTask, deserializedTask, spec, caseSensitive); - } - - /** Test backward compatibility where task-type field is absent from the JSON string */ - @ParameterizedTest - @ValueSource(booleans = {true, false}) - public void testFileScanTaskParserWithoutTaskTypeField(boolean caseSensitive) { - PartitionSpec spec = TestBase.SPEC; - FileScanTask fileScanTask = createFileScanTask(spec, caseSensitive); - FileScanTask deserializedTask = - 
FileScanTaskParser.fromJson(fileScanTaskJsonWithoutTaskType(), caseSensitive); - assertFileScanTaskEquals(fileScanTask, deserializedTask, spec, caseSensitive); - } - @ParameterizedTest @ValueSource(booleans = {true, false}) public void testScanTaskParser(boolean caseSensitive) { diff --git a/hive-metastore/src/main/java/org/apache/iceberg/hive/HiveOperationsBase.java b/hive-metastore/src/main/java/org/apache/iceberg/hive/HiveOperationsBase.java index 055a14246e77..6500e724a4f0 100644 --- a/hive-metastore/src/main/java/org/apache/iceberg/hive/HiveOperationsBase.java +++ b/hive-metastore/src/main/java/org/apache/iceberg/hive/HiveOperationsBase.java @@ -31,7 +31,6 @@ import org.apache.iceberg.ClientPool; import org.apache.iceberg.Schema; import org.apache.iceberg.SchemaParser; -import org.apache.iceberg.TableMetadata; import org.apache.iceberg.TableProperties; import org.apache.iceberg.exceptions.NoSuchIcebergTableException; import org.apache.iceberg.io.FileIO; @@ -150,15 +149,6 @@ default void persistTable(Table hmsTable, boolean updateHiveTable, String metada } } - /** - * @deprecated since 1.6.0, will be removed in 1.7.0; Use {@link #storageDescriptor(Schema, - * String, boolean)} instead - */ - @Deprecated - static StorageDescriptor storageDescriptor(TableMetadata metadata, boolean hiveEngineEnabled) { - return storageDescriptor(metadata.schema(), metadata.location(), hiveEngineEnabled); - } - static StorageDescriptor storageDescriptor( Schema schema, String location, boolean hiveEngineEnabled) { final StorageDescriptor storageDescriptor = new StorageDescriptor(); From 79620e198009fa243c278c66fd442d107b46206a Mon Sep 17 00:00:00 2001 From: Naveen Kumar Date: Fri, 9 Aug 2024 20:21:00 +0530 Subject: [PATCH 52/55] Core, Flink: Fix build warnings (#10899) --- core/src/main/java/org/apache/iceberg/BaseEntriesTable.java | 2 +- .../org/apache/iceberg/actions/SizeBasedFileRewriter.java | 3 ++- .../iceberg/rest/ExponentialHttpRequestRetryStrategy.java | 2 +- core/src/main/java/org/apache/iceberg/util/Pair.java | 5 ----- .../main/java/org/apache/iceberg/util/ParallelIterable.java | 1 + core/src/main/java/org/apache/iceberg/util/Tasks.java | 4 +++- .../java/org/apache/iceberg/view/BaseViewOperations.java | 1 + .../flink/sink/shuffle/MapRangePartitionerBenchmark.java | 3 ++- 8 files changed, 11 insertions(+), 10 deletions(-) diff --git a/core/src/main/java/org/apache/iceberg/BaseEntriesTable.java b/core/src/main/java/org/apache/iceberg/BaseEntriesTable.java index f4019d688cb8..526bb42ea687 100644 --- a/core/src/main/java/org/apache/iceberg/BaseEntriesTable.java +++ b/core/src/main/java/org/apache/iceberg/BaseEntriesTable.java @@ -262,7 +262,7 @@ private boolean fileContent(BoundReference ref) { return ref.fieldId() == DataFile.CONTENT.fieldId(); } - private boolean contentMatch(Integer fileContentId) { + private boolean contentMatch(Integer fileContentId) { if (FileContent.DATA.id() == fileContentId) { return ManifestContent.DATA.id() == manifestContentId; } else if (FileContent.EQUALITY_DELETES.id() == fileContentId diff --git a/core/src/main/java/org/apache/iceberg/actions/SizeBasedFileRewriter.java b/core/src/main/java/org/apache/iceberg/actions/SizeBasedFileRewriter.java index fb3c27220cb2..cea7003c1a38 100644 --- a/core/src/main/java/org/apache/iceberg/actions/SizeBasedFileRewriter.java +++ b/core/src/main/java/org/apache/iceberg/actions/SizeBasedFileRewriter.java @@ -229,7 +229,8 @@ protected long numOutputFiles(long inputSize) { // the remainder file is of a valid size for this rewrite so keep 
it return numFilesWithRemainder; - } else if (avgFileSizeWithoutRemainder < Math.min(1.1 * targetFileSize, writeMaxFileSize())) { + } else if (avgFileSizeWithoutRemainder + < Math.min(1.1 * targetFileSize, (double) writeMaxFileSize())) { // if the reminder is distributed amongst other files, // the average file size will be no more than 10% bigger than the target file size // so round down and distribute remainder amongst other files diff --git a/core/src/main/java/org/apache/iceberg/rest/ExponentialHttpRequestRetryStrategy.java b/core/src/main/java/org/apache/iceberg/rest/ExponentialHttpRequestRetryStrategy.java index aadb97bc7112..263b3c305af0 100644 --- a/core/src/main/java/org/apache/iceberg/rest/ExponentialHttpRequestRetryStrategy.java +++ b/core/src/main/java/org/apache/iceberg/rest/ExponentialHttpRequestRetryStrategy.java @@ -149,7 +149,7 @@ public TimeValue getRetryInterval(HttpResponse response, int execCount, HttpCont } } - int delayMillis = 1000 * (int) Math.min(Math.pow(2.0, (long) execCount - 1), 64.0); + int delayMillis = 1000 * (int) Math.min(Math.pow(2.0, (long) execCount - 1.0), 64.0); int jitter = ThreadLocalRandom.current().nextInt(Math.max(1, (int) (delayMillis * 0.1))); return TimeValue.ofMilliseconds(delayMillis + jitter); diff --git a/core/src/main/java/org/apache/iceberg/util/Pair.java b/core/src/main/java/org/apache/iceberg/util/Pair.java index bd3a934f6f04..e36321c8e2c9 100644 --- a/core/src/main/java/org/apache/iceberg/util/Pair.java +++ b/core/src/main/java/org/apache/iceberg/util/Pair.java @@ -58,11 +58,6 @@ public Schema load(Pair, Class> key) { private X first; private Y second; - /** Constructor used by Avro */ - private Pair(Schema schema) { - this.schema = schema; - } - private Pair(X first, Y second) { this.first = first; this.second = second; diff --git a/core/src/main/java/org/apache/iceberg/util/ParallelIterable.java b/core/src/main/java/org/apache/iceberg/util/ParallelIterable.java index 16fa6f3d8537..27cd96a39733 100644 --- a/core/src/main/java/org/apache/iceberg/util/ParallelIterable.java +++ b/core/src/main/java/org/apache/iceberg/util/ParallelIterable.java @@ -101,6 +101,7 @@ private ParallelIterator( } @Override + @SuppressWarnings("FutureReturnValueIgnored") public void close() { // close first, avoid new task submit this.closed.set(true); diff --git a/core/src/main/java/org/apache/iceberg/util/Tasks.java b/core/src/main/java/org/apache/iceberg/util/Tasks.java index 02d2b834311f..14804e040755 100644 --- a/core/src/main/java/org/apache/iceberg/util/Tasks.java +++ b/core/src/main/java/org/apache/iceberg/util/Tasks.java @@ -450,7 +450,9 @@ private void runTaskWithRetry(Task task, I item) thr } int delayMs = - (int) Math.min(minSleepTimeMs * Math.pow(scaleFactor, attempt - 1), maxSleepTimeMs); + (int) + Math.min( + minSleepTimeMs * Math.pow(scaleFactor, attempt - 1), (double) maxSleepTimeMs); int jitter = ThreadLocalRandom.current().nextInt(Math.max(1, (int) (delayMs * 0.1))); LOG.warn("Retrying task after failure: {}", e.getMessage(), e); diff --git a/core/src/main/java/org/apache/iceberg/view/BaseViewOperations.java b/core/src/main/java/org/apache/iceberg/view/BaseViewOperations.java index 18b452f98367..df96b90eb728 100644 --- a/core/src/main/java/org/apache/iceberg/view/BaseViewOperations.java +++ b/core/src/main/java/org/apache/iceberg/view/BaseViewOperations.java @@ -102,6 +102,7 @@ public ViewMetadata refresh() { } @Override + @SuppressWarnings("ImmutablesReferenceEquality") public void commit(ViewMetadata base, ViewMetadata metadata) { // if 
the metadata is already out of date, reject it if (base != current()) { diff --git a/flink/v1.20/flink/src/jmh/java/org/apache/iceberg/flink/sink/shuffle/MapRangePartitionerBenchmark.java b/flink/v1.20/flink/src/jmh/java/org/apache/iceberg/flink/sink/shuffle/MapRangePartitionerBenchmark.java index 007b423e592a..592e7ff16241 100644 --- a/flink/v1.20/flink/src/jmh/java/org/apache/iceberg/flink/sink/shuffle/MapRangePartitionerBenchmark.java +++ b/flink/v1.20/flink/src/jmh/java/org/apache/iceberg/flink/sink/shuffle/MapRangePartitionerBenchmark.java @@ -18,6 +18,7 @@ */ package org.apache.iceberg.flink.sink.shuffle; +import java.nio.charset.StandardCharsets; import java.util.Comparator; import java.util.List; import java.util.Map; @@ -139,7 +140,7 @@ private static String randomString(String prefix) { buffer[i] = (byte) CHARS.charAt(ThreadLocalRandom.current().nextInt(CHARS.length())); } - return prefix + new String(buffer); + return prefix + new String(buffer, StandardCharsets.UTF_8); } /** find the index where weightsUDF[index] < weight && weightsUDF[index+1] >= weight */ From ae08334cad1f1a9eebb9cdcf48ce5084da9bc44d Mon Sep 17 00:00:00 2001 From: Manu Zhang Date: Mon, 12 Aug 2024 14:12:20 +0800 Subject: [PATCH 53/55] Build: Bump Spark 3.5 to 3.5.2 (#10918) --- gradle/libs.versions.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index b2946163f38b..1cc38d44ac9d 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -81,7 +81,7 @@ slf4j = "1.7.36" snowflake-jdbc = "3.18.0" spark-hive33 = "3.3.4" spark-hive34 = "3.4.3" -spark-hive35 = "3.5.1" +spark-hive35 = "3.5.2" spring-boot = "2.7.18" spring-web = "5.3.37" sqlite-jdbc = "3.46.0.1" From 6ae2956f317abfbe25e4b6be1d6d6581f4087158 Mon Sep 17 00:00:00 2001 From: nk1506 Date: Tue, 16 Apr 2024 18:19:50 +0530 Subject: [PATCH 54/55] Add Manifest Stats in snapshot summary. 
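
As a minimal usage sketch (assuming an existing org.apache.iceberg.Table handle
named `table`), the manifest counts recorded by this change can be read back
from the current snapshot's summary map via the new SnapshotSummary constants:

    // Summary values are stored as strings keyed by the constants added below.
    Map<String, String> summary = table.currentSnapshot().summary();
    String dataManifests = summary.get(SnapshotSummary.TOTAL_DATA_MANIFEST_FILES);     // "total-data-manifest-files"
    String deleteManifests = summary.get(SnapshotSummary.TOTAL_DELETE_MANIFEST_FILES); // "total-delete-manifest-files"

The delete-manifest entry is only written once the snapshot actually carries
delete manifests, so it reads as null for append-only snapshots, which is what
the tests below assert.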
--- .../apache/iceberg/BaseRewriteManifests.java | 1 + .../java/org/apache/iceberg/FastAppend.java | 2 + .../iceberg/MergingSnapshotProducer.java | 1 + .../org/apache/iceberg/SnapshotSummary.java | 28 ++++++ .../iceberg/metrics/CommitMetricsResult.java | 12 +++ .../apache/iceberg/TestCommitReporting.java | 11 +++ .../apache/iceberg/TestRewriteManifests.java | 34 ++++++- .../java/org/apache/iceberg/TestSnapshot.java | 40 ++++++++ .../apache/iceberg/TestSnapshotSummary.java | 23 +++++ .../actions/TestRewriteDataFilesAction.java | 23 ++++- .../iceberg/spark/sql/TestSnapshotsTable.java | 94 +++++++++++++++++++ 11 files changed, 265 insertions(+), 4 deletions(-) create mode 100644 spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/sql/TestSnapshotsTable.java diff --git a/core/src/main/java/org/apache/iceberg/BaseRewriteManifests.java b/core/src/main/java/org/apache/iceberg/BaseRewriteManifests.java index dce6d4a995bd..985fb7944087 100644 --- a/core/src/main/java/org/apache/iceberg/BaseRewriteManifests.java +++ b/core/src/main/java/org/apache/iceberg/BaseRewriteManifests.java @@ -190,6 +190,7 @@ public List apply(TableMetadata base, Snapshot snapshot) { List apply = Lists.newArrayList(); Iterables.addAll(apply, newManifestsWithMetadata); apply.addAll(keptManifests); + apply.forEach(summaryBuilder::addedManifestStats); return apply; } diff --git a/core/src/main/java/org/apache/iceberg/FastAppend.java b/core/src/main/java/org/apache/iceberg/FastAppend.java index 4976a8081c44..1c574d2f07b7 100644 --- a/core/src/main/java/org/apache/iceberg/FastAppend.java +++ b/core/src/main/java/org/apache/iceberg/FastAppend.java @@ -162,6 +162,8 @@ public List apply(TableMetadata base, Snapshot snapshot) { manifests.addAll(snapshot.allManifests(ops.io())); } + manifests.forEach(summaryBuilder::addedManifestStats); + return manifests; } diff --git a/core/src/main/java/org/apache/iceberg/MergingSnapshotProducer.java b/core/src/main/java/org/apache/iceberg/MergingSnapshotProducer.java index b4c0567ab73a..cedc62294c1d 100644 --- a/core/src/main/java/org/apache/iceberg/MergingSnapshotProducer.java +++ b/core/src/main/java/org/apache/iceberg/MergingSnapshotProducer.java @@ -869,6 +869,7 @@ public List apply(TableMetadata base, Snapshot snapshot) { List manifests = Lists.newArrayList(); Iterables.addAll(manifests, mergeManager.mergeManifests(unmergedManifests)); Iterables.addAll(manifests, deleteMergeManager.mergeManifests(unmergedDeleteManifests)); + manifests.forEach(summaryBuilder::addedManifestStats); return manifests; } diff --git a/core/src/main/java/org/apache/iceberg/SnapshotSummary.java b/core/src/main/java/org/apache/iceberg/SnapshotSummary.java index 22c9df2a8eaf..7ca4a6ff14fa 100644 --- a/core/src/main/java/org/apache/iceberg/SnapshotSummary.java +++ b/core/src/main/java/org/apache/iceberg/SnapshotSummary.java @@ -58,6 +58,8 @@ public class SnapshotSummary { public static final String SOURCE_SNAPSHOT_ID_PROP = "source-snapshot-id"; public static final String REPLACE_PARTITIONS_PROP = "replace-partitions"; public static final String EXTRA_METADATA_PREFIX = "snapshot-property."; + public static final String TOTAL_DATA_MANIFEST_FILES = "total-data-manifest-files"; + public static final String TOTAL_DELETE_MANIFEST_FILES = "total-delete-manifest-files"; public static final MapJoiner MAP_JOINER = Joiner.on(",").withKeyValueSeparator("="); @@ -144,6 +146,10 @@ public void addedManifest(ManifestFile manifest) { metrics.addedManifest(manifest); } + public void addedManifestStats(ManifestFile manifest) { + 
metrics.addedManifestStats(manifest); + } + public void set(String property, String value) { properties.put(property, value); } @@ -229,6 +235,8 @@ private static class UpdateMetrics { private long removedPosDeletes = 0L; private long addedEqDeletes = 0L; private long removedEqDeletes = 0L; + private long totalDataManifestFiles = 0L; + private long totalDeleteManifestFiles = 0L; private boolean trustSizeAndDeleteCounts = true; void clear() { @@ -248,6 +256,8 @@ void clear() { this.removedPosDeletes = 0L; this.addedEqDeletes = 0L; this.removedEqDeletes = 0L; + this.totalDataManifestFiles = 0L; + this.totalDeleteManifestFiles = 0L; this.trustSizeAndDeleteCounts = true; } @@ -263,6 +273,12 @@ void addTo(ImmutableMap.Builder builder) { setIf(removedDeleteFiles > 0, builder, REMOVED_DELETE_FILES_PROP, removedDeleteFiles); setIf(addedRecords > 0, builder, ADDED_RECORDS_PROP, addedRecords); setIf(deletedRecords > 0, builder, DELETED_RECORDS_PROP, deletedRecords); + setIf(totalDataManifestFiles > 0, builder, TOTAL_DATA_MANIFEST_FILES, totalDataManifestFiles); + setIf( + totalDeleteManifestFiles > 0, + builder, + TOTAL_DELETE_MANIFEST_FILES, + totalDeleteManifestFiles); if (trustSizeAndDeleteCounts) { setIf(addedSize > 0, builder, ADDED_FILE_SIZE_PROP, addedSize); @@ -336,6 +352,16 @@ void addedManifest(ManifestFile manifest) { } } + void addedManifestStats(ManifestFile manifest) { + switch (manifest.content()) { + case DATA: + this.totalDataManifestFiles++; + break; + case DELETES: + this.totalDeleteManifestFiles++; + } + } + void merge(UpdateMetrics other) { this.addedFiles += other.addedFiles; this.removedFiles += other.removedFiles; @@ -353,6 +379,8 @@ void merge(UpdateMetrics other) { this.removedPosDeletes += other.removedPosDeletes; this.addedEqDeletes += other.addedEqDeletes; this.removedEqDeletes += other.removedEqDeletes; + this.totalDataManifestFiles += other.totalDataManifestFiles; + this.totalDeleteManifestFiles += other.totalDeleteManifestFiles; this.trustSizeAndDeleteCounts = trustSizeAndDeleteCounts && other.trustSizeAndDeleteCounts; } } diff --git a/core/src/main/java/org/apache/iceberg/metrics/CommitMetricsResult.java b/core/src/main/java/org/apache/iceberg/metrics/CommitMetricsResult.java index ad66e8d32408..e5e4972603dc 100644 --- a/core/src/main/java/org/apache/iceberg/metrics/CommitMetricsResult.java +++ b/core/src/main/java/org/apache/iceberg/metrics/CommitMetricsResult.java @@ -50,6 +50,8 @@ public interface CommitMetricsResult { String ADDED_EQ_DELETES = "added-equality-deletes"; String REMOVED_EQ_DELETES = "removed-equality-deletes"; String TOTAL_EQ_DELETES = "total-equality-deletes"; + String TOTAL_DATA_MANIFEST_FILES = "total-data-manifest-files"; + String TOTAL_DELETE_MANIFEST_FILES = "total-delete-manifest-files"; @Nullable TimerResult totalDuration(); @@ -123,6 +125,12 @@ public interface CommitMetricsResult { @Nullable CounterResult totalEqualityDeletes(); + @Nullable + CounterResult totalDataManifestFiles(); + + @Nullable + CounterResult totalDeleteManifestFiles(); + static CommitMetricsResult from( CommitMetrics commitMetrics, Map snapshotSummary) { Preconditions.checkArgument(null != commitMetrics, "Invalid commit metrics: null"); @@ -163,6 +171,10 @@ static CommitMetricsResult from( .removedEqualityDeletes( counterFrom(snapshotSummary, SnapshotSummary.REMOVED_EQ_DELETES_PROP)) .totalEqualityDeletes(counterFrom(snapshotSummary, SnapshotSummary.TOTAL_EQ_DELETES_PROP)) + .totalDataManifestFiles( + counterFrom(snapshotSummary, 
SnapshotSummary.TOTAL_DATA_MANIFEST_FILES)) + .totalDeleteManifestFiles( + counterFrom(snapshotSummary, SnapshotSummary.TOTAL_DELETE_MANIFEST_FILES)) .build(); } diff --git a/core/src/test/java/org/apache/iceberg/TestCommitReporting.java b/core/src/test/java/org/apache/iceberg/TestCommitReporting.java index 41b301668722..ecfaac618e2c 100644 --- a/core/src/test/java/org/apache/iceberg/TestCommitReporting.java +++ b/core/src/test/java/org/apache/iceberg/TestCommitReporting.java @@ -83,6 +83,9 @@ public void addAndDeleteDataFiles() { assertThat(metrics.removedFilesSizeInBytes().value()).isEqualTo(20L); assertThat(metrics.totalFilesSizeInBytes().value()).isEqualTo(0L); + + assertThat(metrics.totalDataManifestFiles().value()).isEqualTo(1L); + assertThat(metrics.totalDeleteManifestFiles()).isNull(); } @TestTemplate @@ -122,6 +125,9 @@ public void addAndDeleteDeleteFiles() { assertThat(metrics.addedFilesSizeInBytes().value()).isEqualTo(30L); assertThat(metrics.totalFilesSizeInBytes().value()).isEqualTo(30L); + assertThat(metrics.totalDataManifestFiles()).isNull(); + assertThat(metrics.totalDeleteManifestFiles().value()).isEqualTo(1L); + // now remove those 2 positional + 1 equality delete files table .newRewrite() @@ -153,6 +159,9 @@ public void addAndDeleteDeleteFiles() { assertThat(metrics.removedFilesSizeInBytes().value()).isEqualTo(30L); assertThat(metrics.totalFilesSizeInBytes().value()).isEqualTo(0L); + + assertThat(metrics.totalDataManifestFiles()).isNull(); + assertThat(metrics.totalDeleteManifestFiles().value()).isEqualTo(1L); } @TestTemplate @@ -191,5 +200,7 @@ public void addAndDeleteManifests() throws IOException { assertThat(metrics.addedDataFiles().value()).isEqualTo(1L); assertThat(metrics.addedRecords().value()).isEqualTo(1L); assertThat(metrics.addedFilesSizeInBytes().value()).isEqualTo(10L); + assertThat(metrics.totalDataManifestFiles().value()).isEqualTo(2L); + assertThat(metrics.totalDeleteManifestFiles()).isNull(); } } diff --git a/core/src/test/java/org/apache/iceberg/TestRewriteManifests.java b/core/src/test/java/org/apache/iceberg/TestRewriteManifests.java index f1d23de32a42..b7233659c509 100644 --- a/core/src/test/java/org/apache/iceberg/TestRewriteManifests.java +++ b/core/src/test/java/org/apache/iceberg/TestRewriteManifests.java @@ -20,6 +20,7 @@ import static org.apache.iceberg.TableProperties.MANIFEST_MERGE_ENABLED; import static org.apache.iceberg.TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED; +import static org.apache.iceberg.TestSnapshot.testManifestStats; import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatThrownBy; import static org.assertj.core.api.Assumptions.assumeThat; @@ -62,6 +63,8 @@ public void testRewriteManifestsAppendedDirectly() throws IOException { "manifest-file-1.avro", manifestEntry(ManifestEntry.Status.ADDED, null, FILE_A)); table.newFastAppend().appendManifest(newManifest).commit(); + + testManifestStats(table); long appendId = table.currentSnapshot().snapshotId(); assertThat(table.currentSnapshot().allManifests(table.io())).hasSize(1); @@ -71,6 +74,7 @@ public void testRewriteManifestsAppendedDirectly() throws IOException { List manifests = table.currentSnapshot().allManifests(table.io()); assertThat(manifests).hasSize(1); + testManifestStats(table); validateManifestEntries( manifests.get(0), ids(appendId), files(FILE_A), statuses(ManifestEntry.Status.EXISTING)); } @@ -87,6 +91,8 @@ public void testRewriteManifestsWithScanExecutor() throws IOException { 
table.newFastAppend().appendManifest(newManifest).commit(); + testManifestStats(table); + assertThat(table.currentSnapshot().allManifests(table.io())).hasSize(1); AtomicInteger scanThreadsIndex = new AtomicInteger(0); table @@ -106,6 +112,7 @@ public void testRewriteManifestsWithScanExecutor() throws IOException { List manifests = table.currentSnapshot().allManifests(table.io()); assertThat(manifests).hasSize(1); + testManifestStats(table); assertThat(scanThreadsIndex.get()) .as("Thread should be created in provided pool") .isGreaterThan(0); @@ -129,11 +136,13 @@ public void testRewriteManifestsGeneratedAndAppendedDirectly() throws IOExceptio assertThat(table.currentSnapshot().allManifests(table.io())).hasSize(2); + testManifestStats(table); table.rewriteManifests().clusterBy(file -> "").commit(); List manifests = table.currentSnapshot().allManifests(table.io()); assertThat(manifests).hasSize(1); + testManifestStats(table); // get the correct file order List files; List ids; @@ -162,12 +171,14 @@ public void testReplaceManifestsSeparate() { assertThat(table.currentSnapshot().allManifests(table.io())).hasSize(1); - // cluster by path will split the manifest into two + testManifestStats(table); + // cluster by path will split the manifest into two table.rewriteManifests().clusterBy(file -> file.path()).commit(); List manifests = table.currentSnapshot().allManifests(table.io()); assertThat(manifests).hasSize(2); + testManifestStats(table); manifests.sort(Comparator.comparing(ManifestFile::path)); validateManifestEntries( @@ -185,6 +196,7 @@ public void testReplaceManifestsConsolidate() throws IOException { table.newFastAppend().appendFile(FILE_B).commit(); long appendIdB = table.currentSnapshot().snapshotId(); + testManifestStats(table); assertThat(table.currentSnapshot().allManifests(table.io())).hasSize(2); // cluster by constant will combine manifests into one @@ -194,6 +206,7 @@ public void testReplaceManifestsConsolidate() throws IOException { List manifests = table.currentSnapshot().allManifests(table.io()); assertThat(manifests).hasSize(1); + testManifestStats(table); // get the file order correct List files; List ids; @@ -277,6 +290,8 @@ public void testReplaceManifestsMaxSize() { assertThat(table.currentSnapshot().allManifests(table.io())).hasSize(1); + testManifestStats(table); + // cluster by constant will combine manifests into one but small target size will create one per // entry BaseRewriteManifests rewriteManifests = spy((BaseRewriteManifests) table.rewriteManifests()); @@ -319,6 +334,7 @@ public void testConcurrentRewriteManifest() throws IOException { }) .commit(); + testManifestStats(table); assertThat(table.currentSnapshot().allManifests(table.io())).hasSize(2); // commit the rewrite manifests in progress - this should perform a full rewrite as the manifest @@ -362,6 +378,7 @@ public void testAppendDuringRewriteManifest() { table.newFastAppend().appendFile(FILE_B).commit(); long appendIdB = table.currentSnapshot().snapshotId(); + testManifestStats(table); assertThat(table.currentSnapshot().allManifests(table.io())).hasSize(2); // commit the rewrite manifests in progress @@ -395,6 +412,7 @@ public void testRewriteManifestDuringAppend() { assertThat(table.currentSnapshot().allManifests(table.io())).hasSize(1); + testManifestStats(table); // commit the append in progress append.commit(); long appendIdB = table.currentSnapshot().snapshotId(); @@ -422,6 +440,9 @@ public void testBasicManifestReplacement() throws IOException { ManifestFile firstSnapshotManifest = 
firstSnapshotManifests.get(0); table.newFastAppend().appendFile(FILE_C).appendFile(FILE_D).commit(); + + testManifestStats(table); + Snapshot secondSnapshot = table.currentSnapshot(); ManifestFile firstNewManifest = @@ -439,6 +460,8 @@ public void testBasicManifestReplacement() throws IOException { rewriteManifests.addManifest(secondNewManifest); rewriteManifests.commit(); + testManifestStats(table); + Snapshot snapshot = table.currentSnapshot(); List manifests = snapshot.allManifests(table.io()); assertThat(manifests).hasSize(3); @@ -480,12 +503,17 @@ public void testBasicManifestReplacementWithSnapshotIdInheritance() throws IOExc table.newFastAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); + testManifestStats(table); + Snapshot firstSnapshot = table.currentSnapshot(); List firstSnapshotManifests = firstSnapshot.allManifests(table.io()); assertThat(firstSnapshotManifests).hasSize(1); ManifestFile firstSnapshotManifest = firstSnapshotManifests.get(0); table.newFastAppend().appendFile(FILE_C).appendFile(FILE_D).commit(); + + testManifestStats(table); + Snapshot secondSnapshot = table.currentSnapshot(); ManifestFile firstNewManifest = @@ -503,6 +531,8 @@ public void testBasicManifestReplacementWithSnapshotIdInheritance() throws IOExc rewriteManifests.addManifest(secondNewManifest); rewriteManifests.commit(); + testManifestStats(table); + Snapshot snapshot = table.currentSnapshot(); List manifests = snapshot.allManifests(table.io()); assertThat(manifests).hasSize(3); @@ -573,6 +603,8 @@ public void testWithMultiplePartitionSpec() throws IOException { table.newAppend().appendFile(newFileZ).commit(); + testManifestStats(table); + assertThat(table.currentSnapshot().allManifests(table.io())).hasSize(3); RewriteManifests rewriteManifests = table.rewriteManifests(); diff --git a/core/src/test/java/org/apache/iceberg/TestSnapshot.java b/core/src/test/java/org/apache/iceberg/TestSnapshot.java index 8a30036f3242..be02568d089c 100644 --- a/core/src/test/java/org/apache/iceberg/TestSnapshot.java +++ b/core/src/test/java/org/apache/iceberg/TestSnapshot.java @@ -18,6 +18,8 @@ */ package org.apache.iceberg; +import static org.apache.iceberg.SnapshotSummary.TOTAL_DATA_MANIFEST_FILES; +import static org.apache.iceberg.SnapshotSummary.TOTAL_DELETE_MANIFEST_FILES; import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assumptions.assumeThat; @@ -40,11 +42,15 @@ protected static List parameters() { public void testAppendFilesFromTable() { table.newFastAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); + testManifestStats(table); + // collect data files from deserialization Iterable filesToAdd = table.currentSnapshot().addedDataFiles(table.io()); table.newDelete().deleteFile(FILE_A).deleteFile(FILE_B).commit(); + testManifestStats(table); + Snapshot oldSnapshot = table.currentSnapshot(); AppendFiles fastAppend = table.newFastAppend(); @@ -53,6 +59,9 @@ public void testAppendFilesFromTable() { } Snapshot newSnapshot = fastAppend.apply(); + + testManifestStats(table); + validateSnapshot(oldSnapshot, newSnapshot, FILE_A, FILE_B); } @@ -60,6 +69,8 @@ public void testAppendFilesFromTable() { public void testAppendFoundFiles() { table.newFastAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); + testManifestStats(table); + Iterable filesToAdd = FindFiles.in(table) .inPartition(table.spec(), StaticDataTask.Row.of(0)) @@ -68,6 +79,8 @@ public void testAppendFoundFiles() { table.newDelete().deleteFile(FILE_A).deleteFile(FILE_B).commit(); + testManifestStats(table); 
+ Snapshot oldSnapshot = table.currentSnapshot(); AppendFiles fastAppend = table.newFastAppend(); @@ -76,6 +89,8 @@ public void testAppendFoundFiles() { } Snapshot newSnapshot = fastAppend.apply(); + + testManifestStats(table); validateSnapshot(oldSnapshot, newSnapshot, FILE_A, FILE_B); } @@ -83,16 +98,24 @@ public void testAppendFoundFiles() { public void testCachedDataFiles() { table.newFastAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); + testManifestStats(table); + table.updateSpec().addField(Expressions.truncate("data", 2)).commit(); + testManifestStats(table); + DataFile secondSnapshotDataFile = newDataFile("data_bucket=8/data_trunc_2=aa"); table.newFastAppend().appendFile(secondSnapshotDataFile).commit(); + testManifestStats(table); + DataFile thirdSnapshotDataFile = newDataFile("data_bucket=8/data_trunc_2=bb"); table.newOverwrite().deleteFile(FILE_A).addFile(thirdSnapshotDataFile).commit(); + testManifestStats(table); + Snapshot thirdSnapshot = table.currentSnapshot(); Iterable removedDataFiles = thirdSnapshot.removedDataFiles(FILE_IO); @@ -118,8 +141,12 @@ public void testCachedDeleteFiles() { table.newFastAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); + testManifestStats(table); + table.updateSpec().addField(Expressions.truncate("data", 2)).commit(); + testManifestStats(table); + int specId = table.spec().specId(); DataFile secondSnapshotDataFile = newDataFile("data_bucket=8/data_trunc_2=aa"); @@ -131,6 +158,8 @@ public void testCachedDeleteFiles() { .addDeletes(secondSnapshotDeleteFile) .commit(); + testManifestStats(table); + DeleteFile thirdSnapshotDeleteFile = newDeleteFile(specId, "data_bucket=8/data_trunc_2=aa"); ImmutableSet replacedDeleteFiles = ImmutableSet.of(secondSnapshotDeleteFile); @@ -141,6 +170,8 @@ public void testCachedDeleteFiles() { .rewriteFiles(ImmutableSet.of(), replacedDeleteFiles, ImmutableSet.of(), newDeleteFiles) .commit(); + testManifestStats(table); + Snapshot thirdSnapshot = table.currentSnapshot(); Iterable removedDeleteFiles = thirdSnapshot.removedDeleteFiles(FILE_IO); @@ -275,4 +306,13 @@ private void runAddedDeleteFileSequenceNumberTest( .as("File sequence number mismatch") .isEqualTo(expectedSequenceNumber); } + + public static void testManifestStats(Table table) { + assertThat(table.currentSnapshot().summary().get(TOTAL_DATA_MANIFEST_FILES)) + .isEqualTo(String.valueOf(table.currentSnapshot().dataManifests(table.io()).size())); + + int deletedManifestCount = table.currentSnapshot().deleteManifests(table.io()).size(); + assertThat(table.currentSnapshot().summary().get(TOTAL_DELETE_MANIFEST_FILES)) + .isEqualTo(deletedManifestCount == 0 ? 
null : String.valueOf(deletedManifestCount)); + } } diff --git a/core/src/test/java/org/apache/iceberg/TestSnapshotSummary.java b/core/src/test/java/org/apache/iceberg/TestSnapshotSummary.java index 529e0cc614f6..d4db286a82f1 100644 --- a/core/src/test/java/org/apache/iceberg/TestSnapshotSummary.java +++ b/core/src/test/java/org/apache/iceberg/TestSnapshotSummary.java @@ -89,6 +89,29 @@ public void testFileSizeSummaryWithDeletes() { .containsEntry(SnapshotSummary.ADD_POS_DELETE_FILES_PROP, "1"); } + @TestTemplate + public void testManifestStatSummaryWithDeletes() { + if (formatVersion == 1) { + return; + } + + // fast append + table.newFastAppend().appendFile(FILE_A).commit(); + Map summary = table.currentSnapshot().summary(); + + assertThat(summary) + .containsEntry(SnapshotSummary.TOTAL_DATA_MANIFEST_FILES, "1") + .doesNotContainKey(SnapshotSummary.TOTAL_DELETE_MANIFEST_FILES); + + table.newRowDelta().addDeletes(FILE_A_DELETES).addDeletes(FILE_A2_DELETES).commit(); + table.refresh(); + summary = table.currentSnapshot().summary(); + + assertThat(summary) + .containsEntry(SnapshotSummary.TOTAL_DATA_MANIFEST_FILES, "1") + .containsEntry(SnapshotSummary.TOTAL_DELETE_MANIFEST_FILES, "1"); + } + @TestTemplate public void testIcebergVersionInSummary() { table.newFastAppend().appendFile(FILE_A).commit(); diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java index b67ee87c7d3e..601539784617 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java @@ -19,6 +19,7 @@ package org.apache.iceberg.spark.actions; import static org.apache.iceberg.TableProperties.COMMIT_NUM_RETRIES; +import static org.apache.iceberg.TestSnapshot.testManifestStats; import static org.apache.iceberg.types.Types.NestedField.optional; import static org.apache.iceberg.types.Types.NestedField.required; import static org.apache.spark.sql.functions.current_date; @@ -180,8 +181,10 @@ public void testBinPackUnpartitionedTable() { assertThat(result.rewrittenBytesCount()).isEqualTo(dataSizeBefore); shouldHaveFiles(table, 1); - List actual = currentData(); + testManifestStats(table); + + List actual = currentData(); assertEquals("Rows must match", expectedRecords, actual); } @@ -200,8 +203,10 @@ public void testBinPackPartitionedTable() { assertThat(result.rewrittenBytesCount()).isEqualTo(dataSizeBefore); shouldHaveFiles(table, 4); - List actualRecords = currentData(); + testManifestStats(table); + + List actualRecords = currentData(); assertEquals("Rows must match", expectedRecords, actualRecords); } @@ -226,6 +231,8 @@ public void testBinPackWithFilter() { shouldHaveFiles(table, 7); + testManifestStats(table); + List actualRecords = currentData(); assertEquals("Rows must match", expectedRecords, actualRecords); } @@ -252,6 +259,8 @@ public void testBinPackWithFilterOnBucketExpression() { shouldHaveFiles(table, 7); + testManifestStats(table); + List actualRecords = currentData(); assertEquals("Rows must match", expectedRecords, actualRecords); } @@ -283,9 +292,10 @@ public void testBinPackAfterPartitionChange() { .hasSize(1); assertThat(result.rewrittenBytesCount()).isEqualTo(dataSizeBefore); + testManifestStats(table); + List postRewriteData = currentData(); assertEquals("We shouldn't have changed the data", originalData, 
postRewriteData); - shouldHaveSnapshots(table, 2); shouldHaveACleanCache(table); shouldHaveFiles(table, 20); @@ -331,6 +341,8 @@ public void testBinPackWithDeletes() { .isEqualTo(2); assertThat(result.rewrittenBytesCount()).isGreaterThan(0L).isLessThan(dataSizeBefore); + testManifestStats(table); + List actualRecords = currentData(); assertEquals("Rows must match", expectedRecords, actualRecords); assertThat(actualRecords).as("7 rows are removed").hasSize(total - 7); @@ -361,6 +373,9 @@ public void testBinPackWithDeleteAllData() { .rewriteDataFiles(table) .option(SizeBasedDataRewriter.DELETE_FILE_THRESHOLD, "1") .execute(); + + testManifestStats(table); + assertThat(result.rewrittenDataFilesCount()).as("Action should rewrite 1 data files").isOne(); assertThat(result.rewrittenBytesCount()).isEqualTo(dataSizeBefore); @@ -435,6 +450,8 @@ public void testBinPackWithStartingSequenceNumberV1Compatibility() { assertThat(result.addedDataFilesCount()).as("Action should add 4 data files").isEqualTo(4); assertThat(result.rewrittenBytesCount()).isEqualTo(dataSizeBefore); + testManifestStats(table); + shouldHaveFiles(table, 4); List actualRecords = currentData(); assertEquals("Rows must match", expectedRecords, actualRecords); diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/sql/TestSnapshotsTable.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/sql/TestSnapshotsTable.java new file mode 100644 index 000000000000..98c54e8cbb62 --- /dev/null +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/sql/TestSnapshotsTable.java @@ -0,0 +1,94 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */
+package org.apache.iceberg.spark.sql;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+import java.util.List;
+import java.util.Map;
+import org.apache.iceberg.MetadataTableType;
+import org.apache.iceberg.spark.CatalogTestBase;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.TestTemplate;
+
+public class TestSnapshotsTable extends CatalogTestBase {
+
+  @BeforeEach
+  public void createTables() {
+    sql(
+        "CREATE TABLE %s (id int, data string) USING iceberg "
+            + "TBLPROPERTIES"
+            + "('format-version'='2',"
+            + "'write.delete.mode'='merge-on-read')",
+        tableName);
+    sql("INSERT INTO %s VALUES (1, 'a1'),(2, 'a2'),(3, 'a3')", tableName);
+  }
+
+  @AfterEach
+  public void removeTables() {
+    sql("DROP TABLE IF EXISTS %s", tableName);
+  }
+
+  @TestTemplate
+  public void testSnapshotsTable() {
+    List<Object[]> sql = sql("SELECT * FROM %s.%s", tableName, MetadataTableType.SNAPSHOTS);
+    assertThat(sql).hasSize(1);
+  }
+
+  @TestTemplate
+  public void testTotalDataManifestFilesWithSnapshotsTableSummary() {
+    List<Object[]> sql = sql("SELECT * FROM %s.%s", tableName, MetadataTableType.SNAPSHOTS);
+    assertThat(sql).hasSize(1);
+    Map<String, String> summary = (Map<String, String>) sql.get(0)[5];
+    assertThat(summary.get("total-data-manifest-files")).isEqualTo("1");
+    assertThat(summary.get("total-delete-manifest-files")).isEqualTo(null);
+    sql("INSERT INTO %s VALUES (4, 'a4')", tableName);
+    sql = sql("SELECT * FROM %s.%s", tableName, MetadataTableType.SNAPSHOTS);
+    assertThat(sql).hasSize(2);
+    summary = (Map<String, String>) sql.get(1)[5];
+    assertThat(summary.get("total-data-manifest-files")).isEqualTo("2");
+    assertThat(summary.get("total-delete-manifest-files")).isEqualTo(null);
+  }
+
+  @TestTemplate
+  public void testTotalDeleteManifestFilesWithSnapshotsTableSummary() {
+    List<Object[]> sql = sql("SELECT * FROM %s.%s", tableName, MetadataTableType.SNAPSHOTS);
+    assertThat(sql).hasSize(1);
+    Map<String, String> summary = (Map<String, String>) sql.get(0)[5];
+    assertThat(summary.get("total-data-manifest-files")).isEqualTo("1");
+    assertThat(summary.get("total-delete-manifest-files")).isEqualTo(null);
+    sql("INSERT INTO %s VALUES (1, 'a1'),(2, 'a2'),(3, 'a3'),(4, 'a4')", tableName);
+    sql("INSERT INTO %s VALUES (1, 'a1'),(2, 'a2'),(3, 'a3'),(4, 'a4'),(5, 'a5')", tableName);
+    sql("INSERT INTO %s VALUES (1, 'b1'),(2, 'b2'),(3, 'b3'),(4, 'b4')", tableName);
+    sql("INSERT INTO %s VALUES (1, 'b1'),(2, 'b2'),(3, 'b3'),(4, 'b4'),(5, 'b5')", tableName);
+    sql = sql("SELECT * FROM %s.%s", tableName, MetadataTableType.SNAPSHOTS);
+    assertThat(sql).hasSize(5);
+    summary = (Map<String, String>) sql.get(4)[5];
+    assertThat(summary.get("total-data-manifest-files")).isEqualTo("5");
+    assertThat(summary.get("total-delete-manifest-files")).isEqualTo(null);
+
+    sql("DELETE FROM %s WHERE id = 1", tableName);
+    sql = sql("SELECT * FROM %s.%s", tableName, MetadataTableType.SNAPSHOTS);
+    assertThat(sql).hasSize(6);
+    summary = (Map<String, String>) sql.get(5)[5];
+    assertThat(summary.get("total-data-manifest-files")).isEqualTo("5");
+    assertThat(summary.get("total-delete-manifest-files")).isEqualTo("1");
+  }
+}
From da834c11c24c806fc9f9118e72ccc478b0b40131 Mon Sep 17 00:00:00 2001
From: nk1506
Date: Mon, 12 Aug 2024 16:38:44 +0530
Subject: [PATCH 55/55] Fixed conflicts

---
 .palantir/revapi.yml | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/.palantir/revapi.yml b/.palantir/revapi.yml
index e58ce70ded7a..50d98690c9b7 100644
--- a/.palantir/revapi.yml
+++ b/.palantir/revapi.yml
@@ -1086,6 +1086,12 @@ acceptedBreaks:
   - code: "java.class.removed"
     old: "enum org.apache.iceberg.BaseMetastoreTableOperations.CommitStatus"
     justification: "Removing deprecated code"
+  - code: "java.method.addedToInterface"
+    new: "method org.apache.iceberg.metrics.CounterResult org.apache.iceberg.metrics.CommitMetricsResult::totalDataManifestFiles()"
+    justification: "Added new parameters for manifest stats"
+  - code: "java.method.addedToInterface"
+    new: "method org.apache.iceberg.metrics.CounterResult org.apache.iceberg.metrics.CommitMetricsResult::totalDeleteManifestFiles()"
+    justification: "Added new parameters for manifest stats"
   - code: "java.method.removed"
     old: "method java.lang.String org.apache.iceberg.FileScanTaskParser::toJson(org.apache.iceberg.FileScanTask)"
     justification: "Removing deprecated code"
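
With this series applied, the manifest counts land directly in each snapshot's summary map. The sketch below is not part of the patch; it is a minimal illustration of reading the new "total-data-manifest-files" and "total-delete-manifest-files" keys (as exercised by TestSnapshotsTable above). The HadoopTables usage and the warehouse path are assumptions for the example only, and it presumes the table already has at least one snapshot.

// Illustrative only; table location and HadoopTables loading are assumed, not part of the patch.
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.Table;
import org.apache.iceberg.hadoop.HadoopTables;

public class ReadManifestStats {
  public static void main(String[] args) {
    // Load some existing Iceberg table (hypothetical path).
    Table table = new HadoopTables(new Configuration()).load("file:///tmp/warehouse/db/tbl");

    // The patch adds the manifest counts to the snapshot summary.
    Map<String, String> summary = table.currentSnapshot().summary();

    // Present once the snapshot tracks data manifests.
    System.out.println("total-data-manifest-files = " + summary.get("total-data-manifest-files"));

    // Absent (null) until the table has at least one delete manifest.
    System.out.println("total-delete-manifest-files = " + summary.get("total-delete-manifest-files"));
  }
}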