[flink] Add Shuffle by partition Option to ContinuousFileSplitEnumerator #3877

Merged · 4 commits · Aug 6, 2024
Changes from all commits
@@ -254,6 +254,12 @@
<td>Duration</td>
<td>If a new snapshot has not been generated when the checkpoint is triggered, the enumerator blocks the checkpoint and waits for the new snapshot. Set a maximum waiting time to avoid waiting indefinitely; if the timeout is reached, the checkpoint fails. Note that this should be set smaller than the checkpoint timeout.</td>
</tr>
<tr>
<td><h5>streaming-read.shuffle-by-partition</h5></td>
<td style="word-wrap: break-word;">true</td>
<td>Boolean</td>
<td>Whether to shuffle by partition and bucket in streaming read.</td>
</tr>
<tr>
<td><h5>unaware-bucket.compaction.parallelism</h5></td>
<td style="word-wrap: break-word;">(none)</td>
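For reference, a minimal sketch of flipping the new flag programmatically; only the option constant comes from this PR, while the surrounding class and the use of `Options.set` are illustrative assumptions:

```java
import org.apache.paimon.flink.FlinkConnectorOptions;
import org.apache.paimon.options.Options;

public class DisableShuffleByPartition {
    public static void main(String[] args) {
        Options options = new Options();
        // Default is true: shuffle by (partition, bucket).
        // Setting false routes splits by bucket only.
        options.set(FlinkConnectorOptions.STREAMING_READ_SHUFFLE_BY_PARTITION, false);
        System.out.println(
                options.get(FlinkConnectorOptions.STREAMING_READ_SHUFFLE_BY_PARTITION));
    }
}
```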
@@ -217,6 +217,13 @@ public class FlinkConnectorOptions {
+ " Note: This is dangerous and is likely to cause data errors if downstream"
+ " is used to calculate aggregation and the input is not complete changelog.");

public static final ConfigOption<Boolean> STREAMING_READ_SHUFFLE_BY_PARTITION =
key("streaming-read.shuffle-by-partition")
.booleanType()
.defaultValue(true)
.withDescription(
"Whether shuffle by partition and bucket when streaming read.");

/**
* Weight of writer buffer in managed memory, Flink will compute the memory size for writer
* according to the weight, the actual memory used depends on the running environment.
@@ -23,6 +23,7 @@
import org.apache.paimon.flink.source.assigners.PreAssignSplitAssigner;
import org.apache.paimon.flink.source.assigners.SplitAssigner;
import org.apache.paimon.table.BucketMode;
import org.apache.paimon.table.sink.ChannelComputer;
import org.apache.paimon.table.source.DataSplit;
import org.apache.paimon.table.source.EndOfScanException;
import org.apache.paimon.table.source.SnapshotNotExistPlan;
@@ -75,6 +76,8 @@ public class ContinuousFileSplitEnumerator

private final int splitMaxNum;

private final boolean shuffleByPartition;

@Nullable protected Long nextSnapshotId;

protected boolean finished = false;
@@ -88,7 +91,8 @@ public ContinuousFileSplitEnumerator(
long discoveryInterval,
StreamTableScan scan,
BucketMode bucketMode,
int splitMaxPerTask) {
int splitMaxPerTask,
boolean shuffleByPartition) {
checkArgument(discoveryInterval > 0L);
this.context = checkNotNull(context);
this.nextSnapshotId = nextSnapshotId;
@@ -98,6 +102,7 @@
this.scan = scan;
this.splitAssigner = createSplitAssigner(bucketMode);
this.splitMaxNum = context.currentParallelism() * splitMaxPerTask;
this.shuffleByPartition = shuffleByPartition;
addSplits(remainSplits);

this.consumerProgressCalculator =
@@ -275,7 +280,12 @@ protected synchronized void assignSplits() {
}

protected int assignSuggestedTask(FileStoreSourceSplit split) {
return ((DataSplit) split.split()).bucket() % context.currentParallelism();
DataSplit dataSplit = ((DataSplit) split.split());
if (shuffleByPartition) {
return ChannelComputer.select(
dataSplit.partition(), dataSplit.bucket(), context.currentParallelism());
}
return ChannelComputer.select(dataSplit.bucket(), context.currentParallelism());
}

protected SplitAssigner createSplitAssigner(BucketMode bucketMode) {
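The behavioral change above, in isolation: previously every split for a bucket went to task `bucket % parallelism` regardless of partition; with the flag enabled (the default), the partition participates in channel selection. A standalone sketch, assuming `ChannelComputer.select` behaves as the two overloads used above and with made-up values:

```java
import org.apache.paimon.data.BinaryRow;
import org.apache.paimon.table.sink.ChannelComputer;

public class RoutingSketch {
    public static void main(String[] args) {
        int parallelism = 4;
        BinaryRow partition = BinaryRow.EMPTY_ROW; // placeholder partition
        int bucket = 6;

        // shuffleByPartition = true: (partition, bucket) pairs spread across tasks.
        int byPartitionAndBucket = ChannelComputer.select(partition, bucket, parallelism);

        // shuffleByPartition = false: a bucket always maps to the same task,
        // whatever its partition.
        int byBucketOnly = ChannelComputer.select(bucket, parallelism);

        System.out.println(byPartitionAndBucket + " / " + byBucketOnly);
    }
}
```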
@@ -19,7 +19,9 @@
package org.apache.paimon.flink.source;

import org.apache.paimon.CoreOptions;
import org.apache.paimon.flink.FlinkConnectorOptions;
import org.apache.paimon.flink.metrics.FlinkMetricRegistry;
import org.apache.paimon.options.Options;
import org.apache.paimon.table.BucketMode;
import org.apache.paimon.table.source.ReadBuilder;
import org.apache.paimon.table.source.StreamDataTableScan;
@@ -99,14 +101,15 @@ protected SplitEnumerator<FileStoreSourceSplit, PendingSplitsCheckpoint> buildEnumerator(
Collection<FileStoreSourceSplit> splits,
@Nullable Long nextSnapshotId,
StreamTableScan scan) {
CoreOptions coreOptions = CoreOptions.fromMap(options);
Options options = Options.fromMap(this.options);
return new ContinuousFileSplitEnumerator(
context,
splits,
nextSnapshotId,
coreOptions.continuousDiscoveryInterval().toMillis(),
options.get(CoreOptions.CONTINUOUS_DISCOVERY_INTERVAL).toMillis(),
scan,
bucketMode,
coreOptions.scanSplitMaxPerTask());
options.get(CoreOptions.SCAN_MAX_SPLITS_PER_TASK),
options.get(FlinkConnectorOptions.STREAMING_READ_SHUFFLE_BY_PARTITION));
}
}
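The builder now parses the raw option map once with `Options.fromMap`, so core and Flink-connector keys are read through one object instead of going through `CoreOptions` accessors. A sketch of that pattern with an illustrative map:

```java
import org.apache.paimon.CoreOptions;
import org.apache.paimon.flink.FlinkConnectorOptions;
import org.apache.paimon.options.Options;

import java.util.HashMap;
import java.util.Map;

public class OptionsSketch {
    public static void main(String[] args) {
        Map<String, String> raw = new HashMap<>();
        raw.put("streaming-read.shuffle-by-partition", "false");

        Options options = Options.fromMap(raw);
        // Defaults resolve when a key is absent from the map.
        long discoveryMillis =
                options.get(CoreOptions.CONTINUOUS_DISCOVERY_INTERVAL).toMillis();
        boolean shuffle =
                options.get(FlinkConnectorOptions.STREAMING_READ_SHUFFLE_BY_PARTITION);
        System.out.println(discoveryMillis + "ms, shuffle=" + shuffle);
    }
}
```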
@@ -305,7 +305,8 @@ private DataStream<RowData> buildContinuousStreamOperator() {
produceTypeInfo(),
createReadBuilder(),
conf.get(CoreOptions.CONTINUOUS_DISCOVERY_INTERVAL).toMillis(),
watermarkStrategy == null);
watermarkStrategy == null,
conf.get(FlinkConnectorOptions.STREAMING_READ_SHUFFLE_BY_PARTITION));
if (parallelism != null) {
dataStream.getTransformation().setParallelism(parallelism);
}
@@ -94,15 +94,17 @@ public AlignedContinuousFileSplitEnumerator(
StreamTableScan scan,
BucketMode bucketMode,
long alignTimeout,
int splitPerTaskMax) {
int splitPerTaskMax,
boolean shuffleByPartition) {
super(
context,
remainSplits,
nextSnapshotId,
discoveryInterval,
scan,
bucketMode,
splitPerTaskMax);
splitPerTaskMax,
shuffleByPartition);
this.pendingPlans = new ArrayBlockingQueue<>(MAX_PENDING_PLAN);
this.alignedAssigner = (AlignedSplitAssigner) super.splitAssigner;
this.nextSnapshotId = nextSnapshotId;
@@ -91,6 +91,7 @@ protected SplitEnumerator<FileStoreSourceSplit, PendingSplitsCheckpoint> buildEnumerator(
scan,
bucketMode,
options.get(FlinkConnectorOptions.SOURCE_CHECKPOINT_ALIGN_TIMEOUT).toMillis(),
options.get(CoreOptions.SCAN_MAX_SPLITS_PER_TASK));
options.get(CoreOptions.SCAN_MAX_SPLITS_PER_TASK),
options.get(FlinkConnectorOptions.STREAMING_READ_SHUFFLE_BY_PARTITION));
}
}
@@ -229,15 +229,20 @@ public static DataStream<RowData> buildSource(
TypeInformation<RowData> typeInfo,
ReadBuilder readBuilder,
long monitorInterval,
boolean emitSnapshotWatermark) {
boolean emitSnapshotWatermark,
boolean shuffleByPartition) {
return env.addSource(
new MonitorFunction(readBuilder, monitorInterval, emitSnapshotWatermark),
name + "-Monitor",
new JavaTypeInfo<>(Split.class))
.forceNonParallel()
.partitionCustom(
(key, numPartitions) ->
ChannelComputer.select(key.f0, key.f1, numPartitions),
(key, numPartitions) -> {
if (shuffleByPartition) {
return ChannelComputer.select(key.f0, key.f1, numPartitions);
}
return ChannelComputer.select(key.f1, numPartitions);
},
split -> {
DataSplit dataSplit = (DataSplit) split;
return Tuple2.of(dataSplit.partition(), dataSplit.bucket());
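On the DataStream side the same choice is applied through `partitionCustom`. A self-contained sketch (not Paimon code) of that mechanism, with plain integers standing in for bucket keys and the `shuffleByPartition = false` branch as the partitioner:

```java
import org.apache.flink.api.common.functions.Partitioner;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class PartitionCustomSketch {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env =
                StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(2);

        env.fromElements(0, 1, 2, 3, 4, 5)
                // bucket % numPartitions, i.e. the bucket-only branch above.
                .partitionCustom(
                        (Partitioner<Integer>) (key, numPartitions) -> key % numPartitions,
                        new KeySelector<Integer, Integer>() {
                            @Override
                            public Integer getKey(Integer bucket) {
                                return bucket;
                            }
                        })
                .print();

        env.execute("partition-custom-sketch");
    }
}
```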
@@ -878,7 +878,7 @@ public Builder withBucketMode(BucketMode bucketMode) {

public ContinuousFileSplitEnumerator build() {
return new ContinuousFileSplitEnumerator(
context, initialSplits, null, discoveryInterval, scan, bucketMode, 10);
context, initialSplits, null, discoveryInterval, scan, bucketMode, 10, false);
}
}

@@ -245,7 +245,15 @@ public Builder setAlignedTimeout(long timeout) {

public AlignedContinuousFileSplitEnumerator build() {
return new AlignedContinuousFileSplitEnumerator(
context, initialSplits, null, discoveryInterval, scan, bucketMode, timeout, 10);
context,
initialSplits,
null,
discoveryInterval,
scan,
bucketMode,
timeout,
10,
false);
}
}
}