Add plan tasks for TableScan #1427

Open · wants to merge 2 commits into base: main

8 changes: 8 additions & 0 deletions mkdocs/docs/configuration.md
@@ -52,6 +52,14 @@ For example, `PYICEBERG_CATALOG__DEFAULT__S3__ACCESS_KEY_ID`, sets `s3.access-key-id`

Iceberg tables support table properties to configure table behavior.

### Read options

| Key                            | Options       | Default            | Description                                                                                         |
|--------------------------------|---------------|--------------------|-----------------------------------------------------------------------------------------------------|
| `read.split.target-size`       | Size in bytes | 134217728 (128 MB) | Target size when combining data input splits with `plan_task`                                        |
| `read.split.planning-lookback` | Integer       | 10                 | Number of bins to consider when combining input splits with `plan_task`                              |
| `read.split.open-file-cost`    | Size in bytes | 4194304 (4 MB)     | The estimated cost to open a file, used as a minimum weight when combining splits with `plan_task`   |
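
A minimal sketch of how these options can be overridden per scan, mirroring the integration test added in this PR (it assumes `tbl` is a `Table` that has already been loaded from a catalog):

```python
from pyiceberg.table import TableProperties

# Per-scan overrides take precedence over the table properties listed above.
combined_tasks = tbl.scan(
    options={
        TableProperties.READ_SPLIT_SIZE: 64 * 1024 * 1024,            # read.split.target-size
        TableProperties.READ_SPLIT_LOOKBACK: 10,                      # read.split.planning-lookback
        TableProperties.READ_SPLIT_OPEN_FILE_COST: 4 * 1024 * 1024,   # read.split.open-file-cost
    }
).plan_task()
```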

### Write options

| Key | Options | Default | Description |
3 changes: 3 additions & 0 deletions pyiceberg/manifest.py
@@ -105,6 +105,9 @@ def _missing_(cls, value: object) -> Union[None, str]:
return member
return None

def is_splittable(self) -> bool:
return self == FileFormat.AVRO or self == FileFormat.PARQUET or self == FileFormat.ORC

def __repr__(self) -> str:
"""Return the string representation of the FileFormat class."""
return f"FileFormat.{self.name}"
96 changes: 94 additions & 2 deletions pyiceberg/table/__init__.py
@@ -130,10 +130,12 @@
from pyiceberg.types import (
strtobool,
)
from pyiceberg.utils.bin_packing import ListPacker as ListPacker
from pyiceberg.utils.bin_packing import PackingIterator
from pyiceberg.utils.concurrent import ExecutorFactory
from pyiceberg.utils.config import Config
from pyiceberg.utils.deprecated import deprecated, deprecation_message
from pyiceberg.utils.properties import property_as_bool, property_as_int

if TYPE_CHECKING:
import daft
@@ -191,6 +193,15 @@ class TableProperties:
DELETE_MODE_MERGE_ON_READ = "merge-on-read"
DELETE_MODE_DEFAULT = DELETE_MODE_COPY_ON_WRITE

READ_SPLIT_SIZE = "read.split.target-size"
READ_SPLIT_SIZE_DEFAULT = 128 * 1024 * 1024 # 128 MB

READ_SPLIT_LOOKBACK = "read.split.planning-lookback"
READ_SPLIT_LOOKBACK_DEFAULT = 10

READ_SPLIT_OPEN_FILE_COST = "read.split.open-file-cost"
READ_SPLIT_OPEN_FILE_COST_DEFAULT = 4 * 1024 * 1024 # 4 MB

DEFAULT_NAME_MAPPING = "schema.name-mapping.default"
FORMAT_VERSION = "format-version"
DEFAULT_FORMAT_VERSION = 2
@@ -1229,7 +1240,8 @@ def with_case_sensitive(self: S, case_sensitive: bool = True) -> S:


class ScanTask(ABC):
@abstractmethod
def size_in_bytes(self) -> int: ...


@dataclass(init=False)
@@ -1253,6 +1265,26 @@ def __init__(
self.start = start or 0
self.length = length or data_file.file_size_in_bytes

def size_in_bytes(self) -> int:
return self.length + sum(f.file_size_in_bytes for f in self.delete_files)


@dataclass(init=False)
class CombinedFileScanTask(ScanTask):
"""Task representing combined multiple file scan tasks.

Used in plan_tasks. File can be split into multiple FileScanTask based on
split_offsets and then combined into read.split.target-size.
"""

tasks: List[FileScanTask]

def __init__(self, tasks: List[FileScanTask]) -> None:
self.tasks = tasks

def size_in_bytes(self) -> int:
return sum(f.size_in_bytes() for f in self.tasks)


def _open_manifest(
io: FileIO,
@@ -1423,6 +1455,66 @@ def plan_files(self) -> Iterable[FileScanTask]:
for data_entry in data_entries
]

def _target_split_size(self) -> int:
table_value = property_as_int(
self.table_metadata.properties, TableProperties.READ_SPLIT_SIZE, TableProperties.READ_SPLIT_SIZE_DEFAULT
)
return property_as_int(self.options, TableProperties.READ_SPLIT_SIZE, table_value) # type: ignore

    def _lookback(self) -> int:
table_value = property_as_int(
self.table_metadata.properties, TableProperties.READ_SPLIT_LOOKBACK, TableProperties.READ_SPLIT_LOOKBACK_DEFAULT
)
return property_as_int(self.options, TableProperties.READ_SPLIT_LOOKBACK, table_value) # type: ignore

def _split_open_file_cost(self) -> int:
table_value = property_as_int(
self.table_metadata.properties,
TableProperties.READ_SPLIT_OPEN_FILE_COST,
TableProperties.READ_SPLIT_OPEN_FILE_COST_DEFAULT,
)
return property_as_int(self.options, TableProperties.READ_SPLIT_OPEN_FILE_COST, table_value) # type: ignore

def plan_task(self) -> Iterable[CombinedFileScanTask]:
Review thread on `plan_task`:

@corleyma (Dec 13, 2024):

I assume it's intentional that we're not actually calling this in any of the methods (like `to_arrow`, etc.) that actually execute a scan?

If the plan is to call this in `to_arrow` eventually, it would be good to have some benchmarks with realistic latencies (e.g., against actual object storage).

If there is no plan to call this directly in pyiceberg, I wonder who the intended consumers of this API would be. I would expect most query engines -- distributed or otherwise -- to have their own notions of how to optimize scan planning.

@corleyma (Dec 13, 2024):

Also, assuming there's still some future plan to offload expensive work like data scanning to a Rust core, I do wonder whether we want to commit to exposing additional scan-planning APIs that may not align with whatever query engine gets embedded in pyiceberg.

If this is primarily intended for the benefit of pyiceberg's own scan execution, I would consider making it a private API.

@kevinjqliu (Contributor):

Good catch. I originally thought the `plan_task` function was being used; it looks like I confused `plan_files` with `plan_task`. Currently `plan_files` is used by the read path:
https://grep.app/search?q=plan_files&filter[repo][0]=apache/iceberg-python&filter[path][0]=pyiceberg/table/

@ConeyLiu (Contributor, PR author):

> I assume it's intentional that we're not actually calling this in any of the methods (like to_arrow, etc) that actually execute a scan?

Yes. I am not sure of the original intention behind reading the full file, so I left that unchanged.

> If there is no plan to call this directly in pyiceberg, I wonder who the intended consumers of this API would be? I would expect most query engines -- distributed or otherwise -- to have their own notions for how to optimize scan planning.

I recently hit OOM errors when reading an Iceberg table with Ray in a distributed setting. Also, parallelism is limited by the number of files when planning at file granularity (files are often rewritten into larger ones, e.g. 512 MB or 1 GB). As with Spark/Flink, I think `plan_task` should be useful for distributed reading.

> In the case that this is primarily intended to be for the benefit of pyiceberg's own scan execution I would consider making this a private API.

Distributed engines such as Ray/Daft would have to implement task planning themselves if they want it. Wouldn't it be better to put it into pyiceberg?

Contributor:

I have the same concern as @corleyma. For example, Java splits on row groups to make full use of the parallelism of Spark, Trino, Hive, etc. On a local machine it makes more sense to just plow through the file itself, preferably using a native reader like PyArrow or Iceberg-Rust in the future.

There are a lot of details around this API. For example, a task might point to a row group that doesn't contain any relevant information, and we don't know that until we open the file itself.

@samster25 (Daft):

Yes, we perform our own plan-files splitting and merging. We create an iterator from pyiceberg where @kevinjqliu linked the code, but during our own optimization passes we perform fusion of files as well as splitting by row group.

The logic for that can be found here:
https://github.com/Eventual-Inc/Daft/blob/e148248dae8af90c8993d2ec6b2f471521c0a7f2/src/daft-scan/src/scan_task_iters.rs#L181

@Fokko (Contributor):

Thanks @samster25. One question: do you plan purely on the data provided by PyIceberg, or do you first fetch the footers of the Parquet files?

@samster25 (Daft):

@Fokko If we have metadata from pyiceberg like num_rows, file_size, and num_row_groups (from split_offsets), we skip fetching the Parquet footer and speed up planning.

On that note, we would also love to have distinct_counts in the DataFile as well. That would help us estimate memory usage when planning a query in Daft.

@corleyma:

Re what @ConeyLiu mentioned about the issues he encountered with Ray: I think it would be reasonable for pyiceberg to keep this more nuanced distributed planning logic in a private API and (if we wanted to) expose a Ray Datasets implementation that uses the private API. But I'd vote against committing to this planning logic in a public API, for the reasons outlined above: engines want to bring their own logic, and pyiceberg may well want to change this logic significantly in the future to the extent that we continue to embed more query-engine functionality for convenience.

@ConeyLiu (Contributor, PR author):

Thanks for the feedback.

"""Plan balanced combined tasks for this scan by splitting large and combining small tasks.

Returns:
List of CombinedFileScanTasks
"""
split_size = self._target_split_size()
        lookback = self._lookback()
open_file_cost = self._split_open_file_cost()

def split(task: FileScanTask) -> List[FileScanTask]:
data_file = task.file
if not data_file.file_format.is_splittable() or not data_file.split_offsets:
return [task]

split_offsets = data_file.split_offsets
if not all(split_offsets[i] <= split_offsets[i + 1] for i in range(len(split_offsets) - 1)):
                # split offsets must be sorted in ascending order
return [task]

all_tasks = []
for i in range(len(split_offsets) - 1):
all_tasks.append(
FileScanTask(data_file, task.delete_files, split_offsets[i], split_offsets[i + 1] - split_offsets[i])
)

all_tasks.append(
FileScanTask(data_file, task.delete_files, split_offsets[-1], data_file.file_size_in_bytes - split_offsets[-1])
)

return all_tasks

def weight_func(task: FileScanTask) -> int:
return max(task.size_in_bytes(), (1 + len(task.delete_files)) * open_file_cost)

file_tasks = self.plan_files()
split_file_tasks = list(itertools.chain.from_iterable(map(split, file_tasks)))
        packing_iterator = PackingIterator(split_file_tasks, split_size, lookback, weight_func, False)
return list(map(CombinedFileScanTask, packing_iterator))
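
For context, a short sketch of how a caller (for example a distributed reader) might consume the combined tasks; it assumes `tbl` is an already-loaded `Table` and uses the attributes defined on the task classes above plus the data file's `file_path` field:

```python
combined_tasks = tbl.scan().plan_task()

for combined in combined_tasks:
    # Each CombinedFileScanTask groups one or more FileScanTasks whose total
    # weight is close to read.split.target-size.
    print(f"combined task: {combined.size_in_bytes()} bytes in {len(combined.tasks)} piece(s)")
    for piece in combined.tasks:
        # start/length describe the byte range of the data file covered by this piece.
        print(f"  {piece.file.file_path} [{piece.start}, {piece.start + piece.length})")
```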

def to_arrow(self) -> pa.Table:
"""Read an Arrow table eagerly from this DataScan.

74 changes: 74 additions & 0 deletions tests/integration/test_reads.py
@@ -55,6 +55,7 @@
StringType,
TimestampType,
)
from pyiceberg.utils.bin_packing import PackingIterator
from pyiceberg.utils.concurrent import ExecutorFactory

DEFAULT_PROPERTIES = {"write.parquet.compression-codec": "zstd"}
@@ -873,3 +874,76 @@ def test_table_scan_empty_table(catalog: Catalog) -> None:
result_table = tbl.scan().to_arrow()

assert len(result_table) == 0


@pytest.mark.integration
def test_plan_tasks(session_catalog: Catalog) -> None:
from pyiceberg.table import TableProperties

table_name = "default.test_plan_tasks"
try:
session_catalog.drop_table(table_name)
except NoSuchTableError:
pass # Just to make sure that the table doesn't exist

tbl = session_catalog.create_table(
table_name,
Schema(
NestedField(1, "number", LongType()),
),
properties={TableProperties.PARQUET_ROW_GROUP_LIMIT: "1"},
)

# Write 10 row groups, that should end up as 10 batches
entries = 10
tbl.append(
pa.Table.from_pylist(
[
{
"number": number,
}
for number in range(entries)
],
)
)

assert len(tbl.inspect.files()) == 1

plan_files = list(tbl.scan().plan_files())
assert len(plan_files) == 1
data_file = plan_files[0].file
assert data_file.split_offsets is not None and len(data_file.split_offsets) == 10

plan_tasks = list(tbl.scan(options={TableProperties.READ_SPLIT_SIZE: 1}).plan_task())
assert len(plan_tasks) == 10

split_offsets = []
for task in plan_tasks:
assert len(task.tasks) == 1
split_offsets.append(task.tasks[0].start)

assert split_offsets == plan_files[0].file.split_offsets

split_sizes = []
for i in range(1, len(data_file.split_offsets)):
split_sizes.append(data_file.split_offsets[i] - data_file.split_offsets[i - 1])

split_sizes.append(data_file.file_size_in_bytes - data_file.split_offsets[-1])

read_split_size = int(data_file.file_size_in_bytes / 4)
read_split_open_file_cost = 1
read_split_lookback = 5

Review comment from @ConeyLiu (PR author): @kevinjqliu I added more tests here, please give more feedback.

    plan_tasks = list(
tbl.scan(
options={
TableProperties.READ_SPLIT_SIZE: read_split_size,
TableProperties.READ_SPLIT_OPEN_FILE_COST: read_split_open_file_cost,
TableProperties.READ_SPLIT_LOOKBACK: read_split_lookback,
}
).plan_task()
)

assert len(plan_tasks) == len(
list(PackingIterator(split_sizes, read_split_size, read_split_lookback, lambda size: size, False))
)