Skip to content

Commit

Permalink
Fix overwrite when filtering complete files (#1023)
Browse files Browse the repository at this point in the history
  • Loading branch information
ndrluis authored and sungwy committed Aug 9, 2024
1 parent 96cf3b6 commit 09997c4
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 2 deletions.
4 changes: 3 additions & 1 deletion pyiceberg/table/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -622,7 +622,9 @@ def delete(self, delete_filter: Union[str, BooleanExpression], snapshot_properti
filtered_df = df.filter(preserve_row_filter)

# Only rewrite if there are records being deleted
if len(df) != len(filtered_df):
if len(filtered_df) == 0:
replaced_files.append((original_file.file, []))
elif len(df) != len(filtered_df):
replaced_files.append((
original_file.file,
list(
Expand Down
41 changes: 40 additions & 1 deletion tests/integration/test_writes/test_writes.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,12 +38,13 @@
from pyiceberg.catalog.rest import RestCatalog
from pyiceberg.catalog.sql import SqlCatalog
from pyiceberg.exceptions import NoSuchTableError
from pyiceberg.expressions import In
from pyiceberg.io.pyarrow import _dataframe_to_data_files
from pyiceberg.partitioning import PartitionField, PartitionSpec
from pyiceberg.schema import Schema
from pyiceberg.table import TableProperties
from pyiceberg.transforms import IdentityTransform
from pyiceberg.types import IntegerType, LongType, NestedField
from pyiceberg.types import IntegerType, LongType, NestedField, StringType
from utils import _create_table


Expand Down Expand Up @@ -1293,3 +1294,41 @@ def test_rest_catalog_with_empty_catalog_name_append_data(session_catalog: Catal
)
tbl = _create_table(test_catalog, identifier, data=[])
tbl.append(arrow_table_with_null)


@pytest.mark.integration
def test_table_v1_with_null_nested_namespace(session_catalog: Catalog, arrow_table_with_null: pa.Table) -> None:
identifier = "default.lower.table_v1_with_null_nested_namespace"
tbl = _create_table(session_catalog, identifier, {"format-version": "1"}, [arrow_table_with_null])
assert tbl.format_version == 1, f"Expected v1, got: v{tbl.format_version}"
# TODO: Add session_catalog.table_exists check here when we integrate a REST catalog image
# that supports HEAD request on table endpoint

# assert session_catalog.table_exists(identifier)

# We expect no error here
session_catalog.drop_table(identifier)


@pytest.mark.integration
def test_overwrite_all_data_with_filter(session_catalog: Catalog) -> None:
schema = Schema(
NestedField(1, "id", StringType(), required=True),
NestedField(2, "name", StringType(), required=False),
identifier_field_ids=[1],
)

data = pa.Table.from_pylist(
[
{"id": "1", "name": "Amsterdam"},
{"id": "2", "name": "San Francisco"},
{"id": "3", "name": "Drachten"},
],
schema=schema.as_arrow(),
)

identifier = "default.test_overwrite_all_data_with_filter"
tbl = _create_table(session_catalog, identifier, data=[data], schema=schema)
tbl.overwrite(data, In("id", ["1", "2", "3"]))

assert len(tbl.scan().to_arrow()) == 3

0 comments on commit 09997c4

Please sign in to comment.