From 11ce7a6998583280311d3b911dc7a637c387b1ee Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Sat, 14 Dec 2024 14:54:27 +0100 Subject: [PATCH 01/57] feat: initial setup for S3TablesCatalog --- pyiceberg/catalog/__init__.py | 4 +- pyiceberg/catalog/s3tables.py | 228 +++++++++++++++++++++++++++++++++ pyiceberg/exceptions.py | 5 + pyproject.toml | 1 + tests/catalog/test_s3tables.py | 71 ++++++++++ 5 files changed, 307 insertions(+), 2 deletions(-) create mode 100644 pyiceberg/catalog/s3tables.py create mode 100644 tests/catalog/test_s3tables.py diff --git a/pyiceberg/catalog/__init__.py b/pyiceberg/catalog/__init__.py index aad225eae6..a5eb8feea9 100644 --- a/pyiceberg/catalog/__init__.py +++ b/pyiceberg/catalog/__init__.py @@ -903,8 +903,8 @@ def _get_default_warehouse_location(self, database_name: str, table_name: str) - raise ValueError("No default path is set, please specify a location when creating a table") @staticmethod - def _write_metadata(metadata: TableMetadata, io: FileIO, metadata_path: str) -> None: - ToOutputFile.table_metadata(metadata, io.new_output(metadata_path)) + def _write_metadata(metadata: TableMetadata, io: FileIO, metadata_path: str, overwrite: bool = False) -> None: + ToOutputFile.table_metadata(metadata, io.new_output(metadata_path), overwrite=overwrite) @staticmethod def _get_metadata_location(location: str, new_version: int = 0) -> str: diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py new file mode 100644 index 0000000000..82cd862571 --- /dev/null +++ b/pyiceberg/catalog/s3tables.py @@ -0,0 +1,228 @@ +import re +from typing import TYPE_CHECKING +from typing import List +from typing import Optional +from typing import Set +from typing import Tuple +from typing import Union + +import boto3 + +from pyiceberg.catalog import DEPRECATED_BOTOCORE_SESSION, MetastoreCatalog +from pyiceberg.catalog import WAREHOUSE_LOCATION +from pyiceberg.catalog import Catalog +from pyiceberg.catalog import PropertiesUpdateSummary 
+from pyiceberg.exceptions import S3TablesError +from pyiceberg.exceptions import TableBucketNotFound +from pyiceberg.io import AWS_ACCESS_KEY_ID +from pyiceberg.io import AWS_REGION +from pyiceberg.io import AWS_SECRET_ACCESS_KEY +from pyiceberg.io import AWS_SESSION_TOKEN +from pyiceberg.io import load_file_io +from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionSpec +from pyiceberg.schema import Schema +from pyiceberg.serializers import FromInputFile +from pyiceberg.table import CommitTableResponse +from pyiceberg.table import CreateTableTransaction +from pyiceberg.table import Table +from pyiceberg.table import TableRequirement +from pyiceberg.table.metadata import new_table_metadata +from pyiceberg.table.sorting import UNSORTED_SORT_ORDER, SortOrder +from pyiceberg.table.update import TableUpdate +from pyiceberg.typedef import EMPTY_DICT, Identifier +from pyiceberg.typedef import Properties +from pyiceberg.utils.properties import get_first_property_value + +if TYPE_CHECKING: + import pyarrow as pa + +S3TABLES_PROFILE_NAME = "s3tables.profile-name" +S3TABLES_REGION = "s3tables.region" +S3TABLES_ACCESS_KEY_ID = "s3tables.access-key-id" +S3TABLES_SECRET_ACCESS_KEY = "s3tables.secret-access-key" +S3TABLES_SESSION_TOKEN = "s3tables.session-token" + +S3TABLES_ENDPOINT = "s3tables.endpoint" + + +class S3TableCatalog(MetastoreCatalog): + def __init__(self, name: str, **properties: str): + super().__init__(name, **properties) + + session = boto3.Session( + profile_name=properties.get(S3TABLES_PROFILE_NAME), + region_name=get_first_property_value(properties, S3TABLES_REGION, AWS_REGION), + botocore_session=properties.get(DEPRECATED_BOTOCORE_SESSION), + aws_access_key_id=get_first_property_value(properties, S3TABLES_ACCESS_KEY_ID, AWS_ACCESS_KEY_ID), + aws_secret_access_key=get_first_property_value( + properties, S3TABLES_SECRET_ACCESS_KEY, AWS_SECRET_ACCESS_KEY + ), + aws_session_token=get_first_property_value(properties, S3TABLES_SESSION_TOKEN, 
AWS_SESSION_TOKEN), + ) + # TODO: s3tables client only supported from boto3>=1.35.74 so this can crash + # TODO: set a custom user-agent for api calls like the Java implementation + self.s3tables = session.client("s3tables") + # TODO: handle malformed properties instead of just raising a key error here + self.table_bucket_arn = self.properties[WAREHOUSE_LOCATION] + try: + self.s3tables.get_table_bucket(tableBucketARN=self.table_bucket_arn) + except self.s3tables.exceptions.NotFoundException as e: + raise TableBucketNotFound(e) from e + + def commit_table( + self, table: Table, requirements: Tuple[TableRequirement, ...], updates: Tuple[TableUpdate, ...] + ) -> CommitTableResponse: + return super().commit_table(table, requirements, updates) + + def create_namespace(self, namespace: Union[str, Identifier], properties: Properties = ...) -> None: + valid_namespace: str = self._validate_namespace_identifier(namespace) + self.s3tables.create_namespace(tableBucketARN=self.table_bucket_arn, namespace=[valid_namespace]) + + def _validate_namespace_identifier(self, namespace: Union[str, Identifier]) -> str: + # for naming rules see: https://docs.aws.amazon.com/AmazonS3/latest/userguide/s3-tables-buckets-naming.html + # TODO: extract into constant variables + pattern = re.compile("[a-z0-9][a-z0-9_]{2,62}") + reserved = "aws_s3_metadata" + + namespace = self.identifier_to_database(namespace) + + if not pattern.fullmatch(namespace): + ... + + if namespace == reserved: + ... 
+ + return namespace + + def _validate_database_and_table_identifier(self, identifier: Union[str, Identifier]) -> Tuple[str, str]: + # for naming rules see: https://docs.aws.amazon.com/AmazonS3/latest/userguide/s3-tables-buckets-naming.html + # TODO: extract into constant variables + pattern = re.compile("[a-z0-9][a-z0-9_]{2,62}") + + namespace, table_name = self.identifier_to_database_and_table(identifier) + + namespace = self._validate_namespace_identifier(namespace) + + if not pattern.fullmatch(table_name): + ... + + return namespace, table_name + + def create_table( + self, + identifier: Union[str, Identifier], + schema: Union[Schema, "pa.Schema"], + location: Optional[str] = None, + partition_spec: PartitionSpec = UNPARTITIONED_PARTITION_SPEC, + sort_order: SortOrder = UNSORTED_SORT_ORDER, + properties: Properties = EMPTY_DICT, + ) -> Table: + namespace, table_name = self._validate_database_and_table_identifier(identifier) + + schema: Schema = self._convert_schema_if_needed(schema) # type: ignore + + # TODO: check whether namespace exists and if it does, whether table_name already exists + self.s3tables.create_table( + tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name, format="ICEBERG" + ) + + # location is given by s3 table bucket + response = self.s3tables.get_table(tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name) + version_token = response["versionToken"] + + location = response["warehouseLocation"] + metadata_location = self._get_metadata_location(location=location) + metadata = new_table_metadata( + location=location, + schema=schema, + partition_spec=partition_spec, + sort_order=sort_order, + properties=properties, + ) + + io = load_file_io(properties=self.properties, location=metadata_location) + # TODO: this triggers unsupported list operation error, setting overwrite=True is a workaround for now + # TODO: we can perform this check manually maybe? 
+ self._write_metadata(metadata, io, metadata_location, overwrite=True) + # TODO: after writing need to update table metadata location + # can this operation fail if the version token does not match? + self.s3tables.update_table_metadata_location( + tableBucketARN=self.table_bucket_arn, + namespace=namespace, + name=table_name, + versionToken=version_token, + metadataLocation=metadata_location, + ) + + return self.load_table(identifier=identifier) + + def create_table_transaction( + self, + identifier: Union[str, Identifier], + schema: Union[Schema, "pa.Schema"], + location: Optional[str] = None, + partition_spec: PartitionSpec = ..., + sort_order: SortOrder = ..., + properties: Properties = ..., + ) -> CreateTableTransaction: + return super().create_table_transaction(identifier, schema, location, partition_spec, sort_order, properties) + + def drop_namespace(self, namespace: Union[str, Identifier]) -> None: + return super().drop_namespace(namespace) + + def drop_table(self, identifier: Union[str, Identifier]) -> None: + return super().drop_table(identifier) + + def drop_view(self, identifier: Union[str, Identifier]) -> None: + return super().drop_view(identifier) + + def list_namespaces(self, namespace: Union[str, Identifier] = ...) 
-> List[Identifier]: + # TODO: handle pagination + response = self.s3tables.list_namespaces(tableBucketARN=self.table_bucket_arn) + return [tuple(namespace["namespace"]) for namespace in response["namespaces"]] + + def list_tables(self, namespace: Union[str, Identifier]) -> List[Identifier]: + return super().list_tables(namespace) + + def list_views(self, namespace: Union[str, Identifier]) -> List[Identifier]: + return super().list_views(namespace) + + def load_namespace_properties(self, namespace: Union[str, Identifier]) -> Properties: + return super().load_namespace_properties(namespace) + + def load_table(self, identifier: Union[str, Identifier]) -> Table: + namespace, table_name = self._validate_database_and_table_identifier(identifier) + # TODO: raise a NoSuchTableError if it does not exist + response = self.s3tables.get_table_metadata_location( + tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name + ) + # TODO: we might need to catch if table is not initialized i.e. 
does not have metadata setup yet + metadata_location = response["metadataLocation"] + + io = load_file_io(properties=self.properties, location=metadata_location) + file = io.new_input(metadata_location) + metadata = FromInputFile.table_metadata(file) + return Table( + identifier=(namespace, table_name), + metadata=metadata, + metadata_location=metadata_location, + io=self._load_file_io(metadata.properties, metadata_location), + catalog=self, + ) + + def purge_table(self, identifier: Union[str, Identifier]) -> None: + return super().purge_table(identifier) + + def register_table(self, identifier: Union[str, Identifier], metadata_location: str) -> Table: + return super().register_table(identifier, metadata_location) + + def rename_table(self, from_identifier: Union[str, Identifier], to_identifier: Union[str, Identifier]) -> Table: + return super().rename_table(from_identifier, to_identifier) + + def table_exists(self, identifier: Union[str, Identifier]) -> bool: + return super().table_exists(identifier) + + def update_namespace_properties( + self, namespace: Union[str, Identifier], removals: Optional[Set[str]] = None, updates: Properties = ... 
+ ) -> PropertiesUpdateSummary: + return super().update_namespace_properties(namespace, removals, updates) diff --git a/pyiceberg/exceptions.py b/pyiceberg/exceptions.py index 56574ff471..6096096f45 100644 --- a/pyiceberg/exceptions.py +++ b/pyiceberg/exceptions.py @@ -111,6 +111,11 @@ class ConditionalCheckFailedException(DynamoDbError): class GenericDynamoDbError(DynamoDbError): pass +class S3TablesError(Exception): + pass + +class TableBucketNotFound(S3TablesError): + pass class CommitFailedException(Exception): """Commit failed, refresh and try again.""" diff --git a/pyproject.toml b/pyproject.toml index 58dac055ca..f734cdd560 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1206,6 +1206,7 @@ sql-postgres = ["sqlalchemy", "psycopg2-binary"] sql-sqlite = ["sqlalchemy"] gcsfs = ["gcsfs"] rest-sigv4 = ["boto3"] +s3tables = ["boto3"] [tool.pytest.ini_options] markers = [ diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py new file mode 100644 index 0000000000..220b0ad507 --- /dev/null +++ b/tests/catalog/test_s3tables.py @@ -0,0 +1,71 @@ +import boto3 +from pyiceberg.schema import Schema +import pytest + +from pyiceberg.catalog.s3tables import S3TableCatalog +from pyiceberg.exceptions import TableBucketNotFound + + +@pytest.fixture +def database_name(database_name): + # naming rules prevent "-" in namespaces for s3 table buckets + return database_name.replace("-", "_") + + +@pytest.fixture +def table_name(table_name): + # naming rules prevent "-" in table namees for s3 table buckets + return table_name.replace("-", "_") + +@pytest.fixture +def table_bucket_arn(): + import os + # since the moto library does not support s3tables as of 2024-12-14 we have to test against a real AWS endpoint + # in one of the supported regions. 
+ + return os.environ["ARN"] + + +def test_s3tables_boto_api(table_bucket_arn): + client = boto3.client("s3tables") + response = client.list_namespaces(tableBucketARN=table_bucket_arn) + print(response["namespaces"]) + + response = client.get_table_bucket(tableBucketARN=table_bucket_arn + "abc") + print(response) + + + +def test_s3tables_namespaces_api(table_bucket_arn): + client = boto3.client("s3tables") + response = client.create_namespace(tableBucketARN=table_bucket_arn, namespace=["one", "two"]) + print(response) + response = client.list_namespaces(tableBucketARN=table_bucket_arn) + print(response) + +def test_creating_catalog_validates_s3_table_bucket_exists(table_bucket_arn): + properties = {"warehouse": f"{table_bucket_arn}-modified"} + with pytest.raises(TableBucketNotFound): + S3TableCatalog(name="test_s3tables_catalog", **properties) + + +def test_create_namespace(table_bucket_arn, database_name: str): + properties = {"warehouse": table_bucket_arn} + catalog = S3TableCatalog(name="test_s3tables_catalog", **properties) + catalog.create_namespace(namespace=database_name) + namespaces = catalog.list_namespaces() + assert (database_name,) in namespaces + + +def test_create_table(table_bucket_arn, database_name: str, table_name:str, table_schema_nested: Schema): + properties = {"warehouse": table_bucket_arn} + catalog = S3TableCatalog(name="test_s3tables_catalog", **properties) + identifier = (database_name, table_name) + + catalog.create_namespace(namespace=database_name) + print(database_name, table_name) + # this fails with + # OSError: When completing multiple part upload for key 'metadata/00000-55a9c37c-b822-4a81-ac0e-1efbcd145dba.metadata.json' in bucket '14e4e036-d4ae-44f8-koana45eruw + # Uunable to parse ExceptionName: S3TablesUnsupportedHeader Message: S3 Tables does not support the following header: x-amz-api-version value: 2006-03-01 + table = catalog.create_table(identifier=identifier, schema=table_schema_nested) + print(table) From 
6932281b89e522971771d46cb666480fd1ff48d0 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Sat, 14 Dec 2024 17:41:40 +0100 Subject: [PATCH 02/57] feat: support create_table using FsspecFileIO --- pyiceberg/catalog/s3tables.py | 7 ++++++- tests/catalog/test_s3tables.py | 27 ++++----------------------- 2 files changed, 10 insertions(+), 24 deletions(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index 82cd862571..740e72a124 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -14,7 +14,7 @@ from pyiceberg.catalog import PropertiesUpdateSummary from pyiceberg.exceptions import S3TablesError from pyiceberg.exceptions import TableBucketNotFound -from pyiceberg.io import AWS_ACCESS_KEY_ID +from pyiceberg.io import AWS_ACCESS_KEY_ID, PY_IO_IMPL from pyiceberg.io import AWS_REGION from pyiceberg.io import AWS_SECRET_ACCESS_KEY from pyiceberg.io import AWS_SESSION_TOKEN @@ -44,10 +44,15 @@ S3TABLES_ENDPOINT = "s3tables.endpoint" +# pyarrow does not support writing to S3 Table buckets as of 2024-12-14 https://github.com/apache/iceberg-python/issues/1404#issuecomment-2543174146 +S3TABLES_FILE_IO_DEFAULT = "pyiceberg.io.fsspec.FsspecFileIO" + class S3TableCatalog(MetastoreCatalog): def __init__(self, name: str, **properties: str): super().__init__(name, **properties) + # TODO: implement a proper check for FileIO + self.properties[PY_IO_IMPL] = S3TABLES_FILE_IO_DEFAULT session = boto3.Session( profile_name=properties.get(S3TABLES_PROFILE_NAME), diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py index 220b0ad507..855b144a14 100644 --- a/tests/catalog/test_s3tables.py +++ b/tests/catalog/test_s3tables.py @@ -26,23 +26,6 @@ def table_bucket_arn(): return os.environ["ARN"] -def test_s3tables_boto_api(table_bucket_arn): - client = boto3.client("s3tables") - response = client.list_namespaces(tableBucketARN=table_bucket_arn) - print(response["namespaces"]) - - response = 
client.get_table_bucket(tableBucketARN=table_bucket_arn + "abc") - print(response) - - - -def test_s3tables_namespaces_api(table_bucket_arn): - client = boto3.client("s3tables") - response = client.create_namespace(tableBucketARN=table_bucket_arn, namespace=["one", "two"]) - print(response) - response = client.list_namespaces(tableBucketARN=table_bucket_arn) - print(response) - def test_creating_catalog_validates_s3_table_bucket_exists(table_bucket_arn): properties = {"warehouse": f"{table_bucket_arn}-modified"} with pytest.raises(TableBucketNotFound): @@ -58,14 +41,12 @@ def test_create_namespace(table_bucket_arn, database_name: str): def test_create_table(table_bucket_arn, database_name: str, table_name:str, table_schema_nested: Schema): - properties = {"warehouse": table_bucket_arn} + # setting FileIO to FsspecFileIO explicitly is required as pyarrwo does not work with S3 Table Buckets yet + properties = {"warehouse": table_bucket_arn, "py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO"} catalog = S3TableCatalog(name="test_s3tables_catalog", **properties) identifier = (database_name, table_name) catalog.create_namespace(namespace=database_name) - print(database_name, table_name) - # this fails with - # OSError: When completing multiple part upload for key 'metadata/00000-55a9c37c-b822-4a81-ac0e-1efbcd145dba.metadata.json' in bucket '14e4e036-d4ae-44f8-koana45eruw - # Uunable to parse ExceptionName: S3TablesUnsupportedHeader Message: S3 Tables does not support the following header: x-amz-api-version value: 2006-03-01 table = catalog.create_table(identifier=identifier, schema=table_schema_nested) - print(table) + + assert table == catalog.load_table(identifier) From f8bec25da1d6c9e1802735c2569c11057fe87035 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Sat, 14 Dec 2024 18:13:38 +0100 Subject: [PATCH 03/57] feat: implement drop_table --- pyiceberg/catalog/s3tables.py | 38 ++++++++++++++++++++++++++-------- tests/catalog/test_s3tables.py | 18 +++++++++++++++- 2 
files changed, 46 insertions(+), 10 deletions(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index 740e72a124..7234e7223a 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -8,18 +8,22 @@ import boto3 -from pyiceberg.catalog import DEPRECATED_BOTOCORE_SESSION, MetastoreCatalog +from pyiceberg.catalog import DEPRECATED_BOTOCORE_SESSION from pyiceberg.catalog import WAREHOUSE_LOCATION from pyiceberg.catalog import Catalog +from pyiceberg.catalog import MetastoreCatalog from pyiceberg.catalog import PropertiesUpdateSummary +from pyiceberg.exceptions import NoSuchTableError from pyiceberg.exceptions import S3TablesError from pyiceberg.exceptions import TableBucketNotFound -from pyiceberg.io import AWS_ACCESS_KEY_ID, PY_IO_IMPL +from pyiceberg.io import AWS_ACCESS_KEY_ID from pyiceberg.io import AWS_REGION from pyiceberg.io import AWS_SECRET_ACCESS_KEY from pyiceberg.io import AWS_SESSION_TOKEN +from pyiceberg.io import PY_IO_IMPL from pyiceberg.io import load_file_io -from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionSpec +from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC +from pyiceberg.partitioning import PartitionSpec from pyiceberg.schema import Schema from pyiceberg.serializers import FromInputFile from pyiceberg.table import CommitTableResponse @@ -27,9 +31,11 @@ from pyiceberg.table import Table from pyiceberg.table import TableRequirement from pyiceberg.table.metadata import new_table_metadata -from pyiceberg.table.sorting import UNSORTED_SORT_ORDER, SortOrder +from pyiceberg.table.sorting import UNSORTED_SORT_ORDER +from pyiceberg.table.sorting import SortOrder from pyiceberg.table.update import TableUpdate -from pyiceberg.typedef import EMPTY_DICT, Identifier +from pyiceberg.typedef import EMPTY_DICT +from pyiceberg.typedef import Identifier from pyiceberg.typedef import Properties from pyiceberg.utils.properties import get_first_property_value @@ -176,7 
+182,18 @@ def drop_namespace(self, namespace: Union[str, Identifier]) -> None: return super().drop_namespace(namespace) def drop_table(self, identifier: Union[str, Identifier]) -> None: - return super().drop_table(identifier) + namespace, table_name = self._validate_database_and_table_identifier(identifier) + try: + response = self.s3tables.get_table( + tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name + ) + except self.s3tables.exceptions.NotFoundException as e: + raise NoSuchTableError(f"No table with identifier {identifier} exists.") from e + + # TODO: handle conflicts due to versionToken mismatch that might occur + self.s3tables.delete_table( + tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name, versionToken=response["versionToken"] + ) def drop_view(self, identifier: Union[str, Identifier]) -> None: return super().drop_view(identifier) @@ -198,9 +215,12 @@ def load_namespace_properties(self, namespace: Union[str, Identifier]) -> Proper def load_table(self, identifier: Union[str, Identifier]) -> Table: namespace, table_name = self._validate_database_and_table_identifier(identifier) # TODO: raise a NoSuchTableError if it does not exist - response = self.s3tables.get_table_metadata_location( - tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name - ) + try: + response = self.s3tables.get_table_metadata_location( + tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name + ) + except self.s3tables.exceptions.NotFoundException as e: + raise NoSuchTableError(f"No table with identifier {identifier} exists.") from e # TODO: we might need to catch if table is not initialized i.e. 
does not have metadata setup yet metadata_location = response["metadataLocation"] diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py index 855b144a14..9a68cdb794 100644 --- a/tests/catalog/test_s3tables.py +++ b/tests/catalog/test_s3tables.py @@ -3,7 +3,7 @@ import pytest from pyiceberg.catalog.s3tables import S3TableCatalog -from pyiceberg.exceptions import TableBucketNotFound +from pyiceberg.exceptions import NoSuchTableError, TableBucketNotFound @pytest.fixture @@ -50,3 +50,19 @@ def test_create_table(table_bucket_arn, database_name: str, table_name:str, tabl table = catalog.create_table(identifier=identifier, schema=table_schema_nested) assert table == catalog.load_table(identifier) + + + +def test_drop_table(table_bucket_arn, database_name: str, table_name:str, table_schema_nested: Schema): + # setting FileIO to FsspecFileIO explicitly is required as pyarrwo does not work with S3 Table Buckets yet + properties = {"warehouse": table_bucket_arn, "py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO"} + catalog = S3TableCatalog(name="test_s3tables_catalog", **properties) + identifier = (database_name, table_name) + + catalog.create_namespace(namespace=database_name) + catalog.create_table(identifier=identifier, schema=table_schema_nested) + + catalog.drop_table(identifier=identifier) + + with pytest.raises(NoSuchTableError): + catalog.load_table(identifier=identifier) From 9018e704c6f84ad0d92a907775613b72c5bba994 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Sat, 14 Dec 2024 18:20:02 +0100 Subject: [PATCH 04/57] feat: implement drop_namespace --- pyiceberg/catalog/s3tables.py | 9 +++++++-- tests/catalog/test_s3tables.py | 9 +++++++++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index 7234e7223a..fd7d7addc0 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -13,7 +13,7 @@ from pyiceberg.catalog import Catalog from pyiceberg.catalog 
import MetastoreCatalog from pyiceberg.catalog import PropertiesUpdateSummary -from pyiceberg.exceptions import NoSuchTableError +from pyiceberg.exceptions import NamespaceNotEmptyError, NoSuchTableError from pyiceberg.exceptions import S3TablesError from pyiceberg.exceptions import TableBucketNotFound from pyiceberg.io import AWS_ACCESS_KEY_ID @@ -179,7 +179,12 @@ def create_table_transaction( return super().create_table_transaction(identifier, schema, location, partition_spec, sort_order, properties) def drop_namespace(self, namespace: Union[str, Identifier]) -> None: - return super().drop_namespace(namespace) + namespace = self._validate_namespace_identifier(namespace) + try: + self.s3tables.delete_namespace(tableBucketARN=self.table_bucket_arn, namespace=namespace) + except self.s3tables.exceptions.ConflictException as e: + raise NamespaceNotEmptyError(f"Namespace {namespace} is not empty.") from e + def drop_table(self, identifier: Union[str, Identifier]) -> None: namespace, table_name = self._validate_database_and_table_identifier(identifier) diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py index 9a68cdb794..0503fff9c1 100644 --- a/tests/catalog/test_s3tables.py +++ b/tests/catalog/test_s3tables.py @@ -39,6 +39,15 @@ def test_create_namespace(table_bucket_arn, database_name: str): namespaces = catalog.list_namespaces() assert (database_name,) in namespaces +def test_drop_namespace(table_bucket_arn, database_name: str): + properties = {"warehouse": table_bucket_arn} + catalog = S3TableCatalog(name="test_s3tables_catalog", **properties) + catalog.create_namespace(namespace=database_name) + assert (database_name,) in catalog.list_namespaces() + catalog.drop_namespace(namespace=database_name) + assert (database_name,) not in catalog.list_namespaces() + + def test_create_table(table_bucket_arn, database_name: str, table_name:str, table_schema_nested: Schema): # setting FileIO to FsspecFileIO explicitly is required as pyarrwo does not 
work with S3 Table Buckets yet From b053fbb09b1c13d35ada0366bb68f16d0c8b7bc1 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Sun, 15 Dec 2024 12:01:36 +0100 Subject: [PATCH 05/57] test: validate how version conflict is handled with s3tables API --- tests/catalog/test_s3tables.py | 40 +++++++++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 6 deletions(-) diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py index 0503fff9c1..c7b9d13df8 100644 --- a/tests/catalog/test_s3tables.py +++ b/tests/catalog/test_s3tables.py @@ -1,9 +1,12 @@ +import uuid + import boto3 -from pyiceberg.schema import Schema import pytest from pyiceberg.catalog.s3tables import S3TableCatalog -from pyiceberg.exceptions import NoSuchTableError, TableBucketNotFound +from pyiceberg.exceptions import NoSuchTableError +from pyiceberg.exceptions import TableBucketNotFound +from pyiceberg.schema import Schema @pytest.fixture @@ -17,15 +20,41 @@ def table_name(table_name): # naming rules prevent "-" in table namees for s3 table buckets return table_name.replace("-", "_") + @pytest.fixture def table_bucket_arn(): import os + # since the moto library does not support s3tables as of 2024-12-14 we have to test against a real AWS endpoint # in one of the supported regions. 
return os.environ["ARN"] +def test_s3tables_api_raises_on_conflicting_version_tokens(table_bucket_arn, database_name, table_name): + client = boto3.client("s3tables") + client.create_namespace(tableBucketARN=table_bucket_arn, namespace=[database_name]) + response = client.create_table( + tableBucketARN=table_bucket_arn, namespace=database_name, name=table_name, format="ICEBERG" + ) + version_token = response["versionToken"] + scrambled_version_token = version_token[::-1] + + warehouse_location = client.get_table(tableBucketARN=table_bucket_arn, namespace=database_name, name=table_name)[ + "warehouseLocation" + ] + metadata_location = f"{warehouse_location}/metadata/00001-{uuid.uuid4()}.metadata.json" + + with pytest.raises(client.exceptions.ConflictException): + client.update_table_metadata_location( + tableBucketARN=table_bucket_arn, + namespace=database_name, + name=table_name, + versionToken=scrambled_version_token, + metadataLocation=metadata_location, + ) + + def test_creating_catalog_validates_s3_table_bucket_exists(table_bucket_arn): properties = {"warehouse": f"{table_bucket_arn}-modified"} with pytest.raises(TableBucketNotFound): @@ -39,6 +68,7 @@ def test_create_namespace(table_bucket_arn, database_name: str): namespaces = catalog.list_namespaces() assert (database_name,) in namespaces + def test_drop_namespace(table_bucket_arn, database_name: str): properties = {"warehouse": table_bucket_arn} catalog = S3TableCatalog(name="test_s3tables_catalog", **properties) @@ -48,8 +78,7 @@ def test_drop_namespace(table_bucket_arn, database_name: str): assert (database_name,) not in catalog.list_namespaces() - -def test_create_table(table_bucket_arn, database_name: str, table_name:str, table_schema_nested: Schema): +def test_create_table(table_bucket_arn, database_name: str, table_name: str, table_schema_nested: Schema): # setting FileIO to FsspecFileIO explicitly is required as pyarrwo does not work with S3 Table Buckets yet properties = {"warehouse": 
table_bucket_arn, "py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO"} catalog = S3TableCatalog(name="test_s3tables_catalog", **properties) @@ -61,8 +90,7 @@ def test_create_table(table_bucket_arn, database_name: str, table_name:str, tabl assert table == catalog.load_table(identifier) - -def test_drop_table(table_bucket_arn, database_name: str, table_name:str, table_schema_nested: Schema): +def test_drop_table(table_bucket_arn, database_name: str, table_name: str, table_schema_nested: Schema): # setting FileIO to FsspecFileIO explicitly is required as pyarrwo does not work with S3 Table Buckets yet properties = {"warehouse": table_bucket_arn, "py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO"} catalog = S3TableCatalog(name="test_s3tables_catalog", **properties) From f8fff8ac0201b028e834efedc566770c12fa07cd Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Sun, 15 Dec 2024 13:40:23 +0100 Subject: [PATCH 06/57] feat: implement commit_table --- pyiceberg/catalog/s3tables.py | 52 +++++++++++++++++++++++++++++++--- tests/catalog/test_s3tables.py | 30 ++++++++++++++++++++ 2 files changed, 78 insertions(+), 4 deletions(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index fd7d7addc0..1fb384b21d 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -13,7 +13,9 @@ from pyiceberg.catalog import Catalog from pyiceberg.catalog import MetastoreCatalog from pyiceberg.catalog import PropertiesUpdateSummary -from pyiceberg.exceptions import NamespaceNotEmptyError, NoSuchTableError +from pyiceberg.exceptions import CommitFailedException +from pyiceberg.exceptions import NamespaceNotEmptyError +from pyiceberg.exceptions import NoSuchTableError from pyiceberg.exceptions import S3TablesError from pyiceberg.exceptions import TableBucketNotFound from pyiceberg.io import AWS_ACCESS_KEY_ID @@ -83,7 +85,47 @@ def __init__(self, name: str, **properties: str): def commit_table( self, table: Table, requirements: Tuple[TableRequirement, 
...], updates: Tuple[TableUpdate, ...] ) -> CommitTableResponse: - return super().commit_table(table, requirements, updates) + table_identifier = table.name() + database_name, table_name = self.identifier_to_database_and_table(table_identifier, NoSuchTableError) + + # TODO: loading table and getting versionToken should be an atomic operation, otherwise + # the table can change between those two API calls without noticing + # -> change this into an internal API that returns table information along with versionToken + current_table = self.load_table(identifier=table_identifier) + version_token = self.s3tables.get_table( + tableBucketARN=self.table_bucket_arn, namespace=database_name, name=table_name + )["versionToken"] + + updated_staged_table = self._update_and_stage_table(current_table, table_identifier, requirements, updates) + if current_table and updated_staged_table.metadata == current_table.metadata: + # no changes, do nothing + return CommitTableResponse( + metadata=current_table.metadata, metadata_location=current_table.metadata_location + ) + + self._write_metadata( + metadata=updated_staged_table.metadata, + io=updated_staged_table.io, + metadata_path=updated_staged_table.metadata_location, + overwrite=True, + ) + + # try to update metadata location which will fail if the versionToken changed meanwhile + try: + self.s3tables.update_table_metadata_location( + tableBucketARN=self.table_bucket_arn, + namespace=database_name, + name=table_name, + versionToken=version_token, + metadataLocation=updated_staged_table.metadata_location, + ) + except self.s3tables.exceptions.ConflictException as e: + raise CommitFailedException( + f"Cannot commit {database_name}.{table_name} because of a concurrent update to the table version {version_token}." 
+ ) from e + return CommitTableResponse( + metadata=updated_staged_table.metadata, metadata_location=updated_staged_table.metadata_location + ) def create_namespace(self, namespace: Union[str, Identifier], properties: Properties = ...) -> None: valid_namespace: str = self._validate_namespace_identifier(namespace) @@ -185,7 +227,6 @@ def drop_namespace(self, namespace: Union[str, Identifier]) -> None: except self.s3tables.exceptions.ConflictException as e: raise NamespaceNotEmptyError(f"Namespace {namespace} is not empty.") from e - def drop_table(self, identifier: Union[str, Identifier]) -> None: namespace, table_name = self._validate_database_and_table_identifier(identifier) try: @@ -197,7 +238,10 @@ def drop_table(self, identifier: Union[str, Identifier]) -> None: # TODO: handle conflicts due to versionToken mismatch that might occur self.s3tables.delete_table( - tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name, versionToken=response["versionToken"] + tableBucketARN=self.table_bucket_arn, + namespace=namespace, + name=table_name, + versionToken=response["versionToken"], ) def drop_view(self, identifier: Union[str, Identifier]) -> None: diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py index c7b9d13df8..d185bd795d 100644 --- a/tests/catalog/test_s3tables.py +++ b/tests/catalog/test_s3tables.py @@ -7,6 +7,7 @@ from pyiceberg.exceptions import NoSuchTableError from pyiceberg.exceptions import TableBucketNotFound from pyiceberg.schema import Schema +from pyiceberg.types import IntegerType @pytest.fixture @@ -103,3 +104,32 @@ def test_drop_table(table_bucket_arn, database_name: str, table_name: str, table with pytest.raises(NoSuchTableError): catalog.load_table(identifier=identifier) + + +def test_commit_table(table_bucket_arn, database_name: str, table_name: str, table_schema_nested: Schema): + # setting FileIO to FsspecFileIO explicitly is required as pyarrwo does not work with S3 Table Buckets yet + properties = 
{"warehouse": table_bucket_arn, "py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO"} + catalog = S3TableCatalog(name="test_s3tables_catalog", **properties) + identifier = (database_name, table_name) + + catalog.create_namespace(namespace=database_name) + table = catalog.create_table(identifier=identifier, schema=table_schema_nested) + + last_updated_ms = table.metadata.last_updated_ms + original_table_metadata_location = table.metadata_location + original_table_last_updated_ms = table.metadata.last_updated_ms + + + + transaction = table.transaction() + update = transaction.update_schema() + update.add_column(path="b", field_type=IntegerType()) + update.commit() + transaction.commit_transaction() + + updated_table_metadata = table.metadata + assert updated_table_metadata.current_schema_id == 1 + assert len(updated_table_metadata.schemas) == 2 + assert updated_table_metadata.last_updated_ms > last_updated_ms + assert updated_table_metadata.metadata_log[0].metadata_file == original_table_metadata_location + assert updated_table_metadata.metadata_log[0].timestamp_ms == original_table_last_updated_ms From 579bce674c5c063a2149936b460f8ffbb0599f73 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Wed, 18 Dec 2024 21:37:10 +0100 Subject: [PATCH 07/57] feat: implement table_exists --- pyiceberg/catalog/s3tables.py | 9 ++++++++- tests/catalog/test_s3tables.py | 12 ++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index 1fb384b21d..947157320d 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -294,7 +294,14 @@ def rename_table(self, from_identifier: Union[str, Identifier], to_identifier: U return super().rename_table(from_identifier, to_identifier) def table_exists(self, identifier: Union[str, Identifier]) -> bool: - return super().table_exists(identifier) + namespace, table_name = self._validate_database_and_table_identifier(identifier) + try: + 
self.s3tables.get_table( + tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name + ) + except self.s3tables.exceptions.NotFoundException: + return False + return True def update_namespace_properties( self, namespace: Union[str, Identifier], removals: Optional[Set[str]] = None, updates: Properties = ... diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py index d185bd795d..f33d04d953 100644 --- a/tests/catalog/test_s3tables.py +++ b/tests/catalog/test_s3tables.py @@ -91,6 +91,18 @@ def test_create_table(table_bucket_arn, database_name: str, table_name: str, tab assert table == catalog.load_table(identifier) +def test_table_exists(table_bucket_arn, database_name: str, table_name: str, table_schema_nested: Schema): + # setting FileIO to FsspecFileIO explicitly is required as pyarrwo does not work with S3 Table Buckets yet + properties = {"warehouse": table_bucket_arn, "py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO"} + catalog = S3TableCatalog(name="test_s3tables_catalog", **properties) + identifier = (database_name, table_name) + + catalog.create_namespace(namespace=database_name) + assert not catalog.table_exists(identifier=identifier) + catalog.create_table(identifier=identifier, schema=table_schema_nested) + assert catalog.table_exists(identifier=identifier) + + def test_drop_table(table_bucket_arn, database_name: str, table_name: str, table_schema_nested: Schema): # setting FileIO to FsspecFileIO explicitly is required as pyarrwo does not work with S3 Table Buckets yet properties = {"warehouse": table_bucket_arn, "py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO"} From 5ef779018c7a434870d9d55c656af7c5d271645e Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Wed, 18 Dec 2024 21:53:07 +0100 Subject: [PATCH 08/57] feat: implement list_tables --- pyiceberg/catalog/s3tables.py | 7 ++++++- tests/catalog/test_s3tables.py | 12 ++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git 
a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index 947157320d..047c689afd 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -253,7 +253,12 @@ def list_namespaces(self, namespace: Union[str, Identifier] = ...) -> List[Ident return [tuple(namespace["namespace"]) for namespace in response["namespaces"]] def list_tables(self, namespace: Union[str, Identifier]) -> List[Identifier]: - return super().list_tables(namespace) + namespace = self._validate_namespace_identifier(namespace) + paginator = self.s3tables.get_paginator("list_tables") + tables: List[str] = [] + for page in paginator.paginate(tableBucketARN=self.table_bucket_arn, namespace=namespace): + tables.extend(table["name"] for table in page["tables"]) + return tables def list_views(self, namespace: Union[str, Identifier]) -> List[Identifier]: return super().list_views(namespace) diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py index f33d04d953..e74bddc973 100644 --- a/tests/catalog/test_s3tables.py +++ b/tests/catalog/test_s3tables.py @@ -103,6 +103,18 @@ def test_table_exists(table_bucket_arn, database_name: str, table_name: str, tab assert catalog.table_exists(identifier=identifier) +def test_list_tables(table_bucket_arn, database_name: str, table_name: str, table_schema_nested: Schema): + # setting FileIO to FsspecFileIO explicitly is required as pyarrwo does not work with S3 Table Buckets yet + properties = {"warehouse": table_bucket_arn, "py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO"} + catalog = S3TableCatalog(name="test_s3tables_catalog", **properties) + identifier = (database_name, table_name) + + catalog.create_namespace(namespace=database_name) + assert not catalog.list_tables(namespace=database_name) + catalog.create_table(identifier=identifier, schema=table_schema_nested) + assert catalog.list_tables(namespace=database_name) + + def test_drop_table(table_bucket_arn, database_name: str, table_name: str, 
table_schema_nested: Schema): # setting FileIO to FsspecFileIO explicitly is required as pyarrwo does not work with S3 Table Buckets yet properties = {"warehouse": table_bucket_arn, "py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO"} From 6d496dfa06a5632dcb43b1d1ccc694d468c04cab Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Wed, 18 Dec 2024 21:59:14 +0100 Subject: [PATCH 09/57] refactor: improve list_namespace --- pyiceberg/catalog/s3tables.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index 047c689afd..9e223dcc7a 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -247,10 +247,17 @@ def drop_table(self, identifier: Union[str, Identifier]) -> None: def drop_view(self, identifier: Union[str, Identifier]) -> None: return super().drop_view(identifier) - def list_namespaces(self, namespace: Union[str, Identifier] = ...) -> List[Identifier]: - # TODO: handle pagination - response = self.s3tables.list_namespaces(tableBucketARN=self.table_bucket_arn) - return [tuple(namespace["namespace"]) for namespace in response["namespaces"]] + def list_namespaces(self, namespace: Union[str, Identifier] = ()) -> List[Identifier]: + # TODO: s3tables only support single level namespaces + if namespace: + namespace = self._validate_namespace_identifier(namespace) + paginator = self.s3tables.get_paginator("list_namespaces") + + namespaces: List[Identifier] = [] + for page in paginator.paginate(tableBucketARN=self.table_bucket_arn): + namespaces.extend(tuple(entry["namespace"]) for entry in page["namespaces"]) + + return namespaces def list_tables(self, namespace: Union[str, Identifier]) -> List[Identifier]: namespace = self._validate_namespace_identifier(namespace) From 33b08f0eeb7bcd4811db23144698576db48b9d12 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Wed, 18 Dec 2024 21:59:50 +0100 Subject: [PATCH 10/57] fix: return Identifier from list_tables --- 
pyiceberg/catalog/s3tables.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index 9e223dcc7a..f974c354bd 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -262,9 +262,9 @@ def list_namespaces(self, namespace: Union[str, Identifier] = ()) -> List[Identi def list_tables(self, namespace: Union[str, Identifier]) -> List[Identifier]: namespace = self._validate_namespace_identifier(namespace) paginator = self.s3tables.get_paginator("list_tables") - tables: List[str] = [] + tables: List[Identifier] = [] for page in paginator.paginate(tableBucketARN=self.table_bucket_arn, namespace=namespace): - tables.extend(table["name"] for table in page["tables"]) + tables.extend((namespace, table["name"]) for table in page["tables"]) return tables def list_views(self, namespace: Union[str, Identifier]) -> List[Identifier]: From 798e65d23e775d9a1c98d3d24c8945473b5906b1 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Sat, 21 Dec 2024 14:22:25 +0100 Subject: [PATCH 11/57] feat: implement rename table --- pyiceberg/catalog/s3tables.py | 22 ++++++++++++++++++---- tests/catalog/test_s3tables.py | 19 +++++++++++++++++++ 2 files changed, 37 insertions(+), 4 deletions(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index f974c354bd..54d57e347d 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -303,14 +303,28 @@ def register_table(self, identifier: Union[str, Identifier], metadata_location: return super().register_table(identifier, metadata_location) def rename_table(self, from_identifier: Union[str, Identifier], to_identifier: Union[str, Identifier]) -> Table: - return super().rename_table(from_identifier, to_identifier) + from_namespace, from_table_name = self._validate_database_and_table_identifier(from_identifier) + to_namespace, to_table_name = self._validate_database_and_table_identifier(to_identifier) + + 
version_token = self.s3tables.get_table( + tableBucketARN=self.table_bucket_arn, namespace=from_namespace, name=from_table_name + )["versionToken"] + + self.s3tables.rename_table( + tableBucketARN=self.table_bucket_arn, + namespace=from_namespace, + name=from_table_name, + newNamespaceName=to_namespace, + newName=to_table_name, + versionToken=version_token + ) + + return self.load_table(to_identifier) def table_exists(self, identifier: Union[str, Identifier]) -> bool: namespace, table_name = self._validate_database_and_table_identifier(identifier) try: - self.s3tables.get_table( - tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name - ) + self.s3tables.get_table(tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name) except self.s3tables.exceptions.NotFoundException: return False return True diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py index e74bddc973..3750b8a357 100644 --- a/tests/catalog/test_s3tables.py +++ b/tests/catalog/test_s3tables.py @@ -103,6 +103,25 @@ def test_table_exists(table_bucket_arn, database_name: str, table_name: str, tab assert catalog.table_exists(identifier=identifier) +def test_rename_table(table_bucket_arn, database_name: str, table_name: str, table_schema_nested: Schema): + # setting FileIO to FsspecFileIO explicitly is required as pyarrwo does not work with S3 Table Buckets yet + properties = {"warehouse": table_bucket_arn, "py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO"} + catalog = S3TableCatalog(name="test_s3tables_catalog", **properties) + identifier = (database_name, table_name) + + catalog.create_namespace(namespace=database_name) + catalog.create_table(identifier=identifier, schema=table_schema_nested) + + to_database_name = f"{database_name}new" + to_table_name = f"{table_name}new" + to_identifier = (to_database_name, to_table_name) + catalog.create_namespace(namespace=to_database_name) + catalog.rename_table(from_identifier=identifier, 
to_identifier=to_identifier) + + assert not catalog.table_exists(identifier=identifier) + assert catalog.table_exists(identifier=to_identifier) + + def test_list_tables(table_bucket_arn, database_name: str, table_name: str, table_schema_nested: Schema): # setting FileIO to FsspecFileIO explicitly is required as pyarrwo does not work with S3 Table Buckets yet properties = {"warehouse": table_bucket_arn, "py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO"} From 5297ca72965592328cbeecca2eee78fec92c8372 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Sat, 21 Dec 2024 14:32:03 +0100 Subject: [PATCH 12/57] feat: implement load_namespace_properties --- pyiceberg/catalog/s3tables.py | 11 +++++++++-- tests/catalog/test_s3tables.py | 9 +++++++-- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index 54d57e347d..c77651444b 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -271,7 +271,14 @@ def list_views(self, namespace: Union[str, Identifier]) -> List[Identifier]: return super().list_views(namespace) def load_namespace_properties(self, namespace: Union[str, Identifier]) -> Properties: - return super().load_namespace_properties(namespace) + namespace = self._validate_namespace_identifier(namespace) + response = self.s3tables.get_namespace(tableBucketARN=self.table_bucket_arn, namespace=namespace) + return { + "namespace": response["namespace"], + "createdAt": response["createdAt"], + "createdBy": response["createdBy"], + "ownerAccountId": response["ownerAccountId"], + } def load_table(self, identifier: Union[str, Identifier]) -> Table: namespace, table_name = self._validate_database_and_table_identifier(identifier) @@ -316,7 +323,7 @@ def rename_table(self, from_identifier: Union[str, Identifier], to_identifier: U name=from_table_name, newNamespaceName=to_namespace, newName=to_table_name, - versionToken=version_token + versionToken=version_token, ) return 
self.load_table(to_identifier) diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py index 3750b8a357..5b7393b3a0 100644 --- a/tests/catalog/test_s3tables.py +++ b/tests/catalog/test_s3tables.py @@ -70,6 +70,13 @@ def test_create_namespace(table_bucket_arn, database_name: str): assert (database_name,) in namespaces +def test_load_namespace_properties(table_bucket_arn, database_name: str): + properties = {"warehouse": table_bucket_arn} + catalog = S3TableCatalog(name="test_s3tables_catalog", **properties) + catalog.create_namespace(namespace=database_name) + assert database_name in catalog.load_namespace_properties(database_name)["namespace"] + + def test_drop_namespace(table_bucket_arn, database_name: str): properties = {"warehouse": table_bucket_arn} catalog = S3TableCatalog(name="test_s3tables_catalog", **properties) @@ -162,8 +169,6 @@ def test_commit_table(table_bucket_arn, database_name: str, table_name: str, tab original_table_metadata_location = table.metadata_location original_table_last_updated_ms = table.metadata.last_updated_ms - - transaction = table.transaction() update = transaction.update_schema() update.add_column(path="b", field_type=IntegerType()) From 3817431bc03bf559f86d7300f9261d0aea2639f1 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Sat, 21 Dec 2024 14:38:07 +0100 Subject: [PATCH 13/57] refactor: move some methods around --- pyiceberg/catalog/s3tables.py | 47 ++++++++++++++++++----------------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index c77651444b..2370a5099c 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -10,7 +10,6 @@ from pyiceberg.catalog import DEPRECATED_BOTOCORE_SESSION from pyiceberg.catalog import WAREHOUSE_LOCATION -from pyiceberg.catalog import Catalog from pyiceberg.catalog import MetastoreCatalog from pyiceberg.catalog import PropertiesUpdateSummary from pyiceberg.exceptions import 
CommitFailedException @@ -157,6 +156,7 @@ def _validate_database_and_table_identifier(self, identifier: Union[str, Identif namespace = self._validate_namespace_identifier(namespace) if not pattern.fullmatch(table_name): + # TODO: raise proper errors for invalid table_name ... return namespace, table_name @@ -209,16 +209,6 @@ def create_table( return self.load_table(identifier=identifier) - def create_table_transaction( - self, - identifier: Union[str, Identifier], - schema: Union[Schema, "pa.Schema"], - location: Optional[str] = None, - partition_spec: PartitionSpec = ..., - sort_order: SortOrder = ..., - properties: Properties = ..., - ) -> CreateTableTransaction: - return super().create_table_transaction(identifier, schema, location, partition_spec, sort_order, properties) def drop_namespace(self, namespace: Union[str, Identifier]) -> None: namespace = self._validate_namespace_identifier(namespace) @@ -244,9 +234,6 @@ def drop_table(self, identifier: Union[str, Identifier]) -> None: versionToken=response["versionToken"], ) - def drop_view(self, identifier: Union[str, Identifier]) -> None: - return super().drop_view(identifier) - def list_namespaces(self, namespace: Union[str, Identifier] = ()) -> List[Identifier]: # TODO: s3tables only support single level namespaces if namespace: @@ -267,9 +254,6 @@ def list_tables(self, namespace: Union[str, Identifier]) -> List[Identifier]: tables.extend((namespace, table["name"]) for table in page["tables"]) return tables - def list_views(self, namespace: Union[str, Identifier]) -> List[Identifier]: - return super().list_views(namespace) - def load_namespace_properties(self, namespace: Union[str, Identifier]) -> Properties: namespace = self._validate_namespace_identifier(namespace) response = self.s3tables.get_namespace(tableBucketARN=self.table_bucket_arn, namespace=namespace) @@ -303,12 +287,6 @@ def load_table(self, identifier: Union[str, Identifier]) -> Table: catalog=self, ) - def purge_table(self, identifier: Union[str, 
Identifier]) -> None: - return super().purge_table(identifier) - - def register_table(self, identifier: Union[str, Identifier], metadata_location: str) -> Table: - return super().register_table(identifier, metadata_location) - def rename_table(self, from_identifier: Union[str, Identifier], to_identifier: Union[str, Identifier]) -> Table: from_namespace, from_table_name = self._validate_database_and_table_identifier(from_identifier) to_namespace, to_table_name = self._validate_database_and_table_identifier(to_identifier) @@ -340,3 +318,26 @@ def update_namespace_properties( self, namespace: Union[str, Identifier], removals: Optional[Set[str]] = None, updates: Properties = ... ) -> PropertiesUpdateSummary: return super().update_namespace_properties(namespace, removals, updates) + + def create_table_transaction( + self, + identifier: Union[str, Identifier], + schema: Union[Schema, "pa.Schema"], + location: Optional[str] = None, + partition_spec: PartitionSpec = ..., + sort_order: SortOrder = ..., + properties: Properties = ..., + ) -> CreateTableTransaction: + return super().create_table_transaction(identifier, schema, location, partition_spec, sort_order, properties) + + def purge_table(self, identifier: Union[str, Identifier]) -> None: + return super().purge_table(identifier) + + def register_table(self, identifier: Union[str, Identifier], metadata_location: str) -> Table: + return super().register_table(identifier, metadata_location) + + def drop_view(self, identifier: Union[str, Identifier]) -> None: + return super().drop_view(identifier) + + def list_views(self, namespace: Union[str, Identifier]) -> List[Identifier]: + return super().list_views(namespace) From c4a6485a5ea80b3a7a7fdf6b07ea2b95a9fb2010 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Sat, 21 Dec 2024 14:39:04 +0100 Subject: [PATCH 14/57] feat: raise NotImplementedError for views functionality --- pyiceberg/catalog/s3tables.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index 2370a5099c..bb0007b796 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -337,7 +337,7 @@ def register_table(self, identifier: Union[str, Identifier], metadata_location: return super().register_table(identifier, metadata_location) def drop_view(self, identifier: Union[str, Identifier]) -> None: - return super().drop_view(identifier) + raise NotImplementedError def list_views(self, namespace: Union[str, Identifier]) -> List[Identifier]: - return super().list_views(namespace) + raise NotImplementedError From 7bc8c3e160adfb4f50eb26f30a096d2e281102c2 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Sat, 21 Dec 2024 14:43:11 +0100 Subject: [PATCH 15/57] feat: raise NotImplementedError for purge_table --- pyiceberg/catalog/s3tables.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index bb0007b796..c91f76eb08 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -331,7 +331,9 @@ def create_table_transaction( return super().create_table_transaction(identifier, schema, location, partition_spec, sort_order, properties) def purge_table(self, identifier: Union[str, Identifier]) -> None: - return super().purge_table(identifier) + # purge is not supported as s3tables doesn't support delete operations + # table maintenance is automated + raise NotImplementedError def register_table(self, identifier: Union[str, Identifier], metadata_location: str) -> Table: return super().register_table(identifier, metadata_location) From 3de30877d08ef6a2d4ac1068a2f55380e93c85c8 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Sat, 21 Dec 2024 14:52:50 +0100 Subject: [PATCH 16/57] feat: raise NotImplementedError for update_namespace_properties --- pyiceberg/catalog/s3tables.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pyiceberg/catalog/s3tables.py 
b/pyiceberg/catalog/s3tables.py index c91f76eb08..058d2acc59 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -315,9 +315,10 @@ def table_exists(self, identifier: Union[str, Identifier]) -> bool: return True def update_namespace_properties( - self, namespace: Union[str, Identifier], removals: Optional[Set[str]] = None, updates: Properties = ... + self, namespace: Union[str, Identifier], removals: Optional[Set[str]] = None, updates: Properties = EMPTY_DICT ) -> PropertiesUpdateSummary: - return super().update_namespace_properties(namespace, removals, updates) + # namespace properties are read only + raise NotImplementedError def create_table_transaction( self, From 988485bf1b37fa51b8737fd4cd00108c8767343d Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Sat, 21 Dec 2024 15:02:57 +0100 Subject: [PATCH 17/57] feat: raise NotImplementedError for register_table --- pyiceberg/catalog/s3tables.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index 058d2acc59..3c4a1efda7 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -337,7 +337,7 @@ def purge_table(self, identifier: Union[str, Identifier]) -> None: raise NotImplementedError def register_table(self, identifier: Union[str, Identifier], metadata_location: str) -> Table: - return super().register_table(identifier, metadata_location) + raise NotImplementedError def drop_view(self, identifier: Union[str, Identifier]) -> None: raise NotImplementedError From 6a28cb94625d061f93753cbd6fadb55e40ab3b71 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Sat, 21 Dec 2024 15:10:51 +0100 Subject: [PATCH 18/57] fix: don't override create_table_transaction --- pyiceberg/catalog/s3tables.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index 3c4a1efda7..555e9783e0 100644 --- a/pyiceberg/catalog/s3tables.py +++ 
b/pyiceberg/catalog/s3tables.py @@ -320,17 +320,6 @@ def update_namespace_properties( # namespace properties are read only raise NotImplementedError - def create_table_transaction( - self, - identifier: Union[str, Identifier], - schema: Union[Schema, "pa.Schema"], - location: Optional[str] = None, - partition_spec: PartitionSpec = ..., - sort_order: SortOrder = ..., - properties: Properties = ..., - ) -> CreateTableTransaction: - return super().create_table_transaction(identifier, schema, location, partition_spec, sort_order, properties) - def purge_table(self, identifier: Union[str, Identifier]) -> None: # purge is not supported as s3tables doesn't support delete operations # table maintenance is automated From a545c8247af8221f8f93df600466dc5191d6ab4e Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Sat, 21 Dec 2024 15:15:57 +0100 Subject: [PATCH 19/57] chore: run formatter --- pyiceberg/catalog/s3tables.py | 68 ++++++++++------------------------ pyiceberg/exceptions.py | 3 ++ tests/catalog/test_s3tables.py | 7 +--- 3 files changed, 25 insertions(+), 53 deletions(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index 555e9783e0..b4aa7283b5 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -1,43 +1,24 @@ import re -from typing import TYPE_CHECKING -from typing import List -from typing import Optional -from typing import Set -from typing import Tuple -from typing import Union +from typing import TYPE_CHECKING, List, Optional, Set, Tuple, Union import boto3 -from pyiceberg.catalog import DEPRECATED_BOTOCORE_SESSION -from pyiceberg.catalog import WAREHOUSE_LOCATION -from pyiceberg.catalog import MetastoreCatalog -from pyiceberg.catalog import PropertiesUpdateSummary -from pyiceberg.exceptions import CommitFailedException -from pyiceberg.exceptions import NamespaceNotEmptyError -from pyiceberg.exceptions import NoSuchTableError -from pyiceberg.exceptions import S3TablesError -from pyiceberg.exceptions import 
TableBucketNotFound -from pyiceberg.io import AWS_ACCESS_KEY_ID -from pyiceberg.io import AWS_REGION -from pyiceberg.io import AWS_SECRET_ACCESS_KEY -from pyiceberg.io import AWS_SESSION_TOKEN -from pyiceberg.io import PY_IO_IMPL -from pyiceberg.io import load_file_io -from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC -from pyiceberg.partitioning import PartitionSpec +from pyiceberg.catalog import DEPRECATED_BOTOCORE_SESSION, WAREHOUSE_LOCATION, MetastoreCatalog, PropertiesUpdateSummary +from pyiceberg.exceptions import ( + CommitFailedException, + NamespaceNotEmptyError, + NoSuchTableError, + TableBucketNotFound, +) +from pyiceberg.io import AWS_ACCESS_KEY_ID, AWS_REGION, AWS_SECRET_ACCESS_KEY, AWS_SESSION_TOKEN, PY_IO_IMPL, load_file_io +from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionSpec from pyiceberg.schema import Schema from pyiceberg.serializers import FromInputFile -from pyiceberg.table import CommitTableResponse -from pyiceberg.table import CreateTableTransaction -from pyiceberg.table import Table -from pyiceberg.table import TableRequirement +from pyiceberg.table import CommitTableResponse, Table, TableRequirement from pyiceberg.table.metadata import new_table_metadata -from pyiceberg.table.sorting import UNSORTED_SORT_ORDER -from pyiceberg.table.sorting import SortOrder +from pyiceberg.table.sorting import UNSORTED_SORT_ORDER, SortOrder from pyiceberg.table.update import TableUpdate -from pyiceberg.typedef import EMPTY_DICT -from pyiceberg.typedef import Identifier -from pyiceberg.typedef import Properties +from pyiceberg.typedef import EMPTY_DICT, Identifier, Properties from pyiceberg.utils.properties import get_first_property_value if TYPE_CHECKING: @@ -66,9 +47,7 @@ def __init__(self, name: str, **properties: str): region_name=get_first_property_value(properties, S3TABLES_REGION, AWS_REGION), botocore_session=properties.get(DEPRECATED_BOTOCORE_SESSION), aws_access_key_id=get_first_property_value(properties, 
S3TABLES_ACCESS_KEY_ID, AWS_ACCESS_KEY_ID), - aws_secret_access_key=get_first_property_value( - properties, S3TABLES_SECRET_ACCESS_KEY, AWS_SECRET_ACCESS_KEY - ), + aws_secret_access_key=get_first_property_value(properties, S3TABLES_SECRET_ACCESS_KEY, AWS_SECRET_ACCESS_KEY), aws_session_token=get_first_property_value(properties, S3TABLES_SESSION_TOKEN, AWS_SESSION_TOKEN), ) # TODO: s3tables client only supported from boto3>=1.35.74 so this can crash @@ -91,16 +70,14 @@ def commit_table( # the table can change between those two API calls without noticing # -> change this into an internal API that returns table information along with versionToken current_table = self.load_table(identifier=table_identifier) - version_token = self.s3tables.get_table( - tableBucketARN=self.table_bucket_arn, namespace=database_name, name=table_name - )["versionToken"] + version_token = self.s3tables.get_table(tableBucketARN=self.table_bucket_arn, namespace=database_name, name=table_name)[ + "versionToken" + ] updated_staged_table = self._update_and_stage_table(current_table, table_identifier, requirements, updates) if current_table and updated_staged_table.metadata == current_table.metadata: # no changes, do nothing - return CommitTableResponse( - metadata=current_table.metadata, metadata_location=current_table.metadata_location - ) + return CommitTableResponse(metadata=current_table.metadata, metadata_location=current_table.metadata_location) self._write_metadata( metadata=updated_staged_table.metadata, @@ -175,9 +152,7 @@ def create_table( schema: Schema = self._convert_schema_if_needed(schema) # type: ignore # TODO: check whether namespace exists and if it does, whether table_name already exists - self.s3tables.create_table( - tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name, format="ICEBERG" - ) + self.s3tables.create_table(tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name, format="ICEBERG") # location is given by s3 table bucket 
response = self.s3tables.get_table(tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name) @@ -209,7 +184,6 @@ def create_table( return self.load_table(identifier=identifier) - def drop_namespace(self, namespace: Union[str, Identifier]) -> None: namespace = self._validate_namespace_identifier(namespace) try: @@ -220,9 +194,7 @@ def drop_namespace(self, namespace: Union[str, Identifier]) -> None: def drop_table(self, identifier: Union[str, Identifier]) -> None: namespace, table_name = self._validate_database_and_table_identifier(identifier) try: - response = self.s3tables.get_table( - tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name - ) + response = self.s3tables.get_table(tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name) except self.s3tables.exceptions.NotFoundException as e: raise NoSuchTableError(f"No table with identifier {identifier} exists.") from e diff --git a/pyiceberg/exceptions.py b/pyiceberg/exceptions.py index 6096096f45..f633e4c143 100644 --- a/pyiceberg/exceptions.py +++ b/pyiceberg/exceptions.py @@ -111,12 +111,15 @@ class ConditionalCheckFailedException(DynamoDbError): class GenericDynamoDbError(DynamoDbError): pass + class S3TablesError(Exception): pass + class TableBucketNotFound(S3TablesError): pass + class CommitFailedException(Exception): """Commit failed, refresh and try again.""" diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py index 5b7393b3a0..2cbdca15c3 100644 --- a/tests/catalog/test_s3tables.py +++ b/tests/catalog/test_s3tables.py @@ -4,8 +4,7 @@ import pytest from pyiceberg.catalog.s3tables import S3TableCatalog -from pyiceberg.exceptions import NoSuchTableError -from pyiceberg.exceptions import TableBucketNotFound +from pyiceberg.exceptions import NoSuchTableError, TableBucketNotFound from pyiceberg.schema import Schema from pyiceberg.types import IntegerType @@ -35,9 +34,7 @@ def table_bucket_arn(): def 
test_s3tables_api_raises_on_conflicting_version_tokens(table_bucket_arn, database_name, table_name): client = boto3.client("s3tables") client.create_namespace(tableBucketARN=table_bucket_arn, namespace=[database_name]) - response = client.create_table( - tableBucketARN=table_bucket_arn, namespace=database_name, name=table_name, format="ICEBERG" - ) + response = client.create_table(tableBucketARN=table_bucket_arn, namespace=database_name, name=table_name, format="ICEBERG") version_token = response["versionToken"] scrambled_version_token = version_token[::-1] From 7164e096bd1ed2e8a519cafb43fcd129bfca9a35 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Mon, 23 Dec 2024 14:19:27 +0100 Subject: [PATCH 20/57] feat: raise exceptions if boto3 doesn't support s3tables --- pyiceberg/catalog/s3tables.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index b4aa7283b5..b8009191b2 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -2,6 +2,7 @@ from typing import TYPE_CHECKING, List, Optional, Set, Tuple, Union import boto3 +from boto3.session import UnknownServiceError from pyiceberg.catalog import DEPRECATED_BOTOCORE_SESSION, WAREHOUSE_LOCATION, MetastoreCatalog, PropertiesUpdateSummary from pyiceberg.exceptions import ( @@ -9,6 +10,7 @@ NamespaceNotEmptyError, NoSuchTableError, TableBucketNotFound, + S3TablesError ) from pyiceberg.io import AWS_ACCESS_KEY_ID, AWS_REGION, AWS_SECRET_ACCESS_KEY, AWS_SESSION_TOKEN, PY_IO_IMPL, load_file_io from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionSpec @@ -50,9 +52,11 @@ def __init__(self, name: str, **properties: str): aws_secret_access_key=get_first_property_value(properties, S3TABLES_SECRET_ACCESS_KEY, AWS_SECRET_ACCESS_KEY), aws_session_token=get_first_property_value(properties, S3TABLES_SESSION_TOKEN, AWS_SESSION_TOKEN), ) - # TODO: s3tables client only supported from boto3>=1.35.74 so 
this can crash - # TODO: set a custom user-agent for api calls like the Java implementation - self.s3tables = session.client("s3tables") + try: + self.s3tables = session.client("s3tables") + except UnknownServiceError as e: + raise S3TablesError("'s3tables' requires boto3>=1.35.74. Current version: {boto3.__version__}.") from e + # TODO: handle malformed properties instead of just raising a key error here self.table_bucket_arn = self.properties[WAREHOUSE_LOCATION] try: From efa04e3d2e4aec65b17fe94a0d3474ed91630261 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Mon, 23 Dec 2024 14:24:55 +0100 Subject: [PATCH 21/57] feat: make endpoint configurable --- pyiceberg/catalog/s3tables.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index b8009191b2..ab3e77a016 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -53,7 +53,7 @@ def __init__(self, name: str, **properties: str): aws_session_token=get_first_property_value(properties, S3TABLES_SESSION_TOKEN, AWS_SESSION_TOKEN), ) try: - self.s3tables = session.client("s3tables") + self.s3tables = session.client("s3tables", endpoint_url=properties.get(S3TABLES_ENDPOINT)) except UnknownServiceError as e: raise S3TablesError("'s3tables' requires boto3>=1.35.74. 
Current version: {boto3.__version__}.") from e From f1454937d7d1220e021e250e456bbc725a54919c Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Mon, 23 Dec 2024 14:34:53 +0100 Subject: [PATCH 22/57] feat: explicitly configure tableBucketARN --- pyiceberg/catalog/s3tables.py | 6 ++-- tests/catalog/test_s3tables.py | 50 +++++++++++----------------------- 2 files changed, 20 insertions(+), 36 deletions(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index ab3e77a016..4675a9cf76 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -32,6 +32,8 @@ S3TABLES_SECRET_ACCESS_KEY = "s3tables.secret-access-key" S3TABLES_SESSION_TOKEN = "s3tables.session-token" +S3TABLES_TABLE_BUCKET_ARN = "s3tables.table-bucket-arn" + S3TABLES_ENDPOINT = "s3tables.endpoint" # pyarrow does not support writing to S3 Table buckets as of 2024-12-14 https://github.com/apache/iceberg-python/issues/1404#issuecomment-2543174146 @@ -44,6 +46,8 @@ def __init__(self, name: str, **properties: str): # TODO: implement a proper check for FileIO self.properties[PY_IO_IMPL] = S3TABLES_FILE_IO_DEFAULT + self.table_bucket_arn = self.properties[S3TABLES_TABLE_BUCKET_ARN] + session = boto3.Session( profile_name=properties.get(S3TABLES_PROFILE_NAME), region_name=get_first_property_value(properties, S3TABLES_REGION, AWS_REGION), @@ -57,8 +61,6 @@ def __init__(self, name: str, **properties: str): except UnknownServiceError as e: raise S3TablesError("'s3tables' requires boto3>=1.35.74. 
Current version: {boto3.__version__}.") from e - # TODO: handle malformed properties instead of just raising a key error here - self.table_bucket_arn = self.properties[WAREHOUSE_LOCATION] try: self.s3tables.get_table_bucket(tableBucketARN=self.table_bucket_arn) except self.s3tables.exceptions.NotFoundException as e: diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py index 2cbdca15c3..67cdeb21c8 100644 --- a/tests/catalog/test_s3tables.py +++ b/tests/catalog/test_s3tables.py @@ -30,6 +30,12 @@ def table_bucket_arn(): return os.environ["ARN"] +@pytest.fixture +def catalog(table_bucket_arn): + # setting FileIO to FsspecFileIO explicitly is required as pyarrow does not work with S3 Table Buckets yet + properties = {"s3tables.table-bucket-arn": table_bucket_arn, "py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO"} + return S3TableCatalog(name="test_s3tables_catalog", **properties) + def test_s3tables_api_raises_on_conflicting_version_tokens(table_bucket_arn, database_name, table_name): client = boto3.client("s3tables") @@ -54,39 +60,30 @@ def test_s3tables_api_raises_on_conflicting_version_tokens(table_bucket_arn, dat def test_creating_catalog_validates_s3_table_bucket_exists(table_bucket_arn): - properties = {"warehouse": f"{table_bucket_arn}-modified"} + properties = {"s3tables.table-bucket-arn": f"{table_bucket_arn}-modified"} with pytest.raises(TableBucketNotFound): S3TableCatalog(name="test_s3tables_catalog", **properties) -def test_create_namespace(table_bucket_arn, database_name: str): - properties = {"warehouse": table_bucket_arn} - catalog = S3TableCatalog(name="test_s3tables_catalog", **properties) +def test_create_namespace(catalog, database_name: str): catalog.create_namespace(namespace=database_name) namespaces = catalog.list_namespaces() assert (database_name,) in namespaces -def test_load_namespace_properties(table_bucket_arn, database_name: str): - properties = {"warehouse": table_bucket_arn} - catalog = 
S3TableCatalog(name="test_s3tables_catalog", **properties) +def test_load_namespace_properties(catalog, database_name: str): catalog.create_namespace(namespace=database_name) assert database_name in catalog.load_namespace_properties(database_name)["namespace"] -def test_drop_namespace(table_bucket_arn, database_name: str): - properties = {"warehouse": table_bucket_arn} - catalog = S3TableCatalog(name="test_s3tables_catalog", **properties) +def test_drop_namespace(catalog, database_name: str): catalog.create_namespace(namespace=database_name) assert (database_name,) in catalog.list_namespaces() catalog.drop_namespace(namespace=database_name) assert (database_name,) not in catalog.list_namespaces() -def test_create_table(table_bucket_arn, database_name: str, table_name: str, table_schema_nested: Schema): - # setting FileIO to FsspecFileIO explicitly is required as pyarrwo does not work with S3 Table Buckets yet - properties = {"warehouse": table_bucket_arn, "py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO"} - catalog = S3TableCatalog(name="test_s3tables_catalog", **properties) +def test_create_table(catalog, database_name: str, table_name: str, table_schema_nested: Schema): identifier = (database_name, table_name) catalog.create_namespace(namespace=database_name) @@ -95,10 +92,7 @@ def test_create_table(table_bucket_arn, database_name: str, table_name: str, tab assert table == catalog.load_table(identifier) -def test_table_exists(table_bucket_arn, database_name: str, table_name: str, table_schema_nested: Schema): - # setting FileIO to FsspecFileIO explicitly is required as pyarrwo does not work with S3 Table Buckets yet - properties = {"warehouse": table_bucket_arn, "py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO"} - catalog = S3TableCatalog(name="test_s3tables_catalog", **properties) +def test_table_exists(catalog, database_name: str, table_name: str, table_schema_nested: Schema): identifier = (database_name, table_name) 
catalog.create_namespace(namespace=database_name) @@ -107,10 +101,7 @@ def test_table_exists(table_bucket_arn, database_name: str, table_name: str, tab assert catalog.table_exists(identifier=identifier) -def test_rename_table(table_bucket_arn, database_name: str, table_name: str, table_schema_nested: Schema): - # setting FileIO to FsspecFileIO explicitly is required as pyarrwo does not work with S3 Table Buckets yet - properties = {"warehouse": table_bucket_arn, "py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO"} - catalog = S3TableCatalog(name="test_s3tables_catalog", **properties) +def test_rename_table(catalog, database_name: str, table_name: str, table_schema_nested: Schema): identifier = (database_name, table_name) catalog.create_namespace(namespace=database_name) @@ -126,10 +117,7 @@ def test_rename_table(table_bucket_arn, database_name: str, table_name: str, tab assert catalog.table_exists(identifier=to_identifier) -def test_list_tables(table_bucket_arn, database_name: str, table_name: str, table_schema_nested: Schema): - # setting FileIO to FsspecFileIO explicitly is required as pyarrwo does not work with S3 Table Buckets yet - properties = {"warehouse": table_bucket_arn, "py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO"} - catalog = S3TableCatalog(name="test_s3tables_catalog", **properties) +def test_list_tables(catalog, database_name: str, table_name: str, table_schema_nested: Schema): identifier = (database_name, table_name) catalog.create_namespace(namespace=database_name) @@ -138,10 +126,7 @@ def test_list_tables(table_bucket_arn, database_name: str, table_name: str, tabl assert catalog.list_tables(namespace=database_name) -def test_drop_table(table_bucket_arn, database_name: str, table_name: str, table_schema_nested: Schema): - # setting FileIO to FsspecFileIO explicitly is required as pyarrwo does not work with S3 Table Buckets yet - properties = {"warehouse": table_bucket_arn, "py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO"} - catalog = 
S3TableCatalog(name="test_s3tables_catalog", **properties) +def test_drop_table(catalog, database_name: str, table_name: str, table_schema_nested: Schema): identifier = (database_name, table_name) catalog.create_namespace(namespace=database_name) @@ -153,10 +138,7 @@ def test_drop_table(table_bucket_arn, database_name: str, table_name: str, table catalog.load_table(identifier=identifier) -def test_commit_table(table_bucket_arn, database_name: str, table_name: str, table_schema_nested: Schema): - # setting FileIO to FsspecFileIO explicitly is required as pyarrwo does not work with S3 Table Buckets yet - properties = {"warehouse": table_bucket_arn, "py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO"} - catalog = S3TableCatalog(name="test_s3tables_catalog", **properties) +def test_commit_table(catalog, database_name: str, table_name: str, table_schema_nested: Schema): identifier = (database_name, table_name) catalog.create_namespace(namespace=database_name) From 2592610fa7eb79ae7fba14a19f8d337004c32079 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Mon, 23 Dec 2024 14:37:04 +0100 Subject: [PATCH 23/57] fix: remove defaulting to FsspecIO --- pyiceberg/catalog/s3tables.py | 5 ----- tests/catalog/test_s3tables.py | 2 +- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index 4675a9cf76..b8d4ebf672 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -36,15 +36,10 @@ S3TABLES_ENDPOINT = "s3tables.endpoint" -# pyarrow does not support writing to S3 Table buckets as of 2024-12-14 https://github.com/apache/iceberg-python/issues/1404#issuecomment-2543174146 -S3TABLES_FILE_IO_DEFAULT = "pyiceberg.io.fsspec.FsspecFileIO" - class S3TableCatalog(MetastoreCatalog): def __init__(self, name: str, **properties: str): super().__init__(name, **properties) - # TODO: implement a proper check for FileIO - self.properties[PY_IO_IMPL] = S3TABLES_FILE_IO_DEFAULT self.table_bucket_arn = 
self.properties[S3TABLES_TABLE_BUCKET_ARN] diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py index 67cdeb21c8..c761ab79ad 100644 --- a/tests/catalog/test_s3tables.py +++ b/tests/catalog/test_s3tables.py @@ -32,7 +32,7 @@ def table_bucket_arn(): @pytest.fixture def catalog(table_bucket_arn): - # setting FileIO to FsspecFileIO explicitly is required as pyarrow does not work with S3 Table Buckets yet + # pyarrow does not support writing to S3 Table buckets as of 2024-12-14 https://github.com/apache/iceberg-python/issues/1404#issuecomment-2543174146 properties = {"s3tables.table-bucket-arn": table_bucket_arn, "py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO"} return S3TableCatalog(name="test_s3tables_catalog", **properties) From 798cfb3837645de134c30d27111cd51f1afa6778 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Mon, 23 Dec 2024 14:55:12 +0100 Subject: [PATCH 24/57] feat: raise exceptions for invalid namespace/table name --- pyiceberg/catalog/s3tables.py | 31 ++++++++++++------------------- pyiceberg/exceptions.py | 7 +++++++ 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index b8d4ebf672..7a3c56ae01 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -4,13 +4,15 @@ import boto3 from boto3.session import UnknownServiceError -from pyiceberg.catalog import DEPRECATED_BOTOCORE_SESSION, WAREHOUSE_LOCATION, MetastoreCatalog, PropertiesUpdateSummary +from pyiceberg.catalog import DEPRECATED_BOTOCORE_SESSION, MetastoreCatalog, PropertiesUpdateSummary from pyiceberg.exceptions import ( CommitFailedException, NamespaceNotEmptyError, NoSuchTableError, TableBucketNotFound, - S3TablesError + S3TablesError, + InvalidNamespaceName, + InvalidTableName, ) from pyiceberg.io import AWS_ACCESS_KEY_ID, AWS_REGION, AWS_SECRET_ACCESS_KEY, AWS_SESSION_TOKEN, PY_IO_IMPL, load_file_io from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, 
PartitionSpec @@ -36,6 +38,10 @@ S3TABLES_ENDPOINT = "s3tables.endpoint" +# for naming rules see: https://docs.aws.amazon.com/AmazonS3/latest/userguide/s3-tables-buckets-naming.html +S3TABLES_VALID_NAME_REGEX = pattern = re.compile("[a-z0-9][a-z0-9_]{2,62}") +S3TABLES_RESERVED_NAMESPACE = "aws_s3_metadata" + class S3TableCatalog(MetastoreCatalog): def __init__(self, name: str, **properties: str): @@ -109,33 +115,20 @@ def create_namespace(self, namespace: Union[str, Identifier], properties: Proper self.s3tables.create_namespace(tableBucketARN=self.table_bucket_arn, namespace=[valid_namespace]) def _validate_namespace_identifier(self, namespace: Union[str, Identifier]) -> str: - # for naming rules see: https://docs.aws.amazon.com/AmazonS3/latest/userguide/s3-tables-buckets-naming.html - # TODO: extract into constant variables - pattern = re.compile("[a-z0-9][a-z0-9_]{2,62}") - reserved = "aws_s3_metadata" - namespace = self.identifier_to_database(namespace) - if not pattern.fullmatch(namespace): - ... - - if namespace == reserved: - ... + if not S3TABLES_VALID_NAME_REGEX.fullmatch(namespace) or namespace == S3TABLES_RESERVED_NAMESPACE: + raise InvalidNamespaceName("The specified namespace name is not valid.") return namespace def _validate_database_and_table_identifier(self, identifier: Union[str, Identifier]) -> Tuple[str, str]: - # for naming rules see: https://docs.aws.amazon.com/AmazonS3/latest/userguide/s3-tables-buckets-naming.html - # TODO: extract into constant variables - pattern = re.compile("[a-z0-9][a-z0-9_]{2,62}") - namespace, table_name = self.identifier_to_database_and_table(identifier) namespace = self._validate_namespace_identifier(namespace) - if not pattern.fullmatch(table_name): - # TODO: raise proper errors for invalid table_name - ... 
+ if not S3TABLES_VALID_NAME_REGEX.fullmatch(table_name): + raise InvalidTableName("The specified table name is not valid.") return namespace, table_name diff --git a/pyiceberg/exceptions.py b/pyiceberg/exceptions.py index f633e4c143..245fffd3c1 100644 --- a/pyiceberg/exceptions.py +++ b/pyiceberg/exceptions.py @@ -115,6 +115,13 @@ class GenericDynamoDbError(DynamoDbError): class S3TablesError(Exception): pass +class InvalidNamespaceName(S3TablesError): + pass + + +class InvalidTableName(S3TablesError): + pass + class TableBucketNotFound(S3TablesError): pass From 3b402e4c17a5231826833189bef13305c42d750b Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Sun, 29 Dec 2024 14:48:10 +0100 Subject: [PATCH 25/57] feat: improve error handling for create_table --- pyiceberg/catalog/s3tables.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index 7a3c56ae01..d9ddeff5dc 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -164,17 +164,20 @@ def create_table( io = load_file_io(properties=self.properties, location=metadata_location) # TODO: this triggers unsupported list operation error, setting overwrite=True is a workaround for now - # TODO: we can perform this check manually maybe? self._write_metadata(metadata, io, metadata_location, overwrite=True) - # TODO: after writing need to update table metadata location - # can this operation fail if the version token does not match? 
- self.s3tables.update_table_metadata_location( - tableBucketARN=self.table_bucket_arn, - namespace=namespace, - name=table_name, - versionToken=version_token, - metadataLocation=metadata_location, - ) + + try: + self.s3tables.update_table_metadata_location( + tableBucketARN=self.table_bucket_arn, + namespace=namespace, + name=table_name, + versionToken=version_token, + metadataLocation=metadata_location, + ) + except self.s3tables.exceptions.ConflictException as e: + raise CommitFailedException( + f"Cannot create {namespace}.{table_name} because of a concurrent update to the table version {version_token}." + ) from e return self.load_table(identifier=identifier) From 138f62e5818dfe780adf74389fde046f5cadf8fe Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Sun, 29 Dec 2024 14:49:24 +0100 Subject: [PATCH 26/57] feat: improve error handling for delete_table --- pyiceberg/catalog/s3tables.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index d9ddeff5dc..92c8f78077 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -195,13 +195,18 @@ def drop_table(self, identifier: Union[str, Identifier]) -> None: except self.s3tables.exceptions.NotFoundException as e: raise NoSuchTableError(f"No table with identifier {identifier} exists.") from e - # TODO: handle conflicts due to versionToken mismatch that might occur - self.s3tables.delete_table( - tableBucketARN=self.table_bucket_arn, - namespace=namespace, - name=table_name, - versionToken=response["versionToken"], - ) + version_token = response["versionToken"] + try: + self.s3tables.delete_table( + tableBucketARN=self.table_bucket_arn, + namespace=namespace, + name=table_name, + versionToken=version_token, + ) + except self.s3tables.exceptions.ConflictException as e: + raise CommitFailedException( + f"Cannot delete {namespace}.{table_name} because of a concurrent update to the table version 
{version_token}." + ) from e def list_namespaces(self, namespace: Union[str, Identifier] = ()) -> List[Identifier]: # TODO: s3tables only support single level namespaces From 8d64396684abf81040f9ac9331559b58ff473c9b Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Sun, 29 Dec 2024 14:50:49 +0100 Subject: [PATCH 27/57] chore: cleanup comments --- pyiceberg/catalog/s3tables.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index 92c8f78077..b95f2ba8d2 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -209,7 +209,6 @@ def drop_table(self, identifier: Union[str, Identifier]) -> None: ) from e def list_namespaces(self, namespace: Union[str, Identifier] = ()) -> List[Identifier]: - # TODO: s3tables only support single level namespaces if namespace: namespace = self._validate_namespace_identifier(namespace) paginator = self.s3tables.get_paginator("list_namespaces") @@ -240,7 +239,7 @@ def load_namespace_properties(self, namespace: Union[str, Identifier]) -> Proper def load_table(self, identifier: Union[str, Identifier]) -> Table: namespace, table_name = self._validate_database_and_table_identifier(identifier) - # TODO: raise a NoSuchTableError if it does not exist + try: response = self.s3tables.get_table_metadata_location( tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name From 642ccd83c6ce841500eb6e70b340b2b70d5ce4e4 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Sun, 29 Dec 2024 14:55:16 +0100 Subject: [PATCH 28/57] feat: catch missing metadata for load_table --- pyiceberg/catalog/s3tables.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index b95f2ba8d2..077d93d7d0 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -246,8 +246,10 @@ def load_table(self, identifier: Union[str, Identifier]) -> Table: ) except 
self.s3tables.exceptions.NotFoundException as e: raise NoSuchTableError(f"No table with identifier {identifier} exists.") from e - # TODO: we might need to catch if table is not initialized i.e. does not have metadata setup yet - metadata_location = response["metadataLocation"] + + metadata_location = response.get("metadataLocation") + if not metadata_location: + raise S3TablesError("No table metadata found.") io = load_file_io(properties=self.properties, location=metadata_location) file = io.new_input(metadata_location) From 3c17fc0448c1e461592ac5a056c6dad562c9c4e3 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Sun, 29 Dec 2024 16:02:17 +0100 Subject: [PATCH 29/57] feat: handle missing namespace and preexisting table --- pyiceberg/catalog/s3tables.py | 15 ++++++++++++--- tests/catalog/test_s3tables.py | 19 ++++++++++++++++++- 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index 077d93d7d0..6ae2db3f32 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -13,6 +13,8 @@ S3TablesError, InvalidNamespaceName, InvalidTableName, + TableAlreadyExistsError, + NoSuchNamespaceError ) from pyiceberg.io import AWS_ACCESS_KEY_ID, AWS_REGION, AWS_SECRET_ACCESS_KEY, AWS_SESSION_TOKEN, PY_IO_IMPL, load_file_io from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionSpec @@ -145,10 +147,17 @@ def create_table( schema: Schema = self._convert_schema_if_needed(schema) # type: ignore - # TODO: check whether namespace exists and if it does, whether table_name already exists - self.s3tables.create_table(tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name, format="ICEBERG") + try: + self.s3tables.create_table(tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name, format="ICEBERG") + except self.s3tables.exceptions.NotFoundException as e: + raise NoSuchNamespaceError( + f"Cannot create {namespace}.{table_name} because no 
such namespace exists." + ) from e + except self.s3tables.exceptions.ConflictException as e: + raise TableAlreadyExistsError( + f"Cannot create {namespace}.{table_name} because a table of the same name already exists in the namespace." + ) from e - # location is given by s3 table bucket response = self.s3tables.get_table(tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name) version_token = response["versionToken"] diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py index c761ab79ad..cd764737f2 100644 --- a/tests/catalog/test_s3tables.py +++ b/tests/catalog/test_s3tables.py @@ -4,7 +4,7 @@ import pytest from pyiceberg.catalog.s3tables import S3TableCatalog -from pyiceberg.exceptions import NoSuchTableError, TableBucketNotFound +from pyiceberg.exceptions import NoSuchTableError, TableBucketNotFound, NoSuchNamespaceError from pyiceberg.schema import Schema from pyiceberg.types import IntegerType @@ -59,6 +59,14 @@ def test_s3tables_api_raises_on_conflicting_version_tokens(table_bucket_arn, dat ) +def test_s3tables_api_raises_on_preexisting_table(table_bucket_arn, database_name, table_name): + client = boto3.client("s3tables") + client.create_namespace(tableBucketARN=table_bucket_arn, namespace=[database_name]) + client.create_table(tableBucketARN=table_bucket_arn, namespace=database_name, name=table_name, format="ICEBERG") + with pytest.raises(client.exceptions.ConflictException): + client.create_table(tableBucketARN=table_bucket_arn, namespace=database_name, name=table_name, format="ICEBERG") + + def test_creating_catalog_validates_s3_table_bucket_exists(table_bucket_arn): properties = {"s3tables.table-bucket-arn": f"{table_bucket_arn}-modified"} with pytest.raises(TableBucketNotFound): @@ -92,6 +100,15 @@ def test_create_table(catalog, database_name: str, table_name: str, table_schema assert table == catalog.load_table(identifier) +def test_create_table_in_invalid_namespace_raises_exception(catalog, database_name: str, 
table_name: str, table_schema_nested: Schema): + identifier = (database_name, table_name) + + with pytest.raises(NoSuchNamespaceError): + catalog.create_table(identifier=identifier, schema=table_schema_nested) + + + + def test_table_exists(catalog, database_name: str, table_name: str, table_schema_nested: Schema): identifier = (database_name, table_name) From ec647fb068dac8d0977bad06e61ccb715bd7dc49 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Sun, 29 Dec 2024 16:12:21 +0100 Subject: [PATCH 30/57] feat: handle versionToken and table in an atomic operation --- pyiceberg/catalog/s3tables.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index 6ae2db3f32..e8bbebe960 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -75,13 +75,7 @@ def commit_table( table_identifier = table.name() database_name, table_name = self.identifier_to_database_and_table(table_identifier, NoSuchTableError) - # TODO: loading table and getting versionToken should be an atomic operation, otherwise - # the table can change between those two API calls without noticing - # -> change this into an internal API that returns table information along with versionToken - current_table = self.load_table(identifier=table_identifier) - version_token = self.s3tables.get_table(tableBucketARN=self.table_bucket_arn, namespace=database_name, name=table_name)[ - "versionToken" - ] + current_table, version_token = self._load_table_and_version(identifier=table_identifier) updated_staged_table = self._update_and_stage_table(current_table, table_identifier, requirements, updates) if current_table and updated_staged_table.metadata == current_table.metadata: @@ -172,7 +166,7 @@ def create_table( ) io = load_file_io(properties=self.properties, location=metadata_location) - # TODO: this triggers unsupported list operation error, setting overwrite=True is a workaround for now + # this 
triggers unsupported list operation error, setting overwrite=True is a workaround for now self._write_metadata(metadata, io, metadata_location, overwrite=True) try: @@ -247,6 +241,10 @@ def load_namespace_properties(self, namespace: Union[str, Identifier]) -> Proper } def load_table(self, identifier: Union[str, Identifier]) -> Table: + table, _ = self._load_table_and_version(identifier) + return table + + def _load_table_and_version(self, identifier: Union[str, Identifier]) -> Tuple[Table, str]: namespace, table_name = self._validate_database_and_table_identifier(identifier) try: @@ -260,6 +258,8 @@ def load_table(self, identifier: Union[str, Identifier]) -> Table: if not metadata_location: raise S3TablesError("No table metadata found.") + version_token = response["versionToken"] + io = load_file_io(properties=self.properties, location=metadata_location) file = io.new_input(metadata_location) metadata = FromInputFile.table_metadata(file) @@ -269,7 +269,7 @@ def load_table(self, identifier: Union[str, Identifier]) -> Table: metadata_location=metadata_location, io=self._load_file_io(metadata.properties, metadata_location), catalog=self, - ) + ), version_token def rename_table(self, from_identifier: Union[str, Identifier], to_identifier: Union[str, Identifier]) -> Table: from_namespace, from_table_name = self._validate_database_and_table_identifier(from_identifier) From 9396e7373a1a866fbc2bd811e12befa20f0705bc Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Sun, 29 Dec 2024 16:13:00 +0100 Subject: [PATCH 31/57] chore: run formatter --- pyiceberg/catalog/s3tables.py | 18 +++++++++--------- pyiceberg/exceptions.py | 1 + tests/catalog/test_s3tables.py | 9 +++++---- 3 files changed, 15 insertions(+), 13 deletions(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index e8bbebe960..85971d9c4b 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -7,16 +7,16 @@ from pyiceberg.catalog import 
DEPRECATED_BOTOCORE_SESSION, MetastoreCatalog, PropertiesUpdateSummary from pyiceberg.exceptions import ( CommitFailedException, + InvalidNamespaceName, + InvalidTableName, NamespaceNotEmptyError, + NoSuchNamespaceError, NoSuchTableError, - TableBucketNotFound, S3TablesError, - InvalidNamespaceName, - InvalidTableName, TableAlreadyExistsError, - NoSuchNamespaceError + TableBucketNotFound, ) -from pyiceberg.io import AWS_ACCESS_KEY_ID, AWS_REGION, AWS_SECRET_ACCESS_KEY, AWS_SESSION_TOKEN, PY_IO_IMPL, load_file_io +from pyiceberg.io import AWS_ACCESS_KEY_ID, AWS_REGION, AWS_SECRET_ACCESS_KEY, AWS_SESSION_TOKEN, load_file_io from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionSpec from pyiceberg.schema import Schema from pyiceberg.serializers import FromInputFile @@ -142,11 +142,11 @@ def create_table( schema: Schema = self._convert_schema_if_needed(schema) # type: ignore try: - self.s3tables.create_table(tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name, format="ICEBERG") + self.s3tables.create_table( + tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name, format="ICEBERG" + ) except self.s3tables.exceptions.NotFoundException as e: - raise NoSuchNamespaceError( - f"Cannot create {namespace}.{table_name} because no such namespace exists." - ) from e + raise NoSuchNamespaceError(f"Cannot create {namespace}.{table_name} because no such namespace exists.") from e except self.s3tables.exceptions.ConflictException as e: raise TableAlreadyExistsError( f"Cannot create {namespace}.{table_name} because a table of the same name already exists in the namespace." 
diff --git a/pyiceberg/exceptions.py b/pyiceberg/exceptions.py index 245fffd3c1..8367d91b79 100644 --- a/pyiceberg/exceptions.py +++ b/pyiceberg/exceptions.py @@ -115,6 +115,7 @@ class GenericDynamoDbError(DynamoDbError): class S3TablesError(Exception): pass + class InvalidNamespaceName(S3TablesError): pass diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py index cd764737f2..d379e73b82 100644 --- a/tests/catalog/test_s3tables.py +++ b/tests/catalog/test_s3tables.py @@ -4,7 +4,7 @@ import pytest from pyiceberg.catalog.s3tables import S3TableCatalog -from pyiceberg.exceptions import NoSuchTableError, TableBucketNotFound, NoSuchNamespaceError +from pyiceberg.exceptions import NoSuchNamespaceError, NoSuchTableError, TableBucketNotFound from pyiceberg.schema import Schema from pyiceberg.types import IntegerType @@ -30,6 +30,7 @@ def table_bucket_arn(): return os.environ["ARN"] + @pytest.fixture def catalog(table_bucket_arn): # pyarrow does not support writing to S3 Table buckets as of 2024-12-14 https://github.com/apache/iceberg-python/issues/1404#issuecomment-2543174146 @@ -100,15 +101,15 @@ def test_create_table(catalog, database_name: str, table_name: str, table_schema assert table == catalog.load_table(identifier) -def test_create_table_in_invalid_namespace_raises_exception(catalog, database_name: str, table_name: str, table_schema_nested: Schema): +def test_create_table_in_invalid_namespace_raises_exception( + catalog, database_name: str, table_name: str, table_schema_nested: Schema +): identifier = (database_name, table_name) with pytest.raises(NoSuchNamespaceError): catalog.create_table(identifier=identifier, schema=table_schema_nested) - - def test_table_exists(catalog, database_name: str, table_name: str, table_schema_nested: Schema): identifier = (database_name, table_name) From 11f6f66c1f7fa528451b374d06e20323e4e779e5 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Sun, 29 Dec 2024 16:17:09 +0100 Subject: [PATCH 32/57] chore: add 
type hints for tests --- pyiceberg/catalog/s3tables.py | 15 +++++++------- tests/catalog/test_s3tables.py | 36 +++++++++++++++++----------------- 2 files changed, 25 insertions(+), 26 deletions(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index 85971d9c4b..fae82b0ee8 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -2,7 +2,6 @@ from typing import TYPE_CHECKING, List, Optional, Set, Tuple, Union import boto3 -from boto3.session import UnknownServiceError from pyiceberg.catalog import DEPRECATED_BOTOCORE_SESSION, MetastoreCatalog, PropertiesUpdateSummary from pyiceberg.exceptions import ( @@ -20,10 +19,10 @@ from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionSpec from pyiceberg.schema import Schema from pyiceberg.serializers import FromInputFile -from pyiceberg.table import CommitTableResponse, Table, TableRequirement +from pyiceberg.table import CommitTableResponse, Table from pyiceberg.table.metadata import new_table_metadata from pyiceberg.table.sorting import UNSORTED_SORT_ORDER, SortOrder -from pyiceberg.table.update import TableUpdate +from pyiceberg.table.update import TableRequirement, TableUpdate from pyiceberg.typedef import EMPTY_DICT, Identifier, Properties from pyiceberg.utils.properties import get_first_property_value @@ -61,7 +60,7 @@ def __init__(self, name: str, **properties: str): ) try: self.s3tables = session.client("s3tables", endpoint_url=properties.get(S3TABLES_ENDPOINT)) - except UnknownServiceError as e: + except boto3.session.UnknownServiceError as e: raise S3TablesError("'s3tables' requires boto3>=1.35.74. Current version: {boto3.__version__}.") from e try: @@ -106,7 +105,7 @@ def commit_table( metadata=updated_staged_table.metadata, metadata_location=updated_staged_table.metadata_location ) - def create_namespace(self, namespace: Union[str, Identifier], properties: Properties = ...) 
-> None: + def create_namespace(self, namespace: Union[str, Identifier], properties: Properties = EMPTY_DICT) -> None: valid_namespace: str = self._validate_namespace_identifier(namespace) self.s3tables.create_namespace(tableBucketARN=self.table_bucket_arn, namespace=[valid_namespace]) @@ -155,10 +154,10 @@ def create_table( response = self.s3tables.get_table(tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name) version_token = response["versionToken"] - location = response["warehouseLocation"] - metadata_location = self._get_metadata_location(location=location) + warehouse_location = response["warehouseLocation"] + metadata_location = self._get_metadata_location(location=warehouse_location) metadata = new_table_metadata( - location=location, + location=warehouse_location, schema=schema, partition_spec=partition_spec, sort_order=sort_order, diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py index d379e73b82..101eae485e 100644 --- a/tests/catalog/test_s3tables.py +++ b/tests/catalog/test_s3tables.py @@ -10,19 +10,19 @@ @pytest.fixture -def database_name(database_name): +def database_name(database_name: str) -> str: # naming rules prevent "-" in namespaces for s3 table buckets return database_name.replace("-", "_") @pytest.fixture -def table_name(table_name): +def table_name(table_name: str) -> str: # naming rules prevent "-" in table namees for s3 table buckets return table_name.replace("-", "_") @pytest.fixture -def table_bucket_arn(): +def table_bucket_arn() -> str: import os # since the moto library does not support s3tables as of 2024-12-14 we have to test against a real AWS endpoint @@ -32,13 +32,13 @@ def table_bucket_arn(): @pytest.fixture -def catalog(table_bucket_arn): +def catalog(table_bucket_arn: str) -> S3TableCatalog: # pyarrow does not support writing to S3 Table buckets as of 2024-12-14 https://github.com/apache/iceberg-python/issues/1404#issuecomment-2543174146 properties = 
{"s3tables.table-bucket-arn": table_bucket_arn, "py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO"} return S3TableCatalog(name="test_s3tables_catalog", **properties) -def test_s3tables_api_raises_on_conflicting_version_tokens(table_bucket_arn, database_name, table_name): +def test_s3tables_api_raises_on_conflicting_version_tokens(table_bucket_arn: str, database_name: str, table_name: str) -> None: client = boto3.client("s3tables") client.create_namespace(tableBucketARN=table_bucket_arn, namespace=[database_name]) response = client.create_table(tableBucketARN=table_bucket_arn, namespace=database_name, name=table_name, format="ICEBERG") @@ -60,7 +60,7 @@ def test_s3tables_api_raises_on_conflicting_version_tokens(table_bucket_arn, dat ) -def test_s3tables_api_raises_on_preexisting_table(table_bucket_arn, database_name, table_name): +def test_s3tables_api_raises_on_preexisting_table(table_bucket_arn: str, database_name: str, table_name: str) -> None: client = boto3.client("s3tables") client.create_namespace(tableBucketARN=table_bucket_arn, namespace=[database_name]) client.create_table(tableBucketARN=table_bucket_arn, namespace=database_name, name=table_name, format="ICEBERG") @@ -68,31 +68,31 @@ def test_s3tables_api_raises_on_preexisting_table(table_bucket_arn, database_nam client.create_table(tableBucketARN=table_bucket_arn, namespace=database_name, name=table_name, format="ICEBERG") -def test_creating_catalog_validates_s3_table_bucket_exists(table_bucket_arn): +def test_creating_catalog_validates_s3_table_bucket_exists(table_bucket_arn: str) -> None: properties = {"s3tables.table-bucket-arn": f"{table_bucket_arn}-modified"} with pytest.raises(TableBucketNotFound): S3TableCatalog(name="test_s3tables_catalog", **properties) -def test_create_namespace(catalog, database_name: str): +def test_create_namespace(catalog: S3TableCatalog, database_name: str) -> None: catalog.create_namespace(namespace=database_name) namespaces = catalog.list_namespaces() assert 
(database_name,) in namespaces -def test_load_namespace_properties(catalog, database_name: str): +def test_load_namespace_properties(catalog: S3TableCatalog, database_name: str) -> None: catalog.create_namespace(namespace=database_name) assert database_name in catalog.load_namespace_properties(database_name)["namespace"] -def test_drop_namespace(catalog, database_name: str): +def test_drop_namespace(catalog: S3TableCatalog, database_name: str) -> None: catalog.create_namespace(namespace=database_name) assert (database_name,) in catalog.list_namespaces() catalog.drop_namespace(namespace=database_name) assert (database_name,) not in catalog.list_namespaces() -def test_create_table(catalog, database_name: str, table_name: str, table_schema_nested: Schema): +def test_create_table(catalog: S3TableCatalog, database_name: str, table_name: str, table_schema_nested: Schema) -> None: identifier = (database_name, table_name) catalog.create_namespace(namespace=database_name) @@ -102,15 +102,15 @@ def test_create_table(catalog, database_name: str, table_name: str, table_schema def test_create_table_in_invalid_namespace_raises_exception( - catalog, database_name: str, table_name: str, table_schema_nested: Schema -): + catalog: S3TableCatalog, database_name: str, table_name: str, table_schema_nested: Schema +) -> None: identifier = (database_name, table_name) with pytest.raises(NoSuchNamespaceError): catalog.create_table(identifier=identifier, schema=table_schema_nested) -def test_table_exists(catalog, database_name: str, table_name: str, table_schema_nested: Schema): +def test_table_exists(catalog: S3TableCatalog, database_name: str, table_name: str, table_schema_nested: Schema) -> None: identifier = (database_name, table_name) catalog.create_namespace(namespace=database_name) @@ -119,7 +119,7 @@ def test_table_exists(catalog, database_name: str, table_name: str, table_schema assert catalog.table_exists(identifier=identifier) -def test_rename_table(catalog, database_name: str, 
table_name: str, table_schema_nested: Schema): +def test_rename_table(catalog: S3TableCatalog, database_name: str, table_name: str, table_schema_nested: Schema) -> None: identifier = (database_name, table_name) catalog.create_namespace(namespace=database_name) @@ -135,7 +135,7 @@ def test_rename_table(catalog, database_name: str, table_name: str, table_schema assert catalog.table_exists(identifier=to_identifier) -def test_list_tables(catalog, database_name: str, table_name: str, table_schema_nested: Schema): +def test_list_tables(catalog: S3TableCatalog, database_name: str, table_name: str, table_schema_nested: Schema) -> None: identifier = (database_name, table_name) catalog.create_namespace(namespace=database_name) @@ -144,7 +144,7 @@ def test_list_tables(catalog, database_name: str, table_name: str, table_schema_ assert catalog.list_tables(namespace=database_name) -def test_drop_table(catalog, database_name: str, table_name: str, table_schema_nested: Schema): +def test_drop_table(catalog: S3TableCatalog, database_name: str, table_name: str, table_schema_nested: Schema) -> None: identifier = (database_name, table_name) catalog.create_namespace(namespace=database_name) @@ -156,7 +156,7 @@ def test_drop_table(catalog, database_name: str, table_name: str, table_schema_n catalog.load_table(identifier=identifier) -def test_commit_table(catalog, database_name: str, table_name: str, table_schema_nested: Schema): +def test_commit_table(catalog: S3TableCatalog, database_name: str, table_name: str, table_schema_nested: Schema) -> None: identifier = (database_name, table_name) catalog.create_namespace(namespace=database_name) From 92fad8e70975d1cb98785f35a522ed0b374d9c9c Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Sat, 4 Jan 2025 13:29:37 +0100 Subject: [PATCH 33/57] fix: no longer enforce FsspecFileIO --- tests/catalog/test_s3tables.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py 
index 101eae485e..76663de25a 100644 --- a/tests/catalog/test_s3tables.py +++ b/tests/catalog/test_s3tables.py @@ -33,8 +33,7 @@ def table_bucket_arn() -> str: @pytest.fixture def catalog(table_bucket_arn: str) -> S3TableCatalog: - # pyarrow does not support writing to S3 Table buckets as of 2024-12-14 https://github.com/apache/iceberg-python/issues/1404#issuecomment-2543174146 - properties = {"s3tables.table-bucket-arn": table_bucket_arn, "py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO"} + properties = {"s3tables.table-bucket-arn": table_bucket_arn} return S3TableCatalog(name="test_s3tables_catalog", **properties) From cb23a72ed21947f4f6666ea2d6ebf859c9ed47eb Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Sat, 4 Jan 2025 13:37:01 +0100 Subject: [PATCH 34/57] test: remove tests for boto3 behavior --- tests/catalog/test_s3tables.py | 30 ------------------------------ 1 file changed, 30 deletions(-) diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py index 76663de25a..d9f05df430 100644 --- a/tests/catalog/test_s3tables.py +++ b/tests/catalog/test_s3tables.py @@ -37,36 +37,6 @@ def catalog(table_bucket_arn: str) -> S3TableCatalog: return S3TableCatalog(name="test_s3tables_catalog", **properties) -def test_s3tables_api_raises_on_conflicting_version_tokens(table_bucket_arn: str, database_name: str, table_name: str) -> None: - client = boto3.client("s3tables") - client.create_namespace(tableBucketARN=table_bucket_arn, namespace=[database_name]) - response = client.create_table(tableBucketARN=table_bucket_arn, namespace=database_name, name=table_name, format="ICEBERG") - version_token = response["versionToken"] - scrambled_version_token = version_token[::-1] - - warehouse_location = client.get_table(tableBucketARN=table_bucket_arn, namespace=database_name, name=table_name)[ - "warehouseLocation" - ] - metadata_location = f"{warehouse_location}/metadata/00001-{uuid.uuid4()}.metadata.json" - - with 
pytest.raises(client.exceptions.ConflictException): - client.update_table_metadata_location( - tableBucketARN=table_bucket_arn, - namespace=database_name, - name=table_name, - versionToken=scrambled_version_token, - metadataLocation=metadata_location, - ) - - -def test_s3tables_api_raises_on_preexisting_table(table_bucket_arn: str, database_name: str, table_name: str) -> None: - client = boto3.client("s3tables") - client.create_namespace(tableBucketARN=table_bucket_arn, namespace=[database_name]) - client.create_table(tableBucketARN=table_bucket_arn, namespace=database_name, name=table_name, format="ICEBERG") - with pytest.raises(client.exceptions.ConflictException): - client.create_table(tableBucketARN=table_bucket_arn, namespace=database_name, name=table_name, format="ICEBERG") - - def test_creating_catalog_validates_s3_table_bucket_exists(table_bucket_arn: str) -> None: properties = {"s3tables.table-bucket-arn": f"{table_bucket_arn}-modified"} with pytest.raises(TableBucketNotFound): From 5251ffa57493a0c243ee3ff4ccdc560fac165d0f Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Sat, 4 Jan 2025 13:44:14 +0100 Subject: [PATCH 35/57] test: verify column was created on commit --- tests/catalog/test_s3tables.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py index d9f05df430..15d9ba57ed 100644 --- a/tests/catalog/test_s3tables.py +++ b/tests/catalog/test_s3tables.py @@ -147,3 +147,4 @@ def test_commit_table(catalog: S3TableCatalog, database_name: str, table_name: s assert updated_table_metadata.last_updated_ms > last_updated_ms assert updated_table_metadata.metadata_log[0].metadata_file == original_table_metadata_location assert updated_table_metadata.metadata_log[0].timestamp_ms == original_table_last_updated_ms + assert table.schema().columns[-1].name == "b" From b544b5073ca9d0a9ebc790f7158d70cec6457c2f Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Sat, 4 Jan 2025 13:57:59 +0100 Subject: [PATCH 
36/57] test: verify new data can be committed to table --- tests/catalog/test_s3tables.py | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py index 15d9ba57ed..f97371540b 100644 --- a/tests/catalog/test_s3tables.py +++ b/tests/catalog/test_s3tables.py @@ -1,6 +1,3 @@ -import uuid - -import boto3 import pytest from pyiceberg.catalog.s3tables import S3TableCatalog @@ -125,7 +122,9 @@ def test_drop_table(catalog: S3TableCatalog, database_name: str, table_name: str catalog.load_table(identifier=identifier) -def test_commit_table(catalog: S3TableCatalog, database_name: str, table_name: str, table_schema_nested: Schema) -> None: +def test_commit_new_column_to_table( + catalog: S3TableCatalog, database_name: str, table_name: str, table_schema_nested: Schema +) -> None: identifier = (database_name, table_name) catalog.create_namespace(namespace=database_name) @@ -148,3 +147,27 @@ def test_commit_table(catalog: S3TableCatalog, database_name: str, table_name: s assert updated_table_metadata.metadata_log[0].metadata_file == original_table_metadata_location assert updated_table_metadata.metadata_log[0].timestamp_ms == original_table_last_updated_ms assert table.schema().columns[-1].name == "b" + + +def test_commit_new_data_to_table( + catalog: S3TableCatalog, database_name: str, table_name: str, table_schema_nested: Schema +) -> None: + identifier = (database_name, table_name) + + catalog.create_namespace(namespace=database_name) + table = catalog.create_table(identifier=identifier, schema=table_schema_nested) + + row_count = table.scan().to_arrow().num_rows + last_updated_ms = table.metadata.last_updated_ms + original_table_metadata_location = table.metadata_location + original_table_last_updated_ms = table.metadata.last_updated_ms + + transaction = table.transaction() + transaction.append(table.scan().to_arrow()) + transaction.commit_transaction() + + 
updated_table_metadata = table.metadata + assert updated_table_metadata.last_updated_ms > last_updated_ms + assert updated_table_metadata.metadata_log[0].metadata_file == original_table_metadata_location + assert updated_table_metadata.metadata_log[0].timestamp_ms == original_table_last_updated_ms + assert table.scan().to_arrow().num_rows == 2 * row_count From a02678a37a738d2bf52efd76be1a2de9155abcd9 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Sun, 5 Jan 2025 17:11:33 +0100 Subject: [PATCH 37/57] docs: update documentation for create_table --- pyiceberg/catalog/s3tables.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index fae82b0ee8..0485bb3072 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -140,6 +140,9 @@ def create_table( schema: Schema = self._convert_schema_if_needed(schema) # type: ignore + # creating a new table with S3 Tables is a two step process. We first have to create an S3 Table with the + # S3 Tables API and then write the new metadata.json to the warehouseLocaiton associated with the newly + # created S3 Table. 
try: self.s3tables.create_table( tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name, format="ICEBERG" @@ -165,7 +168,8 @@ def create_table( ) io = load_file_io(properties=self.properties, location=metadata_location) - # this triggers unsupported list operation error, setting overwrite=True is a workaround for now + # this triggers unsupported list operation error as S3 Table Buckets only support a subset of the S3 Bucket API, + # setting overwrite=True is a workaround for now since it prevents a call to list_objects self._write_metadata(metadata, io, metadata_location, overwrite=True) try: From 1facfe18962cd5d43f2a2a07c6a247fb6e529762 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Sun, 5 Jan 2025 17:25:29 +0100 Subject: [PATCH 38/57] test: set AWS regions explicitly --- tests/catalog/test_s3tables.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py index f97371540b..930b11ec9a 100644 --- a/tests/catalog/test_s3tables.py +++ b/tests/catalog/test_s3tables.py @@ -30,12 +30,12 @@ def table_bucket_arn() -> str: @pytest.fixture def catalog(table_bucket_arn: str) -> S3TableCatalog: - properties = {"s3tables.table-bucket-arn": table_bucket_arn} + properties = {"s3tables.table-bucket-arn": table_bucket_arn, "s3tables.region": "us-east-1", "s3.region": "us-east-1"} return S3TableCatalog(name="test_s3tables_catalog", **properties) def test_creating_catalog_validates_s3_table_bucket_exists(table_bucket_arn: str) -> None: - properties = {"s3tables.table-bucket-arn": f"{table_bucket_arn}-modified"} + properties = {"s3tables.table-bucket-arn": f"{table_bucket_arn}-modified", "s3tables.region": "us-east-1"} with pytest.raises(TableBucketNotFound): S3TableCatalog(name="test_s3tables_catalog", **properties) From 096ab6f8ad3875e06badf610f9f7be7471a116f7 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Mon, 6 Jan 2025 17:19:43 +0100 Subject: [PATCH 39/57] Apply suggestions 
from code review Co-authored-by: Kevin Liu --- pyiceberg/catalog/s3tables.py | 2 +- tests/catalog/test_s3tables.py | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index 0485bb3072..1408322ea7 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -141,7 +141,7 @@ def create_table( schema: Schema = self._convert_schema_if_needed(schema) # type: ignore # creating a new table with S3 Tables is a two step process. We first have to create an S3 Table with the - # S3 Tables API and then write the new metadata.json to the warehouseLocaiton associated with the newly + # S3 Tables API and then write the new metadata.json to the warehouseLocation associated with the newly # created S3 Table. try: self.s3tables.create_table( diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py index 930b11ec9a..d9f48e363a 100644 --- a/tests/catalog/test_s3tables.py +++ b/tests/catalog/test_s3tables.py @@ -26,11 +26,15 @@ def table_bucket_arn() -> str: # in one of the supported regions. 
return os.environ["ARN"] +@pytest.fixture +def aws_region() -> str: + import os + return os.environ["AWS_REGION"] @pytest.fixture -def catalog(table_bucket_arn: str) -> S3TableCatalog: - properties = {"s3tables.table-bucket-arn": table_bucket_arn, "s3tables.region": "us-east-1", "s3.region": "us-east-1"} +def catalog(table_bucket_arn: str, aws_region: str) -> S3TableCatalog: + properties = {"s3tables.table-bucket-arn": table_bucket_arn, "s3tables.region": aws_region} return S3TableCatalog(name="test_s3tables_catalog", **properties) From 4fd1d61e76e1f9c156c83f3c749fe217589c6153 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Mon, 6 Jan 2025 17:52:34 +0100 Subject: [PATCH 40/57] test: commit new data to table --- tests/catalog/test_s3tables.py | 62 ++++++++++++++++++++++++++++++---- 1 file changed, 56 insertions(+), 6 deletions(-) diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py index d9f48e363a..d1f6e6ffd1 100644 --- a/tests/catalog/test_s3tables.py +++ b/tests/catalog/test_s3tables.py @@ -26,12 +26,15 @@ def table_bucket_arn() -> str: # in one of the supported regions. 
return os.environ["ARN"] + + @pytest.fixture def aws_region() -> str: import os return os.environ["AWS_REGION"] + @pytest.fixture def catalog(table_bucket_arn: str, aws_region: str) -> S3TableCatalog: properties = {"s3tables.table-bucket-arn": table_bucket_arn, "s3tables.region": aws_region} @@ -153,15 +156,62 @@ def test_commit_new_column_to_table( assert table.schema().columns[-1].name == "b" -def test_commit_new_data_to_table( - catalog: S3TableCatalog, database_name: str, table_name: str, table_schema_nested: Schema -) -> None: +def test_write_pyarrow_table(catalog: S3TableCatalog, database_name: str, table_name: str) -> None: identifier = (database_name, table_name) + catalog.create_namespace(namespace=database_name) + import pyarrow as pa + + pyarrow_table = pa.Table.from_arrays( + [ + pa.array([None, "A", "B", "C"]), # 'foo' column + pa.array([1, 2, 3, 4]), # 'bar' column + pa.array([True, None, False, True]), # 'baz' column + pa.array([None, "A", "B", "C"]), # 'large' column + ], + schema=pa.schema( + [ + pa.field("foo", pa.large_string(), nullable=True), + pa.field("bar", pa.int32(), nullable=False), + pa.field("baz", pa.bool_(), nullable=True), + pa.field("large", pa.large_string(), nullable=True), + ] + ), + ) + table = catalog.create_table(identifier=identifier, schema=pyarrow_table.schema) + table.append(pyarrow_table) + + assert table.scan().to_arrow().num_rows == pyarrow_table.num_rows + + +def test_commit_new_data_to_table(catalog: S3TableCatalog, database_name: str, table_name: str) -> None: + identifier = (database_name, table_name) catalog.create_namespace(namespace=database_name) - table = catalog.create_table(identifier=identifier, schema=table_schema_nested) + + import pyarrow as pa + + pyarrow_table = pa.Table.from_arrays( + [ + pa.array([None, "A", "B", "C"]), # 'foo' column + pa.array([1, 2, 3, 4]), # 'bar' column + pa.array([True, None, False, True]), # 'baz' column + pa.array([None, "A", "B", "C"]), # 'large' column + ], + 
schema=pa.schema( + [ + pa.field("foo", pa.large_string(), nullable=True), + pa.field("bar", pa.int32(), nullable=False), + pa.field("baz", pa.bool_(), nullable=True), + pa.field("large", pa.large_string(), nullable=True), + ] + ), + ) + + table = catalog.create_table(identifier=identifier, schema=pyarrow_table.schema) + table.append(pyarrow_table) row_count = table.scan().to_arrow().num_rows + assert row_count last_updated_ms = table.metadata.last_updated_ms original_table_metadata_location = table.metadata_location original_table_last_updated_ms = table.metadata.last_updated_ms @@ -172,6 +222,6 @@ def test_commit_new_data_to_table( updated_table_metadata = table.metadata assert updated_table_metadata.last_updated_ms > last_updated_ms - assert updated_table_metadata.metadata_log[0].metadata_file == original_table_metadata_location - assert updated_table_metadata.metadata_log[0].timestamp_ms == original_table_last_updated_ms + assert updated_table_metadata.metadata_log[-1].metadata_file == original_table_metadata_location + assert updated_table_metadata.metadata_log[-1].timestamp_ms == original_table_last_updated_ms assert table.scan().to_arrow().num_rows == 2 * row_count From f7b960aa5a30bc3e6d86260b576ae85d37737681 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Mon, 6 Jan 2025 17:56:05 +0100 Subject: [PATCH 41/57] feat: clarify update_namespace_properties error --- pyiceberg/catalog/s3tables.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index 1408322ea7..01c42f9e3b 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -305,7 +305,7 @@ def update_namespace_properties( self, namespace: Union[str, Identifier], removals: Optional[Set[str]] = None, updates: Properties = EMPTY_DICT ) -> PropertiesUpdateSummary: # namespace properties are read only - raise NotImplementedError + raise NotImplementedError("Namespace properties are read only") def 
purge_table(self, identifier: Union[str, Identifier]) -> None: # purge is not supported as s3tables doesn't support delete operations From c72164bcd523ca3f83b2dd6fb9763d07fc8586ff Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Mon, 6 Jan 2025 17:59:55 +0100 Subject: [PATCH 42/57] feat: raise error when setting custom namespace properties --- pyiceberg/catalog/s3tables.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index 01c42f9e3b..d5f5bbc305 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -106,6 +106,8 @@ def commit_table( ) def create_namespace(self, namespace: Union[str, Identifier], properties: Properties = EMPTY_DICT) -> None: + if properties: + raise NotImplementedError("Setting namespace properties is not supported.") valid_namespace: str = self._validate_namespace_identifier(namespace) self.s3tables.create_namespace(tableBucketARN=self.table_bucket_arn, namespace=[valid_namespace]) @@ -305,7 +307,7 @@ def update_namespace_properties( self, namespace: Union[str, Identifier], removals: Optional[Set[str]] = None, updates: Properties = EMPTY_DICT ) -> PropertiesUpdateSummary: # namespace properties are read only - raise NotImplementedError("Namespace properties are read only") + raise NotImplementedError("Namespace properties are read only.") def purge_table(self, identifier: Union[str, Identifier]) -> None: # purge is not supported as s3tables doesn't support delete operations From c90bf3165a343beb604da3405f1281e16b061caf Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Tue, 7 Jan 2025 08:57:31 +0100 Subject: [PATCH 43/57] refactor: change S3TableCatalog -> S3TablesCatalog --- pyiceberg/catalog/s3tables.py | 2 +- tests/catalog/test_s3tables.py | 32 ++++++++++++++++---------------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index d5f5bbc305..ed6a8027f2 
100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -44,7 +44,7 @@ S3TABLES_RESERVED_NAMESPACE = "aws_s3_metadata" -class S3TableCatalog(MetastoreCatalog): +class S3TablesCatalog(MetastoreCatalog): def __init__(self, name: str, **properties: str): super().__init__(name, **properties) diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py index d1f6e6ffd1..c5cf959d4a 100644 --- a/tests/catalog/test_s3tables.py +++ b/tests/catalog/test_s3tables.py @@ -1,6 +1,6 @@ import pytest -from pyiceberg.catalog.s3tables import S3TableCatalog +from pyiceberg.catalog.s3tables import S3TablesCatalog from pyiceberg.exceptions import NoSuchNamespaceError, NoSuchTableError, TableBucketNotFound from pyiceberg.schema import Schema from pyiceberg.types import IntegerType @@ -36,36 +36,36 @@ def aws_region() -> str: @pytest.fixture -def catalog(table_bucket_arn: str, aws_region: str) -> S3TableCatalog: +def catalog(table_bucket_arn: str, aws_region: str) -> S3TablesCatalog: properties = {"s3tables.table-bucket-arn": table_bucket_arn, "s3tables.region": aws_region} - return S3TableCatalog(name="test_s3tables_catalog", **properties) + return S3TablesCatalog(name="test_s3tables_catalog", **properties) def test_creating_catalog_validates_s3_table_bucket_exists(table_bucket_arn: str) -> None: properties = {"s3tables.table-bucket-arn": f"{table_bucket_arn}-modified", "s3tables.region": "us-east-1"} with pytest.raises(TableBucketNotFound): - S3TableCatalog(name="test_s3tables_catalog", **properties) + S3TablesCatalog(name="test_s3tables_catalog", **properties) -def test_create_namespace(catalog: S3TableCatalog, database_name: str) -> None: +def test_create_namespace(catalog: S3TablesCatalog, database_name: str) -> None: catalog.create_namespace(namespace=database_name) namespaces = catalog.list_namespaces() assert (database_name,) in namespaces -def test_load_namespace_properties(catalog: S3TableCatalog, database_name: str) -> None: +def 
test_load_namespace_properties(catalog: S3TablesCatalog, database_name: str) -> None: catalog.create_namespace(namespace=database_name) assert database_name in catalog.load_namespace_properties(database_name)["namespace"] -def test_drop_namespace(catalog: S3TableCatalog, database_name: str) -> None: +def test_drop_namespace(catalog: S3TablesCatalog, database_name: str) -> None: catalog.create_namespace(namespace=database_name) assert (database_name,) in catalog.list_namespaces() catalog.drop_namespace(namespace=database_name) assert (database_name,) not in catalog.list_namespaces() -def test_create_table(catalog: S3TableCatalog, database_name: str, table_name: str, table_schema_nested: Schema) -> None: +def test_create_table(catalog: S3TablesCatalog, database_name: str, table_name: str, table_schema_nested: Schema) -> None: identifier = (database_name, table_name) catalog.create_namespace(namespace=database_name) @@ -75,7 +75,7 @@ def test_create_table(catalog: S3TableCatalog, database_name: str, table_name: s def test_create_table_in_invalid_namespace_raises_exception( - catalog: S3TableCatalog, database_name: str, table_name: str, table_schema_nested: Schema + catalog: S3TablesCatalog, database_name: str, table_name: str, table_schema_nested: Schema ) -> None: identifier = (database_name, table_name) @@ -83,7 +83,7 @@ def test_create_table_in_invalid_namespace_raises_exception( catalog.create_table(identifier=identifier, schema=table_schema_nested) -def test_table_exists(catalog: S3TableCatalog, database_name: str, table_name: str, table_schema_nested: Schema) -> None: +def test_table_exists(catalog: S3TablesCatalog, database_name: str, table_name: str, table_schema_nested: Schema) -> None: identifier = (database_name, table_name) catalog.create_namespace(namespace=database_name) @@ -92,7 +92,7 @@ def test_table_exists(catalog: S3TableCatalog, database_name: str, table_name: s assert catalog.table_exists(identifier=identifier) -def test_rename_table(catalog: 
S3TableCatalog, database_name: str, table_name: str, table_schema_nested: Schema) -> None: +def test_rename_table(catalog: S3TablesCatalog, database_name: str, table_name: str, table_schema_nested: Schema) -> None: identifier = (database_name, table_name) catalog.create_namespace(namespace=database_name) @@ -108,7 +108,7 @@ def test_rename_table(catalog: S3TableCatalog, database_name: str, table_name: s assert catalog.table_exists(identifier=to_identifier) -def test_list_tables(catalog: S3TableCatalog, database_name: str, table_name: str, table_schema_nested: Schema) -> None: +def test_list_tables(catalog: S3TablesCatalog, database_name: str, table_name: str, table_schema_nested: Schema) -> None: identifier = (database_name, table_name) catalog.create_namespace(namespace=database_name) @@ -117,7 +117,7 @@ def test_list_tables(catalog: S3TableCatalog, database_name: str, table_name: st assert catalog.list_tables(namespace=database_name) -def test_drop_table(catalog: S3TableCatalog, database_name: str, table_name: str, table_schema_nested: Schema) -> None: +def test_drop_table(catalog: S3TablesCatalog, database_name: str, table_name: str, table_schema_nested: Schema) -> None: identifier = (database_name, table_name) catalog.create_namespace(namespace=database_name) @@ -130,7 +130,7 @@ def test_drop_table(catalog: S3TableCatalog, database_name: str, table_name: str def test_commit_new_column_to_table( - catalog: S3TableCatalog, database_name: str, table_name: str, table_schema_nested: Schema + catalog: S3TablesCatalog, database_name: str, table_name: str, table_schema_nested: Schema ) -> None: identifier = (database_name, table_name) @@ -156,7 +156,7 @@ def test_commit_new_column_to_table( assert table.schema().columns[-1].name == "b" -def test_write_pyarrow_table(catalog: S3TableCatalog, database_name: str, table_name: str) -> None: +def test_write_pyarrow_table(catalog: S3TablesCatalog, database_name: str, table_name: str) -> None: identifier = (database_name, 
table_name) catalog.create_namespace(namespace=database_name) @@ -184,7 +184,7 @@ def test_write_pyarrow_table(catalog: S3TableCatalog, database_name: str, table_ assert table.scan().to_arrow().num_rows == pyarrow_table.num_rows -def test_commit_new_data_to_table(catalog: S3TableCatalog, database_name: str, table_name: str) -> None: +def test_commit_new_data_to_table(catalog: S3TablesCatalog, database_name: str, table_name: str) -> None: identifier = (database_name, table_name) catalog.create_namespace(namespace=database_name) From 78bbe54c1bd1eebb554eebd6e358280e5cfda48f Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Tue, 7 Jan 2025 08:59:42 +0100 Subject: [PATCH 44/57] feat: raise error on specified table location --- pyiceberg/catalog/s3tables.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index ed6a8027f2..a732afcdd9 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -138,6 +138,8 @@ def create_table( sort_order: SortOrder = UNSORTED_SORT_ORDER, properties: Properties = EMPTY_DICT, ) -> Table: + if location: + raise NotImplementedError("S3 Tables does not support user specified table locations.") namespace, table_name = self._validate_database_and_table_identifier(identifier) schema: Schema = self._convert_schema_if_needed(schema) # type: ignore From bd38d15cf08ef932b8fcdd2b056505358d751463 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Tue, 7 Jan 2025 17:57:00 +0100 Subject: [PATCH 45/57] feat: return empty list when querying a hierarchical namespace --- pyiceberg/catalog/s3tables.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index a732afcdd9..0237dcd0ae 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -220,7 +220,8 @@ def drop_table(self, identifier: Union[str, Identifier]) -> None: def list_namespaces(self, namespace: Union[str, 
Identifier] = ()) -> List[Identifier]: if namespace: - namespace = self._validate_namespace_identifier(namespace) + # hierarchical namespaces are not supported + return [] paginator = self.s3tables.get_paginator("list_namespaces") namespaces: List[Identifier] = [] From 85d50fe545e85955b566f5d596ff24b4dff750dc Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Tue, 7 Jan 2025 18:00:22 +0100 Subject: [PATCH 46/57] refactor: use get_table_metadata_location instead of get_table --- pyiceberg/catalog/s3tables.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index 0237dcd0ae..6384823705 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -158,7 +158,7 @@ def create_table( f"Cannot create {namespace}.{table_name} because a table of the same name already exists in the namespace." ) from e - response = self.s3tables.get_table(tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name) + response = self.s3tables.get_table_metadata_location(tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name) version_token = response["versionToken"] warehouse_location = response["warehouseLocation"] From c8434d4b28eabe657eec7d4deace2090b7352da2 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Tue, 7 Jan 2025 18:02:57 +0100 Subject: [PATCH 47/57] refactor: extract 'ICEBERG' table format into constant --- pyiceberg/catalog/s3tables.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index 6384823705..efc22a42c3 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -43,6 +43,8 @@ S3TABLES_VALID_NAME_REGEX = pattern = re.compile("[a-z0-9][a-z0-9_]{2,62}") S3TABLES_RESERVED_NAMESPACE = "aws_s3_metadata" +S3TABLES_FORMAT = "ICEBERG" + class S3TablesCatalog(MetastoreCatalog): def __init__(self, name: str, **properties: str): @@ -149,7 +151,7 
@@ def create_table( # created S3 Table. try: self.s3tables.create_table( - tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name, format="ICEBERG" + tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name, format=S3TABLES_FORMAT ) except self.s3tables.exceptions.NotFoundException as e: raise NoSuchNamespaceError(f"Cannot create {namespace}.{table_name} because no such namespace exists.") from e From e479c0447f6ec26c5ec37858e7267413bfa60e50 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Tue, 7 Jan 2025 18:05:44 +0100 Subject: [PATCH 48/57] feat: change s3tables.table-bucket-arn -> s3tables.warehouse --- pyiceberg/catalog/s3tables.py | 2 +- tests/catalog/test_s3tables.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index efc22a42c3..4d22807687 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -35,7 +35,7 @@ S3TABLES_SECRET_ACCESS_KEY = "s3tables.secret-access-key" S3TABLES_SESSION_TOKEN = "s3tables.session-token" -S3TABLES_TABLE_BUCKET_ARN = "s3tables.table-bucket-arn" +S3TABLES_TABLE_BUCKET_ARN = "s3tables.warehouse" S3TABLES_ENDPOINT = "s3tables.endpoint" diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py index c5cf959d4a..dc3bb88238 100644 --- a/tests/catalog/test_s3tables.py +++ b/tests/catalog/test_s3tables.py @@ -37,12 +37,12 @@ def aws_region() -> str: @pytest.fixture def catalog(table_bucket_arn: str, aws_region: str) -> S3TablesCatalog: - properties = {"s3tables.table-bucket-arn": table_bucket_arn, "s3tables.region": aws_region} + properties = {"s3tables.warehouse": table_bucket_arn, "s3tables.region": aws_region} return S3TablesCatalog(name="test_s3tables_catalog", **properties) def test_creating_catalog_validates_s3_table_bucket_exists(table_bucket_arn: str) -> None: - properties = {"s3tables.table-bucket-arn": f"{table_bucket_arn}-modified", "s3tables.region": 
"us-east-1"} + properties = {"s3tables.warehouse": f"{table_bucket_arn}-modified", "s3tables.region": "us-east-1"} with pytest.raises(TableBucketNotFound): S3TablesCatalog(name="test_s3tables_catalog", **properties) From acfd06c072dc17a971d226fbde09e0bebd226ee4 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Tue, 7 Jan 2025 18:09:03 +0100 Subject: [PATCH 49/57] Apply suggestions from code review Co-authored-by: Honah J. --- pyiceberg/catalog/s3tables.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index 4d22807687..1854dbfc9b 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -40,7 +40,7 @@ S3TABLES_ENDPOINT = "s3tables.endpoint" # for naming rules see: https://docs.aws.amazon.com/AmazonS3/latest/userguide/s3-tables-buckets-naming.html -S3TABLES_VALID_NAME_REGEX = pattern = re.compile("[a-z0-9][a-z0-9_]{2,62}") +S3TABLES_VALID_NAME_REGEX = pattern = re.compile("[a-z0-9][a-z0-9_]{1,61}[a-z0-9]") S3TABLES_RESERVED_NAMESPACE = "aws_s3_metadata" S3TABLES_FORMAT = "ICEBERG" From 01c9003f3f65c6defd4089afae206cebc977ca2e Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Tue, 7 Jan 2025 18:13:19 +0100 Subject: [PATCH 50/57] feat: add link to naming-rules for invalid name errors --- pyiceberg/catalog/s3tables.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index 1854dbfc9b..4d89108d8d 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -117,7 +117,7 @@ def _validate_namespace_identifier(self, namespace: Union[str, Identifier]) -> s namespace = self.identifier_to_database(namespace) if not S3TABLES_VALID_NAME_REGEX.fullmatch(namespace) or namespace == S3TABLES_RESERVED_NAMESPACE: - raise InvalidNamespaceName("The specified namespace name is not valid.") + raise InvalidNamespaceName("The specified namespace name is not valid. 
See https://docs.aws.amazon.com/AmazonS3/latest/userguide/s3-tables-buckets-naming.html for naming rules.") return namespace @@ -127,7 +127,7 @@ def _validate_database_and_table_identifier(self, identifier: Union[str, Identif namespace = self._validate_namespace_identifier(namespace) if not S3TABLES_VALID_NAME_REGEX.fullmatch(table_name): - raise InvalidTableName("The specified table name is not valid.") + raise InvalidTableName("The specified table name is not valid. See https://docs.aws.amazon.com/AmazonS3/latest/userguide/s3-tables-buckets-naming.html for naming rules.") return namespace, table_name From 1d25b69741df084b9e3857c7764c30146bf8d4a9 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Tue, 7 Jan 2025 18:22:14 +0100 Subject: [PATCH 51/57] feat: delete s3 table if writing new_table_metadata is unsuccessful --- pyiceberg/catalog/s3tables.py | 63 ++++++++++++++++++----------------- 1 file changed, 33 insertions(+), 30 deletions(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index 4d89108d8d..2cfdf9595b 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -150,9 +150,9 @@ def create_table( # S3 Tables API and then write the new metadata.json to the warehouseLocation associated with the newly # created S3 Table. try: - self.s3tables.create_table( + version_token = self.s3tables.create_table( tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name, format=S3TABLES_FORMAT - ) + )["versionToken"] except self.s3tables.exceptions.NotFoundException as e: raise NoSuchNamespaceError(f"Cannot create {namespace}.{table_name} because no such namespace exists.") from e except self.s3tables.exceptions.ConflictException as e: @@ -160,36 +160,39 @@ def create_table( f"Cannot create {namespace}.{table_name} because a table of the same name already exists in the namespace." 
) from e - response = self.s3tables.get_table_metadata_location(tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name) - version_token = response["versionToken"] - - warehouse_location = response["warehouseLocation"] - metadata_location = self._get_metadata_location(location=warehouse_location) - metadata = new_table_metadata( - location=warehouse_location, - schema=schema, - partition_spec=partition_spec, - sort_order=sort_order, - properties=properties, - ) - - io = load_file_io(properties=self.properties, location=metadata_location) - # this triggers unsupported list operation error as S3 Table Buckets only support a subset of the S3 Bucket API, - # setting overwrite=True is a workaround for now since it prevents a call to list_objects - self._write_metadata(metadata, io, metadata_location, overwrite=True) - try: - self.s3tables.update_table_metadata_location( - tableBucketARN=self.table_bucket_arn, - namespace=namespace, - name=table_name, - versionToken=version_token, - metadataLocation=metadata_location, + response = self.s3tables.get_table_metadata_location(tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name) + warehouse_location = response["warehouseLocation"] + + metadata_location = self._get_metadata_location(location=warehouse_location) + metadata = new_table_metadata( + location=warehouse_location, + schema=schema, + partition_spec=partition_spec, + sort_order=sort_order, + properties=properties, ) - except self.s3tables.exceptions.ConflictException as e: - raise CommitFailedException( - f"Cannot create {namespace}.{table_name} because of a concurrent update to the table version {version_token}." 
- ) from e + + io = load_file_io(properties=self.properties, location=metadata_location) + # this triggers unsupported list operation error as S3 Table Buckets only support a subset of the S3 Bucket API, + # setting overwrite=True is a workaround for now since it prevents a call to list_objects + self._write_metadata(metadata, io, metadata_location, overwrite=True) + + try: + self.s3tables.update_table_metadata_location( + tableBucketARN=self.table_bucket_arn, + namespace=namespace, + name=table_name, + versionToken=version_token, + metadataLocation=metadata_location, + ) + except self.s3tables.exceptions.ConflictException as e: + raise CommitFailedException( + f"Cannot create {namespace}.{table_name} because of a concurrent update to the table version {version_token}." + ) from e + except: + self.s3tables.delete_table(tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name) + raise return self.load_table(identifier=identifier) From 188c45de8b3cb04c29386f67c7f7e422c27f0cc4 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Tue, 7 Jan 2025 18:22:38 +0100 Subject: [PATCH 52/57] chore: run linter --- pyiceberg/catalog/s3tables.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index 2cfdf9595b..1e6b8585d6 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -117,7 +117,9 @@ def _validate_namespace_identifier(self, namespace: Union[str, Identifier]) -> s namespace = self.identifier_to_database(namespace) if not S3TABLES_VALID_NAME_REGEX.fullmatch(namespace) or namespace == S3TABLES_RESERVED_NAMESPACE: - raise InvalidNamespaceName("The specified namespace name is not valid. See https://docs.aws.amazon.com/AmazonS3/latest/userguide/s3-tables-buckets-naming.html for naming rules.") + raise InvalidNamespaceName( + "The specified namespace name is not valid. 
See https://docs.aws.amazon.com/AmazonS3/latest/userguide/s3-tables-buckets-naming.html for naming rules." + ) return namespace @@ -127,7 +129,9 @@ def _validate_database_and_table_identifier(self, identifier: Union[str, Identif namespace = self._validate_namespace_identifier(namespace) if not S3TABLES_VALID_NAME_REGEX.fullmatch(table_name): - raise InvalidTableName("The specified table name is not valid. See https://docs.aws.amazon.com/AmazonS3/latest/userguide/s3-tables-buckets-naming.html for naming rules.") + raise InvalidTableName( + "The specified table name is not valid. See https://docs.aws.amazon.com/AmazonS3/latest/userguide/s3-tables-buckets-naming.html for naming rules." + ) return namespace, table_name @@ -161,7 +165,9 @@ def create_table( ) from e try: - response = self.s3tables.get_table_metadata_location(tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name) + response = self.s3tables.get_table_metadata_location( + tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name + ) warehouse_location = response["warehouseLocation"] metadata_location = self._get_metadata_location(location=warehouse_location) From de310507c4fcf60fb983fb06f00234fa19572039 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Tue, 7 Jan 2025 19:15:06 +0100 Subject: [PATCH 53/57] test: rename test_s3tables.py -> integration_test_s3tables.py --- ..._s3tables.py => integration_test_s3tables.py} | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) rename tests/catalog/{test_s3tables.py => integration_test_s3tables.py} (93%) diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/integration_test_s3tables.py similarity index 93% rename from tests/catalog/test_s3tables.py rename to tests/catalog/integration_test_s3tables.py index dc3bb88238..7b880bee31 100644 --- a/tests/catalog/test_s3tables.py +++ b/tests/catalog/integration_test_s3tables.py @@ -20,19 +20,25 @@ def table_name(table_name: str) -> str: @pytest.fixture def 
table_bucket_arn() -> str: + """Set the environment variable AWS_TEST_S3_TABLE_BUCKET_ARN for an S3 table bucket to test.""" import os - # since the moto library does not support s3tables as of 2024-12-14 we have to test against a real AWS endpoint - # in one of the supported regions. - - return os.environ["ARN"] + table_bucket_arn = os.getenv("AWS_TEST_S3_TABLE_BUCKET_ARN") + if not table_bucket_arn: + raise ValueError( + "Please specify a table bucket arn to run the test by setting environment variable AWS_TEST_S3_TABLE_BUCKET_ARN" + ) + return table_bucket_arn @pytest.fixture def aws_region() -> str: import os - return os.environ["AWS_REGION"] + aws_region = os.getenv("AWS_REGION") + if not aws_region: + raise ValueError("Please specify an AWS region to run the test by setting environment variable AWS_REGION") + return aws_region @pytest.fixture From 2e1c3833f5f374cbc9ee485f79c75c50f63282fc Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Wed, 8 Jan 2025 09:08:15 +0100 Subject: [PATCH 54/57] chore: update poetry.lock --- poetry.lock | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/poetry.lock b/poetry.lock index 7bc22bec33..934280dcbd 100644 --- a/poetry.lock +++ b/poetry.lock @@ -5342,6 +5342,7 @@ pyarrow = ["pyarrow"] ray = ["pandas", "pyarrow", "ray", "ray"] rest-sigv4 = ["boto3"] s3fs = ["s3fs"] +s3tables = ["boto3"] snappy = ["python-snappy"] sql-postgres = ["psycopg2-binary", "sqlalchemy"] sql-sqlite = ["sqlalchemy"] @@ -5350,4 +5351,4 @@ zstandard = ["zstandard"] [metadata] lock-version = "2.0" python-versions = "^3.9, !=3.9.7" -content-hash = "59e5678cd718f658c5bd099c03051564ee60f991e5f222bf92da13d1dd025a42" +content-hash = "10f9fb23f30dddeb93ad5c7ba7757816af5813293332e88929301c418ad7b256" From fdd1d7a4e23e4f0a05ce53f8a1472363fdc11e08 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Wed, 8 Jan 2025 20:19:53 +0100 Subject: [PATCH 55/57] fix: add license to files --- pyiceberg/catalog/s3tables.py | 16 ++++++++++++++++ 
tests/catalog/integration_test_s3tables.py | 16 ++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index 1e6b8585d6..a577613c92 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -1,3 +1,19 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. import re from typing import TYPE_CHECKING, List, Optional, Set, Tuple, Union diff --git a/tests/catalog/integration_test_s3tables.py b/tests/catalog/integration_test_s3tables.py index 7b880bee31..351f530122 100644 --- a/tests/catalog/integration_test_s3tables.py +++ b/tests/catalog/integration_test_s3tables.py @@ -1,3 +1,19 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. import pytest from pyiceberg.catalog.s3tables import S3TablesCatalog From 8b1cec0af084c9dc1fc42f5586ca8ce284bbcc31 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Thu, 9 Jan 2025 08:56:43 +0100 Subject: [PATCH 56/57] fix: raise error when creating a table during a transaction --- pyiceberg/catalog/s3tables.py | 79 ++++++++++++++-------- tests/catalog/integration_test_s3tables.py | 36 ++++++++++ 2 files changed, 86 insertions(+), 29 deletions(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index a577613c92..14c249448e 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -35,7 +35,7 @@ from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionSpec from pyiceberg.schema import Schema from pyiceberg.serializers import FromInputFile -from pyiceberg.table import CommitTableResponse, Table +from pyiceberg.table import CommitTableResponse, CreateTableTransaction, Table from pyiceberg.table.metadata import new_table_metadata from pyiceberg.table.sorting import UNSORTED_SORT_ORDER, SortOrder from pyiceberg.table.update import TableRequirement, TableUpdate @@ -92,36 +92,57 @@ def commit_table( table_identifier = table.name() database_name, table_name = self.identifier_to_database_and_table(table_identifier, NoSuchTableError) - current_table, version_token = self._load_table_and_version(identifier=table_identifier) - - updated_staged_table = self._update_and_stage_table(current_table, table_identifier, requirements, updates) - if current_table and updated_staged_table.metadata == 
current_table.metadata: - # no changes, do nothing - return CommitTableResponse(metadata=current_table.metadata, metadata_location=current_table.metadata_location) - - self._write_metadata( - metadata=updated_staged_table.metadata, - io=updated_staged_table.io, - metadata_path=updated_staged_table.metadata_location, - overwrite=True, - ) - - # try to update metadata location which will fail if the versionToken changed meanwhile + current_table: Optional[Table] + version_token: Optional[str] try: - self.s3tables.update_table_metadata_location( - tableBucketARN=self.table_bucket_arn, - namespace=database_name, - name=table_name, - versionToken=version_token, - metadataLocation=updated_staged_table.metadata_location, + current_table, version_token = self._load_table_and_version(identifier=table_identifier) + except NoSuchTableError: + current_table = None + version_token = None + + if current_table: + updated_staged_table = self._update_and_stage_table(current_table, table_identifier, requirements, updates) + if updated_staged_table.metadata == current_table.metadata: + # no changes, do nothing + return CommitTableResponse(metadata=current_table.metadata, metadata_location=current_table.metadata_location) + + self._write_metadata( + metadata=updated_staged_table.metadata, + io=updated_staged_table.io, + metadata_path=updated_staged_table.metadata_location, + overwrite=True, ) - except self.s3tables.exceptions.ConflictException as e: - raise CommitFailedException( - f"Cannot commit {database_name}.{table_name} because of a concurrent update to the table version {version_token}." 
- ) from e - return CommitTableResponse( - metadata=updated_staged_table.metadata, metadata_location=updated_staged_table.metadata_location - ) + + # try to update metadata location which will fail if the versionToken changed meanwhile + try: + self.s3tables.update_table_metadata_location( + tableBucketARN=self.table_bucket_arn, + namespace=database_name, + name=table_name, + versionToken=version_token, + metadataLocation=updated_staged_table.metadata_location, + ) + except self.s3tables.exceptions.ConflictException as e: + raise CommitFailedException( + f"Cannot commit {database_name}.{table_name} because of a concurrent update to the table version {version_token}." + ) from e + return CommitTableResponse( + metadata=updated_staged_table.metadata, metadata_location=updated_staged_table.metadata_location + ) + else: + # table does not exist, create it + raise NotImplementedError("Creating a table on commit is currently not supported.") + + def create_table_transaction( + self, + identifier: Union[str, Identifier], + schema: Union[Schema, "pa.Schema"], + location: Optional[str] = None, + partition_spec: PartitionSpec = UNPARTITIONED_PARTITION_SPEC, + sort_order: SortOrder = UNSORTED_SORT_ORDER, + properties: Properties = EMPTY_DICT, + ) -> CreateTableTransaction: + raise NotImplementedError("create_table_transaction currently not supported.") def create_namespace(self, namespace: Union[str, Identifier], properties: Properties = EMPTY_DICT) -> None: if properties: diff --git a/tests/catalog/integration_test_s3tables.py b/tests/catalog/integration_test_s3tables.py index 351f530122..e803b5a2b9 100644 --- a/tests/catalog/integration_test_s3tables.py +++ b/tests/catalog/integration_test_s3tables.py @@ -18,7 +18,9 @@ from pyiceberg.catalog.s3tables import S3TablesCatalog from pyiceberg.exceptions import NoSuchNamespaceError, NoSuchTableError, TableBucketNotFound +from pyiceberg.partitioning import PartitionField, PartitionSpec from pyiceberg.schema import Schema +from 
pyiceberg.transforms import IdentityTransform from pyiceberg.types import IntegerType @@ -247,3 +249,37 @@ def test_commit_new_data_to_table(catalog: S3TablesCatalog, database_name: str, assert updated_table_metadata.metadata_log[-1].metadata_file == original_table_metadata_location assert updated_table_metadata.metadata_log[-1].timestamp_ms == original_table_last_updated_ms assert table.scan().to_arrow().num_rows == 2 * row_count + + +def test_create_table_transaction( + catalog: S3TablesCatalog, database_name: str, table_name: str, table_schema_nested: str +) -> None: + identifier = (database_name, table_name) + catalog.create_namespace(namespace=database_name) + + with catalog.create_table_transaction( + identifier, + table_schema_nested, + partition_spec=PartitionSpec(PartitionField(source_id=1, field_id=1000, transform=IdentityTransform(), name="foo")), + ) as txn: + last_updated_metadata = txn.table_metadata.last_updated_ms + with txn.update_schema() as update_schema: + update_schema.add_column(path="b", field_type=IntegerType()) + + with txn.update_spec() as update_spec: + update_spec.add_identity("bar") + + txn.set_properties(test_a="test_aa", test_b="test_b", test_c="test_c") + + table = catalog.load_table(identifier) + + assert table.schema().find_field("b").field_type == IntegerType() + assert table.properties == {"test_a": "test_aa", "test_b": "test_b", "test_c": "test_c"} + assert table.spec().last_assigned_field_id == 1001 + assert table.spec().fields_by_source_id(1)[0].name == "foo" + assert table.spec().fields_by_source_id(1)[0].field_id == 1000 + assert table.spec().fields_by_source_id(1)[0].transform == IdentityTransform() + assert table.spec().fields_by_source_id(2)[0].name == "bar" + assert table.spec().fields_by_source_id(2)[0].field_id == 1001 + assert table.spec().fields_by_source_id(2)[0].transform == IdentityTransform() + assert table.metadata.last_updated_ms > last_updated_metadata From 99e569be012eeb014f73f58dda11bed3ce5bd99d Mon Sep 17 
00:00:00 2001 From: Felix Scherz Date: Thu, 9 Jan 2025 09:01:26 +0100 Subject: [PATCH 57/57] test: mark create_table_transaction test with xfail --- tests/catalog/integration_test_s3tables.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/catalog/integration_test_s3tables.py b/tests/catalog/integration_test_s3tables.py index e803b5a2b9..8cfcbe7b44 100644 --- a/tests/catalog/integration_test_s3tables.py +++ b/tests/catalog/integration_test_s3tables.py @@ -251,6 +251,7 @@ def test_commit_new_data_to_table(catalog: S3TablesCatalog, database_name: str, assert table.scan().to_arrow().num_rows == 2 * row_count +@pytest.mark.xfail(raises=NotImplementedError, reason="create_table_transaction not implemented yet") def test_create_table_transaction( catalog: S3TablesCatalog, database_name: str, table_name: str, table_schema_nested: str ) -> None: