diff --git a/dlt/common/libs/deltalake.py b/dlt/common/libs/deltalake.py
index d98795d07c..d4cb46c600 100644
--- a/dlt/common/libs/deltalake.py
+++ b/dlt/common/libs/deltalake.py
@@ -5,13 +5,15 @@
 from dlt.common import logger
 from dlt.common.libs.pyarrow import pyarrow as pa
 from dlt.common.libs.pyarrow import cast_arrow_schema_types
-from dlt.common.schema.typing import TWriteDisposition
+from dlt.common.schema.typing import TWriteDisposition, TTableSchema
+from dlt.common.schema.utils import get_first_column_name_with_prop, get_columns_names_with_prop
 from dlt.common.exceptions import MissingDependencyException
 from dlt.common.storages import FilesystemConfiguration
 from dlt.common.utils import assert_min_pkg_version
 from dlt.destinations.impl.filesystem.filesystem import FilesystemClient
 
 try:
+    import deltalake
     from deltalake import write_deltalake, DeltaTable
     from deltalake.writer import try_get_deltatable
 except ModuleNotFoundError:
@@ -74,7 +76,7 @@ def write_delta_table(
     partition_by: Optional[Union[List[str], str]] = None,
     storage_options: Optional[Dict[str, str]] = None,
 ) -> None:
-    """Writes in-memory Arrow table to on-disk Delta table.
+    """Writes in-memory Arrow data to on-disk Delta table.
 
     Thin wrapper around `deltalake.write_deltalake`.
     """
@@ -93,31 +95,73 @@ def write_delta_table(
     )
 
 
-def get_delta_tables(pipeline: Pipeline, *tables: str) -> Dict[str, DeltaTable]:
-    """Returns Delta tables in `pipeline.default_schema` as `deltalake.DeltaTable` objects.
+def merge_delta_table(
+    table: DeltaTable,
+    data: Union[pa.Table, pa.RecordBatchReader],
+    schema: TTableSchema,
+) -> None:
+    """Merges in-memory Arrow data into on-disk Delta table."""
+
+    strategy = schema["x-merge-strategy"]  # type: ignore[typeddict-item]
+    if strategy == "upsert":
+        # `DeltaTable.merge` does not support automatic schema evolution
+        # https://github.com/delta-io/delta-rs/issues/2282
+        _evolve_delta_table_schema(table, data.schema)
+
+        if "parent" in schema:
+            unique_column = get_first_column_name_with_prop(schema, "unique")
+            predicate = f"target.{unique_column} = source.{unique_column}"
+        else:
+            primary_keys = get_columns_names_with_prop(schema, "primary_key")
+            predicate = " AND ".join([f"target.{c} = source.{c}" for c in primary_keys])
+
+        qry = (
+            table.merge(
+                source=ensure_delta_compatible_arrow_data(data),
+                predicate=predicate,
+                source_alias="source",
+                target_alias="target",
+            )
+            .when_matched_update_all()
+            .when_not_matched_insert_all()
+        )
+
+        qry.execute()
+    else:
+        raise ValueError(f'Merge strategy "{strategy}" not supported.')
+
+
+def get_delta_tables(
+    pipeline: Pipeline, *tables: str, schema_name: str = None
+) -> Dict[str, DeltaTable]:
+    """Returns Delta tables in `pipeline.default_schema` (default) as `deltalake.DeltaTable` objects.
 
     Returned object is a dictionary with table names as keys and `DeltaTable` objects as values.
     Optionally filters dictionary by table names specified as `*tables*`.
 
-    Raises ValueError if table name specified as `*tables` is not found.
+    Raises ValueError if a table name specified in `*tables` is not found. Use the `schema_name`
+    argument to select a schema other than the default one.
     """
     from dlt.common.schema.utils import get_table_format
 
-    with pipeline.destination_client() as client:
+    with pipeline.destination_client(schema_name=schema_name) as client:
         assert isinstance(
             client, FilesystemClient
         ), "The `get_delta_tables` function requires a `filesystem` destination."
schema_delta_tables = [ t["name"] - for t in pipeline.default_schema.tables.values() - if get_table_format(pipeline.default_schema.tables, t["name"]) == "delta" + for t in client.schema.tables.values() + if get_table_format(client.schema.tables, t["name"]) == "delta" ] if len(tables) > 0: invalid_tables = set(tables) - set(schema_delta_tables) if len(invalid_tables) > 0: + available_schemas = "" + if len(pipeline.schema_names) > 1: + available_schemas = f" Available schemas are {pipeline.schema_names}" raise ValueError( - "Schema does not contain Delta tables with these names: " - f"{', '.join(invalid_tables)}." + f"Schema {client.schema.name} does not contain Delta tables with these names: " + f"{', '.join(invalid_tables)}.{available_schemas}" ) schema_delta_tables = [t for t in schema_delta_tables if t in tables] table_dirs = client.get_table_dirs(schema_delta_tables, remote=True) @@ -145,3 +189,16 @@ def _deltalake_storage_options(config: FilesystemConfiguration) -> Dict[str, str + ". dlt will use the values in `deltalake_storage_options`." ) return {**creds, **extra_options} + + +def _evolve_delta_table_schema(delta_table: DeltaTable, arrow_schema: pa.Schema) -> None: + """Evolves `delta_table` schema if different from `arrow_schema`. + + Adds column(s) to `delta_table` present in `arrow_schema` but not in `delta_table`. + """ + new_fields = [ + deltalake.Field.from_pyarrow(field) + for field in ensure_delta_compatible_arrow_schema(arrow_schema) + if field not in delta_table.to_pyarrow_dataset().schema + ] + delta_table.alter.add_columns(new_fields) diff --git a/dlt/common/storages/file_storage.py b/dlt/common/storages/file_storage.py index 7d14b8f7f7..f26cc060a3 100644 --- a/dlt/common/storages/file_storage.py +++ b/dlt/common/storages/file_storage.py @@ -3,7 +3,6 @@ import re import stat import errno -import tempfile import shutil import pathvalidate from typing import IO, Any, Optional, List, cast @@ -29,10 +28,8 @@ def save(self, relative_path: str, data: Any) -> str: @staticmethod def save_atomic(storage_path: str, relative_path: str, data: Any, file_type: str = "t") -> str: mode = "w" + file_type - with tempfile.NamedTemporaryFile( - dir=storage_path, mode=mode, delete=False, encoding=encoding_for_mode(mode) - ) as f: - tmp_path = f.name + tmp_path = os.path.join(storage_path, uniq_id(8)) + with open(tmp_path, mode=mode, encoding=encoding_for_mode(mode)) as f: f.write(data) try: dest_path = os.path.join(storage_path, relative_path) @@ -116,11 +113,11 @@ def open_file(self, relative_path: str, mode: str = "r") -> IO[Any]: return FileStorage.open_zipsafe_ro(self.make_full_path(relative_path), mode) return open(self.make_full_path(relative_path), mode, encoding=encoding_for_mode(mode)) - def open_temp(self, delete: bool = False, mode: str = "w", file_type: str = None) -> IO[Any]: - mode = mode + file_type or self.file_type - return tempfile.NamedTemporaryFile( - dir=self.storage_path, mode=mode, delete=delete, encoding=encoding_for_mode(mode) - ) + # def open_temp(self, delete: bool = False, mode: str = "w", file_type: str = None) -> IO[Any]: + # mode = mode + file_type or self.file_type + # return tempfile.NamedTemporaryFile( + # dir=self.storage_path, mode=mode, delete=delete, encoding=encoding_for_mode(mode) + # ) def has_file(self, relative_path: str) -> bool: return os.path.isfile(self.make_full_path(relative_path)) diff --git a/dlt/destinations/fs_client.py b/dlt/destinations/fs_client.py index 3233446594..14e77b6b4e 100644 --- a/dlt/destinations/fs_client.py +++ 
b/dlt/destinations/fs_client.py @@ -3,9 +3,12 @@ from abc import ABC, abstractmethod from fsspec import AbstractFileSystem +from dlt.common.schema import Schema + class FSClientBase(ABC): fs_client: AbstractFileSystem + schema: Schema @property @abstractmethod diff --git a/dlt/destinations/impl/filesystem/filesystem.py b/dlt/destinations/impl/filesystem/filesystem.py index 2e09871ba9..3b1e16bdc8 100644 --- a/dlt/destinations/impl/filesystem/filesystem.py +++ b/dlt/destinations/impl/filesystem/filesystem.py @@ -3,7 +3,7 @@ import base64 from types import TracebackType -from typing import ClassVar, List, Type, Iterable, Iterator, Optional, Tuple, Sequence, cast +from typing import Dict, List, Type, Iterable, Iterator, Optional, Tuple, Sequence, cast from fsspec import AbstractFileSystem from contextlib import contextmanager @@ -13,7 +13,7 @@ from dlt.common.storages.fsspec_filesystem import glob_files from dlt.common.typing import DictStrAny from dlt.common.schema import Schema, TSchemaTables, TTableSchema -from dlt.common.schema.utils import get_first_column_name_with_prop, get_columns_names_with_prop +from dlt.common.schema.utils import get_columns_names_with_prop from dlt.common.storages import FileStorage, fsspec_from_config from dlt.common.storages.load_package import ( LoadJobInfo, @@ -56,36 +56,36 @@ def __init__( self._job_client: FilesystemClient = None def run(self) -> None: - # pick local filesystem pathlib or posix for buckets - self.is_local_filesystem = self._job_client.config.protocol == "file" - self.pathlib = os.path if self.is_local_filesystem else posixpath - - self.destination_file_name = path_utils.create_path( - self._job_client.config.layout, - self._file_name, - self._job_client.schema.name, - self._load_id, - current_datetime=self._job_client.config.current_datetime, - load_package_timestamp=dlt.current.load_package()["state"]["created_at"], - extra_placeholders=self._job_client.config.extra_placeholders, - ) + self.__is_local_filesystem = self._job_client.config.protocol == "file" # We would like to avoid failing for local filesystem where # deeply nested directory will not exist before writing a file. # It `auto_mkdir` is disabled by default in fsspec so we made some # trade offs between different options and decided on this. # remote_path = f"{client.config.protocol}://{posixpath.join(dataset_path, destination_file_name)}" remote_path = self.make_remote_path() - if self.is_local_filesystem: - self._job_client.fs_client.makedirs(self.pathlib.dirname(remote_path), exist_ok=True) + if self.__is_local_filesystem: + # use os.path for local file name + self._job_client.fs_client.makedirs(os.path.dirname(remote_path), exist_ok=True) self._job_client.fs_client.put_file(self._file_path, remote_path) def make_remote_path(self) -> str: """Returns path on the remote filesystem to which copy the file, without scheme. 
For local filesystem a native path is used""" + destination_file_name = path_utils.create_path( + self._job_client.config.layout, + self._file_name, + self._job_client.schema.name, + self._load_id, + current_datetime=self._job_client.config.current_datetime, + load_package_timestamp=dlt.current.load_package()["state"]["created_at"], + extra_placeholders=self._job_client.config.extra_placeholders, + ) + # pick local filesystem pathlib or posix for buckets + pathlib = os.path if self.__is_local_filesystem else posixpath # path.join does not normalize separators and available # normalization functions are very invasive and may string the trailing separator - return self.pathlib.join( # type: ignore[no-any-return] + return pathlib.join( # type: ignore[no-any-return] self._job_client.dataset_path, - path_utils.normalize_path_sep(self.pathlib, self.destination_file_name), + path_utils.normalize_path_sep(pathlib, destination_file_name), ) def make_remote_uri(self) -> str: @@ -98,89 +98,81 @@ def metrics(self) -> Optional[LoadJobMetrics]: class DeltaLoadFilesystemJob(FilesystemLoadJob): def __init__(self, file_path: str) -> None: - super().__init__( - file_path=file_path, - ) - - def run(self) -> None: - # pick local filesystem pathlib or posix for buckets - # TODO: since we pass _job_client via run_managed and not set_env_vars it is hard - # to write a handler with those two line below only in FilesystemLoadJob - self.is_local_filesystem = self._job_client.config.protocol == "file" - self.pathlib = os.path if self.is_local_filesystem else posixpath - self.destination_file_name = self._job_client.make_remote_uri( - self._job_client.get_table_dir(self.load_table_name) - ) + super().__init__(file_path=file_path) + # create Arrow dataset from Parquet files from dlt.common.libs.pyarrow import pyarrow as pa - from dlt.common.libs.deltalake import ( - DeltaTable, - write_delta_table, - ensure_delta_compatible_arrow_schema, - _deltalake_storage_options, - try_get_deltatable, - ) - # create Arrow dataset from Parquet files - file_paths = ReferenceFollowupJobRequest.resolve_references(self._file_path) - arrow_ds = pa.dataset.dataset(file_paths) + self.file_paths = ReferenceFollowupJobRequest.resolve_references(self._file_path) + self.arrow_ds = pa.dataset.dataset(self.file_paths) - # create Delta table object + def make_remote_path(self) -> str: + # remote path is table dir - delta will create its file structure inside it + return self._job_client.get_table_dir(self.load_table_name) - storage_options = _deltalake_storage_options(self._job_client.config) - dt = try_get_deltatable(self.destination_file_name, storage_options=storage_options) + def run(self) -> None: + logger.info(f"Will copy file(s) {self.file_paths} to delta table {self.make_remote_uri()}") - # get partition columns - part_cols = get_columns_names_with_prop(self._load_table, "partition") + from dlt.common.libs.deltalake import write_delta_table, merge_delta_table # explicitly check if there is data # (https://github.com/delta-io/delta-rs/issues/2686) - if arrow_ds.head(1).num_rows == 0: - if dt is None: - # create new empty Delta table with schema from Arrow table - DeltaTable.create( - table_uri=self.destination_file_name, - schema=ensure_delta_compatible_arrow_schema(arrow_ds.schema), - mode="overwrite", - partition_by=part_cols, - storage_options=storage_options, - ) + if self.arrow_ds.head(1).num_rows == 0: + self._create_or_evolve_delta_table() return - arrow_rbr = arrow_ds.scanner().to_reader() # RecordBatchReader - - if 
self._load_table["write_disposition"] == "merge" and dt is not None: - assert self._load_table["x-merge-strategy"] in self._job_client.capabilities.supported_merge_strategies # type: ignore[typeddict-item] - - if self._load_table["x-merge-strategy"] == "upsert": # type: ignore[typeddict-item] - if "parent" in self._load_table: - unique_column = get_first_column_name_with_prop(self._load_table, "unique") - predicate = f"target.{unique_column} = source.{unique_column}" - else: - primary_keys = get_columns_names_with_prop(self._load_table, "primary_key") - predicate = " AND ".join([f"target.{c} = source.{c}" for c in primary_keys]) - - qry = ( - dt.merge( - source=arrow_rbr, - predicate=predicate, - source_alias="source", - target_alias="target", - ) - .when_matched_update_all() - .when_not_matched_insert_all() + with self.arrow_ds.scanner().to_reader() as arrow_rbr: # RecordBatchReader + if self._load_table["write_disposition"] == "merge" and self._delta_table is not None: + assert self._load_table["x-merge-strategy"] in self._job_client.capabilities.supported_merge_strategies # type: ignore[typeddict-item] + merge_delta_table( + table=self._delta_table, + data=arrow_rbr, + schema=self._load_table, + ) + else: + write_delta_table( + table_or_uri=( + self.make_remote_uri() if self._delta_table is None else self._delta_table + ), + data=arrow_rbr, + write_disposition=self._load_table["write_disposition"], + partition_by=self._partition_columns, + storage_options=self._storage_options, ) - qry.execute() + @property + def _storage_options(self) -> Dict[str, str]: + from dlt.common.libs.deltalake import _deltalake_storage_options + + return _deltalake_storage_options(self._job_client.config) - else: - write_delta_table( - table_or_uri=self.destination_file_name if dt is None else dt, - data=arrow_rbr, - write_disposition=self._load_table["write_disposition"], - partition_by=part_cols, - storage_options=storage_options, + @property + def _delta_table(self) -> Optional["DeltaTable"]: # type: ignore[name-defined] # noqa: F821 + from dlt.common.libs.deltalake import try_get_deltatable + + return try_get_deltatable(self.make_remote_uri(), storage_options=self._storage_options) + + @property + def _partition_columns(self) -> List[str]: + return get_columns_names_with_prop(self._load_table, "partition") + + def _create_or_evolve_delta_table(self) -> None: + from dlt.common.libs.deltalake import ( + DeltaTable, + ensure_delta_compatible_arrow_schema, + _evolve_delta_table_schema, + ) + + if self._delta_table is None: + DeltaTable.create( + table_uri=self.make_remote_uri(), + schema=ensure_delta_compatible_arrow_schema(self.arrow_ds.schema), + mode="overwrite", + partition_by=self._partition_columns, + storage_options=self._storage_options, ) + else: + _evolve_delta_table_schema(self._delta_table, self.arrow_ds.schema) class FilesystemLoadJobWithFollowup(HasFollowupJobs, FilesystemLoadJob): diff --git a/docs/examples/parent_child_relationship/parent_child_relationship.py b/docs/examples/parent_child_relationship/parent_child_relationship.py index 39c9f577cc..6de00ffb28 100644 --- a/docs/examples/parent_child_relationship/parent_child_relationship.py +++ b/docs/examples/parent_child_relationship/parent_child_relationship.py @@ -22,6 +22,7 @@ from typing import List, Dict, Any, Generator import dlt + # Define a dlt resource with write disposition to 'merge' @dlt.resource(name="parent_with_children", write_disposition={"disposition": "merge"}) def data_source() -> Generator[List[Dict[str, Any]], None, None]: 
@@ -44,6 +45,7 @@ def data_source() -> Generator[List[Dict[str, Any]], None, None]: yield data + # Function to add parent_id to each child record within a parent record def add_parent_id(record: Dict[str, Any]) -> Dict[str, Any]: parent_id_key = "parent_id" @@ -51,6 +53,7 @@ def add_parent_id(record: Dict[str, Any]) -> Dict[str, Any]: child[parent_id_key] = record[parent_id_key] return record + if __name__ == "__main__": # Create and configure the dlt pipeline pipeline = dlt.pipeline( @@ -60,10 +63,6 @@ def add_parent_id(record: Dict[str, Any]) -> Dict[str, Any]: ) # Run the pipeline - load_info = pipeline.run( - data_source() - .add_map(add_parent_id), - primary_key="parent_id" - ) + load_info = pipeline.run(data_source().add_map(add_parent_id), primary_key="parent_id") # Output the load information after pipeline execution print(load_info) diff --git a/docs/examples/parent_child_relationship/test_parent_child_relationship.py b/docs/examples/parent_child_relationship/test_parent_child_relationship.py index f671040823..95d1bade97 100644 --- a/docs/examples/parent_child_relationship/test_parent_child_relationship.py +++ b/docs/examples/parent_child_relationship/test_parent_child_relationship.py @@ -1,4 +1,3 @@ - import pytest from tests.utils import skipifgithubfork @@ -29,6 +28,7 @@ from typing import List, Dict, Any, Generator import dlt + # Define a dlt resource with write disposition to 'merge' @dlt.resource(name="parent_with_children", write_disposition={"disposition": "merge"}) def data_source() -> Generator[List[Dict[str, Any]], None, None]: @@ -51,6 +51,7 @@ def data_source() -> Generator[List[Dict[str, Any]], None, None]: yield data + # Function to add parent_id to each child record within a parent record def add_parent_id(record: Dict[str, Any]) -> Dict[str, Any]: parent_id_key = "parent_id" @@ -58,6 +59,7 @@ def add_parent_id(record: Dict[str, Any]) -> Dict[str, Any]: child[parent_id_key] = record[parent_id_key] return record + @skipifgithubfork @pytest.mark.forked def test_parent_child_relationship(): @@ -69,10 +71,6 @@ def test_parent_child_relationship(): ) # Run the pipeline - load_info = pipeline.run( - data_source() - .add_map(add_parent_id), - primary_key="parent_id" - ) + load_info = pipeline.run(data_source().add_map(add_parent_id), primary_key="parent_id") # Output the load information after pipeline execution print(load_info) diff --git a/poetry.lock b/poetry.lock index d54a73a2ef..230b354b97 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. 
[[package]] name = "about-time" @@ -2102,27 +2102,27 @@ typing-extensions = ">=3.10.0" [[package]] name = "databricks-sql-connector" -version = "3.1.2" +version = "3.3.0" description = "Databricks SQL Connector for Python" optional = true python-versions = "<4.0.0,>=3.8.0" files = [ - {file = "databricks_sql_connector-3.1.2-py3-none-any.whl", hash = "sha256:5292bc25b4d8d58d301079b55086331764f067e24862c9365698b2eeddedb737"}, - {file = "databricks_sql_connector-3.1.2.tar.gz", hash = "sha256:da0df114e0824d49ccfea36c4679c95689fe359191b056ad516446a058307c37"}, + {file = "databricks_sql_connector-3.3.0-py3-none-any.whl", hash = "sha256:55ee5a4a11291bf91a235ac76e41b419ddd66a9a321065a8bfaf119acbb26d6b"}, + {file = "databricks_sql_connector-3.3.0.tar.gz", hash = "sha256:19e82965da4c86574adfe9f788c17b4494d98eb8075ba4fd4306573d2edbf194"}, ] [package.dependencies] lz4 = ">=4.0.2,<5.0.0" numpy = [ - {version = ">=1.16.6", markers = "python_version >= \"3.8\" and python_version < \"3.11\""}, - {version = ">=1.23.4", markers = "python_version >= \"3.11\""}, + {version = ">=1.16.6,<2.0.0", markers = "python_version >= \"3.8\" and python_version < \"3.11\""}, + {version = ">=1.23.4,<2.0.0", markers = "python_version >= \"3.11\""}, ] oauthlib = ">=3.1.0,<4.0.0" openpyxl = ">=3.0.10,<4.0.0" pandas = {version = ">=1.2.5,<2.2.0", markers = "python_version >= \"3.8\""} -pyarrow = ">=14.0.1,<15.0.0" +pyarrow = ">=14.0.1,<17" requests = ">=2.18.1,<3.0.0" -thrift = ">=0.16.0,<0.17.0" +thrift = ">=0.16.0,<0.21.0" urllib3 = ">=1.26" [package.extras] @@ -2377,25 +2377,24 @@ files = [ [[package]] name = "deltalake" -version = "0.17.4" +version = "0.19.1" description = "Native Delta Lake Python binding based on delta-rs with Pandas integration" optional = true python-versions = ">=3.8" files = [ - {file = "deltalake-0.17.4-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:3f048bd4cdd3500fbb0d1b34046966ca4b7cefd1e9df71460b881ee8ad7f844a"}, - {file = "deltalake-0.17.4-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:b539265d8293794872e1dc3b2daad50abe05ab425e961824b3ac1155bb294604"}, - {file = "deltalake-0.17.4-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:55e6be5f5ab8d5d34d2ea58d86e93eec2da5d2476e3c15e9520239457618bca4"}, - {file = "deltalake-0.17.4-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:94dde6c2d0a07e9ce47be367d016541d3a499839350852205819353441e1a9c1"}, - {file = "deltalake-0.17.4-cp38-abi3-win_amd64.whl", hash = "sha256:f51f499d50dad88bdc18c5ed7c2319114759f3220f83aa2d32166c19accee4ce"}, - {file = "deltalake-0.17.4.tar.gz", hash = "sha256:c3c10577afc46d4b10ed16246d814a8c40b3663099066681eeba89f908373814"}, + {file = "deltalake-0.19.1-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:ddaaaa9c85a17791c3997cf320ac11dc1725d16cf4b6f0ff1b130853e7b56cd0"}, + {file = "deltalake-0.19.1-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:e0184d5a3f0d4f4f1fb992c3bdc8736329b78b6a4faf1a278109ec35d9945c1d"}, + {file = "deltalake-0.19.1-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ec9d117fcf6c198f3d554be2f3a6291ca3838530650db236741ff48d4d47abb4"}, + {file = "deltalake-0.19.1-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:447ef721319ed15f7b5f6da507efd5fed0e6172e5ae55ac044d5b8fc9b812e47"}, + {file = "deltalake-0.19.1-cp38-abi3-win_amd64.whl", hash = "sha256:b15bc343a9f8f3de80fbedcebd5d9472b539eb0f538a71739c7fcf699089127e"}, + {file = "deltalake-0.19.1.tar.gz", hash = 
"sha256:5e09fabb221fb81e989c283c16278eaffb6e85706d98364abcda5c0c6ca73598"}, ] [package.dependencies] -pyarrow = ">=8" -pyarrow-hotfix = "*" +pyarrow = ">=16" [package.extras] -devel = ["mypy (>=1.8.0,<1.9.0)", "packaging (>=20)", "pytest", "pytest-benchmark", "pytest-cov", "pytest-mock", "pytest-timeout", "ruff (>=0.3.0,<0.4.0)", "sphinx (<=4.5)", "sphinx-rtd-theme", "toml", "wheel"] +devel = ["azure-storage-blob (==12.20.0)", "mypy (==1.10.1)", "packaging (>=20)", "pytest", "pytest-benchmark", "pytest-cov", "pytest-mock", "pytest-timeout", "ruff (==0.5.2)", "sphinx (<=4.5)", "sphinx-rtd-theme", "toml", "wheel"] pandas = ["pandas"] pyspark = ["delta-spark", "numpy (==1.22.2)", "pyspark"] @@ -4567,17 +4566,17 @@ testing = ["pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", [[package]] name = "lancedb" -version = "0.9.0" +version = "0.13.0b1" description = "lancedb" optional = false -python-versions = ">=3.8" +python-versions = ">=3.9" files = [ - {file = "lancedb-0.9.0-cp38-abi3-macosx_10_15_x86_64.whl", hash = "sha256:b1ca08797c72c93ae512aa1078f1891756da157d910fbae8e194fac3528fc1ac"}, - {file = "lancedb-0.9.0-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:15129791f03c2c04b95f914ced2c1556b43d73a24710207b9af77b6e4008bdeb"}, - {file = "lancedb-0.9.0-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3f093d89447a2039b820d2540a0b64df3024e4549b6808ebd26b44fbe0345cc6"}, - {file = "lancedb-0.9.0-cp38-abi3-manylinux_2_24_aarch64.whl", hash = "sha256:a8c1f6777e217d2277451038866d280fa5fb38bd161795e51703b043c26dd345"}, - {file = "lancedb-0.9.0-cp38-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:78dd5800a1148f89d33b7e98d1c8b1c42dee146f03580abc1ca83cb05273ff7f"}, - {file = "lancedb-0.9.0-cp38-abi3-win_amd64.whl", hash = "sha256:ba5bdc727d3bc131f17414f42372acde5817073feeb553793a3d20003caa1658"}, + {file = "lancedb-0.13.0b1-cp38-abi3-macosx_10_15_x86_64.whl", hash = "sha256:687b9a08be55e6fa9520255b1b06dcd2e6ba6c64c947410821e9a3a52b2f48ec"}, + {file = "lancedb-0.13.0b1-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:ac00684f7e90ffc1b386298670e2c4ddaea8c0b61b6eb1b51dbd4e74feb87a86"}, + {file = "lancedb-0.13.0b1-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bbe8fc15bfeec89b6b2a4a42b4b919b6d3e138cf8684af35f77f361d73fe90cd"}, + {file = "lancedb-0.13.0b1-cp38-abi3-manylinux_2_24_aarch64.whl", hash = "sha256:231e1f00d724c468922f7951d902622d4ccb21c2db2a148b845beaebee5d35b3"}, + {file = "lancedb-0.13.0b1-cp38-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:fecdd71f137e52193bfb5843610f32fe025a60a1edf5f80530704de879706c6b"}, + {file = "lancedb-0.13.0b1-cp38-abi3-win_amd64.whl", hash = "sha256:7852d9c04a4402407af06bbbf78bf339a169f1df2bf5c70da586ca733ec40a68"}, ] [package.dependencies] @@ -4587,7 +4586,7 @@ deprecation = "*" overrides = ">=0.7" packaging = "*" pydantic = ">=1.10" -pylance = "0.13.0" +pylance = "0.16.1" ratelimiter = ">=1.0,<2.0" requests = ">=2.31.0" retry = ">=0.9.2" @@ -4598,8 +4597,8 @@ azure = ["adlfs (>=2024.2.0)"] clip = ["open-clip", "pillow", "torch"] dev = ["pre-commit", "ruff"] docs = ["mkdocs", "mkdocs-jupyter", "mkdocs-material", "mkdocstrings[python]"] -embeddings = ["awscli (>=1.29.57)", "boto3 (>=1.28.57)", "botocore (>=1.31.57)", "cohere", "google-generativeai", "huggingface-hub", "instructorembedding", "ollama", "open-clip-torch", "openai (>=1.6.1)", "pillow", "sentence-transformers", "torch"] -tests = ["aiohttp", "boto3", "duckdb", "pandas (>=1.4)", "polars (>=0.19)", "pytest", "pytest-asyncio", "pytest-mock", 
"pytz", "tantivy"] +embeddings = ["awscli (>=1.29.57)", "boto3 (>=1.28.57)", "botocore (>=1.31.57)", "cohere", "google-generativeai", "huggingface-hub", "ibm-watsonx-ai (>=1.1.2)", "instructorembedding", "ollama", "open-clip-torch", "openai (>=1.6.1)", "pillow", "sentence-transformers", "torch"] +tests = ["aiohttp", "boto3", "duckdb", "pandas (>=1.4)", "polars (>=0.19,<=1.3.0)", "pytest", "pytest-asyncio", "pytest-mock", "pytz", "tantivy"] [[package]] name = "lazy-object-proxy" @@ -6660,63 +6659,52 @@ files = [ [[package]] name = "pyarrow" -version = "14.0.2" +version = "16.1.0" description = "Python library for Apache Arrow" optional = false python-versions = ">=3.8" files = [ - {file = "pyarrow-14.0.2-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:ba9fe808596c5dbd08b3aeffe901e5f81095baaa28e7d5118e01354c64f22807"}, - {file = "pyarrow-14.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:22a768987a16bb46220cef490c56c671993fbee8fd0475febac0b3e16b00a10e"}, - {file = "pyarrow-14.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2dbba05e98f247f17e64303eb876f4a80fcd32f73c7e9ad975a83834d81f3fda"}, - {file = "pyarrow-14.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a898d134d00b1eca04998e9d286e19653f9d0fcb99587310cd10270907452a6b"}, - {file = "pyarrow-14.0.2-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:87e879323f256cb04267bb365add7208f302df942eb943c93a9dfeb8f44840b1"}, - {file = "pyarrow-14.0.2-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:76fc257559404ea5f1306ea9a3ff0541bf996ff3f7b9209fc517b5e83811fa8e"}, - {file = "pyarrow-14.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:b0c4a18e00f3a32398a7f31da47fefcd7a927545b396e1f15d0c85c2f2c778cd"}, - {file = "pyarrow-14.0.2-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:87482af32e5a0c0cce2d12eb3c039dd1d853bd905b04f3f953f147c7a196915b"}, - {file = "pyarrow-14.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:059bd8f12a70519e46cd64e1ba40e97eae55e0cbe1695edd95384653d7626b23"}, - {file = "pyarrow-14.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3f16111f9ab27e60b391c5f6d197510e3ad6654e73857b4e394861fc79c37200"}, - {file = "pyarrow-14.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:06ff1264fe4448e8d02073f5ce45a9f934c0f3db0a04460d0b01ff28befc3696"}, - {file = "pyarrow-14.0.2-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:6dd4f4b472ccf4042f1eab77e6c8bce574543f54d2135c7e396f413046397d5a"}, - {file = "pyarrow-14.0.2-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:32356bfb58b36059773f49e4e214996888eeea3a08893e7dbde44753799b2a02"}, - {file = "pyarrow-14.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:52809ee69d4dbf2241c0e4366d949ba035cbcf48409bf404f071f624ed313a2b"}, - {file = "pyarrow-14.0.2-cp312-cp312-macosx_10_14_x86_64.whl", hash = "sha256:c87824a5ac52be210d32906c715f4ed7053d0180c1060ae3ff9b7e560f53f944"}, - {file = "pyarrow-14.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a25eb2421a58e861f6ca91f43339d215476f4fe159eca603c55950c14f378cc5"}, - {file = "pyarrow-14.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5c1da70d668af5620b8ba0a23f229030a4cd6c5f24a616a146f30d2386fec422"}, - {file = "pyarrow-14.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2cc61593c8e66194c7cdfae594503e91b926a228fba40b5cf25cc593563bcd07"}, - {file = "pyarrow-14.0.2-cp312-cp312-manylinux_2_28_aarch64.whl", hash = 
"sha256:78ea56f62fb7c0ae8ecb9afdd7893e3a7dbeb0b04106f5c08dbb23f9c0157591"}, - {file = "pyarrow-14.0.2-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:37c233ddbce0c67a76c0985612fef27c0c92aef9413cf5aa56952f359fcb7379"}, - {file = "pyarrow-14.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:e4b123ad0f6add92de898214d404e488167b87b5dd86e9a434126bc2b7a5578d"}, - {file = "pyarrow-14.0.2-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:e354fba8490de258be7687f341bc04aba181fc8aa1f71e4584f9890d9cb2dec2"}, - {file = "pyarrow-14.0.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:20e003a23a13da963f43e2b432483fdd8c38dc8882cd145f09f21792e1cf22a1"}, - {file = "pyarrow-14.0.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fc0de7575e841f1595ac07e5bc631084fd06ca8b03c0f2ecece733d23cd5102a"}, - {file = "pyarrow-14.0.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:66e986dc859712acb0bd45601229021f3ffcdfc49044b64c6d071aaf4fa49e98"}, - {file = "pyarrow-14.0.2-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:f7d029f20ef56673a9730766023459ece397a05001f4e4d13805111d7c2108c0"}, - {file = "pyarrow-14.0.2-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:209bac546942b0d8edc8debda248364f7f668e4aad4741bae58e67d40e5fcf75"}, - {file = "pyarrow-14.0.2-cp38-cp38-win_amd64.whl", hash = "sha256:1e6987c5274fb87d66bb36816afb6f65707546b3c45c44c28e3c4133c010a881"}, - {file = "pyarrow-14.0.2-cp39-cp39-macosx_10_14_x86_64.whl", hash = "sha256:a01d0052d2a294a5f56cc1862933014e696aa08cc7b620e8c0cce5a5d362e976"}, - {file = "pyarrow-14.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a51fee3a7db4d37f8cda3ea96f32530620d43b0489d169b285d774da48ca9785"}, - {file = "pyarrow-14.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:64df2bf1ef2ef14cee531e2dfe03dd924017650ffaa6f9513d7a1bb291e59c15"}, - {file = "pyarrow-14.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3c0fa3bfdb0305ffe09810f9d3e2e50a2787e3a07063001dcd7adae0cee3601a"}, - {file = "pyarrow-14.0.2-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:c65bf4fd06584f058420238bc47a316e80dda01ec0dfb3044594128a6c2db794"}, - {file = "pyarrow-14.0.2-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:63ac901baec9369d6aae1cbe6cca11178fb018a8d45068aaf5bb54f94804a866"}, - {file = "pyarrow-14.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:75ee0efe7a87a687ae303d63037d08a48ef9ea0127064df18267252cfe2e9541"}, - {file = "pyarrow-14.0.2.tar.gz", hash = "sha256:36cef6ba12b499d864d1def3e990f97949e0b79400d08b7cf74504ffbd3eb025"}, + {file = "pyarrow-16.1.0-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:17e23b9a65a70cc733d8b738baa6ad3722298fa0c81d88f63ff94bf25eaa77b9"}, + {file = "pyarrow-16.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:4740cc41e2ba5d641071d0ab5e9ef9b5e6e8c7611351a5cb7c1d175eaf43674a"}, + {file = "pyarrow-16.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:98100e0268d04e0eec47b73f20b39c45b4006f3c4233719c3848aa27a03c1aef"}, + {file = "pyarrow-16.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f68f409e7b283c085f2da014f9ef81e885d90dcd733bd648cfba3ef265961848"}, + {file = "pyarrow-16.1.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:a8914cd176f448e09746037b0c6b3a9d7688cef451ec5735094055116857580c"}, + {file = "pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:48be160782c0556156d91adbdd5a4a7e719f8d407cb46ae3bb4eaee09b3111bd"}, + {file = 
"pyarrow-16.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:9cf389d444b0f41d9fe1444b70650fea31e9d52cfcb5f818b7888b91b586efff"}, + {file = "pyarrow-16.1.0-cp311-cp311-macosx_10_15_x86_64.whl", hash = "sha256:d0ebea336b535b37eee9eee31761813086d33ed06de9ab6fc6aaa0bace7b250c"}, + {file = "pyarrow-16.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2e73cfc4a99e796727919c5541c65bb88b973377501e39b9842ea71401ca6c1c"}, + {file = "pyarrow-16.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bf9251264247ecfe93e5f5a0cd43b8ae834f1e61d1abca22da55b20c788417f6"}, + {file = "pyarrow-16.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ddf5aace92d520d3d2a20031d8b0ec27b4395cab9f74e07cc95edf42a5cc0147"}, + {file = "pyarrow-16.1.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:25233642583bf658f629eb230b9bb79d9af4d9f9229890b3c878699c82f7d11e"}, + {file = "pyarrow-16.1.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:a33a64576fddfbec0a44112eaf844c20853647ca833e9a647bfae0582b2ff94b"}, + {file = "pyarrow-16.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:185d121b50836379fe012753cf15c4ba9638bda9645183ab36246923875f8d1b"}, + {file = "pyarrow-16.1.0-cp312-cp312-macosx_10_15_x86_64.whl", hash = "sha256:2e51ca1d6ed7f2e9d5c3c83decf27b0d17bb207a7dea986e8dc3e24f80ff7d6f"}, + {file = "pyarrow-16.1.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:06ebccb6f8cb7357de85f60d5da50e83507954af617d7b05f48af1621d331c9a"}, + {file = "pyarrow-16.1.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b04707f1979815f5e49824ce52d1dceb46e2f12909a48a6a753fe7cafbc44a0c"}, + {file = "pyarrow-16.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0d32000693deff8dc5df444b032b5985a48592c0697cb6e3071a5d59888714e2"}, + {file = "pyarrow-16.1.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:8785bb10d5d6fd5e15d718ee1d1f914fe768bf8b4d1e5e9bf253de8a26cb1628"}, + {file = "pyarrow-16.1.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:e1369af39587b794873b8a307cc6623a3b1194e69399af0efd05bb202195a5a7"}, + {file = "pyarrow-16.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:febde33305f1498f6df85e8020bca496d0e9ebf2093bab9e0f65e2b4ae2b3444"}, + {file = "pyarrow-16.1.0-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:b5f5705ab977947a43ac83b52ade3b881eb6e95fcc02d76f501d549a210ba77f"}, + {file = "pyarrow-16.1.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:0d27bf89dfc2576f6206e9cd6cf7a107c9c06dc13d53bbc25b0bd4556f19cf5f"}, + {file = "pyarrow-16.1.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0d07de3ee730647a600037bc1d7b7994067ed64d0eba797ac74b2bc77384f4c2"}, + {file = "pyarrow-16.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fbef391b63f708e103df99fbaa3acf9f671d77a183a07546ba2f2c297b361e83"}, + {file = "pyarrow-16.1.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:19741c4dbbbc986d38856ee7ddfdd6a00fc3b0fc2d928795b95410d38bb97d15"}, + {file = "pyarrow-16.1.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:f2c5fb249caa17b94e2b9278b36a05ce03d3180e6da0c4c3b3ce5b2788f30eed"}, + {file = "pyarrow-16.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:e6b6d3cd35fbb93b70ade1336022cc1147b95ec6af7d36906ca7fe432eb09710"}, + {file = "pyarrow-16.1.0-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:18da9b76a36a954665ccca8aa6bd9f46c1145f79c0bb8f4f244f5f8e799bca55"}, + {file = "pyarrow-16.1.0-cp39-cp39-macosx_11_0_arm64.whl", hash = 
"sha256:99f7549779b6e434467d2aa43ab2b7224dd9e41bdde486020bae198978c9e05e"}, + {file = "pyarrow-16.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f07fdffe4fd5b15f5ec15c8b64584868d063bc22b86b46c9695624ca3505b7b4"}, + {file = "pyarrow-16.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ddfe389a08ea374972bd4065d5f25d14e36b43ebc22fc75f7b951f24378bf0b5"}, + {file = "pyarrow-16.1.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:3b20bd67c94b3a2ea0a749d2a5712fc845a69cb5d52e78e6449bbd295611f3aa"}, + {file = "pyarrow-16.1.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:ba8ac20693c0bb0bf4b238751d4409e62852004a8cf031c73b0e0962b03e45e3"}, + {file = "pyarrow-16.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:31a1851751433d89a986616015841977e0a188662fcffd1a5677453f1df2de0a"}, + {file = "pyarrow-16.1.0.tar.gz", hash = "sha256:15fbb22ea96d11f0b5768504a3f961edab25eaf4197c341720c4a387f6c60315"}, ] [package.dependencies] numpy = ">=1.16.6" -[[package]] -name = "pyarrow-hotfix" -version = "0.6" -description = "" -optional = true -python-versions = ">=3.5" -files = [ - {file = "pyarrow_hotfix-0.6-py3-none-any.whl", hash = "sha256:dcc9ae2d220dff0083be6a9aa8e0cdee5182ad358d4931fce825c545e5c89178"}, - {file = "pyarrow_hotfix-0.6.tar.gz", hash = "sha256:79d3e030f7ff890d408a100ac16d6f00b14d44a502d7897cd9fc3e3a534e9945"}, -] - [[package]] name = "pyasn1" version = "0.5.0" @@ -6993,22 +6981,22 @@ tests = ["coverage[toml] (==5.0.4)", "pytest (>=6.0.0,<7.0.0)"] [[package]] name = "pylance" -version = "0.13.0" +version = "0.16.1" description = "python wrapper for Lance columnar format" optional = false python-versions = ">=3.9" files = [ - {file = "pylance-0.13.0-cp39-abi3-macosx_10_15_x86_64.whl", hash = "sha256:2f3d6f9eec1f59f45dccb01075ba79868b8d37c8371d6210bcf6418217a0dd8b"}, - {file = "pylance-0.13.0-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:f4861ab466c94b0f9a4b4e6de6e1dfa02f40e7242d8db87447bc7bb7d89606ac"}, - {file = "pylance-0.13.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e3cb92547e145f5bfb0ea7d6f483953913b9bdd44c45bea84fc95a18da9f5853"}, - {file = "pylance-0.13.0-cp39-abi3-manylinux_2_24_aarch64.whl", hash = "sha256:d1ddd7700924bc6b6b0774ea63d2aa23f9210a86cd6d6af0cdfa987df776d50d"}, - {file = "pylance-0.13.0-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:c51d4b6e59cf4dc97c11a35b299f11e80dbdf392e2d8dc498573c26474a3c19e"}, - {file = "pylance-0.13.0-cp39-abi3-win_amd64.whl", hash = "sha256:4018ba016f1445874960a4ba2ad5c80cb380f3116683282ee8beabd38fa8989d"}, + {file = "pylance-0.16.1-cp39-abi3-macosx_10_15_x86_64.whl", hash = "sha256:7092303ae21bc162edd98e20fc39785fa1ec6b67f04132977ac0fd63110ba16f"}, + {file = "pylance-0.16.1-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:7c2ebdf89928c68f053ab9e369a5477da0a2ba70d47c00075dc10a37039d9e90"}, + {file = "pylance-0.16.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b4525c2fd8095830b753a3efb7285f358b016836086683fe977f9f1de8e6866c"}, + {file = "pylance-0.16.1-cp39-abi3-manylinux_2_24_aarch64.whl", hash = "sha256:645f0ab338bc4bd42bf3321bbb4053261979117aefd8477c2192ba624de27778"}, + {file = "pylance-0.16.1-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:3a7464d60aca51e89196a79c638bcbff0bddb77158946e2ea6b5fcbc6cfc63e1"}, + {file = "pylance-0.16.1-cp39-abi3-win_amd64.whl", hash = "sha256:d12c628dfbd49efde15a5512247065341f3efb29989dd08fb5a7023f013471ee"}, ] [package.dependencies] -numpy = ">=1.22" -pyarrow = ">=12,<15.0.1" +numpy 
= ">=1.22,<2" +pyarrow = ">=12" [package.extras] benchmarks = ["pytest-benchmark"] @@ -9696,4 +9684,4 @@ weaviate = ["weaviate-client"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<3.13" -content-hash = "a64fdd2845d27c9abc344809be68cba08f46641aabdc07416c37c802450fe4f3" +content-hash = "2b8d00f91f33a380b2399989dcac0d1d106d0bd2cd8865c5b7e27a19885753b5" diff --git a/pyproject.toml b/pyproject.toml index f33bbbefcf..74161f5ccc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -80,7 +80,7 @@ databricks-sql-connector = {version = ">=2.9.3", optional = true} clickhouse-driver = { version = ">=0.2.7", optional = true } clickhouse-connect = { version = ">=0.7.7", optional = true } lancedb = { version = ">=0.8.2", optional = true, markers = "python_version >= '3.9'", allow-prereleases = true } -deltalake = { version = ">=0.17.4", optional = true } +deltalake = { version = ">=0.19.0", optional = true } [tool.poetry.extras] gcp = ["grpcio", "google-cloud-bigquery", "db-dtypes", "gcsfs"] diff --git a/tests/libs/test_deltalake.py b/tests/libs/test_deltalake.py index 3e2d7cc3f6..dc5586eb32 100644 --- a/tests/libs/test_deltalake.py +++ b/tests/libs/test_deltalake.py @@ -95,21 +95,9 @@ def arrow_data( # type: ignore[return] client = cast(FilesystemClient, client) storage_options = _deltalake_storage_options(client.config) - with pytest.raises(Exception): - # bug in `delta-rs` causes error when writing big decimal values - # https://github.com/delta-io/delta-rs/issues/2510 - # if this test fails, the bug has been fixed and we should remove this - # note from the docs: - write_delta_table( - remote_dir + "/corrupt_delta_table", - arrow_table_all_data_types("arrow-table", include_decimal_default_precision=True)[0], - write_disposition="append", - storage_options=storage_options, - ) - arrow_table = arrow_table_all_data_types( "arrow-table", - include_decimal_default_precision=False, + include_decimal_default_precision=True, include_decimal_arrow_max_precision=True, num_rows=2, )[0] diff --git a/tests/load/pipeline/test_filesystem_pipeline.py b/tests/load/pipeline/test_filesystem_pipeline.py index 4b8707e989..d88eba7c06 100644 --- a/tests/load/pipeline/test_filesystem_pipeline.py +++ b/tests/load/pipeline/test_filesystem_pipeline.py @@ -15,7 +15,7 @@ from dlt.common.storages.configuration import FilesystemConfiguration from dlt.common.storages.load_package import ParsedLoadJobFileName from dlt.common.utils import uniq_id -from dlt.common.exceptions import DependencyVersionException +from dlt.common.schema.typing import TWriteDisposition from dlt.destinations import filesystem from dlt.destinations.impl.filesystem.filesystem import FilesystemClient from dlt.destinations.impl.filesystem.typing import TExtraPlaceholders @@ -580,6 +580,103 @@ def two_part(): @pytest.mark.essential +@pytest.mark.parametrize( + "destination_config", + destinations_configs( + table_format_filesystem_configs=True, + table_format="delta", + bucket_subset=(FILE_BUCKET), + ), + ids=lambda x: x.name, +) +@pytest.mark.parametrize( + "write_disposition", + ( + "append", + "replace", + pytest.param({"disposition": "merge", "strategy": "upsert"}, id="upsert"), + ), +) +def test_delta_table_schema_evolution( + destination_config: DestinationTestConfiguration, + write_disposition: TWriteDisposition, +) -> None: + """Tests schema evolution (adding new columns) for `delta` table format.""" + from dlt.common.libs.deltalake import get_delta_tables, ensure_delta_compatible_arrow_data + from dlt.common.libs.pyarrow import pyarrow + + 
@dlt.resource(
+        write_disposition=write_disposition,
+        primary_key="pk",
+        table_format="delta",
+    )
+    def delta_table(data):
+        yield data
+
+    pipeline = destination_config.setup_pipeline("fs_pipe", dev_mode=True)
+
+    # create Arrow table with one column, one row
+    pk_field = pyarrow.field("pk", pyarrow.int64(), nullable=False)
+    schema = pyarrow.schema([pk_field])
+    arrow_table = pyarrow.Table.from_pydict({"pk": [1]}, schema=schema)
+    assert arrow_table.shape == (1, 1)
+
+    # initial load
+    info = pipeline.run(delta_table(arrow_table))
+    assert_load_info(info)
+    dt = get_delta_tables(pipeline, "delta_table")["delta_table"]
+    expected = ensure_delta_compatible_arrow_data(arrow_table)
+    actual = dt.to_pyarrow_table()
+    assert actual.equals(expected)
+
+    # create Arrow table with many columns, two rows
+    arrow_table = arrow_table_all_data_types(
+        "arrow-table",
+        include_decimal_default_precision=True,
+        include_decimal_arrow_max_precision=True,
+        include_not_normalized_name=False,
+        include_null=False,
+        num_rows=2,
+    )[0]
+    arrow_table = arrow_table.add_column(0, pk_field, [[1, 2]])
+
+    # second load — this should evolve the schema (i.e. add the new columns)
+    info = pipeline.run(delta_table(arrow_table))
+    assert_load_info(info)
+    dt = get_delta_tables(pipeline, "delta_table")["delta_table"]
+    actual = dt.to_pyarrow_table()
+    expected = ensure_delta_compatible_arrow_data(arrow_table)
+    if write_disposition == "append":
+        # just check shape and schema for `append`, because table comparison is
+        # more involved than with the other dispositions
+        assert actual.num_rows == 3
+        assert actual.schema.equals(expected.schema)
+    else:
+        assert actual.sort_by("pk").equals(expected.sort_by("pk"))
+
+    # create empty Arrow table with additional column
+    arrow_table = arrow_table.append_column(
+        pyarrow.field("another_new_column", pyarrow.string()),
+        [["foo", "foo"]],
+    )
+    empty_arrow_table = arrow_table.schema.empty_table()
+
+    # load 3 — this should evolve the schema without changing data
+    info = pipeline.run(delta_table(empty_arrow_table))
+    assert_load_info(info)
+    dt = get_delta_tables(pipeline, "delta_table")["delta_table"]
+    actual = dt.to_pyarrow_table()
+    expected_schema = ensure_delta_compatible_arrow_data(arrow_table).schema
+    assert actual.schema.equals(expected_schema)
+    expected_num_rows = 3 if write_disposition == "append" else 2
+    assert actual.num_rows == expected_num_rows
+    # new column should have NULLs only
+    assert (
+        actual.column("another_new_column").combine_chunks().to_pylist()
+        == [None] * expected_num_rows
+    )
+
+
 @pytest.mark.parametrize(
     "destination_config",
     destinations_configs(
@@ -607,7 +704,7 @@ def delta_table(data):
     # create empty Arrow table with schema
     arrow_table = arrow_table_all_data_types(
         "arrow-table",
-        include_decimal_default_precision=False,
+        include_decimal_default_precision=True,
         include_decimal_arrow_max_precision=True,
         include_not_normalized_name=False,
         include_null=False,
@@ -643,22 +740,6 @@ def delta_table(data):
         ensure_delta_compatible_arrow_data(empty_arrow_table).schema
     )
 
-    # run 3: empty Arrow table with different schema
-    # this should not alter the Delta table
-    empty_arrow_table_2 = pa.schema(
-        [pa.field("foo", pa.int64()), pa.field("bar", pa.string())]
-    ).empty_table()
-
-    info = pipeline.run(delta_table(empty_arrow_table_2))
-    assert_load_info(info)
-    dt = get_delta_tables(pipeline, "delta_table")["delta_table"]
-    assert dt.version() == 1  # still 1, no new commit was done
-    dt_arrow_table = dt.to_pyarrow_table()
-    assert dt_arrow_table.shape == (2, empty_arrow_table.num_columns)  # shape did not change
-    assert dt_arrow_table.schema.equals(  # schema did not change
-        ensure_delta_compatible_arrow_data(empty_arrow_table).schema
-    )
-
     # test `dlt.mark.materialize_table_schema()`
     users_materialize_table_schema.apply_hints(table_format="delta")
     info = pipeline.run(users_materialize_table_schema())
@@ -810,6 +891,22 @@ def parent_delta():
     with pytest.raises(ValueError):
         get_delta_tables(pipeline, "non_existing_table")
 
+    # test unknown schema
+    with pytest.raises(FileNotFoundError):
+        get_delta_tables(pipeline, "non_existing_table", schema_name="aux_2")
+
+    # load to a new schema and under new name
+    aux_schema = dlt.Schema("aux_2")
+    # NOTE: you cannot have a file with name
+    info = pipeline.run(parent_delta().with_name("aux_delta"), schema=aux_schema)
+    # state is also loaded in a separate package
+    assert_load_info(info, expected_load_packages=2)
+    delta_tables = get_delta_tables(pipeline, schema_name="aux_2")
+    assert "aux_delta__child" in delta_tables.keys()
+    get_delta_tables(pipeline, "aux_delta", schema_name="aux_2")
+    with pytest.raises(ValueError):
+        get_delta_tables(pipeline, "aux_delta")
+
 
 @pytest.mark.parametrize(
     "destination_config",
diff --git a/tests/pipeline/utils.py b/tests/pipeline/utils.py
index d3d87f0e0b..dfb5f3f82d 100644
--- a/tests/pipeline/utils.py
+++ b/tests/pipeline/utils.py
@@ -177,24 +177,27 @@ def _load_file(client: FSClientBase, filepath) -> List[Dict[str, Any]]:
 #
 
 
-def _load_tables_to_dicts_fs(p: dlt.Pipeline, *table_names: str) -> Dict[str, List[Dict[str, Any]]]:
+def _load_tables_to_dicts_fs(
+    p: dlt.Pipeline, *table_names: str, schema_name: str = None
+) -> Dict[str, List[Dict[str, Any]]]:
     """For now this will expect the standard layout in the filesystem destination, if changed
     the results will not be correct"""
-    client = p._fs_client()
+    client = p._fs_client(schema_name=schema_name)
+    assert isinstance(client, FilesystemClient)
+
     result: Dict[str, Any] = {}
     delta_table_names = [
         table_name
        for table_name in table_names
-        if get_table_format(p.default_schema.tables, table_name) == "delta"
+        if get_table_format(client.schema.tables, table_name) == "delta"
     ]
     if len(delta_table_names) > 0:
         from dlt.common.libs.deltalake import get_delta_tables
 
-        delta_tables = get_delta_tables(p, *table_names)
+        delta_tables = get_delta_tables(p, *table_names, schema_name=schema_name)
 
     for table_name in table_names:
-        if table_name in p.default_schema.data_table_names() and table_name in delta_table_names:
-            assert isinstance(client, FilesystemClient)
+        if table_name in client.schema.data_table_names() and table_name in delta_table_names:
             dt = delta_tables[table_name]
             result[table_name] = dt.to_pyarrow_table().to_pylist()
         else:
@@ -244,7 +247,7 @@ def _sort_list_of_dicts(list_: List[Dict[str, Any]], sortkey: str) -> List[Dict[
        return sorted(list_, key=lambda d: d[sortkey])
 
     if _is_filesystem(p):
-        result = _load_tables_to_dicts_fs(p, *table_names)
+        result = _load_tables_to_dicts_fs(p, *table_names, schema_name=schema_name)
     else:
         result = _load_tables_to_dicts_sql(p, *table_names, schema_name=schema_name)
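
Usage sketch (not part of the patch): a minimal example of the `schema_name` argument that this change adds to `get_delta_tables`, assuming a pipeline that has already loaded `delta` tables to a filesystem destination; the pipeline, dataset, table, and schema names are hypothetical.

import dlt
from dlt.common.libs.deltalake import get_delta_tables

# hypothetical pipeline with a filesystem destination and delta-format tables
pipeline = dlt.pipeline("my_pipeline", destination="filesystem", dataset_name="my_dataset")

# Delta tables from the default schema (previous behavior, unchanged)
tables = get_delta_tables(pipeline)

# Delta tables from an explicitly named schema; a ValueError listing the
# available schemas is raised if a requested table is not a Delta table there
aux_tables = get_delta_tables(pipeline, "my_table", schema_name="aux")
aux_tables["my_table"].optimize.compact()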