diff --git a/.github/workflows/test_destination_athena.yml b/.github/workflows/test_destination_athena.yml index 1169fab0de..03eb7f9434 100644 --- a/.github/workflows/test_destination_athena.yml +++ b/.github/workflows/test_destination_athena.yml @@ -67,7 +67,7 @@ jobs: - name: Install dependencies # if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' - run: poetry install --no-interaction -E athena --with sentry-sdk --with pipeline + run: poetry install --no-interaction -E athena --with sentry-sdk --with pipeline,ibis - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml diff --git a/.github/workflows/test_destination_athena_iceberg.yml b/.github/workflows/test_destination_athena_iceberg.yml index 7ccefcc055..3412e789e3 100644 --- a/.github/workflows/test_destination_athena_iceberg.yml +++ b/.github/workflows/test_destination_athena_iceberg.yml @@ -67,7 +67,7 @@ jobs: - name: Install dependencies # if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' - run: poetry install --no-interaction -E athena --with sentry-sdk --with pipeline + run: poetry install --no-interaction -E athena --with sentry-sdk --with pipeline,ibis - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml diff --git a/.github/workflows/test_destination_bigquery.yml b/.github/workflows/test_destination_bigquery.yml index 7afc9b8a00..eb8b63f757 100644 --- a/.github/workflows/test_destination_bigquery.yml +++ b/.github/workflows/test_destination_bigquery.yml @@ -66,7 +66,7 @@ jobs: - name: Install dependencies # if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' - run: poetry install --no-interaction -E bigquery --with providers -E parquet --with sentry-sdk --with pipeline + run: poetry install --no-interaction -E bigquery --with providers -E parquet --with sentry-sdk --with pipeline,ibis - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml diff --git a/.github/workflows/test_destination_clickhouse.yml b/.github/workflows/test_destination_clickhouse.yml index 7f297db971..46464ea462 100644 --- a/.github/workflows/test_destination_clickhouse.yml +++ b/.github/workflows/test_destination_clickhouse.yml @@ -61,7 +61,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-gcp - name: Install dependencies - run: poetry install --no-interaction -E clickhouse --with providers -E parquet --with sentry-sdk --with pipeline + run: poetry install --no-interaction -E clickhouse --with providers -E parquet --with sentry-sdk --with pipeline,ibis - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml diff --git a/.github/workflows/test_destination_databricks.yml b/.github/workflows/test_destination_databricks.yml index 1656fe27f4..c1609de863 100644 --- a/.github/workflows/test_destination_databricks.yml +++ b/.github/workflows/test_destination_databricks.yml @@ -64,7 +64,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-gcp - name: Install dependencies - run: poetry install --no-interaction -E databricks -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline + run: poetry install --no-interaction -E databricks -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline,ibis - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml diff --git 
a/.github/workflows/test_destination_dremio.yml b/.github/workflows/test_destination_dremio.yml index 45c6d17db1..4bc48c54db 100644 --- a/.github/workflows/test_destination_dremio.yml +++ b/.github/workflows/test_destination_dremio.yml @@ -65,7 +65,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-gcp - name: Install dependencies - run: poetry install --no-interaction -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline + run: poetry install --no-interaction -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline,ibis - run: | poetry run pytest tests/load --ignore tests/load/sources diff --git a/.github/workflows/test_destination_motherduck.yml b/.github/workflows/test_destination_motherduck.yml index 0014b17655..db81131266 100644 --- a/.github/workflows/test_destination_motherduck.yml +++ b/.github/workflows/test_destination_motherduck.yml @@ -64,7 +64,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-motherduck - name: Install dependencies - run: poetry install --no-interaction -E motherduck -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline + run: poetry install --no-interaction -E motherduck -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline,ibis - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml diff --git a/.github/workflows/test_destination_mssql.yml b/.github/workflows/test_destination_mssql.yml index 8b899e7da2..6fdd7a5bc5 100644 --- a/.github/workflows/test_destination_mssql.yml +++ b/.github/workflows/test_destination_mssql.yml @@ -69,7 +69,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-gcp - name: Install dependencies - run: poetry install --no-interaction -E mssql -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline + run: poetry install --no-interaction -E mssql -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline,ibis - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml diff --git a/.github/workflows/test_destination_snowflake.yml b/.github/workflows/test_destination_snowflake.yml index a720c479bd..73a2a8f6e7 100644 --- a/.github/workflows/test_destination_snowflake.yml +++ b/.github/workflows/test_destination_snowflake.yml @@ -64,7 +64,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-gcp - name: Install dependencies - run: poetry install --no-interaction -E snowflake -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline + run: poetry install --no-interaction -E snowflake -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline,ibis - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml diff --git a/.github/workflows/test_destination_synapse.yml b/.github/workflows/test_destination_synapse.yml index be1b493916..8f6bf1eb29 100644 --- a/.github/workflows/test_destination_synapse.yml +++ b/.github/workflows/test_destination_synapse.yml @@ -67,7 +67,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-gcp - name: Install dependencies - run: poetry install --no-interaction -E synapse -E parquet --with sentry-sdk --with pipeline + run: poetry install --no-interaction -E synapse -E parquet --with sentry-sdk --with pipeline,ibis - name: 
create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml diff --git a/.github/workflows/test_destinations.yml b/.github/workflows/test_destinations.yml index 933248d994..a9306c2f9c 100644 --- a/.github/workflows/test_destinations.yml +++ b/.github/workflows/test_destinations.yml @@ -77,8 +77,10 @@ jobs: # key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-redshift - name: Install dependencies - # if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' - run: poetry install --no-interaction -E redshift -E postgis -E postgres -E gs -E s3 -E az -E parquet -E duckdb -E cli -E filesystem --with sentry-sdk --with pipeline -E deltalake + run: poetry install --no-interaction -E redshift -E postgis -E postgres -E gs -E s3 -E az -E parquet -E duckdb -E cli -E filesystem --with sentry-sdk --with pipeline,ibis -E deltalake -E pyiceberg + + - name: Upgrade sqlalchemy + run: poetry run pip install sqlalchemy==2.0.18 # minimum version required by `pyiceberg` - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml diff --git a/.github/workflows/test_local_destinations.yml b/.github/workflows/test_local_destinations.yml index 4947a46a3b..706bae1b0c 100644 --- a/.github/workflows/test_local_destinations.yml +++ b/.github/workflows/test_local_destinations.yml @@ -95,7 +95,10 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-local-destinations - name: Install dependencies - run: poetry install --no-interaction -E postgres -E postgis -E duckdb -E parquet -E filesystem -E cli -E weaviate -E qdrant -E sftp --with sentry-sdk --with pipeline -E deltalake + run: poetry install --no-interaction -E postgres -E postgis -E duckdb -E parquet -E filesystem -E cli -E weaviate -E qdrant -E sftp --with sentry-sdk --with pipeline,ibis -E deltalake -E pyiceberg + + - name: Upgrade sqlalchemy + run: poetry run pip install sqlalchemy==2.0.18 # minimum version required by `pyiceberg` - name: Start SFTP server run: docker compose -f "tests/load/filesystem_sftp/docker-compose.yml" up -d diff --git a/.github/workflows/test_sqlalchemy_destinations.yml b/.github/workflows/test_sqlalchemy_destinations.yml index c2572b322d..1f00373674 100644 --- a/.github/workflows/test_sqlalchemy_destinations.yml +++ b/.github/workflows/test_sqlalchemy_destinations.yml @@ -86,7 +86,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-local-destinations - name: Install dependencies - run: poetry install --no-interaction -E parquet -E filesystem -E sqlalchemy -E cli --with sentry-sdk --with pipeline && poetry run pip install mysqlclient && poetry run pip install "sqlalchemy==${{ matrix.sqlalchemy }}" + run: poetry install --no-interaction -E parquet -E filesystem -E sqlalchemy -E cli --with sentry-sdk --with pipeline,ibis && poetry run pip install mysqlclient && poetry run pip install "sqlalchemy==${{ matrix.sqlalchemy }}" - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml diff --git a/Makefile b/Makefile index 2a7f6dac0a..975a8a42da 100644 --- a/Makefile +++ b/Makefile @@ -44,7 +44,7 @@ has-poetry: poetry --version dev: has-poetry - poetry install --all-extras --with docs,providers,pipeline,sources,sentry-sdk,airflow + poetry install --all-extras --with docs,providers,pipeline,sources,sentry-sdk lint: ./tools/check-package.sh @@ -63,7 +63,6 @@ 
format: lint-snippets: cd docs/tools && poetry run python check_embedded_snippets.py full - lint-and-test-snippets: lint-snippets poetry run mypy --config-file mypy.ini docs/website docs/tools --exclude docs/tools/lint_setup --exclude docs/website/docs_processed poetry run flake8 --max-line-length=200 docs/website docs/tools --exclude docs/website/.dlt-repo @@ -82,7 +81,7 @@ lint-security: poetry run bandit -r dlt/ -n 3 -l test: - (set -a && . tests/.env && poetry run pytest tests) + poetry run pytest tests test-load-local: DESTINATION__POSTGRES__CREDENTIALS=postgresql://loader:loader@localhost:5432/dlt_data DESTINATION__DUCKDB__CREDENTIALS=duckdb:///_storage/test_quack.duckdb poetry run pytest tests -k '(postgres or duckdb)' diff --git a/dlt/__init__.py b/dlt/__init__.py index e8a1b7bf92..328817efd2 100644 --- a/dlt/__init__.py +++ b/dlt/__init__.py @@ -42,7 +42,6 @@ ) from dlt.pipeline import progress from dlt import destinations -from dlt.destinations.dataset import dataset as _dataset pipeline = _pipeline current = _current @@ -80,7 +79,6 @@ "TCredentials", "sources", "destinations", - "_dataset", ] # verify that no injection context was created diff --git a/dlt/cli/command_wrappers.py b/dlt/cli/command_wrappers.py index 0e6491688e..847b5daabb 100644 --- a/dlt/cli/command_wrappers.py +++ b/dlt/cli/command_wrappers.py @@ -43,14 +43,14 @@ def init_command_wrapper( destination_type: str, repo_location: str, branch: str, - omit_core_sources: bool = False, + eject_source: bool = False, ) -> None: init_command( source_name, destination_type, repo_location, branch, - omit_core_sources, + eject_source, ) diff --git a/dlt/cli/init_command.py b/dlt/cli/init_command.py index ac8adcc588..e81fa80c36 100644 --- a/dlt/cli/init_command.py +++ b/dlt/cli/init_command.py @@ -157,7 +157,7 @@ def _list_core_sources() -> Dict[str, SourceConfiguration]: sources: Dict[str, SourceConfiguration] = {} for source_name in files_ops.get_sources_names(core_sources_storage, source_type="core"): sources[source_name] = files_ops.get_core_source_configuration( - core_sources_storage, source_name + core_sources_storage, source_name, eject_source=False ) return sources @@ -295,7 +295,7 @@ def init_command( destination_type: str, repo_location: str, branch: str = None, - omit_core_sources: bool = False, + eject_source: bool = False, ) -> None: # try to import the destination and get config spec destination_reference = Destination.from_reference(destination_type) @@ -310,13 +310,9 @@ def init_command( # discover type of source source_type: files_ops.TSourceType = "template" - if ( - source_name in files_ops.get_sources_names(core_sources_storage, source_type="core") - ) and not omit_core_sources: + if source_name in files_ops.get_sources_names(core_sources_storage, source_type="core"): source_type = "core" else: - if omit_core_sources: - fmt.echo("Omitting dlt core sources.") verified_sources_storage = _clone_and_get_verified_sources_storage(repo_location, branch) if source_name in files_ops.get_sources_names( verified_sources_storage, source_type="verified" @@ -380,7 +376,7 @@ def init_command( else: if source_type == "core": source_configuration = files_ops.get_core_source_configuration( - core_sources_storage, source_name + core_sources_storage, source_name, eject_source ) from importlib.metadata import Distribution @@ -392,6 +388,9 @@ def init_command( if canonical_source_name in extras: source_configuration.requirements.update_dlt_extras(canonical_source_name) + + # create remote modified index to copy files when 
ejecting + remote_modified = {file_name: None for file_name in source_configuration.files} else: if not is_valid_schema_name(source_name): raise InvalidSchemaName(source_name) @@ -536,11 +535,17 @@ def init_command( "Creating a new pipeline with the dlt core source %s (%s)" % (fmt.bold(source_name), source_configuration.doc) ) - fmt.echo( - "NOTE: Beginning with dlt 1.0.0, the source %s will no longer be copied from the" - " verified sources repo but imported from dlt.sources. You can provide the" - " --omit-core-sources flag to revert to the old behavior." % (fmt.bold(source_name)) - ) + if eject_source: + fmt.echo( + "NOTE: Source code of %s will be ejected. Remember to modify the pipeline " + "example script to import the ejected source." % (fmt.bold(source_name)) + ) + else: + fmt.echo( + "NOTE: Beginning with dlt 1.0.0, the source %s will no longer be copied from" + " the verified sources repo but imported from dlt.sources. You can provide the" + " --eject flag to revert to the old behavior." % (fmt.bold(source_name)) + ) elif source_configuration.source_type == "verified": fmt.echo( "Creating and configuring a new pipeline with the verified source %s (%s)" diff --git a/dlt/cli/pipeline_files.py b/dlt/cli/pipeline_files.py index b6f8f85271..c0139fe2a7 100644 --- a/dlt/cli/pipeline_files.py +++ b/dlt/cli/pipeline_files.py @@ -226,11 +226,31 @@ def get_template_configuration( ) +def _get_source_files(sources_storage: FileStorage, source_name: str) -> List[str]: + """Get all files that belong to source `source_name`""" + files: List[str] = [] + for root, subdirs, _files in os.walk(sources_storage.make_full_path(source_name)): + # filter unwanted files + for subdir in list(subdirs): + if any(fnmatch.fnmatch(subdir, ignore) for ignore in IGNORE_FILES): + subdirs.remove(subdir) + rel_root = sources_storage.to_relative_path(root) + files.extend( + [ + os.path.join(rel_root, file) + for file in _files + if all(not fnmatch.fnmatch(file, ignore) for ignore in IGNORE_FILES) + ] + ) + return files + + def get_core_source_configuration( - sources_storage: FileStorage, source_name: str + sources_storage: FileStorage, source_name: str, eject_source: bool ) -> SourceConfiguration: src_pipeline_file = CORE_SOURCE_TEMPLATE_MODULE_NAME + "/" + source_name + PIPELINE_FILE_SUFFIX dest_pipeline_file = source_name + PIPELINE_FILE_SUFFIX + files: List[str] = _get_source_files(sources_storage, source_name) if eject_source else [] return SourceConfiguration( "core", @@ -238,7 +258,7 @@ def get_core_source_configuration( sources_storage, src_pipeline_file, dest_pipeline_file, - [".gitignore"], + files, SourceRequirements([]), _get_docstring_for_module(sources_storage, source_name), False, @@ -259,21 +279,7 @@ def get_verified_source_configuration( f"Pipeline example script {example_script} could not be found in the repository", source_name, ) - # get all files recursively - files: List[str] = [] - for root, subdirs, _files in os.walk(sources_storage.make_full_path(source_name)): - # filter unwanted files - for subdir in list(subdirs): - if any(fnmatch.fnmatch(subdir, ignore) for ignore in IGNORE_FILES): - subdirs.remove(subdir) - rel_root = sources_storage.to_relative_path(root) - files.extend( - [ - os.path.join(rel_root, file) - for file in _files - if all(not fnmatch.fnmatch(file, ignore) for ignore in IGNORE_FILES) - ] - ) + files = _get_source_files(sources_storage, source_name) # read requirements requirements_path = os.path.join(source_name, utils.REQUIREMENTS_TXT) if 
sources_storage.has_file(requirements_path): diff --git a/dlt/cli/plugins.py b/dlt/cli/plugins.py index cc2d4594b9..1712efbbd7 100644 --- a/dlt/cli/plugins.py +++ b/dlt/cli/plugins.py @@ -84,14 +84,10 @@ def configure_parser(self, parser: argparse.ArgumentParser) -> None: ) parser.add_argument( - "--omit-core-sources", + "--eject", default=False, action="store_true", - help=( - "When present, will not create the new pipeline with a core source of the given" - " name but will take a source of this name from the default or provided" - " location." - ), + help="Ejects the source code of the core source like sql_database", ) def execute(self, args: argparse.Namespace) -> None: @@ -107,7 +103,7 @@ def execute(self, args: argparse.Namespace) -> None: args.destination, args.location, args.branch, - args.omit_core_sources, + args.eject, ) diff --git a/dlt/cli/source_detection.py b/dlt/cli/source_detection.py index 7067f8b896..0769605d01 100644 --- a/dlt/cli/source_detection.py +++ b/dlt/cli/source_detection.py @@ -29,8 +29,7 @@ def find_call_arguments_to_replace( if not isinstance(dn_node, ast.Constant) or not isinstance(dn_node.value, str): raise CliCommandInnerException( "init", - f"The pipeline script {init_script_name} must pass the {t_arg_name} as" - f" string to '{arg_name}' function in line {dn_node.lineno}", + f"The pipeline script {init_script_name} must pass the {t_arg_name} as string to '{arg_name}' function in line {dn_node.lineno}", # type: ignore[attr-defined] ) else: transformed_nodes.append((dn_node, ast.Constant(value=t_value, kind=None))) diff --git a/dlt/common/configuration/providers/toml.py b/dlt/common/configuration/providers/toml.py index 3636565fae..e586fef225 100644 --- a/dlt/common/configuration/providers/toml.py +++ b/dlt/common/configuration/providers/toml.py @@ -124,6 +124,12 @@ def _read_google_colab_secrets(self, name: str, file_name: str) -> tomlkit.TOMLD """Try to load the toml from google colab userdata object""" try: from google.colab import userdata + from dlt.common.runtime.exec_info import is_notebook + + # make sure we work in interactive mode (get_ipython() is available) + # when dlt cli is run, userdata is available but without a kernel + if not is_notebook(): + return None try: return tomlkit.loads(userdata.get(file_name)) diff --git a/dlt/common/configuration/specs/aws_credentials.py b/dlt/common/configuration/specs/aws_credentials.py index 5f69be6a33..a75cd85225 100644 --- a/dlt/common/configuration/specs/aws_credentials.py +++ b/dlt/common/configuration/specs/aws_credentials.py @@ -8,6 +8,7 @@ CredentialsWithDefault, configspec, ) +from dlt.common.configuration.specs.mixins import WithObjectStoreRsCredentials, WithPyicebergConfig from dlt.common.configuration.specs.exceptions import ( InvalidBoto3Session, ObjectStoreRsCredentialsException, @@ -16,7 +17,9 @@ @configspec -class AwsCredentialsWithoutDefaults(CredentialsConfiguration): +class AwsCredentialsWithoutDefaults( + CredentialsConfiguration, WithObjectStoreRsCredentials, WithPyicebergConfig +): # credentials without boto implementation aws_access_key_id: str = None aws_secret_access_key: TSecretStrValue = None @@ -77,6 +80,16 @@ def to_object_store_rs_credentials(self) -> Dict[str, str]: return creds + def to_pyiceberg_fileio_config(self) -> Dict[str, Any]: + return { + "s3.access-key-id": self.aws_access_key_id, + "s3.secret-access-key": self.aws_secret_access_key, + "s3.session-token": self.aws_session_token, + "s3.region": self.region_name, + "s3.endpoint": self.endpoint_url, + 
"s3.connect-timeout": 300, + } + @configspec class AwsCredentials(AwsCredentialsWithoutDefaults, CredentialsWithDefault): diff --git a/dlt/common/configuration/specs/azure_credentials.py b/dlt/common/configuration/specs/azure_credentials.py index cf6ec493de..aabd0b471a 100644 --- a/dlt/common/configuration/specs/azure_credentials.py +++ b/dlt/common/configuration/specs/azure_credentials.py @@ -8,6 +8,7 @@ CredentialsWithDefault, configspec, ) +from dlt.common.configuration.specs.mixins import WithObjectStoreRsCredentials, WithPyicebergConfig from dlt import version from dlt.common.utils import without_none @@ -15,7 +16,7 @@ @configspec -class AzureCredentialsBase(CredentialsConfiguration): +class AzureCredentialsBase(CredentialsConfiguration, WithObjectStoreRsCredentials): azure_storage_account_name: str = None azure_account_host: Optional[str] = None """Alternative host when accessing blob storage endpoint ie. my_account.dfs.core.windows.net""" @@ -32,7 +33,7 @@ def to_object_store_rs_credentials(self) -> Dict[str, str]: @configspec -class AzureCredentialsWithoutDefaults(AzureCredentialsBase): +class AzureCredentialsWithoutDefaults(AzureCredentialsBase, WithPyicebergConfig): """Credentials for Azure Blob Storage, compatible with adlfs""" azure_storage_account_key: Optional[TSecretStrValue] = None @@ -49,6 +50,13 @@ def to_adlfs_credentials(self) -> Dict[str, Any]: account_host=self.azure_account_host, ) + def to_pyiceberg_fileio_config(self) -> Dict[str, Any]: + return { + "adlfs.account-name": self.azure_storage_account_name, + "adlfs.account-key": self.azure_storage_account_key, + "adlfs.sas-token": self.azure_storage_sas_token, + } + def create_sas_token(self) -> None: try: from azure.storage.blob import generate_account_sas, ResourceTypes @@ -72,7 +80,7 @@ def on_partial(self) -> None: @configspec -class AzureServicePrincipalCredentialsWithoutDefaults(AzureCredentialsBase): +class AzureServicePrincipalCredentialsWithoutDefaults(AzureCredentialsBase, WithPyicebergConfig): azure_tenant_id: str = None azure_client_id: str = None azure_client_secret: TSecretStrValue = None @@ -86,6 +94,14 @@ def to_adlfs_credentials(self) -> Dict[str, Any]: client_secret=self.azure_client_secret, ) + def to_pyiceberg_fileio_config(self) -> Dict[str, Any]: + return { + "adlfs.account-name": self.azure_storage_account_name, + "adlfs.tenant-id": self.azure_tenant_id, + "adlfs.client-id": self.azure_client_id, + "adlfs.client-secret": self.azure_client_secret, + } + @configspec class AzureCredentials(AzureCredentialsWithoutDefaults, CredentialsWithDefault): diff --git a/dlt/common/configuration/specs/base_configuration.py b/dlt/common/configuration/specs/base_configuration.py index 8d913d0542..41d1d7a0ca 100644 --- a/dlt/common/configuration/specs/base_configuration.py +++ b/dlt/common/configuration/specs/base_configuration.py @@ -359,7 +359,7 @@ def _get_resolvable_dataclass_fields(cls) -> Iterator[TDtcField]: def get_resolvable_fields(cls) -> Dict[str, type]: """Returns a mapping of fields to their type hints. 
Dunders should not be resolved and are not returned""" return { - f.name: eval(f.type) if isinstance(f.type, str) else f.type # type: ignore[arg-type] + f.name: eval(f.type) if isinstance(f.type, str) else f.type for f in cls._get_resolvable_dataclass_fields() } diff --git a/dlt/common/configuration/specs/config_providers_context.py b/dlt/common/configuration/specs/config_providers_context.py index 5d1a5b7f26..a244ab571f 100644 --- a/dlt/common/configuration/specs/config_providers_context.py +++ b/dlt/common/configuration/specs/config_providers_context.py @@ -1,5 +1,4 @@ import contextlib -import dataclasses import io from typing import ClassVar, List @@ -8,10 +7,6 @@ ConfigProvider, ContextProvider, ) -from dlt.common.configuration.specs.base_configuration import ( - ContainerInjectableContext, - NotResolved, -) from dlt.common.configuration.specs import ( GcpServiceAccountCredentials, BaseConfiguration, @@ -137,7 +132,7 @@ def _airflow_providers() -> List[ConfigProvider]: # check if we are in task context and provide more info from airflow.operators.python import get_current_context # noqa - ti: TaskInstance = get_current_context()["ti"] # type: ignore + ti: TaskInstance = get_current_context()["ti"] # type: ignore[assignment,unused-ignore] # log outside of stderr/out redirect if secrets_toml_var is None: diff --git a/dlt/common/configuration/specs/exceptions.py b/dlt/common/configuration/specs/exceptions.py index 928e46a8a0..fe87ef24d7 100644 --- a/dlt/common/configuration/specs/exceptions.py +++ b/dlt/common/configuration/specs/exceptions.py @@ -72,3 +72,7 @@ def __init__(self, spec: Type[Any], native_value: Any): class ObjectStoreRsCredentialsException(ConfigurationException): pass + + +class UnsupportedAuthenticationMethodException(ConfigurationException): + pass diff --git a/dlt/common/configuration/specs/gcp_credentials.py b/dlt/common/configuration/specs/gcp_credentials.py index 60ab1d4b56..17519b032a 100644 --- a/dlt/common/configuration/specs/gcp_credentials.py +++ b/dlt/common/configuration/specs/gcp_credentials.py @@ -11,7 +11,9 @@ InvalidGoogleServicesJson, NativeValueError, OAuth2ScopesRequired, + UnsupportedAuthenticationMethodException, ) +from dlt.common.configuration.specs.mixins import WithObjectStoreRsCredentials, WithPyicebergConfig from dlt.common.exceptions import MissingDependencyException from dlt.common.typing import DictStrAny, TSecretStrValue, StrAny from dlt.common.configuration.specs.base_configuration import ( @@ -23,7 +25,7 @@ @configspec -class GcpCredentials(CredentialsConfiguration): +class GcpCredentials(CredentialsConfiguration, WithObjectStoreRsCredentials, WithPyicebergConfig): token_uri: Final[str] = dataclasses.field( default="https://oauth2.googleapis.com/token", init=False, repr=False, compare=False ) @@ -126,6 +128,12 @@ def to_native_credentials(self) -> Any: else: return ServiceAccountCredentials.from_service_account_info(self) + def to_pyiceberg_fileio_config(self) -> Dict[str, Any]: + raise UnsupportedAuthenticationMethodException( + "Service Account authentication not supported with `iceberg` table format. Use OAuth" + " authentication instead." + ) + def __str__(self) -> str: return f"{self.client_email}@{self.project_id}" @@ -176,11 +184,19 @@ def to_native_representation(self) -> str: return json.dumps(self._info_dict()) def to_object_store_rs_credentials(self) -> Dict[str, str]: - raise NotImplementedError( - "`object_store` Rust crate does not support OAuth for GCP credentials. 
Reference:" - " https://docs.rs/object_store/latest/object_store/gcp." + raise UnsupportedAuthenticationMethodException( + "OAuth authentication not supported with `delta` table format. Use Service Account or" + " Application Default Credentials authentication instead." ) + def to_pyiceberg_fileio_config(self) -> Dict[str, Any]: + self.auth() + return { + "gcs.project-id": self.project_id, + "gcs.oauth2.token": self.token, + "gcs.oauth2.token-expires-at": (pendulum.now().timestamp() + 60) * 1000, + } + def auth(self, scopes: Union[str, List[str]] = None, redirect_url: str = None) -> None: if not self.refresh_token: self.add_scopes(scopes) @@ -313,6 +329,12 @@ def to_native_credentials(self) -> Any: else: return super().to_native_credentials() + def to_pyiceberg_fileio_config(self) -> Dict[str, Any]: + raise UnsupportedAuthenticationMethodException( + "Application Default Credentials authentication not supported with `iceberg` table" + " format. Use OAuth authentication instead." + ) + @configspec class GcpServiceAccountCredentials( @@ -334,3 +356,9 @@ def parse_native_representation(self, native_value: Any) -> None: except NativeValueError: pass GcpOAuthCredentialsWithoutDefaults.parse_native_representation(self, native_value) + + def to_pyiceberg_fileio_config(self) -> Dict[str, Any]: + if self.has_default_credentials(): + return GcpDefaultCredentials.to_pyiceberg_fileio_config(self) + else: + return GcpOAuthCredentialsWithoutDefaults.to_pyiceberg_fileio_config(self) diff --git a/dlt/common/configuration/specs/mixins.py b/dlt/common/configuration/specs/mixins.py new file mode 100644 index 0000000000..2f843aee5b --- /dev/null +++ b/dlt/common/configuration/specs/mixins.py @@ -0,0 +1,24 @@ +from typing import Dict, Any +from abc import abstractmethod, ABC + + +class WithObjectStoreRsCredentials(ABC): + @abstractmethod + def to_object_store_rs_credentials(self) -> Dict[str, Any]: + """Returns credentials dictionary for object_store Rust crate. + + Can be used for libraries that build on top of the object_store crate, such as `deltalake`. + + https://docs.rs/object_store/latest/object_store/ + """ + pass + + +class WithPyicebergConfig(ABC): + @abstractmethod + def to_pyiceberg_fileio_config(self) -> Dict[str, Any]: + """Returns `pyiceberg` FileIO configuration dictionary. 
+ + https://py.iceberg.apache.org/configuration/#fileio + """ + pass diff --git a/dlt/common/data_writers/buffered.py b/dlt/common/data_writers/buffered.py index e2b6c9a442..6ef431a4d0 100644 --- a/dlt/common/data_writers/buffered.py +++ b/dlt/common/data_writers/buffered.py @@ -242,7 +242,7 @@ def _flush_items(self, allow_empty_file: bool = False) -> None: if self.writer_spec.is_binary_format: self._file = self.open(self._file_name, "wb") # type: ignore else: - self._file = self.open(self._file_name, "wt", encoding="utf-8", newline="") # type: ignore + self._file = self.open(self._file_name, "wt", encoding="utf-8", newline="") self._writer = self.writer_cls(self._file, caps=self._caps) # type: ignore[assignment] self._writer.write_header(self._current_columns) # write buffer diff --git a/dlt/common/destination/reference.py b/dlt/common/destination/reference.py index e27f99cde7..827034ddca 100644 --- a/dlt/common/destination/reference.py +++ b/dlt/common/destination/reference.py @@ -67,7 +67,7 @@ TDestinationConfig = TypeVar("TDestinationConfig", bound="DestinationClientConfiguration") TDestinationClient = TypeVar("TDestinationClient", bound="JobClientBase") TDestinationDwhClient = TypeVar("TDestinationDwhClient", bound="DestinationClientDwhConfiguration") -TDatasetType = Literal["dbapi", "ibis"] +TDatasetType = Literal["auto", "default", "ibis"] DEFAULT_FILE_LAYOUT = "{table_name}/{load_id}.{file_id}.{ext}" @@ -76,7 +76,7 @@ try: from dlt.common.libs.pandas import DataFrame from dlt.common.libs.pyarrow import Table as ArrowTable - from dlt.common.libs.ibis import BaseBackend as IbisBackend + from dlt.helpers.ibis import BaseBackend as IbisBackend except MissingDependencyException: DataFrame = Any ArrowTable = Any @@ -535,7 +535,7 @@ def fetchone(self) -> Optional[Tuple[Any, ...]]: ... # modifying access parameters - def limit(self, limit: int) -> "SupportsReadableRelation": + def limit(self, limit: int, **kwargs: Any) -> "SupportsReadableRelation": """limit the result to 'limit' items""" ... @@ -557,6 +557,10 @@ def __getitem__(self, columns: Union[str, Sequence[str]]) -> "SupportsReadableRe """set which columns will be selected""" ... + def __getattr__(self, attr: str) -> Any: + """get an attribute of the relation""" + ... + def __copy__(self) -> "SupportsReadableRelation": """create a copy of the relation object""" ... @@ -588,6 +592,10 @@ def __getattr__(self, table: str) -> SupportsReadableRelation: ... def ibis(self) -> IbisBackend: ... + def row_counts( + self, *, data_tables: bool = True, dlt_tables: bool = False, table_names: List[str] = None + ) -> SupportsReadableRelation: ... 
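# --- Illustrative sketch (not part of the diff): how the extended readable-dataset protocol
# above might be used. Assumes `pipeline.dataset()` returns a dataset implementing
# SupportsReadableDataset and that an "items" table was already loaded; the pipeline and
# table names below are hypothetical.
import dlt

pipeline = dlt.pipeline("demo", destination="duckdb", dataset_name="demo_data")
dataset = pipeline.dataset()  # SupportsReadableDataset

# new in this diff: row counts for data tables, returned as a readable relation
print(dataset.row_counts().df())

# table access via the dataset's __getattr__ plus the relation's limit()/df() accessors
print(dataset.items.limit(10).df())

# connected ibis backend (requires the new `ibis` dependency group added in the workflows above)
con = dataset.ibis()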
+ class JobClientBase(ABC): def __init__( diff --git a/dlt/common/destination/utils.py b/dlt/common/destination/utils.py index 0bad5b152e..c98344b687 100644 --- a/dlt/common/destination/utils.py +++ b/dlt/common/destination/utils.py @@ -38,7 +38,7 @@ def verify_schema_capabilities( exception_log: List[Exception] = [] # combined casing function case_identifier = lambda ident: capabilities.casefold_identifier( - (str if capabilities.has_case_sensitive_identifiers else str.casefold)(ident) # type: ignore + (str if capabilities.has_case_sensitive_identifiers else str.casefold)(ident) ) table_name_lookup: DictStrStr = {} # name collision explanation diff --git a/dlt/common/incremental/typing.py b/dlt/common/incremental/typing.py index 460e2f234b..2ca981bff0 100644 --- a/dlt/common/incremental/typing.py +++ b/dlt/common/incremental/typing.py @@ -8,6 +8,8 @@ LastValueFunc = Callable[[Sequence[TCursorValue]], Any] OnCursorValueMissing = Literal["raise", "include", "exclude"] +TIncrementalRange = Literal["open", "closed"] + class IncrementalColumnState(TypedDict): initial_value: Optional[Any] @@ -26,3 +28,5 @@ class IncrementalArgs(TypedDict, total=False): allow_external_schedulers: Optional[bool] lag: Optional[Union[float, int]] on_cursor_value_missing: Optional[OnCursorValueMissing] + range_start: Optional[TIncrementalRange] + range_end: Optional[TIncrementalRange] diff --git a/dlt/common/libs/deltalake.py b/dlt/common/libs/deltalake.py index 4047bc3a1a..0f938e7102 100644 --- a/dlt/common/libs/deltalake.py +++ b/dlt/common/libs/deltalake.py @@ -10,6 +10,7 @@ from dlt.common.exceptions import MissingDependencyException from dlt.common.storages import FilesystemConfiguration from dlt.common.utils import assert_min_pkg_version +from dlt.common.configuration.specs.mixins import WithObjectStoreRsCredentials from dlt.destinations.impl.filesystem.filesystem import FilesystemClient try: @@ -191,10 +192,9 @@ def get_delta_tables( def _deltalake_storage_options(config: FilesystemConfiguration) -> Dict[str, str]: """Returns dict that can be passed as `storage_options` in `deltalake` library.""" - creds = {} # type: ignore + creds = {} extra_options = {} - # TODO: create a mixin with to_object_store_rs_credentials for a proper discovery - if hasattr(config.credentials, "to_object_store_rs_credentials"): + if isinstance(config.credentials, WithObjectStoreRsCredentials): creds = config.credentials.to_object_store_rs_credentials() if config.deltalake_storage_options is not None: extra_options = config.deltalake_storage_options diff --git a/dlt/common/libs/pyiceberg.py b/dlt/common/libs/pyiceberg.py new file mode 100644 index 0000000000..19ce9abbf2 --- /dev/null +++ b/dlt/common/libs/pyiceberg.py @@ -0,0 +1,192 @@ +from typing import Dict, Any, List, Optional + +from dlt import version, Pipeline +from dlt.common.libs.pyarrow import cast_arrow_schema_types +from dlt.common.schema.typing import TWriteDisposition +from dlt.common.utils import assert_min_pkg_version +from dlt.common.exceptions import MissingDependencyException +from dlt.common.storages.configuration import FileSystemCredentials +from dlt.common.configuration.specs import CredentialsConfiguration +from dlt.common.configuration.specs.mixins import WithPyicebergConfig +from dlt.destinations.impl.filesystem.filesystem import FilesystemClient + + +try: + from pyiceberg.table import Table as IcebergTable + from pyiceberg.catalog import MetastoreCatalog + import pyarrow as pa +except ModuleNotFoundError: + raise MissingDependencyException( + "dlt pyiceberg 
helpers", + [f"{version.DLT_PKG_NAME}[pyiceberg]"], + "Install `pyiceberg` so dlt can create Iceberg tables in the `filesystem` destination.", + ) + + +def ensure_iceberg_compatible_arrow_schema(schema: pa.Schema) -> pa.Schema: + ARROW_TO_ICEBERG_COMPATIBLE_ARROW_TYPE_MAP = { + pa.types.is_time: pa.string(), + pa.types.is_decimal256: pa.string(), # pyarrow does not allow downcasting to decimal128 + } + return cast_arrow_schema_types(schema, ARROW_TO_ICEBERG_COMPATIBLE_ARROW_TYPE_MAP) + + +def ensure_iceberg_compatible_arrow_data(data: pa.Table) -> pa.Table: + schema = ensure_iceberg_compatible_arrow_schema(data.schema) + return data.cast(schema) + + +def write_iceberg_table( + table: IcebergTable, + data: pa.Table, + write_disposition: TWriteDisposition, +) -> None: + if write_disposition == "append": + table.append(ensure_iceberg_compatible_arrow_data(data)) + elif write_disposition == "replace": + table.overwrite(ensure_iceberg_compatible_arrow_data(data)) + + +def get_sql_catalog(credentials: FileSystemCredentials) -> "SqlCatalog": # type: ignore[name-defined] # noqa: F821 + assert_min_pkg_version( + pkg_name="sqlalchemy", + version="2.0.18", + msg=( + "`sqlalchemy>=2.0.18` is needed for `iceberg` table format on `filesystem` destination." + ), + ) + + from pyiceberg.catalog.sql import SqlCatalog + + return SqlCatalog( + "default", + uri="sqlite:///:memory:", + **_get_fileio_config(credentials), + ) + + +def create_or_evolve_table( + catalog: MetastoreCatalog, + client: FilesystemClient, + table_name: str, + namespace_name: Optional[str] = None, + schema: Optional[pa.Schema] = None, + partition_columns: Optional[List[str]] = None, +) -> MetastoreCatalog: + # add table to catalog + table_id = f"{namespace_name}.{table_name}" + table_path = f"{client.dataset_path}/{table_name}" + metadata_path = f"{table_path}/metadata" + if client.fs_client.exists(metadata_path): + # found metadata; register existing table + table = _register_table(table_id, metadata_path, catalog, client) + + # evolve schema + if schema is not None: + with table.update_schema() as update: + update.union_by_name(ensure_iceberg_compatible_arrow_schema(schema)) + else: + # found no metadata; create new table + assert schema is not None + with catalog.create_table_transaction( + table_id, + schema=ensure_iceberg_compatible_arrow_schema(schema), + location=_make_path(table_path, client), + ) as txn: + # add partitioning + with txn.update_spec() as update_spec: + for col in partition_columns: + update_spec.add_identity(col) + + return catalog + + +def get_catalog( + client: FilesystemClient, + table_name: str, + namespace_name: Optional[str] = None, + schema: Optional[pa.Schema] = None, + partition_columns: Optional[List[str]] = None, +) -> MetastoreCatalog: + """Returns single-table, ephemeral, in-memory Iceberg catalog.""" + + # create in-memory catalog + catalog: MetastoreCatalog = get_sql_catalog(client.config.credentials) + + # create namespace + if namespace_name is None: + namespace_name = client.dataset_name + catalog.create_namespace(namespace_name) + + # add table to catalog + catalog = create_or_evolve_table( + catalog=catalog, + client=client, + table_name=table_name, + namespace_name=namespace_name, + schema=schema, + partition_columns=partition_columns, + ) + + return catalog + + +def get_iceberg_tables( + pipeline: Pipeline, *tables: str, schema_name: Optional[str] = None +) -> Dict[str, IcebergTable]: + from dlt.common.schema.utils import get_table_format + + with 
pipeline.destination_client(schema_name=schema_name) as client: + assert isinstance( + client, FilesystemClient + ), "The `get_iceberg_tables` function requires a `filesystem` destination." + + schema_iceberg_tables = [ + t["name"] + for t in client.schema.tables.values() + if get_table_format(client.schema.tables, t["name"]) == "iceberg" + ] + if len(tables) > 0: + invalid_tables = set(tables) - set(schema_iceberg_tables) + if len(invalid_tables) > 0: + available_schemas = "" + if len(pipeline.schema_names) > 1: + available_schemas = f" Available schemas are {pipeline.schema_names}" + raise ValueError( + f"Schema {client.schema.name} does not contain Iceberg tables with these names:" + f" {', '.join(invalid_tables)}.{available_schemas}" + ) + schema_iceberg_tables = [t for t in schema_iceberg_tables if t in tables] + + return { + name: get_catalog(client, name).load_table(f"{pipeline.dataset_name}.{name}") + for name in schema_iceberg_tables + } + + +def _get_fileio_config(credentials: CredentialsConfiguration) -> Dict[str, Any]: + if isinstance(credentials, WithPyicebergConfig): + return credentials.to_pyiceberg_fileio_config() + return {} + + +def _get_last_metadata_file(metadata_path: str, client: FilesystemClient) -> str: + # TODO: implement faster way to obtain `last_metadata_file` (listing is slow) + metadata_files = [f for f in client.fs_client.ls(metadata_path) if f.endswith(".json")] + return _make_path(sorted(metadata_files)[-1], client) + + +def _register_table( + identifier: str, + metadata_path: str, + catalog: MetastoreCatalog, + client: FilesystemClient, +) -> IcebergTable: + last_metadata_file = _get_last_metadata_file(metadata_path, client) + return catalog.register_table(identifier, last_metadata_file) + + +def _make_path(path: str, client: FilesystemClient) -> str: + # don't use file protocol for local files because duckdb does not support it + # https://github.com/duckdb/duckdb/issues/13669 + return path if client.is_local_filesystem else client.config.make_url(path) diff --git a/dlt/common/logger.py b/dlt/common/logger.py index b163c15672..634e305805 100644 --- a/dlt/common/logger.py +++ b/dlt/common/logger.py @@ -47,7 +47,7 @@ def is_logging() -> bool: def log_level() -> str: if not LOGGER: raise RuntimeError("Logger not initialized") - return logging.getLevelName(LOGGER.level) # type: ignore + return logging.getLevelName(LOGGER.level) def is_json_logging(log_format: str) -> bool: diff --git a/dlt/common/metrics.py b/dlt/common/metrics.py index d6acf19d0d..2f9f574dd0 100644 --- a/dlt/common/metrics.py +++ b/dlt/common/metrics.py @@ -9,7 +9,7 @@ class DataWriterMetrics(NamedTuple): created: float last_modified: float - def __add__(self, other: Tuple[object, ...], /) -> Tuple[object, ...]: + def __add__(self, other: Tuple[object, ...], /) -> Tuple[object, ...]: # type: ignore[override] if isinstance(other, DataWriterMetrics): return DataWriterMetrics( self.file_path if self.file_path == other.file_path else "", diff --git a/dlt/common/normalizers/json/__init__.py b/dlt/common/normalizers/json/__init__.py index 725f6a8355..ae5e06fe2e 100644 --- a/dlt/common/normalizers/json/__init__.py +++ b/dlt/common/normalizers/json/__init__.py @@ -36,6 +36,10 @@ def extend_schema(self) -> None: def extend_table(self, table_name: str) -> None: pass + @abc.abstractmethod + def remove_table(self, table_name: str) -> None: + pass + @classmethod @abc.abstractmethod def update_normalizer_config(cls, schema: Schema, config: TNormalizerConfig) -> None: diff --git 
a/dlt/common/normalizers/json/relational.py b/dlt/common/normalizers/json/relational.py index e365017125..36845b2e14 100644 --- a/dlt/common/normalizers/json/relational.py +++ b/dlt/common/normalizers/json/relational.py @@ -1,4 +1,16 @@ -from typing import Dict, List, Mapping, Optional, Sequence, Tuple, cast, TypedDict, Any +from typing import ( + ClassVar, + Dict, + List, + Mapping, + Optional, + Sequence, + Tuple, + Type, + cast, + TypedDict, + Any, +) from dlt.common.normalizers.exceptions import InvalidJsonNormalizer from dlt.common.normalizers.typing import TJSONNormalizer @@ -14,6 +26,9 @@ from dlt.common.schema.utils import ( column_name_validator, is_nested_table, + get_nested_tables, + has_column_with_prop, + get_first_column_name_with_prop, ) from dlt.common.utils import update_dict_nested from dlt.common.normalizers.json import ( @@ -48,6 +63,7 @@ class DataItemNormalizer(DataItemNormalizerBase[RelationalNormalizerConfig]): # other constants EMPTY_KEY_IDENTIFIER = "_empty" # replace empty keys with this + RELATIONAL_CONFIG_TYPE: ClassVar[Type[RelationalNormalizerConfig]] = RelationalNormalizerConfig normalizer_config: RelationalNormalizerConfig propagation_config: RelationalNormalizerConfigPropagation @@ -310,20 +326,38 @@ def extend_table(self, table_name: str) -> None: Table name should be normalized. """ table = self.schema.tables.get(table_name) - if not is_nested_table(table) and table.get("write_disposition") == "merge": - DataItemNormalizer.update_normalizer_config( + # add root key prop when merge disposition is used or any of nested tables needs row_key + if not is_nested_table(table) and ( + table.get("write_disposition") == "merge" + or any( + has_column_with_prop(t, "root_key", include_incomplete=True) + for t in get_nested_tables(self.schema.tables, table_name) + ) + ): + # get row id column from table, assume that we propagate it into c_dlt_root_id always + c_dlt_id = get_first_column_name_with_prop(table, "row_key", include_incomplete=True) + self.update_normalizer_config( self.schema, { "propagation": { "tables": { table_name: { - TColumnName(self.c_dlt_id): TColumnName(self.c_dlt_root_id) + TColumnName(c_dlt_id or self.c_dlt_id): TColumnName( + self.c_dlt_root_id + ) } } } }, ) + def remove_table(self, table_name: str) -> None: + """Called by the Schema when table is removed from it.""" + config = self.get_normalizer_config(self.schema) + if propagation := config.get("propagation"): + if tables := propagation.get("tables"): + tables.pop(table_name, None) + def normalize_data_item( self, item: TDataItem, load_id: str, table_name: str ) -> TNormalizedRowIterator: @@ -352,8 +386,8 @@ def normalize_data_item( def ensure_this_normalizer(cls, norm_config: TJSONNormalizer) -> None: # make sure schema has right normalizer present_normalizer = norm_config["module"] - if present_normalizer != __name__: - raise InvalidJsonNormalizer(__name__, present_normalizer) + if present_normalizer != cls.__module__: + raise InvalidJsonNormalizer(cls.__module__, present_normalizer) @classmethod def update_normalizer_config(cls, schema: Schema, config: RelationalNormalizerConfig) -> None: @@ -371,8 +405,10 @@ def get_normalizer_config(cls, schema: Schema) -> RelationalNormalizerConfig: cls.ensure_this_normalizer(norm_config) return cast(RelationalNormalizerConfig, norm_config.get("config", {})) - @staticmethod - def _validate_normalizer_config(schema: Schema, config: RelationalNormalizerConfig) -> None: + @classmethod + def _validate_normalizer_config( + cls, schema: Schema, config: 
RelationalNormalizerConfig + ) -> None: """Normalizes all known column identifiers according to the schema and then validates the configuration""" def _normalize_prop( @@ -397,7 +433,7 @@ def _normalize_prop( ) validate_dict( - RelationalNormalizerConfig, + cls.RELATIONAL_CONFIG_TYPE, config, "./normalizers/json/config", validator_f=column_name_validator(schema.naming), diff --git a/dlt/common/reflection/utils.py b/dlt/common/reflection/utils.py index c612c5a4f1..27c7bd8758 100644 --- a/dlt/common/reflection/utils.py +++ b/dlt/common/reflection/utils.py @@ -90,24 +90,24 @@ def rewrite_python_script( last_line = -1 last_offset = -1 # sort transformed nodes by line and offset - for node, t_value in sorted(transformed_nodes, key=lambda n: (n[0].lineno, n[0].col_offset)): + for node, t_value in sorted(transformed_nodes, key=lambda n: (n[0].lineno, n[0].col_offset)): # type: ignore[attr-defined] # do we have a line changed - if last_line != node.lineno - 1: + if last_line != node.lineno - 1: # type: ignore[attr-defined] # add remainder from the previous line if last_offset >= 0: script_lines.append(source_script_lines[last_line][last_offset:]) # add all new lines from previous line to current - script_lines.extend(source_script_lines[last_line + 1 : node.lineno - 1]) + script_lines.extend(source_script_lines[last_line + 1 : node.lineno - 1]) # type: ignore[attr-defined] # add trailing characters until node in current line starts - script_lines.append(source_script_lines[node.lineno - 1][: node.col_offset]) + script_lines.append(source_script_lines[node.lineno - 1][: node.col_offset]) # type: ignore[attr-defined] elif last_offset >= 0: # no line change, add the characters from the end of previous node to the current - script_lines.append(source_script_lines[last_line][last_offset : node.col_offset]) + script_lines.append(source_script_lines[last_line][last_offset : node.col_offset]) # type: ignore[attr-defined] # replace node value script_lines.append(ast_unparse(t_value).strip()) - last_line = node.end_lineno - 1 - last_offset = node.end_col_offset + last_line = node.end_lineno - 1 # type: ignore[attr-defined] + last_offset = node.end_col_offset # type: ignore[attr-defined] # add all that was missing if last_offset >= 0: diff --git a/dlt/common/schema/schema.py b/dlt/common/schema/schema.py index d6031a08fa..f2d75638fe 100644 --- a/dlt/common/schema/schema.py +++ b/dlt/common/schema/schema.py @@ -451,10 +451,12 @@ def drop_tables( ) -> List[TTableSchema]: """Drops tables from the schema and returns the dropped tables""" result = [] + # TODO: make sure all nested tables to table_names are also dropped for table_name in table_names: table = self.get_table(table_name) if table and (not seen_data_only or utils.has_table_seen_data(table)): result.append(self._schema_tables.pop(table_name)) + self.data_item_normalizer.remove_table(table_name) return result def filter_row_with_hint( @@ -525,7 +527,7 @@ def get_new_table_columns( Typically they come from the destination schema. Columns that are in `existing_columns` and not in `table_name` columns are ignored. 
Optionally includes incomplete columns (without data type)""" - casefold_f: Callable[[str], str] = str.casefold if not case_sensitive else str # type: ignore[assignment] + casefold_f: Callable[[str], str] = str.casefold if not case_sensitive else str casefold_existing = { casefold_f(col_name): col for col_name, col in existing_columns.items() } diff --git a/dlt/common/schema/utils.py b/dlt/common/schema/utils.py index 038abdc4d0..4f9e0eb42e 100644 --- a/dlt/common/schema/utils.py +++ b/dlt/common/schema/utils.py @@ -457,16 +457,8 @@ def diff_table( * when columns with the same name have different data types * when table links to different parent tables """ - if tab_a["name"] != tab_b["name"]: - raise TablePropertiesConflictException( - schema_name, tab_a["name"], "name", tab_a["name"], tab_b["name"] - ) - table_name = tab_a["name"] - # check if table properties can be merged - if tab_a.get("parent") != tab_b.get("parent"): - raise TablePropertiesConflictException( - schema_name, table_name, "parent", tab_a.get("parent"), tab_b.get("parent") - ) + # allow for columns to differ + ensure_compatible_tables(schema_name, tab_a, tab_b, ensure_columns=False) # get new columns, changes in the column data type or other properties are not allowed tab_a_columns = tab_a["columns"] @@ -474,18 +466,6 @@ def diff_table( for col_b_name, col_b in tab_b["columns"].items(): if col_b_name in tab_a_columns: col_a = tab_a_columns[col_b_name] - # we do not support changing data types of columns - if is_complete_column(col_a) and is_complete_column(col_b): - if not compare_complete_columns(tab_a_columns[col_b_name], col_b): - # attempt to update to incompatible columns - raise CannotCoerceColumnException( - schema_name, - table_name, - col_b_name, - col_b["data_type"], - tab_a_columns[col_b_name]["data_type"], - None, - ) # all other properties can change merged_column = merge_column(copy(col_a), col_b) if merged_column != col_a: @@ -494,6 +474,8 @@ def diff_table( new_columns.append(col_b) # return partial table containing only name and properties that differ (column, filters etc.) + table_name = tab_a["name"] + partial_table: TPartialTableSchema = { "name": table_name, "columns": {} if new_columns is None else {c["name"]: c for c in new_columns}, @@ -519,6 +501,50 @@ def diff_table( return partial_table +def ensure_compatible_tables( + schema_name: str, tab_a: TTableSchema, tab_b: TPartialTableSchema, ensure_columns: bool = True +) -> None: + """Ensures that `tab_a` and `tab_b` can be merged without conflicts. 
Conflicts are detected when + + - tables have different names + - nested tables have different parents + - tables have any column with incompatible types + + Note: all the identifiers must be already normalized + + """ + if tab_a["name"] != tab_b["name"]: + raise TablePropertiesConflictException( + schema_name, tab_a["name"], "name", tab_a["name"], tab_b["name"] + ) + table_name = tab_a["name"] + # check if table properties can be merged + if tab_a.get("parent") != tab_b.get("parent"): + raise TablePropertiesConflictException( + schema_name, table_name, "parent", tab_a.get("parent"), tab_b.get("parent") + ) + + if not ensure_columns: + return + + tab_a_columns = tab_a["columns"] + for col_b_name, col_b in tab_b["columns"].items(): + if col_b_name in tab_a_columns: + col_a = tab_a_columns[col_b_name] + # we do not support changing data types of columns + if is_complete_column(col_a) and is_complete_column(col_b): + if not compare_complete_columns(tab_a_columns[col_b_name], col_b): + # attempt to update to incompatible columns + raise CannotCoerceColumnException( + schema_name, + table_name, + col_b_name, + col_b["data_type"], + tab_a_columns[col_b_name]["data_type"], + None, + ) + + # def compare_tables(tab_a: TTableSchema, tab_b: TTableSchema) -> bool: # try: # table_name = tab_a["name"] diff --git a/dlt/common/time.py b/dlt/common/time.py index 4ce411baa4..74c32e4ea0 100644 --- a/dlt/common/time.py +++ b/dlt/common/time.py @@ -164,17 +164,30 @@ def detect_datetime_format(value: str) -> Optional[str]: ): "%Y-%m-%dT%H:%M:%S.%fZ", # UTC with fractional seconds re.compile( r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\+\d{2}:\d{2}$" - ): "%Y-%m-%dT%H:%M:%S%z", # Timezone offset + ): "%Y-%m-%dT%H:%M:%S%z", # Positive timezone offset re.compile( r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\+\d{4}$" - ): "%Y-%m-%dT%H:%M:%S%z", # Timezone without colon - # Full datetime with fractional seconds and timezone + ): "%Y-%m-%dT%H:%M:%S%z", # Positive timezone without colon + # Full datetime with fractional seconds and positive timezone offset re.compile( r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+\+\d{2}:\d{2}$" ): "%Y-%m-%dT%H:%M:%S.%f%z", re.compile( r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+\+\d{4}$" - ): "%Y-%m-%dT%H:%M:%S.%f%z", # Timezone without colon + ): "%Y-%m-%dT%H:%M:%S.%f%z", # Positive timezone without colon + re.compile( + r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}-\d{2}:\d{2}$" + ): "%Y-%m-%dT%H:%M:%S%z", # Negative timezone offset + re.compile( + r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}-\d{4}$" + ): "%Y-%m-%dT%H:%M:%S%z", # Negative timezone without colon + # Full datetime with fractional seconds and negative timezone offset + re.compile( + r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+-\d{2}:\d{2}$" + ): "%Y-%m-%dT%H:%M:%S.%f%z", + re.compile( + r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+-\d{4}$" + ): "%Y-%m-%dT%H:%M:%S.%f%z", # Negative Timezone without colon # Datetime without timezone re.compile(r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}$"): "%Y-%m-%dT%H:%M:%S", # No timezone re.compile(r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}$"): "%Y-%m-%dT%H:%M", # Minute precision diff --git a/dlt/common/typing.py b/dlt/common/typing.py index a3364d1b07..a0322fe01e 100644 --- a/dlt/common/typing.py +++ b/dlt/common/typing.py @@ -446,7 +446,7 @@ def get_generic_type_argument_from_instance( if cls_: orig_param_type = get_args(cls_)[0] if orig_param_type in (Any, CallableAny) and sample_value is not None: - orig_param_type = type(sample_value) + orig_param_type = type(sample_value) # type: ignore[assignment] return orig_param_type 
# type: ignore @@ -484,3 +484,18 @@ def decorator( return func return decorator + + +def add_value_to_literal(literal: Any, value: Any) -> None: + """Extends a Literal at runtime with a new value. + + Args: + literal (Type[Any]): Literal to extend + value (Any): Value to add + + """ + type_args = get_args(literal) + + if value not in type_args: + type_args += (value,) + literal.__args__ = type_args diff --git a/dlt/destinations/dataset.py b/dlt/destinations/dataset.py deleted file mode 100644 index 27a7f5a7af..0000000000 --- a/dlt/destinations/dataset.py +++ /dev/null @@ -1,412 +0,0 @@ -from typing import Any, Generator, Sequence, Union, TYPE_CHECKING, Tuple - -from contextlib import contextmanager - -from dlt import version -from dlt.common.json import json -from dlt.common.exceptions import MissingDependencyException -from dlt.common.destination import AnyDestination -from dlt.common.destination.reference import ( - SupportsReadableRelation, - SupportsReadableDataset, - TDatasetType, - TDestinationReferenceArg, - Destination, - JobClientBase, - WithStateSync, - DestinationClientDwhConfiguration, - DestinationClientStagingConfiguration, - DestinationClientConfiguration, - DestinationClientDwhWithStagingConfiguration, -) - -from dlt.common.schema.typing import TTableSchemaColumns -from dlt.destinations.sql_client import SqlClientBase, WithSqlClient -from dlt.common.schema import Schema -from dlt.common.exceptions import DltException - -if TYPE_CHECKING: - try: - from dlt.common.libs.ibis import BaseBackend as IbisBackend - except MissingDependencyException: - IbisBackend = Any -else: - IbisBackend = Any - - -class DatasetException(DltException): - pass - - -class ReadableRelationHasQueryException(DatasetException): - def __init__(self, attempted_change: str) -> None: - msg = ( - "This readable relation was created with a provided sql query. You cannot change" - f" {attempted_change}. Please change the orignal sql query." - ) - super().__init__(msg) - - -class ReadableRelationUnknownColumnException(DatasetException): - def __init__(self, column_name: str) -> None: - msg = ( - f"The selected column {column_name} is not known in the dlt schema for this releation." 
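# --- Illustrative sketch (not part of the diff): the add_value_to_literal() helper added to
# dlt/common/typing.py above extends a typing.Literal in place at runtime. TDatasetType comes
# from dlt/common/destination/reference.py in this diff; the "custom" value is hypothetical.
from typing import get_args

from dlt.common.destination.reference import TDatasetType  # Literal["auto", "default", "ibis"]
from dlt.common.typing import add_value_to_literal

add_value_to_literal(TDatasetType, "custom")
assert "custom" in get_args(TDatasetType)
# calling it again is a no-op because the value is already present
add_value_to_literal(TDatasetType, "custom")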
- ) - super().__init__(msg) - - -class ReadableDBAPIRelation(SupportsReadableRelation): - def __init__( - self, - *, - readable_dataset: "ReadableDBAPIDataset", - provided_query: Any = None, - table_name: str = None, - limit: int = None, - selected_columns: Sequence[str] = None, - ) -> None: - """Create a lazy evaluated relation to for the dataset of a destination""" - - # NOTE: we can keep an assertion here, this class will not be created by the user - assert bool(table_name) != bool( - provided_query - ), "Please provide either an sql query OR a table_name" - - self._dataset = readable_dataset - - self._provided_query = provided_query - self._table_name = table_name - self._limit = limit - self._selected_columns = selected_columns - - # wire protocol functions - self.df = self._wrap_func("df") # type: ignore - self.arrow = self._wrap_func("arrow") # type: ignore - self.fetchall = self._wrap_func("fetchall") # type: ignore - self.fetchmany = self._wrap_func("fetchmany") # type: ignore - self.fetchone = self._wrap_func("fetchone") # type: ignore - - self.iter_df = self._wrap_iter("iter_df") # type: ignore - self.iter_arrow = self._wrap_iter("iter_arrow") # type: ignore - self.iter_fetch = self._wrap_iter("iter_fetch") # type: ignore - - @property - def sql_client(self) -> SqlClientBase[Any]: - return self._dataset.sql_client - - @property - def schema(self) -> Schema: - return self._dataset.schema - - @property - def query(self) -> Any: - """build the query""" - if self._provided_query: - return self._provided_query - - table_name = self.sql_client.make_qualified_table_name( - self.schema.naming.normalize_tables_path(self._table_name) - ) - - maybe_limit_clause_1 = "" - maybe_limit_clause_2 = "" - if self._limit: - maybe_limit_clause_1, maybe_limit_clause_2 = self.sql_client._limit_clause_sql( - self._limit - ) - - selector = "*" - if self._selected_columns: - selector = ",".join( - [ - self.sql_client.escape_column_name(self.schema.naming.normalize_path(c)) - for c in self._selected_columns - ] - ) - - return f"SELECT {maybe_limit_clause_1} {selector} FROM {table_name} {maybe_limit_clause_2}" - - @property - def columns_schema(self) -> TTableSchemaColumns: - return self.compute_columns_schema() - - @columns_schema.setter - def columns_schema(self, new_value: TTableSchemaColumns) -> None: - raise NotImplementedError("columns schema in ReadableDBAPIRelation can only be computed") - - def compute_columns_schema(self) -> TTableSchemaColumns: - """provide schema columns for the cursor, may be filtered by selected columns""" - - columns_schema = ( - self.schema.tables.get(self._table_name, {}).get("columns", {}) if self.schema else {} - ) - - if not columns_schema: - return None - if not self._selected_columns: - return columns_schema - - filtered_columns: TTableSchemaColumns = {} - for sc in self._selected_columns: - sc = self.schema.naming.normalize_path(sc) - if sc not in columns_schema.keys(): - raise ReadableRelationUnknownColumnException(sc) - filtered_columns[sc] = columns_schema[sc] - - return filtered_columns - - @contextmanager - def cursor(self) -> Generator[SupportsReadableRelation, Any, Any]: - """Gets a DBApiCursor for the current relation""" - with self.sql_client as client: - # this hacky code is needed for mssql to disable autocommit, read iterators - # will not work otherwise. 
in the future we should be able to create a readony - # client which will do this automatically - if hasattr(self.sql_client, "_conn") and hasattr(self.sql_client._conn, "autocommit"): - self.sql_client._conn.autocommit = False - with client.execute_query(self.query) as cursor: - if columns_schema := self.columns_schema: - cursor.columns_schema = columns_schema - yield cursor - - def _wrap_iter(self, func_name: str) -> Any: - """wrap SupportsReadableRelation generators in cursor context""" - - def _wrap(*args: Any, **kwargs: Any) -> Any: - with self.cursor() as cursor: - yield from getattr(cursor, func_name)(*args, **kwargs) - - return _wrap - - def _wrap_func(self, func_name: str) -> Any: - """wrap SupportsReadableRelation functions in cursor context""" - - def _wrap(*args: Any, **kwargs: Any) -> Any: - with self.cursor() as cursor: - return getattr(cursor, func_name)(*args, **kwargs) - - return _wrap - - def __copy__(self) -> "ReadableDBAPIRelation": - return self.__class__( - readable_dataset=self._dataset, - provided_query=self._provided_query, - table_name=self._table_name, - limit=self._limit, - selected_columns=self._selected_columns, - ) - - def limit(self, limit: int) -> "ReadableDBAPIRelation": - if self._provided_query: - raise ReadableRelationHasQueryException("limit") - rel = self.__copy__() - rel._limit = limit - return rel - - def select(self, *columns: str) -> "ReadableDBAPIRelation": - if self._provided_query: - raise ReadableRelationHasQueryException("select") - rel = self.__copy__() - rel._selected_columns = columns - # NOTE: the line below will ensure that no unknown columns are selected if - # schema is known - rel.compute_columns_schema() - return rel - - def __getitem__(self, columns: Union[str, Sequence[str]]) -> "SupportsReadableRelation": - if isinstance(columns, str): - return self.select(columns) - elif isinstance(columns, Sequence): - return self.select(*columns) - else: - raise TypeError(f"Invalid argument type: {type(columns).__name__}") - - def head(self, limit: int = 5) -> "ReadableDBAPIRelation": - return self.limit(limit) - - -class ReadableDBAPIDataset(SupportsReadableDataset): - """Access to dataframes and arrowtables in the destination dataset via dbapi""" - - def __init__( - self, - destination: TDestinationReferenceArg, - dataset_name: str, - schema: Union[Schema, str, None] = None, - ) -> None: - self._destination = Destination.from_reference(destination) - self._provided_schema = schema - self._dataset_name = dataset_name - self._sql_client: SqlClientBase[Any] = None - self._schema: Schema = None - - def ibis(self) -> IbisBackend: - """return a connected ibis backend""" - from dlt.common.libs.ibis import create_ibis_backend - - self._ensure_client_and_schema() - return create_ibis_backend( - self._destination, - self._destination_client(self.schema), - ) - - @property - def schema(self) -> Schema: - self._ensure_client_and_schema() - return self._schema - - @property - def sql_client(self) -> SqlClientBase[Any]: - self._ensure_client_and_schema() - return self._sql_client - - def _destination_client(self, schema: Schema) -> JobClientBase: - return get_destination_clients( - schema, destination=self._destination, destination_dataset_name=self._dataset_name - )[0] - - def _ensure_client_and_schema(self) -> None: - """Lazy load schema and client""" - - # full schema given, nothing to do - if not self._schema and isinstance(self._provided_schema, Schema): - self._schema = self._provided_schema - - # schema name given, resolve it from destination by 
name - elif not self._schema and isinstance(self._provided_schema, str): - with self._destination_client(Schema(self._provided_schema)) as client: - if isinstance(client, WithStateSync): - stored_schema = client.get_stored_schema(self._provided_schema) - if stored_schema: - self._schema = Schema.from_stored_schema(json.loads(stored_schema.schema)) - else: - self._schema = Schema(self._provided_schema) - - # no schema name given, load newest schema from destination - elif not self._schema: - with self._destination_client(Schema(self._dataset_name)) as client: - if isinstance(client, WithStateSync): - stored_schema = client.get_stored_schema() - if stored_schema: - self._schema = Schema.from_stored_schema(json.loads(stored_schema.schema)) - - # default to empty schema with dataset name - if not self._schema: - self._schema = Schema(self._dataset_name) - - # here we create the client bound to the resolved schema - if not self._sql_client: - destination_client = self._destination_client(self._schema) - if isinstance(destination_client, WithSqlClient): - self._sql_client = destination_client.sql_client - else: - raise Exception( - f"Destination {destination_client.config.destination_type} does not support" - " SqlClient." - ) - - def __call__(self, query: Any) -> ReadableDBAPIRelation: - return ReadableDBAPIRelation(readable_dataset=self, provided_query=query) # type: ignore[abstract] - - def table(self, table_name: str) -> SupportsReadableRelation: - return ReadableDBAPIRelation( - readable_dataset=self, - table_name=table_name, - ) # type: ignore[abstract] - - def __getitem__(self, table_name: str) -> SupportsReadableRelation: - """access of table via dict notation""" - return self.table(table_name) - - def __getattr__(self, table_name: str) -> SupportsReadableRelation: - """access of table via property notation""" - return self.table(table_name) - - -def dataset( - destination: TDestinationReferenceArg, - dataset_name: str, - schema: Union[Schema, str, None] = None, - dataset_type: TDatasetType = "dbapi", -) -> SupportsReadableDataset: - if dataset_type == "dbapi": - return ReadableDBAPIDataset(destination, dataset_name, schema) - raise NotImplementedError(f"Dataset of type {dataset_type} not implemented") - - -# helpers -def get_destination_client_initial_config( - destination: AnyDestination, - default_schema_name: str, - dataset_name: str, - as_staging: bool = False, -) -> DestinationClientConfiguration: - client_spec = destination.spec - - # this client supports many schemas and datasets - if issubclass(client_spec, DestinationClientDwhConfiguration): - if issubclass(client_spec, DestinationClientStagingConfiguration): - spec: DestinationClientDwhConfiguration = client_spec(as_staging_destination=as_staging) - else: - spec = client_spec() - - spec._bind_dataset_name(dataset_name, default_schema_name) - return spec - - return client_spec() - - -def get_destination_clients( - schema: Schema, - destination: AnyDestination = None, - destination_dataset_name: str = None, - destination_initial_config: DestinationClientConfiguration = None, - staging: AnyDestination = None, - staging_dataset_name: str = None, - staging_initial_config: DestinationClientConfiguration = None, - # pipeline specific settings - default_schema_name: str = None, -) -> Tuple[JobClientBase, JobClientBase]: - destination = Destination.from_reference(destination) if destination else None - staging = Destination.from_reference(staging) if staging else None - - try: - # resolve staging config in order to pass it to 
destination client config - staging_client = None - if staging: - if not staging_initial_config: - # this is just initial config - without user configuration injected - staging_initial_config = get_destination_client_initial_config( - staging, - dataset_name=staging_dataset_name, - default_schema_name=default_schema_name, - as_staging=True, - ) - # create the client - that will also resolve the config - staging_client = staging.client(schema, staging_initial_config) - - if not destination_initial_config: - # config is not provided then get it with injected credentials - initial_config = get_destination_client_initial_config( - destination, - dataset_name=destination_dataset_name, - default_schema_name=default_schema_name, - ) - - # attach the staging client config to destination client config - if its type supports it - if ( - staging_client - and isinstance(initial_config, DestinationClientDwhWithStagingConfiguration) - and isinstance(staging_client.config, DestinationClientStagingConfiguration) - ): - initial_config.staging_config = staging_client.config - # create instance with initial_config properly set - client = destination.client(schema, initial_config) - return client, staging_client - except ModuleNotFoundError: - client_spec = destination.spec() - raise MissingDependencyException( - f"{client_spec.destination_type} destination", - [f"{version.DLT_PKG_NAME}[{client_spec.destination_type}]"], - "Dependencies for specific destinations are available as extras of dlt", - ) diff --git a/dlt/destinations/dataset/__init__.py b/dlt/destinations/dataset/__init__.py new file mode 100644 index 0000000000..e0eef681b8 --- /dev/null +++ b/dlt/destinations/dataset/__init__.py @@ -0,0 +1,19 @@ +from dlt.destinations.dataset.factory import ( + dataset, +) +from dlt.destinations.dataset.dataset import ( + ReadableDBAPIDataset, + get_destination_clients, +) +from dlt.destinations.dataset.utils import ( + get_destination_clients, + get_destination_client_initial_config, +) + + +__all__ = [ + "dataset", + "ReadableDBAPIDataset", + "get_destination_client_initial_config", + "get_destination_clients", +] diff --git a/dlt/destinations/dataset/dataset.py b/dlt/destinations/dataset/dataset.py new file mode 100644 index 0000000000..fc55393a60 --- /dev/null +++ b/dlt/destinations/dataset/dataset.py @@ -0,0 +1,168 @@ +from typing import Any, Union, TYPE_CHECKING, List + +from dlt.common.json import json + +from dlt.common.exceptions import MissingDependencyException + +from dlt.common.destination.reference import ( + SupportsReadableRelation, + SupportsReadableDataset, + TDestinationReferenceArg, + Destination, + JobClientBase, + WithStateSync, +) + +from dlt.destinations.sql_client import SqlClientBase, WithSqlClient +from dlt.common.schema import Schema +from dlt.destinations.dataset.relation import ReadableDBAPIRelation +from dlt.destinations.dataset.utils import get_destination_clients +from dlt.common.destination.reference import TDatasetType + +if TYPE_CHECKING: + try: + from dlt.helpers.ibis import BaseBackend as IbisBackend + except MissingDependencyException: + IbisBackend = Any +else: + IbisBackend = Any + + +class ReadableDBAPIDataset(SupportsReadableDataset): + """Access to dataframes and arrowtables in the destination dataset via dbapi""" + + def __init__( + self, + destination: TDestinationReferenceArg, + dataset_name: str, + schema: Union[Schema, str, None] = None, + dataset_type: TDatasetType = "auto", + ) -> None: + self._destination = Destination.from_reference(destination) + 
self._provided_schema = schema + self._dataset_name = dataset_name + self._sql_client: SqlClientBase[Any] = None + self._schema: Schema = None + self._dataset_type = dataset_type + + def ibis(self) -> IbisBackend: + """return a connected ibis backend""" + from dlt.helpers.ibis import create_ibis_backend + + self._ensure_client_and_schema() + return create_ibis_backend( + self._destination, + self._destination_client(self.schema), + ) + + @property + def schema(self) -> Schema: + self._ensure_client_and_schema() + return self._schema + + @property + def sql_client(self) -> SqlClientBase[Any]: + self._ensure_client_and_schema() + return self._sql_client + + def _destination_client(self, schema: Schema) -> JobClientBase: + return get_destination_clients( + schema, destination=self._destination, destination_dataset_name=self._dataset_name + )[0] + + def _ensure_client_and_schema(self) -> None: + """Lazy load schema and client""" + + # full schema given, nothing to do + if not self._schema and isinstance(self._provided_schema, Schema): + self._schema = self._provided_schema + + # schema name given, resolve it from destination by name + elif not self._schema and isinstance(self._provided_schema, str): + with self._destination_client(Schema(self._provided_schema)) as client: + if isinstance(client, WithStateSync): + stored_schema = client.get_stored_schema(self._provided_schema) + if stored_schema: + self._schema = Schema.from_stored_schema(json.loads(stored_schema.schema)) + else: + self._schema = Schema(self._provided_schema) + + # no schema name given, load newest schema from destination + elif not self._schema: + with self._destination_client(Schema(self._dataset_name)) as client: + if isinstance(client, WithStateSync): + stored_schema = client.get_stored_schema() + if stored_schema: + self._schema = Schema.from_stored_schema(json.loads(stored_schema.schema)) + + # default to empty schema with dataset name + if not self._schema: + self._schema = Schema(self._dataset_name) + + # here we create the client bound to the resolved schema + if not self._sql_client: + destination_client = self._destination_client(self._schema) + if isinstance(destination_client, WithSqlClient): + self._sql_client = destination_client.sql_client + else: + raise Exception( + f"Destination {destination_client.config.destination_type} does not support" + " SqlClient." 
+ ) + + def __call__(self, query: Any) -> ReadableDBAPIRelation: + return ReadableDBAPIRelation(readable_dataset=self, provided_query=query) # type: ignore[abstract] + + def table(self, table_name: str) -> SupportsReadableRelation: + # we can create an ibis powered relation if ibis is available + if table_name in self.schema.tables and self._dataset_type in ("auto", "ibis"): + try: + from dlt.helpers.ibis import create_unbound_ibis_table + from dlt.destinations.dataset.ibis_relation import ReadableIbisRelation + + unbound_table = create_unbound_ibis_table(self.sql_client, self.schema, table_name) + return ReadableIbisRelation(readable_dataset=self, ibis_object=unbound_table, columns_schema=self.schema.tables[table_name]["columns"]) # type: ignore[abstract] + except MissingDependencyException: + # if ibis is explicitly requested, reraise + if self._dataset_type == "ibis": + raise + + # fallback to the standard dbapi relation + return ReadableDBAPIRelation( + readable_dataset=self, + table_name=table_name, + ) # type: ignore[abstract] + + def row_counts( + self, *, data_tables: bool = True, dlt_tables: bool = False, table_names: List[str] = None + ) -> SupportsReadableRelation: + """Returns a dictionary of table names and their row counts, returns counts of all data tables by default""" + """If table_names is provided, only the tables in the list are returned regardless of the data_tables and dlt_tables flags""" + + selected_tables = table_names or [] + if not selected_tables: + if data_tables: + selected_tables += self.schema.data_table_names(seen_data_only=True) + if dlt_tables: + selected_tables += self.schema.dlt_table_names() + + # Build UNION ALL query to get row counts for all selected tables + queries = [] + for table in selected_tables: + queries.append( + f"SELECT '{table}' as table_name, COUNT(*) as row_count FROM" + f" {self.sql_client.make_qualified_table_name(table)}" + ) + + query = " UNION ALL ".join(queries) + + # Execute query and build result dict + return self(query) + + def __getitem__(self, table_name: str) -> SupportsReadableRelation: + """access of table via dict notation""" + return self.table(table_name) + + def __getattr__(self, table_name: str) -> SupportsReadableRelation: + """access of table via property notation""" + return self.table(table_name) diff --git a/dlt/destinations/dataset/exceptions.py b/dlt/destinations/dataset/exceptions.py new file mode 100644 index 0000000000..17e8f6b563 --- /dev/null +++ b/dlt/destinations/dataset/exceptions.py @@ -0,0 +1,22 @@ +from dlt.common.exceptions import DltException + + +class DatasetException(DltException): + pass + + +class ReadableRelationHasQueryException(DatasetException): + def __init__(self, attempted_change: str) -> None: + msg = ( + "This readable relation was created with a provided sql query. You cannot change" + f" {attempted_change}. Please change the orignal sql query." + ) + super().__init__(msg) + + +class ReadableRelationUnknownColumnException(DatasetException): + def __init__(self, column_name: str) -> None: + msg = ( + f"The selected column {column_name} is not known in the dlt schema for this releation." 
+ ) + super().__init__(msg) diff --git a/dlt/destinations/dataset/factory.py b/dlt/destinations/dataset/factory.py new file mode 100644 index 0000000000..8ea0ddf7a1 --- /dev/null +++ b/dlt/destinations/dataset/factory.py @@ -0,0 +1,22 @@ +from typing import Union + + +from dlt.common.destination import AnyDestination +from dlt.common.destination.reference import ( + SupportsReadableDataset, + TDatasetType, + TDestinationReferenceArg, +) + +from dlt.common.schema import Schema + +from dlt.destinations.dataset.dataset import ReadableDBAPIDataset + + +def dataset( + destination: TDestinationReferenceArg, + dataset_name: str, + schema: Union[Schema, str, None] = None, + dataset_type: TDatasetType = "auto", +) -> SupportsReadableDataset: + return ReadableDBAPIDataset(destination, dataset_name, schema, dataset_type) diff --git a/dlt/destinations/dataset/ibis_relation.py b/dlt/destinations/dataset/ibis_relation.py new file mode 100644 index 0000000000..632298ad56 --- /dev/null +++ b/dlt/destinations/dataset/ibis_relation.py @@ -0,0 +1,224 @@ +from typing import TYPE_CHECKING, Any, Union, Sequence + +from functools import partial + +from dlt.common.exceptions import MissingDependencyException +from dlt.destinations.dataset.relation import BaseReadableDBAPIRelation +from dlt.common.schema.typing import TTableSchemaColumns + + +if TYPE_CHECKING: + from dlt.destinations.dataset.dataset import ReadableDBAPIDataset +else: + ReadableDBAPIDataset = Any + +try: + from dlt.helpers.ibis import Expr +except MissingDependencyException: + Expr = Any + +# map dlt destination to sqlglot dialect +DIALECT_MAP = { + "dlt.destinations.duckdb": "duckdb", # works + "dlt.destinations.motherduck": "duckdb", # works + "dlt.destinations.clickhouse": "clickhouse", # works + "dlt.destinations.databricks": "databricks", # works + "dlt.destinations.bigquery": "bigquery", # works + "dlt.destinations.postgres": "postgres", # works + "dlt.destinations.redshift": "redshift", # works + "dlt.destinations.snowflake": "snowflake", # works + "dlt.destinations.mssql": "tsql", # works + "dlt.destinations.synapse": "tsql", # works + "dlt.destinations.athena": "trino", # works + "dlt.destinations.filesystem": "duckdb", # works + "dlt.destinations.dremio": "presto", # works + # NOTE: can we discover the current dialect in sqlalchemy? 
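Stepping back to the new dlt.destinations.dataset package introduced above, a minimal usage sketch (a hedged example, not part of the diff; the destination, dataset and table names are placeholders and assume data has already been loaded to that dataset):

from dlt.destinations.dataset import dataset

# open a read-only view on an existing dataset; names are placeholders
ds = dataset("duckdb", dataset_name="my_pipeline_dataset")

# item access returns a relation: with ibis installed and dataset_type="auto" (the default)
# this is a ReadableIbisRelation, otherwise the plain SQL-building ReadableDBAPIRelation
items = ds["items"]
print(items.head(10).df())  # both relation flavors expose df()/arrow()/fetchall()

# row counts for data (and optionally dlt) tables come back as a relation as well
print(ds.row_counts(dlt_tables=True).fetchall())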
+ "dlt.destinations.sqlalchemy": "mysql", # may work +} + +# NOTE: some dialects are not supported by ibis, but by sqlglot, these need to +# be transpiled with a intermediary step +TRANSPILE_VIA_MAP = { + "tsql": "postgres", + "databricks": "postgres", + "clickhouse": "postgres", + "redshift": "postgres", + "presto": "postgres", +} + + +class ReadableIbisRelation(BaseReadableDBAPIRelation): + def __init__( + self, + *, + readable_dataset: ReadableDBAPIDataset, + ibis_object: Any = None, + columns_schema: TTableSchemaColumns = None, + ) -> None: + """Create a lazy evaluated relation to for the dataset of a destination""" + super().__init__(readable_dataset=readable_dataset) + self._ibis_object = ibis_object + self._columns_schema = columns_schema + + @property + def query(self) -> Any: + """build the query""" + + from dlt.helpers.ibis import ibis, sqlglot + + destination_type = self._dataset._destination.destination_type + target_dialect = DIALECT_MAP[destination_type] + + # render sql directly if possible + if target_dialect not in TRANSPILE_VIA_MAP: + return ibis.to_sql(self._ibis_object, dialect=target_dialect) + + # here we need to transpile first + transpile_via = TRANSPILE_VIA_MAP[target_dialect] + sql = ibis.to_sql(self._ibis_object, dialect=transpile_via) + sql = sqlglot.transpile(sql, read=transpile_via, write=target_dialect)[0] + return sql + + @property + def columns_schema(self) -> TTableSchemaColumns: + return self.compute_columns_schema() + + @columns_schema.setter + def columns_schema(self, new_value: TTableSchemaColumns) -> None: + raise NotImplementedError("columns schema in ReadableDBAPIRelation can only be computed") + + def compute_columns_schema(self) -> TTableSchemaColumns: + """provide schema columns for the cursor, may be filtered by selected columns""" + # TODO: provide column lineage tracing with sqlglot lineage + return self._columns_schema + + def _proxy_expression_method(self, method_name: str, *args: Any, **kwargs: Any) -> Any: + """Proxy method calls to the underlying ibis expression, allowing to wrap the resulting expression in a new relation""" + + # Get the method from the expression + method = getattr(self._ibis_object, method_name) + + # unwrap args and kwargs if they are relations + args = tuple( + arg._ibis_object if isinstance(arg, ReadableIbisRelation) else arg for arg in args + ) + kwargs = { + k: v._ibis_object if isinstance(v, ReadableIbisRelation) else v + for k, v in kwargs.items() + } + + # casefold string params, we assume these are column names + args = tuple( + self.sql_client.capabilities.casefold_identifier(arg) if isinstance(arg, str) else arg + for arg in args + ) + kwargs = { + k: self.sql_client.capabilities.casefold_identifier(v) if isinstance(v, str) else v + for k, v in kwargs.items() + } + + # Call it with provided args + result = method(*args, **kwargs) + + # calculate columns schema for the result, some operations we know will not change the schema + # and select will just reduce the amount of column + columns_schema = None + if method_name == "select": + columns_schema = self._get_filtered_columns_schema(args) + elif method_name in ["filter", "limit", "order_by", "head"]: + columns_schema = self._columns_schema + + # If result is an ibis expression, wrap it in a new relation else return raw result + return self.__class__( + readable_dataset=self._dataset, ibis_object=result, columns_schema=columns_schema + ) + + def __getattr__(self, name: str) -> Any: + """Wrap all callable attributes of the expression""" + + attr = 
getattr(self._ibis_object, name, None) + + # try casefolded name for ibis columns access + if attr is None: + name = self.sql_client.capabilities.casefold_identifier(name) + attr = getattr(self._ibis_object, name, None) + + if attr is None: + raise AttributeError(f"'{self.__class__.__name__}' object has no attribute '{name}'") + + if not callable(attr): + # NOTE: we don't need to forward columns schema for non-callable attributes, these are usually columns + return self.__class__(readable_dataset=self._dataset, ibis_object=attr) + + return partial(self._proxy_expression_method, name) + + def __getitem__(self, columns: Union[str, Sequence[str]]) -> "ReadableIbisRelation": + # casefold column-names + columns = [columns] if isinstance(columns, str) else columns + columns = [self.sql_client.capabilities.casefold_identifier(col) for col in columns] + expr = self._ibis_object[columns] + return self.__class__( + readable_dataset=self._dataset, + ibis_object=expr, + columns_schema=self._get_filtered_columns_schema(columns), + ) + + def _get_filtered_columns_schema(self, columns: Sequence[str]) -> TTableSchemaColumns: + if not self._columns_schema: + return None + try: + return {col: self._columns_schema[col] for col in columns} + except KeyError: + # NOTE: select statements can contain new columns not present in the original schema + # here we just break the column schema inheritance chain + return None + + # forward ibis methods defined on interface + def limit(self, limit: int, **kwargs: Any) -> "ReadableIbisRelation": + """limit the result to 'limit' items""" + return self._proxy_expression_method("limit", limit, **kwargs) # type: ignore + + def head(self, limit: int = 5) -> "ReadableIbisRelation": + """limit the result to 5 items by default""" + return self._proxy_expression_method("head", limit) # type: ignore + + def select(self, *columns: str) -> "ReadableIbisRelation": + """set which columns will be selected""" + return self._proxy_expression_method("select", *columns) # type: ignore + + # forward ibis comparison and math operators + def __lt__(self, other: Any) -> "ReadableIbisRelation": + return self._proxy_expression_method("__lt__", other) # type: ignore + + def __gt__(self, other: Any) -> "ReadableIbisRelation": + return self._proxy_expression_method("__gt__", other) # type: ignore + + def __ge__(self, other: Any) -> "ReadableIbisRelation": + return self._proxy_expression_method("__ge__", other) # type: ignore + + def __le__(self, other: Any) -> "ReadableIbisRelation": + return self._proxy_expression_method("__le__", other) # type: ignore + + def __eq__(self, other: Any) -> bool: + return self._proxy_expression_method("__eq__", other) # type: ignore + + def __ne__(self, other: Any) -> bool: + return self._proxy_expression_method("__ne__", other) # type: ignore + + def __and__(self, other: Any) -> "ReadableIbisRelation": + return self._proxy_expression_method("__and__", other) # type: ignore + + def __or__(self, other: Any) -> "ReadableIbisRelation": + return self._proxy_expression_method("__or__", other) # type: ignore + + def __mul__(self, other: Any) -> "ReadableIbisRelation": + return self._proxy_expression_method("__mul__", other) # type: ignore + + def __div__(self, other: Any) -> "ReadableIbisRelation": + return self._proxy_expression_method("__div__", other) # type: ignore + + def __add__(self, other: Any) -> "ReadableIbisRelation": + return self._proxy_expression_method("__add__", other) # type: ignore + + def __sub__(self, other: Any) -> "ReadableIbisRelation": + return 
self._proxy_expression_method("__sub__", other) # type: ignore diff --git a/dlt/destinations/dataset/relation.py b/dlt/destinations/dataset/relation.py new file mode 100644 index 0000000000..2cdb7640df --- /dev/null +++ b/dlt/destinations/dataset/relation.py @@ -0,0 +1,207 @@ +from typing import Any, Generator, Sequence, Union, TYPE_CHECKING + +from contextlib import contextmanager + + +from dlt.common.destination.reference import ( + SupportsReadableRelation, +) + +from dlt.destinations.dataset.exceptions import ( + ReadableRelationHasQueryException, + ReadableRelationUnknownColumnException, +) + +from dlt.common.schema.typing import TTableSchemaColumns +from dlt.destinations.sql_client import SqlClientBase +from dlt.common.schema import Schema + +if TYPE_CHECKING: + from dlt.destinations.dataset.dataset import ReadableDBAPIDataset +else: + ReadableDBAPIDataset = Any + + +class BaseReadableDBAPIRelation(SupportsReadableRelation): + def __init__( + self, + *, + readable_dataset: "ReadableDBAPIDataset", + ) -> None: + """Create a lazy evaluated relation to for the dataset of a destination""" + + self._dataset = readable_dataset + + # wire protocol functions + self.df = self._wrap_func("df") # type: ignore + self.arrow = self._wrap_func("arrow") # type: ignore + self.fetchall = self._wrap_func("fetchall") # type: ignore + self.fetchmany = self._wrap_func("fetchmany") # type: ignore + self.fetchone = self._wrap_func("fetchone") # type: ignore + + self.iter_df = self._wrap_iter("iter_df") # type: ignore + self.iter_arrow = self._wrap_iter("iter_arrow") # type: ignore + self.iter_fetch = self._wrap_iter("iter_fetch") # type: ignore + + @property + def sql_client(self) -> SqlClientBase[Any]: + return self._dataset.sql_client + + @property + def schema(self) -> Schema: + return self._dataset.schema + + @property + def query(self) -> Any: + raise NotImplementedError("No query in ReadableDBAPIRelation") + + @contextmanager + def cursor(self) -> Generator[SupportsReadableRelation, Any, Any]: + """Gets a DBApiCursor for the current relation""" + with self.sql_client as client: + # this hacky code is needed for mssql to disable autocommit, read iterators + # will not work otherwise. 
in the future we should be able to create a readony + # client which will do this automatically + if hasattr(self.sql_client, "_conn") and hasattr(self.sql_client._conn, "autocommit"): + self.sql_client._conn.autocommit = False + with client.execute_query(self.query) as cursor: + if columns_schema := self.columns_schema: + cursor.columns_schema = columns_schema + yield cursor + + def _wrap_iter(self, func_name: str) -> Any: + """wrap SupportsReadableRelation generators in cursor context""" + + def _wrap(*args: Any, **kwargs: Any) -> Any: + with self.cursor() as cursor: + yield from getattr(cursor, func_name)(*args, **kwargs) + + return _wrap + + def _wrap_func(self, func_name: str) -> Any: + """wrap SupportsReadableRelation functions in cursor context""" + + def _wrap(*args: Any, **kwargs: Any) -> Any: + with self.cursor() as cursor: + return getattr(cursor, func_name)(*args, **kwargs) + + return _wrap + + +class ReadableDBAPIRelation(BaseReadableDBAPIRelation): + def __init__( + self, + *, + readable_dataset: "ReadableDBAPIDataset", + provided_query: Any = None, + table_name: str = None, + limit: int = None, + selected_columns: Sequence[str] = None, + ) -> None: + """Create a lazy evaluated relation to for the dataset of a destination""" + + # NOTE: we can keep an assertion here, this class will not be created by the user + assert bool(table_name) != bool( + provided_query + ), "Please provide either an sql query OR a table_name" + + super().__init__(readable_dataset=readable_dataset) + + self._provided_query = provided_query + self._table_name = table_name + self._limit = limit + self._selected_columns = selected_columns + + @property + def query(self) -> Any: + """build the query""" + if self._provided_query: + return self._provided_query + + table_name = self.sql_client.make_qualified_table_name( + self.schema.naming.normalize_path(self._table_name) + ) + + maybe_limit_clause_1 = "" + maybe_limit_clause_2 = "" + if self._limit: + maybe_limit_clause_1, maybe_limit_clause_2 = self.sql_client._limit_clause_sql( + self._limit + ) + + selector = "*" + if self._selected_columns: + selector = ",".join( + [ + self.sql_client.escape_column_name(self.schema.naming.normalize_tables_path(c)) + for c in self._selected_columns + ] + ) + + return f"SELECT {maybe_limit_clause_1} {selector} FROM {table_name} {maybe_limit_clause_2}" + + @property + def columns_schema(self) -> TTableSchemaColumns: + return self.compute_columns_schema() + + @columns_schema.setter + def columns_schema(self, new_value: TTableSchemaColumns) -> None: + raise NotImplementedError("columns schema in ReadableDBAPIRelation can only be computed") + + def compute_columns_schema(self) -> TTableSchemaColumns: + """provide schema columns for the cursor, may be filtered by selected columns""" + + columns_schema = ( + self.schema.tables.get(self._table_name, {}).get("columns", {}) if self.schema else {} + ) + + if not columns_schema: + return None + if not self._selected_columns: + return columns_schema + + filtered_columns: TTableSchemaColumns = {} + for sc in self._selected_columns: + sc = self.schema.naming.normalize_path(sc) + if sc not in columns_schema.keys(): + raise ReadableRelationUnknownColumnException(sc) + filtered_columns[sc] = columns_schema[sc] + + return filtered_columns + + def __copy__(self) -> "ReadableDBAPIRelation": + return self.__class__( + readable_dataset=self._dataset, + provided_query=self._provided_query, + table_name=self._table_name, + limit=self._limit, + selected_columns=self._selected_columns, + ) + + def 
limit(self, limit: int, **kwargs: Any) -> "ReadableDBAPIRelation": + if self._provided_query: + raise ReadableRelationHasQueryException("limit") + rel = self.__copy__() + rel._limit = limit + return rel + + def select(self, *columns: str) -> "ReadableDBAPIRelation": + if self._provided_query: + raise ReadableRelationHasQueryException("select") + rel = self.__copy__() + rel._selected_columns = columns + # NOTE: the line below will ensure that no unknown columns are selected if + # schema is known + rel.compute_columns_schema() + return rel + + def __getitem__(self, columns: Union[str, Sequence[str]]) -> "SupportsReadableRelation": + if isinstance(columns, str): + return self.select(columns) + elif isinstance(columns, Sequence): + return self.select(*columns) + else: + raise TypeError(f"Invalid argument type: {type(columns).__name__}") + + def head(self, limit: int = 5) -> "ReadableDBAPIRelation": + return self.limit(limit) diff --git a/dlt/destinations/dataset/utils.py b/dlt/destinations/dataset/utils.py new file mode 100644 index 0000000000..766fbc13ea --- /dev/null +++ b/dlt/destinations/dataset/utils.py @@ -0,0 +1,95 @@ +from typing import Tuple + +from dlt import version + +from dlt.common.exceptions import MissingDependencyException + +from dlt.common.destination import AnyDestination +from dlt.common.destination.reference import ( + Destination, + JobClientBase, + DestinationClientDwhConfiguration, + DestinationClientStagingConfiguration, + DestinationClientConfiguration, + DestinationClientDwhWithStagingConfiguration, +) + +from dlt.common.schema import Schema + + +# helpers +def get_destination_client_initial_config( + destination: AnyDestination, + default_schema_name: str, + dataset_name: str, + as_staging: bool = False, +) -> DestinationClientConfiguration: + client_spec = destination.spec + + # this client supports many schemas and datasets + if issubclass(client_spec, DestinationClientDwhConfiguration): + if issubclass(client_spec, DestinationClientStagingConfiguration): + spec: DestinationClientDwhConfiguration = client_spec(as_staging_destination=as_staging) + else: + spec = client_spec() + + spec._bind_dataset_name(dataset_name, default_schema_name) + return spec + + return client_spec() + + +def get_destination_clients( + schema: Schema, + destination: AnyDestination = None, + destination_dataset_name: str = None, + destination_initial_config: DestinationClientConfiguration = None, + staging: AnyDestination = None, + staging_dataset_name: str = None, + staging_initial_config: DestinationClientConfiguration = None, + # pipeline specific settings + default_schema_name: str = None, +) -> Tuple[JobClientBase, JobClientBase]: + destination = Destination.from_reference(destination) if destination else None + staging = Destination.from_reference(staging) if staging else None + + try: + # resolve staging config in order to pass it to destination client config + staging_client = None + if staging: + if not staging_initial_config: + # this is just initial config - without user configuration injected + staging_initial_config = get_destination_client_initial_config( + staging, + dataset_name=staging_dataset_name, + default_schema_name=default_schema_name, + as_staging=True, + ) + # create the client - that will also resolve the config + staging_client = staging.client(schema, staging_initial_config) + + if not destination_initial_config: + # config is not provided then get it with injected credentials + initial_config = get_destination_client_initial_config( + destination, + 
dataset_name=destination_dataset_name, + default_schema_name=default_schema_name, + ) + + # attach the staging client config to destination client config - if its type supports it + if ( + staging_client + and isinstance(initial_config, DestinationClientDwhWithStagingConfiguration) + and isinstance(staging_client.config, DestinationClientStagingConfiguration) + ): + initial_config.staging_config = staging_client.config + # create instance with initial_config properly set + client = destination.client(schema, initial_config) + return client, staging_client + except ModuleNotFoundError: + client_spec = destination.spec() + raise MissingDependencyException( + f"{client_spec.destination_type} destination", + [f"{version.DLT_PKG_NAME}[{client_spec.destination_type}]"], + "Dependencies for specific destinations are available as extras of dlt", + ) diff --git a/dlt/destinations/impl/bigquery/bigquery.py b/dlt/destinations/impl/bigquery/bigquery.py index 2b3927e7c9..10a344f768 100644 --- a/dlt/destinations/impl/bigquery/bigquery.py +++ b/dlt/destinations/impl/bigquery/bigquery.py @@ -401,10 +401,7 @@ def _get_info_schema_columns_query( return query, folded_table_names def _get_column_def_sql(self, column: TColumnSchema, table: PreparedTableSchema = None) -> str: - name = self.sql_client.escape_column_name(column["name"]) - column_def_sql = ( - f"{name} {self.type_mapper.to_destination_type(column, table)} {self._gen_not_null(column.get('nullable', True))}" - ) + column_def_sql = super()._get_column_def_sql(column, table) if column.get(ROUND_HALF_EVEN_HINT, False): column_def_sql += " OPTIONS (rounding_mode='ROUND_HALF_EVEN')" if column.get(ROUND_HALF_AWAY_FROM_ZERO_HINT, False): diff --git a/dlt/destinations/impl/clickhouse/clickhouse.py b/dlt/destinations/impl/clickhouse/clickhouse.py index 3a5f5c3e28..a407e56361 100644 --- a/dlt/destinations/impl/clickhouse/clickhouse.py +++ b/dlt/destinations/impl/clickhouse/clickhouse.py @@ -292,11 +292,10 @@ def _get_table_update_sql( return sql - @staticmethod - def _gen_not_null(v: bool) -> str: + def _gen_not_null(self, v: bool) -> str: # ClickHouse fields are not nullable by default. # We use the `Nullable` modifier instead of NULL / NOT NULL modifiers to cater for ALTER statement. 
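For the Databricks hunks a little further down (configuration.py and sql_client.py), a hedged sketch of how the new credential validation behaves; hostnames and secrets are placeholders, and on_resolved() is normally invoked by dlt's configuration resolution rather than by user code:

from dlt.common.configuration.exceptions import ConfigurationValueError
from dlt.destinations.impl.databricks.configuration import DatabricksCredentials

creds = DatabricksCredentials()
creds.server_hostname = "adb-1234567890123456.7.azuredatabricks.net"  # placeholder
creds.http_path = "/sql/1.0/warehouses/abc123"  # placeholder

# with neither the OAuth pair nor an access_token, the new check rejects the configuration
try:
    creds.on_resolved()
except ConfigurationValueError:
    print("need client_id/client_secret or access_token")

# an OAuth service principal is now sufficient; the sql client then builds the
# credentials_provider via databricks.sdk.core.oauth_service_principal
creds.client_id = "my-client-id"  # placeholder
creds.client_secret = "my-client-secret"  # placeholder
creds.on_resolved()  # passes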
- pass + return "" def _from_db_type( self, ch_t: str, precision: Optional[int], scale: Optional[int] diff --git a/dlt/destinations/impl/databricks/configuration.py b/dlt/destinations/impl/databricks/configuration.py index c95b6eba4c..21338bd310 100644 --- a/dlt/destinations/impl/databricks/configuration.py +++ b/dlt/destinations/impl/databricks/configuration.py @@ -4,6 +4,7 @@ from dlt.common.typing import TSecretStrValue from dlt.common.configuration.specs.base_configuration import CredentialsConfiguration, configspec from dlt.common.destination.reference import DestinationClientDwhWithStagingConfiguration +from dlt.common.configuration.exceptions import ConfigurationValueError DATABRICKS_APPLICATION_ID = "dltHub_dlt" @@ -15,6 +16,8 @@ class DatabricksCredentials(CredentialsConfiguration): server_hostname: str = None http_path: str = None access_token: Optional[TSecretStrValue] = None + client_id: Optional[TSecretStrValue] = None + client_secret: Optional[TSecretStrValue] = None http_headers: Optional[Dict[str, str]] = None session_configuration: Optional[Dict[str, Any]] = None """Dict of session parameters that will be passed to `databricks.sql.connect`""" @@ -27,9 +30,18 @@ class DatabricksCredentials(CredentialsConfiguration): "server_hostname", "http_path", "catalog", + "client_id", + "client_secret", "access_token", ] + def on_resolved(self) -> None: + if not ((self.client_id and self.client_secret) or self.access_token): + raise ConfigurationValueError( + "No valid authentication method detected. Provide either 'client_id' and" + " 'client_secret' for OAuth, or 'access_token' for token-based authentication." + ) + def to_connector_params(self) -> Dict[str, Any]: conn_params = dict( catalog=self.catalog, diff --git a/dlt/destinations/impl/databricks/databricks.py b/dlt/destinations/impl/databricks/databricks.py index 2bb68a607e..a83db6ec34 100644 --- a/dlt/destinations/impl/databricks/databricks.py +++ b/dlt/destinations/impl/databricks/databricks.py @@ -264,12 +264,6 @@ def _from_db_type( ) -> TColumnType: return self.type_mapper.from_destination_type(bq_t, precision, scale) - def _get_column_def_sql(self, c: TColumnSchema, table: PreparedTableSchema = None) -> str: - name = self.sql_client.escape_column_name(c["name"]) - return ( - f"{name} {self.type_mapper.to_destination_type(c,table)} {self._gen_not_null(c.get('nullable', True))}" - ) - def _get_storage_table_query_columns(self) -> List[str]: fields = super()._get_storage_table_query_columns() fields[2] = ( # Override because this is the only way to get data type with precision diff --git a/dlt/destinations/impl/databricks/sql_client.py b/dlt/destinations/impl/databricks/sql_client.py index 8bff4e0d73..16e1e73d93 100644 --- a/dlt/destinations/impl/databricks/sql_client.py +++ b/dlt/destinations/impl/databricks/sql_client.py @@ -11,10 +11,12 @@ Tuple, Union, Dict, + cast, + Callable, ) - -from databricks import sql as databricks_lib +from databricks.sdk.core import Config, oauth_service_principal +from databricks import sql as databricks_lib # type: ignore[attr-defined] from databricks.sql.client import ( Connection as DatabricksSqlConnection, Cursor as DatabricksSqlCursor, @@ -73,8 +75,22 @@ def __init__( self._conn: DatabricksSqlConnection = None self.credentials = credentials + def _get_oauth_credentials(self) -> Optional[Callable[[], Dict[str, str]]]: + config = Config( + host=f"https://{self.credentials.server_hostname}", + client_id=self.credentials.client_id, + client_secret=self.credentials.client_secret, + ) + return 
cast(Callable[[], Dict[str, str]], oauth_service_principal(config)) + def open_connection(self) -> DatabricksSqlConnection: conn_params = self.credentials.to_connector_params() + + if self.credentials.client_id and self.credentials.client_secret: + conn_params["credentials_provider"] = self._get_oauth_credentials + else: + conn_params["access_token"] = self.credentials.access_token + self._conn = databricks_lib.connect( **conn_params, schema=self.dataset_name, use_inline_params="silent" ) diff --git a/dlt/destinations/impl/dremio/dremio.py b/dlt/destinations/impl/dremio/dremio.py index ab23f58ab4..e3a090c824 100644 --- a/dlt/destinations/impl/dremio/dremio.py +++ b/dlt/destinations/impl/dremio/dremio.py @@ -151,12 +151,6 @@ def _from_db_type( ) -> TColumnType: return self.type_mapper.from_destination_type(bq_t, precision, scale) - def _get_column_def_sql(self, c: TColumnSchema, table: PreparedTableSchema = None) -> str: - name = self.sql_client.escape_column_name(c["name"]) - return ( - f"{name} {self.type_mapper.to_destination_type(c,table)} {self._gen_not_null(c.get('nullable', True))}" - ) - def _create_merge_followup_jobs( self, table_chain: Sequence[PreparedTableSchema] ) -> List[FollowupJobRequest]: diff --git a/dlt/destinations/impl/duckdb/duck.py b/dlt/destinations/impl/duckdb/duck.py index 3bd4c83e1f..2b3370270b 100644 --- a/dlt/destinations/impl/duckdb/duck.py +++ b/dlt/destinations/impl/duckdb/duck.py @@ -74,17 +74,6 @@ def create_load_job( job = DuckDbCopyJob(file_path) return job - def _get_column_def_sql(self, c: TColumnSchema, table: PreparedTableSchema = None) -> str: - hints_str = " ".join( - self.active_hints.get(h, "") - for h in self.active_hints.keys() - if c.get(h, False) is True - ) - column_name = self.sql_client.escape_column_name(c["name"]) - return ( - f"{column_name} {self.type_mapper.to_destination_type(c,table)} {hints_str} {self._gen_not_null(c.get('nullable', True))}" - ) - def _from_db_type( self, pq_t: str, precision: Optional[int], scale: Optional[int] ) -> TColumnType: diff --git a/dlt/destinations/impl/filesystem/factory.py b/dlt/destinations/impl/filesystem/factory.py index 2463da58fa..906bd157e4 100644 --- a/dlt/destinations/impl/filesystem/factory.py +++ b/dlt/destinations/impl/filesystem/factory.py @@ -19,7 +19,7 @@ def filesystem_loader_file_format_selector( *, table_schema: TTableSchema, ) -> t.Tuple[TLoaderFileFormat, t.Sequence[TLoaderFileFormat]]: - if table_schema.get("table_format") == "delta": + if table_schema.get("table_format") in ("delta", "iceberg"): return ("parquet", ["parquet"]) return (preferred_loader_file_format, supported_loader_file_formats) @@ -43,7 +43,7 @@ def _raw_capabilities(self) -> DestinationCapabilitiesContext: caps = DestinationCapabilitiesContext.generic_capabilities( preferred_loader_file_format="jsonl", loader_file_format_selector=filesystem_loader_file_format_selector, - supported_table_formats=["delta"], + supported_table_formats=["delta", "iceberg"], supported_merge_strategies=["upsert"], merge_strategies_selector=filesystem_merge_strategies_selector, ) diff --git a/dlt/destinations/impl/filesystem/filesystem.py b/dlt/destinations/impl/filesystem/filesystem.py index 1739c87fb3..ccf764811b 100644 --- a/dlt/destinations/impl/filesystem/filesystem.py +++ b/dlt/destinations/impl/filesystem/filesystem.py @@ -119,16 +119,27 @@ def metrics(self) -> Optional[LoadJobMetrics]: return m._replace(remote_url=self.make_remote_url()) -class DeltaLoadFilesystemJob(FilesystemLoadJob): +class 
TableFormatLoadFilesystemJob(FilesystemLoadJob): def __init__(self, file_path: str) -> None: super().__init__(file_path=file_path) self.file_paths = ReferenceFollowupJobRequest.resolve_references(self._file_path) def make_remote_path(self) -> str: - # remote path is table dir - delta will create its file structure inside it return self._job_client.get_table_dir(self.load_table_name) + @property + def arrow_dataset(self) -> Any: + from dlt.common.libs.pyarrow import pyarrow + + return pyarrow.dataset.dataset(self.file_paths) + + @property + def _partition_columns(self) -> List[str]: + return get_columns_names_with_prop(self._load_table, "partition") + + +class DeltaLoadFilesystemJob(TableFormatLoadFilesystemJob): def run(self) -> None: # create Arrow dataset from Parquet files from dlt.common.libs.pyarrow import pyarrow as pa @@ -138,7 +149,7 @@ def run(self) -> None: f"Will copy file(s) {self.file_paths} to delta table {self.make_remote_url()} [arrow" f" buffer: {pa.total_allocated_bytes()}]" ) - source_ds = pa.dataset.dataset(self.file_paths) + source_ds = self.arrow_dataset delta_table = self._delta_table() # explicitly check if there is data @@ -148,9 +159,6 @@ def run(self) -> None: else: with source_ds.scanner().to_reader() as arrow_rbr: # RecordBatchReader if self._load_table["write_disposition"] == "merge" and delta_table is not None: - self._load_table["x-merge-strategy"] = resolve_merge_strategy( # type: ignore[typeddict-unknown-key] - self._schema.tables, self._load_table, self._job_client.capabilities - ) merge_delta_table( table=delta_table, data=arrow_rbr, @@ -188,10 +196,6 @@ def _delta_table(self) -> Optional["DeltaTable"]: # type: ignore[name-defined] else: return None - @property - def _partition_columns(self) -> List[str]: - return get_columns_names_with_prop(self._load_table, "partition") - def _create_or_evolve_delta_table(self, arrow_ds: "Dataset", delta_table: "DeltaTable") -> "DeltaTable": # type: ignore[name-defined] # noqa: F821 from dlt.common.libs.deltalake import ( DeltaTable, @@ -211,13 +215,36 @@ def _create_or_evolve_delta_table(self, arrow_ds: "Dataset", delta_table: "Delta return _evolve_delta_table_schema(delta_table, arrow_ds.schema) +class IcebergLoadFilesystemJob(TableFormatLoadFilesystemJob): + def run(self) -> None: + from dlt.common.libs.pyiceberg import write_iceberg_table + + write_iceberg_table( + table=self._iceberg_table(), + data=self.arrow_dataset.to_table(), + write_disposition=self._load_table["write_disposition"], + ) + + def _iceberg_table(self) -> "pyiceberg.table.Table": # type: ignore[name-defined] # noqa: F821 + from dlt.common.libs.pyiceberg import get_catalog + + catalog = get_catalog( + client=self._job_client, + table_name=self.load_table_name, + schema=self.arrow_dataset.schema, + partition_columns=self._partition_columns, + ) + return catalog.load_table(self.table_identifier) + + @property + def table_identifier(self) -> str: + return f"{self._job_client.dataset_name}.{self.load_table_name}" + + class FilesystemLoadJobWithFollowup(HasFollowupJobs, FilesystemLoadJob): def create_followup_jobs(self, final_state: TLoadJobState) -> List[FollowupJobRequest]: jobs = super().create_followup_jobs(final_state) - if self._load_table.get("table_format") == "delta": - # delta table jobs only require table chain followup jobs - pass - elif final_state == "completed": + if final_state == "completed": ref_job = ReferenceFollowupJobRequest( original_file_name=self.file_name(), 
remote_paths=[self._job_client.make_remote_url(self.make_remote_path())], @@ -394,6 +421,13 @@ def prepare_load_table(self, table_name: str) -> PreparedTableSchema: if table["write_disposition"] == "merge": table["write_disposition"] = "append" table.pop("table_format", None) + merge_strategy = resolve_merge_strategy(self.schema.tables, table, self.capabilities) + if table["write_disposition"] == "merge": + if merge_strategy is None: + # no supported merge strategies, fall back to append + table["write_disposition"] = "append" + else: + table["x-merge-strategy"] = merge_strategy # type: ignore[typeddict-unknown-key] return table def get_table_dir(self, table_name: str, remote: bool = False) -> str: @@ -458,12 +492,20 @@ def create_load_job( # where we want to load the state the regular way if table["name"] == self.schema.state_table_name and not self.config.as_staging_destination: return FinalizedLoadJob(file_path) - if table.get("table_format") == "delta": - import dlt.common.libs.deltalake # assert dependencies are installed + table_format = table.get("table_format") + if table_format in ("delta", "iceberg"): # a reference job for a delta table indicates a table chain followup job if ReferenceFollowupJobRequest.is_reference_job(file_path): - return DeltaLoadFilesystemJob(file_path) + if table_format == "delta": + import dlt.common.libs.deltalake + + return DeltaLoadFilesystemJob(file_path) + elif table_format == "iceberg": + import dlt.common.libs.pyiceberg + + return IcebergLoadFilesystemJob(file_path) + # otherwise just continue return FinalizedLoadJobWithFollowupJobs(file_path) @@ -494,10 +536,10 @@ def should_load_data_to_staging_dataset(self, table_name: str) -> bool: def should_truncate_table_before_load(self, table_name: str) -> bool: table = self.prepare_load_table(table_name) - return ( - table["write_disposition"] == "replace" - and not table.get("table_format") == "delta" # Delta can do a logical replace - ) + return table["write_disposition"] == "replace" and not table.get("table_format") in ( + "delta", + "iceberg", + ) # Delta/Iceberg can do a logical replace # # state stuff @@ -718,7 +760,7 @@ def create_table_chain_completed_followup_jobs( jobs = super().create_table_chain_completed_followup_jobs( table_chain, completed_table_chain_jobs ) - if table_chain[0].get("table_format") == "delta": + if table_chain[0].get("table_format") in ("delta", "iceberg"): for table in table_chain: table_job_paths = [ job.file_path diff --git a/dlt/destinations/impl/filesystem/sql_client.py b/dlt/destinations/impl/filesystem/sql_client.py index d03a00b418..e6b84343bb 100644 --- a/dlt/destinations/impl/filesystem/sql_client.py +++ b/dlt/destinations/impl/filesystem/sql_client.py @@ -13,6 +13,7 @@ from dlt.common.destination.reference import DBApiCursor +from dlt.common.storages.fsspec_filesystem import AZURE_BLOB_STORAGE_PROTOCOLS from dlt.destinations.sql_client import raise_database_error from dlt.destinations.impl.duckdb.sql_client import DuckDbSqlClient @@ -169,8 +170,9 @@ def create_authentication(self, persistent: bool = False, secret_name: str = Non # native google storage implementation is not supported.. elif self.fs_client.config.protocol in ["gs", "gcs"]: logger.warn( - "For gs/gcs access via duckdb please use the gs/gcs s3 compatibility layer. Falling" - " back to fsspec." + "For gs/gcs access via duckdb please use the gs/gcs s3 compatibility layer if" + " possible (not supported when using `iceberg` table format). Falling back to" + " fsspec." 
) self._conn.register_filesystem(self.fs_client.fs_client) @@ -192,7 +194,7 @@ def open_connection(self) -> duckdb.DuckDBPyConnection: # the line below solves problems with certificate path lookup on linux # see duckdb docs - if self.fs_client.config.protocol in ["az", "abfss"]: + if self.fs_client.config.protocol in AZURE_BLOB_STORAGE_PROTOCOLS: self._conn.sql("SET azure_transport_option_type = 'curl';") return self._conn @@ -212,14 +214,17 @@ def create_views_for_tables(self, tables: Dict[str, str]) -> None: # unknown views will not be created continue - # only create view if it does not exist in the current schema yet - existing_tables = [tname[0] for tname in self._conn.execute("SHOW TABLES").fetchall()] - if view_name in existing_tables: - continue - # NOTE: if this is staging configuration then `prepare_load_table` will remove some info # from table schema, if we ever extend this to handle staging destination, this needs to change schema_table = self.fs_client.prepare_load_table(table_name) + table_format = schema_table.get("table_format") + + # skip if view already exists and does not need to be replaced each time + existing_tables = [tname[0] for tname in self._conn.execute("SHOW TABLES").fetchall()] + needs_replace = table_format == "iceberg" or self.fs_client.config.protocol == "abfss" + if view_name in existing_tables and not needs_replace: + continue + # discover file type folder = self.fs_client.get_table_dir(table_name) files = self.fs_client.list_table_files(table_name) @@ -256,8 +261,17 @@ def create_views_for_tables(self, tables: Dict[str, str]) -> None: # create from statement from_statement = "" - if schema_table.get("table_format") == "delta": + if table_format == "delta": from_statement = f"delta_scan('{resolved_folder}')" + elif table_format == "iceberg": + from dlt.common.libs.pyiceberg import _get_last_metadata_file + + self._setup_iceberg(self._conn) + metadata_path = f"{resolved_folder}/metadata" + last_metadata_file = _get_last_metadata_file(metadata_path, self.fs_client) + # skip schema inference to make nested data types work + # https://github.com/duckdb/duckdb_iceberg/issues/47 + from_statement = f"iceberg_scan('{last_metadata_file}', skip_schema_inference=True)" elif first_file_type == "parquet": from_statement = f"read_parquet([{resolved_files_string}])" elif first_file_type == "jsonl": @@ -267,12 +281,14 @@ def create_views_for_tables(self, tables: Dict[str, str]) -> None: else: raise NotImplementedError( f"Unknown filetype {first_file_type} for table {table_name}. Currently only" - " jsonl and parquet files as well as delta tables are supported." + " jsonl and parquet files as well as delta and iceberg tables are supported." 
) # create table view_name = self.make_qualified_table_name(view_name) - create_table_sql_base = f"CREATE VIEW {view_name} AS SELECT * FROM {from_statement}" + create_table_sql_base = ( + f"CREATE OR REPLACE VIEW {view_name} AS SELECT * FROM {from_statement}" + ) self._conn.execute(create_table_sql_base) @contextmanager @@ -299,6 +315,16 @@ def execute_query(self, query: AnyStr, *args: Any, **kwargs: Any) -> Iterator[DB with super().execute_query(query, *args, **kwargs) as cursor: yield cursor + @staticmethod + def _setup_iceberg(conn: duckdb.DuckDBPyConnection) -> None: + # needed to make persistent secrets work in new connection + # https://github.com/duckdb/duckdb_iceberg/issues/83 + conn.execute("FROM duckdb_secrets();") + + # `duckdb_iceberg` extension does not support autoloading + # https://github.com/duckdb/duckdb_iceberg/issues/71 + conn.execute("INSTALL iceberg; LOAD iceberg;") + def __del__(self) -> None: if self.memory_db: self.memory_db.close() diff --git a/dlt/destinations/impl/mssql/mssql.py b/dlt/destinations/impl/mssql/mssql.py index 27aebe07f2..7b48a6b551 100644 --- a/dlt/destinations/impl/mssql/mssql.py +++ b/dlt/destinations/impl/mssql/mssql.py @@ -115,11 +115,7 @@ def _get_column_def_sql(self, c: TColumnSchema, table: PreparedTableSchema = Non else: db_type = self.type_mapper.to_destination_type(c, table) - hints_str = " ".join( - self.active_hints.get(h, "") - for h in self.active_hints.keys() - if c.get(h, False) is True - ) + hints_str = self._get_column_hints_sql(c) column_name = self.sql_client.escape_column_name(c["name"]) return f"{column_name} {db_type} {hints_str} {self._gen_not_null(c.get('nullable', True))}" diff --git a/dlt/destinations/impl/postgres/postgres.py b/dlt/destinations/impl/postgres/postgres.py index 2459ee1dbe..3d54b59f93 100644 --- a/dlt/destinations/impl/postgres/postgres.py +++ b/dlt/destinations/impl/postgres/postgres.py @@ -161,18 +161,6 @@ def create_load_job( job = PostgresCsvCopyJob(file_path) return job - def _get_column_def_sql(self, c: TColumnSchema, table: PreparedTableSchema = None) -> str: - hints_ = " ".join( - self.active_hints.get(h, "") - for h in self.active_hints.keys() - if c.get(h, False) is True - ) - column_name = self.sql_client.escape_column_name(c["name"]) - nullability = self._gen_not_null(c.get("nullable", True)) - column_type = self.type_mapper.to_destination_type(c, table) - - return f"{column_name} {column_type} {hints_} {nullability}" - def _create_replace_followup_jobs( self, table_chain: Sequence[PreparedTableSchema] ) -> List[FollowupJobRequest]: diff --git a/dlt/destinations/impl/redshift/redshift.py b/dlt/destinations/impl/redshift/redshift.py index 2335166761..b1aa37ce6a 100644 --- a/dlt/destinations/impl/redshift/redshift.py +++ b/dlt/destinations/impl/redshift/redshift.py @@ -153,6 +153,7 @@ def __init__( capabilities, ) super().__init__(schema, config, sql_client) + self.active_hints = HINT_TO_REDSHIFT_ATTR self.sql_client = sql_client self.config: RedshiftClientConfiguration = config self.type_mapper = self.capabilities.get_type_mapper() @@ -162,17 +163,6 @@ def _create_merge_followup_jobs( ) -> List[FollowupJobRequest]: return [RedshiftMergeJob.from_table_chain(table_chain, self.sql_client)] - def _get_column_def_sql(self, c: TColumnSchema, table: PreparedTableSchema = None) -> str: - hints_str = " ".join( - HINT_TO_REDSHIFT_ATTR.get(h, "") - for h in HINT_TO_REDSHIFT_ATTR.keys() - if c.get(h, False) is True - ) - column_name = self.sql_client.escape_column_name(c["name"]) - return ( - f"{column_name} 
{self.type_mapper.to_destination_type(c,table)} {hints_str} {self._gen_not_null(c.get('nullable', True))}" - ) - def create_load_job( self, table: PreparedTableSchema, file_path: str, load_id: str, restore: bool = False ) -> LoadJob: diff --git a/dlt/destinations/impl/snowflake/configuration.py b/dlt/destinations/impl/snowflake/configuration.py index 4a89a1564b..2e589ea095 100644 --- a/dlt/destinations/impl/snowflake/configuration.py +++ b/dlt/destinations/impl/snowflake/configuration.py @@ -138,6 +138,24 @@ class SnowflakeClientConfiguration(DestinationClientDwhWithStagingConfiguration) query_tag: Optional[str] = None """A tag with placeholders to tag sessions executing jobs""" + create_indexes: bool = False + """Whether UNIQUE or PRIMARY KEY constrains should be created""" + + def __init__( + self, + *, + credentials: SnowflakeCredentials = None, + create_indexes: bool = False, + destination_name: str = None, + environment: str = None, + ) -> None: + super().__init__( + credentials=credentials, + destination_name=destination_name, + environment=environment, + ) + self.create_indexes = create_indexes + def fingerprint(self) -> str: """Returns a fingerprint of host part of a connection string""" if self.credentials and self.credentials.host: diff --git a/dlt/destinations/impl/snowflake/snowflake.py b/dlt/destinations/impl/snowflake/snowflake.py index e5146139f2..786cdc0b77 100644 --- a/dlt/destinations/impl/snowflake/snowflake.py +++ b/dlt/destinations/impl/snowflake/snowflake.py @@ -1,6 +1,7 @@ -from typing import Optional, Sequence, List +from typing import Optional, Sequence, List, Dict, Set from urllib.parse import urlparse, urlunparse +from dlt.common import logger from dlt.common.data_writers.configuration import CsvFormatConfiguration from dlt.common.destination import DestinationCapabilitiesContext from dlt.common.destination.reference import ( @@ -15,13 +16,15 @@ AwsCredentialsWithoutDefaults, AzureCredentialsWithoutDefaults, ) +from dlt.common.schema.utils import get_columns_names_with_prop from dlt.common.storages.configuration import FilesystemConfiguration, ensure_canonical_az_url from dlt.common.storages.file_storage import FileStorage -from dlt.common.schema import TColumnSchema, Schema -from dlt.common.schema.typing import TColumnType +from dlt.common.schema import TColumnSchema, Schema, TColumnHint +from dlt.common.schema.typing import TColumnType, TTableSchema from dlt.common.storages.fsspec_filesystem import AZURE_BLOB_STORAGE_PROTOCOLS, S3_PROTOCOLS from dlt.common.typing import TLoaderFileFormat +from dlt.common.utils import uniq_id from dlt.destinations.job_client_impl import SqlJobClientWithStagingDataset from dlt.destinations.exceptions import LoadJobTerminalException @@ -29,6 +32,8 @@ from dlt.destinations.impl.snowflake.sql_client import SnowflakeSqlClient from dlt.destinations.job_impl import ReferenceFollowupJobRequest +SUPPORTED_HINTS: Dict[TColumnHint, str] = {"unique": "UNIQUE"} + class SnowflakeLoadJob(RunnableLoadJob, HasFollowupJobs): def __init__( @@ -238,6 +243,7 @@ def __init__( self.config: SnowflakeClientConfiguration = config self.sql_client: SnowflakeSqlClient = sql_client # type: ignore self.type_mapper = self.capabilities.get_type_mapper() + self.active_hints = SUPPORTED_HINTS if self.config.create_indexes else {} def create_load_job( self, table: PreparedTableSchema, file_path: str, load_id: str, restore: bool = False @@ -264,6 +270,33 @@ def _make_add_column_sql( "ADD COLUMN\n" + ",\n".join(self._get_column_def_sql(c, table) for c in 
new_columns) ] + def _get_constraints_sql( + self, table_name: str, new_columns: Sequence[TColumnSchema], generate_alter: bool + ) -> str: + # "primary_key": "PRIMARY KEY" + if self.config.create_indexes: + partial: TTableSchema = { + "name": table_name, + "columns": {c["name"]: c for c in new_columns}, + } + # Add PK constraint if pk_columns exist + pk_columns = get_columns_names_with_prop(partial, "primary_key") + if pk_columns: + if generate_alter: + logger.warning( + f"PRIMARY KEY on {table_name} constraint cannot be added in ALTER TABLE and" + " is ignored" + ) + else: + pk_constraint_name = list( + self._norm_and_escape_columns(f"PK_{table_name}_{uniq_id(4)}") + )[0] + quoted_pk_cols = ", ".join( + self.sql_client.escape_column_name(col) for col in pk_columns + ) + return f",\nCONSTRAINT {pk_constraint_name} PRIMARY KEY ({quoted_pk_cols})" + return "" + def _get_table_update_sql( self, table_name: str, @@ -287,11 +320,5 @@ def _from_db_type( ) -> TColumnType: return self.type_mapper.from_destination_type(bq_t, precision, scale) - def _get_column_def_sql(self, c: TColumnSchema, table: PreparedTableSchema = None) -> str: - name = self.sql_client.escape_column_name(c["name"]) - return ( - f"{name} {self.type_mapper.to_destination_type(c,table)} {self._gen_not_null(c.get('nullable', True))}" - ) - def should_truncate_table_before_load_on_staging_destination(self, table_name: str) -> bool: return self.config.truncate_tables_on_staging_destination_before_load diff --git a/dlt/destinations/impl/sqlalchemy/db_api_client.py b/dlt/destinations/impl/sqlalchemy/db_api_client.py index 6f3ff065bf..27c4f2f1f9 100644 --- a/dlt/destinations/impl/sqlalchemy/db_api_client.py +++ b/dlt/destinations/impl/sqlalchemy/db_api_client.py @@ -84,7 +84,7 @@ def __init__(self, curr: sa.engine.CursorResult) -> None: def _get_columns(self) -> List[str]: try: - return list(self.native_cursor.keys()) # type: ignore[attr-defined] + return list(self.native_cursor.keys()) except ResourceClosedError: # this happens if now rows are returned return [] @@ -314,7 +314,7 @@ def execute_sql( self, sql: Union[AnyStr, sa.sql.Executable], *args: Any, **kwargs: Any ) -> Optional[Sequence[Sequence[Any]]]: with self.execute_query(sql, *args, **kwargs) as cursor: - if cursor.returns_rows: # type: ignore[attr-defined] + if cursor.returns_rows: return cursor.fetchall() return None diff --git a/dlt/destinations/impl/sqlalchemy/factory.py b/dlt/destinations/impl/sqlalchemy/factory.py index edd827ed00..e61ac1fb6a 100644 --- a/dlt/destinations/impl/sqlalchemy/factory.py +++ b/dlt/destinations/impl/sqlalchemy/factory.py @@ -81,6 +81,9 @@ def adjust_capabilities( caps.max_column_identifier_length = dialect.max_identifier_length caps.supports_native_boolean = dialect.supports_native_boolean if dialect.name == "mysql": + # correct max identifier length + # dialect uses 255 (max length for aliases) instead of 64 (max length of identifiers) + caps.max_identifier_length = 64 caps.format_datetime_literal = _format_mysql_datetime_literal return caps diff --git a/dlt/destinations/job_client_impl.py b/dlt/destinations/job_client_impl.py index d1f211b1e9..888c80c006 100644 --- a/dlt/destinations/job_client_impl.py +++ b/dlt/destinations/job_client_impl.py @@ -7,6 +7,7 @@ from typing import ( Any, ClassVar, + Dict, List, Optional, Sequence, @@ -14,21 +15,18 @@ Type, Iterable, Iterator, - Generator, ) import zlib import re -from contextlib import contextmanager -from contextlib import suppress from dlt.common import pendulum, logger +from 
dlt.common.destination.capabilities import DataTypeMapper from dlt.common.json import json from dlt.common.schema.typing import ( C_DLT_LOAD_ID, COLUMN_HINTS, TColumnType, TColumnSchemaBase, - TTableFormat, ) from dlt.common.schema.utils import ( get_inherited_table_hint, @@ -40,11 +38,11 @@ from dlt.common.storages import FileStorage from dlt.common.storages.load_package import LoadJobInfo, ParsedLoadJobFileName from dlt.common.schema import TColumnSchema, Schema, TTableSchemaColumns, TSchemaTables +from dlt.common.schema import TColumnHint from dlt.common.destination.reference import ( PreparedTableSchema, StateInfo, StorageSchemaInfo, - SupportsReadableDataset, WithStateSync, DestinationClientConfiguration, DestinationClientDwhConfiguration, @@ -55,9 +53,7 @@ JobClientBase, HasFollowupJobs, CredentialsConfiguration, - SupportsReadableRelation, ) -from dlt.destinations.dataset import ReadableDBAPIDataset from dlt.destinations.exceptions import DatabaseUndefinedRelation from dlt.destinations.job_impl import ( @@ -154,6 +150,8 @@ def __init__( self.state_table_columns = ", ".join( sql_client.escape_column_name(col) for col in state_table_["columns"] ) + self.active_hints: Dict[TColumnHint, str] = {} + self.type_mapper: DataTypeMapper = None super().__init__(schema, config, sql_client.capabilities) self.sql_client = sql_client assert isinstance(config, DestinationClientDwhConfiguration) @@ -569,6 +567,7 @@ def _get_table_update_sql( # build CREATE sql = self._make_create_table(qualified_name, table) + " (\n" sql += ",\n".join([self._get_column_def_sql(c, table) for c in new_columns]) + sql += self._get_constraints_sql(table_name, new_columns, generate_alter) sql += ")" sql_result.append(sql) else: @@ -582,8 +581,16 @@ def _get_table_update_sql( sql_result.extend( [sql_base + col_statement for col_statement in add_column_statements] ) + constraints_sql = self._get_constraints_sql(table_name, new_columns, generate_alter) + if constraints_sql: + sql_result.append(constraints_sql) return sql_result + def _get_constraints_sql( + self, table_name: str, new_columns: Sequence[TColumnSchema], generate_alter: bool + ) -> str: + return "" + def _check_table_update_hints( self, table_name: str, new_columns: Sequence[TColumnSchema], generate_alter: bool ) -> None: @@ -613,12 +620,22 @@ def _check_table_update_hints( " existing tables." 
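For orientation, the per-destination `_get_column_def_sql` overrides removed above collapse into the base-class helpers added to `job_client_impl.py` just below; a simplified, stand-alone sketch of that consolidated shape (not the actual dlt code):

```py
from typing import Any, Dict

def column_def_sql(column: Dict[str, Any], db_type: str, active_hints: Dict[str, str]) -> str:
    # emit a hint's SQL (e.g. {"unique": "UNIQUE"}) only when the column sets that flag to True
    hints = " ".join(sql for hint, sql in active_hints.items() if column.get(hint) is True)
    not_null = "" if column.get("nullable", True) else "NOT NULL"
    return " ".join(part for part in (column["name"], db_type, hints, not_null) if part)

# a Redshift-style client now just assigns active_hints (e.g. HINT_TO_REDSHIFT_ATTR) in __init__
print(column_def_sql({"name": "id", "nullable": False, "unique": True}, "BIGINT", {"unique": "UNIQUE"}))
# -> id BIGINT UNIQUE NOT NULL
```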
) - @abstractmethod def _get_column_def_sql(self, c: TColumnSchema, table: PreparedTableSchema = None) -> str: - pass + hints_ = self._get_column_hints_sql(c) + column_name = self.sql_client.escape_column_name(c["name"]) + nullability = self._gen_not_null(c.get("nullable", True)) + column_type = self.type_mapper.to_destination_type(c, table) + + return f"{column_name} {column_type} {hints_} {nullability}" + + def _get_column_hints_sql(self, c: TColumnSchema) -> str: + return " ".join( + self.active_hints.get(h, "") + for h in self.active_hints.keys() + if c.get(h, False) is True # use ColumnPropInfos to get default value + ) - @staticmethod - def _gen_not_null(nullable: bool) -> str: + def _gen_not_null(self, nullable: bool) -> str: return "NOT NULL" if not nullable else "" def _create_table_update( diff --git a/dlt/extract/exceptions.py b/dlt/extract/exceptions.py index f4d2b1f302..e832833428 100644 --- a/dlt/extract/exceptions.py +++ b/dlt/extract/exceptions.py @@ -3,7 +3,6 @@ from dlt.common.exceptions import DltException from dlt.common.utils import get_callable_name -from dlt.extract.items import ValidateItem, TDataItems class ExtractorException(DltException): diff --git a/dlt/extract/extract.py b/dlt/extract/extract.py index 25c3a0dbae..c062a74920 100644 --- a/dlt/extract/extract.py +++ b/dlt/extract/extract.py @@ -87,7 +87,12 @@ def choose_schema() -> Schema: schema_ = schema # take pipeline schema to make newest version visible to the resources elif pipeline.default_schema_name: - schema_ = pipeline.schemas[pipeline.default_schema_name].clone() + # clones with name which will drop previous hashes + schema_ = pipeline.schemas[pipeline.default_schema_name].clone( + with_name=pipeline.default_schema_name + ) + # delete data tables + schema_.drop_tables(schema_.data_table_names(include_incomplete=True)) else: schema_ = pipeline._make_schema_with_default_name() return schema_ diff --git a/dlt/extract/hints.py b/dlt/extract/hints.py index 000e5c4cdb..22a0062acf 100644 --- a/dlt/extract/hints.py +++ b/dlt/extract/hints.py @@ -37,7 +37,8 @@ InconsistentTableTemplate, ) from dlt.extract.incremental import Incremental, TIncrementalConfig -from dlt.extract.items import TFunHintTemplate, TTableHintTemplate, TableNameMeta, ValidateItem +from dlt.extract.items import TFunHintTemplate, TTableHintTemplate, TableNameMeta +from dlt.extract.items_transform import ValidateItem from dlt.extract.utils import ensure_table_schema_columns, ensure_table_schema_columns_hint from dlt.extract.validation import create_item_validator diff --git a/dlt/extract/incremental/__init__.py b/dlt/extract/incremental/__init__.py index 28d33bb71f..ce06292864 100644 --- a/dlt/extract/incremental/__init__.py +++ b/dlt/extract/incremental/__init__.py @@ -42,8 +42,10 @@ LastValueFunc, OnCursorValueMissing, IncrementalArgs, + TIncrementalRange, ) -from dlt.extract.items import SupportsPipe, TTableHintTemplate, ItemTransform +from dlt.extract.items import SupportsPipe, TTableHintTemplate +from dlt.extract.items_transform import ItemTransform from dlt.extract.incremental.transform import ( JsonIncremental, ArrowIncremental, @@ -104,6 +106,11 @@ class Incremental(ItemTransform[TDataItem], BaseConfiguration, Generic[TCursorVa Note that if logical "end date" is present then also "end_value" will be set which means that resource state is not used and exactly this range of date will be loaded on_cursor_value_missing: Specify what happens when the cursor_path does not exist in a record or a record has `None` at the cursor_path: raise, 
include, exclude lag: Optional value used to define a lag or attribution window. For datetime cursors, this is interpreted as seconds. For other types, it uses the + or - operator depending on the last_value_func. + range_start: Decide whether the incremental filtering range is `open` or `closed` on the start value side. Default is `closed`. + Setting this to `open` means that items with the same cursor value as the last value from the previous run (or `initial_value`) are excluded from the result. + The `open` range disables deduplication logic so it can serve as an optimization when you know cursors don't overlap between pipeline runs. + range_end: Decide whether the incremental filtering range is `open` or `closed` on the end value side. Default is `open` (exact `end_value` is excluded). + Setting this to `closed` means that items with the exact same cursor value as the `end_value` are included in the result. """ # this is config/dataclass so declare members @@ -116,6 +123,8 @@ class Incremental(ItemTransform[TDataItem], BaseConfiguration, Generic[TCursorVa on_cursor_value_missing: OnCursorValueMissing = "raise" lag: Optional[float] = None duplicate_cursor_warning_threshold: ClassVar[int] = 200 + range_start: TIncrementalRange = "closed" + range_end: TIncrementalRange = "open" # incremental acting as empty EMPTY: ClassVar["Incremental[Any]"] = None @@ -132,6 +141,8 @@ def __init__( allow_external_schedulers: bool = False, on_cursor_value_missing: OnCursorValueMissing = "raise", lag: Optional[float] = None, + range_start: TIncrementalRange = "closed", + range_end: TIncrementalRange = "open", ) -> None: # make sure that path is valid if cursor_path: @@ -174,9 +185,11 @@ def __init__( self.start_out_of_range: bool = False """Becomes true on the first item that is out of range of `start_value`. I.e. 
when using `max` this is a value that is lower than `start_value`""" - self._transformers: Dict[str, IncrementalTransform] = {} + self._transformers: Dict[Type[IncrementalTransform], IncrementalTransform] = {} self._bound_pipe: SupportsPipe = None """Bound pipe""" + self.range_start = range_start + self.range_end = range_end @property def primary_key(self) -> Optional[TTableHintTemplate[TColumnNames]]: @@ -190,22 +203,6 @@ def primary_key(self, value: str) -> None: for transform in self._transformers.values(): transform.primary_key = value - def _make_transforms(self) -> None: - types = [("arrow", ArrowIncremental), ("json", JsonIncremental)] - for dt, kls in types: - self._transformers[dt] = kls( - self.resource_name, - self.cursor_path, - self.initial_value, - self.start_value, - self.end_value, - self.last_value_func, - self._primary_key, - set(self._cached_state["unique_hashes"]), - self.on_cursor_value_missing, - self.lag, - ) - @classmethod def from_existing_state( cls, resource_name: str, cursor_path: str @@ -489,7 +486,8 @@ def bind(self, pipe: SupportsPipe) -> "Incremental[TCursorValue]": ) # cache state self._cached_state = self.get_state() - self._make_transforms() + # Clear transforms so we get new instances + self._transformers.clear() return self def can_close(self) -> bool: @@ -520,15 +518,34 @@ def __str__(self) -> str: f" {self.last_value_func}" ) + def _make_or_get_transformer(self, cls: Type[IncrementalTransform]) -> IncrementalTransform: + if transformer := self._transformers.get(cls): + return transformer + transformer = self._transformers[cls] = cls( + self.resource_name, + self.cursor_path, + self.initial_value, + self.start_value, + self.end_value, + self.last_value_func, + self._primary_key, + set(self._cached_state["unique_hashes"]), + self.on_cursor_value_missing, + self.lag, + self.range_start, + self.range_end, + ) + return transformer + def _get_transformer(self, items: TDataItems) -> IncrementalTransform: # Assume list is all of the same type for item in items if isinstance(items, list) else [items]: if is_arrow_item(item): - return self._transformers["arrow"] + return self._make_or_get_transformer(ArrowIncremental) elif pandas is not None and isinstance(item, pandas.DataFrame): - return self._transformers["arrow"] - return self._transformers["json"] - return self._transformers["json"] + return self._make_or_get_transformer(ArrowIncremental) + return self._make_or_get_transformer(JsonIncremental) + return self._make_or_get_transformer(JsonIncremental) def __call__(self, rows: TDataItems, meta: Any = None) -> Optional[TDataItems]: if rows is None: diff --git a/dlt/extract/incremental/lag.py b/dlt/extract/incremental/lag.py index ee102a9961..dfafa2cd11 100644 --- a/dlt/extract/incremental/lag.py +++ b/dlt/extract/incremental/lag.py @@ -20,7 +20,7 @@ def _apply_lag_to_value( parsed_value = ensure_pendulum_datetime(value) if is_str else value if isinstance(parsed_value, (datetime, date)): - parsed_value = _apply_lag_to_datetime(lag, parsed_value, last_value_func, is_str_date) + parsed_value = _apply_lag_to_datetime(lag, parsed_value, last_value_func, is_str_date) # type: ignore[assignment] # go back to string or pass exact type value = parsed_value.strftime(value_format) if value_format else parsed_value # type: ignore[assignment] diff --git a/dlt/extract/incremental/transform.py b/dlt/extract/incremental/transform.py index 22b1194b51..1d213e26c2 100644 --- a/dlt/extract/incremental/transform.py +++ b/dlt/extract/incremental/transform.py @@ -13,7 +13,12 @@ 
IncrementalPrimaryKeyMissing, IncrementalCursorPathHasValueNone, ) -from dlt.common.incremental.typing import TCursorValue, LastValueFunc, OnCursorValueMissing +from dlt.common.incremental.typing import ( + TCursorValue, + LastValueFunc, + OnCursorValueMissing, + TIncrementalRange, +) from dlt.extract.utils import resolve_column_value from dlt.extract.items import TTableHintTemplate @@ -57,6 +62,8 @@ def __init__( unique_hashes: Set[str], on_cursor_value_missing: OnCursorValueMissing = "raise", lag: Optional[float] = None, + range_start: TIncrementalRange = "closed", + range_end: TIncrementalRange = "open", ) -> None: self.resource_name = resource_name self.cursor_path = cursor_path @@ -71,6 +78,9 @@ def __init__( self.start_unique_hashes = set(unique_hashes) self.on_cursor_value_missing = on_cursor_value_missing self.lag = lag + self.range_start = range_start + self.range_end = range_end + # compile jsonpath self._compiled_cursor_path = compile_path(cursor_path) # for simple column name we'll fallback to search in dict @@ -107,6 +117,8 @@ def __call__( def deduplication_disabled(self) -> bool: """Skip deduplication when length of the key is 0 or if lag is applied.""" # disable deduplication if end value is set - state is not saved + if self.range_start == "open": + return True if self.end_value is not None: return True # disable deduplication if lag is applied - destination must deduplicate ranges @@ -191,10 +203,10 @@ def __call__( # Filter end value ranges exclusively, so in case of "max" function we remove values >= end_value if self.end_value is not None: try: - if ( - last_value_func((row_value, self.end_value)) != self.end_value - or last_value_func((row_value,)) == self.end_value - ): + if last_value_func((row_value, self.end_value)) != self.end_value: + return None, False, True + + if self.range_end == "open" and last_value_func((row_value,)) == self.end_value: return None, False, True except Exception as ex: raise IncrementalCursorInvalidCoercion( @@ -221,6 +233,9 @@ def __call__( ) from ex # new_value is "less" or equal to last_value (the actual max) if last_value == new_value: + if self.range_start == "open": + # We only want greater than last_value + return None, False, False # use func to compute row_value into last_value compatible processed_row_value = last_value_func((row_value,)) # skip the record that is not a start_value or new_value: that record was already processed @@ -258,6 +273,31 @@ def __call__( class ArrowIncremental(IncrementalTransform): _dlt_index = "_dlt_index" + def __init__(self, *args: Any, **kwargs: Any) -> None: + super().__init__(*args, **kwargs) + if self.last_value_func is max: + self.compute = pa.compute.max + self.end_compare = ( + pa.compute.less if self.range_end == "open" else pa.compute.less_equal + ) + self.last_value_compare = ( + pa.compute.greater_equal if self.range_start == "closed" else pa.compute.greater + ) + self.new_value_compare = pa.compute.greater + elif self.last_value_func is min: + self.compute = pa.compute.min + self.end_compare = ( + pa.compute.greater if self.range_end == "open" else pa.compute.greater_equal + ) + self.last_value_compare = ( + pa.compute.less_equal if self.range_start == "closed" else pa.compute.less + ) + self.new_value_compare = pa.compute.less + else: + raise NotImplementedError( + "Only min or max last_value_func is supported for arrow tables" + ) + def compute_unique_values(self, item: "TAnyArrowItem", unique_columns: List[str]) -> List[str]: if not unique_columns: return [] @@ -312,28 +352,13 @@ def 
__call__( if not tbl: # row is None or empty arrow table return tbl, start_out_of_range, end_out_of_range - if self.last_value_func is max: - compute = pa.compute.max - end_compare = pa.compute.less - last_value_compare = pa.compute.greater_equal - new_value_compare = pa.compute.greater - elif self.last_value_func is min: - compute = pa.compute.min - end_compare = pa.compute.greater - last_value_compare = pa.compute.less_equal - new_value_compare = pa.compute.less - else: - raise NotImplementedError( - "Only min or max last_value_func is supported for arrow tables" - ) - # TODO: Json path support. For now assume the cursor_path is a column name cursor_path = self.cursor_path # The new max/min value try: # NOTE: datetimes are always pendulum in UTC - row_value = from_arrow_scalar(compute(tbl[cursor_path])) + row_value = from_arrow_scalar(self.compute(tbl[cursor_path])) cursor_data_type = tbl.schema.field(cursor_path).type row_value_scalar = to_arrow_scalar(row_value, cursor_data_type) except KeyError as e: @@ -364,10 +389,10 @@ def __call__( cursor_data_type, str(ex), ) from ex - tbl = tbl.filter(end_compare(tbl[cursor_path], end_value_scalar)) + tbl = tbl.filter(self.end_compare(tbl[cursor_path], end_value_scalar)) # Is max row value higher than end value? # NOTE: pyarrow bool *always* evaluates to python True. `as_py()` is necessary - end_out_of_range = not end_compare(row_value_scalar, end_value_scalar).as_py() + end_out_of_range = not self.end_compare(row_value_scalar, end_value_scalar).as_py() if self.start_value is not None: try: @@ -383,7 +408,7 @@ def __call__( str(ex), ) from ex # Remove rows lower or equal than the last start value - keep_filter = last_value_compare(tbl[cursor_path], start_value_scalar) + keep_filter = self.last_value_compare(tbl[cursor_path], start_value_scalar) start_out_of_range = bool(pa.compute.any(pa.compute.invert(keep_filter)).as_py()) tbl = tbl.filter(keep_filter) if not self.deduplication_disabled: @@ -407,7 +432,7 @@ def __call__( if ( self.last_value is None - or new_value_compare( + or self.new_value_compare( row_value_scalar, to_arrow_scalar(self.last_value, cursor_data_type) ).as_py() ): # Last value has changed diff --git a/dlt/extract/items.py b/dlt/extract/items.py index 888787e6b7..ad7447c163 100644 --- a/dlt/extract/items.py +++ b/dlt/extract/items.py @@ -1,21 +1,16 @@ -import inspect from abc import ABC, abstractmethod from typing import ( Any, Callable, - ClassVar, - Generic, Iterator, Iterable, Literal, Optional, Protocol, - TypeVar, Union, Awaitable, TYPE_CHECKING, NamedTuple, - Generator, ) from concurrent.futures import Future @@ -28,7 +23,6 @@ TDynHintType, ) - TDecompositionStrategy = Literal["none", "scc"] TDeferredDataItems = Callable[[], TDataItems] TAwaitableDataItems = Awaitable[TDataItems] @@ -113,6 +107,10 @@ def gen(self) -> TPipeStep: """A data generating step""" ... + def replace_gen(self, gen: TPipeStep) -> None: + """Replaces data generating step. Assumes that you know what are you doing""" + ... + def __getitem__(self, i: int) -> TPipeStep: """Get pipe step at index""" ... @@ -129,112 +127,3 @@ def has_parent(self) -> bool: def close(self) -> None: """Closes pipe generator""" ... 
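The `range_start`/`range_end` options threaded through the incremental transforms above are set directly on `dlt.sources.incremental`; a small usage sketch with made-up resource and column names:

```py
import dlt

@dlt.resource
def updated_rows(
    updated_at=dlt.sources.incremental(
        "updated_at",
        initial_value="2024-01-01T00:00:00+00:00",
        range_start="open",   # drop rows equal to the previous last_value; also disables dedup
        range_end="closed",   # include rows equal to end_value when a backfill window is used
    )
):
    yield [{"id": 1, "updated_at": "2024-01-02T00:00:00+00:00"}]

pipeline = dlt.pipeline(pipeline_name="range_demo", destination="duckdb", dev_mode=True)
pipeline.run(updated_rows())
```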
- - -ItemTransformFunctionWithMeta = Callable[[TDataItem, str], TAny] -ItemTransformFunctionNoMeta = Callable[[TDataItem], TAny] -ItemTransformFunc = Union[ItemTransformFunctionWithMeta[TAny], ItemTransformFunctionNoMeta[TAny]] - - -class ItemTransform(ABC, Generic[TAny]): - _f_meta: ItemTransformFunctionWithMeta[TAny] = None - _f: ItemTransformFunctionNoMeta[TAny] = None - - placement_affinity: ClassVar[float] = 0 - """Tell how strongly an item sticks to start (-1) or end (+1) of pipe.""" - - def __init__(self, transform_f: ItemTransformFunc[TAny]) -> None: - # inspect the signature - sig = inspect.signature(transform_f) - # TODO: use TypeGuard here to get rid of type ignore - if len(sig.parameters) == 1: - self._f = transform_f # type: ignore - else: # TODO: do better check - self._f_meta = transform_f # type: ignore - - def bind(self: "ItemTransform[TAny]", pipe: SupportsPipe) -> "ItemTransform[TAny]": - return self - - @abstractmethod - def __call__(self, item: TDataItems, meta: Any = None) -> Optional[TDataItems]: - """Transforms `item` (a list of TDataItem or a single TDataItem) and returns or yields TDataItems. Returns None to consume item (filter out)""" - pass - - -class FilterItem(ItemTransform[bool]): - # mypy needs those to type correctly - _f_meta: ItemTransformFunctionWithMeta[bool] - _f: ItemTransformFunctionNoMeta[bool] - - def __call__(self, item: TDataItems, meta: Any = None) -> Optional[TDataItems]: - if isinstance(item, list): - # preserve empty lists - if len(item) == 0: - return item - - if self._f_meta: - item = [i for i in item if self._f_meta(i, meta)] - else: - item = [i for i in item if self._f(i)] - if not item: - # item was fully consumed by the filter - return None - return item - else: - if self._f_meta: - return item if self._f_meta(item, meta) else None - else: - return item if self._f(item) else None - - -class MapItem(ItemTransform[TDataItem]): - # mypy needs those to type correctly - _f_meta: ItemTransformFunctionWithMeta[TDataItem] - _f: ItemTransformFunctionNoMeta[TDataItem] - - def __call__(self, item: TDataItems, meta: Any = None) -> Optional[TDataItems]: - if isinstance(item, list): - if self._f_meta: - return [self._f_meta(i, meta) for i in item] - else: - return [self._f(i) for i in item] - else: - if self._f_meta: - return self._f_meta(item, meta) - else: - return self._f(item) - - -class YieldMapItem(ItemTransform[Iterator[TDataItem]]): - # mypy needs those to type correctly - _f_meta: ItemTransformFunctionWithMeta[TDataItem] - _f: ItemTransformFunctionNoMeta[TDataItem] - - def __call__(self, item: TDataItems, meta: Any = None) -> Optional[TDataItems]: - if isinstance(item, list): - for i in item: - if self._f_meta: - yield from self._f_meta(i, meta) - else: - yield from self._f(i) - else: - if self._f_meta: - yield from self._f_meta(item, meta) - else: - yield from self._f(item) - - -class ValidateItem(ItemTransform[TDataItem]): - """Base class for validators of data items. - - Subclass should implement the `__call__` method to either return the data item(s) or raise `extract.exceptions.ValidationError`. - See `PydanticValidator` for possible implementation. 
- """ - - placement_affinity: ClassVar[float] = 0.9 # stick to end but less than incremental - - table_name: str - - def bind(self, pipe: SupportsPipe) -> ItemTransform[TDataItem]: - self.table_name = pipe.name - return self diff --git a/dlt/extract/items_transform.py b/dlt/extract/items_transform.py new file mode 100644 index 0000000000..12375640bc --- /dev/null +++ b/dlt/extract/items_transform.py @@ -0,0 +1,179 @@ +import inspect +import time + +from abc import ABC, abstractmethod +from typing import ( + Any, + Callable, + ClassVar, + Generic, + Iterator, + Optional, + Union, +) +from concurrent.futures import Future + +from dlt.common.typing import ( + TAny, + TDataItem, + TDataItems, +) + +from dlt.extract.utils import ( + wrap_iterator, +) + +from dlt.extract.items import SupportsPipe + + +ItemTransformFunctionWithMeta = Callable[[TDataItem, str], TAny] +ItemTransformFunctionNoMeta = Callable[[TDataItem], TAny] +ItemTransformFunc = Union[ItemTransformFunctionWithMeta[TAny], ItemTransformFunctionNoMeta[TAny]] + + +class ItemTransform(ABC, Generic[TAny]): + _f_meta: ItemTransformFunctionWithMeta[TAny] = None + _f: ItemTransformFunctionNoMeta[TAny] = None + + placement_affinity: ClassVar[float] = 0 + """Tell how strongly an item sticks to start (-1) or end (+1) of pipe.""" + + def __init__(self, transform_f: ItemTransformFunc[TAny]) -> None: + # inspect the signature + sig = inspect.signature(transform_f) + # TODO: use TypeGuard here to get rid of type ignore + if len(sig.parameters) == 1: + self._f = transform_f # type: ignore + else: # TODO: do better check + self._f_meta = transform_f # type: ignore + + def bind(self: "ItemTransform[TAny]", pipe: SupportsPipe) -> "ItemTransform[TAny]": + return self + + @abstractmethod + def __call__(self, item: TDataItems, meta: Any = None) -> Optional[TDataItems]: + """Transforms `item` (a list of TDataItem or a single TDataItem) and returns or yields TDataItems. 
Returns None to consume item (filter out)""" + pass + + +class FilterItem(ItemTransform[bool]): + # mypy needs those to type correctly + _f_meta: ItemTransformFunctionWithMeta[bool] + _f: ItemTransformFunctionNoMeta[bool] + + def __call__(self, item: TDataItems, meta: Any = None) -> Optional[TDataItems]: + if isinstance(item, list): + # preserve empty lists + if len(item) == 0: + return item + + if self._f_meta: + item = [i for i in item if self._f_meta(i, meta)] + else: + item = [i for i in item if self._f(i)] + if not item: + # item was fully consumed by the filter + return None + return item + else: + if self._f_meta: + return item if self._f_meta(item, meta) else None + else: + return item if self._f(item) else None + + +class MapItem(ItemTransform[TDataItem]): + # mypy needs those to type correctly + _f_meta: ItemTransformFunctionWithMeta[TDataItem] + _f: ItemTransformFunctionNoMeta[TDataItem] + + def __call__(self, item: TDataItems, meta: Any = None) -> Optional[TDataItems]: + if isinstance(item, list): + if self._f_meta: + return [self._f_meta(i, meta) for i in item] + else: + return [self._f(i) for i in item] + else: + if self._f_meta: + return self._f_meta(item, meta) + else: + return self._f(item) + + +class YieldMapItem(ItemTransform[Iterator[TDataItem]]): + # mypy needs those to type correctly + _f_meta: ItemTransformFunctionWithMeta[TDataItem] + _f: ItemTransformFunctionNoMeta[TDataItem] + + def __call__(self, item: TDataItems, meta: Any = None) -> Optional[TDataItems]: + if isinstance(item, list): + for i in item: + if self._f_meta: + yield from self._f_meta(i, meta) + else: + yield from self._f(i) + else: + if self._f_meta: + yield from self._f_meta(item, meta) + else: + yield from self._f(item) + + +class ValidateItem(ItemTransform[TDataItem]): + """Base class for validators of data items. + + Subclass should implement the `__call__` method to either return the data item(s) or raise `extract.exceptions.ValidationError`. + See `PydanticValidator` for possible implementation. 
+ """ + + placement_affinity: ClassVar[float] = 0.9 # stick to end but less than incremental + + table_name: str + + def bind(self, pipe: SupportsPipe) -> ItemTransform[TDataItem]: + self.table_name = pipe.name + return self + + +class LimitItem(ItemTransform[TDataItem]): + placement_affinity: ClassVar[float] = 1.1 # stick to end right behind incremental + + def __init__(self, max_items: Optional[int], max_time: Optional[float]) -> None: + self.max_items = max_items if max_items is not None else -1 + self.max_time = max_time + + def bind(self, pipe: SupportsPipe) -> "LimitItem": + # we also wrap iterators to make them stoppable + if isinstance(pipe.gen, Iterator): + pipe.replace_gen(wrap_iterator(pipe.gen)) + + self.gen = pipe.gen + self.count = 0 + self.exhausted = False + self.start_time = time.time() + + return self + + def __call__(self, item: TDataItems, meta: Any = None) -> Optional[TDataItems]: + self.count += 1 + + # detect when the limit is reached, max time or yield count + if ( + (self.count == self.max_items) + or (self.max_time and time.time() - self.start_time > self.max_time) + or self.max_items == 0 + ): + self.exhausted = True + if inspect.isgenerator(self.gen): + self.gen.close() + + # if max items is not 0, we return the last item + # otherwise never return anything + if self.max_items != 0: + return item + + # do not return any late arriving items + if self.exhausted: + return None + + return item diff --git a/dlt/extract/pipe.py b/dlt/extract/pipe.py index 02b52c4623..e70365b4f4 100644 --- a/dlt/extract/pipe.py +++ b/dlt/extract/pipe.py @@ -27,12 +27,12 @@ UnclosablePipe, ) from dlt.extract.items import ( - ItemTransform, ResolvablePipeItem, SupportsPipe, TPipeStep, TPipedDataItems, ) +from dlt.extract.items_transform import ItemTransform from dlt.extract.utils import ( check_compat_transformer, simulate_func_call, @@ -122,7 +122,23 @@ def steps(self) -> List[TPipeStep]: def find(self, *step_type: AnyType) -> int: """Finds a step with object of type `step_type`""" - return next((i for i, v in enumerate(self._steps) if isinstance(v, step_type)), -1) + found = self.find_all(step_type) + return found[0] if found else -1 + + def find_all(self, *step_type: AnyType) -> List[int]: + """Finds all steps with object of type `step_type`""" + return [i for i, v in enumerate(self._steps) if isinstance(v, step_type)] + + def get_by_type(self, *step_type: AnyType) -> TPipeStep: + """Gets first step found with object of type `step_type`""" + return next((v for v in self._steps if isinstance(v, step_type)), None) + + def remove_by_type(self, *step_type: AnyType) -> int: + """Deletes first step found with object of type `step_type`, returns previous index""" + step_index = self.find(*step_type) + if step_index >= 0: + self.remove_step(step_index) + return step_index def __getitem__(self, i: int) -> TPipeStep: return self._steps[i] diff --git a/dlt/extract/pipe_iterator.py b/dlt/extract/pipe_iterator.py index 465040f9f4..38641c0626 100644 --- a/dlt/extract/pipe_iterator.py +++ b/dlt/extract/pipe_iterator.py @@ -24,7 +24,11 @@ ) from dlt.common.configuration.container import Container from dlt.common.exceptions import PipelineException -from dlt.common.pipeline import unset_current_pipe_name, set_current_pipe_name +from dlt.common.pipeline import ( + unset_current_pipe_name, + set_current_pipe_name, + get_current_pipe_name, +) from dlt.common.utils import get_callable_name from dlt.extract.exceptions import ( @@ -180,7 +184,6 @@ def __next__(self) -> PipeItem: item = pipe_item.item # if 
item is iterator, then add it as a new source if isinstance(item, Iterator): - # print(f"adding iterable {item}") self._sources.append( SourcePipeItem(item, pipe_item.step, pipe_item.pipe, pipe_item.meta) ) @@ -291,7 +294,6 @@ def _get_source_item(self) -> ResolvablePipeItem: first_evaluated_index = self._current_source_index # always go round robin if None was returned or item is to be run as future self._current_source_index = (self._current_source_index - 1) % sources_count - except StopIteration: # remove empty iterator and try another source self._sources.pop(self._current_source_index) diff --git a/dlt/extract/resource.py b/dlt/extract/resource.py index 42e3905162..366e6e1a88 100644 --- a/dlt/extract/resource.py +++ b/dlt/extract/resource.py @@ -2,7 +2,7 @@ from functools import partial from typing import ( AsyncIterable, - AsyncIterator, + cast, ClassVar, Callable, Iterable, @@ -34,13 +34,16 @@ from dlt.extract.items import ( DataItemWithMeta, - ItemTransformFunc, - ItemTransformFunctionWithMeta, TableNameMeta, +) +from dlt.extract.items_transform import ( FilterItem, MapItem, YieldMapItem, ValidateItem, + LimitItem, + ItemTransformFunc, + ItemTransformFunctionWithMeta, ) from dlt.extract.pipe_iterator import ManagedPipeIterator from dlt.extract.pipe import Pipe, TPipeStep @@ -214,29 +217,22 @@ def requires_args(self) -> bool: return True @property - def incremental(self) -> IncrementalResourceWrapper: + def incremental(self) -> Optional[IncrementalResourceWrapper]: """Gets incremental transform if it is in the pipe""" - incremental: IncrementalResourceWrapper = None - step_no = self._pipe.find(IncrementalResourceWrapper, Incremental) - if step_no >= 0: - incremental = self._pipe.steps[step_no] # type: ignore - return incremental + return cast( + Optional[IncrementalResourceWrapper], + self._pipe.get_by_type(IncrementalResourceWrapper, Incremental), + ) @property def validator(self) -> Optional[ValidateItem]: """Gets validator transform if it is in the pipe""" - validator: ValidateItem = None - step_no = self._pipe.find(ValidateItem) - if step_no >= 0: - validator = self._pipe.steps[step_no] # type: ignore[assignment] - return validator + return cast(Optional[ValidateItem], self._pipe.get_by_type(ValidateItem)) @validator.setter def validator(self, validator: Optional[ValidateItem]) -> None: """Add/remove or replace the validator in pipe""" - step_no = self._pipe.find(ValidateItem) - if step_no >= 0: - self._pipe.remove_step(step_no) + step_no = self._pipe.remove_by_type(ValidateItem) if validator: self.add_step(validator, insert_at=step_no if step_no >= 0 else None) @@ -347,72 +343,37 @@ def add_filter( self._pipe.insert_step(FilterItem(item_filter), insert_at) return self - def add_limit(self: TDltResourceImpl, max_items: int) -> TDltResourceImpl: # noqa: A003 + def add_limit( + self: TDltResourceImpl, + max_items: Optional[int] = None, + max_time: Optional[float] = None, + ) -> TDltResourceImpl: # noqa: A003 """Adds a limit `max_items` to the resource pipe. - This mutates the encapsulated generator to stop after `max_items` items are yielded. This is useful for testing and debugging. + This mutates the encapsulated generator to stop after `max_items` items are yielded. This is useful for testing and debugging. - Notes: - 1. Transformers won't be limited. They should process all the data they receive fully to avoid inconsistencies in generated datasets. - 2. Each yielded item may contain several records. 
`add_limit` only limits the "number of yields", not the total number of records. - 3. Async resources with a limit added may occasionally produce one item more than the limit on some runs. This behavior is not deterministic. + Notes: + 1. Transformers won't be limited. They should process all the data they receive fully to avoid inconsistencies in generated datasets. + 2. Each yielded item may contain several records. `add_limit` only limits the "number of yields", not the total number of records. + 3. Async resources with a limit added may occasionally produce one item more than the limit on some runs. This behavior is not deterministic. Args: - max_items (int): The maximum number of items to yield - Returns: - "DltResource": returns self + max_items (int): The maximum number of items to yield, set to None for no limit + max_time (float): The maximum number of seconds for this generator to run after it was opened, set to None for no limit + Returns: + "DltResource": returns self """ - # make sure max_items is a number, to allow "None" as value for unlimited - if max_items is None: - max_items = -1 - - def _gen_wrap(gen: TPipeStep) -> TPipeStep: - """Wrap a generator to take the first `max_items` records""" - - # zero items should produce empty generator - if max_items == 0: - return - - count = 0 - is_async_gen = False - if callable(gen): - gen = gen() # type: ignore - - # wrap async gen already here - if isinstance(gen, AsyncIterator): - gen = wrap_async_iterator(gen) - is_async_gen = True - - try: - for i in gen: # type: ignore # TODO: help me fix this later - yield i - if i is not None: - count += 1 - # async gen yields awaitable so we must count one awaitable more - # so the previous one is evaluated and yielded. - # new awaitable will be cancelled - if count == max_items + int(is_async_gen): - return - finally: - if inspect.isgenerator(gen): - gen.close() - return - - # transformers should be limited by their input, so we only limit non-transformers - if not self.is_transformer: - gen = self._pipe.gen - # wrap gen directly - if inspect.isgenerator(gen): - self._pipe.replace_gen(_gen_wrap(gen)) - else: - # keep function as function to not evaluate generators before pipe starts - self._pipe.replace_gen(partial(_gen_wrap, gen)) - else: + if self.is_transformer: logger.warning( f"Setting add_limit to a transformer {self.name} has no effect. Set the limit on" " the top level resource." 
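With the rework around `LimitItem`, `add_limit` takes an optional item count and/or time budget instead of wrapping the generator in place; a quick sketch (resource name is illustrative):

```py
import itertools
import dlt

@dlt.resource
def endless():
    for n in itertools.count():
        yield {"n": n}

# stop after 10 yields (each yield may still carry many records)
ten_chunks = endless().add_limit(10)

# or stop after roughly 5 seconds of the generator being open
five_seconds = endless().add_limit(max_time=5.0)

pipeline = dlt.pipeline(pipeline_name="limit_demo", destination="duckdb", dev_mode=True)
pipeline.run(ten_chunks)
```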
) + else: + # remove existing limit if any + self._pipe.remove_by_type(LimitItem) + self.add_step(LimitItem(max_items=max_items, max_time=max_time)) + return self def parallelize(self: TDltResourceImpl) -> TDltResourceImpl: @@ -445,9 +406,7 @@ def add_step( return self def _remove_incremental_step(self) -> None: - step_no = self._pipe.find(Incremental, IncrementalResourceWrapper) - if step_no >= 0: - self._pipe.remove_step(step_no) + self._pipe.remove_by_type(Incremental, IncrementalResourceWrapper) def set_incremental( self, diff --git a/dlt/extract/utils.py b/dlt/extract/utils.py index 68570d0995..0bcd13155e 100644 --- a/dlt/extract/utils.py +++ b/dlt/extract/utils.py @@ -183,6 +183,17 @@ def check_compat_transformer(name: str, f: AnyFun, sig: inspect.Signature) -> in return meta_arg +def wrap_iterator(gen: Iterator[TDataItems]) -> Iterator[TDataItems]: + """Wraps an iterator into a generator""" + if inspect.isgenerator(gen): + return gen + + def wrapped_gen() -> Iterator[TDataItems]: + yield from gen + + return wrapped_gen() + + def wrap_async_iterator( gen: AsyncIterator[TDataItems], ) -> Generator[Awaitable[TDataItems], None, None]: diff --git a/dlt/extract/validation.py b/dlt/extract/validation.py index 4cd321b88c..d9fe70a90b 100644 --- a/dlt/extract/validation.py +++ b/dlt/extract/validation.py @@ -8,7 +8,8 @@ from dlt.common.typing import TDataItems from dlt.common.schema.typing import TAnySchemaColumns, TSchemaContract, TSchemaEvolutionMode -from dlt.extract.items import TTableHintTemplate, ValidateItem +from dlt.extract.items import TTableHintTemplate +from dlt.extract.items_transform import ValidateItem _TPydanticModel = TypeVar("_TPydanticModel", bound=PydanticBaseModel) diff --git a/dlt/helpers/airflow_helper.py b/dlt/helpers/airflow_helper.py index 99458a3949..aaa19ea97d 100644 --- a/dlt/helpers/airflow_helper.py +++ b/dlt/helpers/airflow_helper.py @@ -18,7 +18,7 @@ from airflow.configuration import conf from airflow.models import TaskInstance from airflow.utils.task_group import TaskGroup - from airflow.operators.dummy import DummyOperator # type: ignore + from airflow.operators.dummy import DummyOperator from airflow.operators.python import PythonOperator, get_current_context except ModuleNotFoundError: raise MissingDependencyException("Airflow", ["apache-airflow>=2.5"]) @@ -255,7 +255,7 @@ def _run( # use task logger if self.use_task_logger: - ti: TaskInstance = get_current_context()["ti"] # type: ignore + ti: TaskInstance = get_current_context()["ti"] # type: ignore[assignment,unused-ignore] logger.LOGGER = ti.log # set global number of buffered items diff --git a/dlt/helpers/dbt/profiles.yml b/dlt/helpers/dbt/profiles.yml index a2a0014e4e..fd114478fb 100644 --- a/dlt/helpers/dbt/profiles.yml +++ b/dlt/helpers/dbt/profiles.yml @@ -83,6 +83,7 @@ duckdb: extensions: - httpfs - parquet + - iceberg # TODO: emit the config of duck db motherduck: diff --git a/dlt/common/libs/ibis.py b/dlt/helpers/ibis.py similarity index 64% rename from dlt/common/libs/ibis.py rename to dlt/helpers/ibis.py index ba6f363e66..e15bb9bc16 100644 --- a/dlt/common/libs/ibis.py +++ b/dlt/helpers/ibis.py @@ -1,14 +1,16 @@ -from typing import cast +from typing import cast, Any from dlt.common.exceptions import MissingDependencyException - from dlt.common.destination.reference import TDestinationReferenceArg, Destination, JobClientBase +from dlt.common.schema import Schema +from dlt.destinations.sql_client import SqlClientBase try: import ibis # type: ignore - from ibis import BaseBackend + import 
sqlglot + from ibis import BaseBackend, Expr except ModuleNotFoundError: - raise MissingDependencyException("dlt ibis Helpers", ["ibis"]) + raise MissingDependencyException("dlt ibis helpers", ["ibis-framework"]) SUPPORTED_DESTINATIONS = [ @@ -29,6 +31,22 @@ ] +# Map dlt data types to ibis data types +DATA_TYPE_MAP = { + "text": "string", + "double": "float64", + "bool": "boolean", + "timestamp": "timestamp", + "bigint": "int64", + "binary": "binary", + "json": "string", # Store JSON as string in ibis + "decimal": "decimal", + "wei": "int64", # Wei is a large integer + "date": "date", + "time": "time", +} + + def create_ibis_backend( destination: TDestinationReferenceArg, client: JobClientBase ) -> BaseBackend: @@ -105,17 +123,55 @@ def create_ibis_backend( ) from dlt.destinations.impl.duckdb.factory import DuckDbCredentials - # we create an in memory duckdb and create all tables on there - duck = duckdb.connect(":memory:") + # we create an in memory duckdb and create the ibis backend from it fs_client = cast(FilesystemClient, client) - creds = DuckDbCredentials(duck) sql_client = FilesystemSqlClient( - fs_client, dataset_name=fs_client.dataset_name, credentials=creds + fs_client, + dataset_name=fs_client.dataset_name, + credentials=DuckDbCredentials(duckdb.connect()), ) - + # do not use context manager to not return and close the cloned connection + duckdb_conn = sql_client.open_connection() + # make all tables available here # NOTE: we should probably have the option for the user to only select a subset of tables here - with sql_client as _: - sql_client.create_views_for_all_tables() - con = ibis.duckdb.from_connection(duck) + sql_client.create_views_for_all_tables() + # why this works now: whenever a clone of connection is made, all SET commands + # apply only to it. old code was setting `curl` on the internal clone of sql_client + # now we export this clone directly to ibis to it works + con = ibis.duckdb.from_connection(duckdb_conn) return con + + +def create_unbound_ibis_table( + sql_client: SqlClientBase[Any], schema: Schema, table_name: str +) -> Expr: + """Create an unbound ibis table from a dlt schema""" + + if table_name not in schema.tables: + raise Exception( + f"Table {table_name} not found in schema. 
Available tables: {schema.tables.keys()}" + ) + table_schema = schema.tables[table_name] + + # Convert dlt table schema columns to ibis schema + ibis_schema = { + sql_client.capabilities.casefold_identifier(col_name): DATA_TYPE_MAP[ + col_info.get("data_type", "string") + ] + for col_name, col_info in table_schema.get("columns", {}).items() + } + + # normalize table name + table_path = sql_client.make_qualified_table_name_path(table_name, escape=False) + + catalog = None + if len(table_path) == 3: + catalog, database, table = table_path + else: + database, table = table_path + + # create unbound ibis table and return in dlt wrapper + unbound_table = ibis.table(schema=ibis_schema, name=table, database=database, catalog=catalog) + + return unbound_table diff --git a/dlt/normalize/normalize.py b/dlt/normalize/normalize.py index 32db5034b4..1d81d70b10 100644 --- a/dlt/normalize/normalize.py +++ b/dlt/normalize/normalize.py @@ -20,7 +20,7 @@ LoadStorage, ParsedLoadJobFileName, ) -from dlt.common.schema import TSchemaUpdate, Schema +from dlt.common.schema import Schema from dlt.common.schema.exceptions import CannotCoerceColumnException from dlt.common.pipeline import ( NormalizeInfo, @@ -34,7 +34,7 @@ from dlt.normalize.configuration import NormalizeConfiguration from dlt.normalize.exceptions import NormalizeJobFailed from dlt.normalize.worker import w_normalize_files, group_worker_files, TWorkerRV -from dlt.normalize.validate import verify_normalized_table +from dlt.normalize.validate import validate_and_update_schema, verify_normalized_table # normalize worker wrapping function signature @@ -80,16 +80,6 @@ def create_storages(self) -> None: config=self.config._load_storage_config, ) - def update_schema(self, schema: Schema, schema_updates: List[TSchemaUpdate]) -> None: - for schema_update in schema_updates: - for table_name, table_updates in schema_update.items(): - logger.info( - f"Updating schema for table {table_name} with {len(table_updates)} deltas" - ) - for partial_table in table_updates: - # merge columns where we expect identifiers to be normalized - schema.update_table(partial_table, normalize_identifiers=False) - def map_parallel(self, schema: Schema, load_id: str, files: Sequence[str]) -> TWorkerRV: workers: int = getattr(self.pool, "_max_workers", 1) chunk_files = group_worker_files(files, workers) @@ -123,7 +113,7 @@ def map_parallel(self, schema: Schema, load_id: str, files: Sequence[str]) -> TW result: TWorkerRV = pending.result() try: # gather schema from all manifests, validate consistency and combine - self.update_schema(schema, result[0]) + validate_and_update_schema(schema, result[0]) summary.schema_updates.extend(result.schema_updates) summary.file_metrics.extend(result.file_metrics) # update metrics @@ -162,7 +152,7 @@ def map_single(self, schema: Schema, load_id: str, files: Sequence[str]) -> TWor load_id, files, ) - self.update_schema(schema, result.schema_updates) + validate_and_update_schema(schema, result.schema_updates) self.collector.update("Files", len(result.file_metrics)) self.collector.update( "Items", sum(result.file_metrics, EMPTY_DATA_WRITER_METRICS).items_count @@ -237,23 +227,11 @@ def spool_schema_files(self, load_id: str, schema: Schema, files: Sequence[str]) self.load_storage.import_extracted_package( load_id, self.normalize_storage.extracted_packages ) - logger.info(f"Created new load package {load_id} on loading volume") - try: - # process parallel - self.spool_files( - load_id, schema.clone(update_normalizers=True), self.map_parallel, files - ) - 
except CannotCoerceColumnException as exc: - # schema conflicts resulting from parallel executing - logger.warning( - f"Parallel schema update conflict, switching to single thread ({str(exc)}" - ) - # start from scratch - self.load_storage.new_packages.delete_package(load_id) - self.load_storage.import_extracted_package( - load_id, self.normalize_storage.extracted_packages - ) - self.spool_files(load_id, schema.clone(update_normalizers=True), self.map_single, files) + logger.info(f"Created new load package {load_id} on loading volume with ") + # get number of workers with default == 1 if not set (ie. NullExecutor) + workers: int = getattr(self.pool, "_max_workers", 1) + map_f: TMapFuncType = self.map_parallel if workers > 1 else self.map_single + self.spool_files(load_id, schema.clone(update_normalizers=True), map_f, files) return load_id diff --git a/dlt/normalize/validate.py b/dlt/normalize/validate.py index 648deb5da9..868ba3115b 100644 --- a/dlt/normalize/validate.py +++ b/dlt/normalize/validate.py @@ -1,7 +1,10 @@ +from typing import List + from dlt.common.destination.capabilities import DestinationCapabilitiesContext from dlt.common.schema import Schema -from dlt.common.schema.typing import TTableSchema +from dlt.common.schema.typing import TTableSchema, TSchemaUpdate from dlt.common.schema.utils import ( + ensure_compatible_tables, find_incomplete_columns, get_first_column_name_with_prop, is_nested_table, @@ -10,6 +13,21 @@ from dlt.common import logger +def validate_and_update_schema(schema: Schema, schema_updates: List[TSchemaUpdate]) -> None: + """Updates `schema` tables with partial tables in `schema_updates`""" + for schema_update in schema_updates: + for table_name, table_updates in schema_update.items(): + logger.info(f"Updating schema for table {table_name} with {len(table_updates)} deltas") + for partial_table in table_updates: + # ensure updates will pass + if existing_table := schema.tables.get(partial_table["name"]): + ensure_compatible_tables(schema.name, existing_table, partial_table) + + for partial_table in table_updates: + # merge columns where we expect identifiers to be normalized + schema.update_table(partial_table, normalize_identifiers=False) + + def verify_normalized_table( schema: Schema, table: TTableSchema, capabilities: DestinationCapabilitiesContext ) -> None: diff --git a/dlt/pipeline/pipeline.py b/dlt/pipeline/pipeline.py index 70d160ea67..74466a09e4 100644 --- a/dlt/pipeline/pipeline.py +++ b/dlt/pipeline/pipeline.py @@ -1750,10 +1750,18 @@ def __getstate__(self) -> Any: # pickle only the SupportsPipeline protocol fields return {"pipeline_name": self.pipeline_name} - def _dataset( - self, schema: Union[Schema, str, None] = None, dataset_type: TDatasetType = "dbapi" + def dataset( + self, schema: Union[Schema, str, None] = None, dataset_type: TDatasetType = "auto" ) -> SupportsReadableDataset: - """Access helper to dataset""" + """Returns a dataset object for querying the destination data. + + Args: + schema: Schema name or Schema object to use. If None, uses the default schema if set. + dataset_type: Type of dataset interface to return. Defaults to 'auto' which will select ibis if available + otherwise it will fallback to the standard dbapi interface. + Returns: + A dataset object that supports querying the destination data. 
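The public `dataset()` accessor documented here is what the new backfill example further down relies on; a compact sketch of typical calls (table name is illustrative, and with the default `dataset_type="auto"` the ibis-backed dataset is only picked when `ibis-framework` is installed):

```py
import dlt

pipeline = dlt.pipeline(pipeline_name="dataset_demo", destination="duckdb", dataset_name="demo")
pipeline.run([{"id": 1, "name": "a"}, {"id": 2, "name": "b"}], table_name="items")

ds = pipeline.dataset()              # dataset_type="auto" by default
print(ds.items.df())                 # whole table as a pandas DataFrame
print(ds.items.limit(1).fetchall())  # or go through the dbapi-style relation helpers
```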
+ """ if schema is None: schema = self.default_schema if self.default_schema_name else None return dataset( diff --git a/dlt/sources/helpers/transform.py b/dlt/sources/helpers/transform.py index 32843e2aa2..45738fe4fb 100644 --- a/dlt/sources/helpers/transform.py +++ b/dlt/sources/helpers/transform.py @@ -2,7 +2,7 @@ from typing import Any, Dict, Sequence, Union from dlt.common.typing import TDataItem -from dlt.extract.items import ItemTransformFunctionNoMeta +from dlt.extract.items_transform import ItemTransformFunctionNoMeta import jsonpath_ng diff --git a/dlt/sources/rest_api/config_setup.py b/dlt/sources/rest_api/config_setup.py index d03a4fd59b..bf62c6c4f7 100644 --- a/dlt/sources/rest_api/config_setup.py +++ b/dlt/sources/rest_api/config_setup.py @@ -20,6 +20,7 @@ from dlt.common.configuration import resolve_configuration from dlt.common.schema.utils import merge_columns from dlt.common.utils import update_dict_nested, exclude_keys +from dlt.common.typing import add_value_to_literal from dlt.common import jsonpath from dlt.extract.incremental import Incremental @@ -64,6 +65,8 @@ ResponseActionDict, Endpoint, EndpointResource, + AuthType, + PaginatorType, ) @@ -103,6 +106,7 @@ def register_paginator( "Your custom paginator has to be a subclass of BasePaginator" ) PAGINATOR_MAP[paginator_name] = paginator_class + add_value_to_literal(PaginatorType, paginator_name) def get_paginator_class(paginator_name: str) -> Type[BasePaginator]: @@ -153,6 +157,8 @@ def register_auth( ) AUTH_MAP[auth_name] = auth_class + add_value_to_literal(AuthType, auth_name) + def get_auth_class(auth_type: str) -> Type[AuthConfigBase]: try: @@ -285,7 +291,7 @@ def build_resource_dependency_graph( resolved_param_map[resource_name] = None break assert isinstance(endpoint_resource["endpoint"], dict) - # connect transformers to resources via resolved params + # find resolved parameters to connect dependent resources resolved_params = _find_resolved_params(endpoint_resource["endpoint"]) # set of resources in resolved params diff --git a/dlt/sources/sql_database/helpers.py b/dlt/sources/sql_database/helpers.py index a8be2a6427..ee38c7dd98 100644 --- a/dlt/sources/sql_database/helpers.py +++ b/dlt/sources/sql_database/helpers.py @@ -94,12 +94,16 @@ def __init__( self.end_value = incremental.end_value self.row_order: TSortOrder = self.incremental.row_order self.on_cursor_value_missing = self.incremental.on_cursor_value_missing + self.range_start = self.incremental.range_start + self.range_end = self.incremental.range_end else: self.cursor_column = None self.last_value = None self.end_value = None self.row_order = None self.on_cursor_value_missing = None + self.range_start = None + self.range_end = None def _make_query(self) -> SelectAny: table = self.table @@ -110,11 +114,11 @@ def _make_query(self) -> SelectAny: # generate where if last_value_func is max: # Query ordered and filtered according to last_value function - filter_op = operator.ge - filter_op_end = operator.lt + filter_op = operator.ge if self.range_start == "closed" else operator.gt + filter_op_end = operator.lt if self.range_end == "open" else operator.le elif last_value_func is min: - filter_op = operator.le - filter_op_end = operator.gt + filter_op = operator.le if self.range_start == "closed" else operator.lt + filter_op_end = operator.gt if self.range_end == "open" else operator.ge else: # Custom last_value, load everything and let incremental handle filtering return query # type: ignore[no-any-return] diff --git 
a/docs/examples/backfill_in_chunks/__init__.py b/docs/examples/backfill_in_chunks/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/examples/backfill_in_chunks/backfill_in_chunks.py b/docs/examples/backfill_in_chunks/backfill_in_chunks.py new file mode 100644 index 0000000000..a758d67f7b --- /dev/null +++ b/docs/examples/backfill_in_chunks/backfill_in_chunks.py @@ -0,0 +1,85 @@ +""" +--- +title: Backfilling in chunks +description: Learn how to backfill in chunks of defined size +keywords: [incremental loading, backfilling, chunks,example] +--- + +In this example, you'll find a Python script that will load from a sql_database source in chunks of defined size. This is useful for backfilling in multiple pipeline runs as +opposed to backfilling in one very large pipeline run which may fail due to memory issues on ephemeral storage or just take a very long time to complete without seeing any +progress in the destination. + +We'll learn how to: + +- Connect to a mysql database with the sql_database source +- Select one table to load and apply incremental loading hints as well as the primary key +- Set the chunk size and limit the number of chunks to load in one pipeline run +- Create a pipeline and backfill the table in the defined chunks +- Use the datasets accessor to inspect and assert the load progress + +""" + +import pandas as pd + +import dlt +from dlt.sources.sql_database import sql_database + + +if __name__ == "__main__": + # NOTE: this is a live table in the rfam database, so the number of final rows may change + TOTAL_TABLE_ROWS = 4178 + RFAM_CONNECTION_STRING = "mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam" + + # create sql database source that only loads the family table in chunks of 1000 rows + source = sql_database(RFAM_CONNECTION_STRING, table_names=["family"], chunk_size=1000) + + # we apply some hints to the table, we know the rfam_id is unique and that we can order + # and load incrementally on the created datetime column + source.family.apply_hints( + primary_key="rfam_id", + incremental=dlt.sources.incremental( + cursor_path="created", initial_value=None, row_order="asc" + ), + ) + + # with limit we can limit the number of chunks to load, with a chunk size of 1000 and a limit of 1 + # we will load 1000 rows per pipeline run + source.add_limit(1) + + # create pipeline + pipeline = dlt.pipeline( + pipeline_name="rfam", destination="duckdb", dataset_name="rfam_data", dev_mode=True + ) + + def _assert_unique_row_count(df: pd.DataFrame, num_rows: int) -> None: + """Assert that a dataframe has the correct number of unique rows""" + # NOTE: this check is dependent on reading the full table back from the destination into memory, + # so it is only useful for testing before you do a large backfill. + assert len(df) == num_rows + assert len(set(df.rfam_id.tolist())) == num_rows + + # after the first run, the family table in the destination should contain the first 1000 rows + pipeline.run(source) + _assert_unique_row_count(pipeline.dataset().family.df(), 1000) + + # after the second run, the family table in the destination should contain 1999 rows + # there is some overlap on the incremental to prevent skipping rows + pipeline.run(source) + _assert_unique_row_count(pipeline.dataset().family.df(), 1999) + + # ... + pipeline.run(source) + _assert_unique_row_count(pipeline.dataset().family.df(), 2998) + + # ... 
+ pipeline.run(source) + _assert_unique_row_count(pipeline.dataset().family.df(), 3997) + + # the final run will load all the rows until the end of the table + pipeline.run(source) + _assert_unique_row_count(pipeline.dataset().family.df(), TOTAL_TABLE_ROWS) + + # NOTE: in a production environment you will likely: + # * be using much larger chunk sizes and limits + # * run the pipeline in a loop to load all the rows + # * and programmatically check if the table is fully loaded and abort the loop if this is the case. diff --git a/docs/tools/check_embedded_snippets.py b/docs/tools/check_embedded_snippets.py index e8399fce6e..b917cafee1 100644 --- a/docs/tools/check_embedded_snippets.py +++ b/docs/tools/check_embedded_snippets.py @@ -21,7 +21,7 @@ SNIPPET_MARKER = "```" -ALLOWED_LANGUAGES = ["py", "toml", "json", "yaml", "text", "sh", "bat", "sql"] +ALLOWED_LANGUAGES = ["py", "toml", "json", "yaml", "text", "sh", "bat", "sql", "hcl"] LINT_TEMPLATE = "./lint_setup/template.py" LINT_FILE = "./lint_setup/lint_me.py" @@ -163,8 +163,11 @@ def parse_snippets(snippets: List[Snippet], verbose: bool) -> None: json.loads(snippet.code) elif snippet.language == "yaml": yaml.safe_load(snippet.code) - # ignore text and sh scripts - elif snippet.language in ["text", "sh", "bat", "sql"]: + elif snippet.language == "hcl": + # TODO: implement hcl parsers + pass + # ignore all other scripts + elif snippet.language in ALLOWED_LANGUAGES: pass else: raise ValueError(f"Unknown language {snippet.language}") diff --git a/docs/website/docs/build-a-pipeline-tutorial.md b/docs/website/docs/build-a-pipeline-tutorial.md index f85d2e19ea..36d30a184f 100644 --- a/docs/website/docs/build-a-pipeline-tutorial.md +++ b/docs/website/docs/build-a-pipeline-tutorial.md @@ -262,20 +262,30 @@ In this example, the first pipeline loads the data using `pipedrive_source()`. T #### [Using the `dlt` SQL client](dlt-ecosystem/transformations/sql.md) -Another option is to leverage the `dlt` SQL client to query the loaded data and perform transformations using SQL statements. You can execute SQL statements that change the database schema or manipulate data within tables. Here's an example of inserting a row into the `customers` table using the `dlt` SQL client: +Another option is to leverage the `dlt` SQL client to query the loaded data and perform transformations using SQL statements. You can execute SQL statements that change the database schema or manipulate data within tables. Here's an example of creating a new table with aggregated sales data in DuckDB: ```py -pipeline = dlt.pipeline(destination="bigquery", dataset_name="crm") +pipeline = dlt.pipeline(destination="duckdb", dataset_name="crm") with pipeline.sql_client() as client: client.execute_sql( - "INSERT INTO customers VALUES (%s, %s, %s)", 10, "Fred", "fred@fred.com" - ) + """ CREATE TABLE aggregated_sales AS + SELECT + category, + region, + SUM(amount) AS total_sales, + AVG(amount) AS average_sales + FROM + sales + GROUP BY + category, + region; + """) ``` In this example, the `execute_sql` method of the SQL client allows you to execute SQL statements. The statement creates a new `aggregated_sales` table with the total and average sales per category and region. -#### [Using Pandas](dlt-ecosystem/transformations/pandas.md) +#### [Using Pandas](dlt-ecosystem/transformations/python.md) You can fetch query results as Pandas data frames and perform transformations using Pandas functionalities.
Here's an example of reading data from the `issues` table in DuckDB and counting reaction types using Pandas: @@ -287,11 +297,8 @@ pipeline = dlt.pipeline( dev_mode=True ) -with pipeline.sql_client() as client: - with client.execute_query( - 'SELECT "reactions__+1", "reactions__-1", reactions__laugh, reactions__hooray, reactions__rocket FROM issues' - ) as cursor: - reactions = cursor.df() +# get a dataframe of all reactions from the dataset +reactions = pipeline.dataset().issues.select("reactions__+1", "reactions__-1", "reactions__laugh", "reactions__hooray", "reactions__rocket").df() counts = reactions.sum(0).sort_values(0, ascending=False) ``` diff --git a/docs/website/docs/dlt-ecosystem/destinations/clickhouse.md b/docs/website/docs/dlt-ecosystem/destinations/clickhouse.md index 3bd1ae8e15..40ee5d71e8 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/clickhouse.md +++ b/docs/website/docs/dlt-ecosystem/destinations/clickhouse.md @@ -229,8 +229,7 @@ To set up GCS staging with HMAC authentication in dlt: 1. Create HMAC keys for your GCS service account by following the [Google Cloud guide](https://cloud.google.com/storage/docs/authentication/managing-hmackeys#create). -2. Configure the HMAC keys (`aws_access_key_id` and `aws_secret_access_key`) in your dlt project's ClickHouse destination settings in `config.toml`, similar to how you would configure AWS S3 - credentials: +2. Configure the HMAC keys (`aws_access_key_id` and `aws_secret_access_key`) as well as `endpoint_url` in your dlt project's ClickHouse destination settings in `config.toml`, similar to how you would configure AWS S3 credentials: ```toml [destination.filesystem] diff --git a/docs/website/docs/dlt-ecosystem/destinations/databricks.md b/docs/website/docs/dlt-ecosystem/destinations/databricks.md index 513a3b792f..a28a42f761 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/databricks.md +++ b/docs/website/docs/dlt-ecosystem/destinations/databricks.md @@ -52,7 +52,7 @@ If you already have your Databricks workspace set up, you can skip to the [Loade Add a new role assignment and select "Storage Blob Data Contributor" as the role. Under "Members" select "Managed Identity" and add the Databricks Access Connector you created in the previous step. -### 2. Set up a metastore and Unity Catalog and get your access token +### 2. Set up a metastore and Unity Catalog 1. Now go to your Databricks workspace @@ -85,10 +85,123 @@ If you already have your Databricks workspace set up, you can skip to the [Loade Go to "Catalog" and click "Create Catalog". Name your catalog and select the storage location you created in the previous step. -8. Create your access token +## Authentication - Click your email in the top right corner and go to "User Settings". Go to "Developer" -> "Access Tokens". - Generate a new token and save it. You will use it in your `dlt` configuration. +`dlt` currently supports two options for authentication: +1. [OAuth2](#oauth) (recommended) allows you to authenticate to Databricks using a service principal via OAuth2 M2M. +2. [Access token](#access_token) approach using a developer access token. This method may be deprecated in the future by Databricks. + +### Using OAuth2 + +You can authenticate to Databricks using a service principal via OAuth2 M2M. To enable it: + +1. 
Follow the instructions in the Databricks documentation: [Authenticate access to Databricks using OAuth M2M](https://docs.databricks.com/en/dev-tools/auth/oauth-m2m.html) +to create a service principal and retrieve the `client_id` and `client_secret`. + +2. Once you have the service principal credentials, update your credentials with any of the options shown below: + + + + + +```toml +# secrets.toml +[destination.databricks.credentials] +server_hostname = "MY_DATABRICKS.azuredatabricks.net" +http_path = "/sql/1.0/warehouses/12345" +catalog = "my_catalog" +client_id = "XXX" +client_secret = "XXX" +``` + + + + +```sh +export DESTINATIONS__DATABRICKS__CREDENTIALS__SERVER_HOSTNAME="MY_DATABRICKS.azuredatabricks.net" +export DESTINATIONS__DATABRICKS__CREDENTIALS__HTTP_PATH="/sql/1.0/warehouses/12345" +export DESTINATIONS__DATABRICKS__CREDENTIALS__CATALOG="my_catalog" +export DESTINATIONS__DATABRICKS__CREDENTIALS__CLIENT_ID="XXX" +export DESTINATIONS__DATABRICKS__CREDENTIALS__CLIENT_SECRET="XXX" +``` + + + + +```py +import os + +# Do not set up the secrets directly in the code! +# What you can do is reassign env variables. +os.environ["DESTINATIONS__DATABRICKS__CREDENTIALS__SERVER_HOSTNAME"] = "MY_DATABRICKS.azuredatabricks.net" +os.environ["DESTINATIONS__DATABRICKS__CREDENTIALS__HTTP_PATH"]="/sql/1.0/warehouses/12345" +os.environ["DESTINATIONS__DATABRICKS__CREDENTIALS__CATALOG"]="my_catalog" +os.environ["DESTINATIONS__DATABRICKS__CREDENTIALS__CLIENT_ID"]=os.environ.get("CLIENT_ID") +os.environ["DESTINATIONS__DATABRICKS__CREDENTIALS__CLIENT_SECRET"]=os.environ.get("CLIENT_SECRET") +``` + + + +### Using access token + +To create your access token: + +1. Click your email in the top right corner and go to "User Settings". Go to "Developer" -> "Access Tokens". +Generate a new token and save it. +2. Set up credentials in a desired way: + + + + + +```toml +# secrets.toml +[destination.databricks.credentials] +server_hostname = "MY_DATABRICKS.azuredatabricks.net" +http_path = "/sql/1.0/warehouses/12345" +catalog = "my_catalog" +access_token = "XXX" +``` + + + + +```sh +export DESTINATIONS__DATABRICKS__CREDENTIALS__SERVER_HOSTNAME="MY_DATABRICKS.azuredatabricks.net" +export DESTINATIONS__DATABRICKS__CREDENTIALS__HTTP_PATH="/sql/1.0/warehouses/12345" +export DESTINATIONS__DATABRICKS__CREDENTIALS__CATALOG="my_catalog" +export DESTINATIONS__DATABRICKS__CREDENTIALS__ACCESS_TOKEN="XXX" +``` + + + + +```py +import os + +# Do not set up the secrets directly in the code! +# What you can do is reassign env variables. +os.environ["DESTINATIONS__DATABRICKS__CREDENTIALS__SERVER_HOSTNAME"] = "MY_DATABRICKS.azuredatabricks.net" +os.environ["DESTINATIONS__DATABRICKS__CREDENTIALS__HTTP_PATH"]="/sql/1.0/warehouses/12345" +os.environ["DESTINATIONS__DATABRICKS__CREDENTIALS__CATALOG"]="my_catalog" +os.environ["DESTINATIONS__DATABRICKS__CREDENTIALS__ACCESS_TOKEN"]=os.environ.get("ACCESS_TOKEN") +``` + + ## Loader setup guide @@ -106,9 +219,9 @@ pip install -r requirements.txt This will install dlt with the `databricks` extra, which contains the Databricks Python dbapi client. -**4. Enter your credentials into `.dlt/secrets.toml`.** +**3. Enter your credentials into `.dlt/secrets.toml`.** -This should include your connection parameters and your personal access token. +This should include your connection parameters and your authentication credentials. You can find your server hostname and HTTP path in the Databricks workspace dashboard. 
Go to "SQL Warehouses", select your warehouse (default is called "Starter Warehouse"), and go to "Connection details". @@ -118,11 +231,14 @@ Example: [destination.databricks.credentials] server_hostname = "MY_DATABRICKS.azuredatabricks.net" http_path = "/sql/1.0/warehouses/12345" -access_token = "MY_ACCESS_TOKEN" +client_id = "XXX" +client_secret = "XXX" catalog = "my_catalog" ``` -See [staging support](#staging-support) for authentication options when `dlt` copies files from buckets. +You can find other options for specifying credentials in the [Authentication section](#authentication). + +See [Staging support](#staging-support) for authentication options when `dlt` copies files from buckets. ## Write disposition All write dispositions are supported. @@ -132,8 +248,7 @@ To load data into Databricks, you must set up a staging filesystem by configurin dlt will upload the data in Parquet files (or JSONL, if configured) to the bucket and then use `COPY INTO` statements to ingest the data into Databricks. -For more information on staging, see the [staging support](#staging-support) section below. - +For more information on staging, see the [Staging support](#staging-support) section below. ## Supported file formats * [Parquet](../file-formats/parquet.md) supported when staging is enabled. @@ -141,13 +256,13 @@ For more information on staging, see the [staging support](#staging-support) sec The JSONL format has some limitations when used with Databricks: -1. Compression must be disabled to load jsonl files in Databricks. Set `data_writer.disable_compression` to `true` in the dlt config when using this format. +1. Compression must be disabled to load JSONL files in Databricks. Set `data_writer.disable_compression` to `true` in the dlt config when using this format. 2. The following data types are not supported when using the JSONL format with `databricks`: `decimal`, `json`, `date`, `binary`. Use `parquet` if your data contains these types. 3. The `bigint` data type with precision is not supported with the JSONL format. ## Staging support -Databricks supports both Amazon S3, Azure Blob Storage and Google Cloud Storage as staging locations. `dlt` will upload files in Parquet format to the staging location and will instruct Databricks to load data from there. +Databricks supports both Amazon S3, Azure Blob Storage, and Google Cloud Storage as staging locations. `dlt` will upload files in Parquet format to the staging location and will instruct Databricks to load data from there. 
### Databricks and Amazon S3 @@ -155,19 +270,50 @@ Please refer to the [S3 documentation](./filesystem.md#aws-s3) for details on co Example to set up Databricks with S3 as a staging destination: + + + + +```toml +# secrets.toml +[destination.filesystem] +bucket_url = "s3://your-bucket-name" + +[destination.filesystem.credentials] +aws_access_key_id="XXX" +aws_secret_access_key="XXX" +``` + + + + +```sh +export DESTINATIONS__FILESYSTEM__BUCKET_URL="s3://your-bucket-name" +export DESTINATIONS__FILESYSTEM__CREDENTIALS__AWS_ACCESS_KEY_ID="XXX" +export DESTINATIONS__FILESYSTEM__CREDENTIALS__AWS_SECRET_ACCESS_KEY="XXX" +``` + + + + ```py -import dlt +import os -# Create a dlt pipeline that will load -# chess player data to the Databricks destination -# via staging on S3 -pipeline = dlt.pipeline( - pipeline_name='chess_pipeline', - destination='databricks', - staging=dlt.destinations.filesystem('s3://your-bucket-name'), # add this to activate the staging location - dataset_name='player_data', -) +# Do not set up the secrets directly in the code! +# What you can do is reassign env variables. +os.environ["DESTINATIONS__FILESYSTEM__BUCKET_URL"] = "s3://your-bucket-name" +os.environ["DESTINATIONS__FILESYSTEM__CREDENTIALS__AWS_ACCESS_KEY_ID"] = os.environ.get("AWS_ACCESS_KEY_ID") +os.environ["DESTINATIONS__FILESYSTEM__CREDENTIALS__AWS_SECRET_ACCESS_KEY"] = os.environ.get("AWS_SECRET_ACCESS_KEY") ``` + + ### Databricks and Azure Blob Storage @@ -186,22 +332,54 @@ dlt is able to adapt the other representation (i.e., `az://container-name/path`) Example to set up Databricks with Azure as a staging destination: + + + + +```toml +# secrets.toml +[destination.filesystem] +bucket_url = "abfss://container_name@storage_account_name.dfs.core.windows.net/path" + +[destination.filesystem.credentials] +azure_storage_account_name="XXX" +azure_storage_account_key="XXX" +``` + + + + +```sh +export DESTINATIONS__FILESYSTEM__BUCKET_URL="abfss://container_name@storage_account_name.dfs.core.windows.net/path" +export DESTINATIONS__FILESYSTEM__CREDENTIALS__AZURE_STORAGE_ACCOUNT_NAME="XXX" +export DESTINATIONS__FILESYSTEM__CREDENTIALS__AZURE_STORAGE_ACCOUNT_KEY="XXX" +``` + + + + ```py -# Create a dlt pipeline that will load -# chess player data to the Databricks destination -# via staging on Azure Blob Storage -pipeline = dlt.pipeline( - pipeline_name='chess_pipeline', - destination='databricks', - staging=dlt.destinations.filesystem('abfss://dlt-ci-data@dltdata.dfs.core.windows.net'), # add this to activate the staging location - dataset_name='player_data' -) +import os + +# Do not set up the secrets directly in the code! +# What you can do is reassign env variables. +os.environ["DESTINATIONS__FILESYSTEM__BUCKET_URL"] = "abfss://container_name@storage_account_name.dfs.core.windows.net/path" +os.environ["DESTINATIONS__FILESYSTEM__CREDENTIALS__AZURE_STORAGE_ACCOUNT_NAME"] = os.environ.get("AZURE_STORAGE_ACCOUNT_NAME") +os.environ["DESTINATIONS__FILESYSTEM__CREDENTIALS__AZURE_STORAGE_ACCOUNT_KEY"] = os.environ.get("AZURE_STORAGE_ACCOUNT_KEY") ``` + + ### Databricks and Google Cloud Storage -In order to load from Google Cloud Storage stage you must set-up the credentials via **named credential**. See below. Databricks does not allow to pass Google Credentials -explicitly in SQL Statements. +In order to load from Google Cloud Storage stage, you must set up the credentials via a **named credential**. See below. Databricks does not allow you to pass Google Credentials explicitly in SQL statements. 
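As a minimal sketch of the named-credential approach (the bucket and credential names are placeholders), the staging configuration could look like this:

```toml
# config.toml
[destination.filesystem]
bucket_url = "gs://your-bucket-name"

[destination.databricks]
# name of the credential object stored on the Databricks side
staging_credentials_name = "credential_x"
```

The `staging_credentials_name` option is described in the next section on external locations and stored credentials.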
### Use external locations and stored credentials `dlt` forwards bucket credentials to the `COPY INTO` SQL command by default. You may prefer to use [external locations or stored credentials instead](https://docs.databricks.com/en/sql/language-manual/sql-ref-external-locations.html#external-location) that are stored on the Databricks side. @@ -212,7 +390,7 @@ If you set up an external location for your staging path, you can tell `dlt` to is_staging_external_location=true ``` -If you set up Databricks credentials named, for example, **credential_x**, you can tell `dlt` to use it: +If you set up Databricks credentials named, for example, **credential_x**, you can tell `dlt` to use them: ```toml [destination.databricks] staging_credentials_name="credential_x" @@ -233,8 +411,8 @@ This destination [integrates with dbt](../transformations/dbt/dbt.md) via [dbt-d ### Syncing of `dlt` state This destination fully supports [dlt state sync](../../general-usage/state#syncing-state-with-destination). -### Databricks User Agent -We enable Databricks to identify that the connection is created by dlt. +### Databricks user agent +We enable Databricks to identify that the connection is created by `dlt`. Databricks will use this user agent identifier to better understand the usage patterns associated with dlt integration. The connection identifier is `dltHub_dlt`. diff --git a/docs/website/docs/dlt-ecosystem/destinations/delta-iceberg.md b/docs/website/docs/dlt-ecosystem/destinations/delta-iceberg.md new file mode 100644 index 0000000000..7a056d6b40 --- /dev/null +++ b/docs/website/docs/dlt-ecosystem/destinations/delta-iceberg.md @@ -0,0 +1,168 @@ +--- +title: Delta / Iceberg +description: Delta / Iceberg `dlt` destination +keywords: [delta, iceberg, destination, data warehouse] +--- + +# Delta and Iceberg table formats +`dlt` supports writing [Delta](https://delta.io/) and [Iceberg](https://iceberg.apache.org/) tables when using the [filesystem](./filesystem.md) destination. + +## How it works +`dlt` uses the [deltalake](https://pypi.org/project/deltalake/) and [pyiceberg](https://pypi.org/project/pyiceberg/) libraries to write Delta and Iceberg tables, respectively. One or multiple Parquet files are prepared during the extract and normalize steps. In the load step, these Parquet files are exposed as an Arrow data structure and fed into `deltalake` or `pyiceberg`. + +## Iceberg single-user ephemeral catalog +`dlt` uses single-table, ephemeral, in-memory, sqlite-based [Iceberg catalog](https://iceberg.apache.org/concepts/catalog/)s. These catalogs are created "on demand" when a pipeline is run, and do not persist afterwards. If a table already exists in the filesystem, it gets registered into the catalog using its latest metadata file. This allows for a serverless setup. It is currently not possible to connect your own Iceberg catalog. 
+ +:::caution +While ephemeral catalogs make it easy to get started with Iceberg, they come with limitations: +- concurrent writes are not handled and may lead to corrupt table state +- we cannot guarantee that reads concurrent with writes are clean +- the latest manifest file needs to be searched for using file listing—this can become slow with large tables, especially in cloud object stores +::: + +## Delta dependencies + +You need the `deltalake` package to use this format: + +```sh +pip install "dlt[deltalake]" +``` + +You also need `pyarrow>=17.0.0`: + +```sh +pip install 'pyarrow>=17.0.0' +``` + +## Iceberg dependencies + +You need Python version 3.9 or higher and the `pyiceberg` package to use this format: + +```sh +pip install "dlt[pyiceberg]" +``` + +You also need `sqlalchemy>=2.0.18`: + +```sh +pip install 'sqlalchemy>=2.0.18' +``` + +## Set table format + +Set the `table_format` argument to `delta` or `iceberg` when defining your resource: + +```py +@dlt.resource(table_format="delta") +def my_delta_resource(): + ... +``` + +or when calling `run` on your pipeline: + +```py +pipeline.run(my_resource, table_format="delta") +``` + +:::note +`dlt` always uses Parquet as `loader_file_format` when using the `delta` or `iceberg` table format. Any setting of `loader_file_format` is disregarded. +::: + + +## Table format partitioning +Both `delta` and `iceberg` tables can be partitioned by specifying one or more `partition` column hints. This example partitions a Delta table by the `foo` column: + +```py +@dlt.resource( + table_format="delta", + columns={"foo": {"partition": True}} +) +def my_delta_resource(): + ... +``` + +:::note +Delta uses [Hive-style partitioning](https://delta.io/blog/pros-cons-hive-style-partionining/), while Iceberg uses [hidden partitioning](https://iceberg.apache.org/docs/latest/partitioning/). +::: + +:::caution +Partition evolution (changing partition columns after a table has been created) is not supported. +::: + +## Table access helper functions +You can use the `get_delta_tables` and `get_iceberg_tables` helper functions to access native table objects. For `delta`, these are `deltalake` [DeltaTable](https://delta-io.github.io/delta-rs/api/delta_table/) objects; for `iceberg`, these are `pyiceberg` [Table](https://py.iceberg.apache.org/reference/pyiceberg/table/#pyiceberg.table.Table) objects. + +```py +from dlt.common.libs.deltalake import get_delta_tables +# from dlt.common.libs.pyiceberg import get_iceberg_tables + +... + +# get dictionary of DeltaTable objects +delta_tables = get_delta_tables(pipeline) + +# execute operations on DeltaTable objects +delta_tables["my_delta_table"].optimize.compact() +delta_tables["another_delta_table"].optimize.z_order(["col_a", "col_b"]) +# delta_tables["my_delta_table"].vacuum() +# etc. +``` + +## Table format Google Cloud Storage authentication + +Note that not all authentication methods are supported when using table formats on Google Cloud Storage: + +| Authentication method | `delta` | `iceberg` | +| -- | -- | -- | +| [Service Account](bigquery.md#setup-guide) | ✅ | ❌ | +| [OAuth](../destinations/bigquery.md#oauth-20-authentication) | ❌ | ✅ | +| [Application Default Credentials](bigquery.md#using-default-credentials) | ✅ | ❌ | + +:::note +The [S3-compatible](#using-s3-compatible-storage) interface for Google Cloud Storage is not supported when using `iceberg`. +::: + +## Iceberg Azure scheme +The `az` [scheme](#supported-schemes) is not supported when using the `iceberg` table format. Please use the `abfss` scheme.
This is because `pyiceberg`, which `dlt` used under the hood, currently does not support `az`. + +## Table format `merge` support (**experimental**) +The [`upsert`](../../general-usage/incremental-loading.md#upsert-strategy) merge strategy is supported for `delta`. For `iceberg`, the `merge` write disposition is not supported and falls back to `append`. + +:::caution +The `upsert` merge strategy for the filesystem destination with Delta table format is **experimental**. +::: + +```py +@dlt.resource( + write_disposition={"disposition": "merge", "strategy": "upsert"}, + primary_key="my_primary_key", + table_format="delta" +) +def my_upsert_resource(): + ... +... +``` + +### Known limitations +- `hard_delete` hint not supported +- Deleting records from nested tables not supported + - This means updates to JSON columns that involve element removals are not propagated. For example, if you first load `{"key": 1, "nested": [1, 2]}` and then load `{"key": 1, "nested": [1]}`, then the record for element `2` will not be deleted from the nested table. + +## Delta table format storage options +You can pass storage options by configuring `destination.filesystem.deltalake_storage_options`: + +```toml +[destination.filesystem] +deltalake_storage_options = '{"AWS_S3_LOCKING_PROVIDER": "dynamodb", "DELTA_DYNAMO_TABLE_NAME": "custom_table_name"}' +``` + +`dlt` passes these options to the `storage_options` argument of the `write_deltalake` method in the `deltalake` library. Look at their [documentation](https://delta-io.github.io/delta-rs/api/delta_writer/#deltalake.write_deltalake) to see which options can be used. + +You don't need to specify credentials here. `dlt` merges the required credentials with the options you provided before passing it as `storage_options`. + +>❗When using `s3`, you need to specify storage options to [configure](https://delta-io.github.io/delta-rs/usage/writing/writing-to-s3-with-locking-provider/) locking behavior. + +## Delta table format memory usage +:::caution +Beware that when loading a large amount of data for one table, the underlying rust implementation will consume a lot of memory. This is a known issue and the maintainers are actively working on a solution. You can track the progress [here](https://github.com/delta-io/delta-rs/pull/2289). Until the issue is resolved, you can mitigate the memory consumption by doing multiple smaller incremental pipeline runs. +::: \ No newline at end of file diff --git a/docs/website/docs/dlt-ecosystem/destinations/duckdb.md b/docs/website/docs/dlt-ecosystem/destinations/duckdb.md index 2b284e991a..a4537195ff 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/duckdb.md +++ b/docs/website/docs/dlt-ecosystem/destinations/duckdb.md @@ -118,7 +118,7 @@ to disable tz adjustments. ## Destination configuration -By default, a DuckDB database will be created in the current working directory with a name `.duckdb` (`chess.duckdb` in the example above). After loading, it is available in `read/write` mode via `with pipeline.sql_client() as con:`, which is a wrapper over `DuckDBPyConnection`. See [duckdb docs](https://duckdb.org/docs/api/python/overview#persistent-storage) for details. +By default, a DuckDB database will be created in the current working directory with a name `.duckdb` (`chess.duckdb` in the example above). After loading, it is available in **read/write** mode via `with pipeline.sql_client() as con:`, which is a wrapper over `DuckDBPyConnection`. 
See [duckdb docs](https://duckdb.org/docs/api/python/overview#persistent-storage) for details. If you want to **read** data, use [pipeline.dataset()](../../general-usage/dataset-access/dataset) instead of `sql_client`. The `duckdb` credentials do not require any secret values. [You are free to pass the credentials and configuration explicitly](../../general-usage/destination.md#pass-explicit-credentials). For example: ```py diff --git a/docs/website/docs/dlt-ecosystem/destinations/filesystem.md b/docs/website/docs/dlt-ecosystem/destinations/filesystem.md index 9b243b9429..de3d12e8e1 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/filesystem.md +++ b/docs/website/docs/dlt-ecosystem/destinations/filesystem.md @@ -108,7 +108,8 @@ You need to create an S3 bucket and a user who can access that bucket. dlt does #### Using S3 compatible storage -To use an S3 compatible storage other than AWS S3, such as [MinIO](https://min.io/) or [Cloudflare R2](https://www.cloudflare.com/en-ca/developer-platform/r2/), you may supply an `endpoint_url` in the config. This should be set along with AWS credentials: +To use an S3 compatible storage other than AWS S3, such as [MinIO](https://min.io/), [Cloudflare R2](https://www.cloudflare.com/en-ca/developer-platform/r2/) or [Google +Cloud Storage](https://cloud.google.com/storage/docs/interoperability), you may supply an `endpoint_url` in the config. This should be set along with AWS credentials: ```toml [destination.filesystem] @@ -166,6 +167,8 @@ Run `pip install "dlt[az]"` which will install the `adlfs` package to interface Edit the credentials in `.dlt/secrets.toml`, you'll see AWS credentials by default; replace them with your Azure credentials. +#### Supported schemes + `dlt` supports both forms of the blob storage urls: ```toml [destination.filesystem] @@ -404,29 +407,6 @@ The filesystem destination handles the write dispositions as follows: - `replace` - all files that belong to such tables are deleted from the dataset folder, and then the current set of files is added. - `merge` - falls back to `append` -### Merge with Delta table format (experimental) -The [`upsert`](../../general-usage/incremental-loading.md#upsert-strategy) merge strategy is supported when using the [Delta table format](#delta-table-format). - -:::caution -The `upsert` merge strategy for the filesystem destination with Delta table format is experimental. -::: - -```py -@dlt.resource( - write_disposition={"disposition": "merge", "strategy": "upsert"}, - primary_key="my_primary_key", - table_format="delta" -) -def my_upsert_resource(): - ... -... -``` - -#### Known limitations -- `hard_delete` hint not supported -- Deleting records from nested tables not supported - - This means updates to JSON columns that involve element removals are not propagated. For example, if you first load `{"key": 1, "nested": [1, 2]}` and then load `{"key": 1, "nested": [1]}`, then the record for element `2` will not be deleted from the nested table. - ## File compression The filesystem destination in the dlt library uses `gzip` compression by default for efficiency, which may result in the files being stored in a compressed format. This format may not be easily readable as plain text or JSON Lines (`jsonl`) files. If you encounter files that seem unreadable, they may be compressed. 
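If you prefer to keep the files uncompressed, a minimal sketch of the configuration looks like this (assuming the `data_writer.disable_compression` option shown earlier for Databricks JSONL loading also applies here, set under the normalize section):

```toml
# config.toml
[normalize.data_writer]
# write plain files instead of gzip-compressed ones
disable_compression=true
```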
@@ -645,88 +625,9 @@ You can choose the following file formats: ## Supported table formats -You can choose the following table formats: -* [Delta table](../table-formats/delta.md) is supported - -### Delta table format - -You need the `deltalake` package to use this format: - -```sh -pip install "dlt[deltalake]" -``` - -You also need `pyarrow>=17.0.0`: - -```sh -pip install 'pyarrow>=17.0.0' -``` - -Set the `table_format` argument to `delta` when defining your resource: - -```py -@dlt.resource(table_format="delta") -def my_delta_resource(): - ... -``` - -:::note -`dlt` always uses Parquet as `loader_file_format` when using the `delta` table format. Any setting of `loader_file_format` is disregarded. -::: - -:::caution -Beware that when loading a large amount of data for one table, the underlying rust implementation will consume a lot of memory. This is a known issue and the maintainers are actively working on a solution. You can track the progress [here](https://github.com/delta-io/delta-rs/pull/2289). Until the issue is resolved, you can mitigate the memory consumption by doing multiple smaller incremental pipeline runs. -::: - -#### Delta table partitioning -A Delta table can be partitioned ([Hive-style partitioning](https://delta.io/blog/pros-cons-hive-style-partionining/)) by specifying one or more `partition` column hints. This example partitions the Delta table by the `foo` column: - -```py -@dlt.resource( - table_format="delta", - columns={"foo": {"partition": True}} -) -def my_delta_resource(): - ... -``` - -:::caution -It is **not** possible to change partition columns after the Delta table has been created. Trying to do so causes an error stating that the partition columns don't match. -::: - - -#### Storage options -You can pass storage options by configuring `destination.filesystem.deltalake_storage_options`: - -```toml -[destination.filesystem] -deltalake_storage_options = '{"AWS_S3_LOCKING_PROVIDER": "dynamodb", "DELTA_DYNAMO_TABLE_NAME": "custom_table_name"}' -``` - -`dlt` passes these options to the `storage_options` argument of the `write_deltalake` method in the `deltalake` library. Look at their [documentation](https://delta-io.github.io/delta-rs/api/delta_writer/#deltalake.write_deltalake) to see which options can be used. - -You don't need to specify credentials here. `dlt` merges the required credentials with the options you provided before passing it as `storage_options`. - ->❗When using `s3`, you need to specify storage options to [configure](https://delta-io.github.io/delta-rs/usage/writing/writing-to-s3-with-locking-provider/) locking behavior. - -#### `get_delta_tables` helper -You can use the `get_delta_tables` helper function to get `deltalake` [DeltaTable](https://delta-io.github.io/delta-rs/api/delta_table/) objects for your Delta tables: - -```py -from dlt.common.libs.deltalake import get_delta_tables - -... - -# get dictionary of DeltaTable objects -delta_tables = get_delta_tables(pipeline) - -# execute operations on DeltaTable objects -delta_tables["my_delta_table"].optimize.compact() -delta_tables["another_delta_table"].optimize.z_order(["col_a", "col_b"]) -# delta_tables["my_delta_table"].vacuum() -# etc. - -``` +You can choose the following [table formats](./delta-iceberg.md): +* Delta table +* Iceberg ## Syncing of dlt state This destination fully supports [dlt state sync](../../general-usage/state#syncing-state-with-destination). 
To this end, special folders and files will be created at your destination which hold information about your pipeline state, schemas, and completed loads. These folders DO NOT respect your settings in the layout section. When using filesystem as a staging destination, not all of these folders are created, as the state and schemas are managed in the regular way by the final destination you have configured. diff --git a/docs/website/docs/dlt-ecosystem/destinations/snowflake.md b/docs/website/docs/dlt-ecosystem/destinations/snowflake.md index 07cf822973..28684c39ac 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/snowflake.md +++ b/docs/website/docs/dlt-ecosystem/destinations/snowflake.md @@ -200,6 +200,12 @@ Note that we ignore missing columns `ERROR_ON_COLUMN_COUNT_MISMATCH = FALSE` and ## Supported column hints Snowflake supports the following [column hints](../../general-usage/schema#tables-and-columns): * `cluster` - Creates a cluster column(s). Many columns per table are supported and only when a new table is created. +* `unique` - Creates UNIQUE hint on a Snowflake column, can be added to many columns. ([optional](#additional-destination-options)) +* `primary_key` - Creates PRIMARY KEY on selected column(s), may be compound. ([optional](#additional-destination-options)) + +`unique` and `primary_key` are not enforced and `dlt` does not instruct Snowflake to `RELY` on them when +query planning. + ## Table and column identifiers Snowflake supports both case-sensitive and case-insensitive identifiers. All unquoted and uppercase identifiers resolve case-insensitively in SQL statements. Case-insensitive [naming conventions](../../general-usage/naming-convention.md#case-sensitive-and-insensitive-destinations) like the default **snake_case** will generate case-insensitive identifiers. Case-sensitive (like **sql_cs_v1**) will generate @@ -308,6 +314,7 @@ pipeline = dlt.pipeline( ## Additional destination options You can define your own stage to PUT files and disable the removal of the staged files after loading. +You can also opt-in to [create indexes](#supported-column-hints). 
```toml [destination.snowflake] @@ -315,6 +322,8 @@ You can define your own stage to PUT files and disable the removal of the staged stage_name="DLT_STAGE" # Whether to keep or delete the staged files after COPY INTO succeeds keep_staged_files=true +# Add UNIQUE and PRIMARY KEY hints to tables +create_indexes=true ``` ### Setting up CSV format diff --git a/docs/website/docs/dlt-ecosystem/table-formats/iceberg.md b/docs/website/docs/dlt-ecosystem/table-formats/iceberg.md index 233ae0ce21..edca521e52 100644 --- a/docs/website/docs/dlt-ecosystem/table-formats/iceberg.md +++ b/docs/website/docs/dlt-ecosystem/table-formats/iceberg.md @@ -10,5 +10,5 @@ keywords: [iceberg, table formats] ## Supported destinations -Supported by: **Athena** +Supported by: **Athena**, **filesystem** diff --git a/docs/website/docs/dlt-ecosystem/transformations/dbt/dbt.md b/docs/website/docs/dlt-ecosystem/transformations/dbt/dbt.md index 449f8b8bde..59eb340ef2 100644 --- a/docs/website/docs/dlt-ecosystem/transformations/dbt/dbt.md +++ b/docs/website/docs/dlt-ecosystem/transformations/dbt/dbt.md @@ -1,10 +1,10 @@ --- -title: Transform the data with dbt +title: Transforming data with dbt description: Transforming the data loaded by a dlt pipeline with dbt keywords: [transform, dbt, runner] --- -# Transform the data with dbt +# Transforming data with dbt [dbt](https://github.com/dbt-labs/dbt-core) is a framework that allows for the simple structuring of your transformations into DAGs. The benefits of using dbt include: @@ -105,8 +105,8 @@ You can run the example with dbt debug log: `RUNTIME__LOG_LEVEL=DEBUG python dbt ## Other transforming tools -If you want to transform the data before loading, you can use Python. If you want to transform the data after loading, you can use dbt or one of the following: +If you want to transform your data before loading, you can use Python. If you want to transform your data after loading, you can use dbt or one of the following: 1. [`dlt` SQL client.](../sql.md) -2. [Pandas.](../pandas.md) +2. [Python with dataframes or arrow tables.](../python.md) diff --git a/docs/website/docs/dlt-ecosystem/transformations/index.md b/docs/website/docs/dlt-ecosystem/transformations/index.md new file mode 100644 index 0000000000..6c51e8cd8d --- /dev/null +++ b/docs/website/docs/dlt-ecosystem/transformations/index.md @@ -0,0 +1,27 @@ +--- +title: Transforming your data +description: How to transform your data +keywords: [datasets, data, access, transformations] +--- +import DocCardList from '@theme/DocCardList'; + +# Transforming data + +If you'd like to transform your data after a pipeline load, you have 3 options available to you: + +* [Using dbt](./dbt/dbt.md) - dlt provides a convenient dbt wrapper to make integration easier. +* [Using the `dlt` SQL client](./sql.md) - dlt exposes an SQL client to transform data on your destination directly using SQL. +* [Using Python with DataFrames or Arrow tables](./python.md) - you can also transform your data using Arrow tables and DataFrames in Python. 
+ +If you need to preprocess some of your data before it is loaded, you can learn about strategies to: + +* [Rename columns.](../../general-usage/customising-pipelines/renaming_columns) +* [Pseudonymize columns.](../../general-usage/customising-pipelines/pseudonymizing_columns) +* [Remove columns.](../../general-usage/customising-pipelines/removing_columns) + +This is particularly useful if you are trying to remove data related to PII or other sensitive data, you want to remove columns that are not needed for your use case or you are using a destination that does not support certain data types in your source data. + + +# Learn more + + diff --git a/docs/website/docs/dlt-ecosystem/transformations/pandas.md b/docs/website/docs/dlt-ecosystem/transformations/pandas.md deleted file mode 100644 index e431313d1c..0000000000 --- a/docs/website/docs/dlt-ecosystem/transformations/pandas.md +++ /dev/null @@ -1,42 +0,0 @@ ---- -title: Transform the data with Pandas -description: Transform the data loaded by a dlt pipeline with Pandas -keywords: [transform, pandas] ---- - -# Transform the data with Pandas - -You can fetch the results of any SQL query as a dataframe. If the destination supports that -natively (i.e., BigQuery and DuckDB), `dlt` uses the native method. Thanks to this, reading -dataframes can be really fast! The example below reads GitHub reactions data from the `issues` table and -counts the reaction types. - -```py -pipeline = dlt.pipeline( - pipeline_name="github_pipeline", - destination="duckdb", - dataset_name="github_reactions", - dev_mode=True -) -with pipeline.sql_client() as client: - with client.execute_query( - 'SELECT "reactions__+1", "reactions__-1", reactions__laugh, reactions__hooray, reactions__rocket FROM issues' - ) as cursor: - # calling `df` on a cursor, returns the data as a pandas data frame - reactions = cursor.df() -counts = reactions.sum(0).sort_values(0, ascending=False) -``` - -The `df` method above returns all the data in the cursor as a data frame. You can also fetch data in -chunks by passing the `chunk_size` argument to the `df` method. - -Once your data is in a Pandas dataframe, you can transform it as needed. - -## Other transforming tools - -If you want to transform the data before loading, you can use Python. If you want to transform the -data after loading, you can use Pandas or one of the following: - -1. [dbt.](dbt/dbt.md) (recommended) -2. [`dlt` SQL client.](sql.md) - diff --git a/docs/website/docs/dlt-ecosystem/transformations/python.md b/docs/website/docs/dlt-ecosystem/transformations/python.md new file mode 100644 index 0000000000..d43f8caaca --- /dev/null +++ b/docs/website/docs/dlt-ecosystem/transformations/python.md @@ -0,0 +1,109 @@ +--- +title: Transforming data in Python with Arrow tables or DataFrames +description: Transforming data loaded by a dlt pipeline with pandas dataframes or arrow tables +keywords: [transform, pandas] +--- + +# Transforming data in Python with Arrow tables or DataFrames + +You can transform your data in Python using Pandas DataFrames or Arrow tables. To get started, please read the [dataset docs](../../general-usage/dataset-access/dataset). + + +## Interactively transforming your data in Python + +Using the methods explained in the [dataset docs](../../general-usage/dataset-access/dataset), you can fetch data from your destination into a DataFrame or Arrow table in your local Python process and work with it interactively. 
This even works for filesystem destinations: + + +The example below reads GitHub reactions data from the `issues` table and +counts the reaction types. + +```py +pipeline = dlt.pipeline( + pipeline_name="github_pipeline", + destination="duckdb", + dataset_name="github_reactions", + dev_mode=True +) + +# get a dataframe of all reactions from the dataset +reactions = pipeline.dataset().issues.select("reactions__+1", "reactions__-1", "reactions__laugh", "reactions__hooray", "reactions__rocket").df() + +# calculate and print out the sum of all reactions +counts = reactions.sum(0).sort_values(0, ascending=False) +print(counts) + +# alternatively, you can fetch the data as an arrow table +reactions = pipeline.dataset().issues.select("reactions__+1", "reactions__-1", "reactions__laugh", "reactions__hooray", "reactions__rocket").arrow() +# ... do transformations on the arrow table +``` + +## Persisting your transformed data + +Since dlt supports DataFrames and Arrow tables from resources directly, you can use the same pipeline to load the transformed data back into the destination. + + +### A simple example + +This simple example creates a new table from an existing user table, keeping only the columns that do not contain private information. Note that we use the `iter_arrow()` method on the relation to iterate over the arrow table instead of fetching it all at once. + +```py +pipeline = dlt.pipeline( + pipeline_name="users_pipeline", + destination="duckdb", + dataset_name="users_raw", + dev_mode=True +) + +# get user relation with only a few columns selected, but omitting email and name +users = pipeline.dataset().users.select("age", "amount_spent", "country") + +# load the data into a new table called users_clean in the same dataset +pipeline.run(users.iter_arrow(chunk_size=1000), table_name="users_clean") +``` + +### A more complex example + +The example above could easily be done in SQL. Let's assume you'd like to perform some Arrow transformations in Python instead. For this, we will create a resource from which we can yield the modified Arrow tables. The same is possible with DataFrames. + +```py +import pyarrow.compute as pc + +pipeline = dlt.pipeline( + pipeline_name="users_pipeline", + destination="duckdb", + dataset_name="users_raw", + dev_mode=True +) + +# NOTE: this resource will work like a regular resource and support write_disposition, primary_key, etc. +# NOTE: For selecting only users above 18, we could also use the filter method on the relation with ibis expressions +@dlt.resource(table_name="users_clean") +def users_clean(): + users = pipeline.dataset().users + for arrow_table in users.iter_arrow(chunk_size=1000): + + # we want to filter out users under 18 + age_filter = pc.greater_equal(arrow_table["age"], 18) + arrow_table = arrow_table.filter(age_filter) + + # we want to hash the email column + arrow_table = arrow_table.append_column("email_hash", pc.sha256(arrow_table["email"])) + + # we want to remove the email column and name column + arrow_table = arrow_table.drop(["email", "name"]) + + # yield the transformed arrow table + yield arrow_table + + +pipeline.run(users_clean()) +``` + +## Other transforming tools + +If you want to transform your data before loading, you can use Python. If you want to transform the +data after loading, you can use Pandas or one of the following: + +1. [dbt.](dbt/dbt.md) (recommended) +2.
[`dlt` SQL client.](sql.md) + diff --git a/docs/website/docs/dlt-ecosystem/transformations/sql.md b/docs/website/docs/dlt-ecosystem/transformations/sql.md index ffd348d1a0..60f3e7f7a5 100644 --- a/docs/website/docs/dlt-ecosystem/transformations/sql.md +++ b/docs/website/docs/dlt-ecosystem/transformations/sql.md @@ -1,33 +1,52 @@ --- -title: Transform the data with SQL +title: Transforming data with SQL description: Transforming the data loaded by a dlt pipeline with the dlt SQL client keywords: [transform, sql] --- -# Transform the data using the `dlt` SQL client +# Transforming data using the `dlt` SQL client A simple alternative to dbt is to query the data using the `dlt` SQL client and then perform the -transformations using Python. The `execute_sql` method allows you to execute any SQL statement, +transformations using SQL statements in Python. The `execute_sql` method allows you to execute any SQL statement, including statements that change the database schema or data in the tables. Note that the syntax is the same as for any standard `dbapi` connection. +:::info +* This method will work for all SQL destinations supported by `dlt`, but not for the filesystem destination. +* Read the [SQL client docs](../../general-usage/dataset-access/dataset) for more information on how to access data with the SQL client. +* If you are simply trying to read data, you should use the powerful [dataset interface](../../general-usage/dataset-access/dataset) instead. +::: + + +Typically, you will use this type of transformation if you can create or update tables directly from existing tables +without any need to insert data from your Python environment. + +The example below creates a new table `aggregated_sales` that contains the total and average sales for each category and region: + + ```py -pipeline = dlt.pipeline(destination="bigquery", dataset_name="crm") -try: - with pipeline.sql_client() as client: - client.execute_sql( - "INSERT INTO customers VALUES (%s, %s, %s)", - 10, - "Fred", - "fred@fred.com" - ) -except Exception: - ... +pipeline = dlt.pipeline(destination="duckdb", dataset_name="crm") + +# NOTE: this is the duckdb sql dialect; other destinations may use different expressions +with pipeline.sql_client() as client: + client.execute_sql( + """ CREATE OR REPLACE TABLE aggregated_sales AS + SELECT + category, + region, + SUM(amount) AS total_sales, + AVG(amount) AS average_sales + FROM + sales + GROUP BY + category, + region; + """) ``` -In the case of SELECT queries, the data is returned as a list of rows, with the elements of a row -corresponding to selected columns. +You can also use the `execute_sql` method to run select queries. The data is returned as a list of rows, with the elements of a row +corresponding to selected columns. A more convenient way to extract data is to use dlt datasets. ```py try: @@ -44,9 +63,9 @@ except Exception: ## Other transforming tools -If you want to transform the data before loading, you can use Python. If you want to transform the +If you want to transform your data before loading, you can use Python. If you want to transform the data after loading, you can use SQL or one of the following: 1. [dbt](dbt/dbt.md) (recommended). -2. [Pandas](pandas.md). +2. [Python with DataFrames or Arrow tables](python.md).
diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api/basic.md b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api/basic.md index 14d9ecb04b..ea3c9c768b 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api/basic.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api/basic.md @@ -306,7 +306,7 @@ A resource configuration is used to define a [dlt resource](../../../general-usa - `write_disposition`: The write disposition for the resource. - `primary_key`: The primary key for the resource. - `include_from_parent`: A list of fields from the parent resource to be included in the resource output. See the [resource relationships](#include-fields-from-the-parent-resource) section for more details. -- `processing_steps`: A list of [processing steps](#processing-steps-filter-and-transform-data) to filter and transform the data. +- `processing_steps`: A list of [processing steps](#processing-steps-filter-and-transform-data) to filter and transform your data. - `selected`: A flag to indicate if the resource is selected for loading. This could be useful when you want to load data only from child resources and not from the parent resource. - `auth`: An optional `AuthConfig` instance. If passed, is used over the one defined in the [client](#client) definition. Example: ```py diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/sql_database/advanced.md b/docs/website/docs/dlt-ecosystem/verified-sources/sql_database/advanced.md index 6ff3a267d2..954c1fb493 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/sql_database/advanced.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/sql_database/advanced.md @@ -16,7 +16,7 @@ Efficient data management often requires loading only new or updated data from y Incremental loading uses a cursor column (e.g., timestamp or auto-incrementing ID) to load only data newer than a specified initial value, enhancing efficiency by reducing processing time and resource use. Read [here](../../../walkthroughs/sql-incremental-configuration) for more details on incremental loading with `dlt`. -#### How to configure +### How to configure 1. **Choose a cursor column**: Identify a column in your SQL table that can serve as a reliable indicator of new or updated rows. Common choices include timestamp columns or auto-incrementing IDs. 1. **Set an initial value**: Choose a starting value for the cursor to begin loading data. This could be a specific timestamp or ID from which you wish to start loading data. 1. **Deduplication**: When using incremental loading, the system automatically handles the deduplication of rows based on the primary key (if available) or row hash for tables without a primary key. @@ -27,7 +27,7 @@ Incremental loading uses a cursor column (e.g., timestamp or auto-incrementing I If your cursor column name contains special characters (e.g., `$`) you need to escape it when passing it to the `incremental` function. For example, if your cursor column is `example_$column`, you should pass it as `"'example_$column'"` or `'"example_$column"'` to the `incremental` function: `incremental("'example_$column'", initial_value=...)`. ::: -#### Examples +### Examples 1. **Incremental loading with the resource `sql_table`**. @@ -52,7 +52,7 @@ If your cursor column name contains special characters (e.g., `$`) you need to e print(extract_info) ``` - Behind the scene, the loader generates a SQL query filtering rows with `last_modified` values greater than the incremental value. 
In the first run, this is the initial value (midnight (00:00:00) January 1, 2024). + Behind the scenes, the loader generates a SQL query filtering rows with `last_modified` values greater than or equal to the incremental value. In the first run, this is the initial value (midnight (00:00:00) January 1, 2024). In subsequent runs, it is the latest value of `last_modified` that `dlt` stores in [state](../../../general-usage/state). 2. **Incremental loading with the source `sql_database`**. @@ -78,6 +78,49 @@ If your cursor column name contains special characters (e.g., `$`) you need to e * `apply_hints` is a powerful method that enables schema modifications after resource creation, like adjusting write disposition and primary keys. You can choose from various tables and use `apply_hints` multiple times to create pipelines with merged, appended, or replaced resources. ::: +### Inclusive and exclusive filtering + +By default, the incremental filtering is inclusive on the start value side, so that +rows with a cursor equal to the last run's cursor are fetched again from the database. + +The SQL query generated looks something like this (assuming `last_value_func` is `max`): + +```sql +SELECT * FROM family +WHERE last_modified >= :start_value +ORDER BY last_modified ASC +``` + +That means some rows overlapping with the previous load are fetched from the database. +Duplicates are then filtered out by dlt using either the primary key or a hash of the row's contents. + +This ensures there are no gaps in the extracted sequence. But it does come with some performance overhead, +both due to the deduplication processing and the cost of fetching redundant records from the database. + +This is not always needed. If you know that your data does not contain overlapping cursor values, then you +can optimize extraction by passing `range_start="open"` to `incremental`. + +This both disables the deduplication process and changes the operator used in the SQL `WHERE` clause from `>=` (greater-or-equal) to `>` (greater than), so that no overlapping rows are fetched. + +For example: + +```py +table = sql_table( + table='family', + incremental=dlt.sources.incremental( + 'last_modified', # Cursor column name + initial_value=pendulum.DateTime(2024, 1, 1, 0, 0, 0), # Initial cursor value + range_start="open", # exclude the start value + ) +) +``` + +It's a good option if: + +* The cursor is an auto-incrementing ID +* The cursor is a high-precision timestamp and two records are never created at exactly the same time +* Your pipeline runs are timed in such a way that new data is not generated during the load + ## Parallelized extraction You can extract each table in a separate thread (no multiprocessing at this point). This will decrease loading time if your queries take time to execute or your network latency/speed is low. To enable this, declare your sources/resources as follows: @@ -213,3 +256,24 @@ SOURCES__SQL_DATABASE__CHUNK_SIZE=1000 SOURCES__SQL_DATABASE__CHAT_MESSAGE__INCREMENTAL__CURSOR_PATH=updated_at ``` +### Configure many sources side by side with custom sections +`dlt` allows you to rename any source to place the source configuration into a custom section or to have many instances +of the source created side by side. For example: +```py +from dlt.sources.sql_database import sql_database + +my_db = sql_database.with_args(name="my_db", section="my_db")(table_names=["chat_message"]) +print(my_db.name) +``` +Here we create a renamed version of the `sql_database` source and then instantiate it. 
Such source will read +credentials from: +```toml +[sources.my_db] +credentials="mssql+pyodbc://loader.database.windows.net/dlt_data?trusted_connection=yes&driver=ODBC+Driver+17+for+SQL+Server" +schema="data" +backend="pandas" +chunk_size=1000 + +[sources.my_db.chat_message.incremental] +cursor_path="updated_at" +``` diff --git a/docs/website/docs/general-usage/dataset-access/dataset.md b/docs/website/docs/general-usage/dataset-access/dataset.md index 68635383c5..f9c01603f6 100644 --- a/docs/website/docs/general-usage/dataset-access/dataset.md +++ b/docs/website/docs/general-usage/dataset-access/dataset.md @@ -19,7 +19,7 @@ Here's a full example of how to retrieve data from a pipeline and load it into a # and you have loaded data to a table named 'items' in the destination # Step 1: Get the readable dataset from the pipeline -dataset = pipeline._dataset() +dataset = pipeline.dataset() # Step 2: Access a table as a ReadableRelation items_relation = dataset.items # Or dataset["items"] @@ -39,7 +39,10 @@ Assuming you have a `Pipeline` object (let's call it `pipeline`), you can obtain ```py # Get the readable dataset from the pipeline -dataset = pipeline._dataset() +dataset = pipeline.dataset() + +# print the row counts of all tables in the destination as dataframe +print(dataset.row_counts().df()) ``` ### Access tables as `ReadableRelation` @@ -116,6 +119,18 @@ for items_chunk in items_relation.iter_fetch(chunk_size=500): The methods available on the ReadableRelation correspond to the methods available on the cursor returned by the SQL client. Please refer to the [SQL client](./sql-client.md#supported-methods-on-the-cursor) guide for more information. +## Special queries + +You can use the `row_counts` method to get the row counts of all tables in the destination as a DataFrame. + +```py +# print the row counts of all tables in the destination as dataframe +print(dataset.row_counts().df()) + +# or as tuples +print(dataset.row_counts().fetchall()) +``` + ## Modifying queries You can refine your data retrieval by limiting the number of records, selecting specific columns, or chaining these operations. @@ -156,6 +171,64 @@ You can combine `select`, `limit`, and other methods. arrow_table = items_relation.select("col1", "col2").limit(50).arrow() ``` +## Modifying queries with ibis expressions + +If you install the amazing [ibis](https://ibis-project.org/) library, you can use ibis expressions to modify your queries. 
+ +```sh +pip install ibis-framework +``` + +dlt will then wrap an `ibis.UnboundTable` with a `ReadableIbisRelation` object under the hood that will allow you to modify the query of a relation using ibis expressions: + +```py +# now that ibis is installed, we can get a dataset with ibis relations +dataset = pipeline.dataset() + +# get two relations +items_relation = dataset["items"] +order_relation = dataset["orders"] + +# join them using an ibis expression +joined_relation = items_relation.join(order_relation, items_relation.id == order_relation.item_id) + +# now we can use the ibis expression to filter the data +filtered_relation = joined_relation.filter(order_relation.status == "completed") + +# we can inspect the query that will be used to read the data +print(filtered_relation.query) + +# and finally fetch the data as a pandas dataframe, the same way we would do with a normal relation +df = filtered_relation.df() + +# a few more examples + +# filter for rows where the id is in the list of ids +items_relation.filter(items_relation.id.isin([1, 2, 3])).df() + +# limit and offset +items_relation.limit(10, offset=5).arrow() + +# mutate columns by adding a new column that is always 10 times the value of the id column +items_relation.mutate(new_id=items_relation.id * 10).df() + +# sort asc and desc +import ibis +items_relation.order_by(ibis.desc("id"), ibis.asc("price")).limit(10) + +# group by and aggregate +items_relation.group_by("item_group").having(items_relation.count() >= 1000).aggregate(sum_id=items_relation.id.sum()).df() + +# subqueries (assuming a beverage_categories relation exists in the dataset) +items_relation.filter(items_relation.category.isin(beverage_categories.name)).df() +``` + +You can learn more about the available expressions on the [ibis for sql users](https://ibis-project.org/tutorials/ibis-for-sql-users) page. + +:::note +Keep in mind that you can use only methods that modify the executed query and none of the methods ibis provides for fetching data. Fetching data is done with the same methods defined on the regular relations explained above. If you need full native ibis integration, please read the ibis section in the advanced part further down. Additionally, not all ibis expressions may be supported by all destinations and SQL dialects. +::: + ## Supported destinations All SQL and filesystem destinations supported by `dlt` can utilize this data access interface. For filesystem destinations, `dlt` [uses **DuckDB** under the hood](./sql-client.md#the-filesystem-sql-client) to create views from Parquet or JSONL files dynamically. This allows you to query data stored in files using the same interface as you would with SQL databases. If you plan on accessing data in buckets or the filesystem a lot this way, it is advised to load data as Parquet instead of JSONL, as **DuckDB** is able to only load the parts of the data actually needed for the query to work. @@ -226,7 +299,9 @@ other_pipeline = dlt.pipeline(pipeline_name="other_pipeline", destination="duckd other_pipeline.run(limited_items_relation.iter_arrow(chunk_size=10_000), table_name="limited_items") ``` -### Using `ibis` to query the data +Learn more about [transforming data in Python with Arrow tables or DataFrames](../../dlt-ecosystem/transformations/python). + +### Using `ibis` to query data Visit the [Native Ibis integration](./ibis-backend.md) guide to learn more. 
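As a rough sketch of how the pieces above compose, the snippet below builds a filtered, ibis-backed relation and materializes it into a new table. The `items` table and its `status` column are assumptions carried over from the examples, and the same fetch/iteration methods shown for regular relations are assumed to be available on ibis-backed relations:

```py
import dlt

# assumes a pipeline that has already loaded an "items" table with a "status" column
pipeline = dlt.pipeline(pipeline_name="my_pipeline", destination="duckdb", dataset_name="my_data")
dataset = pipeline.dataset()
items = dataset["items"]

# build the query lazily with ibis expressions; nothing is executed yet
completed = items.filter(items.status == "completed")

# materialize the result into a new table by streaming arrow batches through a pipeline run
pipeline.run(completed.iter_arrow(chunk_size=10_000), table_name="completed_items")
```
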
diff --git a/docs/website/docs/general-usage/dataset-access/ibis-backend.md b/docs/website/docs/general-usage/dataset-access/ibis-backend.md index 8f4b0fb6b6..bc8487940e 100644 --- a/docs/website/docs/general-usage/dataset-access/ibis-backend.md +++ b/docs/website/docs/general-usage/dataset-access/ibis-backend.md @@ -6,7 +6,7 @@ keywords: [data, dataset, ibis] # Ibis -Ibis is a powerful portable Python dataframe library. Learn more about what it is and how to use it in the [official documentation](https://ibis-project.org/). +Ibis is a powerful portable Python dataframe library. Learn more about what it is and how to use it in the [official documentation](https://ibis-project.org/). `dlt` provides an easy way to hand over your loaded dataset to an Ibis backend connection. @@ -28,7 +28,7 @@ pip install ibis-framework[duckdb] ```py # get the dataset from the pipeline -dataset = pipeline._dataset() +dataset = pipeline.dataset() dataset_name = pipeline.dataset_name # get the native ibis connection from the dataset @@ -46,4 +46,3 @@ print(table.limit(10).execute()) # Visit the ibis docs to learn more about the available methods ``` - diff --git a/docs/website/docs/general-usage/destination.md b/docs/website/docs/general-usage/destination.md index fa133b6257..ba42869957 100644 --- a/docs/website/docs/general-usage/destination.md +++ b/docs/website/docs/general-usage/destination.md @@ -128,7 +128,7 @@ When loading data, `dlt` will access the destination in two cases: 1. At the beginning of the `run` method to sync the pipeline state with the destination (or if you call `pipeline.sync_destination` explicitly). 2. In the `pipeline.load` method - to migrate the schema and load the load package. -Obviously, `dlt` will access the destination when you instantiate [sql_client](../dlt-ecosystem/transformations/sql.md). +`dlt` will also access the destination when you instantiate [sql_client](../dlt-ecosystem/transformations/sql.md). :::note `dlt` will not import the destination dependencies or access destination configuration if access is not needed. You can build multi-stage pipelines where steps are executed in separate processes or containers - the `extract` and `normalize` step do not need destination dependencies, configuration, and actual connection. diff --git a/docs/website/docs/general-usage/incremental-loading.md b/docs/website/docs/general-usage/incremental-loading.md index 3f452f0d16..5008795ed4 100644 --- a/docs/website/docs/general-usage/incremental-loading.md +++ b/docs/website/docs/general-usage/incremental-loading.md @@ -693,7 +693,7 @@ august_issues = repo_issues( ... ``` -Note that dlt's incremental filtering considers the ranges half-closed. `initial_value` is inclusive, `end_value` is exclusive, so chaining ranges like above works without overlaps. +Note that dlt's incremental filtering considers the ranges half-closed. `initial_value` is inclusive, `end_value` is exclusive, so chaining ranges like above works without overlaps. This behaviour can be changed with the `range_start` (default `"closed"`) and `range_end` (default `"open"`) arguments. ### Declare row order to not request unnecessary data @@ -793,6 +793,9 @@ def some_data(last_timestamp=dlt.sources.incremental("item.ts", primary_key=())) yield {"delta": i, "item": {"ts": pendulum.now().timestamp()}} ``` +This deduplication process is always enabled when `range_start` is set to `"closed"` (default). 
+When you pass `range_start="open"`, no deduplication is done because it is not needed: rows with the previous cursor value are excluded. This can be a useful optimization to avoid the performance overhead of deduplication if the cursor field is guaranteed to be unique. + ### Using `dlt.sources.incremental` with dynamically created resources When resources are [created dynamically](source.md#create-resources-dynamically), it is possible to use the `dlt.sources.incremental` definition as well. diff --git a/docs/website/docs/general-usage/resource.md b/docs/website/docs/general-usage/resource.md index 199eaf9b5d..b8d51caf75 100644 --- a/docs/website/docs/general-usage/resource.md +++ b/docs/website/docs/general-usage/resource.md @@ -405,11 +405,26 @@ dlt.pipeline(destination="duckdb").run(my_resource().add_limit(10)) The code above will extract `15*10=150` records. This is happening because in each iteration, 15 records are yielded, and we're limiting the number of iterations to 10. ::: -Some constraints of `add_limit` include: +Alternatively, you can apply a time limit to the resource. The code below will run the extraction for 10 seconds and extract however many items are yielded in that time. In combination with incrementals, this can be useful for batched loading or for loading on machines that have a run time limit. + +```py +dlt.pipeline(destination="duckdb").run(my_resource().add_limit(max_time=10)) +``` + +You can also apply a combination of both limits. In this case, the extraction will stop as soon as either limit is reached. + +```py +dlt.pipeline(destination="duckdb").run(my_resource().add_limit(max_items=10, max_time=10)) +``` + + +Some notes about `add_limit`: 1. `add_limit` does not skip any items. It closes the iterator/generator that produces data after the limit is reached. 2. You cannot limit transformers. They should process all the data they receive fully to avoid inconsistencies in generated datasets. 3. Async resources with a limit added may occasionally produce one item more than the limit on some runs. This behavior is not deterministic. +4. Calling `add_limit` on a resource will replace any previously set limits. +5. For time-limited resources, the timer starts when the first item is processed. When resources are processed sequentially (FIFO mode), each resource's time limit also applies sequentially. In the default round-robin mode, the time limits will usually run concurrently. :::tip If you are parameterizing the value of `add_limit` and sometimes need it to be disabled, you can set `None` or `-1` to disable the limiting. diff --git a/docs/website/docs/general-usage/source.md b/docs/website/docs/general-usage/source.md index a5f1f04dee..9c6c2aac13 100644 --- a/docs/website/docs/general-usage/source.md +++ b/docs/website/docs/general-usage/source.md @@ -52,7 +52,6 @@ Do not extract data in the source function. Leave that task to your resources if If this is impractical (for example, you want to reflect a database to create resources for tables), make sure you do not call the source function too often. 
[See this note if you plan to deploy on Airflow](../walkthroughs/deploy-a-pipeline/deploy-with-airflow-composer.md#2-modify-dag-file) - ## Customize sources ### Access and select resources to load @@ -108,12 +107,40 @@ load_info = pipeline.run(pipedrive_source().add_limit(10)) print(load_info) ``` +You can also apply a time limit to the source: + +```py +pipeline.run(pipedrive_source().add_limit(max_time=10)) +``` + +Or limit by both; the limit that is reached first will stop the extraction: + +```py +pipeline.run(pipedrive_source().add_limit(max_items=10, max_time=10)) +``` + :::note -Note that `add_limit` **does not limit the number of records** but rather the "number of yields". `dlt` will close the iterator/generator that produces data after the limit is reached. +Note that `add_limit` **does not limit the number of records** but rather the "number of yields". `dlt` will close the iterator/generator that produces data after the limit is reached. Please read more about `add_limit` on the resource page. ::: Find more on sampling data [here](resource.md#sample-from-large-data). +### Rename the source +`dlt` allows you to rename the source, i.e., to place the source configuration into a custom section or to have many instances +of the source created side by side. For example: +```py +from dlt.sources.sql_database import sql_database + +my_db = sql_database.with_args(name="my_db", section="my_db")(table_names=["table_1"]) +print(my_db.name) +``` +Here we create a renamed version of the `sql_database` source and then instantiate it. Such a source will read +credentials from: +```toml +[sources.my_db.my_db.credentials] +password="..." +``` + ### Add more resources to existing source You can add a custom resource to a source after it was created. Imagine that you want to score all the deals with a keras model that will tell you if the deal is a fraud or not. In order to do that, you declare a new [transformer that takes the data from](resource.md#feeding-data-from-one-resource-into-another) `deals` resource and add it to the source. 
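To round off the renaming example above, here is a short, hedged sketch of running the renamed source like any other source; the pipeline and dataset names are illustrative assumptions, and credentials are expected in the configuration section shown above:

```py
import dlt
from dlt.sources.sql_database import sql_database

# the renamed source reads its configuration from the custom "my_db" section
my_db = sql_database.with_args(name="my_db", section="my_db")(table_names=["table_1"])

# hypothetical pipeline and dataset names, used only for illustration
pipeline = dlt.pipeline(pipeline_name="my_db_pipeline", destination="duckdb", dataset_name="my_db_data")
print(pipeline.run(my_db))
```
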
diff --git a/docs/website/docs/general-usage/state.md b/docs/website/docs/general-usage/state.md index 46aa1d63ce..d1fb426452 100644 --- a/docs/website/docs/general-usage/state.md +++ b/docs/website/docs/general-usage/state.md @@ -123,14 +123,13 @@ def comments(user_id: str): # on the first pipeline run, the user_comments table does not yet exist so do not check at all # alternatively, catch DatabaseUndefinedRelation which is raised when an unknown table is selected if not current_pipeline.first_run: - with current_pipeline.sql_client() as client: - # we may get the last user comment or None which we replace with 0 - max_id = ( - client.execute_sql( - "SELECT MAX(_id) FROM user_comments WHERE user_id=?", user_id - )[0][0] - or 0 - ) + # get user comments table from pipeline dataset + user_comments = current_pipeline.dataset().user_comments + # get last user comment id with ibis expression, ibis-extras need to be installed + max_id_df = user_comments.filter(user_comments.user_id == user_id).select(user_comments["_id"].max()).df() + # if there are no comments for the user, max_id will be None, so we replace it with 0 + max_id = max_id_df[0][0] if len(max_id_df.index) else 0 + # use max_id to filter our results (we simulate an API query) yield from [ {"_id": i, "value": letter, "user_id": user_id} diff --git a/docs/website/docs/intro.md b/docs/website/docs/intro.md index b20d41c494..bc227b85ad 100644 --- a/docs/website/docs/intro.md +++ b/docs/website/docs/intro.md @@ -70,6 +70,10 @@ pipeline = dlt.pipeline( ) load_info = pipeline.run(source) + +# print load info and posts table as dataframe +print(load_info) +print(pipeline.dataset().posts.df()) ``` Follow the [REST API source tutorial](./tutorial/rest-api) to learn more about the source configuration and pagination methods. @@ -92,6 +96,10 @@ pipeline = dlt.pipeline( ) load_info = pipeline.run(source) + +# print load info and the "family" table as dataframe +print(load_info) +print(pipeline.dataset().family.df()) ``` Follow the [SQL source tutorial](./tutorial/sql-database) to learn more about the source configuration and supported databases. @@ -116,6 +124,10 @@ pipeline = dlt.pipeline( ) load_info = pipeline.run(resource) + +# print load info and the "example" table as dataframe +print(load_info) +print(pipeline.dataset().example.df()) ``` Follow the [filesystem source tutorial](./tutorial/filesystem) to learn more about the source configuration and supported storage services. @@ -128,7 +140,7 @@ dlt is able to load data from Python generators or directly from Python data str ```py import dlt -@dlt.resource +@dlt.resource(table_name="foo_data") def foo(): for i in range(10): yield {"id": i, "name": f"This is item {i}"} @@ -139,6 +151,10 @@ pipeline = dlt.pipeline( ) load_info = pipeline.run(foo) + +# print load info and the "foo_data" table as dataframe +print(load_info) +print(pipeline.dataset().foo_data.df()) ``` Check out the [Python data structures tutorial](./tutorial/load-data-from-an-api) to learn about dlt fundamentals and advanced usage scenarios. diff --git a/docs/website/docs/reference/command-line-interface.md b/docs/website/docs/reference/command-line-interface.md index 825d33d548..2af750f43c 100644 --- a/docs/website/docs/reference/command-line-interface.md +++ b/docs/website/docs/reference/command-line-interface.md @@ -20,9 +20,22 @@ This command creates a new dlt pipeline script that loads data from `source` to This command can be used several times in the same folder to add more sources, destinations, and pipelines. 
It will also update the verified source code to the newest version if run again with an existing `source` name. You are warned if files will be overwritten or if the `dlt` version needs an upgrade to run a particular pipeline. +### Ejecting source code of the core sources like `sql_database`. +We merged a few sources to the core library. You can still eject source code and hack them with the `--eject` flag: +```sh +dlt init sql_database duckdb --eject +``` +will copy the source code of `sql_database` to your project. Remember to modify the pipeline example script to import from the local folder! + ### Specify your own "verified sources" repository You can use the `--location ` option to specify your own repository with sources. Typically, you would [fork ours](https://github.com/dlt-hub/verified-sources) and start customizing and adding sources, e.g., to use them for your team or organization. You can also specify a branch with `--branch `, e.g., to test a version being developed. +### Using dlt 0.5.x sources +Use `--branch 0.5` if you are still on `dlt` `0.5.x` ie. +```sh +dlt init --branch 0.5 +``` + ### List all sources ```sh dlt init --list-sources diff --git a/docs/website/docs/reference/explainers/how-dlt-works.md b/docs/website/docs/reference/explainers/how-dlt-works.md index fa73babd03..570bc8cdbe 100644 --- a/docs/website/docs/reference/explainers/how-dlt-works.md +++ b/docs/website/docs/reference/explainers/how-dlt-works.md @@ -6,33 +6,71 @@ keywords: [architecture, extract, normalize, load] # How `dlt` works -`dlt` automatically turns JSON returned by any [source](../../general-usage/glossary.md#source) -(e.g., an API) into a live dataset stored in the -[destination](../../general-usage/glossary.md#destination) of your choice (e.g., Google BigQuery). It -does this by first [extracting](how-dlt-works.md#extract) the JSON data, then -[normalizing](how-dlt-works.md#normalize) it to a schema, and finally [loading](how-dlt-works#load) -it to the location where you will store it. +In a nutshell, `dlt` automatically turns data from a number of available [sources](../../dlt-ecosystem/verified-sources) (e.g., an API, a PostgreSQL database, or Python data structures) into a live dataset stored in a [destination](../../dlt-ecosystem/destinations) of your choice (e.g., Google BigQuery, a Deltalake on Azure, or by pushing the data back via reverse ETL). You can easily implement your own sources, as long as you yield data in a way that is compatible with `dlt`, such as JSON objects, Python lists and dictionaries, pandas dataframes, and arrow tables. `dlt` will be able to automatically compute the schema and move the data to your destination. -![architecture-diagram](/img/architecture-diagram.png) +![architecture-diagram](/img/dlt-onepager.png) -## Extract +## A concrete example -The Python script requests data from an API or a similar -[source](../../general-usage/glossary.md#source). Once this data is received, the script parses the -JSON and provides it to `dlt` as input, which then normalizes that data. +The main building block of `dlt` is the [pipeline](../../general-usage/glossary.md#pipeline), which orchestrates the loading of data from your source into your destination in three discrete steps when you call its `run` method. 
Consider this intentionally short example: -## Normalize +```py +import dlt -The configurable normalization engine in `dlt` recursively unpacks this nested structure into -relational tables (i.e., inferring data types, linking tables to create nested relationships, -etc.), making it ready to be loaded. This creates a -[schema](../../general-usage/glossary.md#schema), which will automatically evolve to accommodate any future -source data changes (e.g., new fields or tables). +pipeline = dlt.pipeline(pipeline_name="my_pipeline", destination="duckdb") +pipeline.run( + [ + {"id": 1}, + {"id": 2}, + {"id": 3, "nested": [{"id": 1}, {"id": 2}]}, + ], + table_name="items", +) +``` -## Load +This is what happens when the `run` method is executed: -The data is then loaded into your chosen [destination](../../general-usage/glossary.md#destination). -`dlt` uses configurable, idempotent, atomic loads that ensure data safely ends up there. For -example, you don't need to worry about the size of the data you are loading, and if the process is -interrupted, it is safe to retry without creating errors. +1. [Extract](how-dlt-works.md#extract) - Fully extracts the data from your source to your hard drive. In the example above, an implicit source with one resource with 3 items is created and extracted. +2. [Normalize](how-dlt-works.md#normalize) - Inspects and normalizes your data and computes a schema compatible with your destination. For the example above, the normalizer will detect one column `id` of type `int` in one table named `items`, it will furthermore detect a nested list in table items and unnest it into a child table named `items__nested`. +3. [Load](how-dlt-works#load) - Runs schema migrations if necessary on your destination and loads your data into the destination. For the example above, a new dataset on a local duckdb database is created that contains the two tables discovered in the previous steps. +## The three phases + +### Extract + +Extract can be run individually with the `extract` command on the pipeline: + +```py +pipeline.extract(data) +``` + +During the extract phase, `dlt` fully extracts the data from your [sources](../../dlt-ecosystem/verified-sources) to your hard drive into a new [load package](../../general-usage/destination-tables#load-packages-and-load-ids), which will be assigned a unique ID and will contain your raw data as received from your sources. Additionally, you can [supply schema hints](../../general-usage/resource#define-schema) to define the data types of some of the columns or add a primary key and unique indexes. You can also control this phase by [limiting](../../general-usage/resource#sample-from-large-data) the number of items extracted in one run, using [incremental cursor fields](../../general-usage/incremental-loading#incremental-loading-with-a-cursor-field), and by tuning the performance with [parallelization](../../reference/performance#extract). You can also apply filters and maps to [obfuscate](../../general-usage/customising-pipelines/pseudonymizing_columns) or [remove](../../general-usage/customising-pipelines/removing_columns) personal data, and you can use [transformers](../../examples/transformers) to create derivative data. + +### Normalize + +Normalize can be run individually with the `normalize` command on the pipeline. Normalize is dependent on having a completed extract phase and will not do anything if there is no extracted data. 
+ +```py +pipeline.normalize() +``` + +During the normalization phase, `dlt` inspects and normalizes your data and computes a [schema](../../general-usage/schema) corresponding to the input data. The schema will automatically evolve to accommodate any future source data changes like new columns or tables. `dlt` will also unnest nested data structures into child tables and create variant columns if detected values do not match a schema computed during a previous run. The result of the normalization phase is an updated load package that holds your normalized data in a format your destination understands and a full schema which can be used to migrate your data to your destination. You can control the normalization phase, for example, by [defining the allowed nesting level](../../general-usage/source#reduce-the-nesting-level-of-generated-tables) of input data, by [applying schema contracts](../../general-usage/schema-contracts) that govern how the schema might evolve, and how rows that do not fit are treated. Performance settings are [also available](../../reference/performance#normalize). + +### Load + +Load can be run individually with the `load` command on the pipeline. Load is dependent on having a completed normalize phase and will not do anything if there is no normalized data. + +```py +pipeline.load() +``` + +During the loading phase, `dlt` first runs schema migrations as needed on your destination and then loads your data into the destination. `dlt` will load your data in smaller chunks called load jobs to be able to parallelize large loads. If the connection to the destination fails, it is safe to rerun the pipeline, and `dlt` will continue to load all load jobs from the current load package. `dlt` will also create special tables that store the internal dlt schema, information about all load packages, and some state information which, among other things, are used by the incrementals to be able to restore the incremental state from a previous run to another machine. Some ways to control the loading phase are by using different [`write_dispositions`](../../general-usage/incremental-loading#choosing-a-write-disposition) to replace the data in the destination, simply append to it, or merge on certain merge keys that you can configure per table. For some destinations, you can use a remote staging dataset on a bucket provider, and `dlt` even supports modern open table formats like [deltables and iceberg](../../dlt-ecosystem/destinations/delta-iceberg), and [reverse ETL](../../dlt-ecosystem/destinations/destination) is also possible. + +## Other notable `dlt` features + +* `dlt` is simply a Python package, so it will run [everywhere that Python runs](../../walkthroughs/deploy-a-pipeline) — locally, in notebooks, on orchestrators — you name it. +* `dlt` allows you to build and test your data pipelines locally with `duckdb` and then switch out the destination for deployment. +* `dlt` provides a user-friendly interface for [accessing your data in Python](../../general-usage/dataset-access/dataset), using [a Streamlit app](../../general-usage/dataset-access/streamlit), and leveraging [integrations](../../general-usage/dataset-access/ibis-backend) with the fabulous Ibis library. All of this even works on data lakes provided by bucket storage providers. +* `dlt` fully manages schema migrations on your destinations. You don’t even need to know how to use SQL to update your schema. It also supports [schema contracts](../../general-usage/schema-contracts) to govern how the schema might evolve. 
+* `dlt` offers numerous options for [monitoring and tracing](../../running-in-production/monitoring) what is happening during your loads. +* `dlt` supports you when you need to [transform your data](../../dlt-ecosystem/transformations) after the load, whether with dbt or in Python using Arrow tables and pandas DataFrames. diff --git a/docs/website/docs/reference/performance.md b/docs/website/docs/reference/performance.md index ab171ac069..0f536fa786 100644 --- a/docs/website/docs/reference/performance.md +++ b/docs/website/docs/reference/performance.md @@ -48,9 +48,7 @@ Some file formats (e.g., Parquet) do not support schema changes when writing a s Below, we set files to rotate after 100,000 items written or when the filesize exceeds 1MiB. - - - + ### Disabling and enabling file compression Several [text file formats](../dlt-ecosystem/file-formats/) have `gzip` compression enabled by default. If you wish that your load packages have uncompressed files (e.g., to debug the content easily), change `data_writer.disable_compression` in config.toml. The entry below will disable the compression of the files processed in the `normalize` stage. @@ -148,7 +146,10 @@ As before, **if you have just a single table with millions of records, you shoul -Since the normalize stage uses a process pool to create load packages concurrently, adjusting the `file_max_items` and `file_max_bytes` settings can significantly impact load behavior. By setting a lower value for `file_max_items`, you reduce the size of each data chunk sent to the destination database, which can be particularly useful for managing memory constraints on the database server. Without explicit configuration of `file_max_items`, `dlt` writes all data rows into one large intermediary file, attempting to insert all data from this single file. Configuring `file_max_items` ensures data is inserted in manageable chunks, enhancing performance and preventing potential memory issues. +The **normalize** stage in `dlt` uses a process pool to create load packages concurrently, and the settings for `file_max_items` and `file_max_bytes` play a crucial role in determining the size of data chunks. Lower values for these settings reduce the size of each chunk sent to the destination database, which is particularly helpful for managing memory constraints on the database server. By default, `dlt` writes all data rows into one large intermediary file, attempting to load all data at once. Configuring these settings enables file rotation, splitting the data into smaller, more manageable chunks. This not only improves performance but also minimizes memory-related issues when working with large tables containing millions of records. + +#### Controlling destination items size +The intermediary files generated during the **normalize** stage are also used in the **load** stage. Therefore, adjusting `file_max_items` and `file_max_bytes` in the **normalize** stage directly impacts the size and number of data chunks sent to the destination, influencing loading behavior and performance. ### Parallel pipeline config example The example below simulates the loading of a large database table with 1,000,000 records. The **config.toml** below sets the parallelization as follows: @@ -264,3 +265,29 @@ DLT_USE_JSON=simplejson Instead of using Python Requests directly, you can use the built-in [requests wrapper](../general-usage/http/requests) or [`RESTClient`](../general-usage/http/rest-client) for API calls. 
This will make your pipeline more resilient to intermittent network errors and other random glitches. + +## Keep the pipeline working folder in a bucket in constrained environments +`dlt` stores extracted data in load packages in order to load them atomically. If you extract a lot of data at once (e.g., a backfill) or +your runtime environment has constrained local storage (e.g., cloud functions), you can keep your data on a bucket by using [FUSE](https://github.com/libfuse/libfuse) or +any other option that your cloud provider supplies. + +`dlt` uses rename when saving files and "committing" packages (folder rename). Renames may not be supported on bucket filesystems. Often, +`rename` is translated into `copy` automatically. In other cases, `dlt` will fall back to copying itself. + +In the case of cloud functions and gs bucket mounts, it is possible to increase the rename limit for folders: +```hcl +volume_mounts { + mount_path = "/usr/src/ingestion/pipeline_storage" + name = "pipeline_bucket" + } +volumes { + name = "pipeline_bucket" + gcs { + bucket = google_storage_bucket.dlt_pipeline_data_bucket.name + read_only = false + mount_options = [ + "rename-dir-limit=100000" + ] + } +} +``` diff --git a/docs/website/docs/tutorial/load-data-from-an-api.md b/docs/website/docs/tutorial/load-data-from-an-api.md index ddfef2cbe8..73f780ba7a 100644 --- a/docs/website/docs/tutorial/load-data-from-an-api.md +++ b/docs/website/docs/tutorial/load-data-from-an-api.md @@ -72,7 +72,25 @@ Load package 1692364844.460054 is LOADED and contains no failed jobs `dlt` just created a database schema called **mydata** (the `dataset_name`) with a table **users** in it. -### Explore the data +### Explore data in Python + +You can use dlt [datasets](../general-usage/dataset-access/dataset) to easily query the data in pure Python. + +```py +# get the dataset +dataset = pipeline.dataset("mydata") + +# get the user relation +table = dataset.users + +# query the full table as dataframe +print(table.df()) + +# query the first 10 rows as arrow table +print(table.limit(10).arrow()) +``` + +### Explore data in Streamlit To allow a sneak peek and basic discovery, you can take advantage of [built-in integration with Streamlit](../reference/command-line-interface#show-tables-and-data-in-the-destination): 
## Loading data incrementally diff --git a/docs/website/sidebars.js b/docs/website/sidebars.js index 274f3e82b3..ca75c29392 100644 --- a/docs/website/sidebars.js +++ b/docs/website/sidebars.js @@ -167,6 +167,7 @@ const sidebars = { 'dlt-ecosystem/destinations/synapse', 'dlt-ecosystem/destinations/clickhouse', 'dlt-ecosystem/destinations/filesystem', + 'dlt-ecosystem/destinations/delta-iceberg', 'dlt-ecosystem/destinations/postgres', 'dlt-ecosystem/destinations/redshift', 'dlt-ecosystem/destinations/snowflake', @@ -210,13 +211,10 @@ const sidebars = { }, { type: 'category', - label: 'Transform the data', + label: 'Transforming data', link: { - type: 'generated-index', - title: 'Transform the data', - description: 'If you want to transform the data after loading, you can use one of the following methods: dbt, SQL, Pandas.', - slug: 'dlt-ecosystem/transformations', - keywords: ['transformations'], + type: 'doc', + id: 'dlt-ecosystem/transformations/index', }, items: [ { @@ -227,8 +225,8 @@ const sidebars = { 'dlt-ecosystem/transformations/dbt/dbt_cloud', ] }, + 'dlt-ecosystem/transformations/python', 'dlt-ecosystem/transformations/sql', - 'dlt-ecosystem/transformations/pandas', 'general-usage/customising-pipelines/renaming_columns', 'general-usage/customising-pipelines/pseudonymizing_columns', 'general-usage/customising-pipelines/removing_columns' diff --git a/docs/website/static/img/dlt-onepager.png b/docs/website/static/img/dlt-onepager.png new file mode 100644 index 0000000000..57b5d3f658 Binary files /dev/null and b/docs/website/static/img/dlt-onepager.png differ diff --git a/mypy.ini b/mypy.ini index 769e84b13a..fdf0ceb1e6 100644 --- a/mypy.ini +++ b/mypy.ini @@ -135,3 +135,9 @@ ignore_missing_imports = True [mypy-time_machine.*] ignore_missing_imports = True + +[mypy-pyiceberg.*] +ignore_missing_imports = True + +[mypy-airflow.*] +ignore_missing_imports = True diff --git a/poetry.lock b/poetry.lock index 732ba0e219..82d9bf90f8 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. 
[[package]] name = "about-time" @@ -776,7 +776,7 @@ files = [ name = "atpublic" version = "5.0" description = "Keep all y'all's __all__'s in sync" -optional = true +optional = false python-versions = ">=3.8" files = [ {file = "atpublic-5.0-py3-none-any.whl", hash = "sha256:b651dcd886666b1042d1e38158a22a4f2c267748f4e97fde94bc492a4a28a3f3"}, @@ -1543,13 +1543,13 @@ files = [ [[package]] name = "cachetools" -version = "5.3.1" +version = "5.5.0" description = "Extensible memoizing collections and decorators" optional = false python-versions = ">=3.7" files = [ - {file = "cachetools-5.3.1-py3-none-any.whl", hash = "sha256:95ef631eeaea14ba2e36f06437f36463aac3a096799e876ee55e5cdccb102590"}, - {file = "cachetools-5.3.1.tar.gz", hash = "sha256:dce83f2d9b4e1f732a8cd44af8e8fab2dbe46201467fc98b3ef8f269092bf62b"}, + {file = "cachetools-5.5.0-py3-none-any.whl", hash = "sha256:02134e8439cdc2ffb62023ce1debca2944c3f289d66bb17ead3ab3dede74b292"}, + {file = "cachetools-5.5.0.tar.gz", hash = "sha256:2cc24fb4cbe39633fb7badd9db9ca6295d766d9c2995f245725a46715d050f2a"}, ] [[package]] @@ -1755,7 +1755,7 @@ PyYAML = ">=3.11" name = "clickhouse-connect" version = "0.7.8" description = "ClickHouse Database Core Driver for Python, Pandas, and Superset" -optional = true +optional = false python-versions = "~=3.8" files = [ {file = "clickhouse-connect-0.7.8.tar.gz", hash = "sha256:dad10ba90eabfe215dfb1fef59f2821a95c752988e66f1093ca8590a51539b8f"}, @@ -2208,6 +2208,26 @@ nr-date = ">=2.0.0,<3.0.0" typeapi = ">=2.0.1,<3.0.0" typing-extensions = ">=3.10.0" +[[package]] +name = "databricks-sdk" +version = "0.39.0" +description = "Databricks SDK for Python (Beta)" +optional = true +python-versions = ">=3.7" +files = [ + {file = "databricks_sdk-0.39.0-py3-none-any.whl", hash = "sha256:915fbf12b249264f74ddae2ca739530e3c4a9c5a454617ac403115d6466c2f99"}, + {file = "databricks_sdk-0.39.0.tar.gz", hash = "sha256:2e04edbb9e050f4362da804fb5dad07637c5adecfcffb4d0ca8abb5aefa36d06"}, +] + +[package.dependencies] +google-auth = ">=2.0,<3.0" +requests = ">=2.28.1,<3" + +[package.extras] +dev = ["autoflake", "databricks-connect", "httpx", "ipython", "ipywidgets", "isort", "langchain-openai", "openai", "pycodestyle", "pyfakefs", "pytest", "pytest-cov", "pytest-mock", "pytest-rerunfailures", "pytest-xdist", "requests-mock", "wheel", "yapf"] +notebook = ["ipython (>=8,<9)", "ipywidgets (>=8,<9)"] +openai = ["httpx", "langchain-openai", "openai"] + [[package]] name = "databricks-sql-connector" version = "2.9.6" @@ -2242,7 +2262,7 @@ urllib3 = ">=1.0" name = "db-dtypes" version = "1.3.0" description = "Pandas Data Types for SQL systems (BigQuery, Spanner)" -optional = true +optional = false python-versions = ">=3.7" files = [ {file = "db_dtypes-1.3.0-py2.py3-none-any.whl", hash = "sha256:7e65c59f849ccbe6f7bc4d0253edcc212a7907662906921caba3e4aadd0bc277"}, @@ -3526,7 +3546,7 @@ tqdm = ["tqdm (>=4.7.4,<5.0.0dev)"] name = "google-cloud-bigquery-storage" version = "2.27.0" description = "Google Cloud Bigquery Storage API client library" -optional = true +optional = false python-versions = ">=3.7" files = [ {file = "google_cloud_bigquery_storage-2.27.0-py2.py3-none-any.whl", hash = "sha256:3bfa8f74a61ceaffd3bfe90be5bbef440ad81c1c19ac9075188cccab34bffc2b"}, @@ -3900,6 +3920,106 @@ files = [ {file = "google_re2-1.1-4-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1f4d4f0823e8b2f6952a145295b1ff25245ce9bb136aff6fe86452e507d4c1dd"}, {file = "google_re2-1.1-4-cp39-cp39-win32.whl", hash = 
"sha256:1afae56b2a07bb48cfcfefaa15ed85bae26a68f5dc7f9e128e6e6ea36914e847"}, {file = "google_re2-1.1-4-cp39-cp39-win_amd64.whl", hash = "sha256:aa7d6d05911ab9c8adbf3c225a7a120ab50fd2784ac48f2f0d140c0b7afc2b55"}, + {file = "google_re2-1.1-5-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:222fc2ee0e40522de0b21ad3bc90ab8983be3bf3cec3d349c80d76c8bb1a4beb"}, + {file = "google_re2-1.1-5-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:d4763b0b9195b72132a4e7de8e5a9bf1f05542f442a9115aa27cfc2a8004f581"}, + {file = "google_re2-1.1-5-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:209649da10c9d4a93d8a4d100ecbf9cc3b0252169426bec3e8b4ad7e57d600cf"}, + {file = "google_re2-1.1-5-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:68813aa333c1604a2df4a495b2a6ed065d7c8aebf26cc7e7abb5a6835d08353c"}, + {file = "google_re2-1.1-5-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:370a23ec775ad14e9d1e71474d56f381224dcf3e72b15d8ca7b4ad7dd9cd5853"}, + {file = "google_re2-1.1-5-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:14664a66a3ddf6bc9e56f401bf029db2d169982c53eff3f5876399104df0e9a6"}, + {file = "google_re2-1.1-5-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3ea3722cc4932cbcebd553b69dce1b4a73572823cff4e6a244f1c855da21d511"}, + {file = "google_re2-1.1-5-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e14bb264c40fd7c627ef5678e295370cd6ba95ca71d835798b6e37502fc4c690"}, + {file = "google_re2-1.1-5-cp310-cp310-win32.whl", hash = "sha256:39512cd0151ea4b3969c992579c79b423018b464624ae955be685fc07d94556c"}, + {file = "google_re2-1.1-5-cp310-cp310-win_amd64.whl", hash = "sha256:ac66537aa3bc5504320d922b73156909e3c2b6da19739c866502f7827b3f9fdf"}, + {file = "google_re2-1.1-5-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:5b5ea68d54890c9edb1b930dcb2658819354e5d3f2201f811798bbc0a142c2b4"}, + {file = "google_re2-1.1-5-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:33443511b6b83c35242370908efe2e8e1e7cae749c766b2b247bf30e8616066c"}, + {file = "google_re2-1.1-5-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:413d77bdd5ba0bfcada428b4c146e87707452ec50a4091ec8e8ba1413d7e0619"}, + {file = "google_re2-1.1-5-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:5171686e43304996a34baa2abcee6f28b169806d0e583c16d55e5656b092a414"}, + {file = "google_re2-1.1-5-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:3b284db130283771558e31a02d8eb8fb756156ab98ce80035ae2e9e3a5f307c4"}, + {file = "google_re2-1.1-5-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:296e6aed0b169648dc4b870ff47bd34c702a32600adb9926154569ef51033f47"}, + {file = "google_re2-1.1-5-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:38d50e68ead374160b1e656bbb5d101f0b95fb4cc57f4a5c12100155001480c5"}, + {file = "google_re2-1.1-5-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2a0416a35921e5041758948bcb882456916f22845f66a93bc25070ef7262b72a"}, + {file = "google_re2-1.1-5-cp311-cp311-win32.whl", hash = "sha256:a1d59568bbb5de5dd56dd6cdc79907db26cce63eb4429260300c65f43469e3e7"}, + {file = "google_re2-1.1-5-cp311-cp311-win_amd64.whl", hash = "sha256:72f5a2f179648b8358737b2b493549370debd7d389884a54d331619b285514e3"}, + {file = "google_re2-1.1-5-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:cbc72c45937b1dc5acac3560eb1720007dccca7c9879138ff874c7f6baf96005"}, + {file = "google_re2-1.1-5-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:5fadd1417fbef7235fa9453dba4eb102e6e7d94b1e4c99d5fa3dd4e288d0d2ae"}, + {file = 
"google_re2-1.1-5-cp312-cp312-macosx_13_0_arm64.whl", hash = "sha256:040f85c63cc02696485b59b187a5ef044abe2f99b92b4fb399de40b7d2904ccc"}, + {file = "google_re2-1.1-5-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:64e3b975ee6d9bbb2420494e41f929c1a0de4bcc16d86619ab7a87f6ea80d6bd"}, + {file = "google_re2-1.1-5-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:8ee370413e00f4d828eaed0e83b8af84d7a72e8ee4f4bd5d3078bc741dfc430a"}, + {file = "google_re2-1.1-5-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:5b89383001079323f693ba592d7aad789d7a02e75adb5d3368d92b300f5963fd"}, + {file = "google_re2-1.1-5-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:63cb4fdfbbda16ae31b41a6388ea621510db82feb8217a74bf36552ecfcd50ad"}, + {file = "google_re2-1.1-5-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9ebedd84ae8be10b7a71a16162376fd67a2386fe6361ef88c622dcf7fd679daf"}, + {file = "google_re2-1.1-5-cp312-cp312-win32.whl", hash = "sha256:c8e22d1692bc2c81173330c721aff53e47ffd3c4403ff0cd9d91adfd255dd150"}, + {file = "google_re2-1.1-5-cp312-cp312-win_amd64.whl", hash = "sha256:5197a6af438bb8c4abda0bbe9c4fbd6c27c159855b211098b29d51b73e4cbcf6"}, + {file = "google_re2-1.1-5-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:b6727e0b98417e114b92688ad2aa256102ece51f29b743db3d831df53faf1ce3"}, + {file = "google_re2-1.1-5-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:711e2b6417eb579c61a4951029d844f6b95b9b373b213232efd413659889a363"}, + {file = "google_re2-1.1-5-cp38-cp38-macosx_13_0_arm64.whl", hash = "sha256:71ae8b3df22c5c154c8af0f0e99d234a450ef1644393bc2d7f53fc8c0a1e111c"}, + {file = "google_re2-1.1-5-cp38-cp38-macosx_13_0_x86_64.whl", hash = "sha256:94a04e214bc521a3807c217d50cf099bbdd0c0a80d2d996c0741dbb995b5f49f"}, + {file = "google_re2-1.1-5-cp38-cp38-macosx_14_0_arm64.whl", hash = "sha256:a770f75358508a9110c81a1257721f70c15d9bb592a2fb5c25ecbd13566e52a5"}, + {file = "google_re2-1.1-5-cp38-cp38-macosx_14_0_x86_64.whl", hash = "sha256:07c9133357f7e0b17c6694d5dcb82e0371f695d7c25faef2ff8117ef375343ff"}, + {file = "google_re2-1.1-5-cp38-cp38-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:204ca6b1cf2021548f4a9c29ac015e0a4ab0a7b6582bf2183d838132b60c8fda"}, + {file = "google_re2-1.1-5-cp38-cp38-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f0b95857c2c654f419ca684ec38c9c3325c24e6ba7d11910a5110775a557bb18"}, + {file = "google_re2-1.1-5-cp38-cp38-win32.whl", hash = "sha256:347ac770e091a0364e822220f8d26ab53e6fdcdeaec635052000845c5a3fb869"}, + {file = "google_re2-1.1-5-cp38-cp38-win_amd64.whl", hash = "sha256:ec32bb6de7ffb112a07d210cf9f797b7600645c2d5910703fa07f456dd2150e0"}, + {file = "google_re2-1.1-5-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:eb5adf89060f81c5ff26c28e261e6b4997530a923a6093c9726b8dec02a9a326"}, + {file = "google_re2-1.1-5-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:a22630c9dd9ceb41ca4316bccba2643a8b1d5c198f21c00ed5b50a94313aaf10"}, + {file = "google_re2-1.1-5-cp39-cp39-macosx_13_0_arm64.whl", hash = "sha256:544dc17fcc2d43ec05f317366375796351dec44058e1164e03c3f7d050284d58"}, + {file = "google_re2-1.1-5-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:19710af5ea88751c7768575b23765ce0dfef7324d2539de576f75cdc319d6654"}, + {file = "google_re2-1.1-5-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:f82995a205e08ad896f4bd5ce4847c834fab877e1772a44e5f262a647d8a1dec"}, + {file = "google_re2-1.1-5-cp39-cp39-macosx_14_0_x86_64.whl", hash = 
"sha256:63533c4d58da9dc4bc040250f1f52b089911699f0368e0e6e15f996387a984ed"}, + {file = "google_re2-1.1-5-cp39-cp39-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:79e00fcf0cb04ea35a22b9014712d448725ce4ddc9f08cc818322566176ca4b0"}, + {file = "google_re2-1.1-5-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bc41afcefee2da6c4ed883a93d7f527c4b960cd1d26bbb0020a7b8c2d341a60a"}, + {file = "google_re2-1.1-5-cp39-cp39-win32.whl", hash = "sha256:486730b5e1f1c31b0abc6d80abe174ce4f1188fe17d1b50698f2bf79dc6e44be"}, + {file = "google_re2-1.1-5-cp39-cp39-win_amd64.whl", hash = "sha256:4de637ca328f1d23209e80967d1b987d6b352cd01b3a52a84b4d742c69c3da6c"}, + {file = "google_re2-1.1-6-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:621e9c199d1ff0fdb2a068ad450111a84b3bf14f96dfe5a8a7a0deae5f3f4cce"}, + {file = "google_re2-1.1-6-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:220acd31e7dde95373f97c3d1f3b3bd2532b38936af28b1917ee265d25bebbf4"}, + {file = "google_re2-1.1-6-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:db34e1098d164f76251a6ece30e8f0ddfd65bb658619f48613ce71acb3f9cbdb"}, + {file = "google_re2-1.1-6-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:5152bac41d8073977582f06257219541d0fc46ad99b0bbf30e8f60198a43b08c"}, + {file = "google_re2-1.1-6-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:6191294799e373ee1735af91f55abd23b786bdfd270768a690d9d55af9ea1b0d"}, + {file = "google_re2-1.1-6-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:070cbafbb4fecbb02e98feb28a1eb292fb880f434d531f38cc33ee314b521f1f"}, + {file = "google_re2-1.1-6-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8437d078b405a59a576cbed544490fe041140f64411f2d91012e8ec05ab8bf86"}, + {file = "google_re2-1.1-6-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f00f9a9af8896040e37896d9b9fc409ad4979f1ddd85bb188694a7d95ddd1164"}, + {file = "google_re2-1.1-6-cp310-cp310-win32.whl", hash = "sha256:df26345f229a898b4fd3cafd5f82259869388cee6268fc35af16a8e2293dd4e5"}, + {file = "google_re2-1.1-6-cp310-cp310-win_amd64.whl", hash = "sha256:3665d08262c57c9b28a5bdeb88632ad792c4e5f417e5645901695ab2624f5059"}, + {file = "google_re2-1.1-6-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:b26b869d8aa1d8fe67c42836bf3416bb72f444528ee2431cfb59c0d3e02c6ce3"}, + {file = "google_re2-1.1-6-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:41fd4486c57dea4f222a6bb7f1ff79accf76676a73bdb8da0fcbd5ba73f8da71"}, + {file = "google_re2-1.1-6-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:0ee378e2e74e25960070c338c28192377c4dd41e7f4608f2688064bd2badc41e"}, + {file = "google_re2-1.1-6-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:a00cdbf662693367b36d075b29feb649fd7ee1b617cf84f85f2deebeda25fc64"}, + {file = "google_re2-1.1-6-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:4c09455014217a41499432b8c8f792f25f3df0ea2982203c3a8c8ca0e7895e69"}, + {file = "google_re2-1.1-6-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:6501717909185327935c7945e23bb5aa8fc7b6f237b45fe3647fa36148662158"}, + {file = "google_re2-1.1-6-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3510b04790355f199e7861c29234081900e1e1cbf2d1484da48aa0ba6d7356ab"}, + {file = "google_re2-1.1-6-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8c0e64c187ca406764f9e9ad6e750d62e69ed8f75bf2e865d0bfbc03b642361c"}, + {file = "google_re2-1.1-6-cp311-cp311-win32.whl", hash = 
"sha256:2a199132350542b0de0f31acbb3ca87c3a90895d1d6e5235f7792bb0af02e523"}, + {file = "google_re2-1.1-6-cp311-cp311-win_amd64.whl", hash = "sha256:83bdac8ceaece8a6db082ea3a8ba6a99a2a1ee7e9f01a9d6d50f79c6f251a01d"}, + {file = "google_re2-1.1-6-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:81985ff894cd45ab5a73025922ac28c0707759db8171dd2f2cc7a0e856b6b5ad"}, + {file = "google_re2-1.1-6-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:5635af26065e6b45456ccbea08674ae2ab62494008d9202df628df3b267bc095"}, + {file = "google_re2-1.1-6-cp312-cp312-macosx_13_0_arm64.whl", hash = "sha256:813b6f04de79f4a8fdfe05e2cb33e0ccb40fe75d30ba441d519168f9d958bd54"}, + {file = "google_re2-1.1-6-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:5ec2f5332ad4fd232c3f2d6748c2c7845ccb66156a87df73abcc07f895d62ead"}, + {file = "google_re2-1.1-6-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:5a687b3b32a6cbb731647393b7c4e3fde244aa557f647df124ff83fb9b93e170"}, + {file = "google_re2-1.1-6-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:39a62f9b3db5d3021a09a47f5b91708b64a0580193e5352751eb0c689e4ad3d7"}, + {file = "google_re2-1.1-6-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ca0f0b45d4a1709cbf5d21f355e5809ac238f1ee594625a1e5ffa9ff7a09eb2b"}, + {file = "google_re2-1.1-6-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a64b3796a7a616c7861247bd061c9a836b5caf0d5963e5ea8022125601cf7b09"}, + {file = "google_re2-1.1-6-cp312-cp312-win32.whl", hash = "sha256:32783b9cb88469ba4cd9472d459fe4865280a6b1acdad4480a7b5081144c4eb7"}, + {file = "google_re2-1.1-6-cp312-cp312-win_amd64.whl", hash = "sha256:259ff3fd2d39035b9cbcbf375995f83fa5d9e6a0c5b94406ff1cc168ed41d6c6"}, + {file = "google_re2-1.1-6-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:e4711bcffe190acd29104d8ecfea0c0e42b754837de3fb8aad96e6cc3c613cdc"}, + {file = "google_re2-1.1-6-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:4d081cce43f39c2e813fe5990e1e378cbdb579d3f66ded5bade96130269ffd75"}, + {file = "google_re2-1.1-6-cp38-cp38-macosx_13_0_arm64.whl", hash = "sha256:4f123b54d48450d2d6b14d8fad38e930fb65b5b84f1b022c10f2913bd956f5b5"}, + {file = "google_re2-1.1-6-cp38-cp38-macosx_13_0_x86_64.whl", hash = "sha256:e1928b304a2b591a28eb3175f9db7f17c40c12cf2d4ec2a85fdf1cc9c073ff91"}, + {file = "google_re2-1.1-6-cp38-cp38-macosx_14_0_arm64.whl", hash = "sha256:3a69f76146166aec1173003c1f547931bdf288c6b135fda0020468492ac4149f"}, + {file = "google_re2-1.1-6-cp38-cp38-macosx_14_0_x86_64.whl", hash = "sha256:fc08c388f4ebbbca345e84a0c56362180d33d11cbe9ccfae663e4db88e13751e"}, + {file = "google_re2-1.1-6-cp38-cp38-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b057adf38ce4e616486922f2f47fc7d19c827ba0a7f69d540a3664eba2269325"}, + {file = "google_re2-1.1-6-cp38-cp38-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4138c0b933ab099e96f5d8defce4486f7dfd480ecaf7f221f2409f28022ccbc5"}, + {file = "google_re2-1.1-6-cp38-cp38-win32.whl", hash = "sha256:9693e45b37b504634b1abbf1ee979471ac6a70a0035954592af616306ab05dd6"}, + {file = "google_re2-1.1-6-cp38-cp38-win_amd64.whl", hash = "sha256:5674d437baba0ea287a5a7f8f81f24265d6ae8f8c09384e2ef7b6f84b40a7826"}, + {file = "google_re2-1.1-6-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:7783137cb2e04f458a530c6d0ee9ef114815c1d48b9102f023998c371a3b060e"}, + {file = "google_re2-1.1-6-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:a49b7153935e7a303675f4deb5f5d02ab1305adefc436071348706d147c889e0"}, + {file = 
"google_re2-1.1-6-cp39-cp39-macosx_13_0_arm64.whl", hash = "sha256:a96a8bb309182090704593c60bdb369a2756b38fe358bbf0d40ddeb99c71769f"}, + {file = "google_re2-1.1-6-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:dff3d4be9f27ef8ec3705eed54f19ef4ab096f5876c15fe011628c69ba3b561c"}, + {file = "google_re2-1.1-6-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:40f818b0b39e26811fa677978112a8108269977fdab2ba0453ac4363c35d9e66"}, + {file = "google_re2-1.1-6-cp39-cp39-macosx_14_0_x86_64.whl", hash = "sha256:8a7e53538cdb40ef4296017acfbb05cab0c19998be7552db1cfb85ba40b171b9"}, + {file = "google_re2-1.1-6-cp39-cp39-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6ee18e7569fb714e5bb8c42809bf8160738637a5e71ed5a4797757a1fb4dc4de"}, + {file = "google_re2-1.1-6-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1cda4f6d1a7d5b43ea92bc395f23853fba0caf8b1e1efa6e8c48685f912fcb89"}, + {file = "google_re2-1.1-6-cp39-cp39-win32.whl", hash = "sha256:6a9cdbdc36a2bf24f897be6a6c85125876dc26fea9eb4247234aec0decbdccfd"}, + {file = "google_re2-1.1-6-cp39-cp39-win_amd64.whl", hash = "sha256:73f646cecfad7cc5b4330b4192c25f2e29730a3b8408e089ffd2078094208196"}, ] [[package]] @@ -4404,63 +4524,64 @@ files = [ [[package]] name = "ibis-framework" -version = "10.0.0.dev256" +version = "9.5.0" description = "The portable Python dataframe library" -optional = true +optional = false python-versions = "<4.0,>=3.10" files = [ - {file = "ibis_framework-10.0.0.dev256-py3-none-any.whl", hash = "sha256:d6f21278e6fd78920bbe986df2c871921142635cc4f7d5d2048cae26e307a3df"}, - {file = "ibis_framework-10.0.0.dev256.tar.gz", hash = "sha256:e9f97d8177fd88f4a3578be20519c1da79a6a7ffac678b46b790bfde67405930"}, + {file = "ibis_framework-9.5.0-py3-none-any.whl", hash = "sha256:145fe30d94f111cff332580c275ce77725c5ff7086eede93af0b371649d009c0"}, + {file = "ibis_framework-9.5.0.tar.gz", hash = "sha256:1c8a29277e63ee0dfc289bc8f550164b5e3bdaec1b76b62436c37d331bb4ef84"}, ] [package.dependencies] atpublic = ">=2.3,<6" clickhouse-connect = {version = ">=0.5.23,<1", extras = ["arrow", "numpy", "pandas"], optional = true, markers = "extra == \"clickhouse\""} db-dtypes = {version = ">=0.3,<2", optional = true, markers = "extra == \"bigquery\""} -duckdb = {version = ">=0.10,<1.2", optional = true, markers = "extra == \"duckdb\""} +duckdb = {version = ">=0.8.1,<1.2", optional = true, markers = "extra == \"duckdb\""} google-cloud-bigquery = {version = ">=3,<4", optional = true, markers = "extra == \"bigquery\""} google-cloud-bigquery-storage = {version = ">=2,<3", optional = true, markers = "extra == \"bigquery\""} -numpy = {version = ">=1.23.2,<3", optional = true, markers = "extra == \"bigquery\" or extra == \"clickhouse\" or extra == \"databricks\" or extra == \"datafusion\" or extra == \"druid\" or extra == \"duckdb\" or extra == \"exasol\" or extra == \"flink\" or extra == \"impala\" or extra == \"mssql\" or extra == \"mysql\" or extra == \"oracle\" or extra == \"polars\" or extra == \"postgres\" or extra == \"pyspark\" or extra == \"snowflake\" or extra == \"sqlite\" or extra == \"risingwave\" or extra == \"trino\""} -packaging = {version = ">=21.3,<25", optional = true, markers = "extra == \"duckdb\" or extra == \"oracle\" or extra == \"polars\" or extra == \"pyspark\""} -pandas = {version = ">=1.5.3,<3", optional = true, markers = "extra == \"bigquery\" or extra == \"clickhouse\" or extra == \"databricks\" or extra == \"datafusion\" or extra == \"druid\" or extra == \"duckdb\" or extra == \"exasol\" or extra == 
\"flink\" or extra == \"impala\" or extra == \"mssql\" or extra == \"mysql\" or extra == \"oracle\" or extra == \"polars\" or extra == \"postgres\" or extra == \"pyspark\" or extra == \"snowflake\" or extra == \"sqlite\" or extra == \"risingwave\" or extra == \"trino\""} +numpy = {version = ">=1.23.2,<3", optional = true, markers = "extra == \"bigquery\" or extra == \"clickhouse\" or extra == \"dask\" or extra == \"datafusion\" or extra == \"druid\" or extra == \"duckdb\" or extra == \"exasol\" or extra == \"flink\" or extra == \"impala\" or extra == \"mssql\" or extra == \"mysql\" or extra == \"oracle\" or extra == \"pandas\" or extra == \"polars\" or extra == \"postgres\" or extra == \"pyspark\" or extra == \"snowflake\" or extra == \"sqlite\" or extra == \"risingwave\" or extra == \"trino\""} +packaging = {version = ">=21.3,<25", optional = true, markers = "extra == \"dask\" or extra == \"duckdb\" or extra == \"oracle\" or extra == \"pandas\" or extra == \"polars\" or extra == \"pyspark\""} +pandas = {version = ">=1.5.3,<3", optional = true, markers = "extra == \"bigquery\" or extra == \"clickhouse\" or extra == \"dask\" or extra == \"datafusion\" or extra == \"druid\" or extra == \"duckdb\" or extra == \"exasol\" or extra == \"flink\" or extra == \"impala\" or extra == \"mssql\" or extra == \"mysql\" or extra == \"oracle\" or extra == \"pandas\" or extra == \"polars\" or extra == \"postgres\" or extra == \"pyspark\" or extra == \"snowflake\" or extra == \"sqlite\" or extra == \"risingwave\" or extra == \"trino\""} parsy = ">=2,<3" psycopg2 = {version = ">=2.8.4,<3", optional = true, markers = "extra == \"postgres\" or extra == \"risingwave\""} -pyarrow = {version = ">=10.0.1,<19", optional = true, markers = "extra == \"bigquery\" or extra == \"clickhouse\" or extra == \"databricks\" or extra == \"datafusion\" or extra == \"druid\" or extra == \"duckdb\" or extra == \"exasol\" or extra == \"flink\" or extra == \"impala\" or extra == \"mssql\" or extra == \"mysql\" or extra == \"oracle\" or extra == \"polars\" or extra == \"postgres\" or extra == \"pyspark\" or extra == \"snowflake\" or extra == \"sqlite\" or extra == \"risingwave\" or extra == \"trino\""} -pyarrow-hotfix = {version = ">=0.4,<1", optional = true, markers = "extra == \"bigquery\" or extra == \"clickhouse\" or extra == \"databricks\" or extra == \"datafusion\" or extra == \"druid\" or extra == \"duckdb\" or extra == \"exasol\" or extra == \"flink\" or extra == \"impala\" or extra == \"mssql\" or extra == \"mysql\" or extra == \"oracle\" or extra == \"polars\" or extra == \"postgres\" or extra == \"pyspark\" or extra == \"snowflake\" or extra == \"sqlite\" or extra == \"risingwave\" or extra == \"trino\""} +pyarrow = {version = ">=10.0.1,<18", optional = true, markers = "extra == \"bigquery\" or extra == \"clickhouse\" or extra == \"dask\" or extra == \"datafusion\" or extra == \"druid\" or extra == \"duckdb\" or extra == \"exasol\" or extra == \"flink\" or extra == \"impala\" or extra == \"mssql\" or extra == \"mysql\" or extra == \"oracle\" or extra == \"pandas\" or extra == \"polars\" or extra == \"postgres\" or extra == \"pyspark\" or extra == \"snowflake\" or extra == \"sqlite\" or extra == \"risingwave\" or extra == \"trino\""} +pyarrow-hotfix = {version = ">=0.4,<1", optional = true, markers = "extra == \"bigquery\" or extra == \"clickhouse\" or extra == \"dask\" or extra == \"datafusion\" or extra == \"druid\" or extra == \"duckdb\" or extra == \"exasol\" or extra == \"flink\" or extra == \"impala\" or extra == 
\"mssql\" or extra == \"mysql\" or extra == \"oracle\" or extra == \"pandas\" or extra == \"polars\" or extra == \"postgres\" or extra == \"pyspark\" or extra == \"snowflake\" or extra == \"sqlite\" or extra == \"risingwave\" or extra == \"trino\""} pydata-google-auth = {version = ">=1.4.0,<2", optional = true, markers = "extra == \"bigquery\""} pyodbc = {version = ">=4.0.39,<6", optional = true, markers = "extra == \"mssql\""} python-dateutil = ">=2.8.2,<3" pytz = ">=2022.7" -rich = {version = ">=12.4.4,<14", optional = true, markers = "extra == \"bigquery\" or extra == \"clickhouse\" or extra == \"databricks\" or extra == \"datafusion\" or extra == \"druid\" or extra == \"duckdb\" or extra == \"exasol\" or extra == \"flink\" or extra == \"impala\" or extra == \"mssql\" or extra == \"mysql\" or extra == \"oracle\" or extra == \"polars\" or extra == \"postgres\" or extra == \"pyspark\" or extra == \"snowflake\" or extra == \"sqlite\" or extra == \"risingwave\" or extra == \"trino\""} +rich = {version = ">=12.4.4,<14", optional = true, markers = "extra == \"bigquery\" or extra == \"clickhouse\" or extra == \"dask\" or extra == \"datafusion\" or extra == \"druid\" or extra == \"duckdb\" or extra == \"exasol\" or extra == \"flink\" or extra == \"impala\" or extra == \"mssql\" or extra == \"mysql\" or extra == \"oracle\" or extra == \"pandas\" or extra == \"polars\" or extra == \"postgres\" or extra == \"pyspark\" or extra == \"snowflake\" or extra == \"sqlite\" or extra == \"risingwave\" or extra == \"trino\""} snowflake-connector-python = {version = ">=3.0.2,<3.3.0b1 || >3.3.0b1,<4", optional = true, markers = "extra == \"snowflake\""} -sqlglot = ">=23.4,<25.30" -toolz = ">=0.11,<2" +sqlglot = ">=23.4,<25.21" +toolz = ">=0.11,<1" typing-extensions = ">=4.3.0,<5" [package.extras] -bigquery = ["db-dtypes (>=0.3,<2)", "google-cloud-bigquery (>=3,<4)", "google-cloud-bigquery-storage (>=2,<3)", "numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "pydata-google-auth (>=1.4.0,<2)", "rich (>=12.4.4,<14)"] -clickhouse = ["clickhouse-connect[arrow,numpy,pandas] (>=0.5.23,<1)", "numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] -databricks = ["databricks-sql-connector-core (>=4,<5)", "numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] -datafusion = ["datafusion (>=0.6,<43)", "numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] +bigquery = ["db-dtypes (>=0.3,<2)", "google-cloud-bigquery (>=3,<4)", "google-cloud-bigquery-storage (>=2,<3)", "numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "pydata-google-auth (>=1.4.0,<2)", "rich (>=12.4.4,<14)"] +clickhouse = ["clickhouse-connect[arrow,numpy,pandas] (>=0.5.23,<1)", "numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] +dask = ["dask[array,dataframe] (>=2022.9.1,<2024.3.0)", "numpy (>=1.23.2,<3)", "packaging (>=21.3,<25)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "regex (>=2021.7.6)", "rich (>=12.4.4,<14)"] +datafusion = ["datafusion (>=0.6,<41)", "numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] decompiler = ["black (>=22.1.0,<25)"] deltalake = ["deltalake 
(>=0.9.0,<1)"] -druid = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "pydruid (>=0.6.7,<1)", "rich (>=12.4.4,<14)"] -duckdb = ["duckdb (>=0.10,<1.2)", "numpy (>=1.23.2,<3)", "packaging (>=21.3,<25)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] +druid = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "pydruid (>=0.6.7,<1)", "rich (>=12.4.4,<14)"] +duckdb = ["duckdb (>=0.8.1,<1.2)", "numpy (>=1.23.2,<3)", "packaging (>=21.3,<25)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] examples = ["pins[gcs] (>=0.8.3,<1)"] -exasol = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "pyexasol[pandas] (>=0.25.2,<1)", "rich (>=12.4.4,<14)"] -flink = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] +exasol = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "pyexasol[pandas] (>=0.25.2,<1)", "rich (>=12.4.4,<14)"] +flink = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] geospatial = ["geoarrow-types (>=0.2,<1)", "geopandas (>=0.6,<2)", "pyproj (>=3.3.0,<4)", "shapely (>=2,<3)"] -impala = ["impyla (>=0.17,<1)", "numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] -mssql = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "pyodbc (>=4.0.39,<6)", "rich (>=12.4.4,<14)"] -mysql = ["mysqlclient (>=2.2.4,<3)", "numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] -oracle = ["numpy (>=1.23.2,<3)", "oracledb (>=1.3.1,<3)", "packaging (>=21.3,<25)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] -polars = ["numpy (>=1.23.2,<3)", "packaging (>=21.3,<25)", "pandas (>=1.5.3,<3)", "polars (>=1,<2)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] -postgres = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "psycopg2 (>=2.8.4,<3)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] -pyspark = ["numpy (>=1.23.2,<3)", "packaging (>=21.3,<25)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "pyspark (>=3.3.3,<4)", "rich (>=12.4.4,<14)"] -risingwave = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "psycopg2 (>=2.8.4,<3)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] -snowflake = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)", "snowflake-connector-python (>=3.0.2,!=3.3.0b1,<4)"] -sqlite = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "regex (>=2021.7.6)", "rich (>=12.4.4,<14)"] -trino = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)", "trino (>=0.321,<1)"] +impala = ["impyla (>=0.17,<1)", "numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] +mssql = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<18)", 
"pyarrow-hotfix (>=0.4,<1)", "pyodbc (>=4.0.39,<6)", "rich (>=12.4.4,<14)"] +mysql = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "pymysql (>=1,<2)", "rich (>=12.4.4,<14)"] +oracle = ["numpy (>=1.23.2,<3)", "oracledb (>=1.3.1,<3)", "packaging (>=21.3,<25)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] +pandas = ["numpy (>=1.23.2,<3)", "packaging (>=21.3,<25)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "regex (>=2021.7.6)", "rich (>=12.4.4,<14)"] +polars = ["numpy (>=1.23.2,<3)", "packaging (>=21.3,<25)", "pandas (>=1.5.3,<3)", "polars (>=1,<2)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] +postgres = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "psycopg2 (>=2.8.4,<3)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] +pyspark = ["numpy (>=1.23.2,<3)", "packaging (>=21.3,<25)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "pyspark (>=3.3.3,<4)", "rich (>=12.4.4,<14)"] +risingwave = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "psycopg2 (>=2.8.4,<3)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] +snowflake = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)", "snowflake-connector-python (>=3.0.2,!=3.3.0b1,<4)"] +sqlite = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "regex (>=2021.7.6)", "rich (>=12.4.4,<14)"] +trino = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)", "trino (>=0.321,<1)"] visualization = ["graphviz (>=0.16,<1)"] [[package]] @@ -5112,7 +5233,7 @@ source = ["Cython (>=0.29.35)"] name = "lz4" version = "4.3.3" description = "LZ4 Bindings for Python" -optional = true +optional = false python-versions = ">=3.8" files = [ {file = "lz4-4.3.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b891880c187e96339474af2a3b2bfb11a8e4732ff5034be919aa9029484cd201"}, @@ -5771,44 +5892,49 @@ files = [ [[package]] name = "mypy" -version = "1.10.0" +version = "1.12.1" description = "Optional static typing for Python" optional = false python-versions = ">=3.8" files = [ - {file = "mypy-1.10.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:da1cbf08fb3b851ab3b9523a884c232774008267b1f83371ace57f412fe308c2"}, - {file = "mypy-1.10.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:12b6bfc1b1a66095ab413160a6e520e1dc076a28f3e22f7fb25ba3b000b4ef99"}, - {file = "mypy-1.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9e36fb078cce9904c7989b9693e41cb9711e0600139ce3970c6ef814b6ebc2b2"}, - {file = "mypy-1.10.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:2b0695d605ddcd3eb2f736cd8b4e388288c21e7de85001e9f85df9187f2b50f9"}, - {file = "mypy-1.10.0-cp310-cp310-win_amd64.whl", hash = "sha256:cd777b780312ddb135bceb9bc8722a73ec95e042f911cc279e2ec3c667076051"}, - {file = "mypy-1.10.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3be66771aa5c97602f382230165b856c231d1277c511c9a8dd058be4784472e1"}, - {file = "mypy-1.10.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8b2cbaca148d0754a54d44121b5825ae71868c7592a53b7292eeb0f3fdae95ee"}, - {file = "mypy-1.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:1ec404a7cbe9fc0e92cb0e67f55ce0c025014e26d33e54d9e506a0f2d07fe5de"}, - {file = "mypy-1.10.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e22e1527dc3d4aa94311d246b59e47f6455b8729f4968765ac1eacf9a4760bc7"}, - {file = "mypy-1.10.0-cp311-cp311-win_amd64.whl", hash = "sha256:a87dbfa85971e8d59c9cc1fcf534efe664d8949e4c0b6b44e8ca548e746a8d53"}, - {file = "mypy-1.10.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:a781f6ad4bab20eef8b65174a57e5203f4be627b46291f4589879bf4e257b97b"}, - {file = "mypy-1.10.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b808e12113505b97d9023b0b5e0c0705a90571c6feefc6f215c1df9381256e30"}, - {file = "mypy-1.10.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f55583b12156c399dce2df7d16f8a5095291354f1e839c252ec6c0611e86e2e"}, - {file = "mypy-1.10.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4cf18f9d0efa1b16478c4c129eabec36148032575391095f73cae2e722fcf9d5"}, - {file = "mypy-1.10.0-cp312-cp312-win_amd64.whl", hash = "sha256:bc6ac273b23c6b82da3bb25f4136c4fd42665f17f2cd850771cb600bdd2ebeda"}, - {file = "mypy-1.10.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9fd50226364cd2737351c79807775136b0abe084433b55b2e29181a4c3c878c0"}, - {file = "mypy-1.10.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:f90cff89eea89273727d8783fef5d4a934be2fdca11b47def50cf5d311aff727"}, - {file = "mypy-1.10.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fcfc70599efde5c67862a07a1aaf50e55bce629ace26bb19dc17cece5dd31ca4"}, - {file = "mypy-1.10.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:075cbf81f3e134eadaf247de187bd604748171d6b79736fa9b6c9685b4083061"}, - {file = "mypy-1.10.0-cp38-cp38-win_amd64.whl", hash = "sha256:3f298531bca95ff615b6e9f2fc0333aae27fa48052903a0ac90215021cdcfa4f"}, - {file = "mypy-1.10.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:fa7ef5244615a2523b56c034becde4e9e3f9b034854c93639adb667ec9ec2976"}, - {file = "mypy-1.10.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3236a4c8f535a0631f85f5fcdffba71c7feeef76a6002fcba7c1a8e57c8be1ec"}, - {file = "mypy-1.10.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4a2b5cdbb5dd35aa08ea9114436e0d79aceb2f38e32c21684dcf8e24e1e92821"}, - {file = "mypy-1.10.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:92f93b21c0fe73dc00abf91022234c79d793318b8a96faac147cd579c1671746"}, - {file = "mypy-1.10.0-cp39-cp39-win_amd64.whl", hash = "sha256:28d0e038361b45f099cc086d9dd99c15ff14d0188f44ac883010e172ce86c38a"}, - {file = "mypy-1.10.0-py3-none-any.whl", hash = "sha256:f8c083976eb530019175aabadb60921e73b4f45736760826aa1689dda8208aee"}, - {file = "mypy-1.10.0.tar.gz", hash = "sha256:3d087fcbec056c4ee34974da493a826ce316947485cef3901f511848e687c131"}, + {file = "mypy-1.12.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:3d7d4371829184e22fda4015278fbfdef0327a4b955a483012bd2d423a788801"}, + {file = "mypy-1.12.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f59f1dfbf497d473201356966e353ef09d4daec48caeacc0254db8ef633a28a5"}, + {file = "mypy-1.12.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b947097fae68004b8328c55161ac9db7d3566abfef72d9d41b47a021c2fba6b1"}, + {file = "mypy-1.12.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:96af62050971c5241afb4701c15189ea9507db89ad07794a4ee7b4e092dc0627"}, + {file = "mypy-1.12.1-cp310-cp310-win_amd64.whl", hash = "sha256:d90da248f4c2dba6c44ddcfea94bb361e491962f05f41990ff24dbd09969ce20"}, + {file = 
"mypy-1.12.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:1230048fec1380faf240be6385e709c8570604d2d27ec6ca7e573e3bc09c3735"}, + {file = "mypy-1.12.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:02dcfe270c6ea13338210908f8cadc8d31af0f04cee8ca996438fe6a97b4ec66"}, + {file = "mypy-1.12.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a5a437c9102a6a252d9e3a63edc191a3aed5f2fcb786d614722ee3f4472e33f6"}, + {file = "mypy-1.12.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:186e0c8346efc027ee1f9acf5ca734425fc4f7dc2b60144f0fbe27cc19dc7931"}, + {file = "mypy-1.12.1-cp311-cp311-win_amd64.whl", hash = "sha256:673ba1140a478b50e6d265c03391702fa11a5c5aff3f54d69a62a48da32cb811"}, + {file = "mypy-1.12.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:9fb83a7be97c498176fb7486cafbb81decccaef1ac339d837c377b0ce3743a7f"}, + {file = "mypy-1.12.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:389e307e333879c571029d5b93932cf838b811d3f5395ed1ad05086b52148fb0"}, + {file = "mypy-1.12.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:94b2048a95a21f7a9ebc9fbd075a4fcd310410d078aa0228dbbad7f71335e042"}, + {file = "mypy-1.12.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4ee5932370ccf7ebf83f79d1c157a5929d7ea36313027b0d70a488493dc1b179"}, + {file = "mypy-1.12.1-cp312-cp312-win_amd64.whl", hash = "sha256:19bf51f87a295e7ab2894f1d8167622b063492d754e69c3c2fed6563268cb42a"}, + {file = "mypy-1.12.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:d34167d43613ffb1d6c6cdc0cc043bb106cac0aa5d6a4171f77ab92a3c758bcc"}, + {file = "mypy-1.12.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:427878aa54f2e2c5d8db31fa9010c599ed9f994b3b49e64ae9cd9990c40bd635"}, + {file = "mypy-1.12.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5fcde63ea2c9f69d6be859a1e6dd35955e87fa81de95bc240143cf00de1f7f81"}, + {file = "mypy-1.12.1-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:d54d840f6c052929f4a3d2aab2066af0f45a020b085fe0e40d4583db52aab4e4"}, + {file = "mypy-1.12.1-cp313-cp313-win_amd64.whl", hash = "sha256:20db6eb1ca3d1de8ece00033b12f793f1ea9da767334b7e8c626a4872090cf02"}, + {file = "mypy-1.12.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:b16fe09f9c741d85a2e3b14a5257a27a4f4886c171d562bc5a5e90d8591906b8"}, + {file = "mypy-1.12.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:0dcc1e843d58f444fce19da4cce5bd35c282d4bde232acdeca8279523087088a"}, + {file = "mypy-1.12.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e10ba7de5c616e44ad21005fa13450cd0de7caaa303a626147d45307492e4f2d"}, + {file = "mypy-1.12.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:0e6fe449223fa59fbee351db32283838a8fee8059e0028e9e6494a03802b4004"}, + {file = "mypy-1.12.1-cp38-cp38-win_amd64.whl", hash = "sha256:dc6e2a2195a290a7fd5bac3e60b586d77fc88e986eba7feced8b778c373f9afe"}, + {file = "mypy-1.12.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:de5b2a8988b4e1269a98beaf0e7cc71b510d050dce80c343b53b4955fff45f19"}, + {file = "mypy-1.12.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:843826966f1d65925e8b50d2b483065c51fc16dc5d72647e0236aae51dc8d77e"}, + {file = "mypy-1.12.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9fe20f89da41a95e14c34b1ddb09c80262edcc295ad891f22cc4b60013e8f78d"}, + {file = "mypy-1.12.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = 
"sha256:8135ffec02121a75f75dc97c81af7c14aa4ae0dda277132cfcd6abcd21551bfd"}, + {file = "mypy-1.12.1-cp39-cp39-win_amd64.whl", hash = "sha256:a7b76fa83260824300cc4834a3ab93180db19876bce59af921467fd03e692810"}, + {file = "mypy-1.12.1-py3-none-any.whl", hash = "sha256:ce561a09e3bb9863ab77edf29ae3a50e65685ad74bba1431278185b7e5d5486e"}, + {file = "mypy-1.12.1.tar.gz", hash = "sha256:f5b3936f7a6d0e8280c9bdef94c7ce4847f5cdfc258fbb2c29a8c1711e8bb96d"}, ] [package.dependencies] mypy-extensions = ">=1.0.0" tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} -typing-extensions = ">=4.1.0" +typing-extensions = ">=4.6.0" [package.extras] dmypy = ["psutil (>=4.0)"] @@ -6545,7 +6671,7 @@ future = "*" name = "parsy" version = "2.1" description = "Easy-to-use parser combinators, for parsing in pure Python" -optional = true +optional = false python-versions = ">=3.7" files = [ {file = "parsy-2.1-py3-none-any.whl", hash = "sha256:8f18e7b11985e7802e7e3ecbd8291c6ca243d29820b1186e4c84605db4efffa0"}, @@ -6980,7 +7106,7 @@ test = ["enum34", "ipaddress", "mock", "pywin32", "wmi"] name = "psycopg2" version = "2.9.10" description = "psycopg2 - Python-PostgreSQL Database Adapter" -optional = true +optional = false python-versions = ">=3.8" files = [ {file = "psycopg2-2.9.10-cp310-cp310-win32.whl", hash = "sha256:5df2b672140f95adb453af93a7d669d7a7bf0a56bcd26f1502329166f4a61716"}, @@ -7143,7 +7269,7 @@ test = ["cffi", "hypothesis", "pandas", "pytest", "pytz"] name = "pyarrow-hotfix" version = "0.6" description = "" -optional = true +optional = false python-versions = ">=3.5" files = [ {file = "pyarrow_hotfix-0.6-py3-none-any.whl", hash = "sha256:dcc9ae2d220dff0083be6a9aa8e0cdee5182ad358d4931fce825c545e5c89178"}, @@ -7356,7 +7482,7 @@ typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0" name = "pydata-google-auth" version = "1.9.0" description = "PyData helpers for authenticating to Google APIs" -optional = true +optional = false python-versions = ">=3.9" files = [ {file = "pydata-google-auth-1.9.0.tar.gz", hash = "sha256:2f546e88f007dfdb050087556eb46d6008e351386a7b368096797fae5df374f2"}, @@ -7420,6 +7546,74 @@ files = [ [package.extras] plugins = ["importlib-metadata"] +[[package]] +name = "pyiceberg" +version = "0.8.1" +description = "Apache Iceberg is an open table format for huge analytic datasets" +optional = true +python-versions = "!=2.7.*,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,!=3.7.*,!=3.8.*,>=3.9" +files = [ + {file = "pyiceberg-0.8.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:7c121d1d3baf64510db94740ad870ae4b6eb9eb59a5ff7ecb4e96f7510666b2f"}, + {file = "pyiceberg-0.8.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2a6f14aa588a3883fc7fddc136ca75b75660b4abb0b55b4c541619953f8971e7"}, + {file = "pyiceberg-0.8.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7c720c2a191ac6faf01fe4c0f4c01c64b94bf064185b0292003d42939049277c"}, + {file = "pyiceberg-0.8.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:d421d6e51ac1c581cba9fce96aa6b9118cf4a02270066a7fdc9490ab5d57ece9"}, + {file = "pyiceberg-0.8.1-cp310-cp310-win_amd64.whl", hash = "sha256:ae11fb0515ea0a046370e09a7f6039a7e86622ab910360eaa732f0106b8f00c7"}, + {file = "pyiceberg-0.8.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:9488954c9eb5ce42ca6b816fc61873f219414cfdb9e9928d1c4a302702be1d89"}, + {file = "pyiceberg-0.8.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:44179e0fb844887b440c162279ba526dfe0e0f72d32945236528838518b55af0"}, + {file = 
"pyiceberg-0.8.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3e121c6f5505d8ec711a1dd1690e07156cd54fb3d0844d5d991e02f1593f2708"}, + {file = "pyiceberg-0.8.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:5961a288f2d4bbb2ab300c803da1bf0e70cea837e3f14b14108827cc821af252"}, + {file = "pyiceberg-0.8.1-cp311-cp311-win_amd64.whl", hash = "sha256:dbe192324a6fb552c2fd29cab51086e21fa248ea2a0b95fbab921dede49e5a69"}, + {file = "pyiceberg-0.8.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:60430f0d8f6d650ed7d1893d038b847565a8e9ac135a1cc812e57d24f0482f6c"}, + {file = "pyiceberg-0.8.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f0f697977dac672d8b00e125836423585a97ebf59a28b865b1296a2b6ee81c51"}, + {file = "pyiceberg-0.8.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:370de7c230970ff858f713d150164d492ba8450e771e59a0c520520b13ea6226"}, + {file = "pyiceberg-0.8.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:3036ed226020d50e30648a71f968cf78bde5d6b609294508e60754e100e5ef36"}, + {file = "pyiceberg-0.8.1-cp312-cp312-win_amd64.whl", hash = "sha256:9ac9555f3bd25a31059229089ae639cf738a8e8286a175cea128561ac1ed9452"}, + {file = "pyiceberg-0.8.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:51da3a553d3a881042bf436e66a91cc2b6c4a3fea0e174cd73af2eb6ed255323"}, + {file = "pyiceberg-0.8.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:863f1dce7340e6ed870706a3fa4a73457178dae8529725bb80522ddcd4253afb"}, + {file = "pyiceberg-0.8.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4dbf52b39080a6a2cda6a5126a74e3a88d5b206f609c128d001a728b36b81075"}, + {file = "pyiceberg-0.8.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:eb77d65e8efbb883c163817e4a9c373d907110ab6343c1b816b48f336955d4d7"}, + {file = "pyiceberg-0.8.1-cp39-cp39-win_amd64.whl", hash = "sha256:1fcd35b7de0eddc3fd8fd0c38b98741217ef6de4eeb0e72b798b4007692aa76c"}, + {file = "pyiceberg-0.8.1-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:6f0f56f8fc61bcd795f6a3d03e8ce6bee09ebaa64425eb08327e975f906d98be"}, + {file = "pyiceberg-0.8.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:d7099c6631743ad29c451de2bebd9ed3c96c42bcb1fe5d5d5c93aec895858e3f"}, + {file = "pyiceberg-0.8.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d6436f5a782491115f64131882a737d77c9dc0040493e1b7f9b3081ea8cf6a26"}, + {file = "pyiceberg-0.8.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:c1d75b40a98a327f7436eb0d6187c51834c44b79adf61c6945b33645f4afbf17"}, + {file = "pyiceberg-0.8.1-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:8de988fa2363e6a51b40b85b5ff1e8261cda5bfc14ac54dd4ebe58391b95acae"}, + {file = "pyiceberg-0.8.1-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:dd06c5b606011155aa0b76e7b001e30f1c40ab2fb3eeb8a0652b88629259c2bb"}, + {file = "pyiceberg-0.8.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e8142f0dbc12dda0e6d7aaf564a3fbb0f17fc934630e7cf866773c8caaebf666"}, + {file = "pyiceberg-0.8.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:6126ee3a46ff975f15abf2085f184591d21643bffb96330907e003eea0b63005"}, + {file = "pyiceberg-0.8.1.tar.gz", hash = "sha256:4502f0cfddf6f7cd48b9cd54016bce0ab94052b0ab01efcfa515879074f4c8e3"}, +] + +[package.dependencies] +cachetools = ">=5.5.0,<6.0.0" +click = ">=7.1.1,<9.0.0" +fsspec = ">=2023.1.0" +mmh3 = ">=4.0.0,<6.0.0" +pydantic = ">=2.0,<2.4.0 || >2.4.0,<2.4.1 || >2.4.1,<3.0" +pyparsing = ">=3.1.0,<4.0.0" +requests = 
">=2.20.0,<3.0.0" +rich = ">=10.11.0,<14.0.0" +sortedcontainers = "2.4.0" +strictyaml = ">=1.7.0,<2.0.0" +tenacity = ">=8.2.3,<10.0.0" + +[package.extras] +adlfs = ["adlfs (>=2023.1.0)"] +daft = ["getdaft (>=0.2.12)"] +duckdb = ["duckdb (>=0.5.0,<2.0.0)", "pyarrow (>=14.0.0,<19.0.0)"] +dynamodb = ["boto3 (>=1.24.59)"] +gcsfs = ["gcsfs (>=2023.1.0)"] +glue = ["boto3 (>=1.24.59)", "mypy-boto3-glue (>=1.28.18)"] +hive = ["thrift (>=0.13.0,<1.0.0)"] +pandas = ["pandas (>=1.0.0,<3.0.0)", "pyarrow (>=14.0.0,<19.0.0)"] +pyarrow = ["pyarrow (>=14.0.0,<19.0.0)"] +ray = ["pandas (>=1.0.0,<3.0.0)", "pyarrow (>=14.0.0,<19.0.0)", "ray (==2.10.0)", "ray (>=2.10.0,<3.0.0)"] +s3fs = ["s3fs (>=2023.1.0)"] +snappy = ["python-snappy (>=0.6.0,<1.0.0)"] +sql-postgres = ["psycopg2-binary (>=2.9.6)", "sqlalchemy (>=2.0.18,<3.0.0)"] +sql-sqlite = ["sqlalchemy (>=2.0.18,<3.0.0)"] +zstandard = ["zstandard (>=0.13.0,<1.0.0)"] + [[package]] name = "pyjwt" version = "2.8.0" @@ -9165,13 +9359,13 @@ typing-extensions = "*" [[package]] name = "sqlglot" -version = "25.24.5" +version = "25.20.2" description = "An easily customizable SQL parser and transpiler" optional = false python-versions = ">=3.7" files = [ - {file = "sqlglot-25.24.5-py3-none-any.whl", hash = "sha256:f8a8870d1f5cdd2e2dc5c39a5030a0c7b0a91264fb8972caead3dac8e8438873"}, - {file = "sqlglot-25.24.5.tar.gz", hash = "sha256:6d3d604034301ca3b614d6b4148646b4033317b7a93d1801e9661495eb4b4fcf"}, + {file = "sqlglot-25.20.2-py3-none-any.whl", hash = "sha256:cdbfd7ce3f2f39f32bd7b4c23fd9e0fd261636a6b14285b914e8def25fd0a567"}, + {file = "sqlglot-25.20.2.tar.gz", hash = "sha256:169fe8308dd70d7bd40117b2221b62bdc7c4e2ea8eb07394b2a6146cdedf05ab"}, ] [package.extras] @@ -9226,6 +9420,20 @@ files = [ [package.dependencies] pbr = ">=2.0.0,<2.1.0 || >2.1.0" +[[package]] +name = "strictyaml" +version = "1.7.3" +description = "Strict, typed YAML parser" +optional = true +python-versions = ">=3.7.0" +files = [ + {file = "strictyaml-1.7.3-py3-none-any.whl", hash = "sha256:fb5c8a4edb43bebb765959e420f9b3978d7f1af88c80606c03fb420888f5d1c7"}, + {file = "strictyaml-1.7.3.tar.gz", hash = "sha256:22f854a5fcab42b5ddba8030a0e4be51ca89af0267961c8d6cfa86395586c407"}, +] + +[package.dependencies] +python-dateutil = ">=2.6.0" + [[package]] name = "sympy" version = "1.12" @@ -9548,13 +9756,13 @@ files = [ [[package]] name = "toolz" -version = "1.0.0" +version = "0.12.1" description = "List processing tools and functional utilities" -optional = true -python-versions = ">=3.8" +optional = false +python-versions = ">=3.7" files = [ - {file = "toolz-1.0.0-py3-none-any.whl", hash = "sha256:292c8f1c4e7516bf9086f8850935c799a874039c8bcf959d47b600e4c44a6236"}, - {file = "toolz-1.0.0.tar.gz", hash = "sha256:2c86e3d9a04798ac556793bced838816296a2f085017664e4995cb40a1047a02"}, + {file = "toolz-0.12.1-py3-none-any.whl", hash = "sha256:d22731364c07d72eea0a0ad45bafb2c2937ab6fd38a3507bf55eae8744aa7d85"}, + {file = "toolz-0.12.1.tar.gz", hash = "sha256:ecca342664893f177a13dac0e6b41cbd8ac25a358e5f215316d43e2100224f4d"}, ] [[package]] @@ -10429,7 +10637,7 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p name = "zstandard" version = "0.22.0" description = "Zstandard bindings for Python" -optional = true +optional = false python-versions = ">=3.8" files = [ {file = "zstandard-0.22.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:275df437ab03f8c033b8a2c181e51716c32d831082d93ce48002a5227ec93019"}, @@ -10492,7 +10700,7 @@ az = ["adlfs"] bigquery = ["db-dtypes", "gcsfs", 
"google-cloud-bigquery", "grpcio", "pyarrow"] cli = ["cron-descriptor", "pipdeptree"] clickhouse = ["adlfs", "clickhouse-connect", "clickhouse-driver", "gcsfs", "pyarrow", "s3fs"] -databricks = ["databricks-sql-connector"] +databricks = ["databricks-sdk", "databricks-sql-connector"] deltalake = ["deltalake", "pyarrow"] dremio = ["pyarrow"] duckdb = ["duckdb"] @@ -10505,6 +10713,7 @@ mssql = ["pyodbc"] parquet = ["pyarrow"] postgis = ["psycopg2-binary", "psycopg2cffi"] postgres = ["psycopg2-binary", "psycopg2cffi"] +pyiceberg = ["pyarrow", "pyiceberg", "sqlalchemy"] qdrant = ["qdrant-client"] redshift = ["psycopg2-binary", "psycopg2cffi"] s3 = ["botocore", "s3fs"] @@ -10518,4 +10727,4 @@ weaviate = ["weaviate-client"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<3.13" -content-hash = "c0607d05ab37a1a6addf3ae7264bf5972cb6ce6e46df1dcdc2da3cff72e5008e" +content-hash = "5513aca05ae04d7941f2a890d0fefa86a08371508a2d319c1e558c29ff8a45f3" diff --git a/pyproject.toml b/pyproject.toml index 7377b03fde..646ed215a3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "dlt" -version = "1.4.1" +version = "1.5.0" description = "dlt is an open-source python-first scalable data loading library that does not require any backend to run." authors = ["dltHub Inc. "] maintainers = [ "Marcin Rudolf ", "Adrian Brudaru ", "Anton Burnashev ", "David Scharf " ] @@ -29,7 +29,7 @@ requests = ">=2.26.0" pendulum = ">=2.1.2" simplejson = ">=3.17.5" PyYAML = ">=5.4.1" -semver = ">=2.13.0" +semver = ">=3.0.0" hexbytes = ">=0.2.2" tzdata = ">=2022.1" tomlkit = ">=0.11.3" @@ -75,7 +75,7 @@ cron-descriptor = {version = ">=1.2.32", optional = true} pipdeptree = {version = ">=2.9.0,<2.10", optional = true} pyathena = {version = ">=2.9.6", optional = true} weaviate-client = {version = ">=3.22", optional = true} -adlfs = {version = ">=2022.4.0", optional = true} +adlfs = {version = ">=2024.7.0", optional = true} pyodbc = {version = ">=4.0.39", optional = true} qdrant-client = {version = ">=1.8", optional = true, extras = ["fastembed"]} databricks-sql-connector = {version = ">=2.9.3", optional = true} @@ -89,6 +89,13 @@ alembic = {version = ">1.10.0", optional = true} paramiko = {version = ">=3.3.0", optional = true} sqlglot = {version = ">=20.0.0", optional = true} db-dtypes = { version = ">=1.2.0", optional = true } +# `sql-sqlite` extra leads to dependency conflict with `apache-airflow` because `apache-airflow` +# requires `sqlalchemy<2.0.0` while the extra requires `sqlalchemy>=2.0.18` +# https://github.com/apache/airflow/issues/28723 +# pyiceberg = { version = ">=0.7.1", optional = true, extras = ["sql-sqlite"] } +# we will rely on manual installation of `sqlalchemy>=2.0.18` instead +pyiceberg = { version = ">=0.8.1", python = ">=3.9", optional = true } +databricks-sdk = {version = ">=0.38.0", optional = true} [tool.poetry.extras] gcp = ["grpcio", "google-cloud-bigquery", "db-dtypes", "gcsfs"] @@ -111,13 +118,14 @@ weaviate = ["weaviate-client"] mssql = ["pyodbc"] synapse = ["pyodbc", "adlfs", "pyarrow"] qdrant = ["qdrant-client"] -databricks = ["databricks-sql-connector"] +databricks = ["databricks-sql-connector", "databricks-sdk"] clickhouse = ["clickhouse-driver", "clickhouse-connect", "s3fs", "gcsfs", "adlfs", "pyarrow"] dremio = ["pyarrow"] lancedb = ["lancedb", "pyarrow", "tantivy"] deltalake = ["deltalake", "pyarrow"] sql_database = ["sqlalchemy"] sqlalchemy = ["sqlalchemy", "alembic"] +pyiceberg = ["pyiceberg", "pyarrow", "sqlalchemy"] postgis = ["psycopg2-binary", 
"psycopg2cffi"] [tool.poetry.scripts] @@ -134,7 +142,7 @@ sqlfluff = "^2.3.2" types-deprecated = "^1.2.9.2" pytest-console-scripts = "^1.4.1" pytest = "^7.0.0" -mypy = "^1.10.0" +mypy = ">=1.11.0,<1.13.0" flake8 = "^5.0.0" bandit = "^1.7.0" black = "^23.7.0" @@ -167,7 +175,6 @@ pytest-mock = "^3.14.0" types-regex = "^2024.5.15.20240519" flake8-print = "^5.0.0" mimesis = "^7.0.0" -ibis-framework = { version = ">=9.0.0", markers = "python_version >= '3.10'", optional = true, extras = ["duckdb", "postgres", "bigquery", "snowflake", "mssql", "clickhouse"]} shapely = ">=2.0.6" [tool.poetry.group.sources] @@ -205,6 +212,12 @@ optional = true [tool.poetry.group.airflow.dependencies] apache-airflow = {version = "^2.8.0", markers = "python_version < '3.12'"} +[tool.poetry.group.ibis] +optional = true + +[tool.poetry.group.ibis.dependencies] +ibis-framework = { version = ">=9.0.0,<10.0.0", markers = "python_version >= '3.10'", extras = ["duckdb", "postgres", "bigquery", "snowflake", "mssql", "clickhouse"]} + [tool.poetry.group.providers] optional = true diff --git a/tests/cli/test_init_command.py b/tests/cli/test_init_command.py index 8e1affd164..d81cd8c858 100644 --- a/tests/cli/test_init_command.py +++ b/tests/cli/test_init_command.py @@ -262,6 +262,21 @@ def test_init_all_sources_isolated(cloned_init_repo: FileStorage) -> None: assert_index_version_constraint(files, candidate) +def test_init_core_sources_ejected(cloned_init_repo: FileStorage) -> None: + repo_dir = get_repo_dir(cloned_init_repo) + # ensure we test both sources form verified sources and core sources + source_candidates = set(CORE_SOURCES) + for candidate in source_candidates: + clean_test_storage() + repo_dir = get_repo_dir(cloned_init_repo) + files = get_project_files(clear_all_sources=False) + with set_working_dir(files.storage_path): + init_command.init_command(candidate, "bigquery", repo_dir, eject_source=True) + assert_requirements_txt(files, "bigquery") + # check if files copied + assert files.has_folder(candidate) + + @pytest.mark.parametrize("destination_name", IMPLEMENTED_DESTINATIONS) def test_init_all_destinations( destination_name: str, project_files: FileStorage, repo_dir: str @@ -279,25 +294,6 @@ def test_custom_destination_note(repo_dir: str, project_files: FileStorage): assert "to add a destination function that will consume your data" in _out -@pytest.mark.parametrize("omit", [True, False]) -# this will break if we have new core sources that are not in verified sources anymore -@pytest.mark.parametrize("source", set(CORE_SOURCES) - {"rest_api"}) -def test_omit_core_sources( - source: str, omit: bool, project_files: FileStorage, repo_dir: str -) -> None: - with io.StringIO() as buf, contextlib.redirect_stdout(buf): - init_command.init_command(source, "destination", repo_dir, omit_core_sources=omit) - _out = buf.getvalue() - - # check messaging - assert ("Omitting dlt core sources" in _out) == omit - assert ("will no longer be copied from the" in _out) == (not omit) - - # if we omit core sources, there will be a folder with the name of the source from the verified sources repo - assert project_files.has_folder(source) == omit - assert (f"dlt.sources.{source}" in project_files.load(f"{source}_pipeline.py")) == (not omit) - - def test_init_code_update_index_diff(repo_dir: str, project_files: FileStorage) -> None: sources_storage = FileStorage(os.path.join(repo_dir, SOURCES_MODULE_NAME)) new_content = '"""New docstrings"""' diff --git a/tests/common/configuration/test_toml_provider.py 
b/tests/common/configuration/test_toml_provider.py index 481c21b7bb..9538849976 100644 --- a/tests/common/configuration/test_toml_provider.py +++ b/tests/common/configuration/test_toml_provider.py @@ -4,6 +4,7 @@ import yaml from typing import Any, Dict, Type import datetime # noqa: I251 +from unittest.mock import Mock import dlt from dlt.common import pendulum, json @@ -538,11 +539,28 @@ def loader() -> Dict[str, Any]: def test_colab_toml() -> None: + import builtins + # use a path without any settings files try: sys.path.append("tests/common/cases/modules") - # secrets are in user data + + # ipython not present provider: SettingsTomlProvider = SecretsTomlProvider("tests/common/null", global_dir=None) + assert provider.is_empty + + get_ipython_m = Mock() + get_ipython_m.return_value = "google.colab.Shell" + # make it available to all modules + builtins.get_ipython = get_ipython_m # type: ignore[attr-defined] + # test mock + assert get_ipython() == "google.colab.Shell" # type: ignore[name-defined] # noqa + from dlt.common.runtime.exec_info import is_notebook + + assert is_notebook() + + # secrets are in user data + provider = SecretsTomlProvider("tests/common/null", global_dir=None) assert provider.to_toml() == 'api_key="api"' # config is not in userdata provider = ConfigTomlProvider("tests/common/null", "unknown") @@ -551,4 +569,5 @@ def test_colab_toml() -> None: provider = SecretsTomlProvider("tests/common/cases/configuration/.dlt", global_dir=None) assert provider.get_value("secret_value", str, None) == ("2137", "secret_value") finally: + delattr(builtins, "get_ipython") sys.path.pop() diff --git a/tests/common/normalizers/test_json_relational.py b/tests/common/normalizers/test_json_relational.py index 35bc80add2..c35ecdef7f 100644 --- a/tests/common/normalizers/test_json_relational.py +++ b/tests/common/normalizers/test_json_relational.py @@ -880,6 +880,35 @@ def test_propagation_update_on_table_change(norm: RelationalNormalizer): "table_3" ] == {"_dlt_id": "_dlt_root_id", "prop1": "prop2"} + # force propagation when table has nested table that needs root_key + # also use custom name for row_key + table_4 = new_table( + "table_4", write_disposition="replace", columns=[{"name": "primary_key", "row_key": True}] + ) + table_4_nested = new_table( + "table_4__nested", + parent_table_name="table_4", + columns=[{"name": "_dlt_root_id", "root_key": True}], + ) + # must add table_4 first + norm.schema.update_table(table_4) + norm.schema.update_table(table_4_nested) + # row key table_4 not propagated because it was added before nested that needs that + # TODO: maybe fix it + assert ( + "table_4" not in norm.schema._normalizers_config["json"]["config"]["propagation"]["tables"] + ) + norm.schema.update_table(table_4) + # also custom key was used + assert norm.schema._normalizers_config["json"]["config"]["propagation"]["tables"][ + "table_4" + ] == {"primary_key": "_dlt_root_id"} + # drop table from schema + norm.schema.drop_tables(["table_4"]) + assert ( + "table_4" not in norm.schema._normalizers_config["json"]["config"]["propagation"]["tables"] + ) + def test_caching_perf(norm: RelationalNormalizer) -> None: from time import time @@ -893,6 +922,10 @@ def test_caching_perf(norm: RelationalNormalizer) -> None: print(f"{time() - start}") +def test_extend_table(norm: RelationalNormalizer) -> None: + pass + + def set_max_nesting(norm: RelationalNormalizer, max_nesting: int) -> None: RelationalNormalizer.update_normalizer_config(norm.schema, {"max_nesting": max_nesting}) norm._reset() diff --git 
a/tests/common/schema/test_inference.py b/tests/common/schema/test_inference.py index 7f06cdb71e..adbb34b1f0 100644 --- a/tests/common/schema/test_inference.py +++ b/tests/common/schema/test_inference.py @@ -441,27 +441,6 @@ def test_update_schema_table_prop_conflict(schema: Schema) -> None: assert exc_val.value.val2 == "tab_parent" -def test_update_schema_column_conflict(schema: Schema) -> None: - tab1 = utils.new_table( - "tab1", - write_disposition="append", - columns=[ - {"name": "col1", "data_type": "text", "nullable": False}, - ], - ) - schema.update_table(tab1) - tab1_u1 = deepcopy(tab1) - # simulate column that had other datatype inferred - tab1_u1["columns"]["col1"]["data_type"] = "bool" - with pytest.raises(CannotCoerceColumnException) as exc_val: - schema.update_table(tab1_u1) - assert exc_val.value.column_name == "col1" - assert exc_val.value.from_type == "bool" - assert exc_val.value.to_type == "text" - # whole column mismatch - assert exc_val.value.coerced_value is None - - def _add_preferred_types(schema: Schema) -> None: schema._settings["preferred_types"] = {} schema._settings["preferred_types"][TSimpleRegex("timestamp")] = "timestamp" diff --git a/tests/common/schema/test_merges.py b/tests/common/schema/test_merges.py index 8e0c350e7c..b76fe944b5 100644 --- a/tests/common/schema/test_merges.py +++ b/tests/common/schema/test_merges.py @@ -353,7 +353,7 @@ def test_diff_tables() -> None: assert "test" in partial["columns"] -def test_diff_tables_conflicts() -> None: +def test_tables_conflicts() -> None: # conflict on parents table: TTableSchema = { # type: ignore[typeddict-unknown-key] "name": "table", @@ -366,6 +366,8 @@ def test_diff_tables_conflicts() -> None: other = utils.new_table("table") with pytest.raises(TablePropertiesConflictException) as cf_ex: utils.diff_table("schema", table, other) + with pytest.raises(TablePropertiesConflictException) as cf_ex: + utils.ensure_compatible_tables("schema", table, other) assert cf_ex.value.table_name == "table" assert cf_ex.value.prop_name == "parent" @@ -373,6 +375,8 @@ def test_diff_tables_conflicts() -> None: other = utils.new_table("other_name") with pytest.raises(TablePropertiesConflictException) as cf_ex: utils.diff_table("schema", table, other) + with pytest.raises(TablePropertiesConflictException) as cf_ex: + utils.ensure_compatible_tables("schema", table, other) assert cf_ex.value.table_name == "table" assert cf_ex.value.prop_name == "name" @@ -380,7 +384,10 @@ def test_diff_tables_conflicts() -> None: changed = deepcopy(table) changed["columns"]["test"]["data_type"] = "bigint" with pytest.raises(CannotCoerceColumnException): - utils.diff_table("schema", table, changed) + utils.ensure_compatible_tables("schema", table, changed) + # but diff now accepts different data types + merged_table = utils.diff_table("schema", table, changed) + assert merged_table["columns"]["test"]["data_type"] == "bigint" def test_merge_tables() -> None: diff --git a/tests/common/test_time.py b/tests/common/test_time.py index 8c25983d46..9c7a1567e2 100644 --- a/tests/common/test_time.py +++ b/tests/common/test_time.py @@ -132,8 +132,26 @@ def test_datetime_to_timestamp_helpers( [ ("2024-10-20T15:30:00Z", "%Y-%m-%dT%H:%M:%SZ"), # UTC 'Z' ("2024-10-20T15:30:00.123456Z", "%Y-%m-%dT%H:%M:%S.%fZ"), # UTC 'Z' with fractional seconds - ("2024-10-20T15:30:00+02:00", "%Y-%m-%dT%H:%M:%S%z"), # Timezone offset - ("2024-10-20T15:30:00+0200", "%Y-%m-%dT%H:%M:%S%z"), # Timezone without colon + ("2024-10-20T15:30:00+02:00", "%Y-%m-%dT%H:%M:%S%z"), # Positive 
timezone offset + ("2024-10-20T15:30:00+0200", "%Y-%m-%dT%H:%M:%S%z"), # Positive timezone offset (no colon) + ( + "2024-10-20T15:30:00.123456+02:00", + "%Y-%m-%dT%H:%M:%S.%f%z", + ), # Positive timezone offset with fractional seconds + ( + "2024-10-20T15:30:00.123456+0200", + "%Y-%m-%dT%H:%M:%S.%f%z", + ), # Positive timezone offset with fractional seconds (no colon) + ("2024-10-20T15:30:00-02:00", "%Y-%m-%dT%H:%M:%S%z"), # Negative timezone offset + ("2024-10-20T15:30:00-0200", "%Y-%m-%dT%H:%M:%S%z"), # Negative timezone offset (no colon) + ( + "2024-10-20T15:30:00.123456-02:00", + "%Y-%m-%dT%H:%M:%S.%f%z", + ), # Negative timezone offset with fractional seconds + ( + "2024-10-20T15:30:00.123456-0200", + "%Y-%m-%dT%H:%M:%S.%f%z", + ), # Negative timezone offset with fractional seconds (no colon) ("2024-10-20T15:30:00", "%Y-%m-%dT%H:%M:%S"), # No timezone ("2024-10-20T15:30", "%Y-%m-%dT%H:%M"), # Minute precision ("2024-10-20T15", "%Y-%m-%dT%H"), # Hour precision diff --git a/tests/common/test_typing.py b/tests/common/test_typing.py index 2749e3ebb1..e81c3e7fa2 100644 --- a/tests/common/test_typing.py +++ b/tests/common/test_typing.py @@ -43,6 +43,7 @@ is_union_type, is_annotated, is_callable_type, + add_value_to_literal, ) @@ -293,3 +294,19 @@ def test_secret_type() -> None: assert TSecretStrValue("x_str") == "x_str" assert TSecretStrValue({}) == "{}" + + +def test_add_value_to_literal() -> None: + TestLiteral = Literal["red", "blue"] + + add_value_to_literal(TestLiteral, "green") + + assert get_args(TestLiteral) == ("red", "blue", "green") + + add_value_to_literal(TestLiteral, "red") + assert get_args(TestLiteral) == ("red", "blue", "green") + + TestSingleLiteral = Literal["red"] + add_value_to_literal(TestSingleLiteral, "green") + add_value_to_literal(TestSingleLiteral, "blue") + assert get_args(TestSingleLiteral) == ("red", "green", "blue") diff --git a/tests/conftest.py b/tests/conftest.py index 6088fa976c..a5a349f8d9 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -120,6 +120,9 @@ def _create_pipeline_instance_id(self) -> str: # disable googleapiclient logging logging.getLogger("googleapiclient.discovery_cache").setLevel("WARNING") + # disable pyiceberg logging + logging.getLogger("pyiceberg").setLevel("WARNING") + # reset and init airflow db import warnings diff --git a/tests/destinations/test_readable_dbapi_dataset.py b/tests/destinations/test_readable_dbapi_dataset.py index 4745735371..e3b318e8d4 100644 --- a/tests/destinations/test_readable_dbapi_dataset.py +++ b/tests/destinations/test_readable_dbapi_dataset.py @@ -2,60 +2,60 @@ import dlt import pytest -from dlt.destinations.dataset import ( +from dlt.destinations.dataset.exceptions import ( ReadableRelationHasQueryException, ReadableRelationUnknownColumnException, ) def test_query_builder() -> None: - dataset = dlt.pipeline(destination="duckdb", pipeline_name="pipeline")._dataset() + dataset = dlt.pipeline(destination="duckdb", pipeline_name="pipeline").dataset() # default query for a table - assert dataset.my_table.query.strip() == 'SELECT * FROM "pipeline_dataset"."my_table"' # type: ignore[attr-defined] + assert dataset.my_table.query.strip() == 'SELECT * FROM "pipeline_dataset"."my_table"' # head query assert ( - dataset.my_table.head().query.strip() # type: ignore[attr-defined] + dataset.my_table.head().query.strip() == 'SELECT * FROM "pipeline_dataset"."my_table" LIMIT 5' ) # limit query assert ( - dataset.my_table.limit(24).query.strip() # type: ignore[attr-defined] + dataset.my_table.limit(24).query.strip() 
== 'SELECT * FROM "pipeline_dataset"."my_table" LIMIT 24' ) # select columns assert ( - dataset.my_table.select("col1", "col2").query.strip() # type: ignore[attr-defined] + dataset.my_table.select("col1", "col2").query.strip() == 'SELECT "col1","col2" FROM "pipeline_dataset"."my_table"' ) # also indexer notation assert ( - dataset.my_table[["col1", "col2"]].query.strip() # type: ignore[attr-defined] + dataset.my_table[["col1", "col2"]].query.strip() == 'SELECT "col1","col2" FROM "pipeline_dataset"."my_table"' ) # identifiers are normalized assert ( - dataset["MY_TABLE"].select("CoL1", "cOl2").query.strip() # type: ignore[attr-defined] + dataset["MY_TABLE"].select("CoL1", "cOl2").query.strip() == 'SELECT "co_l1","c_ol2" FROM "pipeline_dataset"."my_table"' ) assert ( - dataset["MY__TABLE"].select("Co__L1", "cOl2").query.strip() # type: ignore[attr-defined] + dataset["MY__TABLE"].select("Co__L1", "cOl2").query.strip() == 'SELECT "co__l1","c_ol2" FROM "pipeline_dataset"."my__table"' ) # limit and select chained assert ( - dataset.my_table.select("col1", "col2").limit(24).query.strip() # type: ignore[attr-defined] + dataset.my_table.select("col1", "col2").limit(24).query.strip() == 'SELECT "col1","col2" FROM "pipeline_dataset"."my_table" LIMIT 24' ) def test_copy_and_chaining() -> None: - dataset = dlt.pipeline(destination="duckdb", pipeline_name="pipeline")._dataset() + dataset = dlt.pipeline(destination="duckdb", pipeline_name="pipeline").dataset() # create releation and set some stuff on it relation = dataset.items @@ -65,22 +65,22 @@ def test_copy_and_chaining() -> None: relation2 = relation.__copy__() assert relation != relation2 - assert relation._limit == relation2._limit # type: ignore[attr-defined] - assert relation._table_name == relation2._table_name # type: ignore[attr-defined] - assert relation._provided_query == relation2._provided_query # type: ignore[attr-defined] - assert relation._selected_columns == relation2._selected_columns # type: ignore[attr-defined] + assert relation._limit == relation2._limit + assert relation._table_name == relation2._table_name + assert relation._provided_query == relation2._provided_query + assert relation._selected_columns == relation2._selected_columns # test copy while chaining limit relation3 = relation2.limit(22) assert relation2 != relation3 - assert relation2._limit != relation3._limit # type: ignore[attr-defined] + assert relation2._limit != relation3._limit # test last setting prevails chaining - assert relation.limit(23).limit(67).limit(11)._limit == 11 # type: ignore[attr-defined] + assert relation.limit(23).limit(67).limit(11)._limit == 11 def test_computed_schema_columns() -> None: - dataset = dlt.pipeline(destination="duckdb", pipeline_name="pipeline")._dataset() + dataset = dlt.pipeline(destination="duckdb", pipeline_name="pipeline").dataset() relation = dataset.items # no schema present @@ -107,7 +107,7 @@ def test_computed_schema_columns() -> None: def test_prevent_changing_relation_with_query() -> None: - dataset = dlt.pipeline(destination="duckdb", pipeline_name="pipeline")._dataset() + dataset = dlt.pipeline(destination="duckdb", pipeline_name="pipeline").dataset() relation = dataset("SELECT * FROM something") with pytest.raises(ReadableRelationHasQueryException): diff --git a/tests/extract/test_extract_pipe.py b/tests/extract/test_extract_pipe.py index d40639a594..659888269a 100644 --- a/tests/extract/test_extract_pipe.py +++ b/tests/extract/test_extract_pipe.py @@ -10,7 +10,8 @@ from dlt.common import sleep from 
dlt.common.typing import TDataItems from dlt.extract.exceptions import CreatePipeException, ResourceExtractionError, UnclosablePipe -from dlt.extract.items import DataItemWithMeta, FilterItem, MapItem, YieldMapItem +from dlt.extract.items import DataItemWithMeta +from dlt.extract.items_transform import FilterItem, MapItem, YieldMapItem from dlt.extract.pipe import Pipe from dlt.extract.pipe_iterator import PipeIterator, ManagedPipeIterator, PipeItem diff --git a/tests/extract/test_incremental.py b/tests/extract/test_incremental.py index 725872b621..9ad7d28e88 100644 --- a/tests/extract/test_incremental.py +++ b/tests/extract/test_incremental.py @@ -5,8 +5,9 @@ from datetime import datetime, date # noqa: I251 from itertools import chain, count from time import sleep -from typing import Any, Optional, Literal, Sequence, Dict +from typing import Any, Optional, Literal, Sequence, Dict, Iterable from unittest import mock +import itertools import duckdb import pyarrow as pa @@ -35,7 +36,7 @@ IncrementalPrimaryKeyMissing, ) from dlt.extract.incremental.lag import apply_lag -from dlt.extract.items import ValidateItem +from dlt.extract.items_transform import ValidateItem from dlt.extract.resource import DltResource from dlt.pipeline.exceptions import PipelineStepFailed from dlt.sources.helpers.transform import take_first @@ -228,7 +229,7 @@ def test_pandas_index_as_dedup_key() -> None: no_index_r = some_data.with_name(new_name="no_index") p.run(no_index_r) p.run(no_index_r) - data_ = p._dataset().no_index.arrow() + data_ = p.dataset().no_index.arrow() assert data_.schema.names == ["created_at", "id"] assert data_["id"].to_pylist() == ["a", "b", "c", "d", "e", "f", "g"] @@ -240,7 +241,7 @@ def test_pandas_index_as_dedup_key() -> None: unnamed_index_r.incremental.primary_key = "__index_level_0__" p.run(unnamed_index_r) p.run(unnamed_index_r) - data_ = p._dataset().unnamed_index.arrow() + data_ = p.dataset().unnamed_index.arrow() assert data_.schema.names == ["created_at", "id", "index_level_0"] # indexes 2 and 3 are removed from second batch because they were in the previous batch # and the created_at overlapped so they got deduplicated @@ -258,7 +259,7 @@ def _make_named_index(df_: pd.DataFrame) -> pd.DataFrame: named_index_r.incremental.primary_key = "order_id" p.run(named_index_r) p.run(named_index_r) - data_ = p._dataset().named_index.arrow() + data_ = p.dataset().named_index.arrow() assert data_.schema.names == ["created_at", "id", "order_id"] assert data_["order_id"].to_pylist() == [0, 1, 2, 3, 4, 0, 1, 4] @@ -268,7 +269,7 @@ def _make_named_index(df_: pd.DataFrame) -> pd.DataFrame: ) p.run(named_index_impl_r) p.run(named_index_impl_r) - data_ = p._dataset().named_index_impl.arrow() + data_ = p.dataset().named_index_impl.arrow() assert data_.schema.names == ["created_at", "id"] assert data_["id"].to_pylist() == ["a", "b", "c", "d", "e", "f", "g"] @@ -1522,6 +1523,7 @@ def some_data(last_timestamp=dlt.sources.incremental("ts", primary_key=())): @pytest.mark.parametrize("item_type", ALL_TEST_DATA_ITEM_FORMATS) def test_apply_hints_incremental(item_type: TestDataItemFormat) -> None: + os.environ["COMPLETED_PROB"] = "1.0" # make it complete immediately p = dlt.pipeline(pipeline_name=uniq_id(), destination="dummy") data = [{"created_at": 1}, {"created_at": 2}, {"created_at": 3}] source_items = data_to_item_format(item_type, data) @@ -3851,3 +3853,166 @@ def some_data(): for col in table_schema["columns"].values(): assert "incremental" not in col + + +@pytest.mark.parametrize("item_type", 
ALL_TEST_DATA_ITEM_FORMATS) +@pytest.mark.parametrize("last_value_func", [min, max]) +def test_start_range_open(item_type: TestDataItemFormat, last_value_func: Any) -> None: + data_range: Iterable[int] = range(1, 12) + if last_value_func == max: + initial_value = 5 + # Only items higher than the initial value are extracted + expected_items = list(range(6, 12)) + order_dir = "ASC" + elif last_value_func == min: + data_range = reversed(data_range) # type: ignore[call-overload] + initial_value = 5 + # Only items lower than the initial value are extracted + expected_items = list(reversed(range(1, 5))) + order_dir = "DESC" + + @dlt.resource + def some_data( + updated_at: dlt.sources.incremental[int] = dlt.sources.incremental( + "updated_at", + initial_value=initial_value, + range_start="open", + last_value_func=last_value_func, + ), + ) -> Any: + data = [{"updated_at": i} for i in data_range] + yield data_to_item_format(item_type, data) + + pipeline = dlt.pipeline(pipeline_name=uniq_id(), destination="duckdb") + pipeline.run(some_data()) + + with pipeline.sql_client() as client: + items = [ + row[0] + for row in client.execute_sql( + f"SELECT updated_at FROM some_data ORDER BY updated_at {order_dir}" + ) + ] + + assert items == expected_items + + +@pytest.mark.parametrize("item_type", ALL_TEST_DATA_ITEM_FORMATS) +def test_start_range_open_no_deduplication(item_type: TestDataItemFormat) -> None: + @dlt.source + def dummy(): + @dlt.resource + def some_data( + updated_at: dlt.sources.incremental[int] = dlt.sources.incremental( + "updated_at", + range_start="open", + ) + ): + yield [{"updated_at": i} for i in range(3)] + + yield some_data + + pipeline = dlt.pipeline(pipeline_name=uniq_id()) + pipeline.extract(dummy()) + + state = pipeline.state["sources"]["dummy"]["resources"]["some_data"]["incremental"][ + "updated_at" + ] + + # No unique values should be computed + assert state["unique_hashes"] == [] + + +@pytest.mark.parametrize("item_type", ALL_TEST_DATA_ITEM_FORMATS) +@pytest.mark.parametrize("last_value_func", [min, max]) +def test_end_range_closed(item_type: TestDataItemFormat, last_value_func: Any) -> None: + values = [5, 10] + expected_items = list(range(5, 11)) + if last_value_func == max: + order_dir = "ASC" + elif last_value_func == min: + values = list(reversed(values)) + expected_items = list(reversed(expected_items)) + order_dir = "DESC" + + @dlt.resource + def some_data( + updated_at: dlt.sources.incremental[int] = dlt.sources.incremental( + "updated_at", + initial_value=values[0], + end_value=values[1], + range_end="closed", + last_value_func=last_value_func, + ), + ) -> Any: + data = [{"updated_at": i} for i in range(1, 12)] + yield data_to_item_format(item_type, data) + + pipeline = dlt.pipeline(pipeline_name=uniq_id(), destination="duckdb") + pipeline.run(some_data()) + + with pipeline.sql_client() as client: + items = [ + row[0] + for row in client.execute_sql( + f"SELECT updated_at FROM some_data ORDER BY updated_at {order_dir}" + ) + ] + + # Includes values 5-10 inclusive + assert items == expected_items + + +@pytest.mark.parametrize("offset_by_last_value", [True, False]) +def test_incremental_and_limit(offset_by_last_value: bool): + resource_called = 0 + + # here we check incremental combined with limit, once when the last value cannot be used + # to offset the source, and once when it can.
+ + @dlt.resource( + table_name="items", + ) + def resource( + incremental=dlt.sources.incremental(cursor_path="id", initial_value=-1, row_order="asc") + ): + range_iterator = ( + range(incremental.start_value + 1, 1000) if offset_by_last_value else range(1000) + ) + for i in range_iterator: + nonlocal resource_called + resource_called += 1 + yield { + "id": i, + "value": str(i), + } + + resource.add_limit(10) + + p = dlt.pipeline(pipeline_name="incremental_limit", destination="duckdb", dev_mode=True) + + p.run(resource()) + + # check we have the right number of items + assert len(p.dataset().items.df()) == 10 + assert resource_called == 10 + # check that we have items 0-9 + assert p.dataset().items.df().id.tolist() == list(range(10)) + + # run the next ten + p.run(resource()) + + # check we have the right number of items + assert len(p.dataset().items.df()) == 20 + assert resource_called == (20 if offset_by_last_value else 30) + # check that we have items 0-19 + assert p.dataset().items.df().id.tolist() == list(range(20)) + + # run the next batch + p.run(resource()) + + # check we have the right number of items + assert len(p.dataset().items.df()) == 30 + assert resource_called == (30 if offset_by_last_value else 60) + # check that we have items 0-29 + assert p.dataset().items.df().id.tolist() == list(range(30)) diff --git a/tests/extract/test_sources.py b/tests/extract/test_sources.py index 3d021d5d10..86646e6369 100644 --- a/tests/extract/test_sources.py +++ b/tests/extract/test_sources.py @@ -1,4 +1,6 @@ import itertools +import time + from typing import Iterator import pytest @@ -837,7 +839,7 @@ def test_limit_infinite_counter() -> None: @pytest.mark.parametrize("limit", (None, -1, 0, 10)) def test_limit_edge_cases(limit: int) -> None: - r = dlt.resource(range(20), name="infinity").add_limit(limit) # type: ignore + r = dlt.resource(range(20), name="resource").add_limit(limit) # type: ignore @dlt.resource() async def r_async(): @@ -845,22 +847,62 @@ async def r_async(): await asyncio.sleep(0.01) yield i + @dlt.resource(parallelized=True) + def parallelized_resource(): + for i in range(20): + yield i + sync_list = list(r) async_list = list(r_async().add_limit(limit)) + parallelized_list = list(parallelized_resource().add_limit(limit)) + + # all lists should be the same + assert sync_list == async_list == parallelized_list if limit == 10: assert sync_list == list(range(10)) - # we have edge cases where the async list will have one extra item - # possibly due to timing issues, maybe some other implementation problem - assert (async_list == list(range(10))) or (async_list == list(range(11))) elif limit in [None, -1]: - assert sync_list == async_list == list(range(20)) + assert sync_list == list(range(20)) elif limit == 0: - assert sync_list == async_list == [] + assert sync_list == [] else: raise AssertionError(f"Unexpected limit: {limit}") +def test_various_limit_setups() -> None: + # basic test + r = dlt.resource([1, 2, 3, 4, 5], name="test").add_limit(3) + assert list(r) == [1, 2, 3] + + # yield map test + r = ( + dlt.resource([1, 2, 3, 4, 5], name="test") + .add_map(lambda i: str(i) * i, 1) + .add_yield_map(lambda i: (yield from i)) + .add_limit(3) + ) + # limit is applied at the end + assert list(r) == ["1", "2", "2"] # "3" ,"3" ,"3" ,"4" ,"4" ,"4" ,"4", ...]
+ + # nested lists test (limit only applied to yields, not actual items) + r = dlt.resource([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]], name="test").add_limit(3) + assert list(r) == [1, 2, 3, 4, 5, 6, 7, 8, 9] + + # transformer test + r = dlt.resource([1, 2, 3, 4, 5], name="test").add_limit(4) + t = dlt.transformer(lambda i: i * 2, name="test") + assert list(r) == [1, 2, 3, 4] + assert list(r | t) == [2, 4, 6, 8] + + # adding limit to transformer is disregarded + t = t.add_limit(2) + assert list(r | t) == [2, 4, 6, 8] + + # limits are fully replaced (more generous limit applied later takes precedence) + r = dlt.resource([1, 2, 3, 4, 5], name="test").add_limit(3).add_limit(4) + assert list(r) == [1, 2, 3, 4] + + def test_limit_source() -> None: def mul_c(item): yield from "A" * (item + 2) @@ -876,6 +918,30 @@ def infinite_source(): assert list(infinite_source().add_limit(2)) == ["A", "A", 0, "A", "A", "A", 1] * 3 +def test_limit_max_time() -> None: + @dlt.resource() + def r(): + for i in range(100): + time.sleep(0.1) + yield i + + @dlt.resource() + async def r_async(): + for i in range(100): + await asyncio.sleep(0.1) + yield i + + sync_list = list(r().add_limit(max_time=1)) + async_list = list(r_async().add_limit(max_time=1)) + + # we should have extracted 10 items within 1 second, sleep is included in the resource + # we allow for some variance in the number of items, as the sleep is not super precise + # on macOS we even sometimes just get 4 items... + allowed_results = [list(range(i)) for i in [12, 11, 10, 9, 8, 7, 6, 5, 4]] + assert sync_list in allowed_results + assert async_list in allowed_results + + def test_source_state() -> None: @dlt.source def test_source(expected_state): diff --git a/tests/extract/test_validation.py b/tests/extract/test_validation.py index 138589bb06..3800f333f6 100644 --- a/tests/extract/test_validation.py +++ b/tests/extract/test_validation.py @@ -10,7 +10,7 @@ from dlt.common.libs.pydantic import BaseModel from dlt.extract import DltResource -from dlt.extract.items import ValidateItem +from dlt.extract.items_transform import ValidateItem from dlt.extract.validation import PydanticValidator from dlt.extract.exceptions import ResourceExtractionError from dlt.pipeline.exceptions import PipelineStepFailed diff --git a/tests/extract/utils.py b/tests/extract/utils.py index 7364ef7243..f1de3de093 100644 --- a/tests/extract/utils.py +++ b/tests/extract/utils.py @@ -6,7 +6,7 @@ from dlt.common.typing import TDataItem, TDataItems from dlt.extract.extract import ExtractStorage -from dlt.extract.items import ItemTransform +from dlt.extract.items_transform import ItemTransform from tests.utils import TestDataItemFormat diff --git a/tests/helpers/airflow_tests/test_airflow_provider.py b/tests/helpers/airflow_tests/test_airflow_provider.py index 43fb23e48a..2a8e46e2c8 100644 --- a/tests/helpers/airflow_tests/test_airflow_provider.py +++ b/tests/helpers/airflow_tests/test_airflow_provider.py @@ -1,3 +1,7 @@ +import pytest + +pytest.importorskip("airflow") + from airflow import DAG from airflow.decorators import task, dag from airflow.operators.python import PythonOperator diff --git a/tests/helpers/airflow_tests/test_airflow_wrapper.py b/tests/helpers/airflow_tests/test_airflow_wrapper.py index 69e48733e3..06603ffcec 100644 --- a/tests/helpers/airflow_tests/test_airflow_wrapper.py +++ b/tests/helpers/airflow_tests/test_airflow_wrapper.py @@ -2,6 +2,8 @@ import pytest from unittest import mock from typing import Iterator, List + +pytest.importorskip("airflow") + from
airflow import DAG from airflow.decorators import dag from airflow.operators.python import PythonOperator, get_current_context diff --git a/tests/helpers/airflow_tests/test_join_airflow_scheduler.py b/tests/helpers/airflow_tests/test_join_airflow_scheduler.py index d737f254e3..503aa62359 100644 --- a/tests/helpers/airflow_tests/test_join_airflow_scheduler.py +++ b/tests/helpers/airflow_tests/test_join_airflow_scheduler.py @@ -1,5 +1,8 @@ +import pytest import datetime from pendulum.tz import UTC + +pytest.importorskip("airflow") from airflow import DAG from airflow.decorators import dag, task from airflow.models import DagRun diff --git a/tests/helpers/airflow_tests/utils.py b/tests/helpers/airflow_tests/utils.py index a98ad4333a..4c1482a2ef 100644 --- a/tests/helpers/airflow_tests/utils.py +++ b/tests/helpers/airflow_tests/utils.py @@ -2,9 +2,6 @@ import os import argparse import pytest -from airflow.cli.commands.db_command import resetdb -from airflow.configuration import conf -from airflow.models.variable import Variable from dlt.common.configuration.container import Container from dlt.common.configuration.specs import PluggableRunContext @@ -19,6 +16,8 @@ @pytest.fixture(scope="function", autouse=True) def initialize_airflow_db(): + from airflow.models.variable import Variable + setup_airflow() # backup context providers providers = Container()[PluggableRunContext].providers @@ -35,6 +34,9 @@ def initialize_airflow_db(): def setup_airflow() -> None: + from airflow.cli.commands.db_command import resetdb + from airflow.configuration import conf + # Disable loading examples try: conf.add_section("core") diff --git a/tests/libs/test_csv_writer.py b/tests/libs/test_csv_writer.py index 3c30123e1c..a120cd048e 100644 --- a/tests/libs/test_csv_writer.py +++ b/tests/libs/test_csv_writer.py @@ -178,7 +178,7 @@ def test_non_utf8_binary(item_type: TestDataItemFormat) -> None: table = pq.read_table(f) else: table = data - writer_type: Type[DataWriter] = ArrowToCsvWriter if item_type == "arrow-table" else CsvWriter # type: ignore + writer_type: Type[DataWriter] = ArrowToCsvWriter if item_type == "arrow-table" else CsvWriter with pytest.raises(InvalidDataItem) as inv_ex: with get_writer(writer_type, disable_compression=True) as writer: @@ -195,7 +195,7 @@ def test_arrow_struct() -> None: @pytest.mark.parametrize("item_type", ["object", "arrow-table"]) def test_csv_writer_empty(item_type: TestDataItemFormat) -> None: - writer_type: Type[DataWriter] = ArrowToCsvWriter if item_type == "arrow-table" else CsvWriter # type: ignore + writer_type: Type[DataWriter] = ArrowToCsvWriter if item_type == "arrow-table" else CsvWriter with get_writer(writer_type, disable_compression=True) as writer: writer.write_empty_file(TABLE_UPDATE_COLUMNS_SCHEMA) diff --git a/tests/load/bigquery/test_bigquery_table_builder.py b/tests/load/bigquery/test_bigquery_table_builder.py index 56a674cfa3..b2857b7c08 100644 --- a/tests/load/bigquery/test_bigquery_table_builder.py +++ b/tests/load/bigquery/test_bigquery_table_builder.py @@ -107,25 +107,25 @@ def test_create_table(gcp_client: BigQueryClient) -> None: sqlfluff.parse(sql, dialect="bigquery") assert sql.startswith("CREATE TABLE") assert "event_test_table" in sql - assert "`col1` INT64 NOT NULL" in sql - assert "`col2` FLOAT64 NOT NULL" in sql - assert "`col3` BOOL NOT NULL" in sql - assert "`col4` TIMESTAMP NOT NULL" in sql + assert "`col1` INT64 NOT NULL" in sql + assert "`col2` FLOAT64 NOT NULL" in sql + assert "`col3` BOOL NOT NULL" in sql + assert "`col4` TIMESTAMP NOT 
NULL" in sql assert "`col5` STRING " in sql - assert "`col6` NUMERIC(38,9) NOT NULL" in sql + assert "`col6` NUMERIC(38,9) NOT NULL" in sql assert "`col7` BYTES" in sql assert "`col8` BIGNUMERIC" in sql - assert "`col9` JSON NOT NULL" in sql + assert "`col9` JSON NOT NULL" in sql assert "`col10` DATE" in sql assert "`col11` TIME" in sql - assert "`col1_precision` INT64 NOT NULL" in sql - assert "`col4_precision` TIMESTAMP NOT NULL" in sql + assert "`col1_precision` INT64 NOT NULL" in sql + assert "`col4_precision` TIMESTAMP NOT NULL" in sql assert "`col5_precision` STRING(25) " in sql - assert "`col6_precision` NUMERIC(6,2) NOT NULL" in sql + assert "`col6_precision` NUMERIC(6,2) NOT NULL" in sql assert "`col7_precision` BYTES(19)" in sql - assert "`col11_precision` TIME NOT NULL" in sql - assert "`col_high_p_decimal` BIGNUMERIC(76,0) NOT NULL" in sql - assert "`col_high_s_decimal` BIGNUMERIC(38,24) NOT NULL" in sql + assert "`col11_precision` TIME NOT NULL" in sql + assert "`col_high_p_decimal` BIGNUMERIC(76,0) NOT NULL" in sql + assert "`col_high_s_decimal` BIGNUMERIC(38,24) NOT NULL" in sql assert "CLUSTER BY" not in sql assert "PARTITION BY" not in sql @@ -137,29 +137,29 @@ def test_alter_table(gcp_client: BigQueryClient) -> None: assert sql.startswith("ALTER TABLE") assert sql.count("ALTER TABLE") == 1 assert "event_test_table" in sql - assert "ADD COLUMN `col1` INT64 NOT NULL" in sql - assert "ADD COLUMN `col2` FLOAT64 NOT NULL" in sql - assert "ADD COLUMN `col3` BOOL NOT NULL" in sql - assert "ADD COLUMN `col4` TIMESTAMP NOT NULL" in sql + assert "ADD COLUMN `col1` INT64 NOT NULL" in sql + assert "ADD COLUMN `col2` FLOAT64 NOT NULL" in sql + assert "ADD COLUMN `col3` BOOL NOT NULL" in sql + assert "ADD COLUMN `col4` TIMESTAMP NOT NULL" in sql assert "ADD COLUMN `col5` STRING" in sql - assert "ADD COLUMN `col6` NUMERIC(38,9) NOT NULL" in sql + assert "ADD COLUMN `col6` NUMERIC(38,9) NOT NULL" in sql assert "ADD COLUMN `col7` BYTES" in sql assert "ADD COLUMN `col8` BIGNUMERIC" in sql - assert "ADD COLUMN `col9` JSON NOT NULL" in sql + assert "ADD COLUMN `col9` JSON NOT NULL" in sql assert "ADD COLUMN `col10` DATE" in sql assert "ADD COLUMN `col11` TIME" in sql - assert "ADD COLUMN `col1_precision` INT64 NOT NULL" in sql - assert "ADD COLUMN `col4_precision` TIMESTAMP NOT NULL" in sql + assert "ADD COLUMN `col1_precision` INT64 NOT NULL" in sql + assert "ADD COLUMN `col4_precision` TIMESTAMP NOT NULL" in sql assert "ADD COLUMN `col5_precision` STRING(25)" in sql - assert "ADD COLUMN `col6_precision` NUMERIC(6,2) NOT NULL" in sql + assert "ADD COLUMN `col6_precision` NUMERIC(6,2) NOT NULL" in sql assert "ADD COLUMN `col7_precision` BYTES(19)" in sql - assert "ADD COLUMN `col11_precision` TIME NOT NULL" in sql + assert "ADD COLUMN `col11_precision` TIME NOT NULL" in sql # table has col1 already in storage mod_table = deepcopy(TABLE_UPDATE) mod_table.pop(0) sql = gcp_client._get_table_update_sql("event_test_table", mod_table, True)[0] - assert "ADD COLUMN `col1` INTEGER NOT NULL" not in sql - assert "ADD COLUMN `col2` FLOAT64 NOT NULL" in sql + assert "ADD COLUMN `col1` INTEGER NOT NULL" not in sql + assert "ADD COLUMN `col2` FLOAT64 NOT NULL" in sql def test_create_table_case_insensitive(ci_gcp_client: BigQueryClient) -> None: diff --git a/tests/load/databricks/test_databricks_configuration.py b/tests/load/databricks/test_databricks_configuration.py index e27da4db2a..8b3beed2b3 100644 --- a/tests/load/databricks/test_databricks_configuration.py +++ 
b/tests/load/databricks/test_databricks_configuration.py @@ -4,6 +4,7 @@ pytest.importorskip("databricks") from dlt.common.exceptions import TerminalValueError +from dlt.common.configuration.exceptions import ConfigurationValueError from dlt.destinations.impl.databricks.databricks import DatabricksLoadJob from dlt.common.configuration import resolve_configuration @@ -86,3 +87,12 @@ def test_databricks_abfss_converter() -> None: abfss_url == "abfss://dlt-ci-test-bucket@my_account.dfs.core.windows.net/path/to/file.parquet" ) + + +def test_databricks_auth_invalid() -> None: + with pytest.raises(ConfigurationValueError, match="No valid authentication method detected.*"): + os.environ["DESTINATION__DATABRICKS__CREDENTIALS__CLIENT_ID"] = "" + os.environ["DESTINATION__DATABRICKS__CREDENTIALS__CLIENT_SECRET"] = "" + os.environ["DESTINATION__DATABRICKS__CREDENTIALS__ACCESS_TOKEN"] = "" + bricks = databricks() + bricks.configuration(None, accept_partial=True) diff --git a/tests/load/dremio/test_dremio_client.py b/tests/load/dremio/test_dremio_client.py index efc72c0652..98212efb13 100644 --- a/tests/load/dremio/test_dremio_client.py +++ b/tests/load/dremio/test_dremio_client.py @@ -48,12 +48,12 @@ def test_dremio_factory() -> None: [ TColumnSchema(name="foo", data_type="text", partition=True), TColumnSchema(name="bar", data_type="bigint", sort=True), - TColumnSchema(name="baz", data_type="double"), + TColumnSchema(name="baz", data_type="double", nullable=False), ], False, [ 'CREATE TABLE "test_database"."test_dataset"."event_test_table"' - ' (\n"foo" VARCHAR ,\n"bar" BIGINT ,\n"baz" DOUBLE )\nPARTITION BY' + ' (\n"foo" VARCHAR ,\n"bar" BIGINT ,\n"baz" DOUBLE NOT NULL)\nPARTITION BY' ' ("foo")\nLOCALSORT BY ("bar")' ], ), @@ -66,7 +66,7 @@ def test_dremio_factory() -> None: False, [ 'CREATE TABLE "test_database"."test_dataset"."event_test_table"' - ' (\n"foo" VARCHAR ,\n"bar" BIGINT ,\n"baz" DOUBLE )\nPARTITION BY' + ' (\n"foo" VARCHAR ,\n"bar" BIGINT ,\n"baz" DOUBLE )\nPARTITION BY' ' ("foo","bar")' ], ), @@ -79,7 +79,7 @@ def test_dremio_factory() -> None: False, [ 'CREATE TABLE "test_database"."test_dataset"."event_test_table"' - ' (\n"foo" VARCHAR ,\n"bar" BIGINT ,\n"baz" DOUBLE )' + ' (\n"foo" VARCHAR ,\n"bar" BIGINT ,\n"baz" DOUBLE )' ], ), ], diff --git a/tests/load/duckdb/test_duckdb_client.py b/tests/load/duckdb/test_duckdb_client.py index 49475ce43f..652f75772a 100644 --- a/tests/load/duckdb/test_duckdb_client.py +++ b/tests/load/duckdb/test_duckdb_client.py @@ -282,14 +282,14 @@ def test_drops_pipeline_changes_bound() -> None: p = dlt.pipeline(pipeline_name="quack_pipeline", destination="duckdb") p.run([1, 2, 3], table_name="p_table") p = p.drop() - assert len(p._dataset().p_table.fetchall()) == 3 + assert len(p.dataset().p_table.fetchall()) == 3 # drops internal duckdb p = dlt.pipeline(pipeline_name="quack_pipeline", destination=duckdb(":pipeline:")) p.run([1, 2, 3], table_name="p_table") p = p.drop() with pytest.raises(DatabaseUndefinedRelation): - p._dataset().p_table.fetchall() + p.dataset().p_table.fetchall() def test_duckdb_database_delete() -> None: diff --git a/tests/load/filesystem/test_object_store_rs_credentials.py b/tests/load/filesystem/test_credentials_mixins.py similarity index 50% rename from tests/load/filesystem/test_object_store_rs_credentials.py rename to tests/load/filesystem/test_credentials_mixins.py index f23187a269..c1fb02c152 100644 --- a/tests/load/filesystem/test_object_store_rs_credentials.py +++ b/tests/load/filesystem/test_credentials_mixins.py @@ -1,12 +1,8 
@@ -"""Tests translation of `dlt` credentials into `object_store` Rust crate credentials.""" - -from typing import Any, Dict +from typing import Any, Dict, Union, Type, get_args, cast import os import json # noqa: I251 import pytest -from deltalake import DeltaTable -from deltalake.exceptions import TableNotFoundError import dlt from dlt.common.configuration import resolve_configuration @@ -23,10 +19,15 @@ from dlt.common.utils import custom_environ from dlt.common.configuration.resolve import resolve_configuration from dlt.common.configuration.specs.gcp_credentials import GcpDefaultCredentials -from dlt.common.configuration.specs.exceptions import ObjectStoreRsCredentialsException +from dlt.common.configuration.specs.exceptions import ( + ObjectStoreRsCredentialsException, + UnsupportedAuthenticationMethodException, +) +from dlt.common.configuration.specs.mixins import WithObjectStoreRsCredentials, WithPyicebergConfig from tests.load.utils import ( AZ_BUCKET, + ABFS_BUCKET, AWS_BUCKET, GCS_BUCKET, R2_BUCKET_CONFIG, @@ -34,6 +35,9 @@ ) +TCredentialsMixin = Union[WithObjectStoreRsCredentials, WithPyicebergConfig] +ALL_CREDENTIALS_MIXINS = get_args(TCredentialsMixin) + pytestmark = pytest.mark.essential if all(driver not in ALL_FILESYSTEM_DRIVERS for driver in ("az", "s3", "gs", "r2")): @@ -53,11 +57,27 @@ def fs_creds() -> Dict[str, Any]: return creds -def can_connect(bucket_url: str, object_store_rs_credentials: Dict[str, str]) -> bool: - """Returns True if client can connect to object store, False otherwise. +def can_connect(bucket_url: str, credentials: TCredentialsMixin, mixin: Type[TCredentialsMixin]) -> bool: # type: ignore[return] + """Returns True if client can connect to object store, False otherwise.""" + if mixin == WithObjectStoreRsCredentials: + credentials = cast(WithObjectStoreRsCredentials, credentials) + return can_connect_object_store_rs_credentials( + bucket_url, credentials.to_object_store_rs_credentials() + ) + elif mixin == WithPyicebergConfig: + credentials = cast(WithPyicebergConfig, credentials) + return can_connect_pyiceberg_fileio_config( + bucket_url, credentials.to_pyiceberg_fileio_config() + ) + + +def can_connect_object_store_rs_credentials( + bucket_url: str, object_store_rs_credentials: Dict[str, str] +) -> bool: + # uses `deltatable` library as Python interface to `object_store` Rust crate + from deltalake import DeltaTable + from deltalake.exceptions import TableNotFoundError - Uses `deltatable` library as Python interface to `object_store` Rust crate. 
- """ try: DeltaTable( bucket_url, @@ -70,16 +90,40 @@ def can_connect(bucket_url: str, object_store_rs_credentials: Dict[str, str]) -> return False +def can_connect_pyiceberg_fileio_config( + bucket_url: str, pyiceberg_fileio_config: Dict[str, str] +) -> bool: + from pyiceberg.table import StaticTable + + try: + StaticTable.from_metadata( + f"{bucket_url}/non_existing_metadata_file.json", + properties=pyiceberg_fileio_config, + ) + except FileNotFoundError: + # this error implies the connection was successful + # there is no Iceberg metadata file at the specified path + return True + return False + + @pytest.mark.parametrize( - "driver", [driver for driver in ALL_FILESYSTEM_DRIVERS if driver in ("az")] + "driver", [driver for driver in ALL_FILESYSTEM_DRIVERS if driver in ("az", "abfss")] ) -def test_azure_object_store_rs_credentials(driver: str, fs_creds: Dict[str, Any]) -> None: +@pytest.mark.parametrize("mixin", ALL_CREDENTIALS_MIXINS) +def test_azure_credentials_mixins( + driver: str, fs_creds: Dict[str, Any], mixin: Type[TCredentialsMixin] +) -> None: + if mixin == WithPyicebergConfig and driver == "az": + pytest.skip("`pyiceberg` does not support `az` scheme") + + buckets = {"az": AZ_BUCKET, "abfss": ABFS_BUCKET} creds: AnyAzureCredentials creds = AzureServicePrincipalCredentialsWithoutDefaults( **dlt.secrets.get("destination.fsazureprincipal.credentials") ) - assert can_connect(AZ_BUCKET, creds.to_object_store_rs_credentials()) + assert can_connect(buckets[driver], creds, mixin) # without SAS token creds = AzureCredentialsWithoutDefaults( @@ -87,18 +131,21 @@ def test_azure_object_store_rs_credentials(driver: str, fs_creds: Dict[str, Any] azure_storage_account_key=fs_creds["azure_storage_account_key"], ) assert creds.azure_storage_sas_token is None - assert can_connect(AZ_BUCKET, creds.to_object_store_rs_credentials()) + assert can_connect(buckets[driver], creds, mixin) # with SAS token creds = resolve_configuration(creds) assert creds.azure_storage_sas_token is not None - assert can_connect(AZ_BUCKET, creds.to_object_store_rs_credentials()) + assert can_connect(buckets[driver], creds, mixin) @pytest.mark.parametrize( "driver", [driver for driver in ALL_FILESYSTEM_DRIVERS if driver in ("s3", "r2")] ) -def test_aws_object_store_rs_credentials(driver: str, fs_creds: Dict[str, Any]) -> None: +@pytest.mark.parametrize("mixin", ALL_CREDENTIALS_MIXINS) +def test_aws_credentials_mixins( + driver: str, fs_creds: Dict[str, Any], mixin: Type[TCredentialsMixin] +) -> None: creds: AwsCredentialsWithoutDefaults if driver == "r2": @@ -112,9 +159,11 @@ def test_aws_object_store_rs_credentials(driver: str, fs_creds: Dict[str, Any]) endpoint_url=fs_creds.get("endpoint_url"), ) assert creds.aws_session_token is None - object_store_rs_creds = creds.to_object_store_rs_credentials() - assert "aws_session_token" not in object_store_rs_creds # no auto-generated token - assert can_connect(AWS_BUCKET, object_store_rs_creds) + if mixin == WithObjectStoreRsCredentials: + assert ( + "aws_session_token" not in creds.to_object_store_rs_credentials() + ) # no auto-generated token + assert can_connect(AWS_BUCKET, creds, mixin) # AwsCredentials: no user-provided session token creds = AwsCredentials( @@ -124,24 +173,27 @@ def test_aws_object_store_rs_credentials(driver: str, fs_creds: Dict[str, Any]) endpoint_url=fs_creds.get("endpoint_url"), ) assert creds.aws_session_token is None - object_store_rs_creds = creds.to_object_store_rs_credentials() - assert "aws_session_token" not in object_store_rs_creds # no 
auto-generated token - assert can_connect(AWS_BUCKET, object_store_rs_creds) - - # exception should be raised if both `endpoint_url` and `region_name` are - # not provided - with pytest.raises(ObjectStoreRsCredentialsException): - AwsCredentials( - aws_access_key_id=fs_creds["aws_access_key_id"], - aws_secret_access_key=fs_creds["aws_secret_access_key"], - ).to_object_store_rs_credentials() - - if "endpoint_url" in object_store_rs_creds: - # TODO: make sure this case is tested on GitHub CI, e.g. by adding - # a local MinIO bucket to the set of tested buckets - if object_store_rs_creds["endpoint_url"].startswith("http://"): + assert can_connect(AWS_BUCKET, creds, mixin) + if mixin == WithObjectStoreRsCredentials: + object_store_rs_creds = creds.to_object_store_rs_credentials() + assert "aws_session_token" not in object_store_rs_creds # no auto-generated token + + # exception should be raised if both `endpoint_url` and `region_name` are + # not provided + with pytest.raises(ObjectStoreRsCredentialsException): + AwsCredentials( + aws_access_key_id=fs_creds["aws_access_key_id"], + aws_secret_access_key=fs_creds["aws_secret_access_key"], + ).to_object_store_rs_credentials() + + if "endpoint_url" in object_store_rs_creds and object_store_rs_creds[ + "endpoint_url" + ].startswith("http://"): + # TODO: make sure this case is tested on GitHub CI, e.g. by adding + # a local MinIO bucket to the set of tested buckets assert object_store_rs_creds["aws_allow_http"] == "true" + if creds.endpoint_url is not None: # remainder of tests use session tokens # we don't run them on S3 compatible storage because session tokens # may not be available @@ -158,9 +210,10 @@ def test_aws_object_store_rs_credentials(driver: str, fs_creds: Dict[str, Any]) region_name=fs_creds["region_name"], ) assert creds.aws_session_token is not None - object_store_rs_creds = creds.to_object_store_rs_credentials() - assert object_store_rs_creds["aws_session_token"] is not None - assert can_connect(AWS_BUCKET, object_store_rs_creds) + assert can_connect(AWS_BUCKET, creds, mixin) + if mixin == WithObjectStoreRsCredentials: + object_store_rs_creds = creds.to_object_store_rs_credentials() + assert object_store_rs_creds["aws_session_token"] is not None # AwsCredentialsWithoutDefaults: user-provided session token creds = AwsCredentialsWithoutDefaults( @@ -170,15 +223,19 @@ def test_aws_object_store_rs_credentials(driver: str, fs_creds: Dict[str, Any]) region_name=fs_creds["region_name"], ) assert creds.aws_session_token is not None - object_store_rs_creds = creds.to_object_store_rs_credentials() - assert object_store_rs_creds["aws_session_token"] is not None - assert can_connect(AWS_BUCKET, object_store_rs_creds) + assert can_connect(AWS_BUCKET, creds, mixin) + if mixin == WithObjectStoreRsCredentials: + object_store_rs_creds = creds.to_object_store_rs_credentials() + assert object_store_rs_creds["aws_session_token"] is not None @pytest.mark.parametrize( "driver", [driver for driver in ALL_FILESYSTEM_DRIVERS if driver in ("gs")] ) -def test_gcp_object_store_rs_credentials(driver, fs_creds: Dict[str, Any]) -> None: +@pytest.mark.parametrize("mixin", ALL_CREDENTIALS_MIXINS) +def test_gcp_credentials_mixins( + driver, fs_creds: Dict[str, Any], mixin: Type[TCredentialsMixin] +) -> None: creds: GcpCredentials # GcpServiceAccountCredentialsWithoutDefaults @@ -189,7 +246,11 @@ def test_gcp_object_store_rs_credentials(driver, fs_creds: Dict[str, Any]) -> No private_key_id=fs_creds["private_key_id"], client_email=fs_creds["client_email"], ) - assert 
can_connect(GCS_BUCKET, creds.to_object_store_rs_credentials()) + if mixin == WithPyicebergConfig: + with pytest.raises(UnsupportedAuthenticationMethodException): + assert can_connect(GCS_BUCKET, creds, mixin) + elif mixin == WithObjectStoreRsCredentials: + assert can_connect(GCS_BUCKET, creds, mixin) # GcpDefaultCredentials @@ -197,7 +258,7 @@ def test_gcp_object_store_rs_credentials(driver, fs_creds: Dict[str, Any]) -> No GcpDefaultCredentials._LAST_FAILED_DEFAULT = 0 # write service account key to JSON file - service_json = json.loads(creds.to_object_store_rs_credentials()["service_account_key"]) + service_json = json.loads(creds.to_native_representation()) path = "_secrets/service.json" os.makedirs(os.path.dirname(path), exist_ok=True) with open(path, "w", encoding="utf-8") as f: @@ -206,8 +267,18 @@ def test_gcp_object_store_rs_credentials(driver, fs_creds: Dict[str, Any]) -> No with custom_environ({"GOOGLE_APPLICATION_CREDENTIALS": path}): creds = GcpDefaultCredentials() resolve_configuration(creds) - can_connect(GCS_BUCKET, creds.to_object_store_rs_credentials()) - - # GcpOAuthCredentialsWithoutDefaults is currently not supported - with pytest.raises(NotImplementedError): - GcpOAuthCredentialsWithoutDefaults().to_object_store_rs_credentials() + if mixin == WithPyicebergConfig: + with pytest.raises(UnsupportedAuthenticationMethodException): + assert can_connect(GCS_BUCKET, creds, mixin) + elif mixin == WithObjectStoreRsCredentials: + assert can_connect(GCS_BUCKET, creds, mixin) + + # GcpOAuthCredentialsWithoutDefaults + creds = resolve_configuration( + GcpOAuthCredentialsWithoutDefaults(), sections=("destination", "fsgcpoauth") + ) + if mixin == WithPyicebergConfig: + assert can_connect(GCS_BUCKET, creds, mixin) + elif mixin == WithObjectStoreRsCredentials: + with pytest.raises(UnsupportedAuthenticationMethodException): + assert can_connect(GCS_BUCKET, creds, mixin) diff --git a/tests/load/filesystem/test_sql_client.py b/tests/load/filesystem/test_sql_client.py index ac2ada2551..cf4bbfb286 100644 --- a/tests/load/filesystem/test_sql_client.py +++ b/tests/load/filesystem/test_sql_client.py @@ -1,17 +1,17 @@ """Test the duckdb supported sql client for special internal features""" -from typing import Any +from typing import Optional import pytest import dlt import os import shutil -import logging from dlt import Pipeline from dlt.common.utils import uniq_id +from dlt.common.schema.typing import TTableFormat from tests.load.utils import ( destinations_configs, @@ -19,10 +19,10 @@ GCS_BUCKET, SFTP_BUCKET, MEMORY_BUCKET, - AWS_BUCKET, ) from dlt.destinations import filesystem from tests.utils import TEST_STORAGE_ROOT +from tests.cases import arrow_table_all_data_types from dlt.destinations.exceptions import DatabaseUndefinedRelation @@ -37,7 +37,7 @@ def _run_dataset_checks( pipeline: Pipeline, destination_config: DestinationTestConfiguration, secret_directory: str, - table_format: Any = None, + table_format: Optional[TTableFormat] = None, alternate_access_pipeline: Pipeline = None, ) -> None: total_records = 200 @@ -82,12 +82,17 @@ def double_items(): for i in range(total_records) ] - return [items, double_items] + @dlt.resource(table_format=table_format) + def arrow_all_types(): + yield arrow_table_all_data_types("arrow-table", num_rows=total_records)[0] + + return [items, double_items, arrow_all_types] # run source pipeline.run(source(), loader_file_format=destination_config.file_format) if alternate_access_pipeline: + orig_dest = pipeline.destination pipeline.destination = 
alternate_access_pipeline.destination import duckdb @@ -97,8 +102,11 @@ def double_items(): DuckDbCredentials, ) - # check we can create new tables from the views with pipeline.sql_client() as c: + # check if all data types are handled properly + c.execute_sql("SELECT * FROM arrow_all_types;") + + # check we can create new tables from the views c.execute_sql( "CREATE TABLE items_joined AS (SELECT i.id, di.double_id FROM items as i JOIN" " double_items as di ON (i.id = di.id));" @@ -110,16 +118,14 @@ def double_items(): assert list(joined_table[5]) == [5, 10] assert list(joined_table[10]) == [10, 20] - # inserting values into a view should fail gracefully - with pipeline.sql_client() as c: + # inserting values into a view should fail gracefully try: c.execute_sql("INSERT INTO double_items VALUES (1, 2)") except Exception as exc: assert "double_items is not an table" in str(exc) - # check that no automated views are created for a schema different than - # the known one - with pipeline.sql_client() as c: + # check that no automated views are created for a schema different than + # the known one c.execute_sql("CREATE SCHEMA other_schema;") with pytest.raises(DatabaseUndefinedRelation): with c.execute_query("SELECT * FROM other_schema.items ORDER BY id ASC;") as cursor: @@ -144,6 +150,8 @@ def _external_duckdb_connection() -> duckdb.DuckDBPyConnection: # the line below solves problems with certificate path lookup on linux, see duckdb docs external_db.sql("SET azure_transport_option_type = 'curl';") external_db.sql(f"SET secret_directory = '{secret_directory}';") + if table_format == "iceberg": + FilesystemSqlClient._setup_iceberg(external_db) return external_db def _fs_sql_client_for_external_db( @@ -171,6 +179,24 @@ def _fs_sql_client_for_external_db( # views exist assert len(external_db.sql("SELECT * FROM second.referenced_items").fetchall()) == total_records assert len(external_db.sql("SELECT * FROM first.items").fetchall()) == 3 + + # test if view reflects source table accurately after it has changed + # concretely, this tests if an existing view is replaced with formats that need it, such as + # `iceberg` table format + with fs_sql_client as sql_client: + sql_client.create_views_for_tables({"arrow_all_types": "arrow_all_types"}) + assert external_db.sql("FROM second.arrow_all_types;").arrow().num_rows == total_records + if alternate_access_pipeline: + # switch back for the write path + pipeline.destination = orig_dest + pipeline.run( # run pipeline again to add rows to source table + source().with_resources("arrow_all_types"), + loader_file_format=destination_config.file_format, + ) + with fs_sql_client as sql_client: + sql_client.create_views_for_tables({"arrow_all_types": "arrow_all_types"}) + assert external_db.sql("FROM second.arrow_all_types;").arrow().num_rows == (2 * total_records) + external_db.close() # in case we are not connecting to a bucket that needs secrets, views should still be here after connection reopen @@ -283,13 +309,13 @@ def test_read_interfaces_filesystem( "destination_config", destinations_configs( table_format_filesystem_configs=True, - with_table_format="delta", + with_table_format=("delta", "iceberg"), bucket_exclude=[SFTP_BUCKET, MEMORY_BUCKET], # NOTE: delta does not work on memory buckets ), ids=lambda x: x.name, ) -def test_delta_tables( +def test_table_formats( destination_config: DestinationTestConfiguration, secret_directory: str ) -> None: os.environ["DATA_WRITER__FILE_MAX_ITEMS"] = "700" @@ -297,25 +323,27 @@ def test_delta_tables( pipeline =
destination_config.setup_pipeline( "read_pipeline", dataset_name="read_test", + dev_mode=True, ) # in case of gcs we use the s3 compat layer for reading # for writing we still need to use the gc authentication, as delta_rs seems to use # methods on the s3 interface that are not implemented by gcs + # s3 compat layer does not work with `iceberg` table format access_pipeline = pipeline - if destination_config.bucket_url == GCS_BUCKET: + if destination_config.bucket_url == GCS_BUCKET and destination_config.table_format != "iceberg": gcp_bucket = filesystem( GCS_BUCKET.replace("gs://", "s3://"), destination_name="filesystem_s3_gcs_comp" ) access_pipeline = destination_config.setup_pipeline( - "read_pipeline", dataset_name="read_test", destination=gcp_bucket + "read_pipeline", dataset_name="read_test", dev_mode=True, destination=gcp_bucket ) _run_dataset_checks( pipeline, destination_config, secret_directory=secret_directory, - table_format="delta", + table_format=destination_config.table_format, alternate_access_pipeline=access_pipeline, ) @@ -349,7 +377,7 @@ def items(): pipeline.run([items()], loader_file_format=destination_config.file_format) - df = pipeline._dataset().items.df() + df = pipeline.dataset().items.df() assert len(df.index) == 20 @dlt.resource(table_name="items") @@ -359,5 +387,5 @@ def items2(): pipeline.run([items2()], loader_file_format=destination_config.file_format) # check df and arrow access - assert len(pipeline._dataset().items.df().index) == 50 - assert pipeline._dataset().items.arrow().num_rows == 50 + assert len(pipeline.dataset().items.df().index) == 50 + assert pipeline.dataset().items.arrow().num_rows == 50 diff --git a/tests/load/pipeline/test_bigquery.py b/tests/load/pipeline/test_bigquery.py index cb65c6bcf1..83982bb998 100644 --- a/tests/load/pipeline/test_bigquery.py +++ b/tests/load/pipeline/test_bigquery.py @@ -384,8 +384,8 @@ def resource(): bigquery_adapter(resource, autodetect_schema=True) pipeline.run(resource) - assert len(pipeline._dataset().items.df()) == 5 - assert len(pipeline._dataset().items__nested.df()) == 5 + assert len(pipeline.dataset().items.df()) == 5 + assert len(pipeline.dataset().items__nested.df()) == 5 @dlt.resource(primary_key="id", table_name="items", write_disposition="merge") def resource2(): @@ -395,5 +395,5 @@ def resource2(): bigquery_adapter(resource2, autodetect_schema=True) pipeline.run(resource2) - assert len(pipeline._dataset().items.df()) == 7 - assert len(pipeline._dataset().items__nested.df()) == 7 + assert len(pipeline.dataset().items.df()) == 7 + assert len(pipeline.dataset().items__nested.df()) == 7 diff --git a/tests/load/pipeline/test_databricks_pipeline.py b/tests/load/pipeline/test_databricks_pipeline.py index e802cde693..078dce3a7f 100644 --- a/tests/load/pipeline/test_databricks_pipeline.py +++ b/tests/load/pipeline/test_databricks_pipeline.py @@ -2,6 +2,7 @@ import os from dlt.common.utils import uniq_id +from dlt.destinations import databricks from tests.load.utils import ( GCS_BUCKET, DestinationTestConfiguration, @@ -23,6 +24,10 @@ ids=lambda x: x.name, ) def test_databricks_external_location(destination_config: DestinationTestConfiguration) -> None: + # force token-based authentication + os.environ["DESTINATION__DATABRICKS__CREDENTIALS__CLIENT_ID"] = "" + os.environ["DESTINATION__DATABRICKS__CREDENTIALS__CLIENT_SECRET"] = "" + # do not interfere with state os.environ["RESTORE_FROM_DESTINATION"] = "False" # let the package complete even with failed jobs @@ -145,3 +150,54 @@ def 
test_databricks_gcs_external_location(destination_config: DestinationTestCon assert ( "credential_x" in pipeline.list_failed_jobs_in_package(info.loads_ids[0])[0].failed_message ) + + +@pytest.mark.parametrize( + "destination_config", + destinations_configs(default_sql_configs=True, subset=("databricks",)), + ids=lambda x: x.name, +) +def test_databricks_auth_oauth(destination_config: DestinationTestConfiguration) -> None: + os.environ["DESTINATION__DATABRICKS__CREDENTIALS__ACCESS_TOKEN"] = "" + bricks = databricks() + config = bricks.configuration(None, accept_partial=True) + assert config.credentials.client_id and config.credentials.client_secret + assert not config.credentials.access_token + + dataset_name = "test_databricks_oauth" + uniq_id() + pipeline = destination_config.setup_pipeline( + "test_databricks_oauth", dataset_name=dataset_name, destination=bricks + ) + + info = pipeline.run([1, 2, 3], table_name="digits", **destination_config.run_kwargs) + assert info.has_failed_jobs is False + + with pipeline.sql_client() as client: + rows = client.execute_sql(f"select * from {dataset_name}.digits") + assert len(rows) == 3 + + +@pytest.mark.parametrize( + "destination_config", + destinations_configs(default_sql_configs=True, subset=("databricks",)), + ids=lambda x: x.name, +) +def test_databricks_auth_token(destination_config: DestinationTestConfiguration) -> None: + os.environ["DESTINATION__DATABRICKS__CREDENTIALS__CLIENT_ID"] = "" + os.environ["DESTINATION__DATABRICKS__CREDENTIALS__CLIENT_SECRET"] = "" + bricks = databricks() + config = bricks.configuration(None, accept_partial=True) + assert config.credentials.access_token + assert not (config.credentials.client_secret and config.credentials.client_id) + + dataset_name = "test_databricks_token" + uniq_id() + pipeline = destination_config.setup_pipeline( + "test_databricks_token", dataset_name=dataset_name, destination=bricks + ) + + info = pipeline.run([1, 2, 3], table_name="digits", **destination_config.run_kwargs) + assert info.has_failed_jobs is False + + with pipeline.sql_client() as client: + rows = client.execute_sql(f"select * from {dataset_name}.digits") + assert len(rows) == 3 diff --git a/tests/load/pipeline/test_drop.py b/tests/load/pipeline/test_drop.py index 0e44c754e7..330f2606ff 100644 --- a/tests/load/pipeline/test_drop.py +++ b/tests/load/pipeline/test_drop.py @@ -27,13 +27,17 @@ def _attach(pipeline: Pipeline) -> Pipeline: @dlt.source(section="droppable", name="droppable") -def droppable_source() -> List[DltResource]: +def droppable_source(drop_columns: bool = False) -> List[DltResource]: @dlt.resource def droppable_a( - a: dlt.sources.incremental[int] = dlt.sources.incremental("a", 0) + a: dlt.sources.incremental[int] = dlt.sources.incremental("a", 0, range_start="open") ) -> Iterator[Dict[str, Any]]: - yield dict(a=1, b=2, c=3) - yield dict(a=4, b=23, c=24) + if drop_columns: + yield dict(a=1, b=2) + yield dict(a=4, b=23) + else: + yield dict(a=1, b=2, c=3) + yield dict(a=4, b=23, c=24) @dlt.resource def droppable_b( @@ -47,9 +51,17 @@ def droppable_c( qe: dlt.sources.incremental[int] = dlt.sources.incremental("qe"), ) -> Iterator[Dict[str, Any]]: # Grandchild table - yield dict( - asdasd=2424, qe=111, items=[dict(k=2, r=2, labels=[dict(name="abc"), dict(name="www")])] - ) + if drop_columns: + # dropped asdasd, items[r], items.labels.value + yield dict(qe=111, items=[dict(k=2, labels=[dict(name="abc"), dict(name="www")])]) + else: + yield dict( + asdasd=2424, + qe=111, + items=[ + dict(k=2, r=2, 
labels=[dict(name="abc", value=1), dict(name="www", value=2)]) + ], + ) @dlt.resource def droppable_d( @@ -134,11 +146,17 @@ def assert_destination_state_loaded(pipeline: Pipeline) -> None: ), ids=lambda x: x.name, ) -def test_drop_command_resources_and_state(destination_config: DestinationTestConfiguration) -> None: +@pytest.mark.parametrize("in_source", (True, False)) +def test_drop_command_resources_and_state( + destination_config: DestinationTestConfiguration, in_source: bool +) -> None: """Test the drop command with resource and state path options and verify correct data is deleted from destination and locally""" - source = droppable_source() - pipeline = destination_config.setup_pipeline("drop_test_" + uniq_id(), dev_mode=True) + source: Any = droppable_source() + if not in_source: + source = list(source.selected_resources.values()) + + pipeline = destination_config.setup_pipeline("droppable", dev_mode=True) info = pipeline.run(source, **destination_config.run_kwargs) assert_load_info(info) assert load_table_counts(pipeline, *pipeline.default_schema.tables.keys()) == { @@ -173,6 +191,9 @@ def test_drop_command_resources_and_state(destination_config: DestinationTestCon assert_destination_state_loaded(pipeline) # now run the same droppable_source to see if tables are recreated and they contain right number of items + source = droppable_source(drop_columns=True) + if not in_source: + source = list(source.selected_resources.values()) info = pipeline.run(source, **destination_config.run_kwargs) assert_load_info(info) # 2 versions (one dropped and replaced with schema with dropped tables, then we added missing tables) @@ -192,6 +213,20 @@ def test_drop_command_resources_and_state(destination_config: DestinationTestCon "droppable_c__items": 1, "droppable_c__items__labels": 2, } + # check if columns got correctly dropped + droppable_a_schema = pipeline.default_schema.get_table("droppable_a") + # this table was not dropped so column still exists + assert "c" in droppable_a_schema["columns"] + # dropped asdasd, items[r], items.labels.value + droppable_c_schema = pipeline.default_schema.get_table("droppable_c") + assert "asdasd" not in droppable_c_schema["columns"] + assert "qe" in droppable_c_schema["columns"] + droppable_c_i_schema = pipeline.default_schema.get_table("droppable_c__items") + assert "r" not in droppable_c_i_schema["columns"] + assert "k" in droppable_c_i_schema["columns"] + droppable_c_l_schema = pipeline.default_schema.get_table("droppable_c__items__labels") + assert "value" not in droppable_c_l_schema["columns"] + assert "name" in droppable_c_l_schema["columns"] @pytest.mark.parametrize( diff --git a/tests/load/pipeline/test_duckdb.py b/tests/load/pipeline/test_duckdb.py index 98642bb263..2d1138a51d 100644 --- a/tests/load/pipeline/test_duckdb.py +++ b/tests/load/pipeline/test_duckdb.py @@ -273,18 +273,18 @@ def test_duckdb_credentials_separation( p2 = dlt.pipeline("p2", destination=duckdb(credentials=":pipeline:")) p1.run([1, 2, 3], table_name="p1_data") - p1_dataset = p1._dataset() + p1_dataset = p1.dataset() p2.run([1, 2, 3], table_name="p2_data") - p2_dataset = p2._dataset() + p2_dataset = p2.dataset() # both dataset should have independent duckdb databases # destinations should be bounded to pipelines still print(p1_dataset.p1_data.fetchall()) print(p2_dataset.p2_data.fetchall()) - assert "p1" in p1_dataset.sql_client.credentials._conn_str() # type: ignore[attr-defined] - assert "p2" in p2_dataset.sql_client.credentials._conn_str() # type: ignore[attr-defined] + assert 
"p1" in p1_dataset.sql_client.credentials._conn_str() + assert "p2" in p2_dataset.sql_client.credentials._conn_str() - assert p1_dataset.sql_client.credentials.bound_to_pipeline is p1 # type: ignore[attr-defined] - assert p2_dataset.sql_client.credentials.bound_to_pipeline is p2 # type: ignore[attr-defined] + assert p1_dataset.sql_client.credentials.bound_to_pipeline is p1 + assert p2_dataset.sql_client.credentials.bound_to_pipeline is p2 diff --git a/tests/load/pipeline/test_filesystem_pipeline.py b/tests/load/pipeline/test_filesystem_pipeline.py index 8d890642ee..c70fa5ab5d 100644 --- a/tests/load/pipeline/test_filesystem_pipeline.py +++ b/tests/load/pipeline/test_filesystem_pipeline.py @@ -2,7 +2,7 @@ import os import posixpath from pathlib import Path -from typing import Any, Callable, List, Dict, cast +from typing import Any, Callable, List, Dict, cast, Tuple from importlib.metadata import version as pkg_version from packaging.version import Version @@ -15,7 +15,7 @@ from dlt.common.storages.configuration import FilesystemConfiguration from dlt.common.storages.load_package import ParsedLoadJobFileName from dlt.common.utils import uniq_id -from dlt.common.schema.typing import TWriteDisposition +from dlt.common.schema.typing import TWriteDisposition, TTableFormat from dlt.common.configuration.exceptions import ConfigurationValueError from dlt.destinations import filesystem from dlt.destinations.impl.filesystem.filesystem import FilesystemClient @@ -223,6 +223,48 @@ def some_source(): assert table.column("value").to_pylist() == [1, 2, 3, 4, 5] +# here start the `table_format` tests + + +def get_expected_actual( + pipeline: dlt.Pipeline, + table_name: str, + table_format: TTableFormat, + arrow_table: "pyarrow.Table", # type: ignore[name-defined] # noqa: F821 +) -> Tuple["pyarrow.Table", "pyarrow.Table"]: # type: ignore[name-defined] # noqa: F821 + from dlt.common.libs.pyarrow import pyarrow, cast_arrow_schema_types + + if table_format == "delta": + from dlt.common.libs.deltalake import ( + get_delta_tables, + ensure_delta_compatible_arrow_data, + ) + + dt = get_delta_tables(pipeline, table_name)[table_name] + expected = ensure_delta_compatible_arrow_data(arrow_table) + actual = dt.to_pyarrow_table() + elif table_format == "iceberg": + from dlt.common.libs.pyiceberg import ( + get_iceberg_tables, + ensure_iceberg_compatible_arrow_data, + ) + + it = get_iceberg_tables(pipeline, table_name)[table_name] + expected = ensure_iceberg_compatible_arrow_data(arrow_table) + actual = it.scan().to_arrow() + + # work around pyiceberg bug https://github.com/apache/iceberg-python/issues/1128 + schema = cast_arrow_schema_types( + actual.schema, + { + pyarrow.types.is_large_string: pyarrow.string(), + pyarrow.types.is_large_binary: pyarrow.binary(), + }, + ) + actual = actual.cast(schema) + return (expected, actual) + + @pytest.mark.skip( reason="pyarrow version check not needed anymore, since we have 17 as a dependency" ) @@ -258,44 +300,44 @@ def foo(): "destination_config", destinations_configs( table_format_filesystem_configs=True, - with_table_format="delta", + with_table_format=("delta", "iceberg"), bucket_exclude=(MEMORY_BUCKET, SFTP_BUCKET), ), ids=lambda x: x.name, ) -def test_delta_table_core( +def test_table_format_core( destination_config: DestinationTestConfiguration, ) -> None: - """Tests core functionality for `delta` table format. + """Tests core functionality for `delta` and `iceberg` table formats. Tests all data types, all filesystems. 
Tests `append` and `replace` write dispositions (`merge` is tested elsewhere). """ - - from dlt.common.libs.deltalake import get_delta_tables + if destination_config.table_format == "delta": + from dlt.common.libs.deltalake import get_delta_tables # create resource that yields rows with all data types column_schemas, row = table_update_and_row() - @dlt.resource(columns=column_schemas, table_format="delta") + @dlt.resource(columns=column_schemas, table_format=destination_config.table_format) def data_types(): nonlocal row yield [row] * 10 pipeline = destination_config.setup_pipeline("fs_pipe", dev_mode=True) - # run pipeline, this should create Delta table + # run pipeline, this should create the table info = pipeline.run(data_types()) assert_load_info(info) - # `delta` table format should use `parquet` file format + # table formats should use `parquet` file format completed_jobs = info.load_packages[0].jobs["completed_jobs"] data_types_jobs = [ job for job in completed_jobs if job.job_file_info.table_name == "data_types" ] assert all([job.file_path.endswith((".parquet", ".reference")) for job in data_types_jobs]) - # 10 rows should be loaded to the Delta table and the content of the first + # 10 rows should be loaded to the table and the content of the first # row should match expected values rows = load_tables_to_dicts(pipeline, "data_types", exclude_system_cols=True)["data_types"] assert len(rows) == 10 @@ -322,7 +364,8 @@ def data_types(): # should do logical replace, increasing the table version info = pipeline.run(data_types(), write_disposition="replace") assert_load_info(info) - assert get_delta_tables(pipeline, "data_types")["data_types"].version() == 2 + if destination_config.table_format == "delta": + assert get_delta_tables(pipeline, "data_types")["data_types"].version() == 2 rows = load_tables_to_dicts(pipeline, "data_types", exclude_system_cols=True)["data_types"] assert len(rows) == 10 @@ -331,15 +374,16 @@ def data_types(): "destination_config", destinations_configs( table_format_filesystem_configs=True, + # job orchestration is the same across table formats, no need to test all formats with_table_format="delta", bucket_subset=(FILE_BUCKET), ), ids=lambda x: x.name, ) -def test_delta_table_does_not_contain_job_files( +def test_table_format_does_not_contain_job_files( destination_config: DestinationTestConfiguration, ) -> None: - """Asserts Parquet job files do not end up in Delta table.""" + """Asserts Parquet job files do not end up in the table.""" pipeline = destination_config.setup_pipeline("fs_pipe", dev_mode=True) @@ -376,17 +420,18 @@ def delta_table(): "destination_config", destinations_configs( table_format_filesystem_configs=True, + # job orchestration is the same across table formats, no need to test all formats with_table_format="delta", bucket_subset=(FILE_BUCKET), ), ids=lambda x: x.name, ) -def test_delta_table_multiple_files( +def test_table_format_multiple_files( destination_config: DestinationTestConfiguration, ) -> None: - """Tests loading multiple files into a Delta table. + """Tests loading multiple files into a table. - Files should be loaded into the Delta table in a single commit. + Files should be loaded into the table in a single commit.
""" from dlt.common.libs.deltalake import get_delta_tables @@ -422,17 +467,17 @@ def delta_table(): "destination_config", destinations_configs( table_format_filesystem_configs=True, - with_table_format="delta", + with_table_format=("delta", "iceberg"), bucket_subset=(FILE_BUCKET), ), ids=lambda x: x.name, ) -def test_delta_table_child_tables( +def test_table_format_child_tables( destination_config: DestinationTestConfiguration, ) -> None: - """Tests child table handling for `delta` table format.""" + """Tests child table handling for `delta` and `iceberg` table formats.""" - @dlt.resource(table_format="delta") + @dlt.resource(table_format=destination_config.table_format) def nested_table(): yield [ { @@ -494,49 +539,63 @@ def nested_table(): assert len(rows_dict["nested_table__child"]) == 3 assert len(rows_dict["nested_table__child__grandchild"]) == 5 - # now drop children and grandchildren, use merge write disposition to create and pass full table chain - # also for tables that do not have jobs - info = pipeline.run( - [{"foo": 3}] * 10000, - table_name="nested_table", - primary_key="foo", - write_disposition="merge", - ) - assert_load_info(info) + if destination_config.supports_merge: + # now drop children and grandchildren, use merge write disposition to create and pass full table chain + # also for tables that do not have jobs + info = pipeline.run( + [{"foo": 3}] * 10000, + table_name="nested_table", + primary_key="foo", + write_disposition="merge", + ) + assert_load_info(info) @pytest.mark.parametrize( "destination_config", destinations_configs( table_format_filesystem_configs=True, - with_table_format="delta", + with_table_format=("delta", "iceberg"), bucket_subset=(FILE_BUCKET), ), ids=lambda x: x.name, ) -def test_delta_table_partitioning( +def test_table_format_partitioning( destination_config: DestinationTestConfiguration, ) -> None: - """Tests partitioning for `delta` table format.""" + """Tests partitioning for `delta` and `iceberg` table formats.""" - from dlt.common.libs.deltalake import get_delta_tables from tests.pipeline.utils import users_materialize_table_schema + def assert_partition_columns( + table_name: str, table_format: TTableFormat, expected_partition_columns: List[str] + ) -> None: + if table_format == "delta": + from dlt.common.libs.deltalake import get_delta_tables + + dt = get_delta_tables(pipeline, table_name)[table_name] + actual_partition_columns = dt.metadata().partition_columns + elif table_format == "iceberg": + from dlt.common.libs.pyiceberg import get_iceberg_tables + + it = get_iceberg_tables(pipeline, table_name)[table_name] + actual_partition_columns = [f.name for f in it.metadata.specs_struct().fields] + assert actual_partition_columns == expected_partition_columns + pipeline = destination_config.setup_pipeline("fs_pipe", dev_mode=True) # zero partition columns - @dlt.resource(table_format="delta") + @dlt.resource(table_format=destination_config.table_format) def zero_part(): yield {"foo": 1, "bar": 1} info = pipeline.run(zero_part()) assert_load_info(info) - dt = get_delta_tables(pipeline, "zero_part")["zero_part"] - assert dt.metadata().partition_columns == [] + assert_partition_columns("zero_part", destination_config.table_format, []) assert load_table_counts(pipeline, "zero_part")["zero_part"] == 1 # one partition column - @dlt.resource(table_format="delta", columns={"c1": {"partition": True}}) + @dlt.resource(table_format=destination_config.table_format, columns={"c1": {"partition": True}}) def one_part(): yield [ {"c1": "foo", "c2": 1}, @@ 
-547,13 +606,13 @@ def one_part(): info = pipeline.run(one_part()) assert_load_info(info) - dt = get_delta_tables(pipeline, "one_part")["one_part"] - assert dt.metadata().partition_columns == ["c1"] + assert_partition_columns("one_part", destination_config.table_format, ["c1"]) assert load_table_counts(pipeline, "one_part")["one_part"] == 4 # two partition columns @dlt.resource( - table_format="delta", columns={"c1": {"partition": True}, "c2": {"partition": True}} + table_format=destination_config.table_format, + columns={"c1": {"partition": True}, "c2": {"partition": True}}, ) def two_part(): yield [ @@ -565,29 +624,31 @@ def two_part(): info = pipeline.run(two_part()) assert_load_info(info) - dt = get_delta_tables(pipeline, "two_part")["two_part"] - assert dt.metadata().partition_columns == ["c1", "c2"] + assert_partition_columns("two_part", destination_config.table_format, ["c1", "c2"]) assert load_table_counts(pipeline, "two_part")["two_part"] == 4 # test partitioning with empty source users_materialize_table_schema.apply_hints( - table_format="delta", + table_format=destination_config.table_format, columns={"id": {"partition": True}}, ) info = pipeline.run(users_materialize_table_schema()) assert_load_info(info) - dt = get_delta_tables(pipeline, "users")["users"] - assert dt.metadata().partition_columns == ["id"] + assert_partition_columns("users", destination_config.table_format, ["id"]) assert load_table_counts(pipeline, "users")["users"] == 0 # changing partitioning after initial table creation is not supported zero_part.apply_hints(columns={"foo": {"partition": True}}) - with pytest.raises(PipelineStepFailed) as pip_ex: + if destination_config.table_format == "delta": + # Delta raises error when trying to change partitioning + with pytest.raises(PipelineStepFailed) as pip_ex: + pipeline.run(zero_part()) + assert isinstance(pip_ex.value.__context__, LoadClientJobRetry) + assert "partitioning" in pip_ex.value.__context__.retry_message + elif destination_config.table_format == "iceberg": + # while Iceberg supports partition evolution, we don't apply it pipeline.run(zero_part()) - assert isinstance(pip_ex.value.__context__, LoadClientJobRetry) - assert "partitioning" in pip_ex.value.__context__.retry_message - dt = get_delta_tables(pipeline, "zero_part")["zero_part"] - assert dt.metadata().partition_columns == [] + assert_partition_columns("zero_part", destination_config.table_format, []) @pytest.mark.parametrize( @@ -646,7 +707,7 @@ def test_delta_table_partitioning_arrow_load_id( "destination_config", destinations_configs( table_format_filesystem_configs=True, - with_table_format="delta", + with_table_format=("delta", "iceberg"), bucket_subset=(FILE_BUCKET), ), ids=lambda x: x.name, @@ -659,20 +720,25 @@ def test_delta_table_partitioning_arrow_load_id( pytest.param({"disposition": "merge", "strategy": "upsert"}, id="upsert"), ), ) -def test_delta_table_schema_evolution( +def test_table_format_schema_evolution( destination_config: DestinationTestConfiguration, write_disposition: TWriteDisposition, ) -> None: - """Tests schema evolution (adding new columns) for `delta` table format.""" - from dlt.common.libs.deltalake import get_delta_tables, ensure_delta_compatible_arrow_data + """Tests schema evolution (adding new columns) for `delta` and `iceberg` table formats.""" + if destination_config.table_format == "iceberg" and write_disposition == { + "disposition": "merge", + "strategy": "upsert", + }: + pytest.skip("`upsert` currently not implemented for `iceberg`") + from 
dlt.common.libs.pyarrow import pyarrow @dlt.resource( write_disposition=write_disposition, primary_key="pk", - table_format="delta", + table_format=destination_config.table_format, ) - def delta_table(data): + def evolving_table(data): yield data pipeline = destination_config.setup_pipeline("fs_pipe", dev_mode=True) @@ -684,11 +750,11 @@ def delta_table(data): assert arrow_table.shape == (1, 1) # initial load - info = pipeline.run(delta_table(arrow_table)) + info = pipeline.run(evolving_table(arrow_table)) assert_load_info(info) - dt = get_delta_tables(pipeline, "delta_table")["delta_table"] - expected = ensure_delta_compatible_arrow_data(arrow_table) - actual = dt.to_pyarrow_table() + expected, actual = get_expected_actual( + pipeline, "evolving_table", destination_config.table_format, arrow_table + ) assert actual.equals(expected) # create Arrow table with many columns, two rows @@ -703,11 +769,11 @@ def delta_table(data): arrow_table = arrow_table.add_column(0, pk_field, [[1, 2]]) # second load — this should evolve the schema (i.e. add the new columns) - info = pipeline.run(delta_table(arrow_table)) + info = pipeline.run(evolving_table(arrow_table)) assert_load_info(info) - dt = get_delta_tables(pipeline, "delta_table")["delta_table"] - actual = dt.to_pyarrow_table() - expected = ensure_delta_compatible_arrow_data(arrow_table) + expected, actual = get_expected_actual( + pipeline, "evolving_table", destination_config.table_format, arrow_table + ) if write_disposition == "append": # just check shape and schema for `append`, because table comparison is # more involved than with the other dispositions @@ -724,13 +790,21 @@ def delta_table(data): empty_arrow_table = arrow_table.schema.empty_table() # load 3 — this should evolve the schema without changing data - info = pipeline.run(delta_table(empty_arrow_table)) + info = pipeline.run(evolving_table(empty_arrow_table)) assert_load_info(info) - dt = get_delta_tables(pipeline, "delta_table")["delta_table"] - actual = dt.to_pyarrow_table() - expected_schema = ensure_delta_compatible_arrow_data(arrow_table).schema - assert actual.schema.equals(expected_schema) - expected_num_rows = 3 if write_disposition == "append" else 2 + expected, actual = get_expected_actual( + pipeline, "evolving_table", destination_config.table_format, arrow_table + ) + assert actual.schema.equals(expected.schema) + if write_disposition == "append": + expected_num_rows = 3 + elif write_disposition == "replace": + expected_num_rows = 0 + if destination_config.table_format == "delta": + # TODO: fix https://github.com/dlt-hub/dlt/issues/2092 and remove this if-clause + expected_num_rows = 2 + elif write_disposition == {"disposition": "merge", "strategy": "upsert"}: + expected_num_rows = 2 assert actual.num_rows == expected_num_rows # new column should have NULLs only assert ( @@ -743,23 +817,38 @@ def delta_table(data): "destination_config", destinations_configs( table_format_filesystem_configs=True, - with_table_format="delta", + with_table_format=("delta", "iceberg"), bucket_subset=(FILE_BUCKET, AZ_BUCKET), ), ids=lambda x: x.name, ) -def test_delta_table_empty_source( +def test_table_format_empty_source( destination_config: DestinationTestConfiguration, ) -> None: - """Tests empty source handling for `delta` table format. + """Tests empty source handling for `delta` and `iceberg` table formats. Tests both empty Arrow table and `dlt.mark.materialize_table_schema()`. 
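A hedged sketch of the schema-evolution behaviour exercised above: an `append` load with extra columns should evolve the table, leaving the new column NULL for previously loaded rows. Resource, table, and column names are illustrative; the `deltalake` extra and a local filesystem bucket are assumed:

```python
import dlt
import pyarrow as pa
from dlt.common.libs.deltalake import get_delta_tables

@dlt.resource(table_format="delta", primary_key="pk", write_disposition="append")
def evolving_table(data):
    yield data

pipeline = dlt.pipeline(
    "schema_evolution_demo",
    destination=dlt.destinations.filesystem("_storage/schema_evolution_demo"),
)

# initial load: a single-column Arrow table
pipeline.run(evolving_table(pa.table({"pk": [1]})))

# second load: the extra column should be added to the table schema
pipeline.run(evolving_table(pa.table({"pk": [2, 3], "new_col": ["a", "b"]})))

dt = get_delta_tables(pipeline, "evolving_table")["evolving_table"]
# `new_col` is expected to be NULL for the row loaded before the column existed
print(dt.to_pyarrow_table().to_pydict())
```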
""" - from dlt.common.libs.deltalake import ensure_delta_compatible_arrow_data, get_delta_tables from tests.pipeline.utils import users_materialize_table_schema - @dlt.resource(table_format="delta") - def delta_table(data): + def get_table_version( # type: ignore[return] + pipeline: dlt.Pipeline, + table_name: str, + table_format: TTableFormat, + ) -> int: + if table_format == "delta": + from dlt.common.libs.deltalake import get_delta_tables + + dt = get_delta_tables(pipeline, table_name)[table_name] + return dt.version() + elif table_format == "iceberg": + from dlt.common.libs.pyiceberg import get_iceberg_tables + + it = get_iceberg_tables(pipeline, table_name)[table_name] + return it.last_sequence_number - 1 # subtract 1 to match `delta` + + @dlt.resource(table_format=destination_config.table_format) + def a_table(data): yield data # create empty Arrow table with schema @@ -779,61 +868,62 @@ def delta_table(data): # run 1: empty Arrow table with schema # this should create empty Delta table with same schema as Arrow table - info = pipeline.run(delta_table(empty_arrow_table)) + info = pipeline.run(a_table(empty_arrow_table)) assert_load_info(info) - dt = get_delta_tables(pipeline, "delta_table")["delta_table"] - assert dt.version() == 0 - dt_arrow_table = dt.to_pyarrow_table() - assert dt_arrow_table.shape == (0, empty_arrow_table.num_columns) - assert dt_arrow_table.schema.equals( - ensure_delta_compatible_arrow_data(empty_arrow_table).schema + assert get_table_version(pipeline, "a_table", destination_config.table_format) == 0 + expected, actual = get_expected_actual( + pipeline, "a_table", destination_config.table_format, empty_arrow_table ) + assert actual.shape == (0, expected.num_columns) + assert actual.schema.equals(expected.schema) # run 2: non-empty Arrow table with same schema as run 1 # this should load records into Delta table - info = pipeline.run(delta_table(arrow_table)) + info = pipeline.run(a_table(arrow_table)) assert_load_info(info) - dt = get_delta_tables(pipeline, "delta_table")["delta_table"] - assert dt.version() == 1 - dt_arrow_table = dt.to_pyarrow_table() - assert dt_arrow_table.shape == (2, empty_arrow_table.num_columns) - assert dt_arrow_table.schema.equals( - ensure_delta_compatible_arrow_data(empty_arrow_table).schema + assert get_table_version(pipeline, "a_table", destination_config.table_format) == 1 + expected, actual = get_expected_actual( + pipeline, "a_table", destination_config.table_format, empty_arrow_table ) + assert actual.shape == (2, expected.num_columns) + assert actual.schema.equals(expected.schema) # now run the empty frame again - info = pipeline.run(delta_table(empty_arrow_table)) + info = pipeline.run(a_table(empty_arrow_table)) assert_load_info(info) - # use materialized list - # NOTE: this will create an empty parquet file with a schema takes from dlt schema. - # the original parquet file had a nested (struct) type in `json` field that is now - # in the delta table schema. the empty parquet file lost this information and had - # string type (converted from dlt `json`) - info = pipeline.run([dlt.mark.materialize_table_schema()], table_name="delta_table") - assert_load_info(info) + if destination_config.table_format == "delta": + # use materialized list + # NOTE: this will create an empty parquet file with a schema takes from dlt schema. + # the original parquet file had a nested (struct) type in `json` field that is now + # in the delta table schema. 
the empty parquet file lost this information and had + # string type (converted from dlt `json`) + info = pipeline.run([dlt.mark.materialize_table_schema()], table_name="a_table") + assert_load_info(info) # test `dlt.mark.materialize_table_schema()` - users_materialize_table_schema.apply_hints(table_format="delta") + users_materialize_table_schema.apply_hints(table_format=destination_config.table_format) info = pipeline.run(users_materialize_table_schema(), loader_file_format="parquet") assert_load_info(info) - dt = get_delta_tables(pipeline, "users")["users"] - assert dt.version() == 0 - dt_arrow_table = dt.to_pyarrow_table() - assert dt_arrow_table.num_rows == 0 - assert "id", "name" == dt_arrow_table.schema.names[:2] + assert get_table_version(pipeline, "users", destination_config.table_format) == 0 + _, actual = get_expected_actual( + pipeline, "users", destination_config.table_format, empty_arrow_table + ) + assert actual.num_rows == 0 + assert "id", "name" == actual.schema.names[:2] @pytest.mark.parametrize( "destination_config", destinations_configs( table_format_filesystem_configs=True, + # job orchestration is same across table formats—no need to test all formats with_table_format="delta", bucket_subset=(FILE_BUCKET), ), ids=lambda x: x.name, ) -def test_delta_table_mixed_source( +def test_table_format_mixed_source( destination_config: DestinationTestConfiguration, ) -> None: """Tests file format handling in mixed source. @@ -877,12 +967,13 @@ def s(): "destination_config", destinations_configs( table_format_filesystem_configs=True, + # job orchestration is same across table formats—no need to test all formats with_table_format="delta", bucket_subset=(FILE_BUCKET), ), ids=lambda x: x.name, ) -def test_delta_table_dynamic_dispatch( +def test_table_format_dynamic_dispatch( destination_config: DestinationTestConfiguration, ) -> None: @dlt.resource(primary_key="id", table_name=lambda i: i["type"], table_format="delta") @@ -905,80 +996,96 @@ def github_events(): "destination_config", destinations_configs( table_format_filesystem_configs=True, - with_table_format="delta", + with_table_format=("delta", "iceberg"), bucket_subset=(FILE_BUCKET, AZ_BUCKET), ), ids=lambda x: x.name, ) -def test_delta_table_get_delta_tables_helper( +def test_table_format_get_tables_helper( destination_config: DestinationTestConfiguration, ) -> None: - """Tests `get_delta_tables` helper function.""" - from dlt.common.libs.deltalake import DeltaTable, get_delta_tables + """Tests `get_delta_tables` / `get_iceberg_tables` helper functions.""" + get_tables: Any + if destination_config.table_format == "delta": + from dlt.common.libs.deltalake import DeltaTable, get_delta_tables - @dlt.resource(table_format="delta") - def foo_delta(): + get_tables = get_delta_tables + get_num_rows = lambda table: table.to_pyarrow_table().num_rows + elif destination_config.table_format == "iceberg": + from dlt.common.libs.pyiceberg import IcebergTable, get_iceberg_tables + + get_tables = get_iceberg_tables + get_num_rows = lambda table: table.scan().to_arrow().num_rows + + @dlt.resource(table_format=destination_config.table_format) + def foo_table_format(): yield [{"foo": 1}, {"foo": 2}] - @dlt.resource(table_format="delta") - def bar_delta(): + @dlt.resource(table_format=destination_config.table_format) + def bar_table_format(): yield [{"bar": 1}] @dlt.resource - def baz_not_delta(): + def baz_not_table_format(): yield [{"baz": 1}] pipeline = destination_config.setup_pipeline("fs_pipe", dev_mode=True) - info = 
pipeline.run(foo_delta()) + info = pipeline.run(foo_table_format()) assert_load_info(info) - delta_tables = get_delta_tables(pipeline) - assert delta_tables.keys() == {"foo_delta"} - assert isinstance(delta_tables["foo_delta"], DeltaTable) - assert delta_tables["foo_delta"].to_pyarrow_table().num_rows == 2 - - info = pipeline.run([foo_delta(), bar_delta(), baz_not_delta()]) + tables = get_tables(pipeline) + assert tables.keys() == {"foo_table_format"} + if destination_config.table_format == "delta": + assert isinstance(tables["foo_table_format"], DeltaTable) + elif destination_config.table_format == "iceberg": + assert isinstance(tables["foo_table_format"], IcebergTable) + assert get_num_rows(tables["foo_table_format"]) == 2 + + info = pipeline.run([foo_table_format(), bar_table_format(), baz_not_table_format()]) assert_load_info(info) - delta_tables = get_delta_tables(pipeline) - assert delta_tables.keys() == {"foo_delta", "bar_delta"} - assert delta_tables["bar_delta"].to_pyarrow_table().num_rows == 1 - assert get_delta_tables(pipeline, "foo_delta").keys() == {"foo_delta"} - assert get_delta_tables(pipeline, "bar_delta").keys() == {"bar_delta"} - assert get_delta_tables(pipeline, "foo_delta", "bar_delta").keys() == {"foo_delta", "bar_delta"} + tables = get_tables(pipeline) + assert tables.keys() == {"foo_table_format", "bar_table_format"} + assert get_num_rows(tables["bar_table_format"]) == 1 + assert get_tables(pipeline, "foo_table_format").keys() == {"foo_table_format"} + assert get_tables(pipeline, "bar_table_format").keys() == {"bar_table_format"} + assert get_tables(pipeline, "foo_table_format", "bar_table_format").keys() == { + "foo_table_format", + "bar_table_format", + } # test with child table - @dlt.resource(table_format="delta") - def parent_delta(): + @dlt.resource(table_format=destination_config.table_format) + def parent_table_format(): yield [{"foo": 1, "child": [1, 2, 3]}] - info = pipeline.run(parent_delta()) + info = pipeline.run(parent_table_format()) assert_load_info(info) - delta_tables = get_delta_tables(pipeline) - assert "parent_delta__child" in delta_tables.keys() - assert delta_tables["parent_delta__child"].to_pyarrow_table().num_rows == 3 + tables = get_tables(pipeline) + assert "parent_table_format__child" in tables.keys() + assert get_num_rows(tables["parent_table_format__child"]) == 3 # test invalid input with pytest.raises(ValueError): - get_delta_tables(pipeline, "baz_not_delta") + get_tables(pipeline, "baz_not_table_format") with pytest.raises(ValueError): - get_delta_tables(pipeline, "non_existing_table") + get_tables(pipeline, "non_existing_table") # test unknown schema with pytest.raises(FileNotFoundError): - get_delta_tables(pipeline, "non_existing_table", schema_name="aux_2") + get_tables(pipeline, "non_existing_table", schema_name="aux_2") # load to a new schema and under new name aux_schema = dlt.Schema("aux_2") # NOTE: you cannot have a file with name - info = pipeline.run(parent_delta().with_name("aux_delta"), schema=aux_schema) + info = pipeline.run(parent_table_format().with_name("aux_table"), schema=aux_schema) # also state in seprate package assert_load_info(info, expected_load_packages=2) - delta_tables = get_delta_tables(pipeline, schema_name="aux_2") - assert "aux_delta__child" in delta_tables.keys() - get_delta_tables(pipeline, "aux_delta", schema_name="aux_2") + tables = get_tables(pipeline, schema_name="aux_2") + assert "aux_table__child" in tables.keys() + get_tables(pipeline, "aux_table", schema_name="aux_2") with 
pytest.raises(ValueError): - get_delta_tables(pipeline, "aux_delta") + get_tables(pipeline, "aux_table") @pytest.mark.parametrize( diff --git a/tests/load/pipeline/test_pipelines.py b/tests/load/pipeline/test_pipelines.py index 9190225a8c..b998b78471 100644 --- a/tests/load/pipeline/test_pipelines.py +++ b/tests/load/pipeline/test_pipelines.py @@ -10,7 +10,6 @@ from dlt.common.pipeline import SupportsPipeline from dlt.common.destination import Destination from dlt.common.destination.reference import WithStagingDataset -from dlt.common.schema.exceptions import CannotCoerceColumnException from dlt.common.schema.schema import Schema from dlt.common.schema.typing import VERSION_TABLE_NAME from dlt.common.schema.utils import new_table diff --git a/tests/load/pipeline/test_postgres.py b/tests/load/pipeline/test_postgres.py index 29ad21941e..e09582f8a8 100644 --- a/tests/load/pipeline/test_postgres.py +++ b/tests/load/pipeline/test_postgres.py @@ -127,177 +127,6 @@ def test_pipeline_explicit_destination_credentials( ) -# do not remove - it allows us to filter tests by destination -@pytest.mark.parametrize( - "destination_config", - destinations_configs(default_sql_configs=True, subset=["postgres"]), - ids=lambda x: x.name, -) -def test_pipeline_with_sources_sharing_schema( - destination_config: DestinationTestConfiguration, -) -> None: - schema = Schema("shared") - - @dlt.source(schema=schema, max_table_nesting=1) - def source_1(): - @dlt.resource(primary_key="user_id") - def gen1(): - dlt.current.source_state()["source_1"] = True - dlt.current.resource_state()["source_1"] = True - yield {"id": "Y", "user_id": "user_y"} - - @dlt.resource(columns={"col": {"data_type": "bigint"}}) - def conflict(): - yield "conflict" - - return gen1, conflict - - @dlt.source(schema=schema, max_table_nesting=2) - def source_2(): - @dlt.resource(primary_key="id") - def gen1(): - dlt.current.source_state()["source_2"] = True - dlt.current.resource_state()["source_2"] = True - yield {"id": "X", "user_id": "user_X"} - - def gen2(): - yield from "CDE" - - @dlt.resource(columns={"col": {"data_type": "bool"}}, selected=False) - def conflict(): - yield "conflict" - - return gen2, gen1, conflict - - # all selected tables with hints should be there - discover_1 = source_1().discover_schema() - assert "gen1" in discover_1.tables - assert discover_1.tables["gen1"]["columns"]["user_id"]["primary_key"] is True - assert "data_type" not in discover_1.tables["gen1"]["columns"]["user_id"] - assert "conflict" in discover_1.tables - assert discover_1.tables["conflict"]["columns"]["col"]["data_type"] == "bigint" - - discover_2 = source_2().discover_schema() - assert "gen1" in discover_2.tables - assert "gen2" in discover_2.tables - # conflict deselected - assert "conflict" not in discover_2.tables - - p = dlt.pipeline(pipeline_name="multi", destination="duckdb", dev_mode=True) - p.extract([source_1(), source_2()], table_format=destination_config.table_format) - default_schema = p.default_schema - gen1_table = default_schema.tables["gen1"] - assert "user_id" in gen1_table["columns"] - assert "id" in gen1_table["columns"] - assert "conflict" in default_schema.tables - assert "gen2" in default_schema.tables - p.normalize(loader_file_format=destination_config.file_format) - assert "gen2" in default_schema.tables - p.load() - table_names = [t["name"] for t in default_schema.data_tables()] - counts = load_table_counts(p, *table_names) - assert counts == {"gen1": 2, "gen2": 3, "conflict": 1} - # both sources share the same state - assert 
p.state["sources"] == { - "shared": { - "source_1": True, - "resources": {"gen1": {"source_1": True, "source_2": True}}, - "source_2": True, - } - } - drop_active_pipeline_data() - - # same pipeline but enable conflict - p = dlt.pipeline(pipeline_name="multi", destination="duckdb", dev_mode=True) - with pytest.raises(PipelineStepFailed) as py_ex: - p.extract([source_1(), source_2().with_resources("conflict")]) - assert isinstance(py_ex.value.__context__, CannotCoerceColumnException) - - -# do not remove - it allows us to filter tests by destination -@pytest.mark.parametrize( - "destination_config", - destinations_configs(default_sql_configs=True, subset=["postgres"]), - ids=lambda x: x.name, -) -def test_many_pipelines_single_dataset(destination_config: DestinationTestConfiguration) -> None: - schema = Schema("shared") - - @dlt.source(schema=schema, max_table_nesting=1) - def source_1(): - @dlt.resource(primary_key="user_id") - def gen1(): - dlt.current.source_state()["source_1"] = True - dlt.current.resource_state()["source_1"] = True - yield {"id": "Y", "user_id": "user_y"} - - return gen1 - - @dlt.source(schema=schema, max_table_nesting=2) - def source_2(): - @dlt.resource(primary_key="id") - def gen1(): - dlt.current.source_state()["source_2"] = True - dlt.current.resource_state()["source_2"] = True - yield {"id": "X", "user_id": "user_X"} - - def gen2(): - yield from "CDE" - - return gen2, gen1 - - # load source_1 to common dataset - p = dlt.pipeline( - pipeline_name="source_1_pipeline", destination="duckdb", dataset_name="shared_dataset" - ) - p.run(source_1(), credentials="duckdb:///_storage/test_quack.duckdb") - counts = load_table_counts(p, *p.default_schema.tables.keys()) - assert counts.items() >= {"gen1": 1, "_dlt_pipeline_state": 1, "_dlt_loads": 1}.items() - p._wipe_working_folder() - p.deactivate() - - p = dlt.pipeline( - pipeline_name="source_2_pipeline", destination="duckdb", dataset_name="shared_dataset" - ) - p.run(source_2(), credentials="duckdb:///_storage/test_quack.duckdb") - # table_names = [t["name"] for t in p.default_schema.data_tables()] - counts = load_table_counts(p, *p.default_schema.tables.keys()) - # gen1: one record comes from source_1, 1 record from source_2 - assert counts.items() >= {"gen1": 2, "_dlt_pipeline_state": 2, "_dlt_loads": 2}.items() - # assert counts == {'gen1': 2, 'gen2': 3} - p._wipe_working_folder() - p.deactivate() - - # restore from destination, check state - p = dlt.pipeline( - pipeline_name="source_1_pipeline", - destination=dlt.destinations.duckdb(credentials="duckdb:///_storage/test_quack.duckdb"), - dataset_name="shared_dataset", - ) - p.sync_destination() - # we have our separate state - assert p.state["sources"]["shared"] == { - "source_1": True, - "resources": {"gen1": {"source_1": True}}, - } - # but the schema was common so we have the earliest one - assert "gen2" in p.default_schema.tables - p._wipe_working_folder() - p.deactivate() - - p = dlt.pipeline( - pipeline_name="source_2_pipeline", - destination=dlt.destinations.duckdb(credentials="duckdb:///_storage/test_quack.duckdb"), - dataset_name="shared_dataset", - ) - p.sync_destination() - # we have our separate state - assert p.state["sources"]["shared"] == { - "source_2": True, - "resources": {"gen1": {"source_2": True}}, - } - - # TODO: uncomment and finalize when we implement encoding for psycopg2 # @pytest.mark.parametrize( # "destination_config", diff --git a/tests/load/pipeline/test_refresh_modes.py b/tests/load/pipeline/test_refresh_modes.py index 
86479acd2b..fb88ba915c 100644 --- a/tests/load/pipeline/test_refresh_modes.py +++ b/tests/load/pipeline/test_refresh_modes.py @@ -1,5 +1,5 @@ from typing import Any, List - +import os import pytest import dlt from dlt.common.destination.exceptions import DestinationUndefinedEntity @@ -12,7 +12,7 @@ from dlt.extract.source import DltSource from dlt.pipeline.state_sync import load_pipeline_state_from_destination -from tests.utils import clean_test_storage +from tests.utils import clean_test_storage, TEST_STORAGE_ROOT from tests.pipeline.utils import ( _is_filesystem, assert_load_info, @@ -106,19 +106,40 @@ def some_data_4(): ), ids=lambda x: x.name, ) -def test_refresh_drop_sources(destination_config: DestinationTestConfiguration): - pipeline = destination_config.setup_pipeline("refresh_full_test", refresh="drop_sources") +@pytest.mark.parametrize("in_source", (True, False)) +@pytest.mark.parametrize("with_wipe", (True, False)) +def test_refresh_drop_sources( + destination_config: DestinationTestConfiguration, in_source: bool, with_wipe: bool +): + # do not place duckdb in the working dir, because we may wipe it + os.environ["DESTINATION__DUCKDB__CREDENTIALS"] = os.path.join( + TEST_STORAGE_ROOT, "refresh_source_db.duckdb" + ) + + pipeline = destination_config.setup_pipeline("refresh_source") + + data: Any = refresh_source(first_run=True, drop_sources=True) + if not in_source: + data = list(data.selected_resources.values()) # First run pipeline so destination so tables are created - info = pipeline.run( - refresh_source(first_run=True, drop_sources=True), **destination_config.run_kwargs - ) + info = pipeline.run(data, refresh="drop_sources", **destination_config.run_kwargs) assert_load_info(info) + # Second run of pipeline with only selected resources + if with_wipe: + pipeline._wipe_working_folder() + pipeline = destination_config.setup_pipeline("refresh_source") + + data = refresh_source(first_run=False, drop_sources=True).with_resources( + "some_data_1", "some_data_2" + ) + if not in_source: + data = list(data.selected_resources.values()) + info = pipeline.run( - refresh_source(first_run=False, drop_sources=True).with_resources( - "some_data_1", "some_data_2" - ), + data, + refresh="drop_sources", **destination_config.run_kwargs, ) @@ -199,16 +220,37 @@ def test_existing_schema_hash(destination_config: DestinationTestConfiguration): ), ids=lambda x: x.name, ) -def test_refresh_drop_resources(destination_config: DestinationTestConfiguration): +@pytest.mark.parametrize("in_source", (True, False)) +@pytest.mark.parametrize("with_wipe", (True, False)) +def test_refresh_drop_resources( + destination_config: DestinationTestConfiguration, in_source: bool, with_wipe: bool +): + # do not place duckdb in the working dir, because we may wipe it + os.environ["DESTINATION__DUCKDB__CREDENTIALS"] = os.path.join( + TEST_STORAGE_ROOT, "refresh_source_db.duckdb" + ) # First run pipeline with load to destination so tables are created - pipeline = destination_config.setup_pipeline("refresh_full_test", refresh="drop_tables") + pipeline = destination_config.setup_pipeline("refresh_source") - info = pipeline.run(refresh_source(first_run=True), **destination_config.run_kwargs) + data: Any = refresh_source(first_run=True) + if not in_source: + data = list(data.selected_resources.values()) + + info = pipeline.run(data, refresh="drop_resources", **destination_config.run_kwargs) assert_load_info(info) # Second run of pipeline with only selected resources + if with_wipe: + pipeline._wipe_working_folder() + pipeline = 
destination_config.setup_pipeline("refresh_source") + + data = refresh_source(first_run=False).with_resources("some_data_1", "some_data_2") + if not in_source: + data = list(data.selected_resources.values()) + info = pipeline.run( - refresh_source(first_run=False).with_resources("some_data_1", "some_data_2"), + data, + refresh="drop_resources", **destination_config.run_kwargs, ) @@ -309,7 +351,9 @@ def test_refresh_drop_data_only(destination_config: DestinationTestConfiguration @pytest.mark.parametrize( "destination_config", - destinations_configs(default_sql_configs=True, subset=["duckdb"]), + destinations_configs( + default_sql_configs=True, local_filesystem_configs=True, subset=["duckdb", "filesystem"] + ), ids=lambda x: x.name, ) def test_refresh_drop_sources_multiple_sources(destination_config: DestinationTestConfiguration): @@ -364,7 +408,6 @@ def source_2_data_2(): **destination_config.run_kwargs, ) assert_load_info(info, 2) - # breakpoint() info = pipeline.run( refresh_source_2(first_run=False).with_resources("source_2_data_1"), **destination_config.run_kwargs, @@ -388,7 +431,7 @@ def source_2_data_2(): result = sorted([(row["id"], row["name"]) for row in data["some_data_1"]]) assert result == [(1, "John"), (2, "Jane")] - # # First table from source2 exists, with only first column + # First table from source2 exists, with only first column data = load_tables_to_dicts(pipeline, "source_2_data_1", schema_name="refresh_source_2") assert_only_table_columns( pipeline, "source_2_data_1", ["product"], schema_name="refresh_source_2" @@ -396,7 +439,7 @@ def source_2_data_2(): result = sorted([row["product"] for row in data["source_2_data_1"]]) assert result == ["orange", "pear"] - # # Second table from source 2 is gone + # Second table from source 2 is gone assert not table_exists(pipeline, "source_2_data_2", schema_name="refresh_source_2") diff --git a/tests/load/snowflake/test_snowflake_client.py b/tests/load/snowflake/test_snowflake_client.py index aebf514b56..674e01ba31 100644 --- a/tests/load/snowflake/test_snowflake_client.py +++ b/tests/load/snowflake/test_snowflake_client.py @@ -1,14 +1,17 @@ +from copy import deepcopy import os from typing import Iterator from pytest_mock import MockerFixture import pytest -from dlt.destinations.impl.snowflake.snowflake import SnowflakeClient +from dlt.common.schema.schema import Schema +from dlt.destinations.impl.snowflake.snowflake import SUPPORTED_HINTS, SnowflakeClient from dlt.destinations.job_client_impl import SqlJobClientBase from dlt.destinations.sql_client import TJobQueryTags -from tests.load.utils import yield_client_with_storage +from tests.cases import TABLE_UPDATE +from tests.load.utils import yield_client_with_storage, empty_schema # mark all tests as essential, do not remove pytestmark = pytest.mark.essential @@ -32,6 +35,39 @@ def client() -> Iterator[SqlJobClientBase]: yield from yield_client_with_storage("snowflake") +def test_create_table_with_hints(client: SnowflakeClient, empty_schema: Schema) -> None: + mod_update = deepcopy(TABLE_UPDATE[:11]) + # mock hints + client.config.create_indexes = True + client.active_hints = SUPPORTED_HINTS + client.schema = empty_schema + + mod_update[0]["primary_key"] = True + mod_update[5]["primary_key"] = True + + mod_update[0]["sort"] = True + mod_update[4]["parent_key"] = True + + # unique constraints are always single columns + mod_update[1]["unique"] = True + mod_update[7]["unique"] = True + + sql = ";".join(client._get_table_update_sql("event_test_table", mod_update, False)) + + 
print(sql) + client.sql_client.execute_sql(sql) + + # generate alter table + mod_update = deepcopy(TABLE_UPDATE[11:]) + mod_update[0]["primary_key"] = True + mod_update[1]["unique"] = True + + sql = ";".join(client._get_table_update_sql("event_test_table", mod_update, True)) + + print(sql) + client.sql_client.execute_sql(sql) + + def test_query_tag(client: SnowflakeClient, mocker: MockerFixture): assert client.config.query_tag == QUERY_TAG # make sure we generate proper query diff --git a/tests/load/snowflake/test_snowflake_table_builder.py b/tests/load/snowflake/test_snowflake_table_builder.py index 1fc0034f43..43d4395188 100644 --- a/tests/load/snowflake/test_snowflake_table_builder.py +++ b/tests/load/snowflake/test_snowflake_table_builder.py @@ -6,7 +6,7 @@ from dlt.common.utils import uniq_id from dlt.common.schema import Schema, utils from dlt.destinations import snowflake -from dlt.destinations.impl.snowflake.snowflake import SnowflakeClient +from dlt.destinations.impl.snowflake.snowflake import SnowflakeClient, SUPPORTED_HINTS from dlt.destinations.impl.snowflake.configuration import ( SnowflakeClientConfiguration, SnowflakeCredentials, @@ -66,16 +66,63 @@ def test_create_table(snowflake_client: SnowflakeClient) -> None: assert sql.strip().startswith("CREATE TABLE") assert "EVENT_TEST_TABLE" in sql - assert '"COL1" NUMBER(19,0) NOT NULL' in sql - assert '"COL2" FLOAT NOT NULL' in sql - assert '"COL3" BOOLEAN NOT NULL' in sql - assert '"COL4" TIMESTAMP_TZ NOT NULL' in sql + assert '"COL1" NUMBER(19,0) NOT NULL' in sql + assert '"COL2" FLOAT NOT NULL' in sql + assert '"COL3" BOOLEAN NOT NULL' in sql + assert '"COL4" TIMESTAMP_TZ NOT NULL' in sql assert '"COL5" VARCHAR' in sql - assert '"COL6" NUMBER(38,9) NOT NULL' in sql + assert '"COL6" NUMBER(38,9) NOT NULL' in sql assert '"COL7" BINARY' in sql assert '"COL8" NUMBER(38,0)' in sql - assert '"COL9" VARIANT NOT NULL' in sql - assert '"COL10" DATE NOT NULL' in sql + assert '"COL9" VARIANT NOT NULL' in sql + assert '"COL10" DATE NOT NULL' in sql + + +def test_create_table_with_hints(snowflake_client: SnowflakeClient) -> None: + mod_update = deepcopy(TABLE_UPDATE[:11]) + # mock hints + snowflake_client.config.create_indexes = True + snowflake_client.active_hints = SUPPORTED_HINTS + + mod_update[0]["primary_key"] = True + mod_update[5]["primary_key"] = True + + mod_update[0]["sort"] = True + + # unique constraints are always single columns + mod_update[1]["unique"] = True + mod_update[7]["unique"] = True + + mod_update[4]["parent_key"] = True + + sql = ";".join(snowflake_client._get_table_update_sql("event_test_table", mod_update, False)) + + assert sql.strip().startswith("CREATE TABLE") + assert "EVENT_TEST_TABLE" in sql + assert '"COL1" NUMBER(19,0) NOT NULL' in sql + assert '"COL2" FLOAT UNIQUE NOT NULL' in sql + assert '"COL3" BOOLEAN NOT NULL' in sql + assert '"COL4" TIMESTAMP_TZ NOT NULL' in sql + assert '"COL5" VARCHAR' in sql + assert '"COL6" NUMBER(38,9) NOT NULL' in sql + assert '"COL7" BINARY' in sql + assert '"COL8" NUMBER(38,0) UNIQUE' in sql + assert '"COL9" VARIANT NOT NULL' in sql + assert '"COL10" DATE NOT NULL' in sql + + # PRIMARY KEY constraint + assert 'CONSTRAINT "PK_EVENT_TEST_TABLE_' in sql + assert 'PRIMARY KEY ("COL1", "COL6")' in sql + + # generate alter + mod_update = deepcopy(TABLE_UPDATE[11:]) + mod_update[0]["primary_key"] = True + mod_update[1]["unique"] = True + + sql = ";".join(snowflake_client._get_table_update_sql("event_test_table", mod_update, True)) + # PK constraint ignored for alter + assert 
"PRIMARY KEY" not in sql + assert '"COL2_NULL" FLOAT UNIQUE' in sql def test_alter_table(snowflake_client: SnowflakeClient) -> None: @@ -90,15 +137,15 @@ def test_alter_table(snowflake_client: SnowflakeClient) -> None: assert sql.count("ALTER TABLE") == 1 assert sql.count("ADD COLUMN") == 1 assert '"EVENT_TEST_TABLE"' in sql - assert '"COL1" NUMBER(19,0) NOT NULL' in sql - assert '"COL2" FLOAT NOT NULL' in sql - assert '"COL3" BOOLEAN NOT NULL' in sql - assert '"COL4" TIMESTAMP_TZ NOT NULL' in sql + assert '"COL1" NUMBER(19,0) NOT NULL' in sql + assert '"COL2" FLOAT NOT NULL' in sql + assert '"COL3" BOOLEAN NOT NULL' in sql + assert '"COL4" TIMESTAMP_TZ NOT NULL' in sql assert '"COL5" VARCHAR' in sql - assert '"COL6" NUMBER(38,9) NOT NULL' in sql + assert '"COL6" NUMBER(38,9) NOT NULL' in sql assert '"COL7" BINARY' in sql assert '"COL8" NUMBER(38,0)' in sql - assert '"COL9" VARIANT NOT NULL' in sql + assert '"COL9" VARIANT NOT NULL' in sql assert '"COL10" DATE' in sql mod_table = deepcopy(TABLE_UPDATE) @@ -106,7 +153,7 @@ def test_alter_table(snowflake_client: SnowflakeClient) -> None: sql = snowflake_client._get_table_update_sql("event_test_table", mod_table, True)[0] assert '"COL1"' not in sql - assert '"COL2" FLOAT NOT NULL' in sql + assert '"COL2" FLOAT NOT NULL' in sql def test_create_table_case_sensitive(cs_client: SnowflakeClient) -> None: diff --git a/tests/load/sources/sql_database/test_helpers.py b/tests/load/sources/sql_database/test_helpers.py index def5430146..43da9c955f 100644 --- a/tests/load/sources/sql_database/test_helpers.py +++ b/tests/load/sources/sql_database/test_helpers.py @@ -1,3 +1,6 @@ +from typing import Callable, Any, TYPE_CHECKING +from dataclasses import dataclass + import pytest import dlt @@ -14,6 +17,18 @@ pytest.skip("Tests require sql alchemy", allow_module_level=True) +@dataclass +class MockIncremental: + last_value: Any + last_value_func: Callable[[Any], Any] + cursor_path: str + row_order: str = None + end_value: Any = None + on_cursor_value_missing: str = "raise" + range_start: str = "closed" + range_end: str = "open" + + @pytest.mark.parametrize("backend", ["sqlalchemy", "pyarrow", "pandas", "connectorx"]) def test_cursor_or_unique_column_not_in_table( sql_source_db: SQLAlchemySourceDB, backend: TableBackend @@ -36,13 +51,12 @@ def test_make_query_incremental_max( ) -> None: """Verify query is generated according to incremental settings""" - class MockIncremental: - last_value = dlt.common.pendulum.now() - last_value_func = max - cursor_path = "created_at" - row_order = "asc" - end_value = None - on_cursor_value_missing = "raise" + incremental = MockIncremental( + last_value=dlt.common.pendulum.now(), + last_value_func=max, + cursor_path="created_at", + row_order="asc", + ) table = sql_source_db.get_table("chat_message") loader = TableLoader( @@ -50,14 +64,14 @@ class MockIncremental: backend, table, table_to_columns(table), - incremental=MockIncremental(), # type: ignore[arg-type] + incremental=incremental, # type: ignore[arg-type] ) query = loader.make_query() expected = ( table.select() .order_by(table.c.created_at.asc()) - .where(table.c.created_at >= MockIncremental.last_value) + .where(table.c.created_at >= incremental.last_value) ) assert query.compare(expected) @@ -67,13 +81,14 @@ class MockIncremental: def test_make_query_incremental_min( sql_source_db: SQLAlchemySourceDB, backend: TableBackend ) -> None: - class MockIncremental: - last_value = dlt.common.pendulum.now() - last_value_func = min - cursor_path = "created_at" - row_order = 
"desc" - end_value = None - on_cursor_value_missing = "raise" + incremental = MockIncremental( + last_value=dlt.common.pendulum.now(), + last_value_func=min, + cursor_path="created_at", + row_order="desc", + end_value=None, + on_cursor_value_missing="raise", + ) table = sql_source_db.get_table("chat_message") loader = TableLoader( @@ -81,14 +96,14 @@ class MockIncremental: backend, table, table_to_columns(table), - incremental=MockIncremental(), # type: ignore[arg-type] + incremental=incremental, # type: ignore[arg-type] ) query = loader.make_query() expected = ( table.select() .order_by(table.c.created_at.asc()) # `min` func swaps order - .where(table.c.created_at <= MockIncremental.last_value) + .where(table.c.created_at <= incremental.last_value) ) assert query.compare(expected) @@ -103,13 +118,14 @@ def test_make_query_incremental_on_cursor_value_missing_set( with_end_value: bool, cursor_value_missing: str, ) -> None: - class MockIncremental: - last_value = dlt.common.pendulum.now() - last_value_func = max - cursor_path = "created_at" - row_order = "asc" - end_value = None if not with_end_value else dlt.common.pendulum.now().add(hours=1) - on_cursor_value_missing = cursor_value_missing + incremental = MockIncremental( + last_value=dlt.common.pendulum.now(), + last_value_func=max, + cursor_path="created_at", + row_order="asc", + end_value=None if not with_end_value else dlt.common.pendulum.now().add(hours=1), + on_cursor_value_missing=cursor_value_missing, + ) table = sql_source_db.get_table("chat_message") loader = TableLoader( @@ -117,7 +133,7 @@ class MockIncremental: backend, table, table_to_columns(table), - incremental=MockIncremental(), # type: ignore[arg-type] + incremental=incremental, # type: ignore[arg-type] ) query = loader.make_query() @@ -131,14 +147,14 @@ class MockIncremental: if with_end_value: where_clause = operator( sa.and_( - table.c.created_at >= MockIncremental.last_value, - table.c.created_at < MockIncremental.end_value, + table.c.created_at >= incremental.last_value, + table.c.created_at < incremental.end_value, ), missing_cond, ) else: where_clause = operator( - table.c.created_at >= MockIncremental.last_value, + table.c.created_at >= incremental.last_value, missing_cond, ) expected = table.select().order_by(table.c.created_at.asc()).where(where_clause) @@ -152,13 +168,14 @@ def test_make_query_incremental_on_cursor_value_missing_no_last_value( backend: TableBackend, cursor_value_missing: str, ) -> None: - class MockIncremental: - last_value = None - last_value_func = max - cursor_path = "created_at" - row_order = "asc" - end_value = None - on_cursor_value_missing = cursor_value_missing + incremental = MockIncremental( + last_value=None, + last_value_func=max, + cursor_path="created_at", + row_order="asc", + end_value=None, + on_cursor_value_missing=cursor_value_missing, + ) table = sql_source_db.get_table("chat_message") loader = TableLoader( @@ -166,7 +183,7 @@ class MockIncremental: backend, table, table_to_columns(table), - incremental=MockIncremental(), # type: ignore[arg-type] + incremental=incremental, # type: ignore[arg-type] ) query = loader.make_query() @@ -189,13 +206,14 @@ def test_make_query_incremental_end_value( ) -> None: now = dlt.common.pendulum.now() - class MockIncremental: - last_value = now - last_value_func = min - cursor_path = "created_at" - end_value = now.add(hours=1) - row_order = None - on_cursor_value_missing = "raise" + incremental = MockIncremental( + last_value=now, + last_value_func=min, + cursor_path="created_at", + 
end_value=now.add(hours=1), + row_order=None, + on_cursor_value_missing="raise", + ) table = sql_source_db.get_table("chat_message") loader = TableLoader( @@ -203,14 +221,14 @@ class MockIncremental: backend, table, table_to_columns(table), - incremental=MockIncremental(), # type: ignore[arg-type] + incremental=incremental, # type: ignore[arg-type] ) query = loader.make_query() expected = table.select().where( sa.and_( - table.c.created_at <= MockIncremental.last_value, - table.c.created_at > MockIncremental.end_value, + table.c.created_at <= incremental.last_value, + table.c.created_at > incremental.end_value, ) ) @@ -221,13 +239,14 @@ class MockIncremental: def test_make_query_incremental_any_fun( sql_source_db: SQLAlchemySourceDB, backend: TableBackend ) -> None: - class MockIncremental: - last_value = dlt.common.pendulum.now() - last_value_func = lambda x: x[-1] - cursor_path = "created_at" - row_order = "asc" - end_value = dlt.common.pendulum.now() - on_cursor_value_missing = "raise" + incremental = MockIncremental( + last_value=dlt.common.pendulum.now(), + last_value_func=lambda x: x[-1], + cursor_path="created_at", + row_order="asc", + end_value=dlt.common.pendulum.now(), + on_cursor_value_missing="raise", + ) table = sql_source_db.get_table("chat_message") loader = TableLoader( @@ -235,7 +254,7 @@ class MockIncremental: backend, table, table_to_columns(table), - incremental=MockIncremental(), # type: ignore[arg-type] + incremental=incremental, # type: ignore[arg-type] ) query = loader.make_query() @@ -256,12 +275,11 @@ def test_cursor_path_field_name_with_a_special_chars( if special_field_name not in table.c: table.append_column(sa.Column(special_field_name, sa.String)) - class MockIncremental: - cursor_path = "'id$field'" - last_value = None - end_value = None - row_order = None - on_cursor_value_missing = None + incremental = MockIncremental( + cursor_path="'id$field'", + last_value=None, + last_value_func=max, + ) # Should not raise any exception loader = TableLoader( @@ -269,7 +287,7 @@ class MockIncremental: backend, table, table_to_columns(table), - incremental=MockIncremental(), # type: ignore[arg-type] + incremental=incremental, # type: ignore[arg-type] ) assert loader.cursor_column == table.c[special_field_name] @@ -281,12 +299,11 @@ def test_cursor_path_multiple_fields( """Test that a cursor_path with multiple fields raises a ValueError.""" table = sql_source_db.get_table("chat_message") - class MockIncremental: - cursor_path = "created_at,updated_at" - last_value = None - end_value = None - row_order = None - on_cursor_value_missing = None + incremental = MockIncremental( + cursor_path="created_at,updated_at", + last_value=None, + last_value_func=max, + ) with pytest.raises(ValueError) as excinfo: TableLoader( @@ -294,7 +311,7 @@ class MockIncremental: backend, table, table_to_columns(table), - incremental=MockIncremental(), # type: ignore[arg-type] + incremental=incremental, # type: ignore[arg-type] ) assert "must be a simple column name" in str(excinfo.value) @@ -306,12 +323,11 @@ def test_cursor_path_complex_expression( """Test that a complex JSONPath expression in cursor_path raises a ValueError.""" table = sql_source_db.get_table("chat_message") - class MockIncremental: - cursor_path = "$.users[0].id" - last_value = None - end_value = None - row_order = None - on_cursor_value_missing = None + incremental = MockIncremental( + cursor_path="$.users[0].id", + last_value=None, + last_value_func=max, + ) with pytest.raises(ValueError) as excinfo: TableLoader( @@ -319,11 
+335,80 @@ class MockIncremental: backend, table, table_to_columns(table), - incremental=MockIncremental(), # type: ignore[arg-type] + incremental=incremental, # type: ignore[arg-type] ) assert "must be a simple column name" in str(excinfo.value) +@pytest.mark.parametrize("backend", ["sqlalchemy", "pyarrow", "pandas", "connectorx"]) +@pytest.mark.parametrize("last_value_func", [min, max]) +def test_make_query_incremental_range_start_open( + sql_source_db: SQLAlchemySourceDB, backend: TableBackend, last_value_func: Callable[[Any], Any] +) -> None: + incremental = MockIncremental( + last_value=dlt.common.pendulum.now(), + last_value_func=last_value_func, + cursor_path="created_at", + end_value=None, + on_cursor_value_missing="raise", + range_start="open", + ) + + table = sql_source_db.get_table("chat_message") + + loader = TableLoader( + sql_source_db.engine, + backend, + table, + table_to_columns(table), + incremental=incremental, # type: ignore[arg-type] + ) + + query = loader.make_query() + expected = table.select() + + if last_value_func == min: + expected = expected.where(table.c.created_at < incremental.last_value) + else: + expected = expected.where(table.c.created_at > incremental.last_value) + + assert query.compare(expected) + + +@pytest.mark.parametrize("backend", ["sqlalchemy", "pyarrow", "pandas", "connectorx"]) +@pytest.mark.parametrize("last_value_func", [min, max]) +def test_make_query_incremental_range_end_closed( + sql_source_db: SQLAlchemySourceDB, backend: TableBackend, last_value_func: Callable[[Any], Any] +) -> None: + incremental = MockIncremental( + last_value=dlt.common.pendulum.now(), + last_value_func=last_value_func, + cursor_path="created_at", + end_value=None, + on_cursor_value_missing="raise", + range_end="closed", + ) + + table = sql_source_db.get_table("chat_message") + loader = TableLoader( + sql_source_db.engine, + backend, + table, + table_to_columns(table), + incremental=incremental, # type: ignore[arg-type] + ) + + query = loader.make_query() + expected = table.select() + + if last_value_func == min: + expected = expected.where(table.c.created_at <= incremental.last_value) + else: + expected = expected.where(table.c.created_at >= incremental.last_value) + + assert query.compare(expected) + + def mock_json_column(field: str) -> TDataItem: """""" import pyarrow as pa diff --git a/tests/load/sources/sql_database/test_sql_database_source.py b/tests/load/sources/sql_database/test_sql_database_source.py index 9079638586..2de923fe38 100644 --- a/tests/load/sources/sql_database/test_sql_database_source.py +++ b/tests/load/sources/sql_database/test_sql_database_source.py @@ -13,6 +13,7 @@ from dlt.common.utils import uniq_id from dlt.extract.exceptions import ResourceExtractionError +from dlt.extract.incremental.transform import JsonIncremental, ArrowIncremental from dlt.sources import DltResource from tests.pipeline.utils import ( @@ -831,8 +832,12 @@ def _assert_incremental(item): else: assert _r.incremental.primary_key == ["id"] assert _r.incremental._incremental.primary_key == ["id"] - assert _r.incremental._incremental._transformers["json"].primary_key == ["id"] - assert _r.incremental._incremental._transformers["arrow"].primary_key == ["id"] + assert _r.incremental._incremental._make_or_get_transformer( + JsonIncremental + ).primary_key == ["id"] + assert _r.incremental._incremental._make_or_get_transformer( + ArrowIncremental + ).primary_key == ["id"] return item pipeline = make_pipeline("duckdb") @@ -841,8 +846,12 @@ def _assert_incremental(item): assert 
resource.incremental.primary_key == ["id"] assert resource.incremental._incremental.primary_key == ["id"] - assert resource.incremental._incremental._transformers["json"].primary_key == ["id"] - assert resource.incremental._incremental._transformers["arrow"].primary_key == ["id"] + assert resource.incremental._incremental._make_or_get_transformer( + JsonIncremental + ).primary_key == ["id"] + assert resource.incremental._incremental._make_or_get_transformer( + ArrowIncremental + ).primary_key == ["id"] @pytest.mark.parametrize("backend", ["sqlalchemy", "pyarrow", "pandas", "connectorx"]) @@ -1277,10 +1286,7 @@ def assert_no_precision_columns( ) -> None: actual = list(columns.values()) # we always infer and emit nullability - expected = cast( - List[TColumnSchema], - deepcopy(NULL_NO_PRECISION_COLUMNS if nullable else NOT_NULL_NO_PRECISION_COLUMNS), - ) + expected = deepcopy(NULL_NO_PRECISION_COLUMNS if nullable else NOT_NULL_NO_PRECISION_COLUMNS) if backend == "pyarrow": expected = cast( List[TColumnSchema], diff --git a/tests/load/test_read_interfaces.py b/tests/load/test_read_interfaces.py index 1a9c8a383b..756002ef7f 100644 --- a/tests/load/test_read_interfaces.py +++ b/tests/load/test_read_interfaces.py @@ -1,5 +1,5 @@ -from typing import Any, cast - +from typing import Any, cast, Tuple, List +import re import pytest import dlt import os @@ -20,9 +20,12 @@ ) from dlt.destinations import filesystem from tests.utils import TEST_STORAGE_ROOT, clean_test_storage -from dlt.common.destination.reference import TDestinationReferenceArg -from dlt.destinations.dataset import ReadableDBAPIDataset, ReadableRelationUnknownColumnException +from dlt.destinations.dataset.dataset import ReadableDBAPIDataset +from dlt.destinations.dataset.exceptions import ( + ReadableRelationUnknownColumnException, +) from tests.load.utils import drop_pipeline_data +from dlt.destinations.dataset import dataset as _dataset EXPECTED_COLUMNS = ["id", "decimal", "other_decimal", "_dlt_load_id", "_dlt_id"] @@ -58,6 +61,7 @@ def autouse_test_storage() -> FileStorage: @pytest.fixture(scope="session") def populated_pipeline(request, autouse_test_storage) -> Any: """fixture that returns a pipeline object populated with the example data""" + destination_config = cast(DestinationTestConfiguration, request.param) if ( @@ -104,6 +108,7 @@ def items(): columns={ "id": {"data_type": "bigint"}, "double_id": {"data_type": "bigint"}, + "di_decimal": {"data_type": "decimal", "precision": 7, "scale": 3}, }, ) def double_items(): @@ -111,6 +116,7 @@ def double_items(): { "id": i, "double_id": i * 2, + "di_decimal": Decimal("10.433"), } for i in range(total_records) ] @@ -151,6 +157,24 @@ def double_items(): ) +@pytest.mark.no_load +@pytest.mark.essential +@pytest.mark.parametrize( + "populated_pipeline", + configs, + indirect=True, + ids=lambda x: x.name, +) +def test_explicit_dataset_type_selection(populated_pipeline: Pipeline): + from dlt.destinations.dataset.dataset import ReadableDBAPIRelation + from dlt.destinations.dataset.ibis_relation import ReadableIbisRelation + + assert isinstance( + populated_pipeline.dataset(dataset_type="default").items, ReadableDBAPIRelation + ) + assert isinstance(populated_pipeline.dataset(dataset_type="ibis").items, ReadableIbisRelation) + + @pytest.mark.no_load @pytest.mark.essential @pytest.mark.parametrize( @@ -160,7 +184,7 @@ def double_items(): ids=lambda x: x.name, ) def test_arrow_access(populated_pipeline: Pipeline) -> None: - table_relationship = populated_pipeline._dataset().items + 
table_relationship = populated_pipeline.dataset().items total_records = _total_records(populated_pipeline) chunk_size = _chunk_size(populated_pipeline) expected_chunk_counts = _expected_chunk_count(populated_pipeline) @@ -193,7 +217,7 @@ def test_arrow_access(populated_pipeline: Pipeline) -> None: ) def test_dataframe_access(populated_pipeline: Pipeline) -> None: # access via key - table_relationship = populated_pipeline._dataset()["items"] + table_relationship = populated_pipeline.dataset()["items"] total_records = _total_records(populated_pipeline) chunk_size = _chunk_size(populated_pipeline) expected_chunk_counts = _expected_chunk_count(populated_pipeline) @@ -210,7 +234,6 @@ def test_dataframe_access(populated_pipeline: Pipeline) -> None: if not skip_df_chunk_size_check: assert len(df.index) == chunk_size - # lowercase results for the snowflake case assert set(df.columns.values) == set(EXPECTED_COLUMNS) # iterate all dataframes @@ -233,7 +256,7 @@ def test_dataframe_access(populated_pipeline: Pipeline) -> None: ) def test_db_cursor_access(populated_pipeline: Pipeline) -> None: # check fetch accessors - table_relationship = populated_pipeline._dataset().items + table_relationship = populated_pipeline.dataset().items total_records = _total_records(populated_pipeline) chunk_size = _chunk_size(populated_pipeline) expected_chunk_counts = _expected_chunk_count(populated_pipeline) @@ -258,71 +281,6 @@ def test_db_cursor_access(populated_pipeline: Pipeline) -> None: assert set(ids) == set(range(total_records)) -@pytest.mark.no_load -@pytest.mark.essential -@pytest.mark.parametrize( - "populated_pipeline", - configs, - indirect=True, - ids=lambda x: x.name, -) -def test_ibis_dataset_access(populated_pipeline: Pipeline) -> None: - # NOTE: we could generalize this with a context for certain deps - import subprocess - - subprocess.check_call( - ["pip", "install", "ibis-framework[duckdb,postgres,bigquery,snowflake,mssql,clickhouse]"] - ) - - from dlt.common.libs.ibis import SUPPORTED_DESTINATIONS - - # check correct error if not supported - if populated_pipeline.destination.destination_type not in SUPPORTED_DESTINATIONS: - with pytest.raises(NotImplementedError): - populated_pipeline._dataset().ibis() - return - - total_records = _total_records(populated_pipeline) - ibis_connection = populated_pipeline._dataset().ibis() - - map_i = lambda x: x - if populated_pipeline.destination.destination_type == "dlt.destinations.snowflake": - map_i = lambda x: x.upper() - - dataset_name = map_i(populated_pipeline.dataset_name) - table_like_statement = None - table_name_prefix = "" - addtional_tables = [] - - # clickhouse has no datasets, but table prefixes and a sentinel table - if populated_pipeline.destination.destination_type == "dlt.destinations.clickhouse": - table_like_statement = dataset_name + "." 
- table_name_prefix = dataset_name + "___" - dataset_name = None - addtional_tables = ["dlt_sentinel_table"] - - add_table_prefix = lambda x: table_name_prefix + x - - # just do a basic check to see wether ibis can connect - assert set(ibis_connection.list_tables(database=dataset_name, like=table_like_statement)) == { - add_table_prefix(map_i(x)) - for x in ( - [ - "_dlt_loads", - "_dlt_pipeline_state", - "_dlt_version", - "double_items", - "items", - "items__children", - ] - + addtional_tables - ) - } - - items_table = ibis_connection.table(add_table_prefix(map_i("items")), database=dataset_name) - assert items_table.count().to_pandas() == total_records - - @pytest.mark.no_load @pytest.mark.essential @pytest.mark.parametrize( @@ -332,7 +290,7 @@ def test_ibis_dataset_access(populated_pipeline: Pipeline) -> None: ids=lambda x: x.name, ) def test_hint_preservation(populated_pipeline: Pipeline) -> None: - table_relationship = populated_pipeline._dataset().items + table_relationship = populated_pipeline.dataset(dataset_type="default").items # check that hints are carried over to arrow table expected_decimal_precision = 10 expected_decimal_precision_2 = 12 @@ -360,10 +318,94 @@ def test_hint_preservation(populated_pipeline: Pipeline) -> None: ) def test_loads_table_access(populated_pipeline: Pipeline) -> None: # check loads table access, we should have one entry - loads_table = populated_pipeline._dataset()[populated_pipeline.default_schema.loads_table_name] + loads_table = populated_pipeline.dataset()[populated_pipeline.default_schema.loads_table_name] assert len(loads_table.fetchall()) == 1 +@pytest.mark.no_load +@pytest.mark.essential +@pytest.mark.parametrize( + "populated_pipeline", + configs, + indirect=True, + ids=lambda x: x.name, +) +def test_row_counts(populated_pipeline: Pipeline) -> None: + total_records = _total_records(populated_pipeline) + + dataset = populated_pipeline.dataset() + # default is all data tables + assert set(dataset.row_counts().df().itertuples(index=False, name=None)) == { + ( + "items", + total_records, + ), + ( + "double_items", + total_records, + ), + ( + "items__children", + total_records * 2, + ), + } + # get only one data table + assert set( + dataset.row_counts(table_names=["items"]).df().itertuples(index=False, name=None) + ) == { + ( + "items", + total_records, + ), + } + # get all dlt tables + assert set( + dataset.row_counts(dlt_tables=True, data_tables=False) + .df() + .itertuples(index=False, name=None) + ) == { + ( + "_dlt_version", + 1, + ), + ( + "_dlt_loads", + 1, + ), + ( + "_dlt_pipeline_state", + 1, + ), + } + # get them all + assert set(dataset.row_counts(dlt_tables=True).df().itertuples(index=False, name=None)) == { + ( + "_dlt_version", + 1, + ), + ( + "_dlt_loads", + 1, + ), + ( + "_dlt_pipeline_state", + 1, + ), + ( + "items", + total_records, + ), + ( + "double_items", + total_records, + ), + ( + "items__children", + total_records * 2, + ), + } + + @pytest.mark.no_load @pytest.mark.essential @pytest.mark.parametrize( @@ -375,7 +417,7 @@ def test_loads_table_access(populated_pipeline: Pipeline) -> None: def test_sql_queries(populated_pipeline: Pipeline) -> None: # simple check that query also works tname = populated_pipeline.sql_client().make_qualified_table_name("items") - query_relationship = populated_pipeline._dataset()(f"select * from {tname} where id < 20") + query_relationship = populated_pipeline.dataset()(f"select * from {tname} where id < 20") # we selected the first 20 table = query_relationship.arrow() @@ -387,7 +429,7 @@ def 
test_sql_queries(populated_pipeline: Pipeline) -> None: f"SELECT i.id, di.double_id FROM {tname} as i JOIN {tdname} as di ON (i.id = di.id) WHERE" " i.id < 20 ORDER BY i.id ASC" ) - join_relationship = populated_pipeline._dataset()(query) + join_relationship = populated_pipeline.dataset()(query) table = join_relationship.fetchall() assert len(table) == 20 assert list(table[0]) == [0, 0] @@ -404,7 +446,7 @@ def test_sql_queries(populated_pipeline: Pipeline) -> None: ids=lambda x: x.name, ) def test_limit_and_head(populated_pipeline: Pipeline) -> None: - table_relationship = populated_pipeline._dataset().items + table_relationship = populated_pipeline.dataset().items assert len(table_relationship.head().fetchall()) == 5 assert len(table_relationship.limit(24).fetchall()) == 24 @@ -425,8 +467,7 @@ def test_limit_and_head(populated_pipeline: Pipeline) -> None: ids=lambda x: x.name, ) def test_column_selection(populated_pipeline: Pipeline) -> None: - table_relationship = populated_pipeline._dataset().items - + table_relationship = populated_pipeline.dataset(dataset_type="default").items columns = ["_dlt_load_id", "other_decimal"] data_frame = table_relationship.select(*columns).head().df() assert [v.lower() for v in data_frame.columns.values] == columns @@ -463,22 +504,283 @@ def test_schema_arg(populated_pipeline: Pipeline) -> None: """Simple test to ensure schemas may be selected via schema arg""" # if there is no arg, the defautl schema is used - dataset = populated_pipeline._dataset() + dataset = populated_pipeline.dataset() assert dataset.schema.name == populated_pipeline.default_schema_name assert "items" in dataset.schema.tables # setting a different schema name will try to load that schema, # not find one and create an empty schema with that name - dataset = populated_pipeline._dataset(schema="unknown_schema") + dataset = populated_pipeline.dataset(schema="unknown_schema") assert dataset.schema.name == "unknown_schema" assert "items" not in dataset.schema.tables # providing the schema name of the right schema will load it - dataset = populated_pipeline._dataset(schema=populated_pipeline.default_schema_name) + dataset = populated_pipeline.dataset(schema=populated_pipeline.default_schema_name) assert dataset.schema.name == populated_pipeline.default_schema_name assert "items" in dataset.schema.tables +@pytest.mark.no_load +@pytest.mark.essential +@pytest.mark.parametrize( + "populated_pipeline", + configs, + indirect=True, + ids=lambda x: x.name, +) +def test_ibis_expression_relation(populated_pipeline: Pipeline) -> None: + # NOTE: we could generalize this with a context for certain deps + import ibis # type: ignore + + # now we should get the more powerful ibis relation + dataset = populated_pipeline.dataset() + total_records = _total_records(populated_pipeline) + + items_table = dataset["items"] + double_items_table = dataset["double_items"] + + # check full table access + df = items_table.df() + assert len(df.index) == total_records + + df = double_items_table.df() + assert len(df.index) == total_records + + # check limit + df = items_table.limit(5).df() + assert len(df.index) == 5 + + # check chained expression with join, column selection, order by and limit + joined_table = ( + items_table.join(double_items_table, items_table.id == double_items_table.id)[ + ["id", "double_id"] + ] + .order_by("id") + .limit(20) + ) + table = joined_table.fetchall() + assert len(table) == 20 + assert list(table[0]) == [0, 0] + assert list(table[5]) == [5, 10] + assert list(table[10]) == [10, 20] + + # 
check aggregate of first 20 items + agg_table = items_table.order_by("id").limit(20).aggregate(sum_id=items_table.id.sum()) + assert agg_table.fetchone()[0] == reduce(lambda a, b: a + b, range(20)) + + # check filtering + filtered_table = items_table.filter(items_table.id < 10) + assert len(filtered_table.fetchall()) == 10 + + if populated_pipeline.destination.destination_type != "dlt.destinations.duckdb": + return + + # we check a bunch of expressions without executing them to see that they produce correct sql + # also we return the keys of the disovered schema columns + def sql_from_expr(expr: Any) -> Tuple[str, List[str]]: + query = str(expr.query).replace(populated_pipeline.dataset_name, "dataset") + columns = list(expr.columns_schema.keys()) if expr.columns_schema else None + return re.sub(r"\s+", " ", query), columns + + # test all functions discussed here: https://ibis-project.org/tutorials/ibis-for-sql-users + ALL_COLUMNS = ["id", "decimal", "other_decimal", "_dlt_load_id", "_dlt_id"] + + # selecting two columns + assert sql_from_expr(items_table.select("id", "decimal")) == ( + 'SELECT "t0"."id", "t0"."decimal" FROM "dataset"."items" AS "t0"', + ["id", "decimal"], + ) + + # selecting all columns + assert sql_from_expr(items_table) == ('SELECT * FROM "dataset"."items"', ALL_COLUMNS) + + # selecting two other columns via item getter + assert sql_from_expr(items_table["id", "decimal"]) == ( + 'SELECT "t0"."id", "t0"."decimal" FROM "dataset"."items" AS "t0"', + ["id", "decimal"], + ) + + # adding a new columns + new_col = (items_table.id * 2).name("new_col") + assert sql_from_expr(items_table.select("id", "decimal", new_col)) == ( + ( + 'SELECT "t0"."id", "t0"."decimal", "t0"."id" * 2 AS "new_col" FROM' + ' "dataset"."items" AS "t0"' + ), + None, + ) + + # mutating table (add a new column computed from existing columns) + assert sql_from_expr( + items_table.mutate(double_id=items_table.id * 2).select("id", "double_id") + ) == ( + 'SELECT "t0"."id", "t0"."id" * 2 AS "double_id" FROM "dataset"."items" AS "t0"', + None, + ) + + # mutating table add new static column + assert sql_from_expr( + items_table.mutate(new_col=ibis.literal("static_value")).select("id", "new_col") + ) == ('SELECT "t0"."id", \'static_value\' AS "new_col" FROM "dataset"."items" AS "t0"', None) + + # check filtering (preserves all columns) + assert sql_from_expr(items_table.filter(items_table.id < 10)) == ( + 'SELECT * FROM "dataset"."items" AS "t0" WHERE "t0"."id" < 10', + ALL_COLUMNS, + ) + + # filtering and selecting a single column + assert sql_from_expr(items_table.filter(items_table.id < 10).select("id")) == ( + 'SELECT "t0"."id" FROM "dataset"."items" AS "t0" WHERE "t0"."id" < 10', + ["id"], + ) + + # check filter "and" condition + assert sql_from_expr(items_table.filter(items_table.id < 10).filter(items_table.id > 5)) == ( + 'SELECT * FROM "dataset"."items" AS "t0" WHERE "t0"."id" < 10 AND "t0"."id" > 5', + ALL_COLUMNS, + ) + + # check filter "or" condition + assert sql_from_expr(items_table.filter((items_table.id < 10) | (items_table.id > 5))) == ( + 'SELECT * FROM "dataset"."items" AS "t0" WHERE ( "t0"."id" < 10 ) OR ( "t0"."id" > 5 )', + ALL_COLUMNS, + ) + + # check group by and aggregate + assert sql_from_expr( + items_table.group_by("id") + .having(items_table.count() >= 1000) + .aggregate(sum_id=items_table.id.sum()) + ) == ( + ( + 'SELECT "t1"."id", "t1"."sum_id" FROM ( SELECT "t0"."id", SUM("t0"."id") AS "sum_id",' + ' COUNT(*) AS "CountStar(items)" FROM "dataset"."items" AS "t0" GROUP BY 1 ) AS "t1"' 
+ ' WHERE "t1"."CountStar(items)" >= 1000' + ), + None, + ) + + # sorting and ordering + assert sql_from_expr(items_table.order_by("id", "decimal").limit(10)) == ( + ( + 'SELECT * FROM "dataset"."items" AS "t0" ORDER BY "t0"."id" ASC, "t0"."decimal" ASC' + " LIMIT 10" + ), + ALL_COLUMNS, + ) + + # sort desc and asc + assert sql_from_expr(items_table.order_by(ibis.desc("id"), ibis.asc("decimal")).limit(10)) == ( + ( + 'SELECT * FROM "dataset"."items" AS "t0" ORDER BY "t0"."id" DESC, "t0"."decimal" ASC' + " LIMIT 10" + ), + ALL_COLUMNS, + ) + + # offset and limit + assert sql_from_expr(items_table.order_by("id").limit(10, offset=5)) == ( + 'SELECT * FROM "dataset"."items" AS "t0" ORDER BY "t0"."id" ASC LIMIT 10 OFFSET 5', + ALL_COLUMNS, + ) + + # join + assert sql_from_expr( + items_table.join(double_items_table, items_table.id == double_items_table.id)[ + ["id", "double_id"] + ] + ) == ( + ( + 'SELECT "t2"."id", "t3"."double_id" FROM "dataset"."items" AS "t2" INNER JOIN' + ' "dataset"."double_items" AS "t3" ON "t2"."id" = "t3"."id"' + ), + None, + ) + + # subqueries + assert sql_from_expr( + items_table.filter(items_table.decimal.isin(double_items_table.di_decimal)) + ) == ( + ( + 'SELECT * FROM "dataset"."items" AS "t0" WHERE "t0"."decimal" IN ( SELECT' + ' "t1"."di_decimal" FROM "dataset"."double_items" AS "t1" )' + ), + ALL_COLUMNS, + ) + + # topk + assert sql_from_expr(items_table.decimal.topk(10)) == ( + ( + 'SELECT * FROM ( SELECT "t0"."decimal", COUNT(*) AS "CountStar(items)" FROM' + ' "dataset"."items" AS "t0" GROUP BY 1 ) AS "t1" ORDER BY "t1"."CountStar(items)" DESC' + " LIMIT 10" + ), + None, + ) + + +@pytest.mark.no_load +@pytest.mark.essential +@pytest.mark.parametrize( + "populated_pipeline", + configs, + indirect=True, + ids=lambda x: x.name, +) +def test_ibis_dataset_access(populated_pipeline: Pipeline) -> None: + # NOTE: we could generalize this with a context for certain deps + + from dlt.helpers.ibis import SUPPORTED_DESTINATIONS + + # check correct error if not supported + if populated_pipeline.destination.destination_type not in SUPPORTED_DESTINATIONS: + with pytest.raises(NotImplementedError): + populated_pipeline.dataset().ibis() + return + + total_records = _total_records(populated_pipeline) + ibis_connection = populated_pipeline.dataset().ibis() + + map_i = lambda x: x + if populated_pipeline.destination.destination_type == "dlt.destinations.snowflake": + map_i = lambda x: x.upper() + + dataset_name = map_i(populated_pipeline.dataset_name) + table_like_statement = None + table_name_prefix = "" + addtional_tables = [] + + # clickhouse has no datasets, but table prefixes and a sentinel table + if populated_pipeline.destination.destination_type == "dlt.destinations.clickhouse": + table_like_statement = dataset_name + "." 
+ table_name_prefix = dataset_name + "___" + dataset_name = None + addtional_tables = ["dlt_sentinel_table"] + + add_table_prefix = lambda x: table_name_prefix + x + + # just do a basic check to see wether ibis can connect + assert set(ibis_connection.list_tables(database=dataset_name, like=table_like_statement)) == { + add_table_prefix(map_i(x)) + for x in ( + [ + "_dlt_loads", + "_dlt_pipeline_state", + "_dlt_version", + "double_items", + "items", + "items__children", + ] + + addtional_tables + ) + } + + items_table = ibis_connection.table(add_table_prefix(map_i("items")), database=dataset_name) + assert items_table.count().to_pandas() == total_records + ibis_connection.disconnect() + + @pytest.mark.no_load @pytest.mark.essential @pytest.mark.parametrize( @@ -491,7 +793,7 @@ def test_standalone_dataset(populated_pipeline: Pipeline) -> None: total_records = _total_records(populated_pipeline) # check dataset factory - dataset = dlt._dataset( + dataset = _dataset( destination=populated_pipeline.destination, dataset_name=populated_pipeline.dataset_name ) # verfiy that sql client and schema are lazy loaded @@ -504,7 +806,7 @@ def test_standalone_dataset(populated_pipeline: Pipeline) -> None: # check that schema is loaded by name dataset = cast( ReadableDBAPIDataset, - dlt._dataset( + _dataset( destination=populated_pipeline.destination, dataset_name=populated_pipeline.dataset_name, schema=populated_pipeline.default_schema_name, @@ -515,7 +817,7 @@ def test_standalone_dataset(populated_pipeline: Pipeline) -> None: # check that schema is not loaded when wrong name given dataset = cast( ReadableDBAPIDataset, - dlt._dataset( + _dataset( destination=populated_pipeline.destination, dataset_name=populated_pipeline.dataset_name, schema="wrong_schema_name", @@ -527,7 +829,7 @@ def test_standalone_dataset(populated_pipeline: Pipeline) -> None: # check that schema is loaded if no schema name given dataset = cast( ReadableDBAPIDataset, - dlt._dataset( + _dataset( destination=populated_pipeline.destination, dataset_name=populated_pipeline.dataset_name, ), @@ -538,7 +840,7 @@ def test_standalone_dataset(populated_pipeline: Pipeline) -> None: # check that there is no error when creating dataset without schema table dataset = cast( ReadableDBAPIDataset, - dlt._dataset( + _dataset( destination=populated_pipeline.destination, dataset_name="unknown_dataset", ), @@ -546,6 +848,7 @@ def test_standalone_dataset(populated_pipeline: Pipeline) -> None: assert dataset.schema.name == "unknown_dataset" assert "items" not in dataset.schema.tables + # NOTE: this breaks the following test, it will need to be fixed somehow # create a newer schema with different name and see wether this is loaded from dlt.common.schema import Schema from dlt.common.schema import utils @@ -560,7 +863,7 @@ def test_standalone_dataset(populated_pipeline: Pipeline) -> None: dataset = cast( ReadableDBAPIDataset, - dlt._dataset( + _dataset( destination=populated_pipeline.destination, dataset_name=populated_pipeline.dataset_name, ), diff --git a/tests/load/utils.py b/tests/load/utils.py index 5c24b2d1dc..5660202ec3 100644 --- a/tests/load/utils.py +++ b/tests/load/utils.py @@ -26,7 +26,10 @@ from dlt.common.configuration import resolve_configuration from dlt.common.configuration.container import Container from dlt.common.configuration.specs.config_section_context import ConfigSectionContext -from dlt.common.configuration.specs import CredentialsConfiguration +from dlt.common.configuration.specs import ( + CredentialsConfiguration, + 
GcpOAuthCredentialsWithoutDefaults, +) from dlt.common.destination.reference import ( DestinationClientDwhConfiguration, JobClientBase, @@ -57,6 +60,7 @@ from dlt.pipeline.exceptions import SqlClientNotAvailable from tests.utils import ( ACTIVE_DESTINATIONS, + ACTIVE_TABLE_FORMATS, IMPLEMENTED_DESTINATIONS, SQL_DESTINATIONS, EXCLUDED_DESTINATION_CONFIGURATIONS, @@ -171,7 +175,9 @@ def destination_factory(self, **kwargs) -> Destination[Any, Any]: dest_type = kwargs.pop("destination", self.destination_type) dest_name = kwargs.pop("destination_name", self.destination_name) self.setup() - return Destination.from_reference(dest_type, destination_name=dest_name, **kwargs) + return Destination.from_reference( + dest_type, self.credentials, destination_name=dest_name, **kwargs + ) def raw_capabilities(self) -> DestinationCapabilitiesContext: dest = Destination.from_reference(self.destination_type) @@ -604,7 +610,7 @@ def destinations_configs( DestinationTestConfiguration( destination_type="filesystem", bucket_url=bucket, - extra_info=bucket + "-delta", + extra_info=bucket, table_format="delta", supports_merge=True, file_format="parquet", @@ -619,12 +625,33 @@ def destinations_configs( ), ) ] + if bucket == AZ_BUCKET: + # `pyiceberg` does not support `az` scheme + continue + destination_configs += [ + DestinationTestConfiguration( + destination_type="filesystem", + bucket_url=bucket, + extra_info=bucket, + table_format="iceberg", + supports_merge=False, + file_format="parquet", + destination_name="fsgcpoauth" if bucket == GCS_BUCKET else None, + ) + ] # filter out non active destinations destination_configs = [ conf for conf in destination_configs if conf.destination_type in ACTIVE_DESTINATIONS ] + # filter out non active table formats + destination_configs = [ + conf + for conf in destination_configs + if conf.table_format is None or conf.table_format in ACTIVE_TABLE_FORMATS + ] + # filter out destinations not in subset if subset: destination_configs = [ diff --git a/tests/normalize/test_normalize.py b/tests/normalize/test_normalize.py index 7463184be7..84e22af9ff 100644 --- a/tests/normalize/test_normalize.py +++ b/tests/normalize/test_normalize.py @@ -1,3 +1,4 @@ +from copy import deepcopy import pytest from fnmatch import fnmatch from typing import Dict, Iterator, List, Sequence, Tuple @@ -5,6 +6,7 @@ from dlt.common import json from dlt.common.destination.capabilities import TLoaderFileFormat +from dlt.common.schema.exceptions import CannotCoerceColumnException from dlt.common.schema.schema import Schema from dlt.common.schema.utils import new_table from dlt.common.storages.exceptions import SchemaNotFoundError @@ -16,6 +18,7 @@ from dlt.extract.extract import ExtractStorage from dlt.normalize import Normalize +from dlt.normalize.validate import validate_and_update_schema from dlt.normalize.worker import group_worker_files from dlt.normalize.exceptions import NormalizeJobFailed @@ -284,6 +287,8 @@ def test_multiprocessing_row_counting( extract_cases(raw_normalize, ["github.events.load_page_1_duck"]) # use real process pool in tests with ProcessPoolExecutor(max_workers=4) as p: + # test if we get correct number of workers + assert getattr(p, "_max_workers", None) == 4 raw_normalize.run(p) # get step info step_info = raw_normalize.get_step_info(MockPipeline("multiprocessing_pipeline", True)) # type: ignore[abstract] @@ -712,6 +717,71 @@ def assert_timestamp_data_type(load_storage: LoadStorage, data_type: TDataType) assert event_schema.get_table_columns("event")["timestamp"]["data_type"] == 
data_type +def test_update_schema_column_conflict(rasa_normalize: Normalize) -> None: + extract_cases( + rasa_normalize, + [ + "event.event.many_load_2", + "event.event.user_load_1", + ], + ) + extract_cases( + rasa_normalize, + [ + "ethereum.blocks.9c1d9b504ea240a482b007788d5cd61c_2", + ], + ) + # use real process pool in tests + with ProcessPoolExecutor(max_workers=4) as p: + rasa_normalize.run(p) + + schema = rasa_normalize.schema_storage.load_schema("event") + tab1 = new_table( + "event_user", + write_disposition="append", + columns=[ + {"name": "col1", "data_type": "text", "nullable": False}, + ], + ) + validate_and_update_schema(schema, [{"event_user": [deepcopy(tab1)]}]) + assert schema.tables["event_user"]["columns"]["col1"]["data_type"] == "text" + + tab1["columns"]["col1"]["data_type"] = "bool" + tab1["columns"]["col2"] = {"name": "col2", "data_type": "text", "nullable": False} + with pytest.raises(CannotCoerceColumnException) as exc_val: + validate_and_update_schema(schema, [{"event_user": [deepcopy(tab1)]}]) + assert exc_val.value.column_name == "col1" + assert exc_val.value.from_type == "bool" + assert exc_val.value.to_type == "text" + # whole column mismatch + assert exc_val.value.coerced_value is None + # make sure col2 is not added + assert "col2" not in schema.tables["event_user"]["columns"] + + # add two updates that are conflicting + tab2 = new_table( + "event_slot", + write_disposition="append", + columns=[ + {"name": "col1", "data_type": "text", "nullable": False}, + {"name": "col2", "data_type": "text", "nullable": False}, + ], + ) + tab3 = new_table( + "event_slot", + write_disposition="append", + columns=[ + {"name": "col1", "data_type": "bool", "nullable": False}, + ], + ) + with pytest.raises(CannotCoerceColumnException) as exc_val: + validate_and_update_schema( + schema, [{"event_slot": [deepcopy(tab2)]}, {"event_slot": [deepcopy(tab3)]}] + ) + # col2 is added from first update + assert "col2" in schema.tables["event_slot"]["columns"] + + def test_removal_of_normalizer_schema_section_and_add_seen_data(raw_normalize: Normalize) -> None: extract_cases( raw_normalize, diff --git a/tests/pipeline/test_dlt_versions.py b/tests/pipeline/test_dlt_versions.py index fbd4d412b3..51de3e0f76 100644 --- a/tests/pipeline/test_dlt_versions.py +++ b/tests/pipeline/test_dlt_versions.py @@ -538,5 +538,5 @@ def test_normalize_path_separator_legacy_behavior(test_storage: FileStorage) -> "_dlt_load_id", } # datasets must be the same - data_ = pipeline._dataset().issues_2.select("issue_id", "id").fetchall() + data_ = pipeline.dataset().issues_2.select("issue_id", "id").fetchall() print(data_) diff --git a/tests/pipeline/test_import_export_schema.py b/tests/pipeline/test_import_export_schema.py index eb36d36ba3..5eb9c664d0 100644 --- a/tests/pipeline/test_import_export_schema.py +++ b/tests/pipeline/test_import_export_schema.py @@ -1,4 +1,4 @@ -import dlt, os, pytest +import dlt, os from dlt.common.utils import uniq_id @@ -6,8 +6,6 @@ from tests.utils import TEST_STORAGE_ROOT from dlt.common.schema import Schema from dlt.common.storages.schema_storage import SchemaStorage -from dlt.common.schema.exceptions import CannotCoerceColumnException -from dlt.pipeline.exceptions import PipelineStepFailed from dlt.destinations import dummy diff --git a/tests/pipeline/test_pipeline.py b/tests/pipeline/test_pipeline.py index e58db64e5e..95d464d48a 100644 --- a/tests/pipeline/test_pipeline.py +++ b/tests/pipeline/test_pipeline.py @@ -52,7 +52,7 @@ from dlt.pipeline.pipeline import Pipeline from 
tests.common.utils import TEST_SENTRY_DSN -from tests.utils import TEST_STORAGE_ROOT +from tests.utils import TEST_STORAGE_ROOT, load_table_counts from tests.extract.utils import expect_extracted_file from tests.pipeline.utils import ( assert_data_table_counts, @@ -1566,6 +1566,30 @@ def test_drop() -> None: pipeline.run([1, 2, 3], table_name="numbers") +def test_source_schema_in_resource() -> None: + run_count = 0 + + @dlt.resource + def schema_inspector(): + schema = dlt.current.source_schema() + if run_count == 0: + assert "schema_inspector" not in schema.tables + if run_count == 1: + assert "schema_inspector" in schema.tables + assert schema.tables["schema_inspector"]["columns"]["value"]["x-custom"] == "X" # type: ignore[typeddict-item] + + yield [1, 2, 3] + + pipeline = dlt.pipeline(pipeline_name="test_inspector", destination="duckdb") + pipeline.run(schema_inspector()) + + # add custom annotation + pipeline.default_schema.tables["schema_inspector"]["columns"]["value"]["x-custom"] = "X" # type: ignore[typeddict-unknown-key] + + run_count += 1 + pipeline.run(schema_inspector()) + + def test_schema_version_increase_and_source_update() -> None: now = pendulum.now() @@ -1730,7 +1754,7 @@ def test_column_name_with_break_path() -> None: # get data assert_data_table_counts(pipeline, {"custom__path": 1}) # get data via dataset with dbapi - data_ = pipeline._dataset().custom__path[["example_custom_field__c", "reg_c"]].fetchall() + data_ = pipeline.dataset().custom__path[["example_custom_field__c", "reg_c"]].fetchall() assert data_ == [("custom", "c")] @@ -1754,7 +1778,7 @@ def test_column_name_with_break_path_legacy() -> None: # get data assert_data_table_counts(pipeline, {"custom_path": 1}) # get data via dataset with dbapi - data_ = pipeline._dataset().custom_path[["example_custom_field_c", "reg_c"]].fetchall() + data_ = pipeline.dataset().custom_path[["example_custom_field_c", "reg_c"]].fetchall() assert data_ == [("custom", "c")] @@ -1782,7 +1806,7 @@ def flattened_dict(): assert table["columns"]["value__timestamp"]["data_type"] == "timestamp" # make sure data is there - data_ = pipeline._dataset().flattened__dict[["delta", "value__timestamp"]].limit(1).fetchall() + data_ = pipeline.dataset().flattened__dict[["delta", "value__timestamp"]].limit(1).fetchall() assert data_ == [(0, now)] @@ -1812,7 +1836,7 @@ def flattened_dict(): assert set(table["columns"]) == {"delta", "value__timestamp", "_dlt_id", "_dlt_load_id"} assert table["columns"]["value__timestamp"]["data_type"] == "timestamp" # make sure data is there - data_ = pipeline._dataset().flattened_dict[["delta", "value__timestamp"]].limit(1).fetchall() + data_ = pipeline.dataset().flattened_dict[["delta", "value__timestamp"]].limit(1).fetchall() assert data_ == [(0, now)] @@ -2987,3 +3011,171 @@ def test_push_table_with_upfront_schema() -> None: copy_pipeline = dlt.pipeline(pipeline_name="push_table_copy_pipeline", destination="duckdb") info = copy_pipeline.run(data, table_name="events", schema=copy_schema) assert copy_pipeline.default_schema.version_hash != infer_hash + + +def test_pipeline_with_sources_sharing_schema() -> None: + schema = Schema("shared") + + @dlt.source(schema=schema, max_table_nesting=1) + def source_1(): + @dlt.resource(primary_key="user_id") + def gen1(): + dlt.current.source_state()["source_1"] = True + dlt.current.resource_state()["source_1"] = True + yield {"id": "Y", "user_id": "user_y"} + + @dlt.resource(columns={"value": {"data_type": "bool"}}) + def conflict(): + yield True + + return gen1, conflict + + 
@dlt.source(schema=schema, max_table_nesting=2) + def source_2(): + @dlt.resource(primary_key="id") + def gen1(): + dlt.current.source_state()["source_2"] = True + dlt.current.resource_state()["source_2"] = True + yield {"id": "X", "user_id": "user_X"} + + def gen2(): + yield from "CDE" + + @dlt.resource(columns={"value": {"data_type": "text"}}, selected=False) + def conflict(): + yield "indeed" + + return gen2, gen1, conflict + + # all selected tables with hints should be there + discover_1 = source_1().discover_schema() + assert "gen1" in discover_1.tables + assert discover_1.tables["gen1"]["columns"]["user_id"]["primary_key"] is True + assert "data_type" not in discover_1.tables["gen1"]["columns"]["user_id"] + assert "conflict" in discover_1.tables + assert discover_1.tables["conflict"]["columns"]["value"]["data_type"] == "bool" + + discover_2 = source_2().discover_schema() + assert "gen1" in discover_2.tables + assert "gen2" in discover_2.tables + # conflict deselected + assert "conflict" not in discover_2.tables + + p = dlt.pipeline(pipeline_name="multi", destination="duckdb", dev_mode=True) + p.extract([source_1(), source_2()]) + default_schema = p.default_schema + gen1_table = default_schema.tables["gen1"] + assert "user_id" in gen1_table["columns"] + assert "id" in gen1_table["columns"] + assert "conflict" in default_schema.tables + assert "gen2" in default_schema.tables + p.normalize() + assert "gen2" in default_schema.tables + assert default_schema.tables["conflict"]["columns"]["value"]["data_type"] == "bool" + p.load() + table_names = [t["name"] for t in default_schema.data_tables()] + counts = load_table_counts(p, *table_names) + assert counts == {"gen1": 2, "gen2": 3, "conflict": 1} + # both sources share the same state + assert p.state["sources"] == { + "shared": { + "source_1": True, + "resources": {"gen1": {"source_1": True, "source_2": True}}, + "source_2": True, + } + } + + # same pipeline but enable conflict + p.extract([source_2().with_resources("conflict")]) + p.normalize() + assert default_schema.tables["conflict"]["columns"]["value"]["data_type"] == "text" + with pytest.raises(PipelineStepFailed): + # will generate failed job on type that does not match + p.load() + counts = load_table_counts(p, "conflict") + assert counts == {"conflict": 1} + + # alter table in duckdb + with p.sql_client() as client: + client.execute_sql("ALTER TABLE conflict ALTER value TYPE VARCHAR;") + p.run([source_2().with_resources("conflict")]) + counts = load_table_counts(p, "conflict") + assert counts == {"conflict": 2} + + +def test_many_pipelines_single_dataset() -> None: + schema = Schema("shared") + + @dlt.source(schema=schema, max_table_nesting=1) + def source_1(): + @dlt.resource(primary_key="user_id") + def gen1(): + dlt.current.source_state()["source_1"] = True + dlt.current.resource_state()["source_1"] = True + yield {"id": "Y", "user_id": "user_y"} + + return gen1 + + @dlt.source(schema=schema, max_table_nesting=2) + def source_2(): + @dlt.resource(primary_key="id") + def gen1(): + dlt.current.source_state()["source_2"] = True + dlt.current.resource_state()["source_2"] = True + yield {"id": "X", "user_id": "user_X"} + + def gen2(): + yield from "CDE" + + return gen2, gen1 + + # load source_1 to common dataset + p = dlt.pipeline( + pipeline_name="source_1_pipeline", destination="duckdb", dataset_name="shared_dataset" + ) + p.run(source_1(), credentials="duckdb:///_storage/test_quack.duckdb") + counts = load_table_counts(p, *p.default_schema.tables.keys()) + assert counts.items() >= 
{"gen1": 1, "_dlt_pipeline_state": 1, "_dlt_loads": 1}.items() + p._wipe_working_folder() + p.deactivate() + + p = dlt.pipeline( + pipeline_name="source_2_pipeline", destination="duckdb", dataset_name="shared_dataset" + ) + p.run(source_2(), credentials="duckdb:///_storage/test_quack.duckdb") + # table_names = [t["name"] for t in p.default_schema.data_tables()] + counts = load_table_counts(p, *p.default_schema.tables.keys()) + # gen1: one record comes from source_1, 1 record from source_2 + assert counts.items() >= {"gen1": 2, "_dlt_pipeline_state": 2, "_dlt_loads": 2}.items() + # assert counts == {'gen1': 2, 'gen2': 3} + p._wipe_working_folder() + p.deactivate() + + # restore from destination, check state + p = dlt.pipeline( + pipeline_name="source_1_pipeline", + destination=dlt.destinations.duckdb(credentials="duckdb:///_storage/test_quack.duckdb"), + dataset_name="shared_dataset", + ) + p.sync_destination() + # we have our separate state + assert p.state["sources"]["shared"] == { + "source_1": True, + "resources": {"gen1": {"source_1": True}}, + } + # but the schema was common so we have the earliest one + assert "gen2" in p.default_schema.tables + p._wipe_working_folder() + p.deactivate() + + p = dlt.pipeline( + pipeline_name="source_2_pipeline", + destination=dlt.destinations.duckdb(credentials="duckdb:///_storage/test_quack.duckdb"), + dataset_name="shared_dataset", + ) + p.sync_destination() + # we have our separate state + assert p.state["sources"]["shared"] == { + "source_2": True, + "resources": {"gen1": {"source_2": True}}, + } diff --git a/tests/pipeline/test_pipeline_extra.py b/tests/pipeline/test_pipeline_extra.py index a51052d247..32b16c234f 100644 --- a/tests/pipeline/test_pipeline_extra.py +++ b/tests/pipeline/test_pipeline_extra.py @@ -521,7 +521,7 @@ def test_parquet_with_flattened_columns() -> None: assert "issue__reactions__url" in pipeline.default_schema.tables["events"]["columns"] assert "issue_reactions_url" not in pipeline.default_schema.tables["events"]["columns"] - events_table = pipeline._dataset().events.arrow() + events_table = pipeline.dataset().events.arrow() assert "issue__reactions__url" in events_table.schema.names assert "issue_reactions_url" not in events_table.schema.names @@ -536,7 +536,7 @@ def test_parquet_with_flattened_columns() -> None: info = pipeline.run(events_table, table_name="events", loader_file_format="parquet") assert_load_info(info) - events_table_new = pipeline._dataset().events.arrow() + events_table_new = pipeline.dataset().events.arrow() assert events_table.schema == events_table_new.schema # double row count assert events_table.num_rows * 2 == events_table_new.num_rows diff --git a/tests/pipeline/utils.py b/tests/pipeline/utils.py index 0ae734f72e..e72a27c827 100644 --- a/tests/pipeline/utils.py +++ b/tests/pipeline/utils.py @@ -197,10 +197,23 @@ def _load_tables_to_dicts_fs( delta_tables = get_delta_tables(p, *table_names, schema_name=schema_name) + iceberg_table_names = [ + table_name + for table_name in table_names + if get_table_format(client.schema.tables, table_name) == "iceberg" + ] + if len(iceberg_table_names) > 0: + from dlt.common.libs.pyiceberg import get_iceberg_tables + + iceberg_tables = get_iceberg_tables(p, *table_names, schema_name=schema_name) + for table_name in table_names: if table_name in client.schema.data_table_names() and table_name in delta_table_names: dt = delta_tables[table_name] result[table_name] = dt.to_pyarrow_table().to_pylist() + elif table_name in client.schema.data_table_names() and table_name 
in iceberg_table_names: + it = iceberg_tables[table_name] + result[table_name] = it.scan().to_arrow().to_pylist() else: table_files = client.list_table_files(table_name) for file in table_files: diff --git a/tests/sources/helpers/rest_client/test_client.py b/tests/sources/helpers/rest_client/test_client.py index 36fe009b93..e67ff9c70a 100644 --- a/tests/sources/helpers/rest_client/test_client.py +++ b/tests/sources/helpers/rest_client/test_client.py @@ -401,7 +401,7 @@ def test_paginate_json_body_without_params(self, rest_client) -> None: posts_skip = (DEFAULT_TOTAL_PAGES - 3) * DEFAULT_PAGE_SIZE class JSONBodyPageCursorPaginator(BaseReferencePaginator): - def update_state(self, response, data): + def update_state(self, response, data): # type: ignore[override] self._next_reference = response.json().get("next_page") def update_request(self, request): diff --git a/tests/sources/rest_api/configurations/test_custom_auth_config.py b/tests/sources/rest_api/configurations/test_custom_auth_config.py index 1a5a2e58a3..52cdb95735 100644 --- a/tests/sources/rest_api/configurations/test_custom_auth_config.py +++ b/tests/sources/rest_api/configurations/test_custom_auth_config.py @@ -5,7 +5,7 @@ from dlt.sources import rest_api from dlt.sources.helpers.rest_client.auth import APIKeyAuth, OAuth2ClientCredentials -from dlt.sources.rest_api.typing import ApiKeyAuthConfig, AuthConfig +from dlt.sources.rest_api.typing import ApiKeyAuthConfig, AuthConfig, RESTAPIConfig class CustomOAuth2(OAuth2ClientCredentials): @@ -77,3 +77,18 @@ class NotAuthConfigBase: "not_an_auth_config_base", NotAuthConfigBase # type: ignore ) assert e.match("Invalid auth: NotAuthConfigBase.") + + def test_valid_config_raises_no_error(self, custom_auth_config: AuthConfig) -> None: + rest_api.config_setup.register_auth("custom_oauth_2", CustomOAuth2) + + valid_config: RESTAPIConfig = { + "client": { + "base_url": "https://example.com", + "auth": custom_auth_config, + }, + "resources": ["test"], + } + + rest_api.rest_api_source(valid_config) + + del rest_api.config_setup.AUTH_MAP["custom_oauth_2"] diff --git a/tests/sources/rest_api/configurations/test_custom_paginator_config.py b/tests/sources/rest_api/configurations/test_custom_paginator_config.py index f8ac060218..975ab10176 100644 --- a/tests/sources/rest_api/configurations/test_custom_paginator_config.py +++ b/tests/sources/rest_api/configurations/test_custom_paginator_config.py @@ -4,7 +4,7 @@ from dlt.sources import rest_api from dlt.sources.helpers.rest_client.paginators import JSONLinkPaginator -from dlt.sources.rest_api.typing import PaginatorConfig +from dlt.sources.rest_api.typing import PaginatorConfig, RESTAPIConfig class CustomPaginator(JSONLinkPaginator): @@ -67,3 +67,13 @@ class NotAPaginator: with pytest.raises(ValueError) as e: rest_api.config_setup.register_paginator("not_a_paginator", NotAPaginator) # type: ignore[arg-type] assert e.match("Invalid paginator: NotAPaginator.") + + def test_test_valid_config_raises_no_error(self, custom_paginator_config) -> None: + rest_api.config_setup.register_paginator("custom_paginator", CustomPaginator) + + valid_config: RESTAPIConfig = { + "client": {"base_url": "https://example.com", "paginator": custom_paginator_config}, + "resources": ["test"], + } + + rest_api.rest_api_source(valid_config) diff --git a/tests/utils.py b/tests/utils.py index 1aafa4bfe4..82d742ac65 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -32,6 +32,7 @@ from dlt.common.runtime.run_context import DOT_DLT, RunContext from dlt.common.runtime.telemetry 
import start_telemetry, stop_telemetry from dlt.common.schema import Schema +from dlt.common.schema.typing import TTableFormat from dlt.common.storages import FileStorage from dlt.common.storages.versioned_storage import VersionedStorage from dlt.common.typing import DictStrAny, StrAny, TDataItem @@ -88,6 +89,12 @@ ACTIVE_SQL_DESTINATIONS = SQL_DESTINATIONS.intersection(ACTIVE_DESTINATIONS) ACTIVE_NON_SQL_DESTINATIONS = NON_SQL_DESTINATIONS.intersection(ACTIVE_DESTINATIONS) +# table formats active for the current test run; defaults to all implemented formats +IMPLEMENTED_TABLE_FORMATS = set(get_args(TTableFormat)) +ACTIVE_TABLE_FORMATS = set( + dlt.config.get("ACTIVE_TABLE_FORMATS", list) or IMPLEMENTED_TABLE_FORMATS +) + # sanity checks assert len(ACTIVE_DESTINATIONS) >= 0, "No active destinations selected"
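
The tests above migrate from the private pipeline._dataset() accessor to the public pipeline.dataset() API and add coverage for row_counts(). As a reference for reviewers, here is a minimal usage sketch (not part of the change set); it assumes a local duckdb destination with the duckdb extra installed, and the pipeline and resource names are hypothetical:

    import dlt

    @dlt.resource
    def items():
        # hypothetical resource, just enough to materialize an "items" table
        yield from ({"id": i, "decimal": i / 10} for i in range(10))

    pipeline = dlt.pipeline(pipeline_name="dataset_demo", destination="duckdb")
    pipeline.run(items())

    # public accessor used throughout the updated tests (previously pipeline._dataset())
    dataset = pipeline.dataset()

    # relation-style access: attribute or item getter, then head/limit/select/df/fetchall
    first_rows = dataset.items.limit(5).fetchall()
    id_frame = dataset["items"].select("id").df()

    # row counts for data tables, optionally including the dlt system tables
    counts = dataset.row_counts(dlt_tables=True).df()

    # raw SQL through the dataset callable
    tname = pipeline.sql_client().make_qualified_table_name("items")
    low_ids = dataset(f"select * from {tname} where id < 5").fetchall()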
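
test_ibis_expression_relation and test_ibis_dataset_access additionally cover the ibis integration. Continuing the sketch above, and assuming the ibis dependency is installed (duckdb is among the destinations the tests treat as supported), the relations returned by the dataset compose like ibis expressions, and the dataset can also hand out a plain ibis backend connection:

    items_table = dataset["items"]

    # chained expression: filter, order, limit, then execute
    rows = items_table.filter(items_table.id < 5).order_by("id").limit(3).fetchall()

    # aggregate over the first rows, mirroring the assertions in the tests above
    sum_of_ids = (
        items_table.order_by("id").limit(5).aggregate(sum_id=items_table.id.sum()).fetchone()[0]
    )

    # the relation exposes the generated SQL and the discovered column schema
    print(str(items_table.select("id").query))
    print(list(items_table.select("id").columns_schema.keys()))

    # or drop down to a native ibis backend connection
    ibis_connection = dataset.ibis()
    print(ibis_connection.list_tables())
    ibis_connection.disconnect()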
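
The new test_update_schema_column_conflict in tests/normalize/test_normalize.py exercises validate_and_update_schema from dlt.normalize.validate, which applies partial table updates to a schema and raises CannotCoerceColumnException on conflicting column types. A condensed sketch of that behaviour, using only calls that appear in the test; it assumes the helper also accepts updates against a freshly created schema rather than one loaded from storage:

    from copy import deepcopy

    import pytest

    from dlt.common.schema import Schema
    from dlt.common.schema.exceptions import CannotCoerceColumnException
    from dlt.common.schema.utils import new_table
    from dlt.normalize.validate import validate_and_update_schema

    schema = Schema("demo")
    tab = new_table(
        "event_user",
        write_disposition="append",
        columns=[{"name": "col1", "data_type": "text", "nullable": False}],
    )
    validate_and_update_schema(schema, [{"event_user": [deepcopy(tab)]}])
    assert schema.tables["event_user"]["columns"]["col1"]["data_type"] == "text"

    # a later update that changes col1 to an incompatible type is rejected
    tab["columns"]["col1"]["data_type"] = "bool"
    with pytest.raises(CannotCoerceColumnException):
        validate_and_update_schema(schema, [{"event_user": [deepcopy(tab)]}])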