diff --git a/.github/workflows/test_destination_athena.yml b/.github/workflows/test_destination_athena.yml
index 1169fab0de..03eb7f9434 100644
--- a/.github/workflows/test_destination_athena.yml
+++ b/.github/workflows/test_destination_athena.yml
@@ -67,7 +67,7 @@ jobs:
- name: Install dependencies
# if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
- run: poetry install --no-interaction -E athena --with sentry-sdk --with pipeline
+ run: poetry install --no-interaction -E athena --with sentry-sdk --with pipeline,ibis
- name: create secrets.toml
run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml
diff --git a/.github/workflows/test_destination_athena_iceberg.yml b/.github/workflows/test_destination_athena_iceberg.yml
index 7ccefcc055..3412e789e3 100644
--- a/.github/workflows/test_destination_athena_iceberg.yml
+++ b/.github/workflows/test_destination_athena_iceberg.yml
@@ -67,7 +67,7 @@ jobs:
- name: Install dependencies
# if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
- run: poetry install --no-interaction -E athena --with sentry-sdk --with pipeline
+ run: poetry install --no-interaction -E athena --with sentry-sdk --with pipeline,ibis
- name: create secrets.toml
run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml
diff --git a/.github/workflows/test_destination_bigquery.yml b/.github/workflows/test_destination_bigquery.yml
index 7afc9b8a00..eb8b63f757 100644
--- a/.github/workflows/test_destination_bigquery.yml
+++ b/.github/workflows/test_destination_bigquery.yml
@@ -66,7 +66,7 @@ jobs:
- name: Install dependencies
# if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
- run: poetry install --no-interaction -E bigquery --with providers -E parquet --with sentry-sdk --with pipeline
+ run: poetry install --no-interaction -E bigquery --with providers -E parquet --with sentry-sdk --with pipeline,ibis
- name: create secrets.toml
run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml
diff --git a/.github/workflows/test_destination_clickhouse.yml b/.github/workflows/test_destination_clickhouse.yml
index 7f297db971..46464ea462 100644
--- a/.github/workflows/test_destination_clickhouse.yml
+++ b/.github/workflows/test_destination_clickhouse.yml
@@ -61,7 +61,7 @@ jobs:
key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-gcp
- name: Install dependencies
- run: poetry install --no-interaction -E clickhouse --with providers -E parquet --with sentry-sdk --with pipeline
+ run: poetry install --no-interaction -E clickhouse --with providers -E parquet --with sentry-sdk --with pipeline,ibis
- name: create secrets.toml
run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml
diff --git a/.github/workflows/test_destination_databricks.yml b/.github/workflows/test_destination_databricks.yml
index 1656fe27f4..c1609de863 100644
--- a/.github/workflows/test_destination_databricks.yml
+++ b/.github/workflows/test_destination_databricks.yml
@@ -64,7 +64,7 @@ jobs:
key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-gcp
- name: Install dependencies
- run: poetry install --no-interaction -E databricks -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline
+ run: poetry install --no-interaction -E databricks -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline,ibis
- name: create secrets.toml
run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml
diff --git a/.github/workflows/test_destination_dremio.yml b/.github/workflows/test_destination_dremio.yml
index 45c6d17db1..4bc48c54db 100644
--- a/.github/workflows/test_destination_dremio.yml
+++ b/.github/workflows/test_destination_dremio.yml
@@ -65,7 +65,7 @@ jobs:
key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-gcp
- name: Install dependencies
- run: poetry install --no-interaction -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline
+ run: poetry install --no-interaction -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline,ibis
- run: |
poetry run pytest tests/load --ignore tests/load/sources
diff --git a/.github/workflows/test_destination_motherduck.yml b/.github/workflows/test_destination_motherduck.yml
index 0014b17655..db81131266 100644
--- a/.github/workflows/test_destination_motherduck.yml
+++ b/.github/workflows/test_destination_motherduck.yml
@@ -64,7 +64,7 @@ jobs:
key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-motherduck
- name: Install dependencies
- run: poetry install --no-interaction -E motherduck -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline
+ run: poetry install --no-interaction -E motherduck -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline,ibis
- name: create secrets.toml
run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml
diff --git a/.github/workflows/test_destination_mssql.yml b/.github/workflows/test_destination_mssql.yml
index 8b899e7da2..6fdd7a5bc5 100644
--- a/.github/workflows/test_destination_mssql.yml
+++ b/.github/workflows/test_destination_mssql.yml
@@ -69,7 +69,7 @@ jobs:
key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-gcp
- name: Install dependencies
- run: poetry install --no-interaction -E mssql -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline
+ run: poetry install --no-interaction -E mssql -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline,ibis
- name: create secrets.toml
run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml
diff --git a/.github/workflows/test_destination_snowflake.yml b/.github/workflows/test_destination_snowflake.yml
index a720c479bd..73a2a8f6e7 100644
--- a/.github/workflows/test_destination_snowflake.yml
+++ b/.github/workflows/test_destination_snowflake.yml
@@ -64,7 +64,7 @@ jobs:
key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-gcp
- name: Install dependencies
- run: poetry install --no-interaction -E snowflake -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline
+ run: poetry install --no-interaction -E snowflake -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline,ibis
- name: create secrets.toml
run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml
diff --git a/.github/workflows/test_destination_synapse.yml b/.github/workflows/test_destination_synapse.yml
index be1b493916..8f6bf1eb29 100644
--- a/.github/workflows/test_destination_synapse.yml
+++ b/.github/workflows/test_destination_synapse.yml
@@ -67,7 +67,7 @@ jobs:
key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-gcp
- name: Install dependencies
- run: poetry install --no-interaction -E synapse -E parquet --with sentry-sdk --with pipeline
+ run: poetry install --no-interaction -E synapse -E parquet --with sentry-sdk --with pipeline,ibis
- name: create secrets.toml
run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml
diff --git a/.github/workflows/test_destinations.yml b/.github/workflows/test_destinations.yml
index 933248d994..a9306c2f9c 100644
--- a/.github/workflows/test_destinations.yml
+++ b/.github/workflows/test_destinations.yml
@@ -77,8 +77,10 @@ jobs:
# key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-redshift
- name: Install dependencies
- # if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
- run: poetry install --no-interaction -E redshift -E postgis -E postgres -E gs -E s3 -E az -E parquet -E duckdb -E cli -E filesystem --with sentry-sdk --with pipeline -E deltalake
+ run: poetry install --no-interaction -E redshift -E postgis -E postgres -E gs -E s3 -E az -E parquet -E duckdb -E cli -E filesystem --with sentry-sdk --with pipeline,ibis -E deltalake -E pyiceberg
+
+ - name: Upgrade sqlalchemy
+ run: poetry run pip install sqlalchemy==2.0.18 # minimum version required by `pyiceberg`
- name: create secrets.toml
run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml
diff --git a/.github/workflows/test_local_destinations.yml b/.github/workflows/test_local_destinations.yml
index 4947a46a3b..706bae1b0c 100644
--- a/.github/workflows/test_local_destinations.yml
+++ b/.github/workflows/test_local_destinations.yml
@@ -95,7 +95,10 @@ jobs:
key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-local-destinations
- name: Install dependencies
- run: poetry install --no-interaction -E postgres -E postgis -E duckdb -E parquet -E filesystem -E cli -E weaviate -E qdrant -E sftp --with sentry-sdk --with pipeline -E deltalake
+ run: poetry install --no-interaction -E postgres -E postgis -E duckdb -E parquet -E filesystem -E cli -E weaviate -E qdrant -E sftp --with sentry-sdk --with pipeline,ibis -E deltalake -E pyiceberg
+
+ - name: Upgrade sqlalchemy
+ run: poetry run pip install sqlalchemy==2.0.18 # minimum version required by `pyiceberg`
- name: Start SFTP server
run: docker compose -f "tests/load/filesystem_sftp/docker-compose.yml" up -d
diff --git a/.github/workflows/test_sqlalchemy_destinations.yml b/.github/workflows/test_sqlalchemy_destinations.yml
index c2572b322d..1f00373674 100644
--- a/.github/workflows/test_sqlalchemy_destinations.yml
+++ b/.github/workflows/test_sqlalchemy_destinations.yml
@@ -86,7 +86,7 @@ jobs:
key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-local-destinations
- name: Install dependencies
- run: poetry install --no-interaction -E parquet -E filesystem -E sqlalchemy -E cli --with sentry-sdk --with pipeline && poetry run pip install mysqlclient && poetry run pip install "sqlalchemy==${{ matrix.sqlalchemy }}"
+ run: poetry install --no-interaction -E parquet -E filesystem -E sqlalchemy -E cli --with sentry-sdk --with pipeline,ibis && poetry run pip install mysqlclient && poetry run pip install "sqlalchemy==${{ matrix.sqlalchemy }}"
- name: create secrets.toml
run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml
diff --git a/Makefile b/Makefile
index 2a7f6dac0a..975a8a42da 100644
--- a/Makefile
+++ b/Makefile
@@ -44,7 +44,7 @@ has-poetry:
poetry --version
dev: has-poetry
- poetry install --all-extras --with docs,providers,pipeline,sources,sentry-sdk,airflow
+ poetry install --all-extras --with docs,providers,pipeline,sources,sentry-sdk
lint:
./tools/check-package.sh
@@ -63,7 +63,6 @@ format:
lint-snippets:
cd docs/tools && poetry run python check_embedded_snippets.py full
-
lint-and-test-snippets: lint-snippets
poetry run mypy --config-file mypy.ini docs/website docs/tools --exclude docs/tools/lint_setup --exclude docs/website/docs_processed
poetry run flake8 --max-line-length=200 docs/website docs/tools --exclude docs/website/.dlt-repo
@@ -82,7 +81,7 @@ lint-security:
poetry run bandit -r dlt/ -n 3 -l
test:
- (set -a && . tests/.env && poetry run pytest tests)
+ poetry run pytest tests
test-load-local:
DESTINATION__POSTGRES__CREDENTIALS=postgresql://loader:loader@localhost:5432/dlt_data DESTINATION__DUCKDB__CREDENTIALS=duckdb:///_storage/test_quack.duckdb poetry run pytest tests -k '(postgres or duckdb)'
diff --git a/dlt/__init__.py b/dlt/__init__.py
index e8a1b7bf92..328817efd2 100644
--- a/dlt/__init__.py
+++ b/dlt/__init__.py
@@ -42,7 +42,6 @@
)
from dlt.pipeline import progress
from dlt import destinations
-from dlt.destinations.dataset import dataset as _dataset
pipeline = _pipeline
current = _current
@@ -80,7 +79,6 @@
"TCredentials",
"sources",
"destinations",
- "_dataset",
]
# verify that no injection context was created
diff --git a/dlt/cli/command_wrappers.py b/dlt/cli/command_wrappers.py
index 0e6491688e..847b5daabb 100644
--- a/dlt/cli/command_wrappers.py
+++ b/dlt/cli/command_wrappers.py
@@ -43,14 +43,14 @@ def init_command_wrapper(
destination_type: str,
repo_location: str,
branch: str,
- omit_core_sources: bool = False,
+ eject_source: bool = False,
) -> None:
init_command(
source_name,
destination_type,
repo_location,
branch,
- omit_core_sources,
+ eject_source,
)
diff --git a/dlt/cli/init_command.py b/dlt/cli/init_command.py
index ac8adcc588..e81fa80c36 100644
--- a/dlt/cli/init_command.py
+++ b/dlt/cli/init_command.py
@@ -157,7 +157,7 @@ def _list_core_sources() -> Dict[str, SourceConfiguration]:
sources: Dict[str, SourceConfiguration] = {}
for source_name in files_ops.get_sources_names(core_sources_storage, source_type="core"):
sources[source_name] = files_ops.get_core_source_configuration(
- core_sources_storage, source_name
+ core_sources_storage, source_name, eject_source=False
)
return sources
@@ -295,7 +295,7 @@ def init_command(
destination_type: str,
repo_location: str,
branch: str = None,
- omit_core_sources: bool = False,
+ eject_source: bool = False,
) -> None:
# try to import the destination and get config spec
destination_reference = Destination.from_reference(destination_type)
@@ -310,13 +310,9 @@ def init_command(
# discover type of source
source_type: files_ops.TSourceType = "template"
- if (
- source_name in files_ops.get_sources_names(core_sources_storage, source_type="core")
- ) and not omit_core_sources:
+ if source_name in files_ops.get_sources_names(core_sources_storage, source_type="core"):
source_type = "core"
else:
- if omit_core_sources:
- fmt.echo("Omitting dlt core sources.")
verified_sources_storage = _clone_and_get_verified_sources_storage(repo_location, branch)
if source_name in files_ops.get_sources_names(
verified_sources_storage, source_type="verified"
@@ -380,7 +376,7 @@ def init_command(
else:
if source_type == "core":
source_configuration = files_ops.get_core_source_configuration(
- core_sources_storage, source_name
+ core_sources_storage, source_name, eject_source
)
from importlib.metadata import Distribution
@@ -392,6 +388,9 @@ def init_command(
if canonical_source_name in extras:
source_configuration.requirements.update_dlt_extras(canonical_source_name)
+
+ # create remote modified index to copy files when ejecting
+ remote_modified = {file_name: None for file_name in source_configuration.files}
else:
if not is_valid_schema_name(source_name):
raise InvalidSchemaName(source_name)
@@ -536,11 +535,17 @@ def init_command(
"Creating a new pipeline with the dlt core source %s (%s)"
% (fmt.bold(source_name), source_configuration.doc)
)
- fmt.echo(
- "NOTE: Beginning with dlt 1.0.0, the source %s will no longer be copied from the"
- " verified sources repo but imported from dlt.sources. You can provide the"
- " --omit-core-sources flag to revert to the old behavior." % (fmt.bold(source_name))
- )
+ if eject_source:
+ fmt.echo(
+ "NOTE: Source code of %s will be ejected. Remember to modify the pipeline "
+ "example script to import the ejected source." % (fmt.bold(source_name))
+ )
+ else:
+ fmt.echo(
+ "NOTE: Beginning with dlt 1.0.0, the source %s will no longer be copied from"
+ " the verified sources repo but imported from dlt.sources. You can provide the"
+ " --eject flag to revert to the old behavior." % (fmt.bold(source_name))
+ )
elif source_configuration.source_type == "verified":
fmt.echo(
"Creating and configuring a new pipeline with the verified source %s (%s)"
diff --git a/dlt/cli/pipeline_files.py b/dlt/cli/pipeline_files.py
index b6f8f85271..c0139fe2a7 100644
--- a/dlt/cli/pipeline_files.py
+++ b/dlt/cli/pipeline_files.py
@@ -226,11 +226,31 @@ def get_template_configuration(
)
+def _get_source_files(sources_storage: FileStorage, source_name: str) -> List[str]:
+ """Get all files that belong to source `source_name`"""
+ files: List[str] = []
+ for root, subdirs, _files in os.walk(sources_storage.make_full_path(source_name)):
+ # prune ignored subdirectories in place so os.walk does not descend into them
+ for subdir in list(subdirs):
+ if any(fnmatch.fnmatch(subdir, ignore) for ignore in IGNORE_FILES):
+ subdirs.remove(subdir)
+ rel_root = sources_storage.to_relative_path(root)
+ files.extend(
+ [
+ os.path.join(rel_root, file)
+ for file in _files
+ if all(not fnmatch.fnmatch(file, ignore) for ignore in IGNORE_FILES)
+ ]
+ )
+ return files
+
+
def get_core_source_configuration(
- sources_storage: FileStorage, source_name: str
+ sources_storage: FileStorage, source_name: str, eject_source: bool
) -> SourceConfiguration:
src_pipeline_file = CORE_SOURCE_TEMPLATE_MODULE_NAME + "/" + source_name + PIPELINE_FILE_SUFFIX
dest_pipeline_file = source_name + PIPELINE_FILE_SUFFIX
+ files: List[str] = _get_source_files(sources_storage, source_name) if eject_source else []
return SourceConfiguration(
"core",
@@ -238,7 +258,7 @@ def get_core_source_configuration(
sources_storage,
src_pipeline_file,
dest_pipeline_file,
- [".gitignore"],
+ files,
SourceRequirements([]),
_get_docstring_for_module(sources_storage, source_name),
False,
@@ -259,21 +279,7 @@ def get_verified_source_configuration(
f"Pipeline example script {example_script} could not be found in the repository",
source_name,
)
- # get all files recursively
- files: List[str] = []
- for root, subdirs, _files in os.walk(sources_storage.make_full_path(source_name)):
- # filter unwanted files
- for subdir in list(subdirs):
- if any(fnmatch.fnmatch(subdir, ignore) for ignore in IGNORE_FILES):
- subdirs.remove(subdir)
- rel_root = sources_storage.to_relative_path(root)
- files.extend(
- [
- os.path.join(rel_root, file)
- for file in _files
- if all(not fnmatch.fnmatch(file, ignore) for ignore in IGNORE_FILES)
- ]
- )
+ files = _get_source_files(sources_storage, source_name)
# read requirements
requirements_path = os.path.join(source_name, utils.REQUIREMENTS_TXT)
if sources_storage.has_file(requirements_path):
diff --git a/dlt/cli/plugins.py b/dlt/cli/plugins.py
index cc2d4594b9..1712efbbd7 100644
--- a/dlt/cli/plugins.py
+++ b/dlt/cli/plugins.py
@@ -84,14 +84,10 @@ def configure_parser(self, parser: argparse.ArgumentParser) -> None:
)
parser.add_argument(
- "--omit-core-sources",
+ "--eject",
default=False,
action="store_true",
- help=(
- "When present, will not create the new pipeline with a core source of the given"
- " name but will take a source of this name from the default or provided"
- " location."
- ),
+ help="Ejects the source code of the core source like sql_database",
)
def execute(self, args: argparse.Namespace) -> None:
@@ -107,7 +103,7 @@ def execute(self, args: argparse.Namespace) -> None:
args.destination,
args.location,
args.branch,
- args.omit_core_sources,
+ args.eject,
)
diff --git a/dlt/cli/source_detection.py b/dlt/cli/source_detection.py
index 7067f8b896..0769605d01 100644
--- a/dlt/cli/source_detection.py
+++ b/dlt/cli/source_detection.py
@@ -29,8 +29,7 @@ def find_call_arguments_to_replace(
if not isinstance(dn_node, ast.Constant) or not isinstance(dn_node.value, str):
raise CliCommandInnerException(
"init",
- f"The pipeline script {init_script_name} must pass the {t_arg_name} as"
- f" string to '{arg_name}' function in line {dn_node.lineno}",
+ f"The pipeline script {init_script_name} must pass the {t_arg_name} as string to '{arg_name}' function in line {dn_node.lineno}", # type: ignore[attr-defined]
)
else:
transformed_nodes.append((dn_node, ast.Constant(value=t_value, kind=None)))
diff --git a/dlt/common/configuration/providers/toml.py b/dlt/common/configuration/providers/toml.py
index 3636565fae..e586fef225 100644
--- a/dlt/common/configuration/providers/toml.py
+++ b/dlt/common/configuration/providers/toml.py
@@ -124,6 +124,12 @@ def _read_google_colab_secrets(self, name: str, file_name: str) -> tomlkit.TOMLD
"""Try to load the toml from google colab userdata object"""
try:
from google.colab import userdata
+ from dlt.common.runtime.exec_info import is_notebook
+
+ # make sure we work in interactive mode (get_ipython() is available)
+ # when dlt cli is run, userdata is available but without a kernel
+ if not is_notebook():
+ return None
try:
return tomlkit.loads(userdata.get(file_name))
diff --git a/dlt/common/configuration/specs/aws_credentials.py b/dlt/common/configuration/specs/aws_credentials.py
index 5f69be6a33..a75cd85225 100644
--- a/dlt/common/configuration/specs/aws_credentials.py
+++ b/dlt/common/configuration/specs/aws_credentials.py
@@ -8,6 +8,7 @@
CredentialsWithDefault,
configspec,
)
+from dlt.common.configuration.specs.mixins import WithObjectStoreRsCredentials, WithPyicebergConfig
from dlt.common.configuration.specs.exceptions import (
InvalidBoto3Session,
ObjectStoreRsCredentialsException,
@@ -16,7 +17,9 @@
@configspec
-class AwsCredentialsWithoutDefaults(CredentialsConfiguration):
+class AwsCredentialsWithoutDefaults(
+ CredentialsConfiguration, WithObjectStoreRsCredentials, WithPyicebergConfig
+):
# credentials without boto implementation
aws_access_key_id: str = None
aws_secret_access_key: TSecretStrValue = None
@@ -77,6 +80,16 @@ def to_object_store_rs_credentials(self) -> Dict[str, str]:
return creds
+ def to_pyiceberg_fileio_config(self) -> Dict[str, Any]:
+ return {
+ "s3.access-key-id": self.aws_access_key_id,
+ "s3.secret-access-key": self.aws_secret_access_key,
+ "s3.session-token": self.aws_session_token,
+ "s3.region": self.region_name,
+ "s3.endpoint": self.endpoint_url,
+ "s3.connect-timeout": 300,
+ }
+
@configspec
class AwsCredentials(AwsCredentialsWithoutDefaults, CredentialsWithDefault):
diff --git a/dlt/common/configuration/specs/azure_credentials.py b/dlt/common/configuration/specs/azure_credentials.py
index cf6ec493de..aabd0b471a 100644
--- a/dlt/common/configuration/specs/azure_credentials.py
+++ b/dlt/common/configuration/specs/azure_credentials.py
@@ -8,6 +8,7 @@
CredentialsWithDefault,
configspec,
)
+from dlt.common.configuration.specs.mixins import WithObjectStoreRsCredentials, WithPyicebergConfig
from dlt import version
from dlt.common.utils import without_none
@@ -15,7 +16,7 @@
@configspec
-class AzureCredentialsBase(CredentialsConfiguration):
+class AzureCredentialsBase(CredentialsConfiguration, WithObjectStoreRsCredentials):
azure_storage_account_name: str = None
azure_account_host: Optional[str] = None
"""Alternative host when accessing blob storage endpoint ie. my_account.dfs.core.windows.net"""
@@ -32,7 +33,7 @@ def to_object_store_rs_credentials(self) -> Dict[str, str]:
@configspec
-class AzureCredentialsWithoutDefaults(AzureCredentialsBase):
+class AzureCredentialsWithoutDefaults(AzureCredentialsBase, WithPyicebergConfig):
"""Credentials for Azure Blob Storage, compatible with adlfs"""
azure_storage_account_key: Optional[TSecretStrValue] = None
@@ -49,6 +50,13 @@ def to_adlfs_credentials(self) -> Dict[str, Any]:
account_host=self.azure_account_host,
)
+ def to_pyiceberg_fileio_config(self) -> Dict[str, Any]:
+ return {
+ "adlfs.account-name": self.azure_storage_account_name,
+ "adlfs.account-key": self.azure_storage_account_key,
+ "adlfs.sas-token": self.azure_storage_sas_token,
+ }
+
def create_sas_token(self) -> None:
try:
from azure.storage.blob import generate_account_sas, ResourceTypes
@@ -72,7 +80,7 @@ def on_partial(self) -> None:
@configspec
-class AzureServicePrincipalCredentialsWithoutDefaults(AzureCredentialsBase):
+class AzureServicePrincipalCredentialsWithoutDefaults(AzureCredentialsBase, WithPyicebergConfig):
azure_tenant_id: str = None
azure_client_id: str = None
azure_client_secret: TSecretStrValue = None
@@ -86,6 +94,14 @@ def to_adlfs_credentials(self) -> Dict[str, Any]:
client_secret=self.azure_client_secret,
)
+ def to_pyiceberg_fileio_config(self) -> Dict[str, Any]:
+ return {
+ "adlfs.account-name": self.azure_storage_account_name,
+ "adlfs.tenant-id": self.azure_tenant_id,
+ "adlfs.client-id": self.azure_client_id,
+ "adlfs.client-secret": self.azure_client_secret,
+ }
+
@configspec
class AzureCredentials(AzureCredentialsWithoutDefaults, CredentialsWithDefault):
diff --git a/dlt/common/configuration/specs/base_configuration.py b/dlt/common/configuration/specs/base_configuration.py
index 8d913d0542..41d1d7a0ca 100644
--- a/dlt/common/configuration/specs/base_configuration.py
+++ b/dlt/common/configuration/specs/base_configuration.py
@@ -359,7 +359,7 @@ def _get_resolvable_dataclass_fields(cls) -> Iterator[TDtcField]:
def get_resolvable_fields(cls) -> Dict[str, type]:
"""Returns a mapping of fields to their type hints. Dunders should not be resolved and are not returned"""
return {
- f.name: eval(f.type) if isinstance(f.type, str) else f.type # type: ignore[arg-type]
+ f.name: eval(f.type) if isinstance(f.type, str) else f.type
for f in cls._get_resolvable_dataclass_fields()
}
diff --git a/dlt/common/configuration/specs/config_providers_context.py b/dlt/common/configuration/specs/config_providers_context.py
index 5d1a5b7f26..a244ab571f 100644
--- a/dlt/common/configuration/specs/config_providers_context.py
+++ b/dlt/common/configuration/specs/config_providers_context.py
@@ -1,5 +1,4 @@
import contextlib
-import dataclasses
import io
from typing import ClassVar, List
@@ -8,10 +7,6 @@
ConfigProvider,
ContextProvider,
)
-from dlt.common.configuration.specs.base_configuration import (
- ContainerInjectableContext,
- NotResolved,
-)
from dlt.common.configuration.specs import (
GcpServiceAccountCredentials,
BaseConfiguration,
@@ -137,7 +132,7 @@ def _airflow_providers() -> List[ConfigProvider]:
# check if we are in task context and provide more info
from airflow.operators.python import get_current_context # noqa
- ti: TaskInstance = get_current_context()["ti"] # type: ignore
+ ti: TaskInstance = get_current_context()["ti"] # type: ignore[assignment,unused-ignore]
# log outside of stderr/out redirect
if secrets_toml_var is None:
diff --git a/dlt/common/configuration/specs/exceptions.py b/dlt/common/configuration/specs/exceptions.py
index 928e46a8a0..fe87ef24d7 100644
--- a/dlt/common/configuration/specs/exceptions.py
+++ b/dlt/common/configuration/specs/exceptions.py
@@ -72,3 +72,7 @@ def __init__(self, spec: Type[Any], native_value: Any):
class ObjectStoreRsCredentialsException(ConfigurationException):
pass
+
+
+class UnsupportedAuthenticationMethodException(ConfigurationException):
+ pass
diff --git a/dlt/common/configuration/specs/gcp_credentials.py b/dlt/common/configuration/specs/gcp_credentials.py
index 60ab1d4b56..17519b032a 100644
--- a/dlt/common/configuration/specs/gcp_credentials.py
+++ b/dlt/common/configuration/specs/gcp_credentials.py
@@ -11,7 +11,9 @@
InvalidGoogleServicesJson,
NativeValueError,
OAuth2ScopesRequired,
+ UnsupportedAuthenticationMethodException,
)
+from dlt.common.configuration.specs.mixins import WithObjectStoreRsCredentials, WithPyicebergConfig
from dlt.common.exceptions import MissingDependencyException
from dlt.common.typing import DictStrAny, TSecretStrValue, StrAny
from dlt.common.configuration.specs.base_configuration import (
@@ -23,7 +25,7 @@
@configspec
-class GcpCredentials(CredentialsConfiguration):
+class GcpCredentials(CredentialsConfiguration, WithObjectStoreRsCredentials, WithPyicebergConfig):
token_uri: Final[str] = dataclasses.field(
default="https://oauth2.googleapis.com/token", init=False, repr=False, compare=False
)
@@ -126,6 +128,12 @@ def to_native_credentials(self) -> Any:
else:
return ServiceAccountCredentials.from_service_account_info(self)
+ def to_pyiceberg_fileio_config(self) -> Dict[str, Any]:
+ raise UnsupportedAuthenticationMethodException(
+ "Service Account authentication not supported with `iceberg` table format. Use OAuth"
+ " authentication instead."
+ )
+
def __str__(self) -> str:
return f"{self.client_email}@{self.project_id}"
@@ -176,11 +184,19 @@ def to_native_representation(self) -> str:
return json.dumps(self._info_dict())
def to_object_store_rs_credentials(self) -> Dict[str, str]:
- raise NotImplementedError(
- "`object_store` Rust crate does not support OAuth for GCP credentials. Reference:"
- " https://docs.rs/object_store/latest/object_store/gcp."
+ raise UnsupportedAuthenticationMethodException(
+ "OAuth authentication not supported with `delta` table format. Use Service Account or"
+ " Application Default Credentials authentication instead."
)
+ def to_pyiceberg_fileio_config(self) -> Dict[str, Any]:
+ self.auth()
+ return {
+ "gcs.project-id": self.project_id,
+ "gcs.oauth2.token": self.token,
+ "gcs.oauth2.token-expires-at": (pendulum.now().timestamp() + 60) * 1000,
+ }
+
def auth(self, scopes: Union[str, List[str]] = None, redirect_url: str = None) -> None:
if not self.refresh_token:
self.add_scopes(scopes)
@@ -313,6 +329,12 @@ def to_native_credentials(self) -> Any:
else:
return super().to_native_credentials()
+ def to_pyiceberg_fileio_config(self) -> Dict[str, Any]:
+ raise UnsupportedAuthenticationMethodException(
+ "Application Default Credentials authentication not supported with `iceberg` table"
+ " format. Use OAuth authentication instead."
+ )
+
@configspec
class GcpServiceAccountCredentials(
@@ -334,3 +356,9 @@ def parse_native_representation(self, native_value: Any) -> None:
except NativeValueError:
pass
GcpOAuthCredentialsWithoutDefaults.parse_native_representation(self, native_value)
+
+ def to_pyiceberg_fileio_config(self) -> Dict[str, Any]:
+ if self.has_default_credentials():
+ return GcpDefaultCredentials.to_pyiceberg_fileio_config(self)
+ else:
+ return GcpOAuthCredentialsWithoutDefaults.to_pyiceberg_fileio_config(self)
diff --git a/dlt/common/configuration/specs/mixins.py b/dlt/common/configuration/specs/mixins.py
new file mode 100644
index 0000000000..2f843aee5b
--- /dev/null
+++ b/dlt/common/configuration/specs/mixins.py
@@ -0,0 +1,24 @@
+from typing import Dict, Any
+from abc import abstractmethod, ABC
+
+
+class WithObjectStoreRsCredentials(ABC):
+ @abstractmethod
+ def to_object_store_rs_credentials(self) -> Dict[str, Any]:
+ """Returns credentials dictionary for object_store Rust crate.
+
+ Can be used for libraries that build on top of the object_store crate, such as `deltalake`.
+
+ https://docs.rs/object_store/latest/object_store/
+ """
+ pass
+
+
+class WithPyicebergConfig(ABC):
+ @abstractmethod
+ def to_pyiceberg_fileio_config(self) -> Dict[str, Any]:
+ """Returns `pyiceberg` FileIO configuration dictionary.
+
+ https://py.iceberg.apache.org/configuration/#fileio
+ """
+ pass
diff --git a/dlt/common/data_writers/buffered.py b/dlt/common/data_writers/buffered.py
index e2b6c9a442..6ef431a4d0 100644
--- a/dlt/common/data_writers/buffered.py
+++ b/dlt/common/data_writers/buffered.py
@@ -242,7 +242,7 @@ def _flush_items(self, allow_empty_file: bool = False) -> None:
if self.writer_spec.is_binary_format:
self._file = self.open(self._file_name, "wb") # type: ignore
else:
- self._file = self.open(self._file_name, "wt", encoding="utf-8", newline="") # type: ignore
+ self._file = self.open(self._file_name, "wt", encoding="utf-8", newline="")
self._writer = self.writer_cls(self._file, caps=self._caps) # type: ignore[assignment]
self._writer.write_header(self._current_columns)
# write buffer
diff --git a/dlt/common/destination/reference.py b/dlt/common/destination/reference.py
index e27f99cde7..827034ddca 100644
--- a/dlt/common/destination/reference.py
+++ b/dlt/common/destination/reference.py
@@ -67,7 +67,7 @@
TDestinationConfig = TypeVar("TDestinationConfig", bound="DestinationClientConfiguration")
TDestinationClient = TypeVar("TDestinationClient", bound="JobClientBase")
TDestinationDwhClient = TypeVar("TDestinationDwhClient", bound="DestinationClientDwhConfiguration")
-TDatasetType = Literal["dbapi", "ibis"]
+TDatasetType = Literal["auto", "default", "ibis"]
DEFAULT_FILE_LAYOUT = "{table_name}/{load_id}.{file_id}.{ext}"
@@ -76,7 +76,7 @@
try:
from dlt.common.libs.pandas import DataFrame
from dlt.common.libs.pyarrow import Table as ArrowTable
- from dlt.common.libs.ibis import BaseBackend as IbisBackend
+ from dlt.helpers.ibis import BaseBackend as IbisBackend
except MissingDependencyException:
DataFrame = Any
ArrowTable = Any
@@ -535,7 +535,7 @@ def fetchone(self) -> Optional[Tuple[Any, ...]]:
...
# modifying access parameters
- def limit(self, limit: int) -> "SupportsReadableRelation":
+ def limit(self, limit: int, **kwargs: Any) -> "SupportsReadableRelation":
"""limit the result to 'limit' items"""
...
@@ -557,6 +557,10 @@ def __getitem__(self, columns: Union[str, Sequence[str]]) -> "SupportsReadableRe
"""set which columns will be selected"""
...
+ def __getattr__(self, attr: str) -> Any:
+ """get an attribute of the relation"""
+ ...
+
def __copy__(self) -> "SupportsReadableRelation":
"""create a copy of the relation object"""
...
@@ -588,6 +592,10 @@ def __getattr__(self, table: str) -> SupportsReadableRelation: ...
def ibis(self) -> IbisBackend: ...
+ def row_counts(
+ self, *, data_tables: bool = True, dlt_tables: bool = False, table_names: List[str] = None
+ ) -> SupportsReadableRelation: ...
+
class JobClientBase(ABC):
def __init__(
diff --git a/dlt/common/destination/utils.py b/dlt/common/destination/utils.py
index 0bad5b152e..c98344b687 100644
--- a/dlt/common/destination/utils.py
+++ b/dlt/common/destination/utils.py
@@ -38,7 +38,7 @@ def verify_schema_capabilities(
exception_log: List[Exception] = []
# combined casing function
case_identifier = lambda ident: capabilities.casefold_identifier(
- (str if capabilities.has_case_sensitive_identifiers else str.casefold)(ident) # type: ignore
+ (str if capabilities.has_case_sensitive_identifiers else str.casefold)(ident)
)
table_name_lookup: DictStrStr = {}
# name collision explanation
diff --git a/dlt/common/incremental/typing.py b/dlt/common/incremental/typing.py
index 460e2f234b..2ca981bff0 100644
--- a/dlt/common/incremental/typing.py
+++ b/dlt/common/incremental/typing.py
@@ -8,6 +8,8 @@
LastValueFunc = Callable[[Sequence[TCursorValue]], Any]
OnCursorValueMissing = Literal["raise", "include", "exclude"]
+TIncrementalRange = Literal["open", "closed"]
+
class IncrementalColumnState(TypedDict):
initial_value: Optional[Any]
@@ -26,3 +28,5 @@ class IncrementalArgs(TypedDict, total=False):
allow_external_schedulers: Optional[bool]
lag: Optional[Union[float, int]]
on_cursor_value_missing: Optional[OnCursorValueMissing]
+ range_start: Optional[TIncrementalRange]
+ range_end: Optional[TIncrementalRange]
diff --git a/dlt/common/libs/deltalake.py b/dlt/common/libs/deltalake.py
index 4047bc3a1a..0f938e7102 100644
--- a/dlt/common/libs/deltalake.py
+++ b/dlt/common/libs/deltalake.py
@@ -10,6 +10,7 @@
from dlt.common.exceptions import MissingDependencyException
from dlt.common.storages import FilesystemConfiguration
from dlt.common.utils import assert_min_pkg_version
+from dlt.common.configuration.specs.mixins import WithObjectStoreRsCredentials
from dlt.destinations.impl.filesystem.filesystem import FilesystemClient
try:
@@ -191,10 +192,9 @@ def get_delta_tables(
def _deltalake_storage_options(config: FilesystemConfiguration) -> Dict[str, str]:
"""Returns dict that can be passed as `storage_options` in `deltalake` library."""
- creds = {} # type: ignore
+ creds = {}
extra_options = {}
- # TODO: create a mixin with to_object_store_rs_credentials for a proper discovery
- if hasattr(config.credentials, "to_object_store_rs_credentials"):
+ if isinstance(config.credentials, WithObjectStoreRsCredentials):
creds = config.credentials.to_object_store_rs_credentials()
if config.deltalake_storage_options is not None:
extra_options = config.deltalake_storage_options
diff --git a/dlt/common/libs/pyiceberg.py b/dlt/common/libs/pyiceberg.py
new file mode 100644
index 0000000000..19ce9abbf2
--- /dev/null
+++ b/dlt/common/libs/pyiceberg.py
@@ -0,0 +1,192 @@
+from typing import Dict, Any, List, Optional
+
+from dlt import version, Pipeline
+from dlt.common.libs.pyarrow import cast_arrow_schema_types
+from dlt.common.schema.typing import TWriteDisposition
+from dlt.common.utils import assert_min_pkg_version
+from dlt.common.exceptions import MissingDependencyException
+from dlt.common.storages.configuration import FileSystemCredentials
+from dlt.common.configuration.specs import CredentialsConfiguration
+from dlt.common.configuration.specs.mixins import WithPyicebergConfig
+from dlt.destinations.impl.filesystem.filesystem import FilesystemClient
+
+
+try:
+ from pyiceberg.table import Table as IcebergTable
+ from pyiceberg.catalog import MetastoreCatalog
+ import pyarrow as pa
+except ModuleNotFoundError:
+ raise MissingDependencyException(
+ "dlt pyiceberg helpers",
+ [f"{version.DLT_PKG_NAME}[pyiceberg]"],
+ "Install `pyiceberg` so dlt can create Iceberg tables in the `filesystem` destination.",
+ )
+
+
+def ensure_iceberg_compatible_arrow_schema(schema: pa.Schema) -> pa.Schema:
+ ARROW_TO_ICEBERG_COMPATIBLE_ARROW_TYPE_MAP = {
+ pa.types.is_time: pa.string(),
+ pa.types.is_decimal256: pa.string(), # pyarrow does not allow downcasting to decimal128
+ }
+ return cast_arrow_schema_types(schema, ARROW_TO_ICEBERG_COMPATIBLE_ARROW_TYPE_MAP)
+
+
+def ensure_iceberg_compatible_arrow_data(data: pa.Table) -> pa.Table:
+ schema = ensure_iceberg_compatible_arrow_schema(data.schema)
+ return data.cast(schema)
+
+
+def write_iceberg_table(
+ table: IcebergTable,
+ data: pa.Table,
+ write_disposition: TWriteDisposition,
+) -> None:
+ if write_disposition == "append":
+ table.append(ensure_iceberg_compatible_arrow_data(data))
+ elif write_disposition == "replace":
+ table.overwrite(ensure_iceberg_compatible_arrow_data(data))
+
+
+def get_sql_catalog(credentials: FileSystemCredentials) -> "SqlCatalog": # type: ignore[name-defined] # noqa: F821
+ assert_min_pkg_version(
+ pkg_name="sqlalchemy",
+ version="2.0.18",
+ msg=(
+ "`sqlalchemy>=2.0.18` is needed for `iceberg` table format on `filesystem` destination."
+ ),
+ )
+
+ from pyiceberg.catalog.sql import SqlCatalog
+
+ return SqlCatalog(
+ "default",
+ uri="sqlite:///:memory:",
+ **_get_fileio_config(credentials),
+ )
+
+
+def create_or_evolve_table(
+    catalog: MetastoreCatalog,
+    client: FilesystemClient,
+    table_name: str,
+    namespace_name: Optional[str] = None,
+    schema: Optional[pa.Schema] = None,
+    partition_columns: Optional[List[str]] = None,
+) -> MetastoreCatalog:
+    """Registers an existing table (evolving its schema) or creates a new one in `catalog`."""
+    table_id = f"{namespace_name}.{table_name}"
+    table_path = f"{client.dataset_path}/{table_name}"
+    metadata_path = f"{table_path}/metadata"
+    if client.fs_client.exists(metadata_path):
+        # found metadata; register existing table from its last metadata file
+        table = _register_table(table_id, metadata_path, catalog, client)
+
+        # evolve schema only if a new (iceberg-compatible) schema was provided
+        if schema is not None:
+            with table.update_schema() as update:
+                update.union_by_name(ensure_iceberg_compatible_arrow_schema(schema))
+    else:
+        # found no metadata; a new table can only be created when schema is known
+        assert schema is not None
+        with catalog.create_table_transaction(
+            table_id,
+            schema=ensure_iceberg_compatible_arrow_schema(schema),
+            location=_make_path(table_path, client),
+        ) as txn:
+            # add identity partitioning; `partition_columns` may be None (the default)
+            with txn.update_spec() as update_spec:
+                for col in partition_columns or []:
+                    update_spec.add_identity(col)
+
+    return catalog
+
+
+def get_catalog(
+ client: FilesystemClient,
+ table_name: str,
+ namespace_name: Optional[str] = None,
+ schema: Optional[pa.Schema] = None,
+ partition_columns: Optional[List[str]] = None,
+) -> MetastoreCatalog:
+ """Returns single-table, ephemeral, in-memory Iceberg catalog."""
+
+ # create in-memory catalog
+ catalog: MetastoreCatalog = get_sql_catalog(client.config.credentials)
+
+ # create namespace
+ if namespace_name is None:
+ namespace_name = client.dataset_name
+ catalog.create_namespace(namespace_name)
+
+ # add table to catalog
+ catalog = create_or_evolve_table(
+ catalog=catalog,
+ client=client,
+ table_name=table_name,
+ namespace_name=namespace_name,
+ schema=schema,
+ partition_columns=partition_columns,
+ )
+
+ return catalog
+
+
+def get_iceberg_tables(
+    pipeline: Pipeline, *tables: str, schema_name: Optional[str] = None
+) -> Dict[str, IcebergTable]:
+    """Returns a dict of Iceberg tables by name: all in the schema, or only `tables` if given."""
+    from dlt.common.schema.utils import get_table_format
+    with pipeline.destination_client(schema_name=schema_name) as client:
+        assert isinstance(
+            client, FilesystemClient
+        ), "The `get_iceberg_tables` function requires a `filesystem` destination."
+        # names of all tables in the client schema that use the `iceberg` table format
+        schema_iceberg_tables = [
+            t["name"]
+            for t in client.schema.tables.values()
+            if get_table_format(client.schema.tables, t["name"]) == "iceberg"
+        ]
+        if len(tables) > 0:
+            invalid_tables = set(tables) - set(schema_iceberg_tables)
+            if len(invalid_tables) > 0:
+                available_schemas = ""
+                if len(pipeline.schema_names) > 1:
+                    available_schemas = f" Available schemas are {pipeline.schema_names}"
+                raise ValueError(
+                    f"Schema {client.schema.name} does not contain Iceberg tables with these names:"
+                    f" {', '.join(sorted(invalid_tables))}.{available_schemas}"
+                )
+            schema_iceberg_tables = [t for t in schema_iceberg_tables if t in tables]
+        # `get_catalog` registers each table under `client.dataset_name`, so load it from there
+        return {
+            name: get_catalog(client, name).load_table(f"{client.dataset_name}.{name}")
+            for name in schema_iceberg_tables
+        }
+
+
+def _get_fileio_config(credentials: CredentialsConfiguration) -> Dict[str, Any]:
+ if isinstance(credentials, WithPyicebergConfig):
+ return credentials.to_pyiceberg_fileio_config()
+ return {}
+
+
+def _get_last_metadata_file(metadata_path: str, client: FilesystemClient) -> str:
+ # TODO: implement faster way to obtain `last_metadata_file` (listing is slow)
+ metadata_files = [f for f in client.fs_client.ls(metadata_path) if f.endswith(".json")]
+ return _make_path(sorted(metadata_files)[-1], client)
+
+
+def _register_table(
+ identifier: str,
+ metadata_path: str,
+ catalog: MetastoreCatalog,
+ client: FilesystemClient,
+) -> IcebergTable:
+ last_metadata_file = _get_last_metadata_file(metadata_path, client)
+ return catalog.register_table(identifier, last_metadata_file)
+
+
+def _make_path(path: str, client: FilesystemClient) -> str:
+ # don't use file protocol for local files because duckdb does not support it
+ # https://github.com/duckdb/duckdb/issues/13669
+ return path if client.is_local_filesystem else client.config.make_url(path)
diff --git a/dlt/common/logger.py b/dlt/common/logger.py
index b163c15672..634e305805 100644
--- a/dlt/common/logger.py
+++ b/dlt/common/logger.py
@@ -47,7 +47,7 @@ def is_logging() -> bool:
def log_level() -> str:
if not LOGGER:
raise RuntimeError("Logger not initialized")
- return logging.getLevelName(LOGGER.level) # type: ignore
+ return logging.getLevelName(LOGGER.level)
def is_json_logging(log_format: str) -> bool:
diff --git a/dlt/common/metrics.py b/dlt/common/metrics.py
index d6acf19d0d..2f9f574dd0 100644
--- a/dlt/common/metrics.py
+++ b/dlt/common/metrics.py
@@ -9,7 +9,7 @@ class DataWriterMetrics(NamedTuple):
created: float
last_modified: float
- def __add__(self, other: Tuple[object, ...], /) -> Tuple[object, ...]:
+ def __add__(self, other: Tuple[object, ...], /) -> Tuple[object, ...]: # type: ignore[override]
if isinstance(other, DataWriterMetrics):
return DataWriterMetrics(
self.file_path if self.file_path == other.file_path else "",
diff --git a/dlt/common/normalizers/json/__init__.py b/dlt/common/normalizers/json/__init__.py
index 725f6a8355..ae5e06fe2e 100644
--- a/dlt/common/normalizers/json/__init__.py
+++ b/dlt/common/normalizers/json/__init__.py
@@ -36,6 +36,10 @@ def extend_schema(self) -> None:
def extend_table(self, table_name: str) -> None:
pass
+ @abc.abstractmethod
+ def remove_table(self, table_name: str) -> None:
+ pass
+
@classmethod
@abc.abstractmethod
def update_normalizer_config(cls, schema: Schema, config: TNormalizerConfig) -> None:
diff --git a/dlt/common/normalizers/json/relational.py b/dlt/common/normalizers/json/relational.py
index e365017125..36845b2e14 100644
--- a/dlt/common/normalizers/json/relational.py
+++ b/dlt/common/normalizers/json/relational.py
@@ -1,4 +1,16 @@
-from typing import Dict, List, Mapping, Optional, Sequence, Tuple, cast, TypedDict, Any
+from typing import (
+ ClassVar,
+ Dict,
+ List,
+ Mapping,
+ Optional,
+ Sequence,
+ Tuple,
+ Type,
+ cast,
+ TypedDict,
+ Any,
+)
from dlt.common.normalizers.exceptions import InvalidJsonNormalizer
from dlt.common.normalizers.typing import TJSONNormalizer
@@ -14,6 +26,9 @@
from dlt.common.schema.utils import (
column_name_validator,
is_nested_table,
+ get_nested_tables,
+ has_column_with_prop,
+ get_first_column_name_with_prop,
)
from dlt.common.utils import update_dict_nested
from dlt.common.normalizers.json import (
@@ -48,6 +63,7 @@ class DataItemNormalizer(DataItemNormalizerBase[RelationalNormalizerConfig]):
# other constants
EMPTY_KEY_IDENTIFIER = "_empty" # replace empty keys with this
+ RELATIONAL_CONFIG_TYPE: ClassVar[Type[RelationalNormalizerConfig]] = RelationalNormalizerConfig
normalizer_config: RelationalNormalizerConfig
propagation_config: RelationalNormalizerConfigPropagation
@@ -310,20 +326,38 @@ def extend_table(self, table_name: str) -> None:
Table name should be normalized.
"""
table = self.schema.tables.get(table_name)
- if not is_nested_table(table) and table.get("write_disposition") == "merge":
- DataItemNormalizer.update_normalizer_config(
+ # add root key prop when merge disposition is used or any of nested tables needs root_key
+ if not is_nested_table(table) and (
+ table.get("write_disposition") == "merge"
+ or any(
+ has_column_with_prop(t, "root_key", include_incomplete=True)
+ for t in get_nested_tables(self.schema.tables, table_name)
+ )
+ ):
+ # get row id column from table, assume that we propagate it into c_dlt_root_id always
+ c_dlt_id = get_first_column_name_with_prop(table, "row_key", include_incomplete=True)
+ self.update_normalizer_config(
self.schema,
{
"propagation": {
"tables": {
table_name: {
- TColumnName(self.c_dlt_id): TColumnName(self.c_dlt_root_id)
+ TColumnName(c_dlt_id or self.c_dlt_id): TColumnName(
+ self.c_dlt_root_id
+ )
}
}
}
},
)
+ def remove_table(self, table_name: str) -> None:
+ """Called by the Schema when table is removed from it."""
+ config = self.get_normalizer_config(self.schema)
+ if propagation := config.get("propagation"):
+ if tables := propagation.get("tables"):
+ tables.pop(table_name, None)
+
def normalize_data_item(
self, item: TDataItem, load_id: str, table_name: str
) -> TNormalizedRowIterator:
@@ -352,8 +386,8 @@ def normalize_data_item(
def ensure_this_normalizer(cls, norm_config: TJSONNormalizer) -> None:
# make sure schema has right normalizer
present_normalizer = norm_config["module"]
- if present_normalizer != __name__:
- raise InvalidJsonNormalizer(__name__, present_normalizer)
+ if present_normalizer != cls.__module__:
+ raise InvalidJsonNormalizer(cls.__module__, present_normalizer)
@classmethod
def update_normalizer_config(cls, schema: Schema, config: RelationalNormalizerConfig) -> None:
@@ -371,8 +405,10 @@ def get_normalizer_config(cls, schema: Schema) -> RelationalNormalizerConfig:
cls.ensure_this_normalizer(norm_config)
return cast(RelationalNormalizerConfig, norm_config.get("config", {}))
- @staticmethod
- def _validate_normalizer_config(schema: Schema, config: RelationalNormalizerConfig) -> None:
+ @classmethod
+ def _validate_normalizer_config(
+ cls, schema: Schema, config: RelationalNormalizerConfig
+ ) -> None:
"""Normalizes all known column identifiers according to the schema and then validates the configuration"""
def _normalize_prop(
@@ -397,7 +433,7 @@ def _normalize_prop(
)
validate_dict(
- RelationalNormalizerConfig,
+ cls.RELATIONAL_CONFIG_TYPE,
config,
"./normalizers/json/config",
validator_f=column_name_validator(schema.naming),
diff --git a/dlt/common/reflection/utils.py b/dlt/common/reflection/utils.py
index c612c5a4f1..27c7bd8758 100644
--- a/dlt/common/reflection/utils.py
+++ b/dlt/common/reflection/utils.py
@@ -90,24 +90,24 @@ def rewrite_python_script(
last_line = -1
last_offset = -1
# sort transformed nodes by line and offset
- for node, t_value in sorted(transformed_nodes, key=lambda n: (n[0].lineno, n[0].col_offset)):
+ for node, t_value in sorted(transformed_nodes, key=lambda n: (n[0].lineno, n[0].col_offset)): # type: ignore[attr-defined]
# do we have a line changed
- if last_line != node.lineno - 1:
+ if last_line != node.lineno - 1: # type: ignore[attr-defined]
# add remainder from the previous line
if last_offset >= 0:
script_lines.append(source_script_lines[last_line][last_offset:])
# add all new lines from previous line to current
- script_lines.extend(source_script_lines[last_line + 1 : node.lineno - 1])
+ script_lines.extend(source_script_lines[last_line + 1 : node.lineno - 1]) # type: ignore[attr-defined]
# add trailing characters until node in current line starts
- script_lines.append(source_script_lines[node.lineno - 1][: node.col_offset])
+ script_lines.append(source_script_lines[node.lineno - 1][: node.col_offset]) # type: ignore[attr-defined]
elif last_offset >= 0:
# no line change, add the characters from the end of previous node to the current
- script_lines.append(source_script_lines[last_line][last_offset : node.col_offset])
+ script_lines.append(source_script_lines[last_line][last_offset : node.col_offset]) # type: ignore[attr-defined]
# replace node value
script_lines.append(ast_unparse(t_value).strip())
- last_line = node.end_lineno - 1
- last_offset = node.end_col_offset
+ last_line = node.end_lineno - 1 # type: ignore[attr-defined]
+ last_offset = node.end_col_offset # type: ignore[attr-defined]
# add all that was missing
if last_offset >= 0:
diff --git a/dlt/common/schema/schema.py b/dlt/common/schema/schema.py
index d6031a08fa..f2d75638fe 100644
--- a/dlt/common/schema/schema.py
+++ b/dlt/common/schema/schema.py
@@ -451,10 +451,12 @@ def drop_tables(
) -> List[TTableSchema]:
"""Drops tables from the schema and returns the dropped tables"""
result = []
+ # TODO: make sure all nested tables to table_names are also dropped
for table_name in table_names:
table = self.get_table(table_name)
if table and (not seen_data_only or utils.has_table_seen_data(table)):
result.append(self._schema_tables.pop(table_name))
+ self.data_item_normalizer.remove_table(table_name)
return result
def filter_row_with_hint(
@@ -525,7 +527,7 @@ def get_new_table_columns(
Typically they come from the destination schema. Columns that are in `existing_columns` and not in `table_name` columns are ignored.
Optionally includes incomplete columns (without data type)"""
- casefold_f: Callable[[str], str] = str.casefold if not case_sensitive else str # type: ignore[assignment]
+ casefold_f: Callable[[str], str] = str.casefold if not case_sensitive else str
casefold_existing = {
casefold_f(col_name): col for col_name, col in existing_columns.items()
}
diff --git a/dlt/common/schema/utils.py b/dlt/common/schema/utils.py
index 038abdc4d0..4f9e0eb42e 100644
--- a/dlt/common/schema/utils.py
+++ b/dlt/common/schema/utils.py
@@ -457,16 +457,8 @@ def diff_table(
* when columns with the same name have different data types
* when table links to different parent tables
"""
- if tab_a["name"] != tab_b["name"]:
- raise TablePropertiesConflictException(
- schema_name, tab_a["name"], "name", tab_a["name"], tab_b["name"]
- )
- table_name = tab_a["name"]
- # check if table properties can be merged
- if tab_a.get("parent") != tab_b.get("parent"):
- raise TablePropertiesConflictException(
- schema_name, table_name, "parent", tab_a.get("parent"), tab_b.get("parent")
- )
+ # allow for columns to differ
+ ensure_compatible_tables(schema_name, tab_a, tab_b, ensure_columns=False)
# get new columns, changes in the column data type or other properties are not allowed
tab_a_columns = tab_a["columns"]
@@ -474,18 +466,6 @@ def diff_table(
for col_b_name, col_b in tab_b["columns"].items():
if col_b_name in tab_a_columns:
col_a = tab_a_columns[col_b_name]
- # we do not support changing data types of columns
- if is_complete_column(col_a) and is_complete_column(col_b):
- if not compare_complete_columns(tab_a_columns[col_b_name], col_b):
- # attempt to update to incompatible columns
- raise CannotCoerceColumnException(
- schema_name,
- table_name,
- col_b_name,
- col_b["data_type"],
- tab_a_columns[col_b_name]["data_type"],
- None,
- )
# all other properties can change
merged_column = merge_column(copy(col_a), col_b)
if merged_column != col_a:
@@ -494,6 +474,8 @@ def diff_table(
new_columns.append(col_b)
# return partial table containing only name and properties that differ (column, filters etc.)
+ table_name = tab_a["name"]
+
partial_table: TPartialTableSchema = {
"name": table_name,
"columns": {} if new_columns is None else {c["name"]: c for c in new_columns},
@@ -519,6 +501,50 @@ def diff_table(
return partial_table
+def ensure_compatible_tables(
+    schema_name: str, tab_a: TTableSchema, tab_b: TPartialTableSchema, ensure_columns: bool = True
+) -> None:
+    """Checks that `tab_b` can be merged into `tab_a`, raising on the first conflict found:
+
+    - `TablePropertiesConflictException` when table names or parent tables differ
+    - `CannotCoerceColumnException` when a column complete in both fails `compare_complete_columns`
+
+    Column checks are skipped when `ensure_columns` is False.
+    Note: all the identifiers must be already normalized
+
+    """
+    # both tables must carry the same name
+    table_name = tab_a["name"]
+    if table_name != tab_b["name"]:
+        raise TablePropertiesConflictException(
+            schema_name, table_name, "name", table_name, tab_b["name"]
+        )
+    # nested tables must point to the same parent
+    if tab_a.get("parent") != tab_b.get("parent"):
+        raise TablePropertiesConflictException(
+            schema_name, table_name, "parent", tab_a.get("parent"), tab_b.get("parent")
+        )
+
+    if not ensure_columns:
+        return
+
+    columns_a = tab_a["columns"]
+    for col_name, col_b in tab_b["columns"].items():
+        if col_name not in columns_a:
+            continue
+        col_a = columns_a[col_name]
+        # we do not support changing data types of columns; incomplete columns are not compared
+        if not (is_complete_column(col_a) and is_complete_column(col_b)):
+            continue
+        if not compare_complete_columns(col_a, col_b):
+            # attempt to update to incompatible columns
+            raise CannotCoerceColumnException(
+                schema_name, table_name, col_name, col_b["data_type"], col_a["data_type"], None
+            )
+
+
+
+
# def compare_tables(tab_a: TTableSchema, tab_b: TTableSchema) -> bool:
# try:
# table_name = tab_a["name"]
diff --git a/dlt/common/time.py b/dlt/common/time.py
index 4ce411baa4..74c32e4ea0 100644
--- a/dlt/common/time.py
+++ b/dlt/common/time.py
@@ -164,17 +164,30 @@ def detect_datetime_format(value: str) -> Optional[str]:
): "%Y-%m-%dT%H:%M:%S.%fZ", # UTC with fractional seconds
re.compile(
r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\+\d{2}:\d{2}$"
- ): "%Y-%m-%dT%H:%M:%S%z", # Timezone offset
+ ): "%Y-%m-%dT%H:%M:%S%z", # Positive timezone offset
re.compile(
r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\+\d{4}$"
- ): "%Y-%m-%dT%H:%M:%S%z", # Timezone without colon
- # Full datetime with fractional seconds and timezone
+ ): "%Y-%m-%dT%H:%M:%S%z", # Positive timezone without colon
+ # Full datetime with fractional seconds and positive timezone offset
re.compile(
r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+\+\d{2}:\d{2}$"
): "%Y-%m-%dT%H:%M:%S.%f%z",
re.compile(
r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+\+\d{4}$"
- ): "%Y-%m-%dT%H:%M:%S.%f%z", # Timezone without colon
+ ): "%Y-%m-%dT%H:%M:%S.%f%z", # Positive timezone without colon
+ re.compile(
+ r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}-\d{2}:\d{2}$"
+ ): "%Y-%m-%dT%H:%M:%S%z", # Negative timezone offset
+ re.compile(
+ r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}-\d{4}$"
+ ): "%Y-%m-%dT%H:%M:%S%z", # Negative timezone without colon
+ # Full datetime with fractional seconds and negative timezone offset
+ re.compile(
+ r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+-\d{2}:\d{2}$"
+ ): "%Y-%m-%dT%H:%M:%S.%f%z",
+ re.compile(
+ r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+-\d{4}$"
+ ): "%Y-%m-%dT%H:%M:%S.%f%z", # Negative Timezone without colon
# Datetime without timezone
re.compile(r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}$"): "%Y-%m-%dT%H:%M:%S", # No timezone
re.compile(r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}$"): "%Y-%m-%dT%H:%M", # Minute precision
diff --git a/dlt/common/typing.py b/dlt/common/typing.py
index a3364d1b07..a0322fe01e 100644
--- a/dlt/common/typing.py
+++ b/dlt/common/typing.py
@@ -446,7 +446,7 @@ def get_generic_type_argument_from_instance(
if cls_:
orig_param_type = get_args(cls_)[0]
if orig_param_type in (Any, CallableAny) and sample_value is not None:
- orig_param_type = type(sample_value)
+ orig_param_type = type(sample_value) # type: ignore[assignment]
return orig_param_type # type: ignore
@@ -484,3 +484,18 @@ def decorator(
return func
return decorator
+
+
+def add_value_to_literal(literal: Any, value: Any) -> None:
+ """Extends a Literal at runtime with a new value.
+
+ Args:
+ literal (Type[Any]): Literal to extend
+ value (Any): Value to add
+
+ """
+ type_args = get_args(literal)
+
+ if value not in type_args:
+ type_args += (value,)
+ literal.__args__ = type_args
diff --git a/dlt/destinations/dataset.py b/dlt/destinations/dataset.py
deleted file mode 100644
index 27a7f5a7af..0000000000
--- a/dlt/destinations/dataset.py
+++ /dev/null
@@ -1,412 +0,0 @@
-from typing import Any, Generator, Sequence, Union, TYPE_CHECKING, Tuple
-
-from contextlib import contextmanager
-
-from dlt import version
-from dlt.common.json import json
-from dlt.common.exceptions import MissingDependencyException
-from dlt.common.destination import AnyDestination
-from dlt.common.destination.reference import (
- SupportsReadableRelation,
- SupportsReadableDataset,
- TDatasetType,
- TDestinationReferenceArg,
- Destination,
- JobClientBase,
- WithStateSync,
- DestinationClientDwhConfiguration,
- DestinationClientStagingConfiguration,
- DestinationClientConfiguration,
- DestinationClientDwhWithStagingConfiguration,
-)
-
-from dlt.common.schema.typing import TTableSchemaColumns
-from dlt.destinations.sql_client import SqlClientBase, WithSqlClient
-from dlt.common.schema import Schema
-from dlt.common.exceptions import DltException
-
-if TYPE_CHECKING:
- try:
- from dlt.common.libs.ibis import BaseBackend as IbisBackend
- except MissingDependencyException:
- IbisBackend = Any
-else:
- IbisBackend = Any
-
-
-class DatasetException(DltException):
- pass
-
-
-class ReadableRelationHasQueryException(DatasetException):
- def __init__(self, attempted_change: str) -> None:
- msg = (
- "This readable relation was created with a provided sql query. You cannot change"
- f" {attempted_change}. Please change the orignal sql query."
- )
- super().__init__(msg)
-
-
-class ReadableRelationUnknownColumnException(DatasetException):
- def __init__(self, column_name: str) -> None:
- msg = (
- f"The selected column {column_name} is not known in the dlt schema for this releation."
- )
- super().__init__(msg)
-
-
-class ReadableDBAPIRelation(SupportsReadableRelation):
- def __init__(
- self,
- *,
- readable_dataset: "ReadableDBAPIDataset",
- provided_query: Any = None,
- table_name: str = None,
- limit: int = None,
- selected_columns: Sequence[str] = None,
- ) -> None:
- """Create a lazy evaluated relation to for the dataset of a destination"""
-
- # NOTE: we can keep an assertion here, this class will not be created by the user
- assert bool(table_name) != bool(
- provided_query
- ), "Please provide either an sql query OR a table_name"
-
- self._dataset = readable_dataset
-
- self._provided_query = provided_query
- self._table_name = table_name
- self._limit = limit
- self._selected_columns = selected_columns
-
- # wire protocol functions
- self.df = self._wrap_func("df") # type: ignore
- self.arrow = self._wrap_func("arrow") # type: ignore
- self.fetchall = self._wrap_func("fetchall") # type: ignore
- self.fetchmany = self._wrap_func("fetchmany") # type: ignore
- self.fetchone = self._wrap_func("fetchone") # type: ignore
-
- self.iter_df = self._wrap_iter("iter_df") # type: ignore
- self.iter_arrow = self._wrap_iter("iter_arrow") # type: ignore
- self.iter_fetch = self._wrap_iter("iter_fetch") # type: ignore
-
- @property
- def sql_client(self) -> SqlClientBase[Any]:
- return self._dataset.sql_client
-
- @property
- def schema(self) -> Schema:
- return self._dataset.schema
-
- @property
- def query(self) -> Any:
- """build the query"""
- if self._provided_query:
- return self._provided_query
-
- table_name = self.sql_client.make_qualified_table_name(
- self.schema.naming.normalize_tables_path(self._table_name)
- )
-
- maybe_limit_clause_1 = ""
- maybe_limit_clause_2 = ""
- if self._limit:
- maybe_limit_clause_1, maybe_limit_clause_2 = self.sql_client._limit_clause_sql(
- self._limit
- )
-
- selector = "*"
- if self._selected_columns:
- selector = ",".join(
- [
- self.sql_client.escape_column_name(self.schema.naming.normalize_path(c))
- for c in self._selected_columns
- ]
- )
-
- return f"SELECT {maybe_limit_clause_1} {selector} FROM {table_name} {maybe_limit_clause_2}"
-
- @property
- def columns_schema(self) -> TTableSchemaColumns:
- return self.compute_columns_schema()
-
- @columns_schema.setter
- def columns_schema(self, new_value: TTableSchemaColumns) -> None:
- raise NotImplementedError("columns schema in ReadableDBAPIRelation can only be computed")
-
- def compute_columns_schema(self) -> TTableSchemaColumns:
- """provide schema columns for the cursor, may be filtered by selected columns"""
-
- columns_schema = (
- self.schema.tables.get(self._table_name, {}).get("columns", {}) if self.schema else {}
- )
-
- if not columns_schema:
- return None
- if not self._selected_columns:
- return columns_schema
-
- filtered_columns: TTableSchemaColumns = {}
- for sc in self._selected_columns:
- sc = self.schema.naming.normalize_path(sc)
- if sc not in columns_schema.keys():
- raise ReadableRelationUnknownColumnException(sc)
- filtered_columns[sc] = columns_schema[sc]
-
- return filtered_columns
-
- @contextmanager
- def cursor(self) -> Generator[SupportsReadableRelation, Any, Any]:
- """Gets a DBApiCursor for the current relation"""
- with self.sql_client as client:
- # this hacky code is needed for mssql to disable autocommit, read iterators
- # will not work otherwise. in the future we should be able to create a readony
- # client which will do this automatically
- if hasattr(self.sql_client, "_conn") and hasattr(self.sql_client._conn, "autocommit"):
- self.sql_client._conn.autocommit = False
- with client.execute_query(self.query) as cursor:
- if columns_schema := self.columns_schema:
- cursor.columns_schema = columns_schema
- yield cursor
-
- def _wrap_iter(self, func_name: str) -> Any:
- """wrap SupportsReadableRelation generators in cursor context"""
-
- def _wrap(*args: Any, **kwargs: Any) -> Any:
- with self.cursor() as cursor:
- yield from getattr(cursor, func_name)(*args, **kwargs)
-
- return _wrap
-
- def _wrap_func(self, func_name: str) -> Any:
- """wrap SupportsReadableRelation functions in cursor context"""
-
- def _wrap(*args: Any, **kwargs: Any) -> Any:
- with self.cursor() as cursor:
- return getattr(cursor, func_name)(*args, **kwargs)
-
- return _wrap
-
- def __copy__(self) -> "ReadableDBAPIRelation":
- return self.__class__(
- readable_dataset=self._dataset,
- provided_query=self._provided_query,
- table_name=self._table_name,
- limit=self._limit,
- selected_columns=self._selected_columns,
- )
-
- def limit(self, limit: int) -> "ReadableDBAPIRelation":
- if self._provided_query:
- raise ReadableRelationHasQueryException("limit")
- rel = self.__copy__()
- rel._limit = limit
- return rel
-
- def select(self, *columns: str) -> "ReadableDBAPIRelation":
- if self._provided_query:
- raise ReadableRelationHasQueryException("select")
- rel = self.__copy__()
- rel._selected_columns = columns
- # NOTE: the line below will ensure that no unknown columns are selected if
- # schema is known
- rel.compute_columns_schema()
- return rel
-
- def __getitem__(self, columns: Union[str, Sequence[str]]) -> "SupportsReadableRelation":
- if isinstance(columns, str):
- return self.select(columns)
- elif isinstance(columns, Sequence):
- return self.select(*columns)
- else:
- raise TypeError(f"Invalid argument type: {type(columns).__name__}")
-
- def head(self, limit: int = 5) -> "ReadableDBAPIRelation":
- return self.limit(limit)
-
-
-class ReadableDBAPIDataset(SupportsReadableDataset):
- """Access to dataframes and arrowtables in the destination dataset via dbapi"""
-
- def __init__(
- self,
- destination: TDestinationReferenceArg,
- dataset_name: str,
- schema: Union[Schema, str, None] = None,
- ) -> None:
- self._destination = Destination.from_reference(destination)
- self._provided_schema = schema
- self._dataset_name = dataset_name
- self._sql_client: SqlClientBase[Any] = None
- self._schema: Schema = None
-
- def ibis(self) -> IbisBackend:
- """return a connected ibis backend"""
- from dlt.common.libs.ibis import create_ibis_backend
-
- self._ensure_client_and_schema()
- return create_ibis_backend(
- self._destination,
- self._destination_client(self.schema),
- )
-
- @property
- def schema(self) -> Schema:
- self._ensure_client_and_schema()
- return self._schema
-
- @property
- def sql_client(self) -> SqlClientBase[Any]:
- self._ensure_client_and_schema()
- return self._sql_client
-
- def _destination_client(self, schema: Schema) -> JobClientBase:
- return get_destination_clients(
- schema, destination=self._destination, destination_dataset_name=self._dataset_name
- )[0]
-
- def _ensure_client_and_schema(self) -> None:
- """Lazy load schema and client"""
-
- # full schema given, nothing to do
- if not self._schema and isinstance(self._provided_schema, Schema):
- self._schema = self._provided_schema
-
- # schema name given, resolve it from destination by name
- elif not self._schema and isinstance(self._provided_schema, str):
- with self._destination_client(Schema(self._provided_schema)) as client:
- if isinstance(client, WithStateSync):
- stored_schema = client.get_stored_schema(self._provided_schema)
- if stored_schema:
- self._schema = Schema.from_stored_schema(json.loads(stored_schema.schema))
- else:
- self._schema = Schema(self._provided_schema)
-
- # no schema name given, load newest schema from destination
- elif not self._schema:
- with self._destination_client(Schema(self._dataset_name)) as client:
- if isinstance(client, WithStateSync):
- stored_schema = client.get_stored_schema()
- if stored_schema:
- self._schema = Schema.from_stored_schema(json.loads(stored_schema.schema))
-
- # default to empty schema with dataset name
- if not self._schema:
- self._schema = Schema(self._dataset_name)
-
- # here we create the client bound to the resolved schema
- if not self._sql_client:
- destination_client = self._destination_client(self._schema)
- if isinstance(destination_client, WithSqlClient):
- self._sql_client = destination_client.sql_client
- else:
- raise Exception(
- f"Destination {destination_client.config.destination_type} does not support"
- " SqlClient."
- )
-
- def __call__(self, query: Any) -> ReadableDBAPIRelation:
- return ReadableDBAPIRelation(readable_dataset=self, provided_query=query) # type: ignore[abstract]
-
- def table(self, table_name: str) -> SupportsReadableRelation:
- return ReadableDBAPIRelation(
- readable_dataset=self,
- table_name=table_name,
- ) # type: ignore[abstract]
-
- def __getitem__(self, table_name: str) -> SupportsReadableRelation:
- """access of table via dict notation"""
- return self.table(table_name)
-
- def __getattr__(self, table_name: str) -> SupportsReadableRelation:
- """access of table via property notation"""
- return self.table(table_name)
-
-
-def dataset(
- destination: TDestinationReferenceArg,
- dataset_name: str,
- schema: Union[Schema, str, None] = None,
- dataset_type: TDatasetType = "dbapi",
-) -> SupportsReadableDataset:
- if dataset_type == "dbapi":
- return ReadableDBAPIDataset(destination, dataset_name, schema)
- raise NotImplementedError(f"Dataset of type {dataset_type} not implemented")
-
-
-# helpers
-def get_destination_client_initial_config(
- destination: AnyDestination,
- default_schema_name: str,
- dataset_name: str,
- as_staging: bool = False,
-) -> DestinationClientConfiguration:
- client_spec = destination.spec
-
- # this client supports many schemas and datasets
- if issubclass(client_spec, DestinationClientDwhConfiguration):
- if issubclass(client_spec, DestinationClientStagingConfiguration):
- spec: DestinationClientDwhConfiguration = client_spec(as_staging_destination=as_staging)
- else:
- spec = client_spec()
-
- spec._bind_dataset_name(dataset_name, default_schema_name)
- return spec
-
- return client_spec()
-
-
-def get_destination_clients(
- schema: Schema,
- destination: AnyDestination = None,
- destination_dataset_name: str = None,
- destination_initial_config: DestinationClientConfiguration = None,
- staging: AnyDestination = None,
- staging_dataset_name: str = None,
- staging_initial_config: DestinationClientConfiguration = None,
- # pipeline specific settings
- default_schema_name: str = None,
-) -> Tuple[JobClientBase, JobClientBase]:
- destination = Destination.from_reference(destination) if destination else None
- staging = Destination.from_reference(staging) if staging else None
-
- try:
- # resolve staging config in order to pass it to destination client config
- staging_client = None
- if staging:
- if not staging_initial_config:
- # this is just initial config - without user configuration injected
- staging_initial_config = get_destination_client_initial_config(
- staging,
- dataset_name=staging_dataset_name,
- default_schema_name=default_schema_name,
- as_staging=True,
- )
- # create the client - that will also resolve the config
- staging_client = staging.client(schema, staging_initial_config)
-
- if not destination_initial_config:
- # config is not provided then get it with injected credentials
- initial_config = get_destination_client_initial_config(
- destination,
- dataset_name=destination_dataset_name,
- default_schema_name=default_schema_name,
- )
-
- # attach the staging client config to destination client config - if its type supports it
- if (
- staging_client
- and isinstance(initial_config, DestinationClientDwhWithStagingConfiguration)
- and isinstance(staging_client.config, DestinationClientStagingConfiguration)
- ):
- initial_config.staging_config = staging_client.config
- # create instance with initial_config properly set
- client = destination.client(schema, initial_config)
- return client, staging_client
- except ModuleNotFoundError:
- client_spec = destination.spec()
- raise MissingDependencyException(
- f"{client_spec.destination_type} destination",
- [f"{version.DLT_PKG_NAME}[{client_spec.destination_type}]"],
- "Dependencies for specific destinations are available as extras of dlt",
- )
diff --git a/dlt/destinations/dataset/__init__.py b/dlt/destinations/dataset/__init__.py
new file mode 100644
index 0000000000..e0eef681b8
--- /dev/null
+++ b/dlt/destinations/dataset/__init__.py
@@ -0,0 +1,19 @@
+from dlt.destinations.dataset.factory import (
+ dataset,
+)
+from dlt.destinations.dataset.dataset import (
+ ReadableDBAPIDataset,
+ get_destination_clients,
+)
+from dlt.destinations.dataset.utils import (
+ get_destination_clients,
+ get_destination_client_initial_config,
+)
+
+
+__all__ = [
+ "dataset",
+ "ReadableDBAPIDataset",
+ "get_destination_client_initial_config",
+ "get_destination_clients",
+]
diff --git a/dlt/destinations/dataset/dataset.py b/dlt/destinations/dataset/dataset.py
new file mode 100644
index 0000000000..fc55393a60
--- /dev/null
+++ b/dlt/destinations/dataset/dataset.py
@@ -0,0 +1,168 @@
+from typing import Any, Union, TYPE_CHECKING, List
+
+from dlt.common.json import json
+
+from dlt.common.exceptions import MissingDependencyException
+
+from dlt.common.destination.reference import (
+ SupportsReadableRelation,
+ SupportsReadableDataset,
+ TDestinationReferenceArg,
+ Destination,
+ JobClientBase,
+ WithStateSync,
+)
+
+from dlt.destinations.sql_client import SqlClientBase, WithSqlClient
+from dlt.common.schema import Schema
+from dlt.destinations.dataset.relation import ReadableDBAPIRelation
+from dlt.destinations.dataset.utils import get_destination_clients
+from dlt.common.destination.reference import TDatasetType
+
+if TYPE_CHECKING:
+ try:
+ from dlt.helpers.ibis import BaseBackend as IbisBackend
+ except MissingDependencyException:
+ IbisBackend = Any
+else:
+ IbisBackend = Any
+
+
+class ReadableDBAPIDataset(SupportsReadableDataset):
+ """Access to dataframes and arrowtables in the destination dataset via dbapi"""
+
+ def __init__(
+ self,
+ destination: TDestinationReferenceArg,
+ dataset_name: str,
+ schema: Union[Schema, str, None] = None,
+ dataset_type: TDatasetType = "auto",
+ ) -> None:
+ self._destination = Destination.from_reference(destination)
+ self._provided_schema = schema
+ self._dataset_name = dataset_name
+ self._sql_client: SqlClientBase[Any] = None
+ self._schema: Schema = None
+ self._dataset_type = dataset_type
+
+ def ibis(self) -> IbisBackend:
+ """return a connected ibis backend"""
+ from dlt.helpers.ibis import create_ibis_backend
+
+ self._ensure_client_and_schema()
+ return create_ibis_backend(
+ self._destination,
+ self._destination_client(self.schema),
+ )
+
+ @property
+ def schema(self) -> Schema:
+ self._ensure_client_and_schema()
+ return self._schema
+
+ @property
+ def sql_client(self) -> SqlClientBase[Any]:
+ self._ensure_client_and_schema()
+ return self._sql_client
+
+ def _destination_client(self, schema: Schema) -> JobClientBase:
+ return get_destination_clients(
+ schema, destination=self._destination, destination_dataset_name=self._dataset_name
+ )[0]
+
+ def _ensure_client_and_schema(self) -> None:
+ """Lazy load schema and client"""
+
+ # full schema given, nothing to do
+ if not self._schema and isinstance(self._provided_schema, Schema):
+ self._schema = self._provided_schema
+
+ # schema name given, resolve it from destination by name
+ elif not self._schema and isinstance(self._provided_schema, str):
+ with self._destination_client(Schema(self._provided_schema)) as client:
+ if isinstance(client, WithStateSync):
+ stored_schema = client.get_stored_schema(self._provided_schema)
+ if stored_schema:
+ self._schema = Schema.from_stored_schema(json.loads(stored_schema.schema))
+ else:
+ self._schema = Schema(self._provided_schema)
+
+ # no schema name given, load newest schema from destination
+ elif not self._schema:
+ with self._destination_client(Schema(self._dataset_name)) as client:
+ if isinstance(client, WithStateSync):
+ stored_schema = client.get_stored_schema()
+ if stored_schema:
+ self._schema = Schema.from_stored_schema(json.loads(stored_schema.schema))
+
+ # default to empty schema with dataset name
+ if not self._schema:
+ self._schema = Schema(self._dataset_name)
+
+ # here we create the client bound to the resolved schema
+ if not self._sql_client:
+ destination_client = self._destination_client(self._schema)
+ if isinstance(destination_client, WithSqlClient):
+ self._sql_client = destination_client.sql_client
+ else:
+ raise Exception(
+ f"Destination {destination_client.config.destination_type} does not support"
+ " SqlClient."
+ )
+
+ def __call__(self, query: Any) -> ReadableDBAPIRelation:
+ return ReadableDBAPIRelation(readable_dataset=self, provided_query=query) # type: ignore[abstract]
+
+ def table(self, table_name: str) -> SupportsReadableRelation:
+ # we can create an ibis powered relation if ibis is available
+ if table_name in self.schema.tables and self._dataset_type in ("auto", "ibis"):
+ try:
+ from dlt.helpers.ibis import create_unbound_ibis_table
+ from dlt.destinations.dataset.ibis_relation import ReadableIbisRelation
+
+ unbound_table = create_unbound_ibis_table(self.sql_client, self.schema, table_name)
+ return ReadableIbisRelation(readable_dataset=self, ibis_object=unbound_table, columns_schema=self.schema.tables[table_name]["columns"]) # type: ignore[abstract]
+ except MissingDependencyException:
+ # if ibis is explicitly requested, reraise
+ if self._dataset_type == "ibis":
+ raise
+
+ # fallback to the standard dbapi relation
+ return ReadableDBAPIRelation(
+ readable_dataset=self,
+ table_name=table_name,
+ ) # type: ignore[abstract]
+
+    def row_counts(
+        self, *, data_tables: bool = True, dlt_tables: bool = False, table_names: List[str] = None
+    ) -> SupportsReadableRelation:
+        """Builds a relation with the row counts of the selected tables.
+
+        The relation wraps a UNION ALL query that yields one (table_name, row_count) row per
+        selected table. By default the data tables of the schema are counted.
+
+        Args:
+            data_tables: include the data tables of the schema (`seen_data_only=True`)
+            dlt_tables: include the dlt tables of the schema
+            table_names: explicit list of tables to count. If provided, only those tables are
+                counted regardless of the `data_tables` and `dlt_tables` flags
+
+        Returns:
+            A relation created from the row count query
+
+        Raises:
+            ValueError: if no table is selected, there is no query to build in that case
+        """
+
+        selected_tables = table_names or []
+        if not selected_tables:
+            if data_tables:
+                selected_tables += self.schema.data_table_names(seen_data_only=True)
+            if dlt_tables:
+                selected_tables += self.schema.dlt_table_names()
+
+        # NOTE: an empty selection would produce an empty query which is rejected only later,
+        # by the assertion in the relation constructor, with a message unrelated to row counts
+        if not selected_tables:
+            raise ValueError("No tables selected for row counts.")
+
+        # Build UNION ALL query to get row counts for all selected tables
+        queries = [
+            f"SELECT '{table}' as table_name, COUNT(*) as row_count FROM"
+            f" {self.sql_client.make_qualified_table_name(table)}"
+            for table in selected_tables
+        ]
+        query = " UNION ALL ".join(queries)
+
+        # wrap the query in a relation, see `__call__`
+        return self(query)
+
+ def __getitem__(self, table_name: str) -> SupportsReadableRelation:
+ """access of table via dict notation"""
+ return self.table(table_name)
+
+ def __getattr__(self, table_name: str) -> SupportsReadableRelation:
+ """access of table via property notation"""
+ return self.table(table_name)
diff --git a/dlt/destinations/dataset/exceptions.py b/dlt/destinations/dataset/exceptions.py
new file mode 100644
index 0000000000..17e8f6b563
--- /dev/null
+++ b/dlt/destinations/dataset/exceptions.py
@@ -0,0 +1,22 @@
+from dlt.common.exceptions import DltException
+
+
+class DatasetException(DltException):
+ pass
+
+
+class ReadableRelationHasQueryException(DatasetException):
+    """Raised when a relation created from a provided sql query is asked to modify that query"""
+
+    def __init__(self, attempted_change: str) -> None:
+        # `attempted_change` names the rejected modification, it is inserted into the message
+        msg = (
+            "This readable relation was created with a provided sql query. You cannot change"
+            f" {attempted_change}. Please change the original sql query."
+        )
+        super().__init__(msg)
+
+
+class ReadableRelationUnknownColumnException(DatasetException):
+    """Raised when a selected column is not present in the dlt schema of the relation"""
+
+    def __init__(self, column_name: str) -> None:
+        # `column_name` is the unknown column, it is inserted into the message
+        msg = (
+            f"The selected column {column_name} is not known in the dlt schema for this relation."
+        )
+        super().__init__(msg)
diff --git a/dlt/destinations/dataset/factory.py b/dlt/destinations/dataset/factory.py
new file mode 100644
index 0000000000..8ea0ddf7a1
--- /dev/null
+++ b/dlt/destinations/dataset/factory.py
@@ -0,0 +1,22 @@
+from typing import Union
+
+
+from dlt.common.destination import AnyDestination
+from dlt.common.destination.reference import (
+ SupportsReadableDataset,
+ TDatasetType,
+ TDestinationReferenceArg,
+)
+
+from dlt.common.schema import Schema
+
+from dlt.destinations.dataset.dataset import ReadableDBAPIDataset
+
+
+def dataset(
+ destination: TDestinationReferenceArg,
+ dataset_name: str,
+ schema: Union[Schema, str, None] = None,
+ dataset_type: TDatasetType = "auto",
+) -> SupportsReadableDataset:
+ return ReadableDBAPIDataset(destination, dataset_name, schema, dataset_type)
diff --git a/dlt/destinations/dataset/ibis_relation.py b/dlt/destinations/dataset/ibis_relation.py
new file mode 100644
index 0000000000..632298ad56
--- /dev/null
+++ b/dlt/destinations/dataset/ibis_relation.py
@@ -0,0 +1,224 @@
+from typing import TYPE_CHECKING, Any, Union, Sequence
+
+from functools import partial
+
+from dlt.common.exceptions import MissingDependencyException
+from dlt.destinations.dataset.relation import BaseReadableDBAPIRelation
+from dlt.common.schema.typing import TTableSchemaColumns
+
+
+if TYPE_CHECKING:
+ from dlt.destinations.dataset.dataset import ReadableDBAPIDataset
+else:
+ ReadableDBAPIDataset = Any
+
+try:
+ from dlt.helpers.ibis import Expr
+except MissingDependencyException:
+ Expr = Any
+
+# map dlt destination to sqlglot dialect
+DIALECT_MAP = {
+ "dlt.destinations.duckdb": "duckdb", # works
+ "dlt.destinations.motherduck": "duckdb", # works
+ "dlt.destinations.clickhouse": "clickhouse", # works
+ "dlt.destinations.databricks": "databricks", # works
+ "dlt.destinations.bigquery": "bigquery", # works
+ "dlt.destinations.postgres": "postgres", # works
+ "dlt.destinations.redshift": "redshift", # works
+ "dlt.destinations.snowflake": "snowflake", # works
+ "dlt.destinations.mssql": "tsql", # works
+ "dlt.destinations.synapse": "tsql", # works
+ "dlt.destinations.athena": "trino", # works
+ "dlt.destinations.filesystem": "duckdb", # works
+ "dlt.destinations.dremio": "presto", # works
+ # NOTE: can we discover the current dialect in sqlalchemy?
+ "dlt.destinations.sqlalchemy": "mysql", # may work
+}
+
+# NOTE: some dialects are not supported by ibis, but by sqlglot, these need to
+# be transpiled with a intermediary step
+TRANSPILE_VIA_MAP = {
+ "tsql": "postgres",
+ "databricks": "postgres",
+ "clickhouse": "postgres",
+ "redshift": "postgres",
+ "presto": "postgres",
+}
+
+
+class ReadableIbisRelation(BaseReadableDBAPIRelation):
+ def __init__(
+ self,
+ *,
+ readable_dataset: ReadableDBAPIDataset,
+ ibis_object: Any = None,
+ columns_schema: TTableSchemaColumns = None,
+ ) -> None:
+ """Create a lazy evaluated relation to for the dataset of a destination"""
+ super().__init__(readable_dataset=readable_dataset)
+ self._ibis_object = ibis_object
+ self._columns_schema = columns_schema
+
+ @property
+ def query(self) -> Any:
+ """build the query"""
+
+ from dlt.helpers.ibis import ibis, sqlglot
+
+ destination_type = self._dataset._destination.destination_type
+ target_dialect = DIALECT_MAP[destination_type]
+
+ # render sql directly if possible
+ if target_dialect not in TRANSPILE_VIA_MAP:
+ return ibis.to_sql(self._ibis_object, dialect=target_dialect)
+
+ # here we need to transpile first
+ transpile_via = TRANSPILE_VIA_MAP[target_dialect]
+ sql = ibis.to_sql(self._ibis_object, dialect=transpile_via)
+ sql = sqlglot.transpile(sql, read=transpile_via, write=target_dialect)[0]
+ return sql
+
+ @property
+ def columns_schema(self) -> TTableSchemaColumns:
+ return self.compute_columns_schema()
+
+ @columns_schema.setter
+ def columns_schema(self, new_value: TTableSchemaColumns) -> None:
+ raise NotImplementedError("columns schema in ReadableDBAPIRelation can only be computed")
+
+ def compute_columns_schema(self) -> TTableSchemaColumns:
+ """provide schema columns for the cursor, may be filtered by selected columns"""
+ # TODO: provide column lineage tracing with sqlglot lineage
+ return self._columns_schema
+
+ def _proxy_expression_method(self, method_name: str, *args: Any, **kwargs: Any) -> Any:
+ """Proxy method calls to the underlying ibis expression, allowing to wrap the resulting expression in a new relation"""
+
+ # Get the method from the expression
+ method = getattr(self._ibis_object, method_name)
+
+ # unwrap args and kwargs if they are relations
+ args = tuple(
+ arg._ibis_object if isinstance(arg, ReadableIbisRelation) else arg for arg in args
+ )
+ kwargs = {
+ k: v._ibis_object if isinstance(v, ReadableIbisRelation) else v
+ for k, v in kwargs.items()
+ }
+
+ # casefold string params, we assume these are column names
+ args = tuple(
+ self.sql_client.capabilities.casefold_identifier(arg) if isinstance(arg, str) else arg
+ for arg in args
+ )
+ kwargs = {
+ k: self.sql_client.capabilities.casefold_identifier(v) if isinstance(v, str) else v
+ for k, v in kwargs.items()
+ }
+
+ # Call it with provided args
+ result = method(*args, **kwargs)
+
+ # calculate columns schema for the result, some operations we know will not change the schema
+ # and select will just reduce the amount of column
+ columns_schema = None
+ if method_name == "select":
+ columns_schema = self._get_filtered_columns_schema(args)
+ elif method_name in ["filter", "limit", "order_by", "head"]:
+ columns_schema = self._columns_schema
+
+ # If result is an ibis expression, wrap it in a new relation else return raw result
+ return self.__class__(
+ readable_dataset=self._dataset, ibis_object=result, columns_schema=columns_schema
+ )
+
+ def __getattr__(self, name: str) -> Any:
+ """Wrap all callable attributes of the expression"""
+
+ attr = getattr(self._ibis_object, name, None)
+
+ # try casefolded name for ibis columns access
+ if attr is None:
+ name = self.sql_client.capabilities.casefold_identifier(name)
+ attr = getattr(self._ibis_object, name, None)
+
+ if attr is None:
+ raise AttributeError(f"'{self.__class__.__name__}' object has no attribute '{name}'")
+
+ if not callable(attr):
+ # NOTE: we don't need to forward columns schema for non-callable attributes, these are usually columns
+ return self.__class__(readable_dataset=self._dataset, ibis_object=attr)
+
+ return partial(self._proxy_expression_method, name)
+
+ def __getitem__(self, columns: Union[str, Sequence[str]]) -> "ReadableIbisRelation":
+ # casefold column-names
+ columns = [columns] if isinstance(columns, str) else columns
+ columns = [self.sql_client.capabilities.casefold_identifier(col) for col in columns]
+ expr = self._ibis_object[columns]
+ return self.__class__(
+ readable_dataset=self._dataset,
+ ibis_object=expr,
+ columns_schema=self._get_filtered_columns_schema(columns),
+ )
+
+ def _get_filtered_columns_schema(self, columns: Sequence[str]) -> TTableSchemaColumns:
+ if not self._columns_schema:
+ return None
+ try:
+ return {col: self._columns_schema[col] for col in columns}
+ except KeyError:
+ # NOTE: select statements can contain new columns not present in the original schema
+ # here we just break the column schema inheritance chain
+ return None
+
+ # forward ibis methods defined on interface
+ def limit(self, limit: int, **kwargs: Any) -> "ReadableIbisRelation":
+ """limit the result to 'limit' items"""
+ return self._proxy_expression_method("limit", limit, **kwargs) # type: ignore
+
+ def head(self, limit: int = 5) -> "ReadableIbisRelation":
+ """limit the result to 5 items by default"""
+ return self._proxy_expression_method("head", limit) # type: ignore
+
+ def select(self, *columns: str) -> "ReadableIbisRelation":
+ """set which columns will be selected"""
+ return self._proxy_expression_method("select", *columns) # type: ignore
+
+ # forward ibis comparison and math operators
+ def __lt__(self, other: Any) -> "ReadableIbisRelation":
+ return self._proxy_expression_method("__lt__", other) # type: ignore
+
+ def __gt__(self, other: Any) -> "ReadableIbisRelation":
+ return self._proxy_expression_method("__gt__", other) # type: ignore
+
+ def __ge__(self, other: Any) -> "ReadableIbisRelation":
+ return self._proxy_expression_method("__ge__", other) # type: ignore
+
+ def __le__(self, other: Any) -> "ReadableIbisRelation":
+ return self._proxy_expression_method("__le__", other) # type: ignore
+
+ def __eq__(self, other: Any) -> bool:
+ return self._proxy_expression_method("__eq__", other) # type: ignore
+
+ def __ne__(self, other: Any) -> bool:
+ return self._proxy_expression_method("__ne__", other) # type: ignore
+
+ def __and__(self, other: Any) -> "ReadableIbisRelation":
+ return self._proxy_expression_method("__and__", other) # type: ignore
+
+ def __or__(self, other: Any) -> "ReadableIbisRelation":
+ return self._proxy_expression_method("__or__", other) # type: ignore
+
+ def __mul__(self, other: Any) -> "ReadableIbisRelation":
+ return self._proxy_expression_method("__mul__", other) # type: ignore
+
+    def __truediv__(self, other: Any) -> "ReadableIbisRelation":
+        """Forwards the `/` operator to the wrapped ibis object"""
+        return self._proxy_expression_method("__truediv__", other)  # type: ignore
+
+    # NOTE: Python 3 dispatches `/` to `__truediv__`, `__div__` is the Python 2 name and is never
+    # invoked by the operator. The old name stays available as an alias for explicit callers.
+    __div__ = __truediv__
+
+ def __add__(self, other: Any) -> "ReadableIbisRelation":
+ return self._proxy_expression_method("__add__", other) # type: ignore
+
+ def __sub__(self, other: Any) -> "ReadableIbisRelation":
+ return self._proxy_expression_method("__sub__", other) # type: ignore
diff --git a/dlt/destinations/dataset/relation.py b/dlt/destinations/dataset/relation.py
new file mode 100644
index 0000000000..2cdb7640df
--- /dev/null
+++ b/dlt/destinations/dataset/relation.py
@@ -0,0 +1,207 @@
+from typing import Any, Generator, Sequence, Union, TYPE_CHECKING
+
+from contextlib import contextmanager
+
+
+from dlt.common.destination.reference import (
+ SupportsReadableRelation,
+)
+
+from dlt.destinations.dataset.exceptions import (
+ ReadableRelationHasQueryException,
+ ReadableRelationUnknownColumnException,
+)
+
+from dlt.common.schema.typing import TTableSchemaColumns
+from dlt.destinations.sql_client import SqlClientBase
+from dlt.common.schema import Schema
+
+if TYPE_CHECKING:
+ from dlt.destinations.dataset.dataset import ReadableDBAPIDataset
+else:
+ ReadableDBAPIDataset = Any
+
+
+class BaseReadableDBAPIRelation(SupportsReadableRelation):
+ def __init__(
+ self,
+ *,
+ readable_dataset: "ReadableDBAPIDataset",
+ ) -> None:
+ """Create a lazy evaluated relation to for the dataset of a destination"""
+
+ self._dataset = readable_dataset
+
+ # wire protocol functions
+ self.df = self._wrap_func("df") # type: ignore
+ self.arrow = self._wrap_func("arrow") # type: ignore
+ self.fetchall = self._wrap_func("fetchall") # type: ignore
+ self.fetchmany = self._wrap_func("fetchmany") # type: ignore
+ self.fetchone = self._wrap_func("fetchone") # type: ignore
+
+ self.iter_df = self._wrap_iter("iter_df") # type: ignore
+ self.iter_arrow = self._wrap_iter("iter_arrow") # type: ignore
+ self.iter_fetch = self._wrap_iter("iter_fetch") # type: ignore
+
+ @property
+ def sql_client(self) -> SqlClientBase[Any]:
+ return self._dataset.sql_client
+
+ @property
+ def schema(self) -> Schema:
+ return self._dataset.schema
+
+ @property
+ def query(self) -> Any:
+ raise NotImplementedError("No query in ReadableDBAPIRelation")
+
+ @contextmanager
+ def cursor(self) -> Generator[SupportsReadableRelation, Any, Any]:
+ """Gets a DBApiCursor for the current relation"""
+ with self.sql_client as client:
+ # this hacky code is needed for mssql to disable autocommit, read iterators
+ # will not work otherwise. in the future we should be able to create a readony
+ # client which will do this automatically
+ if hasattr(self.sql_client, "_conn") and hasattr(self.sql_client._conn, "autocommit"):
+ self.sql_client._conn.autocommit = False
+ with client.execute_query(self.query) as cursor:
+ if columns_schema := self.columns_schema:
+ cursor.columns_schema = columns_schema
+ yield cursor
+
+ def _wrap_iter(self, func_name: str) -> Any:
+ """wrap SupportsReadableRelation generators in cursor context"""
+
+ def _wrap(*args: Any, **kwargs: Any) -> Any:
+ with self.cursor() as cursor:
+ yield from getattr(cursor, func_name)(*args, **kwargs)
+
+ return _wrap
+
+ def _wrap_func(self, func_name: str) -> Any:
+ """wrap SupportsReadableRelation functions in cursor context"""
+
+ def _wrap(*args: Any, **kwargs: Any) -> Any:
+ with self.cursor() as cursor:
+ return getattr(cursor, func_name)(*args, **kwargs)
+
+ return _wrap
+
+
+class ReadableDBAPIRelation(BaseReadableDBAPIRelation):
+ def __init__(
+ self,
+ *,
+ readable_dataset: "ReadableDBAPIDataset",
+ provided_query: Any = None,
+ table_name: str = None,
+ limit: int = None,
+ selected_columns: Sequence[str] = None,
+ ) -> None:
+ """Create a lazy evaluated relation to for the dataset of a destination"""
+
+ # NOTE: we can keep an assertion here, this class will not be created by the user
+ assert bool(table_name) != bool(
+ provided_query
+ ), "Please provide either an sql query OR a table_name"
+
+ super().__init__(readable_dataset=readable_dataset)
+
+ self._provided_query = provided_query
+ self._table_name = table_name
+ self._limit = limit
+ self._selected_columns = selected_columns
+
+ @property
+ def query(self) -> Any:
+ """build the query"""
+ if self._provided_query:
+ return self._provided_query
+
+ table_name = self.sql_client.make_qualified_table_name(
+ self.schema.naming.normalize_path(self._table_name)
+ )
+
+ maybe_limit_clause_1 = ""
+ maybe_limit_clause_2 = ""
+ if self._limit:
+ maybe_limit_clause_1, maybe_limit_clause_2 = self.sql_client._limit_clause_sql(
+ self._limit
+ )
+
+ selector = "*"
+ if self._selected_columns:
+ selector = ",".join(
+ [
+ self.sql_client.escape_column_name(self.schema.naming.normalize_tables_path(c))
+ for c in self._selected_columns
+ ]
+ )
+
+ return f"SELECT {maybe_limit_clause_1} {selector} FROM {table_name} {maybe_limit_clause_2}"
+
+ @property
+ def columns_schema(self) -> TTableSchemaColumns:
+ return self.compute_columns_schema()
+
+ @columns_schema.setter
+ def columns_schema(self, new_value: TTableSchemaColumns) -> None:
+ raise NotImplementedError("columns schema in ReadableDBAPIRelation can only be computed")
+
+ def compute_columns_schema(self) -> TTableSchemaColumns:
+ """provide schema columns for the cursor, may be filtered by selected columns"""
+
+ columns_schema = (
+ self.schema.tables.get(self._table_name, {}).get("columns", {}) if self.schema else {}
+ )
+
+ if not columns_schema:
+ return None
+ if not self._selected_columns:
+ return columns_schema
+
+ filtered_columns: TTableSchemaColumns = {}
+ for sc in self._selected_columns:
+ sc = self.schema.naming.normalize_path(sc)
+ if sc not in columns_schema.keys():
+ raise ReadableRelationUnknownColumnException(sc)
+ filtered_columns[sc] = columns_schema[sc]
+
+ return filtered_columns
+
+ def __copy__(self) -> "ReadableDBAPIRelation":
+ return self.__class__(
+ readable_dataset=self._dataset,
+ provided_query=self._provided_query,
+ table_name=self._table_name,
+ limit=self._limit,
+ selected_columns=self._selected_columns,
+ )
+
+ def limit(self, limit: int, **kwargs: Any) -> "ReadableDBAPIRelation":
+ if self._provided_query:
+ raise ReadableRelationHasQueryException("limit")
+ rel = self.__copy__()
+ rel._limit = limit
+ return rel
+
+ def select(self, *columns: str) -> "ReadableDBAPIRelation":
+ if self._provided_query:
+ raise ReadableRelationHasQueryException("select")
+ rel = self.__copy__()
+ rel._selected_columns = columns
+ # NOTE: the line below will ensure that no unknown columns are selected if
+ # schema is known
+ rel.compute_columns_schema()
+ return rel
+
+ def __getitem__(self, columns: Union[str, Sequence[str]]) -> "SupportsReadableRelation":
+ if isinstance(columns, str):
+ return self.select(columns)
+ elif isinstance(columns, Sequence):
+ return self.select(*columns)
+ else:
+ raise TypeError(f"Invalid argument type: {type(columns).__name__}")
+
+ def head(self, limit: int = 5) -> "ReadableDBAPIRelation":
+ return self.limit(limit)
diff --git a/dlt/destinations/dataset/utils.py b/dlt/destinations/dataset/utils.py
new file mode 100644
index 0000000000..766fbc13ea
--- /dev/null
+++ b/dlt/destinations/dataset/utils.py
@@ -0,0 +1,95 @@
+from typing import Tuple
+
+from dlt import version
+
+from dlt.common.exceptions import MissingDependencyException
+
+from dlt.common.destination import AnyDestination
+from dlt.common.destination.reference import (
+ Destination,
+ JobClientBase,
+ DestinationClientDwhConfiguration,
+ DestinationClientStagingConfiguration,
+ DestinationClientConfiguration,
+ DestinationClientDwhWithStagingConfiguration,
+)
+
+from dlt.common.schema import Schema
+
+
+# helpers
+def get_destination_client_initial_config(
+ destination: AnyDestination,
+ default_schema_name: str,
+ dataset_name: str,
+ as_staging: bool = False,
+) -> DestinationClientConfiguration:
+ client_spec = destination.spec
+
+ # this client supports many schemas and datasets
+ if issubclass(client_spec, DestinationClientDwhConfiguration):
+ if issubclass(client_spec, DestinationClientStagingConfiguration):
+ spec: DestinationClientDwhConfiguration = client_spec(as_staging_destination=as_staging)
+ else:
+ spec = client_spec()
+
+ spec._bind_dataset_name(dataset_name, default_schema_name)
+ return spec
+
+ return client_spec()
+
+
+def get_destination_clients(
+    schema: Schema,
+    destination: AnyDestination = None,
+    destination_dataset_name: str = None,
+    destination_initial_config: DestinationClientConfiguration = None,
+    staging: AnyDestination = None,
+    staging_dataset_name: str = None,
+    staging_initial_config: DestinationClientConfiguration = None,
+    # pipeline specific settings
+    default_schema_name: str = None,
+) -> Tuple[JobClientBase, JobClientBase]:
+    """Creates the job client of `destination` and, if `staging` is set, of the staging destination.
+
+    Args:
+        schema: schema passed to both clients
+        destination: destination reference, resolved with `Destination.from_reference`
+        destination_dataset_name: dataset name used to build the initial destination config
+            when `destination_initial_config` is not provided
+        destination_initial_config: initial config of the destination client. If provided it
+            is used instead of building one
+        staging: optional staging destination reference, resolved like `destination`
+        staging_dataset_name: dataset name used to build the initial staging config when
+            `staging_initial_config` is not provided
+        staging_initial_config: initial config of the staging client. If provided it is used
+            instead of building one
+        default_schema_name: default schema name passed on when building initial configs
+
+    Returns:
+        Tuple of (destination client, staging client). The staging client is None when
+        `staging` is not set
+
+    Raises:
+        MissingDependencyException: if ModuleNotFoundError is raised while creating the clients
+    """
+    destination = Destination.from_reference(destination) if destination else None
+    staging = Destination.from_reference(staging) if staging else None
+
+    try:
+        # resolve staging config in order to pass it to destination client config
+        staging_client = None
+        if staging:
+            if not staging_initial_config:
+                # this is just initial config - without user configuration injected
+                staging_initial_config = get_destination_client_initial_config(
+                    staging,
+                    dataset_name=staging_dataset_name,
+                    default_schema_name=default_schema_name,
+                    as_staging=True,
+                )
+            # create the client - that will also resolve the config
+            staging_client = staging.client(schema, staging_initial_config)
+
+        # NOTE: a config passed by the caller must be bound to `initial_config` as well,
+        # otherwise the name is undefined below and creating the client fails
+        initial_config = destination_initial_config
+        if not initial_config:
+            # config is not provided then get it with injected credentials
+            initial_config = get_destination_client_initial_config(
+                destination,
+                dataset_name=destination_dataset_name,
+                default_schema_name=default_schema_name,
+            )
+
+        # attach the staging client config to destination client config - if its type supports it
+        if (
+            staging_client
+            and isinstance(initial_config, DestinationClientDwhWithStagingConfiguration)
+            and isinstance(staging_client.config, DestinationClientStagingConfiguration)
+        ):
+            initial_config.staging_config = staging_client.config
+        # create instance with initial_config properly set
+        client = destination.client(schema, initial_config)
+        return client, staging_client
+    except ModuleNotFoundError:
+        # a missing module is reported as a missing extra of the destination
+        client_spec = destination.spec()
+        raise MissingDependencyException(
+            f"{client_spec.destination_type} destination",
+            [f"{version.DLT_PKG_NAME}[{client_spec.destination_type}]"],
+            "Dependencies for specific destinations are available as extras of dlt",
+        )
diff --git a/dlt/destinations/impl/bigquery/bigquery.py b/dlt/destinations/impl/bigquery/bigquery.py
index 2b3927e7c9..10a344f768 100644
--- a/dlt/destinations/impl/bigquery/bigquery.py
+++ b/dlt/destinations/impl/bigquery/bigquery.py
@@ -401,10 +401,7 @@ def _get_info_schema_columns_query(
return query, folded_table_names
def _get_column_def_sql(self, column: TColumnSchema, table: PreparedTableSchema = None) -> str:
- name = self.sql_client.escape_column_name(column["name"])
- column_def_sql = (
- f"{name} {self.type_mapper.to_destination_type(column, table)} {self._gen_not_null(column.get('nullable', True))}"
- )
+ column_def_sql = super()._get_column_def_sql(column, table)
if column.get(ROUND_HALF_EVEN_HINT, False):
column_def_sql += " OPTIONS (rounding_mode='ROUND_HALF_EVEN')"
if column.get(ROUND_HALF_AWAY_FROM_ZERO_HINT, False):
diff --git a/dlt/destinations/impl/clickhouse/clickhouse.py b/dlt/destinations/impl/clickhouse/clickhouse.py
index 3a5f5c3e28..a407e56361 100644
--- a/dlt/destinations/impl/clickhouse/clickhouse.py
+++ b/dlt/destinations/impl/clickhouse/clickhouse.py
@@ -292,11 +292,10 @@ def _get_table_update_sql(
return sql
- @staticmethod
- def _gen_not_null(v: bool) -> str:
+ def _gen_not_null(self, v: bool) -> str:
# ClickHouse fields are not nullable by default.
# We use the `Nullable` modifier instead of NULL / NOT NULL modifiers to cater for ALTER statement.
- pass
+ return ""
def _from_db_type(
self, ch_t: str, precision: Optional[int], scale: Optional[int]
diff --git a/dlt/destinations/impl/databricks/configuration.py b/dlt/destinations/impl/databricks/configuration.py
index c95b6eba4c..21338bd310 100644
--- a/dlt/destinations/impl/databricks/configuration.py
+++ b/dlt/destinations/impl/databricks/configuration.py
@@ -4,6 +4,7 @@
from dlt.common.typing import TSecretStrValue
from dlt.common.configuration.specs.base_configuration import CredentialsConfiguration, configspec
from dlt.common.destination.reference import DestinationClientDwhWithStagingConfiguration
+from dlt.common.configuration.exceptions import ConfigurationValueError
DATABRICKS_APPLICATION_ID = "dltHub_dlt"
@@ -15,6 +16,8 @@ class DatabricksCredentials(CredentialsConfiguration):
server_hostname: str = None
http_path: str = None
access_token: Optional[TSecretStrValue] = None
+ client_id: Optional[TSecretStrValue] = None
+ client_secret: Optional[TSecretStrValue] = None
http_headers: Optional[Dict[str, str]] = None
session_configuration: Optional[Dict[str, Any]] = None
"""Dict of session parameters that will be passed to `databricks.sql.connect`"""
@@ -27,9 +30,18 @@ class DatabricksCredentials(CredentialsConfiguration):
"server_hostname",
"http_path",
"catalog",
+ "client_id",
+ "client_secret",
"access_token",
]
+ def on_resolved(self) -> None:
+ if not ((self.client_id and self.client_secret) or self.access_token):
+ raise ConfigurationValueError(
+ "No valid authentication method detected. Provide either 'client_id' and"
+ " 'client_secret' for OAuth, or 'access_token' for token-based authentication."
+ )
+
def to_connector_params(self) -> Dict[str, Any]:
conn_params = dict(
catalog=self.catalog,
diff --git a/dlt/destinations/impl/databricks/databricks.py b/dlt/destinations/impl/databricks/databricks.py
index 2bb68a607e..a83db6ec34 100644
--- a/dlt/destinations/impl/databricks/databricks.py
+++ b/dlt/destinations/impl/databricks/databricks.py
@@ -264,12 +264,6 @@ def _from_db_type(
) -> TColumnType:
return self.type_mapper.from_destination_type(bq_t, precision, scale)
- def _get_column_def_sql(self, c: TColumnSchema, table: PreparedTableSchema = None) -> str:
- name = self.sql_client.escape_column_name(c["name"])
- return (
- f"{name} {self.type_mapper.to_destination_type(c,table)} {self._gen_not_null(c.get('nullable', True))}"
- )
-
def _get_storage_table_query_columns(self) -> List[str]:
fields = super()._get_storage_table_query_columns()
fields[2] = ( # Override because this is the only way to get data type with precision
diff --git a/dlt/destinations/impl/databricks/sql_client.py b/dlt/destinations/impl/databricks/sql_client.py
index 8bff4e0d73..16e1e73d93 100644
--- a/dlt/destinations/impl/databricks/sql_client.py
+++ b/dlt/destinations/impl/databricks/sql_client.py
@@ -11,10 +11,12 @@
Tuple,
Union,
Dict,
+ cast,
+ Callable,
)
-
-from databricks import sql as databricks_lib
+from databricks.sdk.core import Config, oauth_service_principal
+from databricks import sql as databricks_lib # type: ignore[attr-defined]
from databricks.sql.client import (
Connection as DatabricksSqlConnection,
Cursor as DatabricksSqlCursor,
@@ -73,8 +75,22 @@ def __init__(
self._conn: DatabricksSqlConnection = None
self.credentials = credentials
+ def _get_oauth_credentials(self) -> Optional[Callable[[], Dict[str, str]]]:
+ config = Config(
+ host=f"https://{self.credentials.server_hostname}",
+ client_id=self.credentials.client_id,
+ client_secret=self.credentials.client_secret,
+ )
+ return cast(Callable[[], Dict[str, str]], oauth_service_principal(config))
+
def open_connection(self) -> DatabricksSqlConnection:
conn_params = self.credentials.to_connector_params()
+
+ if self.credentials.client_id and self.credentials.client_secret:
+ conn_params["credentials_provider"] = self._get_oauth_credentials
+ else:
+ conn_params["access_token"] = self.credentials.access_token
+
self._conn = databricks_lib.connect(
**conn_params, schema=self.dataset_name, use_inline_params="silent"
)
diff --git a/dlt/destinations/impl/dremio/dremio.py b/dlt/destinations/impl/dremio/dremio.py
index ab23f58ab4..e3a090c824 100644
--- a/dlt/destinations/impl/dremio/dremio.py
+++ b/dlt/destinations/impl/dremio/dremio.py
@@ -151,12 +151,6 @@ def _from_db_type(
) -> TColumnType:
return self.type_mapper.from_destination_type(bq_t, precision, scale)
- def _get_column_def_sql(self, c: TColumnSchema, table: PreparedTableSchema = None) -> str:
- name = self.sql_client.escape_column_name(c["name"])
- return (
- f"{name} {self.type_mapper.to_destination_type(c,table)} {self._gen_not_null(c.get('nullable', True))}"
- )
-
def _create_merge_followup_jobs(
self, table_chain: Sequence[PreparedTableSchema]
) -> List[FollowupJobRequest]:
diff --git a/dlt/destinations/impl/duckdb/duck.py b/dlt/destinations/impl/duckdb/duck.py
index 3bd4c83e1f..2b3370270b 100644
--- a/dlt/destinations/impl/duckdb/duck.py
+++ b/dlt/destinations/impl/duckdb/duck.py
@@ -74,17 +74,6 @@ def create_load_job(
job = DuckDbCopyJob(file_path)
return job
- def _get_column_def_sql(self, c: TColumnSchema, table: PreparedTableSchema = None) -> str:
- hints_str = " ".join(
- self.active_hints.get(h, "")
- for h in self.active_hints.keys()
- if c.get(h, False) is True
- )
- column_name = self.sql_client.escape_column_name(c["name"])
- return (
- f"{column_name} {self.type_mapper.to_destination_type(c,table)} {hints_str} {self._gen_not_null(c.get('nullable', True))}"
- )
-
def _from_db_type(
self, pq_t: str, precision: Optional[int], scale: Optional[int]
) -> TColumnType:
diff --git a/dlt/destinations/impl/filesystem/factory.py b/dlt/destinations/impl/filesystem/factory.py
index 2463da58fa..906bd157e4 100644
--- a/dlt/destinations/impl/filesystem/factory.py
+++ b/dlt/destinations/impl/filesystem/factory.py
@@ -19,7 +19,7 @@ def filesystem_loader_file_format_selector(
*,
table_schema: TTableSchema,
) -> t.Tuple[TLoaderFileFormat, t.Sequence[TLoaderFileFormat]]:
- if table_schema.get("table_format") == "delta":
+ if table_schema.get("table_format") in ("delta", "iceberg"):
return ("parquet", ["parquet"])
return (preferred_loader_file_format, supported_loader_file_formats)
@@ -43,7 +43,7 @@ def _raw_capabilities(self) -> DestinationCapabilitiesContext:
caps = DestinationCapabilitiesContext.generic_capabilities(
preferred_loader_file_format="jsonl",
loader_file_format_selector=filesystem_loader_file_format_selector,
- supported_table_formats=["delta"],
+ supported_table_formats=["delta", "iceberg"],
supported_merge_strategies=["upsert"],
merge_strategies_selector=filesystem_merge_strategies_selector,
)
diff --git a/dlt/destinations/impl/filesystem/filesystem.py b/dlt/destinations/impl/filesystem/filesystem.py
index 1739c87fb3..ccf764811b 100644
--- a/dlt/destinations/impl/filesystem/filesystem.py
+++ b/dlt/destinations/impl/filesystem/filesystem.py
@@ -119,16 +119,27 @@ def metrics(self) -> Optional[LoadJobMetrics]:
return m._replace(remote_url=self.make_remote_url())
-class DeltaLoadFilesystemJob(FilesystemLoadJob):
+class TableFormatLoadFilesystemJob(FilesystemLoadJob):
def __init__(self, file_path: str) -> None:
super().__init__(file_path=file_path)
self.file_paths = ReferenceFollowupJobRequest.resolve_references(self._file_path)
def make_remote_path(self) -> str:
- # remote path is table dir - delta will create its file structure inside it
return self._job_client.get_table_dir(self.load_table_name)
+ @property
+ def arrow_dataset(self) -> Any:
+ from dlt.common.libs.pyarrow import pyarrow
+
+ return pyarrow.dataset.dataset(self.file_paths)
+
+ @property
+ def _partition_columns(self) -> List[str]:
+ return get_columns_names_with_prop(self._load_table, "partition")
+
+
+class DeltaLoadFilesystemJob(TableFormatLoadFilesystemJob):
def run(self) -> None:
# create Arrow dataset from Parquet files
from dlt.common.libs.pyarrow import pyarrow as pa
@@ -138,7 +149,7 @@ def run(self) -> None:
f"Will copy file(s) {self.file_paths} to delta table {self.make_remote_url()} [arrow"
f" buffer: {pa.total_allocated_bytes()}]"
)
- source_ds = pa.dataset.dataset(self.file_paths)
+ source_ds = self.arrow_dataset
delta_table = self._delta_table()
# explicitly check if there is data
@@ -148,9 +159,6 @@ def run(self) -> None:
else:
with source_ds.scanner().to_reader() as arrow_rbr: # RecordBatchReader
if self._load_table["write_disposition"] == "merge" and delta_table is not None:
- self._load_table["x-merge-strategy"] = resolve_merge_strategy( # type: ignore[typeddict-unknown-key]
- self._schema.tables, self._load_table, self._job_client.capabilities
- )
merge_delta_table(
table=delta_table,
data=arrow_rbr,
@@ -188,10 +196,6 @@ def _delta_table(self) -> Optional["DeltaTable"]: # type: ignore[name-defined]
else:
return None
- @property
- def _partition_columns(self) -> List[str]:
- return get_columns_names_with_prop(self._load_table, "partition")
-
def _create_or_evolve_delta_table(self, arrow_ds: "Dataset", delta_table: "DeltaTable") -> "DeltaTable": # type: ignore[name-defined] # noqa: F821
from dlt.common.libs.deltalake import (
DeltaTable,
@@ -211,13 +215,36 @@ def _create_or_evolve_delta_table(self, arrow_ds: "Dataset", delta_table: "Delta
return _evolve_delta_table_schema(delta_table, arrow_ds.schema)
+class IcebergLoadFilesystemJob(TableFormatLoadFilesystemJob):
+ def run(self) -> None:
+ from dlt.common.libs.pyiceberg import write_iceberg_table
+
+ write_iceberg_table(
+ table=self._iceberg_table(),
+ data=self.arrow_dataset.to_table(),
+ write_disposition=self._load_table["write_disposition"],
+ )
+
+ def _iceberg_table(self) -> "pyiceberg.table.Table": # type: ignore[name-defined] # noqa: F821
+ from dlt.common.libs.pyiceberg import get_catalog
+
+ catalog = get_catalog(
+ client=self._job_client,
+ table_name=self.load_table_name,
+ schema=self.arrow_dataset.schema,
+ partition_columns=self._partition_columns,
+ )
+ return catalog.load_table(self.table_identifier)
+
+ @property
+ def table_identifier(self) -> str:
+ return f"{self._job_client.dataset_name}.{self.load_table_name}"
+
+
class FilesystemLoadJobWithFollowup(HasFollowupJobs, FilesystemLoadJob):
def create_followup_jobs(self, final_state: TLoadJobState) -> List[FollowupJobRequest]:
jobs = super().create_followup_jobs(final_state)
- if self._load_table.get("table_format") == "delta":
- # delta table jobs only require table chain followup jobs
- pass
- elif final_state == "completed":
+ if final_state == "completed":
ref_job = ReferenceFollowupJobRequest(
original_file_name=self.file_name(),
remote_paths=[self._job_client.make_remote_url(self.make_remote_path())],
@@ -394,6 +421,13 @@ def prepare_load_table(self, table_name: str) -> PreparedTableSchema:
if table["write_disposition"] == "merge":
table["write_disposition"] = "append"
table.pop("table_format", None)
+ merge_strategy = resolve_merge_strategy(self.schema.tables, table, self.capabilities)
+ if table["write_disposition"] == "merge":
+ if merge_strategy is None:
+ # no supported merge strategies, fall back to append
+ table["write_disposition"] = "append"
+ else:
+ table["x-merge-strategy"] = merge_strategy # type: ignore[typeddict-unknown-key]
return table
def get_table_dir(self, table_name: str, remote: bool = False) -> str:
@@ -458,12 +492,20 @@ def create_load_job(
# where we want to load the state the regular way
if table["name"] == self.schema.state_table_name and not self.config.as_staging_destination:
return FinalizedLoadJob(file_path)
- if table.get("table_format") == "delta":
- import dlt.common.libs.deltalake # assert dependencies are installed
+ table_format = table.get("table_format")
+ if table_format in ("delta", "iceberg"):
# a reference job for a delta table indicates a table chain followup job
if ReferenceFollowupJobRequest.is_reference_job(file_path):
- return DeltaLoadFilesystemJob(file_path)
+ if table_format == "delta":
+ import dlt.common.libs.deltalake
+
+ return DeltaLoadFilesystemJob(file_path)
+ elif table_format == "iceberg":
+ import dlt.common.libs.pyiceberg
+
+ return IcebergLoadFilesystemJob(file_path)
+
# otherwise just continue
return FinalizedLoadJobWithFollowupJobs(file_path)
@@ -494,10 +536,10 @@ def should_load_data_to_staging_dataset(self, table_name: str) -> bool:
def should_truncate_table_before_load(self, table_name: str) -> bool:
table = self.prepare_load_table(table_name)
- return (
- table["write_disposition"] == "replace"
- and not table.get("table_format") == "delta" # Delta can do a logical replace
- )
+ return table["write_disposition"] == "replace" and not table.get("table_format") in (
+ "delta",
+ "iceberg",
+ ) # Delta/Iceberg can do a logical replace
#
# state stuff
@@ -718,7 +760,7 @@ def create_table_chain_completed_followup_jobs(
jobs = super().create_table_chain_completed_followup_jobs(
table_chain, completed_table_chain_jobs
)
- if table_chain[0].get("table_format") == "delta":
+ if table_chain[0].get("table_format") in ("delta", "iceberg"):
for table in table_chain:
table_job_paths = [
job.file_path
diff --git a/dlt/destinations/impl/filesystem/sql_client.py b/dlt/destinations/impl/filesystem/sql_client.py
index d03a00b418..e6b84343bb 100644
--- a/dlt/destinations/impl/filesystem/sql_client.py
+++ b/dlt/destinations/impl/filesystem/sql_client.py
@@ -13,6 +13,7 @@
from dlt.common.destination.reference import DBApiCursor
+from dlt.common.storages.fsspec_filesystem import AZURE_BLOB_STORAGE_PROTOCOLS
from dlt.destinations.sql_client import raise_database_error
from dlt.destinations.impl.duckdb.sql_client import DuckDbSqlClient
@@ -169,8 +170,9 @@ def create_authentication(self, persistent: bool = False, secret_name: str = Non
# native google storage implementation is not supported..
elif self.fs_client.config.protocol in ["gs", "gcs"]:
logger.warn(
- "For gs/gcs access via duckdb please use the gs/gcs s3 compatibility layer. Falling"
- " back to fsspec."
+ "For gs/gcs access via duckdb please use the gs/gcs s3 compatibility layer if"
+ " possible (not supported when using `iceberg` table format). Falling back to"
+ " fsspec."
)
self._conn.register_filesystem(self.fs_client.fs_client)
@@ -192,7 +194,7 @@ def open_connection(self) -> duckdb.DuckDBPyConnection:
# the line below solves problems with certificate path lookup on linux
# see duckdb docs
- if self.fs_client.config.protocol in ["az", "abfss"]:
+ if self.fs_client.config.protocol in AZURE_BLOB_STORAGE_PROTOCOLS:
self._conn.sql("SET azure_transport_option_type = 'curl';")
return self._conn
@@ -212,14 +214,17 @@ def create_views_for_tables(self, tables: Dict[str, str]) -> None:
# unknown views will not be created
continue
- # only create view if it does not exist in the current schema yet
- existing_tables = [tname[0] for tname in self._conn.execute("SHOW TABLES").fetchall()]
- if view_name in existing_tables:
- continue
-
# NOTE: if this is staging configuration then `prepare_load_table` will remove some info
# from table schema, if we ever extend this to handle staging destination, this needs to change
schema_table = self.fs_client.prepare_load_table(table_name)
+ table_format = schema_table.get("table_format")
+
+ # skip if view already exists and does not need to be replaced each time
+ existing_tables = [tname[0] for tname in self._conn.execute("SHOW TABLES").fetchall()]
+ needs_replace = table_format == "iceberg" or self.fs_client.config.protocol == "abfss"
+ if view_name in existing_tables and not needs_replace:
+ continue
+
# discover file type
folder = self.fs_client.get_table_dir(table_name)
files = self.fs_client.list_table_files(table_name)
@@ -256,8 +261,17 @@ def create_views_for_tables(self, tables: Dict[str, str]) -> None:
# create from statement
from_statement = ""
- if schema_table.get("table_format") == "delta":
+ if table_format == "delta":
from_statement = f"delta_scan('{resolved_folder}')"
+ elif table_format == "iceberg":
+ from dlt.common.libs.pyiceberg import _get_last_metadata_file
+
+ self._setup_iceberg(self._conn)
+ metadata_path = f"{resolved_folder}/metadata"
+ last_metadata_file = _get_last_metadata_file(metadata_path, self.fs_client)
+ # skip schema inference to make nested data types work
+ # https://github.com/duckdb/duckdb_iceberg/issues/47
+ from_statement = f"iceberg_scan('{last_metadata_file}', skip_schema_inference=True)"
elif first_file_type == "parquet":
from_statement = f"read_parquet([{resolved_files_string}])"
elif first_file_type == "jsonl":
@@ -267,12 +281,14 @@ def create_views_for_tables(self, tables: Dict[str, str]) -> None:
else:
raise NotImplementedError(
f"Unknown filetype {first_file_type} for table {table_name}. Currently only"
- " jsonl and parquet files as well as delta tables are supported."
+ " jsonl and parquet files as well as delta and iceberg tables are supported."
)
# create table
view_name = self.make_qualified_table_name(view_name)
- create_table_sql_base = f"CREATE VIEW {view_name} AS SELECT * FROM {from_statement}"
+ create_table_sql_base = (
+ f"CREATE OR REPLACE VIEW {view_name} AS SELECT * FROM {from_statement}"
+ )
self._conn.execute(create_table_sql_base)
@contextmanager
@@ -299,6 +315,16 @@ def execute_query(self, query: AnyStr, *args: Any, **kwargs: Any) -> Iterator[DB
with super().execute_query(query, *args, **kwargs) as cursor:
yield cursor
+ @staticmethod
+ def _setup_iceberg(conn: duckdb.DuckDBPyConnection) -> None:
+ # needed to make persistent secrets work in new connection
+ # https://github.com/duckdb/duckdb_iceberg/issues/83
+ conn.execute("FROM duckdb_secrets();")
+
+ # `duckdb_iceberg` extension does not support autoloading
+ # https://github.com/duckdb/duckdb_iceberg/issues/71
+ conn.execute("INSTALL iceberg; LOAD iceberg;")
+
def __del__(self) -> None:
if self.memory_db:
self.memory_db.close()
diff --git a/dlt/destinations/impl/mssql/mssql.py b/dlt/destinations/impl/mssql/mssql.py
index 27aebe07f2..7b48a6b551 100644
--- a/dlt/destinations/impl/mssql/mssql.py
+++ b/dlt/destinations/impl/mssql/mssql.py
@@ -115,11 +115,7 @@ def _get_column_def_sql(self, c: TColumnSchema, table: PreparedTableSchema = Non
else:
db_type = self.type_mapper.to_destination_type(c, table)
- hints_str = " ".join(
- self.active_hints.get(h, "")
- for h in self.active_hints.keys()
- if c.get(h, False) is True
- )
+ hints_str = self._get_column_hints_sql(c)
column_name = self.sql_client.escape_column_name(c["name"])
return f"{column_name} {db_type} {hints_str} {self._gen_not_null(c.get('nullable', True))}"
diff --git a/dlt/destinations/impl/postgres/postgres.py b/dlt/destinations/impl/postgres/postgres.py
index 2459ee1dbe..3d54b59f93 100644
--- a/dlt/destinations/impl/postgres/postgres.py
+++ b/dlt/destinations/impl/postgres/postgres.py
@@ -161,18 +161,6 @@ def create_load_job(
job = PostgresCsvCopyJob(file_path)
return job
- def _get_column_def_sql(self, c: TColumnSchema, table: PreparedTableSchema = None) -> str:
- hints_ = " ".join(
- self.active_hints.get(h, "")
- for h in self.active_hints.keys()
- if c.get(h, False) is True
- )
- column_name = self.sql_client.escape_column_name(c["name"])
- nullability = self._gen_not_null(c.get("nullable", True))
- column_type = self.type_mapper.to_destination_type(c, table)
-
- return f"{column_name} {column_type} {hints_} {nullability}"
-
def _create_replace_followup_jobs(
self, table_chain: Sequence[PreparedTableSchema]
) -> List[FollowupJobRequest]:
diff --git a/dlt/destinations/impl/redshift/redshift.py b/dlt/destinations/impl/redshift/redshift.py
index 2335166761..b1aa37ce6a 100644
--- a/dlt/destinations/impl/redshift/redshift.py
+++ b/dlt/destinations/impl/redshift/redshift.py
@@ -153,6 +153,7 @@ def __init__(
capabilities,
)
super().__init__(schema, config, sql_client)
+ self.active_hints = HINT_TO_REDSHIFT_ATTR
self.sql_client = sql_client
self.config: RedshiftClientConfiguration = config
self.type_mapper = self.capabilities.get_type_mapper()
@@ -162,17 +163,6 @@ def _create_merge_followup_jobs(
) -> List[FollowupJobRequest]:
return [RedshiftMergeJob.from_table_chain(table_chain, self.sql_client)]
- def _get_column_def_sql(self, c: TColumnSchema, table: PreparedTableSchema = None) -> str:
- hints_str = " ".join(
- HINT_TO_REDSHIFT_ATTR.get(h, "")
- for h in HINT_TO_REDSHIFT_ATTR.keys()
- if c.get(h, False) is True
- )
- column_name = self.sql_client.escape_column_name(c["name"])
- return (
- f"{column_name} {self.type_mapper.to_destination_type(c,table)} {hints_str} {self._gen_not_null(c.get('nullable', True))}"
- )
-
def create_load_job(
self, table: PreparedTableSchema, file_path: str, load_id: str, restore: bool = False
) -> LoadJob:
diff --git a/dlt/destinations/impl/snowflake/configuration.py b/dlt/destinations/impl/snowflake/configuration.py
index 4a89a1564b..2e589ea095 100644
--- a/dlt/destinations/impl/snowflake/configuration.py
+++ b/dlt/destinations/impl/snowflake/configuration.py
@@ -138,6 +138,24 @@ class SnowflakeClientConfiguration(DestinationClientDwhWithStagingConfiguration)
query_tag: Optional[str] = None
"""A tag with placeholders to tag sessions executing jobs"""
+ create_indexes: bool = False
+ """Whether UNIQUE or PRIMARY KEY constraints should be created"""
+
+ def __init__(
+ self,
+ *,
+ credentials: SnowflakeCredentials = None,
+ create_indexes: bool = False,
+ destination_name: str = None,
+ environment: str = None,
+ ) -> None:
+ super().__init__(
+ credentials=credentials,
+ destination_name=destination_name,
+ environment=environment,
+ )
+ self.create_indexes = create_indexes
+
def fingerprint(self) -> str:
"""Returns a fingerprint of host part of a connection string"""
if self.credentials and self.credentials.host:
diff --git a/dlt/destinations/impl/snowflake/snowflake.py b/dlt/destinations/impl/snowflake/snowflake.py
index e5146139f2..786cdc0b77 100644
--- a/dlt/destinations/impl/snowflake/snowflake.py
+++ b/dlt/destinations/impl/snowflake/snowflake.py
@@ -1,6 +1,7 @@
-from typing import Optional, Sequence, List
+from typing import Optional, Sequence, List, Dict, Set
from urllib.parse import urlparse, urlunparse
+from dlt.common import logger
from dlt.common.data_writers.configuration import CsvFormatConfiguration
from dlt.common.destination import DestinationCapabilitiesContext
from dlt.common.destination.reference import (
@@ -15,13 +16,15 @@
AwsCredentialsWithoutDefaults,
AzureCredentialsWithoutDefaults,
)
+from dlt.common.schema.utils import get_columns_names_with_prop
from dlt.common.storages.configuration import FilesystemConfiguration, ensure_canonical_az_url
from dlt.common.storages.file_storage import FileStorage
-from dlt.common.schema import TColumnSchema, Schema
-from dlt.common.schema.typing import TColumnType
+from dlt.common.schema import TColumnSchema, Schema, TColumnHint
+from dlt.common.schema.typing import TColumnType, TTableSchema
from dlt.common.storages.fsspec_filesystem import AZURE_BLOB_STORAGE_PROTOCOLS, S3_PROTOCOLS
from dlt.common.typing import TLoaderFileFormat
+from dlt.common.utils import uniq_id
from dlt.destinations.job_client_impl import SqlJobClientWithStagingDataset
from dlt.destinations.exceptions import LoadJobTerminalException
@@ -29,6 +32,8 @@
from dlt.destinations.impl.snowflake.sql_client import SnowflakeSqlClient
from dlt.destinations.job_impl import ReferenceFollowupJobRequest
+SUPPORTED_HINTS: Dict[TColumnHint, str] = {"unique": "UNIQUE"}
+
class SnowflakeLoadJob(RunnableLoadJob, HasFollowupJobs):
def __init__(
@@ -238,6 +243,7 @@ def __init__(
self.config: SnowflakeClientConfiguration = config
self.sql_client: SnowflakeSqlClient = sql_client # type: ignore
self.type_mapper = self.capabilities.get_type_mapper()
+ self.active_hints = SUPPORTED_HINTS if self.config.create_indexes else {}
def create_load_job(
self, table: PreparedTableSchema, file_path: str, load_id: str, restore: bool = False
@@ -264,6 +270,33 @@ def _make_add_column_sql(
"ADD COLUMN\n" + ",\n".join(self._get_column_def_sql(c, table) for c in new_columns)
]
+ def _get_constraints_sql(
+ self, table_name: str, new_columns: Sequence[TColumnSchema], generate_alter: bool
+ ) -> str:
+ # PRIMARY KEY is not a per-column hint; it is emitted below as a table-level constraint
+ if self.config.create_indexes:
+ partial: TTableSchema = {
+ "name": table_name,
+ "columns": {c["name"]: c for c in new_columns},
+ }
+ # Add PK constraint if pk_columns exist
+ pk_columns = get_columns_names_with_prop(partial, "primary_key")
+ if pk_columns:
+ if generate_alter:
+ logger.warning(
+ f"PRIMARY KEY on {table_name} constraint cannot be added in ALTER TABLE and"
+ " is ignored"
+ )
+ else:
+ pk_constraint_name = list(
+ self._norm_and_escape_columns(f"PK_{table_name}_{uniq_id(4)}")
+ )[0]
+ quoted_pk_cols = ", ".join(
+ self.sql_client.escape_column_name(col) for col in pk_columns
+ )
+ return f",\nCONSTRAINT {pk_constraint_name} PRIMARY KEY ({quoted_pk_cols})"
+ return ""
+
def _get_table_update_sql(
self,
table_name: str,
@@ -287,11 +320,5 @@ def _from_db_type(
) -> TColumnType:
return self.type_mapper.from_destination_type(bq_t, precision, scale)
- def _get_column_def_sql(self, c: TColumnSchema, table: PreparedTableSchema = None) -> str:
- name = self.sql_client.escape_column_name(c["name"])
- return (
- f"{name} {self.type_mapper.to_destination_type(c,table)} {self._gen_not_null(c.get('nullable', True))}"
- )
-
def should_truncate_table_before_load_on_staging_destination(self, table_name: str) -> bool:
return self.config.truncate_tables_on_staging_destination_before_load
diff --git a/dlt/destinations/impl/sqlalchemy/db_api_client.py b/dlt/destinations/impl/sqlalchemy/db_api_client.py
index 6f3ff065bf..27c4f2f1f9 100644
--- a/dlt/destinations/impl/sqlalchemy/db_api_client.py
+++ b/dlt/destinations/impl/sqlalchemy/db_api_client.py
@@ -84,7 +84,7 @@ def __init__(self, curr: sa.engine.CursorResult) -> None:
def _get_columns(self) -> List[str]:
try:
- return list(self.native_cursor.keys()) # type: ignore[attr-defined]
+ return list(self.native_cursor.keys())
except ResourceClosedError:
# this happens if now rows are returned
return []
@@ -314,7 +314,7 @@ def execute_sql(
self, sql: Union[AnyStr, sa.sql.Executable], *args: Any, **kwargs: Any
) -> Optional[Sequence[Sequence[Any]]]:
with self.execute_query(sql, *args, **kwargs) as cursor:
- if cursor.returns_rows: # type: ignore[attr-defined]
+ if cursor.returns_rows:
return cursor.fetchall()
return None
diff --git a/dlt/destinations/impl/sqlalchemy/factory.py b/dlt/destinations/impl/sqlalchemy/factory.py
index edd827ed00..e61ac1fb6a 100644
--- a/dlt/destinations/impl/sqlalchemy/factory.py
+++ b/dlt/destinations/impl/sqlalchemy/factory.py
@@ -81,6 +81,9 @@ def adjust_capabilities(
caps.max_column_identifier_length = dialect.max_identifier_length
caps.supports_native_boolean = dialect.supports_native_boolean
if dialect.name == "mysql":
+ # correct max identifier length
+ # dialect uses 255 (max length for aliases) instead of 64 (max length of identifiers)
+ caps.max_identifier_length = 64
caps.format_datetime_literal = _format_mysql_datetime_literal
return caps
diff --git a/dlt/destinations/job_client_impl.py b/dlt/destinations/job_client_impl.py
index d1f211b1e9..888c80c006 100644
--- a/dlt/destinations/job_client_impl.py
+++ b/dlt/destinations/job_client_impl.py
@@ -7,6 +7,7 @@
from typing import (
Any,
ClassVar,
+ Dict,
List,
Optional,
Sequence,
@@ -14,21 +15,18 @@
Type,
Iterable,
Iterator,
- Generator,
)
import zlib
import re
-from contextlib import contextmanager
-from contextlib import suppress
from dlt.common import pendulum, logger
+from dlt.common.destination.capabilities import DataTypeMapper
from dlt.common.json import json
from dlt.common.schema.typing import (
C_DLT_LOAD_ID,
COLUMN_HINTS,
TColumnType,
TColumnSchemaBase,
- TTableFormat,
)
from dlt.common.schema.utils import (
get_inherited_table_hint,
@@ -40,11 +38,11 @@
from dlt.common.storages import FileStorage
from dlt.common.storages.load_package import LoadJobInfo, ParsedLoadJobFileName
from dlt.common.schema import TColumnSchema, Schema, TTableSchemaColumns, TSchemaTables
+from dlt.common.schema import TColumnHint
from dlt.common.destination.reference import (
PreparedTableSchema,
StateInfo,
StorageSchemaInfo,
- SupportsReadableDataset,
WithStateSync,
DestinationClientConfiguration,
DestinationClientDwhConfiguration,
@@ -55,9 +53,7 @@
JobClientBase,
HasFollowupJobs,
CredentialsConfiguration,
- SupportsReadableRelation,
)
-from dlt.destinations.dataset import ReadableDBAPIDataset
from dlt.destinations.exceptions import DatabaseUndefinedRelation
from dlt.destinations.job_impl import (
@@ -154,6 +150,8 @@ def __init__(
self.state_table_columns = ", ".join(
sql_client.escape_column_name(col) for col in state_table_["columns"]
)
+ self.active_hints: Dict[TColumnHint, str] = {}
+ self.type_mapper: DataTypeMapper = None
super().__init__(schema, config, sql_client.capabilities)
self.sql_client = sql_client
assert isinstance(config, DestinationClientDwhConfiguration)
@@ -569,6 +567,7 @@ def _get_table_update_sql(
# build CREATE
sql = self._make_create_table(qualified_name, table) + " (\n"
sql += ",\n".join([self._get_column_def_sql(c, table) for c in new_columns])
+ sql += self._get_constraints_sql(table_name, new_columns, generate_alter)
sql += ")"
sql_result.append(sql)
else:
@@ -582,8 +581,16 @@ def _get_table_update_sql(
sql_result.extend(
[sql_base + col_statement for col_statement in add_column_statements]
)
+ constraints_sql = self._get_constraints_sql(table_name, new_columns, generate_alter)
+ if constraints_sql:
+ sql_result.append(constraints_sql)
return sql_result
+ def _get_constraints_sql(
+ self, table_name: str, new_columns: Sequence[TColumnSchema], generate_alter: bool
+ ) -> str:
+ return ""
+
def _check_table_update_hints(
self, table_name: str, new_columns: Sequence[TColumnSchema], generate_alter: bool
) -> None:
@@ -613,12 +620,22 @@ def _check_table_update_hints(
" existing tables."
)
- @abstractmethod
def _get_column_def_sql(self, c: TColumnSchema, table: PreparedTableSchema = None) -> str:
- pass
+ hints_ = self._get_column_hints_sql(c)
+ column_name = self.sql_client.escape_column_name(c["name"])
+ nullability = self._gen_not_null(c.get("nullable", True))
+ column_type = self.type_mapper.to_destination_type(c, table)
+
+ return f"{column_name} {column_type} {hints_} {nullability}"
+
+ def _get_column_hints_sql(self, c: TColumnSchema) -> str:
+ return " ".join(
+ self.active_hints.get(h, "")
+ for h in self.active_hints.keys()
+ if c.get(h, False) is True # use ColumnPropInfos to get default value
+ )
- @staticmethod
- def _gen_not_null(nullable: bool) -> str:
+ def _gen_not_null(self, nullable: bool) -> str:
return "NOT NULL" if not nullable else ""
def _create_table_update(
diff --git a/dlt/extract/exceptions.py b/dlt/extract/exceptions.py
index f4d2b1f302..e832833428 100644
--- a/dlt/extract/exceptions.py
+++ b/dlt/extract/exceptions.py
@@ -3,7 +3,6 @@
from dlt.common.exceptions import DltException
from dlt.common.utils import get_callable_name
-from dlt.extract.items import ValidateItem, TDataItems
class ExtractorException(DltException):
diff --git a/dlt/extract/extract.py b/dlt/extract/extract.py
index 25c3a0dbae..c062a74920 100644
--- a/dlt/extract/extract.py
+++ b/dlt/extract/extract.py
@@ -87,7 +87,12 @@ def choose_schema() -> Schema:
schema_ = schema
# take pipeline schema to make newest version visible to the resources
elif pipeline.default_schema_name:
- schema_ = pipeline.schemas[pipeline.default_schema_name].clone()
+ # clone with an explicit name so that the previous hashes are dropped
+ schema_ = pipeline.schemas[pipeline.default_schema_name].clone(
+ with_name=pipeline.default_schema_name
+ )
+ # delete data tables
+ schema_.drop_tables(schema_.data_table_names(include_incomplete=True))
else:
schema_ = pipeline._make_schema_with_default_name()
return schema_
diff --git a/dlt/extract/hints.py b/dlt/extract/hints.py
index 000e5c4cdb..22a0062acf 100644
--- a/dlt/extract/hints.py
+++ b/dlt/extract/hints.py
@@ -37,7 +37,8 @@
InconsistentTableTemplate,
)
from dlt.extract.incremental import Incremental, TIncrementalConfig
-from dlt.extract.items import TFunHintTemplate, TTableHintTemplate, TableNameMeta, ValidateItem
+from dlt.extract.items import TFunHintTemplate, TTableHintTemplate, TableNameMeta
+from dlt.extract.items_transform import ValidateItem
from dlt.extract.utils import ensure_table_schema_columns, ensure_table_schema_columns_hint
from dlt.extract.validation import create_item_validator
diff --git a/dlt/extract/incremental/__init__.py b/dlt/extract/incremental/__init__.py
index 28d33bb71f..ce06292864 100644
--- a/dlt/extract/incremental/__init__.py
+++ b/dlt/extract/incremental/__init__.py
@@ -42,8 +42,10 @@
LastValueFunc,
OnCursorValueMissing,
IncrementalArgs,
+ TIncrementalRange,
)
-from dlt.extract.items import SupportsPipe, TTableHintTemplate, ItemTransform
+from dlt.extract.items import SupportsPipe, TTableHintTemplate
+from dlt.extract.items_transform import ItemTransform
from dlt.extract.incremental.transform import (
JsonIncremental,
ArrowIncremental,
@@ -104,6 +106,11 @@ class Incremental(ItemTransform[TDataItem], BaseConfiguration, Generic[TCursorVa
Note that if logical "end date" is present then also "end_value" will be set which means that resource state is not used and exactly this range of date will be loaded
on_cursor_value_missing: Specify what happens when the cursor_path does not exist in a record or a record has `None` at the cursor_path: raise, include, exclude
lag: Optional value used to define a lag or attribution window. For datetime cursors, this is interpreted as seconds. For other types, it uses the + or - operator depending on the last_value_func.
+ range_start: Decide whether the incremental filtering range is `open` or `closed` on the start value side. Default is `closed`.
+ Setting this to `open` means that items with the same cursor value as the last value from the previous run (or `initial_value`) are excluded from the result.
+ The `open` range disables deduplication logic so it can serve as an optimization when you know cursors don't overlap between pipeline runs.
+ range_end: Decide whether the incremental filtering range is `open` or `closed` on the end value side. Default is `open` (exact `end_value` is excluded).
+ Setting this to `closed` means that items with the exact same cursor value as the `end_value` are included in the result.
"""
# this is config/dataclass so declare members
@@ -116,6 +123,8 @@ class Incremental(ItemTransform[TDataItem], BaseConfiguration, Generic[TCursorVa
on_cursor_value_missing: OnCursorValueMissing = "raise"
lag: Optional[float] = None
duplicate_cursor_warning_threshold: ClassVar[int] = 200
+ range_start: TIncrementalRange = "closed"
+ range_end: TIncrementalRange = "open"
# incremental acting as empty
EMPTY: ClassVar["Incremental[Any]"] = None
@@ -132,6 +141,8 @@ def __init__(
allow_external_schedulers: bool = False,
on_cursor_value_missing: OnCursorValueMissing = "raise",
lag: Optional[float] = None,
+ range_start: TIncrementalRange = "closed",
+ range_end: TIncrementalRange = "open",
) -> None:
# make sure that path is valid
if cursor_path:
@@ -174,9 +185,11 @@ def __init__(
self.start_out_of_range: bool = False
"""Becomes true on the first item that is out of range of `start_value`. I.e. when using `max` this is a value that is lower than `start_value`"""
- self._transformers: Dict[str, IncrementalTransform] = {}
+ self._transformers: Dict[Type[IncrementalTransform], IncrementalTransform] = {}
self._bound_pipe: SupportsPipe = None
"""Bound pipe"""
+ self.range_start = range_start
+ self.range_end = range_end
@property
def primary_key(self) -> Optional[TTableHintTemplate[TColumnNames]]:
@@ -190,22 +203,6 @@ def primary_key(self, value: str) -> None:
for transform in self._transformers.values():
transform.primary_key = value
- def _make_transforms(self) -> None:
- types = [("arrow", ArrowIncremental), ("json", JsonIncremental)]
- for dt, kls in types:
- self._transformers[dt] = kls(
- self.resource_name,
- self.cursor_path,
- self.initial_value,
- self.start_value,
- self.end_value,
- self.last_value_func,
- self._primary_key,
- set(self._cached_state["unique_hashes"]),
- self.on_cursor_value_missing,
- self.lag,
- )
-
@classmethod
def from_existing_state(
cls, resource_name: str, cursor_path: str
@@ -489,7 +486,8 @@ def bind(self, pipe: SupportsPipe) -> "Incremental[TCursorValue]":
)
# cache state
self._cached_state = self.get_state()
- self._make_transforms()
+ # Clear transforms so we get new instances
+ self._transformers.clear()
return self
def can_close(self) -> bool:
@@ -520,15 +518,34 @@ def __str__(self) -> str:
f" {self.last_value_func}"
)
+ def _make_or_get_transformer(self, cls: Type[IncrementalTransform]) -> IncrementalTransform:
+ if transformer := self._transformers.get(cls):  # reuse the instance cached for this class
+ return transformer
+ transformer = self._transformers[cls] = cls(  # first use: build from current settings, cache
+ self.resource_name,
+ self.cursor_path,
+ self.initial_value,
+ self.start_value,
+ self.end_value,
+ self.last_value_func,
+ self._primary_key,
+ set(self._cached_state["unique_hashes"]),  # new set built from the cached state
+ self.on_cursor_value_missing,
+ self.lag,
+ self.range_start,
+ self.range_end,
+ )
+ return transformer
+
def _get_transformer(self, items: TDataItems) -> IncrementalTransform:
# Assume list is all of the same type
for item in items if isinstance(items, list) else [items]:
if is_arrow_item(item):
- return self._transformers["arrow"]
+ return self._make_or_get_transformer(ArrowIncremental)
elif pandas is not None and isinstance(item, pandas.DataFrame):
- return self._transformers["arrow"]
- return self._transformers["json"]
- return self._transformers["json"]
+ return self._make_or_get_transformer(ArrowIncremental)
+ return self._make_or_get_transformer(JsonIncremental)
+ return self._make_or_get_transformer(JsonIncremental)
def __call__(self, rows: TDataItems, meta: Any = None) -> Optional[TDataItems]:
if rows is None:
diff --git a/dlt/extract/incremental/lag.py b/dlt/extract/incremental/lag.py
index ee102a9961..dfafa2cd11 100644
--- a/dlt/extract/incremental/lag.py
+++ b/dlt/extract/incremental/lag.py
@@ -20,7 +20,7 @@ def _apply_lag_to_value(
parsed_value = ensure_pendulum_datetime(value) if is_str else value
if isinstance(parsed_value, (datetime, date)):
- parsed_value = _apply_lag_to_datetime(lag, parsed_value, last_value_func, is_str_date)
+ parsed_value = _apply_lag_to_datetime(lag, parsed_value, last_value_func, is_str_date) # type: ignore[assignment]
# go back to string or pass exact type
value = parsed_value.strftime(value_format) if value_format else parsed_value # type: ignore[assignment]
diff --git a/dlt/extract/incremental/transform.py b/dlt/extract/incremental/transform.py
index 22b1194b51..1d213e26c2 100644
--- a/dlt/extract/incremental/transform.py
+++ b/dlt/extract/incremental/transform.py
@@ -13,7 +13,12 @@
IncrementalPrimaryKeyMissing,
IncrementalCursorPathHasValueNone,
)
-from dlt.common.incremental.typing import TCursorValue, LastValueFunc, OnCursorValueMissing
+from dlt.common.incremental.typing import (
+ TCursorValue,
+ LastValueFunc,
+ OnCursorValueMissing,
+ TIncrementalRange,
+)
from dlt.extract.utils import resolve_column_value
from dlt.extract.items import TTableHintTemplate
@@ -57,6 +62,8 @@ def __init__(
unique_hashes: Set[str],
on_cursor_value_missing: OnCursorValueMissing = "raise",
lag: Optional[float] = None,
+ range_start: TIncrementalRange = "closed",
+ range_end: TIncrementalRange = "open",
) -> None:
self.resource_name = resource_name
self.cursor_path = cursor_path
@@ -71,6 +78,9 @@ def __init__(
self.start_unique_hashes = set(unique_hashes)
self.on_cursor_value_missing = on_cursor_value_missing
self.lag = lag
+ self.range_start = range_start
+ self.range_end = range_end
+
# compile jsonpath
self._compiled_cursor_path = compile_path(cursor_path)
# for simple column name we'll fallback to search in dict
@@ -107,6 +117,8 @@ def __call__(
def deduplication_disabled(self) -> bool:
"""Skip deduplication when length of the key is 0 or if lag is applied."""
# disable deduplication if end value is set - state is not saved
+ if self.range_start == "open":
+ return True
if self.end_value is not None:
return True
# disable deduplication if lag is applied - destination must deduplicate ranges
@@ -191,10 +203,10 @@ def __call__(
# Filter end value ranges exclusively, so in case of "max" function we remove values >= end_value
if self.end_value is not None:
try:
- if (
- last_value_func((row_value, self.end_value)) != self.end_value
- or last_value_func((row_value,)) == self.end_value
- ):
+ if last_value_func((row_value, self.end_value)) != self.end_value:
+ return None, False, True
+
+ if self.range_end == "open" and last_value_func((row_value,)) == self.end_value:
return None, False, True
except Exception as ex:
raise IncrementalCursorInvalidCoercion(
@@ -221,6 +233,9 @@ def __call__(
) from ex
# new_value is "less" or equal to last_value (the actual max)
if last_value == new_value:
+ if self.range_start == "open":
+ # We only want greater than last_value
+ return None, False, False
# use func to compute row_value into last_value compatible
processed_row_value = last_value_func((row_value,))
# skip the record that is not a start_value or new_value: that record was already processed
@@ -258,6 +273,31 @@ def __call__(
class ArrowIncremental(IncrementalTransform):
_dlt_index = "_dlt_index"
+ def __init__(self, *args: Any, **kwargs: Any) -> None:
+ super().__init__(*args, **kwargs)
+ if self.last_value_func is max:  # pick the pyarrow functions once, not on every call
+ self.compute = pa.compute.max
+ self.end_compare = (  # rows passing this against end_value are kept; open end drops equal rows
+ pa.compute.less if self.range_end == "open" else pa.compute.less_equal
+ )
+ self.last_value_compare = (  # rows passing this against start_value are kept
+ pa.compute.greater_equal if self.range_start == "closed" else pa.compute.greater
+ )
+ self.new_value_compare = pa.compute.greater  # strict: tells if last_value has changed
+ elif self.last_value_func is min:  # same choices with the comparisons mirrored
+ self.compute = pa.compute.min
+ self.end_compare = (
+ pa.compute.greater if self.range_end == "open" else pa.compute.greater_equal
+ )
+ self.last_value_compare = (
+ pa.compute.less_equal if self.range_start == "closed" else pa.compute.less
+ )
+ self.new_value_compare = pa.compute.less
+ else:
+ raise NotImplementedError(
+ "Only min or max last_value_func is supported for arrow tables"
+ )
+
def compute_unique_values(self, item: "TAnyArrowItem", unique_columns: List[str]) -> List[str]:
if not unique_columns:
return []
@@ -312,28 +352,13 @@ def __call__(
if not tbl: # row is None or empty arrow table
return tbl, start_out_of_range, end_out_of_range
- if self.last_value_func is max:
- compute = pa.compute.max
- end_compare = pa.compute.less
- last_value_compare = pa.compute.greater_equal
- new_value_compare = pa.compute.greater
- elif self.last_value_func is min:
- compute = pa.compute.min
- end_compare = pa.compute.greater
- last_value_compare = pa.compute.less_equal
- new_value_compare = pa.compute.less
- else:
- raise NotImplementedError(
- "Only min or max last_value_func is supported for arrow tables"
- )
-
# TODO: Json path support. For now assume the cursor_path is a column name
cursor_path = self.cursor_path
# The new max/min value
try:
# NOTE: datetimes are always pendulum in UTC
- row_value = from_arrow_scalar(compute(tbl[cursor_path]))
+ row_value = from_arrow_scalar(self.compute(tbl[cursor_path]))
cursor_data_type = tbl.schema.field(cursor_path).type
row_value_scalar = to_arrow_scalar(row_value, cursor_data_type)
except KeyError as e:
@@ -364,10 +389,10 @@ def __call__(
cursor_data_type,
str(ex),
) from ex
- tbl = tbl.filter(end_compare(tbl[cursor_path], end_value_scalar))
+ tbl = tbl.filter(self.end_compare(tbl[cursor_path], end_value_scalar))
# Is max row value higher than end value?
# NOTE: pyarrow bool *always* evaluates to python True. `as_py()` is necessary
- end_out_of_range = not end_compare(row_value_scalar, end_value_scalar).as_py()
+ end_out_of_range = not self.end_compare(row_value_scalar, end_value_scalar).as_py()
if self.start_value is not None:
try:
@@ -383,7 +408,7 @@ def __call__(
str(ex),
) from ex
# Remove rows lower or equal than the last start value
- keep_filter = last_value_compare(tbl[cursor_path], start_value_scalar)
+ keep_filter = self.last_value_compare(tbl[cursor_path], start_value_scalar)
start_out_of_range = bool(pa.compute.any(pa.compute.invert(keep_filter)).as_py())
tbl = tbl.filter(keep_filter)
if not self.deduplication_disabled:
@@ -407,7 +432,7 @@ def __call__(
if (
self.last_value is None
- or new_value_compare(
+ or self.new_value_compare(
row_value_scalar, to_arrow_scalar(self.last_value, cursor_data_type)
).as_py()
): # Last value has changed
diff --git a/dlt/extract/items.py b/dlt/extract/items.py
index 888787e6b7..ad7447c163 100644
--- a/dlt/extract/items.py
+++ b/dlt/extract/items.py
@@ -1,21 +1,16 @@
-import inspect
from abc import ABC, abstractmethod
from typing import (
Any,
Callable,
- ClassVar,
- Generic,
Iterator,
Iterable,
Literal,
Optional,
Protocol,
- TypeVar,
Union,
Awaitable,
TYPE_CHECKING,
NamedTuple,
- Generator,
)
from concurrent.futures import Future
@@ -28,7 +23,6 @@
TDynHintType,
)
-
TDecompositionStrategy = Literal["none", "scc"]
TDeferredDataItems = Callable[[], TDataItems]
TAwaitableDataItems = Awaitable[TDataItems]
@@ -113,6 +107,10 @@ def gen(self) -> TPipeStep:
"""A data generating step"""
...
+ def replace_gen(self, gen: TPipeStep) -> None:
+ """Replaces data generating step. Assumes that you know what you are doing"""
+ ...
+
def __getitem__(self, i: int) -> TPipeStep:
"""Get pipe step at index"""
...
@@ -129,112 +127,3 @@ def has_parent(self) -> bool:
def close(self) -> None:
"""Closes pipe generator"""
...
-
-
-ItemTransformFunctionWithMeta = Callable[[TDataItem, str], TAny]
-ItemTransformFunctionNoMeta = Callable[[TDataItem], TAny]
-ItemTransformFunc = Union[ItemTransformFunctionWithMeta[TAny], ItemTransformFunctionNoMeta[TAny]]
-
-
-class ItemTransform(ABC, Generic[TAny]):
- _f_meta: ItemTransformFunctionWithMeta[TAny] = None
- _f: ItemTransformFunctionNoMeta[TAny] = None
-
- placement_affinity: ClassVar[float] = 0
- """Tell how strongly an item sticks to start (-1) or end (+1) of pipe."""
-
- def __init__(self, transform_f: ItemTransformFunc[TAny]) -> None:
- # inspect the signature
- sig = inspect.signature(transform_f)
- # TODO: use TypeGuard here to get rid of type ignore
- if len(sig.parameters) == 1:
- self._f = transform_f # type: ignore
- else: # TODO: do better check
- self._f_meta = transform_f # type: ignore
-
- def bind(self: "ItemTransform[TAny]", pipe: SupportsPipe) -> "ItemTransform[TAny]":
- return self
-
- @abstractmethod
- def __call__(self, item: TDataItems, meta: Any = None) -> Optional[TDataItems]:
- """Transforms `item` (a list of TDataItem or a single TDataItem) and returns or yields TDataItems. Returns None to consume item (filter out)"""
- pass
-
-
-class FilterItem(ItemTransform[bool]):
- # mypy needs those to type correctly
- _f_meta: ItemTransformFunctionWithMeta[bool]
- _f: ItemTransformFunctionNoMeta[bool]
-
- def __call__(self, item: TDataItems, meta: Any = None) -> Optional[TDataItems]:
- if isinstance(item, list):
- # preserve empty lists
- if len(item) == 0:
- return item
-
- if self._f_meta:
- item = [i for i in item if self._f_meta(i, meta)]
- else:
- item = [i for i in item if self._f(i)]
- if not item:
- # item was fully consumed by the filter
- return None
- return item
- else:
- if self._f_meta:
- return item if self._f_meta(item, meta) else None
- else:
- return item if self._f(item) else None
-
-
-class MapItem(ItemTransform[TDataItem]):
- # mypy needs those to type correctly
- _f_meta: ItemTransformFunctionWithMeta[TDataItem]
- _f: ItemTransformFunctionNoMeta[TDataItem]
-
- def __call__(self, item: TDataItems, meta: Any = None) -> Optional[TDataItems]:
- if isinstance(item, list):
- if self._f_meta:
- return [self._f_meta(i, meta) for i in item]
- else:
- return [self._f(i) for i in item]
- else:
- if self._f_meta:
- return self._f_meta(item, meta)
- else:
- return self._f(item)
-
-
-class YieldMapItem(ItemTransform[Iterator[TDataItem]]):
- # mypy needs those to type correctly
- _f_meta: ItemTransformFunctionWithMeta[TDataItem]
- _f: ItemTransformFunctionNoMeta[TDataItem]
-
- def __call__(self, item: TDataItems, meta: Any = None) -> Optional[TDataItems]:
- if isinstance(item, list):
- for i in item:
- if self._f_meta:
- yield from self._f_meta(i, meta)
- else:
- yield from self._f(i)
- else:
- if self._f_meta:
- yield from self._f_meta(item, meta)
- else:
- yield from self._f(item)
-
-
-class ValidateItem(ItemTransform[TDataItem]):
- """Base class for validators of data items.
-
- Subclass should implement the `__call__` method to either return the data item(s) or raise `extract.exceptions.ValidationError`.
- See `PydanticValidator` for possible implementation.
- """
-
- placement_affinity: ClassVar[float] = 0.9 # stick to end but less than incremental
-
- table_name: str
-
- def bind(self, pipe: SupportsPipe) -> ItemTransform[TDataItem]:
- self.table_name = pipe.name
- return self
diff --git a/dlt/extract/items_transform.py b/dlt/extract/items_transform.py
new file mode 100644
index 0000000000..12375640bc
--- /dev/null
+++ b/dlt/extract/items_transform.py
@@ -0,0 +1,179 @@
+import inspect
+import time
+
+from abc import ABC, abstractmethod
+from typing import (
+ Any,
+ Callable,
+ ClassVar,
+ Generic,
+ Iterator,
+ Optional,
+ Union,
+)
+from concurrent.futures import Future
+
+from dlt.common.typing import (
+ TAny,
+ TDataItem,
+ TDataItems,
+)
+
+from dlt.extract.utils import (
+ wrap_iterator,
+)
+
+from dlt.extract.items import SupportsPipe
+
+
+ItemTransformFunctionWithMeta = Callable[[TDataItem, str], TAny]
+ItemTransformFunctionNoMeta = Callable[[TDataItem], TAny]
+ItemTransformFunc = Union[ItemTransformFunctionWithMeta[TAny], ItemTransformFunctionNoMeta[TAny]]
+
+
+class ItemTransform(ABC, Generic[TAny]):
+ _f_meta: ItemTransformFunctionWithMeta[TAny] = None
+ _f: ItemTransformFunctionNoMeta[TAny] = None
+
+ placement_affinity: ClassVar[float] = 0
+ """Tell how strongly an item sticks to start (-1) or end (+1) of pipe."""
+
+ def __init__(self, transform_f: ItemTransformFunc[TAny]) -> None:
+ # inspect the signature
+ sig = inspect.signature(transform_f)
+ # TODO: use TypeGuard here to get rid of type ignore
+ if len(sig.parameters) == 1:
+ self._f = transform_f # type: ignore
+ else: # TODO: do better check
+ self._f_meta = transform_f # type: ignore
+
+ def bind(self: "ItemTransform[TAny]", pipe: SupportsPipe) -> "ItemTransform[TAny]":
+ return self
+
+ @abstractmethod
+ def __call__(self, item: TDataItems, meta: Any = None) -> Optional[TDataItems]:
+ """Transforms `item` (a list of TDataItem or a single TDataItem) and returns or yields TDataItems. Returns None to consume item (filter out)"""
+ pass
+
+
+class FilterItem(ItemTransform[bool]):
+ # mypy needs those to type correctly
+ _f_meta: ItemTransformFunctionWithMeta[bool]
+ _f: ItemTransformFunctionNoMeta[bool]
+
+ def __call__(self, item: TDataItems, meta: Any = None) -> Optional[TDataItems]:
+ if isinstance(item, list):
+ # preserve empty lists
+ if len(item) == 0:
+ return item
+
+ if self._f_meta:
+ item = [i for i in item if self._f_meta(i, meta)]
+ else:
+ item = [i for i in item if self._f(i)]
+ if not item:
+ # item was fully consumed by the filter
+ return None
+ return item
+ else:
+ if self._f_meta:
+ return item if self._f_meta(item, meta) else None
+ else:
+ return item if self._f(item) else None
+
+
+class MapItem(ItemTransform[TDataItem]):
+ # mypy needs those to type correctly
+ _f_meta: ItemTransformFunctionWithMeta[TDataItem]
+ _f: ItemTransformFunctionNoMeta[TDataItem]
+
+ def __call__(self, item: TDataItems, meta: Any = None) -> Optional[TDataItems]:
+ if isinstance(item, list):
+ if self._f_meta:
+ return [self._f_meta(i, meta) for i in item]
+ else:
+ return [self._f(i) for i in item]
+ else:
+ if self._f_meta:
+ return self._f_meta(item, meta)
+ else:
+ return self._f(item)
+
+
+class YieldMapItem(ItemTransform[Iterator[TDataItem]]):
+ # mypy needs those to type correctly
+ _f_meta: ItemTransformFunctionWithMeta[TDataItem]
+ _f: ItemTransformFunctionNoMeta[TDataItem]
+
+ def __call__(self, item: TDataItems, meta: Any = None) -> Optional[TDataItems]:
+ if isinstance(item, list):
+ for i in item:
+ if self._f_meta:
+ yield from self._f_meta(i, meta)
+ else:
+ yield from self._f(i)
+ else:
+ if self._f_meta:
+ yield from self._f_meta(item, meta)
+ else:
+ yield from self._f(item)
+
+
+class ValidateItem(ItemTransform[TDataItem]):
+ """Base class for validators of data items.
+
+ Subclass should implement the `__call__` method to either return the data item(s) or raise `extract.exceptions.ValidationError`.
+ See `PydanticValidator` for possible implementation.
+ """
+
+ placement_affinity: ClassVar[float] = 0.9 # stick to end but less than incremental
+
+ table_name: str
+
+ def bind(self, pipe: SupportsPipe) -> ItemTransform[TDataItem]:
+ self.table_name = pipe.name
+ return self
+
+
+class LimitItem(ItemTransform[TDataItem]):
+    """Pipe step that stops a resource after `max_items` yields or `max_time` seconds."""
+
+    placement_affinity: ClassVar[float] = 1.1  # stick to end right behind incremental
+
+    def __init__(self, max_items: Optional[int], max_time: Optional[float]) -> None:
+        # -1 never equals the running count, so it stands for "no item limit"
+        self.max_items = max_items if max_items is not None else -1
+        self.max_time = max_time
+
+    def bind(self, pipe: SupportsPipe) -> "LimitItem":
+        """Resets the counters and the clock, and remembers the pipe's generating step."""
+        # we also wrap iterators to make them stoppable
+        if isinstance(pipe.gen, Iterator):
+            pipe.replace_gen(wrap_iterator(pipe.gen))
+
+        self.gen = pipe.gen
+        self.count = 0
+        self.exhausted = False
+        self.start_time = time.time()
+
+        return self
+
+    def __call__(self, item: TDataItems, meta: Any = None) -> Optional[TDataItems]:
+        """Passes `item` through until a limit is hit; returns None for anything after that."""
+        # do not return any late arriving items, also when it was the time limit that fired
+        if self.exhausted:
+            return None
+
+        self.count += 1
+        # max_time is compared against None so that 0 is honored as a limit, not as "no limit"
+        timed_out = (
+            self.max_time is not None and time.time() - self.start_time > self.max_time
+        )
+        if self.count == self.max_items or timed_out or self.max_items == 0:
+            self.exhausted = True
+            if inspect.isgenerator(self.gen):
+                self.gen.close()
+            # the item that reaches the limit is still returned, unless max_items is 0
+            return item if self.max_items != 0 else None
+
+        return item
diff --git a/dlt/extract/pipe.py b/dlt/extract/pipe.py
index 02b52c4623..e70365b4f4 100644
--- a/dlt/extract/pipe.py
+++ b/dlt/extract/pipe.py
@@ -27,12 +27,12 @@
UnclosablePipe,
)
from dlt.extract.items import (
- ItemTransform,
ResolvablePipeItem,
SupportsPipe,
TPipeStep,
TPipedDataItems,
)
+from dlt.extract.items_transform import ItemTransform
from dlt.extract.utils import (
check_compat_transformer,
simulate_func_call,
@@ -122,7 +122,23 @@ def steps(self) -> List[TPipeStep]:
def find(self, *step_type: AnyType) -> int:
"""Finds a step with object of type `step_type`"""
- return next((i for i, v in enumerate(self._steps) if isinstance(v, step_type)), -1)
+        found = self.find_all(*step_type)  # unpack so find_all gets the same flat type arguments
+        return found[0] if found else -1
+
+ def find_all(self, *step_type: AnyType) -> List[int]:
+ """Finds all steps with object of type `step_type`, returns their indexes in pipe order"""
+ return [i for i, v in enumerate(self._steps) if isinstance(v, step_type)]
+
+ def get_by_type(self, *step_type: AnyType) -> TPipeStep:
+ """Gets first step found with object of type `step_type`, or None if there is no such step"""
+ return next((v for v in self._steps if isinstance(v, step_type)), None)
+
+ def remove_by_type(self, *step_type: AnyType) -> int:
+ """Deletes first step of type `step_type`, returns its previous index or -1 if none found"""
+ step_index = self.find(*step_type)
+ if step_index >= 0:
+ self.remove_step(step_index)
+ return step_index
def __getitem__(self, i: int) -> TPipeStep:
return self._steps[i]
diff --git a/dlt/extract/pipe_iterator.py b/dlt/extract/pipe_iterator.py
index 465040f9f4..38641c0626 100644
--- a/dlt/extract/pipe_iterator.py
+++ b/dlt/extract/pipe_iterator.py
@@ -24,7 +24,11 @@
)
from dlt.common.configuration.container import Container
from dlt.common.exceptions import PipelineException
-from dlt.common.pipeline import unset_current_pipe_name, set_current_pipe_name
+from dlt.common.pipeline import (
+ unset_current_pipe_name,
+ set_current_pipe_name,
+ get_current_pipe_name,
+)
from dlt.common.utils import get_callable_name
from dlt.extract.exceptions import (
@@ -180,7 +184,6 @@ def __next__(self) -> PipeItem:
item = pipe_item.item
# if item is iterator, then add it as a new source
if isinstance(item, Iterator):
- # print(f"adding iterable {item}")
self._sources.append(
SourcePipeItem(item, pipe_item.step, pipe_item.pipe, pipe_item.meta)
)
@@ -291,7 +294,6 @@ def _get_source_item(self) -> ResolvablePipeItem:
first_evaluated_index = self._current_source_index
# always go round robin if None was returned or item is to be run as future
self._current_source_index = (self._current_source_index - 1) % sources_count
-
except StopIteration:
# remove empty iterator and try another source
self._sources.pop(self._current_source_index)
diff --git a/dlt/extract/resource.py b/dlt/extract/resource.py
index 42e3905162..366e6e1a88 100644
--- a/dlt/extract/resource.py
+++ b/dlt/extract/resource.py
@@ -2,7 +2,7 @@
from functools import partial
from typing import (
AsyncIterable,
- AsyncIterator,
+ cast,
ClassVar,
Callable,
Iterable,
@@ -34,13 +34,16 @@
from dlt.extract.items import (
DataItemWithMeta,
- ItemTransformFunc,
- ItemTransformFunctionWithMeta,
TableNameMeta,
+)
+from dlt.extract.items_transform import (
FilterItem,
MapItem,
YieldMapItem,
ValidateItem,
+ LimitItem,
+ ItemTransformFunc,
+ ItemTransformFunctionWithMeta,
)
from dlt.extract.pipe_iterator import ManagedPipeIterator
from dlt.extract.pipe import Pipe, TPipeStep
@@ -214,29 +217,22 @@ def requires_args(self) -> bool:
return True
@property
- def incremental(self) -> IncrementalResourceWrapper:
+ def incremental(self) -> Optional[IncrementalResourceWrapper]:
"""Gets incremental transform if it is in the pipe"""
- incremental: IncrementalResourceWrapper = None
- step_no = self._pipe.find(IncrementalResourceWrapper, Incremental)
- if step_no >= 0:
- incremental = self._pipe.steps[step_no] # type: ignore
- return incremental
+ return cast(
+ Optional[IncrementalResourceWrapper],
+ self._pipe.get_by_type(IncrementalResourceWrapper, Incremental),
+ )
@property
def validator(self) -> Optional[ValidateItem]:
"""Gets validator transform if it is in the pipe"""
- validator: ValidateItem = None
- step_no = self._pipe.find(ValidateItem)
- if step_no >= 0:
- validator = self._pipe.steps[step_no] # type: ignore[assignment]
- return validator
+ return cast(Optional[ValidateItem], self._pipe.get_by_type(ValidateItem))
@validator.setter
def validator(self, validator: Optional[ValidateItem]) -> None:
"""Add/remove or replace the validator in pipe"""
- step_no = self._pipe.find(ValidateItem)
- if step_no >= 0:
- self._pipe.remove_step(step_no)
+ step_no = self._pipe.remove_by_type(ValidateItem)
if validator:
self.add_step(validator, insert_at=step_no if step_no >= 0 else None)
@@ -347,72 +343,37 @@ def add_filter(
self._pipe.insert_step(FilterItem(item_filter), insert_at)
return self
- def add_limit(self: TDltResourceImpl, max_items: int) -> TDltResourceImpl: # noqa: A003
+ def add_limit(
+ self: TDltResourceImpl,
+ max_items: Optional[int] = None,
+ max_time: Optional[float] = None,
+ ) -> TDltResourceImpl: # noqa: A003
"""Adds a limit `max_items` to the resource pipe.
- This mutates the encapsulated generator to stop after `max_items` items are yielded. This is useful for testing and debugging.
+ This adds a `LimitItem` step that stops the resource after `max_items` items are yielded or after `max_time` seconds. This is useful for testing and debugging.
- Notes:
- 1. Transformers won't be limited. They should process all the data they receive fully to avoid inconsistencies in generated datasets.
- 2. Each yielded item may contain several records. `add_limit` only limits the "number of yields", not the total number of records.
- 3. Async resources with a limit added may occasionally produce one item more than the limit on some runs. This behavior is not deterministic.
+ Notes:
+ 1. Transformers won't be limited. They should process all the data they receive fully to avoid inconsistencies in generated datasets.
+ 2. Each yielded item may contain several records. `add_limit` only limits the "number of yields", not the total number of records.
+ 3. Async resources with a limit added may occasionally produce one item more than the limit on some runs. This behavior is not deterministic.
Args:
- max_items (int): The maximum number of items to yield
- Returns:
- "DltResource": returns self
+ max_items (Optional[int]): The maximum number of items to yield, set to None for no limit
+ max_time (Optional[float]): The maximum number of seconds for this generator to run after it was opened, set to None for no limit
+ Returns:
+ "DltResource": returns self
"""
- # make sure max_items is a number, to allow "None" as value for unlimited
- if max_items is None:
- max_items = -1
-
- def _gen_wrap(gen: TPipeStep) -> TPipeStep:
- """Wrap a generator to take the first `max_items` records"""
-
- # zero items should produce empty generator
- if max_items == 0:
- return
-
- count = 0
- is_async_gen = False
- if callable(gen):
- gen = gen() # type: ignore
-
- # wrap async gen already here
- if isinstance(gen, AsyncIterator):
- gen = wrap_async_iterator(gen)
- is_async_gen = True
-
- try:
- for i in gen: # type: ignore # TODO: help me fix this later
- yield i
- if i is not None:
- count += 1
- # async gen yields awaitable so we must count one awaitable more
- # so the previous one is evaluated and yielded.
- # new awaitable will be cancelled
- if count == max_items + int(is_async_gen):
- return
- finally:
- if inspect.isgenerator(gen):
- gen.close()
- return
-
- # transformers should be limited by their input, so we only limit non-transformers
- if not self.is_transformer:
- gen = self._pipe.gen
- # wrap gen directly
- if inspect.isgenerator(gen):
- self._pipe.replace_gen(_gen_wrap(gen))
- else:
- # keep function as function to not evaluate generators before pipe starts
- self._pipe.replace_gen(partial(_gen_wrap, gen))
- else:
+ if self.is_transformer:
logger.warning(
f"Setting add_limit to a transformer {self.name} has no effect. Set the limit on"
" the top level resource."
)
+ else:
+ # remove existing limit if any
+ self._pipe.remove_by_type(LimitItem)
+ self.add_step(LimitItem(max_items=max_items, max_time=max_time))
+
return self
def parallelize(self: TDltResourceImpl) -> TDltResourceImpl:
@@ -445,9 +406,7 @@ def add_step(
return self
def _remove_incremental_step(self) -> None:
- step_no = self._pipe.find(Incremental, IncrementalResourceWrapper)
- if step_no >= 0:
- self._pipe.remove_step(step_no)
+ self._pipe.remove_by_type(Incremental, IncrementalResourceWrapper)
def set_incremental(
self,
diff --git a/dlt/extract/utils.py b/dlt/extract/utils.py
index 68570d0995..0bcd13155e 100644
--- a/dlt/extract/utils.py
+++ b/dlt/extract/utils.py
@@ -183,6 +183,17 @@ def check_compat_transformer(name: str, f: AnyFun, sig: inspect.Signature) -> in
return meta_arg
+def wrap_iterator(gen: Iterator[TDataItems]) -> Iterator[TDataItems]:
+    """Return `gen` as a generator object.
+
+    Args:
+        gen: Any iterator of data items.
+
+    Returns:
+        `gen` itself when `inspect.isgenerator(gen)` is true, otherwise a new
+        generator that delegates to `gen` with `yield from`.
+    """
+    if not inspect.isgenerator(gen):
+
+        def _delegate() -> Iterator[TDataItems]:
+            # delegate to the plain iterator so the caller receives a real generator
+            yield from gen
+
+        return _delegate()
+
+    # already a generator, nothing to wrap
+    return gen
+
+
def wrap_async_iterator(
gen: AsyncIterator[TDataItems],
) -> Generator[Awaitable[TDataItems], None, None]:
diff --git a/dlt/extract/validation.py b/dlt/extract/validation.py
index 4cd321b88c..d9fe70a90b 100644
--- a/dlt/extract/validation.py
+++ b/dlt/extract/validation.py
@@ -8,7 +8,8 @@
from dlt.common.typing import TDataItems
from dlt.common.schema.typing import TAnySchemaColumns, TSchemaContract, TSchemaEvolutionMode
-from dlt.extract.items import TTableHintTemplate, ValidateItem
+from dlt.extract.items import TTableHintTemplate
+from dlt.extract.items_transform import ValidateItem
_TPydanticModel = TypeVar("_TPydanticModel", bound=PydanticBaseModel)
diff --git a/dlt/helpers/airflow_helper.py b/dlt/helpers/airflow_helper.py
index 99458a3949..aaa19ea97d 100644
--- a/dlt/helpers/airflow_helper.py
+++ b/dlt/helpers/airflow_helper.py
@@ -18,7 +18,7 @@
from airflow.configuration import conf
from airflow.models import TaskInstance
from airflow.utils.task_group import TaskGroup
- from airflow.operators.dummy import DummyOperator # type: ignore
+ from airflow.operators.dummy import DummyOperator
from airflow.operators.python import PythonOperator, get_current_context
except ModuleNotFoundError:
raise MissingDependencyException("Airflow", ["apache-airflow>=2.5"])
@@ -255,7 +255,7 @@ def _run(
# use task logger
if self.use_task_logger:
- ti: TaskInstance = get_current_context()["ti"] # type: ignore
+ ti: TaskInstance = get_current_context()["ti"] # type: ignore[assignment,unused-ignore]
logger.LOGGER = ti.log
# set global number of buffered items
diff --git a/dlt/helpers/dbt/profiles.yml b/dlt/helpers/dbt/profiles.yml
index a2a0014e4e..fd114478fb 100644
--- a/dlt/helpers/dbt/profiles.yml
+++ b/dlt/helpers/dbt/profiles.yml
@@ -83,6 +83,7 @@ duckdb:
extensions:
- httpfs
- parquet
+ - iceberg
# TODO: emit the config of duck db
motherduck:
diff --git a/dlt/common/libs/ibis.py b/dlt/helpers/ibis.py
similarity index 64%
rename from dlt/common/libs/ibis.py
rename to dlt/helpers/ibis.py
index ba6f363e66..e15bb9bc16 100644
--- a/dlt/common/libs/ibis.py
+++ b/dlt/helpers/ibis.py
@@ -1,14 +1,16 @@
-from typing import cast
+from typing import cast, Any
from dlt.common.exceptions import MissingDependencyException
-
from dlt.common.destination.reference import TDestinationReferenceArg, Destination, JobClientBase
+from dlt.common.schema import Schema
+from dlt.destinations.sql_client import SqlClientBase
try:
import ibis # type: ignore
- from ibis import BaseBackend
+ import sqlglot
+ from ibis import BaseBackend, Expr
except ModuleNotFoundError:
- raise MissingDependencyException("dlt ibis Helpers", ["ibis"])
+ raise MissingDependencyException("dlt ibis helpers", ["ibis-framework"])
SUPPORTED_DESTINATIONS = [
@@ -29,6 +31,22 @@
]
+# Map dlt data types to ibis data types
+DATA_TYPE_MAP = {
+ "text": "string",
+ "double": "float64",
+ "bool": "boolean",
+ "timestamp": "timestamp",
+ "bigint": "int64",
+ "binary": "binary",
+ "json": "string", # Store JSON as string in ibis
+ "decimal": "decimal",
+ "wei": "int64", # Wei is a large integer
+ "date": "date",
+ "time": "time",
+}
+
+
def create_ibis_backend(
destination: TDestinationReferenceArg, client: JobClientBase
) -> BaseBackend:
@@ -105,17 +123,55 @@ def create_ibis_backend(
)
from dlt.destinations.impl.duckdb.factory import DuckDbCredentials
- # we create an in memory duckdb and create all tables on there
- duck = duckdb.connect(":memory:")
+ # we create an in memory duckdb and create the ibis backend from it
fs_client = cast(FilesystemClient, client)
- creds = DuckDbCredentials(duck)
sql_client = FilesystemSqlClient(
- fs_client, dataset_name=fs_client.dataset_name, credentials=creds
+ fs_client,
+ dataset_name=fs_client.dataset_name,
+ credentials=DuckDbCredentials(duckdb.connect()),
)
-
+ # do not use context manager to not return and close the cloned connection
+ duckdb_conn = sql_client.open_connection()
+ # make all tables available here
# NOTE: we should probably have the option for the user to only select a subset of tables here
- with sql_client as _:
- sql_client.create_views_for_all_tables()
- con = ibis.duckdb.from_connection(duck)
+ sql_client.create_views_for_all_tables()
+ # why this works now: whenever a connection is cloned, all SET commands
+ # apply only to that clone. the old code was setting `curl` on the internal clone of sql_client;
+ # now we export this clone directly to ibis, so it works
+ con = ibis.duckdb.from_connection(duckdb_conn)
return con
+
+
+def create_unbound_ibis_table(
+    sql_client: SqlClientBase[Any], schema: Schema, table_name: str
+) -> Expr:
+    """Create an unbound ibis table from a dlt schema.
+
+    Args:
+        sql_client: Client whose capabilities casefold the column names and whose
+            `make_qualified_table_name_path` resolves the catalog/database/table path.
+        schema: dlt schema that must contain `table_name`.
+        table_name: Name of the table to look up in `schema.tables`.
+
+    Returns:
+        The unbound ibis table expression created with `ibis.table`.
+
+    Raises:
+        Exception: If `table_name` is not present in `schema.tables`.
+        KeyError: If a column declares a data type that is not a key of `DATA_TYPE_MAP`.
+    """
+
+    if table_name not in schema.tables:
+        raise Exception(
+            f"Table {table_name} not found in schema. Available tables: {schema.tables.keys()}"
+        )
+    table_schema = schema.tables[table_name]
+
+    # Convert dlt table schema columns to ibis schema.
+    # Columns without a `data_type` fall back to dlt "text" (ibis "string"). The fallback
+    # must be a dlt type name, not an ibis one, because it is looked up in DATA_TYPE_MAP.
+    ibis_schema = {
+        sql_client.capabilities.casefold_identifier(col_name): DATA_TYPE_MAP[
+            col_info.get("data_type", "text")
+        ]
+        for col_name, col_info in table_schema.get("columns", {}).items()
+    }
+
+    # normalize table name
+    table_path = sql_client.make_qualified_table_name_path(table_name, escape=False)
+
+    # a three-part path carries a catalog, otherwise only database and table are present
+    catalog = None
+    if len(table_path) == 3:
+        catalog, database, table = table_path
+    else:
+        database, table = table_path
+
+    # create the unbound ibis table and return it as is
+    unbound_table = ibis.table(schema=ibis_schema, name=table, database=database, catalog=catalog)
+
+    return unbound_table
diff --git a/dlt/normalize/normalize.py b/dlt/normalize/normalize.py
index 32db5034b4..1d81d70b10 100644
--- a/dlt/normalize/normalize.py
+++ b/dlt/normalize/normalize.py
@@ -20,7 +20,7 @@
LoadStorage,
ParsedLoadJobFileName,
)
-from dlt.common.schema import TSchemaUpdate, Schema
+from dlt.common.schema import Schema
from dlt.common.schema.exceptions import CannotCoerceColumnException
from dlt.common.pipeline import (
NormalizeInfo,
@@ -34,7 +34,7 @@
from dlt.normalize.configuration import NormalizeConfiguration
from dlt.normalize.exceptions import NormalizeJobFailed
from dlt.normalize.worker import w_normalize_files, group_worker_files, TWorkerRV
-from dlt.normalize.validate import verify_normalized_table
+from dlt.normalize.validate import validate_and_update_schema, verify_normalized_table
# normalize worker wrapping function signature
@@ -80,16 +80,6 @@ def create_storages(self) -> None:
config=self.config._load_storage_config,
)
- def update_schema(self, schema: Schema, schema_updates: List[TSchemaUpdate]) -> None:
- for schema_update in schema_updates:
- for table_name, table_updates in schema_update.items():
- logger.info(
- f"Updating schema for table {table_name} with {len(table_updates)} deltas"
- )
- for partial_table in table_updates:
- # merge columns where we expect identifiers to be normalized
- schema.update_table(partial_table, normalize_identifiers=False)
-
def map_parallel(self, schema: Schema, load_id: str, files: Sequence[str]) -> TWorkerRV:
workers: int = getattr(self.pool, "_max_workers", 1)
chunk_files = group_worker_files(files, workers)
@@ -123,7 +113,7 @@ def map_parallel(self, schema: Schema, load_id: str, files: Sequence[str]) -> TW
result: TWorkerRV = pending.result()
try:
# gather schema from all manifests, validate consistency and combine
- self.update_schema(schema, result[0])
+ validate_and_update_schema(schema, result[0])
summary.schema_updates.extend(result.schema_updates)
summary.file_metrics.extend(result.file_metrics)
# update metrics
@@ -162,7 +152,7 @@ def map_single(self, schema: Schema, load_id: str, files: Sequence[str]) -> TWor
load_id,
files,
)
- self.update_schema(schema, result.schema_updates)
+ validate_and_update_schema(schema, result.schema_updates)
self.collector.update("Files", len(result.file_metrics))
self.collector.update(
"Items", sum(result.file_metrics, EMPTY_DATA_WRITER_METRICS).items_count
@@ -237,23 +227,11 @@ def spool_schema_files(self, load_id: str, schema: Schema, files: Sequence[str])
self.load_storage.import_extracted_package(
load_id, self.normalize_storage.extracted_packages
)
- logger.info(f"Created new load package {load_id} on loading volume")
- try:
- # process parallel
- self.spool_files(
- load_id, schema.clone(update_normalizers=True), self.map_parallel, files
- )
- except CannotCoerceColumnException as exc:
- # schema conflicts resulting from parallel executing
- logger.warning(
- f"Parallel schema update conflict, switching to single thread ({str(exc)}"
- )
- # start from scratch
- self.load_storage.new_packages.delete_package(load_id)
- self.load_storage.import_extracted_package(
- load_id, self.normalize_storage.extracted_packages
- )
- self.spool_files(load_id, schema.clone(update_normalizers=True), self.map_single, files)
+ logger.info(f"Created new load package {load_id} on loading volume with ")
+ # get number of workers with default == 1 if not set (ie. NullExecutor)
+ workers: int = getattr(self.pool, "_max_workers", 1)
+ map_f: TMapFuncType = self.map_parallel if workers > 1 else self.map_single
+ self.spool_files(load_id, schema.clone(update_normalizers=True), map_f, files)
return load_id
diff --git a/dlt/normalize/validate.py b/dlt/normalize/validate.py
index 648deb5da9..868ba3115b 100644
--- a/dlt/normalize/validate.py
+++ b/dlt/normalize/validate.py
@@ -1,7 +1,10 @@
+from typing import List
+
from dlt.common.destination.capabilities import DestinationCapabilitiesContext
from dlt.common.schema import Schema
-from dlt.common.schema.typing import TTableSchema
+from dlt.common.schema.typing import TTableSchema, TSchemaUpdate
from dlt.common.schema.utils import (
+ ensure_compatible_tables,
find_incomplete_columns,
get_first_column_name_with_prop,
is_nested_table,
@@ -10,6 +13,21 @@
from dlt.common import logger
+def validate_and_update_schema(schema: Schema, schema_updates: List[TSchemaUpdate]) -> None:
+    """Validate the partial tables in `schema_updates` and merge them into `schema`.
+
+    For every table in every update, all partial tables are first checked with
+    `ensure_compatible_tables` against the table already stored in `schema` under the
+    partial table's own name (if any). Only after all checks for that table have run
+    are the partial tables merged with `schema.update_table`.
+
+    Args:
+        schema: Schema that is modified in place.
+        schema_updates: Schema updates, each mapping a table name to its partial tables.
+    """
+    for update in schema_updates:
+        for table_name, table_updates in update.items():
+            logger.info(f"Updating schema for table {table_name} with {len(table_updates)} deltas")
+
+            # first pass: make sure every delta is compatible before touching the schema
+            for delta in table_updates:
+                existing_table = schema.tables.get(delta["name"])
+                if existing_table:
+                    ensure_compatible_tables(schema.name, existing_table, delta)
+
+            # second pass: merge the deltas, identifiers are expected to be normalized already
+            for delta in table_updates:
+                schema.update_table(delta, normalize_identifiers=False)
+
+
def verify_normalized_table(
schema: Schema, table: TTableSchema, capabilities: DestinationCapabilitiesContext
) -> None:
diff --git a/dlt/pipeline/pipeline.py b/dlt/pipeline/pipeline.py
index 70d160ea67..74466a09e4 100644
--- a/dlt/pipeline/pipeline.py
+++ b/dlt/pipeline/pipeline.py
@@ -1750,10 +1750,18 @@ def __getstate__(self) -> Any:
# pickle only the SupportsPipeline protocol fields
return {"pipeline_name": self.pipeline_name}
- def _dataset(
- self, schema: Union[Schema, str, None] = None, dataset_type: TDatasetType = "dbapi"
+ def dataset(
+ self, schema: Union[Schema, str, None] = None, dataset_type: TDatasetType = "auto"
) -> SupportsReadableDataset:
- """Access helper to dataset"""
+ """Returns a dataset object for querying the destination data.
+
+ Args:
+ schema: Schema name or Schema object to use. If None, uses the default schema if set.
+ dataset_type: Type of dataset interface to return. Defaults to 'auto', which selects ibis if available
+ and otherwise falls back to the standard dbapi interface.
+ Returns:
+ A dataset object that supports querying the destination data.
+ """
if schema is None:
schema = self.default_schema if self.default_schema_name else None
return dataset(
diff --git a/dlt/sources/helpers/transform.py b/dlt/sources/helpers/transform.py
index 32843e2aa2..45738fe4fb 100644
--- a/dlt/sources/helpers/transform.py
+++ b/dlt/sources/helpers/transform.py
@@ -2,7 +2,7 @@
from typing import Any, Dict, Sequence, Union
from dlt.common.typing import TDataItem
-from dlt.extract.items import ItemTransformFunctionNoMeta
+from dlt.extract.items_transform import ItemTransformFunctionNoMeta
import jsonpath_ng
diff --git a/dlt/sources/rest_api/config_setup.py b/dlt/sources/rest_api/config_setup.py
index d03a4fd59b..bf62c6c4f7 100644
--- a/dlt/sources/rest_api/config_setup.py
+++ b/dlt/sources/rest_api/config_setup.py
@@ -20,6 +20,7 @@
from dlt.common.configuration import resolve_configuration
from dlt.common.schema.utils import merge_columns
from dlt.common.utils import update_dict_nested, exclude_keys
+from dlt.common.typing import add_value_to_literal
from dlt.common import jsonpath
from dlt.extract.incremental import Incremental
@@ -64,6 +65,8 @@
ResponseActionDict,
Endpoint,
EndpointResource,
+ AuthType,
+ PaginatorType,
)
@@ -103,6 +106,7 @@ def register_paginator(
"Your custom paginator has to be a subclass of BasePaginator"
)
PAGINATOR_MAP[paginator_name] = paginator_class
+ add_value_to_literal(PaginatorType, paginator_name)
def get_paginator_class(paginator_name: str) -> Type[BasePaginator]:
@@ -153,6 +157,8 @@ def register_auth(
)
AUTH_MAP[auth_name] = auth_class
+ add_value_to_literal(AuthType, auth_name)
+
def get_auth_class(auth_type: str) -> Type[AuthConfigBase]:
try:
@@ -285,7 +291,7 @@ def build_resource_dependency_graph(
resolved_param_map[resource_name] = None
break
assert isinstance(endpoint_resource["endpoint"], dict)
- # connect transformers to resources via resolved params
+ # find resolved parameters to connect dependent resources
resolved_params = _find_resolved_params(endpoint_resource["endpoint"])
# set of resources in resolved params
diff --git a/dlt/sources/sql_database/helpers.py b/dlt/sources/sql_database/helpers.py
index a8be2a6427..ee38c7dd98 100644
--- a/dlt/sources/sql_database/helpers.py
+++ b/dlt/sources/sql_database/helpers.py
@@ -94,12 +94,16 @@ def __init__(
self.end_value = incremental.end_value
self.row_order: TSortOrder = self.incremental.row_order
self.on_cursor_value_missing = self.incremental.on_cursor_value_missing
+ self.range_start = self.incremental.range_start
+ self.range_end = self.incremental.range_end
else:
self.cursor_column = None
self.last_value = None
self.end_value = None
self.row_order = None
self.on_cursor_value_missing = None
+ self.range_start = None
+ self.range_end = None
def _make_query(self) -> SelectAny:
table = self.table
@@ -110,11 +114,11 @@ def _make_query(self) -> SelectAny:
# generate where
if last_value_func is max: # Query ordered and filtered according to last_value function
- filter_op = operator.ge
- filter_op_end = operator.lt
+ filter_op = operator.ge if self.range_start == "closed" else operator.gt
+ filter_op_end = operator.lt if self.range_end == "open" else operator.le
elif last_value_func is min:
- filter_op = operator.le
- filter_op_end = operator.gt
+ filter_op = operator.le if self.range_start == "closed" else operator.lt
+ filter_op_end = operator.gt if self.range_end == "open" else operator.ge
else: # Custom last_value, load everything and let incremental handle filtering
return query # type: ignore[no-any-return]
diff --git a/docs/examples/backfill_in_chunks/__init__.py b/docs/examples/backfill_in_chunks/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/docs/examples/backfill_in_chunks/backfill_in_chunks.py b/docs/examples/backfill_in_chunks/backfill_in_chunks.py
new file mode 100644
index 0000000000..a758d67f7b
--- /dev/null
+++ b/docs/examples/backfill_in_chunks/backfill_in_chunks.py
@@ -0,0 +1,85 @@
+"""
+---
+title: Backfilling in chunks
+description: Learn how to backfill in chunks of defined size
+keywords: [incremental loading, backfilling, chunks, example]
+---
+
+In this example, you'll find a Python script that will load from a sql_database source in chunks of defined size. This is useful for backfilling in multiple pipeline runs as
+opposed to backfilling in one very large pipeline run which may fail due to memory issues on ephemeral storage or just take a very long time to complete without seeing any
+progress in the destination.
+
+We'll learn how to:
+
+- Connect to a mysql database with the sql_database source
+- Select one table to load and apply incremental loading hints as well as the primary key
+- Set the chunk size and limit the number of chunks to load in one pipeline run
+- Create a pipeline and backfill the table in the defined chunks
+- Use the datasets accessor to inspect and assert the load progress
+
+"""
+
+import pandas as pd
+
+import dlt
+from dlt.sources.sql_database import sql_database
+
+
+if __name__ == "__main__":
+ # NOTE: this is a live table in the rfam database, so the number of final rows may change
+ TOTAL_TABLE_ROWS = 4178
+ RFAM_CONNECTION_STRING = "mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam"
+
+ # create sql database source that only loads the family table in chunks of 1000 rows
+ source = sql_database(RFAM_CONNECTION_STRING, table_names=["family"], chunk_size=1000)
+
+ # we apply some hints to the table, we know the rfam_id is unique and that we can order
+ # and load incrementally on the created datetime column
+ source.family.apply_hints(
+ primary_key="rfam_id",
+ incremental=dlt.sources.incremental(
+ cursor_path="created", initial_value=None, row_order="asc"
+ ),
+ )
+
+ # with limit we can limit the number of chunks to load, with a chunk size of 1000 and a limit of 1
+ # we will load 1000 rows per pipeline run
+ source.add_limit(1)
+
+ # create pipeline
+ pipeline = dlt.pipeline(
+ pipeline_name="rfam", destination="duckdb", dataset_name="rfam_data", dev_mode=True
+ )
+
+ def _assert_unique_row_count(df: pd.DataFrame, num_rows: int) -> None:
+ """Assert that a dataframe has the correct number of unique rows"""
+ # NOTE: this check is dependent on reading the full table back from the destination into memory,
+ # so it is only useful for testing before you do a large backfill.
+ assert len(df) == num_rows
+ assert len(set(df.rfam_id.tolist())) == num_rows
+
+ # after the first run, the family table in the destination should contain the first 1000 rows
+ pipeline.run(source)
+ _assert_unique_row_count(pipeline.dataset().family.df(), 1000)
+
+ # after the second run, the family table in the destination should contain 1999 rows
+ # there is some overlap on the incremental to prevent skipping rows
+ pipeline.run(source)
+ _assert_unique_row_count(pipeline.dataset().family.df(), 1999)
+
+ # ...
+ pipeline.run(source)
+ _assert_unique_row_count(pipeline.dataset().family.df(), 2998)
+
+ # ...
+ pipeline.run(source)
+ _assert_unique_row_count(pipeline.dataset().family.df(), 3997)
+
+ # the final run will load all the rows until the end of the table
+ pipeline.run(source)
+ _assert_unique_row_count(pipeline.dataset().family.df(), TOTAL_TABLE_ROWS)
+
+ # NOTE: in a production environment you will likely:
+ # * be using much larger chunk sizes and limits
+ # * run the pipeline in a loop to load all the rows
+ # * and programmatically check if the table is fully loaded and abort the loop if this is the case.
diff --git a/docs/tools/check_embedded_snippets.py b/docs/tools/check_embedded_snippets.py
index e8399fce6e..b917cafee1 100644
--- a/docs/tools/check_embedded_snippets.py
+++ b/docs/tools/check_embedded_snippets.py
@@ -21,7 +21,7 @@
SNIPPET_MARKER = "```"
-ALLOWED_LANGUAGES = ["py", "toml", "json", "yaml", "text", "sh", "bat", "sql"]
+ALLOWED_LANGUAGES = ["py", "toml", "json", "yaml", "text", "sh", "bat", "sql", "hcl"]
LINT_TEMPLATE = "./lint_setup/template.py"
LINT_FILE = "./lint_setup/lint_me.py"
@@ -163,8 +163,11 @@ def parse_snippets(snippets: List[Snippet], verbose: bool) -> None:
json.loads(snippet.code)
elif snippet.language == "yaml":
yaml.safe_load(snippet.code)
- # ignore text and sh scripts
- elif snippet.language in ["text", "sh", "bat", "sql"]:
+ elif snippet.language == "hcl":
+ # TODO: implement hcl parsers
+ pass
+ # ignore all other scripts
+ elif snippet.language in ALLOWED_LANGUAGES:
pass
else:
raise ValueError(f"Unknown language {snippet.language}")
diff --git a/docs/website/docs/build-a-pipeline-tutorial.md b/docs/website/docs/build-a-pipeline-tutorial.md
index f85d2e19ea..36d30a184f 100644
--- a/docs/website/docs/build-a-pipeline-tutorial.md
+++ b/docs/website/docs/build-a-pipeline-tutorial.md
@@ -262,20 +262,30 @@ In this example, the first pipeline loads the data using `pipedrive_source()`. T
#### [Using the `dlt` SQL client](dlt-ecosystem/transformations/sql.md)
-Another option is to leverage the `dlt` SQL client to query the loaded data and perform transformations using SQL statements. You can execute SQL statements that change the database schema or manipulate data within tables. Here's an example of inserting a row into the `customers` table using the `dlt` SQL client:
+Another option is to leverage the `dlt` SQL client to query the loaded data and perform transformations using SQL statements. You can execute SQL statements that change the database schema or manipulate data within tables. Here's an example of creating a new table with aggregated sales data in duckdb:
```py
-pipeline = dlt.pipeline(destination="bigquery", dataset_name="crm")
+pipeline = dlt.pipeline(destination="duckdb", dataset_name="crm")
with pipeline.sql_client() as client:
client.execute_sql(
- "INSERT INTO customers VALUES (%s, %s, %s)", 10, "Fred", "fred@fred.com"
- )
+ """ CREATE TABLE aggregated_sales AS
+ SELECT
+ category,
+ region,
+ SUM(amount) AS total_sales,
+ AVG(amount) AS average_sales
+ FROM
+ sales
+ GROUP BY
+ category,
+ region;
+ """)
```
In this example, the `execute_sql` method of the SQL client allows you to execute SQL statements. The statement creates a new `aggregated_sales` table that holds the total and average sales per category and region.
-#### [Using Pandas](dlt-ecosystem/transformations/pandas.md)
+#### [Using Pandas](dlt-ecosystem/transformations/python.md)
You can fetch query results as Pandas data frames and perform transformations using Pandas functionalities. Here's an example of reading data from the `issues` table in DuckDB and counting reaction types using Pandas:
@@ -287,11 +297,8 @@ pipeline = dlt.pipeline(
dev_mode=True
)
-with pipeline.sql_client() as client:
- with client.execute_query(
- 'SELECT "reactions__+1", "reactions__-1", reactions__laugh, reactions__hooray, reactions__rocket FROM issues'
- ) as cursor:
- reactions = cursor.df()
+# get a dataframe of all reactions from the dataset
+reactions = pipeline.dataset().issues.select("reactions__+1", "reactions__-1", "reactions__laugh", "reactions__hooray", "reactions__rocket").df()
counts = reactions.sum(0).sort_values(0, ascending=False)
```
diff --git a/docs/website/docs/dlt-ecosystem/destinations/clickhouse.md b/docs/website/docs/dlt-ecosystem/destinations/clickhouse.md
index 3bd1ae8e15..40ee5d71e8 100644
--- a/docs/website/docs/dlt-ecosystem/destinations/clickhouse.md
+++ b/docs/website/docs/dlt-ecosystem/destinations/clickhouse.md
@@ -229,8 +229,7 @@ To set up GCS staging with HMAC authentication in dlt:
1. Create HMAC keys for your GCS service account by following the [Google Cloud guide](https://cloud.google.com/storage/docs/authentication/managing-hmackeys#create).
-2. Configure the HMAC keys (`aws_access_key_id` and `aws_secret_access_key`) in your dlt project's ClickHouse destination settings in `config.toml`, similar to how you would configure AWS S3
- credentials:
+2. Configure the HMAC keys (`aws_access_key_id` and `aws_secret_access_key`) as well as `endpoint_url` in your dlt project's ClickHouse destination settings in `config.toml`, similar to how you would configure AWS S3 credentials:
```toml
[destination.filesystem]
diff --git a/docs/website/docs/dlt-ecosystem/destinations/databricks.md b/docs/website/docs/dlt-ecosystem/destinations/databricks.md
index 513a3b792f..a28a42f761 100644
--- a/docs/website/docs/dlt-ecosystem/destinations/databricks.md
+++ b/docs/website/docs/dlt-ecosystem/destinations/databricks.md
@@ -52,7 +52,7 @@ If you already have your Databricks workspace set up, you can skip to the [Loade
Add a new role assignment and select "Storage Blob Data Contributor" as the role. Under "Members" select "Managed Identity" and add the Databricks Access Connector you created in the previous step.
-### 2. Set up a metastore and Unity Catalog and get your access token
+### 2. Set up a metastore and Unity Catalog
1. Now go to your Databricks workspace
@@ -85,10 +85,123 @@ If you already have your Databricks workspace set up, you can skip to the [Loade
Go to "Catalog" and click "Create Catalog". Name your catalog and select the storage location you created in the previous step.
-8. Create your access token
+## Authentication
- Click your email in the top right corner and go to "User Settings". Go to "Developer" -> "Access Tokens".
- Generate a new token and save it. You will use it in your `dlt` configuration.
+`dlt` currently supports two options for authentication:
+1. [OAuth2](#using-oauth2) (recommended) allows you to authenticate to Databricks using a service principal via OAuth2 M2M.
+2. [Access token](#using-access-token) approach using a developer access token. This method may be deprecated in the future by Databricks.
+
+### Using OAuth2
+
+You can authenticate to Databricks using a service principal via OAuth2 M2M. To enable it:
+
+1. Follow the instructions in the Databricks documentation: [Authenticate access to Databricks using OAuth M2M](https://docs.databricks.com/en/dev-tools/auth/oauth-m2m.html)
+to create a service principal and retrieve the `client_id` and `client_secret`.
+
+2. Once you have the service principal credentials, update your credentials with any of the options shown below:
+
+
+
+
+
+```toml
+# secrets.toml
+[destination.databricks.credentials]
+server_hostname = "MY_DATABRICKS.azuredatabricks.net"
+http_path = "/sql/1.0/warehouses/12345"
+catalog = "my_catalog"
+client_id = "XXX"
+client_secret = "XXX"
+```
+
+
+
+
+```sh
+export DESTINATIONS__DATABRICKS__CREDENTIALS__SERVER_HOSTNAME="MY_DATABRICKS.azuredatabricks.net"
+export DESTINATIONS__DATABRICKS__CREDENTIALS__HTTP_PATH="/sql/1.0/warehouses/12345"
+export DESTINATIONS__DATABRICKS__CREDENTIALS__CATALOG="my_catalog"
+export DESTINATIONS__DATABRICKS__CREDENTIALS__CLIENT_ID="XXX"
+export DESTINATIONS__DATABRICKS__CREDENTIALS__CLIENT_SECRET="XXX"
+```
+
+
+
+
+```py
+import os
+
+# Do not set up the secrets directly in the code!
+# What you can do is reassign env variables.
+os.environ["DESTINATIONS__DATABRICKS__CREDENTIALS__SERVER_HOSTNAME"] = "MY_DATABRICKS.azuredatabricks.net"
+os.environ["DESTINATIONS__DATABRICKS__CREDENTIALS__HTTP_PATH"]="/sql/1.0/warehouses/12345"
+os.environ["DESTINATIONS__DATABRICKS__CREDENTIALS__CATALOG"]="my_catalog"
+os.environ["DESTINATIONS__DATABRICKS__CREDENTIALS__CLIENT_ID"]=os.environ.get("CLIENT_ID")
+os.environ["DESTINATIONS__DATABRICKS__CREDENTIALS__CLIENT_SECRET"]=os.environ.get("CLIENT_SECRET")
+```
+
+
+
+### Using access token
+
+To create your access token:
+
+1. Click your email in the top right corner and go to "User Settings". Go to "Developer" -> "Access Tokens".
+Generate a new token and save it.
+2. Set up credentials in a desired way:
+
+
+
+
+
+```toml
+# secrets.toml
+[destination.databricks.credentials]
+server_hostname = "MY_DATABRICKS.azuredatabricks.net"
+http_path = "/sql/1.0/warehouses/12345"
+catalog = "my_catalog"
+access_token = "XXX"
+```
+
+
+
+
+```sh
+export DESTINATIONS__DATABRICKS__CREDENTIALS__SERVER_HOSTNAME="MY_DATABRICKS.azuredatabricks.net"
+export DESTINATIONS__DATABRICKS__CREDENTIALS__HTTP_PATH="/sql/1.0/warehouses/12345"
+export DESTINATIONS__DATABRICKS__CREDENTIALS__CATALOG="my_catalog"
+export DESTINATIONS__DATABRICKS__CREDENTIALS__ACCESS_TOKEN="XXX"
+```
+
+
+
+
+```py
+import os
+
+# Do not set up the secrets directly in the code!
+# What you can do is reassign env variables.
+os.environ["DESTINATIONS__DATABRICKS__CREDENTIALS__SERVER_HOSTNAME"] = "MY_DATABRICKS.azuredatabricks.net"
+os.environ["DESTINATIONS__DATABRICKS__CREDENTIALS__HTTP_PATH"]="/sql/1.0/warehouses/12345"
+os.environ["DESTINATIONS__DATABRICKS__CREDENTIALS__CATALOG"]="my_catalog"
+os.environ["DESTINATIONS__DATABRICKS__CREDENTIALS__ACCESS_TOKEN"]=os.environ.get("ACCESS_TOKEN")
+```
+
+
## Loader setup guide
@@ -106,9 +219,9 @@ pip install -r requirements.txt
This will install dlt with the `databricks` extra, which contains the Databricks Python dbapi client.
-**4. Enter your credentials into `.dlt/secrets.toml`.**
+**3. Enter your credentials into `.dlt/secrets.toml`.**
-This should include your connection parameters and your personal access token.
+This should include your connection parameters and your authentication credentials.
You can find your server hostname and HTTP path in the Databricks workspace dashboard. Go to "SQL Warehouses", select your warehouse (default is called "Starter Warehouse"), and go to "Connection details".
@@ -118,11 +231,14 @@ Example:
[destination.databricks.credentials]
server_hostname = "MY_DATABRICKS.azuredatabricks.net"
http_path = "/sql/1.0/warehouses/12345"
-access_token = "MY_ACCESS_TOKEN"
+client_id = "XXX"
+client_secret = "XXX"
catalog = "my_catalog"
```
-See [staging support](#staging-support) for authentication options when `dlt` copies files from buckets.
+You can find other options for specifying credentials in the [Authentication section](#authentication).
+
+See [Staging support](#staging-support) for authentication options when `dlt` copies files from buckets.
## Write disposition
All write dispositions are supported.
@@ -132,8 +248,7 @@ To load data into Databricks, you must set up a staging filesystem by configurin
dlt will upload the data in Parquet files (or JSONL, if configured) to the bucket and then use `COPY INTO` statements to ingest the data into Databricks.
-For more information on staging, see the [staging support](#staging-support) section below.
-
+For more information on staging, see the [Staging support](#staging-support) section below.
## Supported file formats
* [Parquet](../file-formats/parquet.md) supported when staging is enabled.
@@ -141,13 +256,13 @@ For more information on staging, see the [staging support](#staging-support) sec
The JSONL format has some limitations when used with Databricks:
-1. Compression must be disabled to load jsonl files in Databricks. Set `data_writer.disable_compression` to `true` in the dlt config when using this format.
+1. Compression must be disabled to load JSONL files in Databricks. Set `data_writer.disable_compression` to `true` in the dlt config when using this format.
2. The following data types are not supported when using the JSONL format with `databricks`: `decimal`, `json`, `date`, `binary`. Use `parquet` if your data contains these types.
3. The `bigint` data type with precision is not supported with the JSONL format.
## Staging support
-Databricks supports both Amazon S3, Azure Blob Storage and Google Cloud Storage as staging locations. `dlt` will upload files in Parquet format to the staging location and will instruct Databricks to load data from there.
+Databricks supports Amazon S3, Azure Blob Storage, and Google Cloud Storage as staging locations. `dlt` will upload files in Parquet format to the staging location and will instruct Databricks to load data from there.
### Databricks and Amazon S3
@@ -155,19 +270,50 @@ Please refer to the [S3 documentation](./filesystem.md#aws-s3) for details on co
Example to set up Databricks with S3 as a staging destination:
+
+
+
+
+```toml
+# secrets.toml
+[destination.filesystem]
+bucket_url = "s3://your-bucket-name"
+
+[destination.filesystem.credentials]
+aws_access_key_id="XXX"
+aws_secret_access_key="XXX"
+```
+
+
+
+
+```sh
+export DESTINATIONS__FILESYSTEM__BUCKET_URL="s3://your-bucket-name"
+export DESTINATIONS__FILESYSTEM__CREDENTIALS__AWS_ACCESS_KEY_ID="XXX"
+export DESTINATIONS__FILESYSTEM__CREDENTIALS__AWS_SECRET_ACCESS_KEY="XXX"
+```
+
+
+
+
```py
-import dlt
+import os
-# Create a dlt pipeline that will load
-# chess player data to the Databricks destination
-# via staging on S3
-pipeline = dlt.pipeline(
- pipeline_name='chess_pipeline',
- destination='databricks',
- staging=dlt.destinations.filesystem('s3://your-bucket-name'), # add this to activate the staging location
- dataset_name='player_data',
-)
+# Do not set up the secrets directly in the code!
+# What you can do is reassign env variables.
+os.environ["DESTINATIONS__FILESYSTEM__BUCKET_URL"] = "s3://your-bucket-name"
+os.environ["DESTINATIONS__FILESYSTEM__CREDENTIALS__AWS_ACCESS_KEY_ID"] = os.environ.get("AWS_ACCESS_KEY_ID")
+os.environ["DESTINATIONS__FILESYSTEM__CREDENTIALS__AWS_SECRET_ACCESS_KEY"] = os.environ.get("AWS_SECRET_ACCESS_KEY")
```
+
+
### Databricks and Azure Blob Storage
@@ -186,22 +332,54 @@ dlt is able to adapt the other representation (i.e., `az://container-name/path`)
Example to set up Databricks with Azure as a staging destination:
+
+
+
+
+```toml
+# secrets.toml
+[destination.filesystem]
+bucket_url = "abfss://container_name@storage_account_name.dfs.core.windows.net/path"
+
+[destination.filesystem.credentials]
+azure_storage_account_name="XXX"
+azure_storage_account_key="XXX"
+```
+
+
+
+
+```sh
+export DESTINATIONS__FILESYSTEM__BUCKET_URL="abfss://container_name@storage_account_name.dfs.core.windows.net/path"
+export DESTINATIONS__FILESYSTEM__CREDENTIALS__AZURE_STORAGE_ACCOUNT_NAME="XXX"
+export DESTINATIONS__FILESYSTEM__CREDENTIALS__AZURE_STORAGE_ACCOUNT_KEY="XXX"
+```
+
+
+
+
```py
-# Create a dlt pipeline that will load
-# chess player data to the Databricks destination
-# via staging on Azure Blob Storage
-pipeline = dlt.pipeline(
- pipeline_name='chess_pipeline',
- destination='databricks',
- staging=dlt.destinations.filesystem('abfss://dlt-ci-data@dltdata.dfs.core.windows.net'), # add this to activate the staging location
- dataset_name='player_data'
-)
+import os
+
+# Do not set up the secrets directly in the code!
+# What you can do is reassign env variables.
+os.environ["DESTINATIONS__FILESYSTEM__BUCKET_URL"] = "abfss://container_name@storage_account_name.dfs.core.windows.net/path"
+os.environ["DESTINATIONS__FILESYSTEM__CREDENTIALS__AZURE_STORAGE_ACCOUNT_NAME"] = os.environ.get("AZURE_STORAGE_ACCOUNT_NAME")
+os.environ["DESTINATIONS__FILESYSTEM__CREDENTIALS__AZURE_STORAGE_ACCOUNT_KEY"] = os.environ.get("AZURE_STORAGE_ACCOUNT_KEY")
```
+
+
### Databricks and Google Cloud Storage
-In order to load from Google Cloud Storage stage you must set-up the credentials via **named credential**. See below. Databricks does not allow to pass Google Credentials
-explicitly in SQL Statements.
+In order to load from Google Cloud Storage stage, you must set up the credentials via a **named credential**. See below. Databricks does not allow you to pass Google Credentials explicitly in SQL statements.
### Use external locations and stored credentials
`dlt` forwards bucket credentials to the `COPY INTO` SQL command by default. You may prefer to use [external locations or stored credentials instead](https://docs.databricks.com/en/sql/language-manual/sql-ref-external-locations.html#external-location) that are stored on the Databricks side.
@@ -212,7 +390,7 @@ If you set up an external location for your staging path, you can tell `dlt` to
is_staging_external_location=true
```
-If you set up Databricks credentials named, for example, **credential_x**, you can tell `dlt` to use it:
+If you set up Databricks credentials named, for example, **credential_x**, you can tell `dlt` to use them:
```toml
[destination.databricks]
staging_credentials_name="credential_x"
@@ -233,8 +411,8 @@ This destination [integrates with dbt](../transformations/dbt/dbt.md) via [dbt-d
### Syncing of `dlt` state
This destination fully supports [dlt state sync](../../general-usage/state#syncing-state-with-destination).
-### Databricks User Agent
-We enable Databricks to identify that the connection is created by dlt.
+### Databricks user agent
+We enable Databricks to identify that the connection is created by `dlt`.
Databricks will use this user agent identifier to better understand the usage patterns associated with dlt integration. The connection identifier is `dltHub_dlt`.
diff --git a/docs/website/docs/dlt-ecosystem/destinations/delta-iceberg.md b/docs/website/docs/dlt-ecosystem/destinations/delta-iceberg.md
new file mode 100644
index 0000000000..7a056d6b40
--- /dev/null
+++ b/docs/website/docs/dlt-ecosystem/destinations/delta-iceberg.md
@@ -0,0 +1,168 @@
+---
+title: Delta / Iceberg
+description: Delta / Iceberg `dlt` destination
+keywords: [delta, iceberg, destination, data warehouse]
+---
+
+# Delta and Iceberg table formats
+`dlt` supports writing [Delta](https://delta.io/) and [Iceberg](https://iceberg.apache.org/) tables when using the [filesystem](./filesystem.md) destination.
+
+## How it works
+`dlt` uses the [deltalake](https://pypi.org/project/deltalake/) and [pyiceberg](https://pypi.org/project/pyiceberg/) libraries to write Delta and Iceberg tables, respectively. One or multiple Parquet files are prepared during the extract and normalize steps. In the load step, these Parquet files are exposed as an Arrow data structure and fed into `deltalake` or `pyiceberg`.
+
+## Iceberg single-user ephemeral catalog
+`dlt` uses single-table, ephemeral, in-memory, sqlite-based [Iceberg catalogs](https://iceberg.apache.org/concepts/catalog/). These catalogs are created "on demand" when a pipeline is run, and do not persist afterwards. If a table already exists in the filesystem, it gets registered into the catalog using its latest metadata file. This allows for a serverless setup. It is currently not possible to connect your own Iceberg catalog.
+
+:::caution
+While ephemeral catalogs make it easy to get started with Iceberg, they come with limitations:
+- concurrent writes are not handled and may lead to corrupt table state
+- we cannot guarantee that reads concurrent with writes are clean
+- the latest manifest file needs to be searched for using file listing—this can become slow with large tables, especially in cloud object stores
+:::
+
+## Delta dependencies
+
+You need the `deltalake` package to use this format:
+
+```sh
+pip install "dlt[deltalake]"
+```
+
+You also need `pyarrow>=17.0.0`:
+
+```sh
+pip install 'pyarrow>=17.0.0'
+```
+
+## Iceberg dependencies
+
+You need Python version 3.9 or higher and the `pyiceberg` package to use this format:
+
+```sh
+pip install "dlt[pyiceberg]"
+```
+
+You also need `sqlalchemy>=2.0.18`:
+
+```sh
+pip install 'sqlalchemy>=2.0.18'
+```
+
+## Set table format
+
+Set the `table_format` argument to `delta` or `iceberg` when defining your resource:
+
+```py
+@dlt.resource(table_format="delta")
+def my_delta_resource():
+ ...
+```
+
+or when calling `run` on your pipeline:
+
+```py
+pipeline.run(my_resource, table_format="delta")
+```
+
+:::note
+`dlt` always uses Parquet as `loader_file_format` when using the `delta` or `iceberg` table format. Any setting of `loader_file_format` is disregarded.
+:::
+
+
+## Table format partitioning
+Both `delta` and `iceberg` tables can be partitioned by specifying one or more `partition` column hints. This example partitions a Delta table by the `foo` column:
+
+```py
+@dlt.resource(
+ table_format="delta",
+ columns={"foo": {"partition": True}}
+)
+def my_delta_resource():
+ ...
+```
+
+:::note
+Delta uses [Hive-style partitioning](https://delta.io/blog/pros-cons-hive-style-partionining/), while Iceberg uses [hidden partitioning](https://iceberg.apache.org/docs/latest/partitioning/).
+:::
+
+:::caution
+Partition evolution (changing partition columns after a table has been created) is not supported.
+:::
+
+## Table access helper functions
+You can use the `get_delta_tables` and `get_iceberg_tables` helper functions to access native table objects. For `delta`, these are `deltalake` [DeltaTable](https://delta-io.github.io/delta-rs/api/delta_table/) objects; for `iceberg`, these are `pyiceberg` [Table](https://py.iceberg.apache.org/reference/pyiceberg/table/#pyiceberg.table.Table) objects.
+
+```py
+from dlt.common.libs.deltalake import get_delta_tables
+# from dlt.common.libs.pyiceberg import get_iceberg_tables
+
+...
+
+# get dictionary of DeltaTable objects
+delta_tables = get_delta_tables(pipeline)
+
+# execute operations on DeltaTable objects
+delta_tables["my_delta_table"].optimize.compact()
+delta_tables["another_delta_table"].optimize.z_order(["col_a", "col_b"])
+# delta_tables["my_delta_table"].vacuum()
+# etc.
+```
+
+## Table format Google Cloud Storage authentication
+
+Note that not all authentication methods are supported when using table formats on Google Cloud Storage:
+
+| Authentication method | `delta` | `iceberg` |
+| -- | -- | -- |
+| [Service Account](bigquery.md#setup-guide) | ✅ | ❌ |
+| [OAuth](../destinations/bigquery.md#oauth-20-authentication) | ❌ | ✅ |
+| [Application Default Credentials](bigquery.md#using-default-credentials) | ✅ | ❌ |
+
+:::note
+The [S3-compatible](#using-s3-compatible-storage) interface for Google Cloud Storage is not supported when using `iceberg`.
+:::
+
+## Iceberg Azure scheme
+The `az` [scheme](#supported-schemes) is not supported when using the `iceberg` table format. Please use the `abfss` scheme. This is because `pyiceberg`, which `dlt` uses under the hood, currently does not support `az`.
+
+## Table format `merge` support (**experimental**)
+The [`upsert`](../../general-usage/incremental-loading.md#upsert-strategy) merge strategy is supported for `delta`. For `iceberg`, the `merge` write disposition is not supported and falls back to `append`.
+
+:::caution
+The `upsert` merge strategy for the filesystem destination with Delta table format is **experimental**.
+:::
+
+```py
+@dlt.resource(
+ write_disposition={"disposition": "merge", "strategy": "upsert"},
+ primary_key="my_primary_key",
+ table_format="delta"
+)
+def my_upsert_resource():
+ ...
+...
+```
+
+### Known limitations
+- `hard_delete` hint not supported
+- Deleting records from nested tables not supported
+ - This means updates to JSON columns that involve element removals are not propagated. For example, if you first load `{"key": 1, "nested": [1, 2]}` and then load `{"key": 1, "nested": [1]}`, then the record for element `2` will not be deleted from the nested table.
+
+## Delta table format storage options
+You can pass storage options by configuring `destination.filesystem.deltalake_storage_options`:
+
+```toml
+[destination.filesystem]
+deltalake_storage_options = '{"AWS_S3_LOCKING_PROVIDER": "dynamodb", "DELTA_DYNAMO_TABLE_NAME": "custom_table_name"}'
+```
+
+`dlt` passes these options to the `storage_options` argument of the `write_deltalake` method in the `deltalake` library. Look at their [documentation](https://delta-io.github.io/delta-rs/api/delta_writer/#deltalake.write_deltalake) to see which options can be used.
+
+You don't need to specify credentials here. `dlt` merges the required credentials with the options you provided before passing them as `storage_options`.
+
+>❗When using `s3`, you need to specify storage options to [configure](https://delta-io.github.io/delta-rs/usage/writing/writing-to-s3-with-locking-provider/) locking behavior.
+
+## Delta table format memory usage
+:::caution
+Beware that when loading a large amount of data for one table, the underlying rust implementation will consume a lot of memory. This is a known issue and the maintainers are actively working on a solution. You can track the progress [here](https://github.com/delta-io/delta-rs/pull/2289). Until the issue is resolved, you can mitigate the memory consumption by doing multiple smaller incremental pipeline runs.
+:::
\ No newline at end of file
diff --git a/docs/website/docs/dlt-ecosystem/destinations/duckdb.md b/docs/website/docs/dlt-ecosystem/destinations/duckdb.md
index 2b284e991a..a4537195ff 100644
--- a/docs/website/docs/dlt-ecosystem/destinations/duckdb.md
+++ b/docs/website/docs/dlt-ecosystem/destinations/duckdb.md
@@ -118,7 +118,7 @@ to disable tz adjustments.
## Destination configuration
-By default, a DuckDB database will be created in the current working directory with a name `.duckdb` (`chess.duckdb` in the example above). After loading, it is available in `read/write` mode via `with pipeline.sql_client() as con:`, which is a wrapper over `DuckDBPyConnection`. See [duckdb docs](https://duckdb.org/docs/api/python/overview#persistent-storage) for details.
+By default, a DuckDB database will be created in the current working directory with a name `<pipeline_name>.duckdb` (`chess.duckdb` in the example above). After loading, it is available in **read/write** mode via `with pipeline.sql_client() as con:`, which is a wrapper over `DuckDBPyConnection`. See [duckdb docs](https://duckdb.org/docs/api/python/overview#persistent-storage) for details. If you want to **read** data, use [pipeline.dataset()](../../general-usage/dataset-access/dataset) instead of `sql_client`.
The `duckdb` credentials do not require any secret values. [You are free to pass the credentials and configuration explicitly](../../general-usage/destination.md#pass-explicit-credentials). For example:
```py
diff --git a/docs/website/docs/dlt-ecosystem/destinations/filesystem.md b/docs/website/docs/dlt-ecosystem/destinations/filesystem.md
index 9b243b9429..de3d12e8e1 100644
--- a/docs/website/docs/dlt-ecosystem/destinations/filesystem.md
+++ b/docs/website/docs/dlt-ecosystem/destinations/filesystem.md
@@ -108,7 +108,8 @@ You need to create an S3 bucket and a user who can access that bucket. dlt does
#### Using S3 compatible storage
-To use an S3 compatible storage other than AWS S3, such as [MinIO](https://min.io/) or [Cloudflare R2](https://www.cloudflare.com/en-ca/developer-platform/r2/), you may supply an `endpoint_url` in the config. This should be set along with AWS credentials:
+To use an S3 compatible storage other than AWS S3, such as [MinIO](https://min.io/), [Cloudflare R2](https://www.cloudflare.com/en-ca/developer-platform/r2/) or [Google
+Cloud Storage](https://cloud.google.com/storage/docs/interoperability), you may supply an `endpoint_url` in the config. This should be set along with AWS credentials:
```toml
[destination.filesystem]
@@ -166,6 +167,8 @@ Run `pip install "dlt[az]"` which will install the `adlfs` package to interface
Edit the credentials in `.dlt/secrets.toml`, you'll see AWS credentials by default; replace them with your Azure credentials.
+#### Supported schemes
+
`dlt` supports both forms of the blob storage urls:
```toml
[destination.filesystem]
@@ -404,29 +407,6 @@ The filesystem destination handles the write dispositions as follows:
- `replace` - all files that belong to such tables are deleted from the dataset folder, and then the current set of files is added.
- `merge` - falls back to `append`
-### Merge with Delta table format (experimental)
-The [`upsert`](../../general-usage/incremental-loading.md#upsert-strategy) merge strategy is supported when using the [Delta table format](#delta-table-format).
-
-:::caution
-The `upsert` merge strategy for the filesystem destination with Delta table format is experimental.
-:::
-
-```py
-@dlt.resource(
- write_disposition={"disposition": "merge", "strategy": "upsert"},
- primary_key="my_primary_key",
- table_format="delta"
-)
-def my_upsert_resource():
- ...
-...
-```
-
-#### Known limitations
-- `hard_delete` hint not supported
-- Deleting records from nested tables not supported
- - This means updates to JSON columns that involve element removals are not propagated. For example, if you first load `{"key": 1, "nested": [1, 2]}` and then load `{"key": 1, "nested": [1]}`, then the record for element `2` will not be deleted from the nested table.
-
## File compression
The filesystem destination in the dlt library uses `gzip` compression by default for efficiency, which may result in the files being stored in a compressed format. This format may not be easily readable as plain text or JSON Lines (`jsonl`) files. If you encounter files that seem unreadable, they may be compressed.
@@ -645,88 +625,9 @@ You can choose the following file formats:
## Supported table formats
-You can choose the following table formats:
-* [Delta table](../table-formats/delta.md) is supported
-
-### Delta table format
-
-You need the `deltalake` package to use this format:
-
-```sh
-pip install "dlt[deltalake]"
-```
-
-You also need `pyarrow>=17.0.0`:
-
-```sh
-pip install 'pyarrow>=17.0.0'
-```
-
-Set the `table_format` argument to `delta` when defining your resource:
-
-```py
-@dlt.resource(table_format="delta")
-def my_delta_resource():
- ...
-```
-
-:::note
-`dlt` always uses Parquet as `loader_file_format` when using the `delta` table format. Any setting of `loader_file_format` is disregarded.
-:::
-
-:::caution
-Beware that when loading a large amount of data for one table, the underlying rust implementation will consume a lot of memory. This is a known issue and the maintainers are actively working on a solution. You can track the progress [here](https://github.com/delta-io/delta-rs/pull/2289). Until the issue is resolved, you can mitigate the memory consumption by doing multiple smaller incremental pipeline runs.
-:::
-
-#### Delta table partitioning
-A Delta table can be partitioned ([Hive-style partitioning](https://delta.io/blog/pros-cons-hive-style-partionining/)) by specifying one or more `partition` column hints. This example partitions the Delta table by the `foo` column:
-
-```py
-@dlt.resource(
- table_format="delta",
- columns={"foo": {"partition": True}}
-)
-def my_delta_resource():
- ...
-```
-
-:::caution
-It is **not** possible to change partition columns after the Delta table has been created. Trying to do so causes an error stating that the partition columns don't match.
-:::
-
-
-#### Storage options
-You can pass storage options by configuring `destination.filesystem.deltalake_storage_options`:
-
-```toml
-[destination.filesystem]
-deltalake_storage_options = '{"AWS_S3_LOCKING_PROVIDER": "dynamodb", "DELTA_DYNAMO_TABLE_NAME": "custom_table_name"}'
-```
-
-`dlt` passes these options to the `storage_options` argument of the `write_deltalake` method in the `deltalake` library. Look at their [documentation](https://delta-io.github.io/delta-rs/api/delta_writer/#deltalake.write_deltalake) to see which options can be used.
-
-You don't need to specify credentials here. `dlt` merges the required credentials with the options you provided before passing it as `storage_options`.
-
->❗When using `s3`, you need to specify storage options to [configure](https://delta-io.github.io/delta-rs/usage/writing/writing-to-s3-with-locking-provider/) locking behavior.
-
-#### `get_delta_tables` helper
-You can use the `get_delta_tables` helper function to get `deltalake` [DeltaTable](https://delta-io.github.io/delta-rs/api/delta_table/) objects for your Delta tables:
-
-```py
-from dlt.common.libs.deltalake import get_delta_tables
-
-...
-
-# get dictionary of DeltaTable objects
-delta_tables = get_delta_tables(pipeline)
-
-# execute operations on DeltaTable objects
-delta_tables["my_delta_table"].optimize.compact()
-delta_tables["another_delta_table"].optimize.z_order(["col_a", "col_b"])
-# delta_tables["my_delta_table"].vacuum()
-# etc.
-
-```
+You can choose the following [table formats](./delta-iceberg.md):
+* Delta table
+* Iceberg
## Syncing of dlt state
This destination fully supports [dlt state sync](../../general-usage/state#syncing-state-with-destination). To this end, special folders and files will be created at your destination which hold information about your pipeline state, schemas, and completed loads. These folders DO NOT respect your settings in the layout section. When using filesystem as a staging destination, not all of these folders are created, as the state and schemas are managed in the regular way by the final destination you have configured.
diff --git a/docs/website/docs/dlt-ecosystem/destinations/snowflake.md b/docs/website/docs/dlt-ecosystem/destinations/snowflake.md
index 07cf822973..28684c39ac 100644
--- a/docs/website/docs/dlt-ecosystem/destinations/snowflake.md
+++ b/docs/website/docs/dlt-ecosystem/destinations/snowflake.md
@@ -200,6 +200,12 @@ Note that we ignore missing columns `ERROR_ON_COLUMN_COUNT_MISMATCH = FALSE` and
## Supported column hints
Snowflake supports the following [column hints](../../general-usage/schema#tables-and-columns):
* `cluster` - Creates a cluster column(s). Many columns per table are supported and only when a new table is created.
+* `unique` - Creates UNIQUE hint on a Snowflake column, can be added to many columns. ([optional](#additional-destination-options))
+* `primary_key` - Creates PRIMARY KEY on selected column(s), may be compound. ([optional](#additional-destination-options))
+
+`unique` and `primary_key` are not enforced and `dlt` does not instruct Snowflake to `RELY` on them when
+query planning.
+
## Table and column identifiers
Snowflake supports both case-sensitive and case-insensitive identifiers. All unquoted and uppercase identifiers resolve case-insensitively in SQL statements. Case-insensitive [naming conventions](../../general-usage/naming-convention.md#case-sensitive-and-insensitive-destinations) like the default **snake_case** will generate case-insensitive identifiers. Case-sensitive (like **sql_cs_v1**) will generate
@@ -308,6 +314,7 @@ pipeline = dlt.pipeline(
## Additional destination options
You can define your own stage to PUT files and disable the removal of the staged files after loading.
+You can also opt in to [create indexes](#supported-column-hints).
```toml
[destination.snowflake]
@@ -315,6 +322,8 @@ You can define your own stage to PUT files and disable the removal of the staged
stage_name="DLT_STAGE"
# Whether to keep or delete the staged files after COPY INTO succeeds
keep_staged_files=true
+# Add UNIQUE and PRIMARY KEY hints to tables
+create_indexes=true
```
### Setting up CSV format
diff --git a/docs/website/docs/dlt-ecosystem/table-formats/iceberg.md b/docs/website/docs/dlt-ecosystem/table-formats/iceberg.md
index 233ae0ce21..edca521e52 100644
--- a/docs/website/docs/dlt-ecosystem/table-formats/iceberg.md
+++ b/docs/website/docs/dlt-ecosystem/table-formats/iceberg.md
@@ -10,5 +10,5 @@ keywords: [iceberg, table formats]
## Supported destinations
-Supported by: **Athena**
+Supported by: **Athena**, **filesystem**
diff --git a/docs/website/docs/dlt-ecosystem/transformations/dbt/dbt.md b/docs/website/docs/dlt-ecosystem/transformations/dbt/dbt.md
index 449f8b8bde..59eb340ef2 100644
--- a/docs/website/docs/dlt-ecosystem/transformations/dbt/dbt.md
+++ b/docs/website/docs/dlt-ecosystem/transformations/dbt/dbt.md
@@ -1,10 +1,10 @@
---
-title: Transform the data with dbt
+title: Transforming data with dbt
description: Transforming the data loaded by a dlt pipeline with dbt
keywords: [transform, dbt, runner]
---
-# Transform the data with dbt
+# Transforming data with dbt
[dbt](https://github.com/dbt-labs/dbt-core) is a framework that allows for the simple structuring of your transformations into DAGs. The benefits of using dbt include:
@@ -105,8 +105,8 @@ You can run the example with dbt debug log: `RUNTIME__LOG_LEVEL=DEBUG python dbt
## Other transforming tools
-If you want to transform the data before loading, you can use Python. If you want to transform the data after loading, you can use dbt or one of the following:
+If you want to transform your data before loading, you can use Python. If you want to transform your data after loading, you can use dbt or one of the following:
1. [`dlt` SQL client.](../sql.md)
-2. [Pandas.](../pandas.md)
+2. [Python with dataframes or arrow tables.](../python.md)
diff --git a/docs/website/docs/dlt-ecosystem/transformations/index.md b/docs/website/docs/dlt-ecosystem/transformations/index.md
new file mode 100644
index 0000000000..6c51e8cd8d
--- /dev/null
+++ b/docs/website/docs/dlt-ecosystem/transformations/index.md
@@ -0,0 +1,27 @@
+---
+title: Transforming your data
+description: How to transform your data
+keywords: [datasets, data, access, transformations]
+---
+import DocCardList from '@theme/DocCardList';
+
+# Transforming data
+
+If you'd like to transform your data after a pipeline load, you have 3 options available to you:
+
+* [Using dbt](./dbt/dbt.md) - dlt provides a convenient dbt wrapper to make integration easier.
+* [Using the `dlt` SQL client](./sql.md) - dlt exposes an SQL client to transform data on your destination directly using SQL.
+* [Using Python with DataFrames or Arrow tables](./python.md) - you can also transform your data using Arrow tables and DataFrames in Python.
+
+If you need to preprocess some of your data before it is loaded, you can learn about strategies to:
+
+* [Rename columns.](../../general-usage/customising-pipelines/renaming_columns)
+* [Pseudonymize columns.](../../general-usage/customising-pipelines/pseudonymizing_columns)
+* [Remove columns.](../../general-usage/customising-pipelines/removing_columns)
+
+This is particularly useful if you are trying to remove data related to PII or other sensitive data, if you want to remove columns that are not needed for your use case, or if you are using a destination that does not support certain data types in your source data.
+
+
+# Learn more
+
+
diff --git a/docs/website/docs/dlt-ecosystem/transformations/pandas.md b/docs/website/docs/dlt-ecosystem/transformations/pandas.md
deleted file mode 100644
index e431313d1c..0000000000
--- a/docs/website/docs/dlt-ecosystem/transformations/pandas.md
+++ /dev/null
@@ -1,42 +0,0 @@
----
-title: Transform the data with Pandas
-description: Transform the data loaded by a dlt pipeline with Pandas
-keywords: [transform, pandas]
----
-
-# Transform the data with Pandas
-
-You can fetch the results of any SQL query as a dataframe. If the destination supports that
-natively (i.e., BigQuery and DuckDB), `dlt` uses the native method. Thanks to this, reading
-dataframes can be really fast! The example below reads GitHub reactions data from the `issues` table and
-counts the reaction types.
-
-```py
-pipeline = dlt.pipeline(
- pipeline_name="github_pipeline",
- destination="duckdb",
- dataset_name="github_reactions",
- dev_mode=True
-)
-with pipeline.sql_client() as client:
- with client.execute_query(
- 'SELECT "reactions__+1", "reactions__-1", reactions__laugh, reactions__hooray, reactions__rocket FROM issues'
- ) as cursor:
- # calling `df` on a cursor, returns the data as a pandas data frame
- reactions = cursor.df()
-counts = reactions.sum(0).sort_values(0, ascending=False)
-```
-
-The `df` method above returns all the data in the cursor as a data frame. You can also fetch data in
-chunks by passing the `chunk_size` argument to the `df` method.
-
-Once your data is in a Pandas dataframe, you can transform it as needed.
-
-## Other transforming tools
-
-If you want to transform the data before loading, you can use Python. If you want to transform the
-data after loading, you can use Pandas or one of the following:
-
-1. [dbt.](dbt/dbt.md) (recommended)
-2. [`dlt` SQL client.](sql.md)
-
diff --git a/docs/website/docs/dlt-ecosystem/transformations/python.md b/docs/website/docs/dlt-ecosystem/transformations/python.md
new file mode 100644
index 0000000000..d43f8caaca
--- /dev/null
+++ b/docs/website/docs/dlt-ecosystem/transformations/python.md
@@ -0,0 +1,109 @@
+---
+title: Transforming data in Python with Arrow tables or DataFrames
+description: Transforming data loaded by a dlt pipeline with pandas dataframes or arrow tables
+keywords: [transform, pandas]
+---
+
+# Transforming data in Python with Arrow tables or DataFrames
+
+You can transform your data in Python using Pandas DataFrames or Arrow tables. To get started, please read the [dataset docs](../../general-usage/dataset-access/dataset).
+
+
+## Interactively transforming your data in Python
+
+Using the methods explained in the [dataset docs](../../general-usage/dataset-access/dataset), you can fetch data from your destination into a DataFrame or Arrow table in your local Python process and work with it interactively. This even works for filesystem destinations:
+
+
+The example below reads GitHub reactions data from the `issues` table and
+counts the reaction types.
+
+```py
+pipeline = dlt.pipeline(
+ pipeline_name="github_pipeline",
+ destination="duckdb",
+ dataset_name="github_reactions",
+ dev_mode=True
+)
+
+# get a dataframe of all reactions from the dataset
+reactions = pipeline.dataset().issues.select("reactions__+1", "reactions__-1", "reactions__laugh", "reactions__hooray", "reactions__rocket").df()
+
+# calculate and print out the sum of all reactions
+counts = reactions.sum(0).sort_values(0, ascending=False)
+print(counts)
+
+# alternatively, you can fetch the data as an arrow table
+reactions = pipeline.dataset().issues.select("reactions__+1", "reactions__-1", "reactions__laugh", "reactions__hooray", "reactions__rocket").arrow()
+# ... do transformations on the arrow table
+```
+
+## Persisting your transformed data
+
+Since dlt supports DataFrames and Arrow tables from resources directly, you can use the same pipeline to load the transformed data back into the destination.
+
+
+### A simple example
+
+This simple example creates a new table from an existing `users` table, keeping only the columns that do not contain private information. Note that we use the `iter_arrow()` method on the relation to iterate over the Arrow table in chunks instead of fetching it all at once.
+
+```py
+pipeline = dlt.pipeline(
+ pipeline_name="users_pipeline",
+ destination="duckdb",
+ dataset_name="users_raw",
+ dev_mode=True
+)
+
+# get user relation with only a few columns selected, but omitting email and name
+users = pipeline.dataset().users.select("age", "amount_spent", "country")
+
+# load the data into a new table called users_clean in the same dataset
+pipeline.run(users.iter_arrow(chunk_size=1000), table_name="users_clean")
+```
+
+### A more complex example
+
+The example above could easily be done in SQL. Let's assume you'd like to do some Arrow transformations in Python instead. For this, we will create a resource from which we can yield the modified Arrow tables. The same is possible with DataFrames.
+
+```py
+import pyarrow.compute as pc
+
+pipeline = dlt.pipeline(
+ pipeline_name="users_pipeline",
+ destination="duckdb",
+ dataset_name="users_raw",
+ dev_mode=True
+)
+
+# NOTE: this resource will work like a regular resource and support write_disposition, primary_key, etc.
+# NOTE: For selecting only users above 18, we could also use the filter method on the relation with ibis expressions
+@dlt.resource(table_name="users_clean")
+def users_clean():
+ users = pipeline.dataset().users
+ for arrow_table in users.iter_arrow(chunk_size=1000):
+
+ # we want to filter out users under 18
+ age_filter = pc.greater_equal(arrow_table["age"], 18)
+ arrow_table = arrow_table.filter(age_filter)
+
+ # we want to hash the email column
+ arrow_table = arrow_table.append_column("email_hash", pc.sha256(arrow_table["email"]))
+
+ # we want to remove the email column and name column
+ arrow_table = arrow_table.drop(["email", "name"])
+
+ # yield the transformed arrow table
+ yield arrow_table
+
+
+pipeline.run(users_clean())
+```
+
+## Other transforming tools
+
+If you want to transform your data before loading, you can use Python. If you want to transform the
+data after loading, you can use Pandas or one of the following:
+
+1. [dbt.](dbt/dbt.md) (recommended)
+2. [`dlt` SQL client.](sql.md)
+
diff --git a/docs/website/docs/dlt-ecosystem/transformations/sql.md b/docs/website/docs/dlt-ecosystem/transformations/sql.md
index ffd348d1a0..60f3e7f7a5 100644
--- a/docs/website/docs/dlt-ecosystem/transformations/sql.md
+++ b/docs/website/docs/dlt-ecosystem/transformations/sql.md
@@ -1,33 +1,52 @@
---
-title: Transform the data with SQL
+title: Transforming data with SQL
description: Transforming the data loaded by a dlt pipeline with the dlt SQL client
keywords: [transform, sql]
---
-# Transform the data using the `dlt` SQL client
+# Transforming data using the `dlt` SQL client
A simple alternative to dbt is to query the data using the `dlt` SQL client and then perform the
-transformations using Python. The `execute_sql` method allows you to execute any SQL statement,
+transformations using SQL statements in Python. The `execute_sql` method allows you to execute any SQL statement,
including statements that change the database schema or data in the tables. In the example below, we
insert a row into the `customers` table. Note that the syntax is the same as for any standard `dbapi`
connection.
+:::info
+* This method will work for all SQL destinations supported by `dlt`, but not for the filesystem destination.
+* Read the [SQL client docs](../../general-usage/dataset-access/sql-client) for more information on how to access data with the SQL client.
+* If you are simply trying to read data, you should use the powerful [dataset interface](../../general-usage/dataset-access/dataset) instead.
+:::
+
+
+Typically you will use this type of transformation if you can create or update tables directly from existing tables
+without any need to insert data from your Python environment.
+
+The example below creates a new table `aggregated_sales` that contains the total and average sales for each category and region:
+
+
```py
-pipeline = dlt.pipeline(destination="bigquery", dataset_name="crm")
-try:
- with pipeline.sql_client() as client:
- client.execute_sql(
- "INSERT INTO customers VALUES (%s, %s, %s)",
- 10,
- "Fred",
- "fred@fred.com"
- )
-except Exception:
- ...
+pipeline = dlt.pipeline(destination="duckdb", dataset_name="crm")
+
+# NOTE: this is the duckdb sql dialect, other destinations may use different expressions
+with pipeline.sql_client() as client:
+ client.execute_sql(
+ """ CREATE OR REPLACE TABLE aggregated_sales AS
+ SELECT
+ category,
+ region,
+ SUM(amount) AS total_sales,
+ AVG(amount) AS average_sales
+ FROM
+ sales
+ GROUP BY
+ category,
+ region;
+ """)
```
-In the case of SELECT queries, the data is returned as a list of rows, with the elements of a row
-corresponding to selected columns.
+You can also use the `execute_sql` method to run select queries. The data is returned as a list of rows, with the elements of a row
+corresponding to selected columns. A more convenient way to extract data is to use dlt datasets.
```py
try:
@@ -44,9 +63,9 @@ except Exception:
## Other transforming tools
-If you want to transform the data before loading, you can use Python. If you want to transform the
+If you want to transform your data before loading, you can use Python. If you want to transform the
data after loading, you can use SQL or one of the following:
1. [dbt](dbt/dbt.md) (recommended).
-2. [Pandas](pandas.md).
+2. [Python with DataFrames or Arrow tables](python.md).
diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api/basic.md b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api/basic.md
index 14d9ecb04b..ea3c9c768b 100644
--- a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api/basic.md
+++ b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api/basic.md
@@ -306,7 +306,7 @@ A resource configuration is used to define a [dlt resource](../../../general-usa
- `write_disposition`: The write disposition for the resource.
- `primary_key`: The primary key for the resource.
- `include_from_parent`: A list of fields from the parent resource to be included in the resource output. See the [resource relationships](#include-fields-from-the-parent-resource) section for more details.
-- `processing_steps`: A list of [processing steps](#processing-steps-filter-and-transform-data) to filter and transform the data.
+- `processing_steps`: A list of [processing steps](#processing-steps-filter-and-transform-data) to filter and transform your data.
- `selected`: A flag to indicate if the resource is selected for loading. This could be useful when you want to load data only from child resources and not from the parent resource.
- `auth`: An optional `AuthConfig` instance. If passed, is used over the one defined in the [client](#client) definition. Example:
```py
diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/sql_database/advanced.md b/docs/website/docs/dlt-ecosystem/verified-sources/sql_database/advanced.md
index 6ff3a267d2..954c1fb493 100644
--- a/docs/website/docs/dlt-ecosystem/verified-sources/sql_database/advanced.md
+++ b/docs/website/docs/dlt-ecosystem/verified-sources/sql_database/advanced.md
@@ -16,7 +16,7 @@ Efficient data management often requires loading only new or updated data from y
Incremental loading uses a cursor column (e.g., timestamp or auto-incrementing ID) to load only data newer than a specified initial value, enhancing efficiency by reducing processing time and resource use. Read [here](../../../walkthroughs/sql-incremental-configuration) for more details on incremental loading with `dlt`.
-#### How to configure
+### How to configure
1. **Choose a cursor column**: Identify a column in your SQL table that can serve as a reliable indicator of new or updated rows. Common choices include timestamp columns or auto-incrementing IDs.
1. **Set an initial value**: Choose a starting value for the cursor to begin loading data. This could be a specific timestamp or ID from which you wish to start loading data.
1. **Deduplication**: When using incremental loading, the system automatically handles the deduplication of rows based on the primary key (if available) or row hash for tables without a primary key.
@@ -27,7 +27,7 @@ Incremental loading uses a cursor column (e.g., timestamp or auto-incrementing I
If your cursor column name contains special characters (e.g., `$`) you need to escape it when passing it to the `incremental` function. For example, if your cursor column is `example_$column`, you should pass it as `"'example_$column'"` or `'"example_$column"'` to the `incremental` function: `incremental("'example_$column'", initial_value=...)`.
:::
-#### Examples
+### Examples
1. **Incremental loading with the resource `sql_table`**.
@@ -52,7 +52,7 @@ If your cursor column name contains special characters (e.g., `$`) you need to e
print(extract_info)
```
- Behind the scene, the loader generates a SQL query filtering rows with `last_modified` values greater than the incremental value. In the first run, this is the initial value (midnight (00:00:00) January 1, 2024).
+ Behind the scenes, the loader generates a SQL query filtering rows with `last_modified` values greater than or equal to the incremental value. In the first run, this is the initial value (midnight (00:00:00) January 1, 2024).
In subsequent runs, it is the latest value of `last_modified` that `dlt` stores in [state](../../../general-usage/state).
2. **Incremental loading with the source `sql_database`**.
@@ -78,6 +78,49 @@ If your cursor column name contains special characters (e.g., `$`) you need to e
* `apply_hints` is a powerful method that enables schema modifications after resource creation, like adjusting write disposition and primary keys. You can choose from various tables and use `apply_hints` multiple times to create pipelines with merged, appended, or replaced resources.
:::
+### Inclusive and exclusive filtering
+
+By default the incremental filtering is inclusive on the start value side so that
+rows with cursor equal to the last run's cursor are fetched again from the database.
+
+The SQL query generated looks something like this (assuming `last_value_func` is `max`):
+
+```sql
+SELECT * FROM family
+WHERE last_modified >= :start_value
+ORDER BY last_modified ASC
+```
+
+That means some rows overlapping with the previous load are fetched from the database.
+Duplicates are then filtered out by dlt using either the primary key or a hash of the row's contents.
+
+This ensures there are no gaps in the extracted sequence. But it does come with some performance overhead,
+both due to the deduplication processing and the cost of fetching redundant records from the database.
+
+This is not always needed. If you know that your data does not contain overlapping cursor values then you
+can optimize extraction by passing `range_start="open"` to incremental.
+
+This both disables the deduplication process and changes the operator used in the SQL `WHERE` clause from `>=` (greater-or-equal) to `>` (greater than), so that no overlapping rows are fetched.
+
+E.g.
+
+```py
+table = sql_table(
+ table='family',
+ incremental=dlt.sources.incremental(
+ 'last_modified', # Cursor column name
+ initial_value=pendulum.DateTime(2024, 1, 1, 0, 0, 0), # Initial cursor value
+ range_start="open", # exclude the start value
+ )
+)
+```
+
+It's a good option if:
+
+* The cursor is an auto incrementing ID
+* The cursor is a high precision timestamp and two records are never created at exactly the same time
+* Your pipeline runs are timed in such a way that new data is not generated during the load
+
## Parallelized extraction
You can extract each table in a separate thread (no multiprocessing at this point). This will decrease loading time if your queries take time to execute or your network latency/speed is low. To enable this, declare your sources/resources as follows:
@@ -213,3 +256,24 @@ SOURCES__SQL_DATABASE__CHUNK_SIZE=1000
SOURCES__SQL_DATABASE__CHAT_MESSAGE__INCREMENTAL__CURSOR_PATH=updated_at
```
+### Configure many sources side by side with custom sections
+`dlt` allows you to rename any source to place the source configuration into a custom section or to have many instances
+of the source created side by side. For example:
+```py
+from dlt.sources.sql_database import sql_database
+
+my_db = sql_database.with_args(name="my_db", section="my_db")(table_names=["chat_message"])
+print(my_db.name)
+```
+Here we create a renamed version of the `sql_database` and then instantiate it. Such a source will read
+credentials from:
+```toml
+[sources.my_db]
+credentials="mssql+pyodbc://loader.database.windows.net/dlt_data?trusted_connection=yes&driver=ODBC+Driver+17+for+SQL+Server"
+schema="data"
+backend="pandas"
+chunk_size=1000
+
+[sources.my_db.chat_message.incremental]
+cursor_path="updated_at"
+```
diff --git a/docs/website/docs/general-usage/dataset-access/dataset.md b/docs/website/docs/general-usage/dataset-access/dataset.md
index 68635383c5..f9c01603f6 100644
--- a/docs/website/docs/general-usage/dataset-access/dataset.md
+++ b/docs/website/docs/general-usage/dataset-access/dataset.md
@@ -19,7 +19,7 @@ Here's a full example of how to retrieve data from a pipeline and load it into a
# and you have loaded data to a table named 'items' in the destination
# Step 1: Get the readable dataset from the pipeline
-dataset = pipeline._dataset()
+dataset = pipeline.dataset()
# Step 2: Access a table as a ReadableRelation
items_relation = dataset.items # Or dataset["items"]
@@ -39,7 +39,10 @@ Assuming you have a `Pipeline` object (let's call it `pipeline`), you can obtain
```py
# Get the readable dataset from the pipeline
-dataset = pipeline._dataset()
+dataset = pipeline.dataset()
+
+# print the row counts of all tables in the destination as dataframe
+print(dataset.row_counts().df())
```
### Access tables as `ReadableRelation`
@@ -116,6 +119,18 @@ for items_chunk in items_relation.iter_fetch(chunk_size=500):
The methods available on the ReadableRelation correspond to the methods available on the cursor returned by the SQL client. Please refer to the [SQL client](./sql-client.md#supported-methods-on-the-cursor) guide for more information.
+## Special queries
+
+You can use the `row_counts` method to get the row counts of all tables in the destination as a DataFrame.
+
+```py
+# print the row counts of all tables in the destination as dataframe
+print(dataset.row_counts().df())
+
+# or as tuples
+print(dataset.row_counts().fetchall())
+```
+
## Modifying queries
You can refine your data retrieval by limiting the number of records, selecting specific columns, or chaining these operations.
@@ -156,6 +171,64 @@ You can combine `select`, `limit`, and other methods.
arrow_table = items_relation.select("col1", "col2").limit(50).arrow()
```
+## Modifying queries with ibis expressions
+
+If you install the amazing [ibis](https://ibis-project.org/) library, you can use ibis expressions to modify your queries.
+
+```sh
+pip install ibis-framework
+```
+
+dlt will then wrap an `ibis.UnboundTable` with a `ReadableIbisRelation` object under the hood that will allow you to modify the query of a relation using ibis expressions:
+
+```py
+# now that ibis is installed, we can get a dataset with ibis relations
+dataset = pipeline.dataset()
+
+# get two relations
+items_relation = dataset["items"]
+order_relation = dataset["orders"]
+
+# join them using an ibis expression
+joined_relation = items_relation.join(order_relation, items_relation.id == order_relation.item_id)
+
+# now we can use the ibis expression to filter the data
+filtered_relation = joined_relation.filter(order_relation.status == "completed")
+
+# we can inspect the query that will be used to read the data
+print(filtered_relation.query)
+
+# and finally fetch the data as a pandas dataframe, the same way we would do with a normal relation
+df = filtered_relation.df()
+
+# a few more examples
+
+# filter for rows where the id is in the list of ids
+items_relation.filter(items_relation.id.isin([1, 2, 3])).df()
+
+# limit and offset
+items_relation.limit(10, offset=5).arrow()
+
+# mutate columns by adding a new column that is always 10 times the value of the id column
+items_relation.mutate(new_id=items_relation.id * 10).df()
+
+# sort asc and desc
+import ibis
+items_relation.order_by(ibis.desc("id"), ibis.asc("price")).limit(10)
+
+# group by and aggregate
+items_relation.group_by("item_group").having(items_table.count() >= 1000).aggregate(sum_id=items_table.id.sum()).df()
+
+# subqueries
+items_relation.filter(items_table.category.isin(beverage_categories.name)).df()
+```
+
+You can learn more about the available expressions on the [ibis for sql users](https://ibis-project.org/tutorials/ibis-for-sql-users) page.
+
+:::note
+Keep in mind that you can use only methods that modify the executed query and none of the methods ibis provides for fetching data. This is done with the same methods defined on the regular relations explained above. If you need full native ibis integration, please read the ibis section in the advanced part further down. Additionally, not all ibis expressions may be supported by all destinations and sql dialects.
+:::
+
## Supported destinations
All SQL and filesystem destinations supported by `dlt` can utilize this data access interface. For filesystem destinations, `dlt` [uses **DuckDB** under the hood](./sql-client.md#the-filesystem-sql-client) to create views from Parquet or JSONL files dynamically. This allows you to query data stored in files using the same interface as you would with SQL databases. If you plan on accessing data in buckets or the filesystem a lot this way, it is advised to load data as Parquet instead of JSONL, as **DuckDB** is able to only load the parts of the data actually needed for the query to work.
@@ -226,7 +299,9 @@ other_pipeline = dlt.pipeline(pipeline_name="other_pipeline", destination="duckd
other_pipeline.run(limited_items_relation.iter_arrow(chunk_size=10_000), table_name="limited_items")
```
-### Using `ibis` to query the data
+Learn more about [transforming data in Python with Arrow tables or DataFrames](../../dlt-ecosystem/transformations/python).
+
+### Using `ibis` to query data
Visit the [Native Ibis integration](./ibis-backend.md) guide to learn more.
diff --git a/docs/website/docs/general-usage/dataset-access/ibis-backend.md b/docs/website/docs/general-usage/dataset-access/ibis-backend.md
index 8f4b0fb6b6..bc8487940e 100644
--- a/docs/website/docs/general-usage/dataset-access/ibis-backend.md
+++ b/docs/website/docs/general-usage/dataset-access/ibis-backend.md
@@ -6,7 +6,7 @@ keywords: [data, dataset, ibis]
# Ibis
-Ibis is a powerful portable Python dataframe library. Learn more about what it is and how to use it in the [official documentation](https://ibis-project.org/).
+Ibis is a powerful portable Python dataframe library. Learn more about what it is and how to use it in the [official documentation](https://ibis-project.org/).
`dlt` provides an easy way to hand over your loaded dataset to an Ibis backend connection.
@@ -28,7 +28,7 @@ pip install ibis-framework[duckdb]
```py
# get the dataset from the pipeline
-dataset = pipeline._dataset()
+dataset = pipeline.dataset()
dataset_name = pipeline.dataset_name
# get the native ibis connection from the dataset
@@ -46,4 +46,3 @@ print(table.limit(10).execute())
# Visit the ibis docs to learn more about the available methods
```
-
diff --git a/docs/website/docs/general-usage/destination.md b/docs/website/docs/general-usage/destination.md
index fa133b6257..ba42869957 100644
--- a/docs/website/docs/general-usage/destination.md
+++ b/docs/website/docs/general-usage/destination.md
@@ -128,7 +128,7 @@ When loading data, `dlt` will access the destination in two cases:
1. At the beginning of the `run` method to sync the pipeline state with the destination (or if you call `pipeline.sync_destination` explicitly).
2. In the `pipeline.load` method - to migrate the schema and load the load package.
-Obviously, `dlt` will access the destination when you instantiate [sql_client](../dlt-ecosystem/transformations/sql.md).
+`dlt` will also access the destination when you instantiate [sql_client](../dlt-ecosystem/transformations/sql.md).
:::note
`dlt` will not import the destination dependencies or access destination configuration if access is not needed. You can build multi-stage pipelines where steps are executed in separate processes or containers - the `extract` and `normalize` step do not need destination dependencies, configuration, and actual connection.
diff --git a/docs/website/docs/general-usage/incremental-loading.md b/docs/website/docs/general-usage/incremental-loading.md
index 3f452f0d16..5008795ed4 100644
--- a/docs/website/docs/general-usage/incremental-loading.md
+++ b/docs/website/docs/general-usage/incremental-loading.md
@@ -693,7 +693,7 @@ august_issues = repo_issues(
...
```
-Note that dlt's incremental filtering considers the ranges half-closed. `initial_value` is inclusive, `end_value` is exclusive, so chaining ranges like above works without overlaps.
+Note that dlt's incremental filtering considers the ranges half-closed. `initial_value` is inclusive, `end_value` is exclusive, so chaining ranges like above works without overlaps. This behaviour can be changed with the `range_start` (default `"closed"`) and `range_end` (default `"open"`) arguments.
### Declare row order to not request unnecessary data
@@ -793,6 +793,9 @@ def some_data(last_timestamp=dlt.sources.incremental("item.ts", primary_key=()))
yield {"delta": i, "item": {"ts": pendulum.now().timestamp()}}
```
+This deduplication process is always enabled when `range_start` is set to `"closed"` (default).
+When you pass `range_start="open"`, no deduplication is done because it is not needed: rows with the previous cursor value are excluded. This can be a useful optimization to avoid the performance overhead of deduplication if the cursor field is guaranteed to be unique.
+
### Using `dlt.sources.incremental` with dynamically created resources
When resources are [created dynamically](source.md#create-resources-dynamically), it is possible to use the `dlt.sources.incremental` definition as well.
diff --git a/docs/website/docs/general-usage/resource.md b/docs/website/docs/general-usage/resource.md
index 199eaf9b5d..b8d51caf75 100644
--- a/docs/website/docs/general-usage/resource.md
+++ b/docs/website/docs/general-usage/resource.md
@@ -405,11 +405,26 @@ dlt.pipeline(destination="duckdb").run(my_resource().add_limit(10))
The code above will extract `15*10=150` records. This is happening because in each iteration, 15 records are yielded, and we're limiting the number of iterations to 10.
:::
-Some constraints of `add_limit` include:
+Alternatively, you can also apply a time limit to the resource. The code below will run the extraction for 10 seconds and extract however many items are yielded in that time. In combination with incrementals, this can be useful for batched loading or for loading on machines that have a run time limit.
+
+```py
+dlt.pipeline(destination="duckdb").run(my_resource().add_limit(max_time=10))
+```
+
+You can also apply a combination of both limits. In this case the extraction will stop as soon as either limit is reached.
+
+```py
+dlt.pipeline(destination="duckdb").run(my_resource().add_limit(max_items=10, max_time=10))
+```
+
+
+Some notes about the `add_limit`:
1. `add_limit` does not skip any items. It closes the iterator/generator that produces data after the limit is reached.
2. You cannot limit transformers. They should process all the data they receive fully to avoid inconsistencies in generated datasets.
3. Async resources with a limit added may occasionally produce one item more than the limit on some runs. This behavior is not deterministic.
+4. Calling `add_limit` on a resource will replace any previously set limit settings.
+5. For time-limited resources, the timer starts when the first item is processed. When resources are processed sequentially (FIFO mode), each resource's time limit also applies sequentially. In the default round robin mode, the time limits will usually run concurrently.
:::tip
If you are parameterizing the value of `add_limit` and sometimes need it to be disabled, you can set `None` or `-1` to disable the limiting.
diff --git a/docs/website/docs/general-usage/source.md b/docs/website/docs/general-usage/source.md
index a5f1f04dee..9c6c2aac13 100644
--- a/docs/website/docs/general-usage/source.md
+++ b/docs/website/docs/general-usage/source.md
@@ -52,7 +52,6 @@ Do not extract data in the source function. Leave that task to your resources if
If this is impractical (for example, you want to reflect a database to create resources for tables), make sure you do not call the source function too often. [See this note if you plan to deploy on Airflow](../walkthroughs/deploy-a-pipeline/deploy-with-airflow-composer.md#2-modify-dag-file)
-
## Customize sources
### Access and select resources to load
@@ -108,12 +107,40 @@ load_info = pipeline.run(pipedrive_source().add_limit(10))
print(load_info)
```
+You can also apply a time limit to the source:
+
+```py
+pipeline.run(pipedrive_source().add_limit(max_time=10))
+```
+
+Or limit by both; the limit that is reached first will stop the extraction:
+
+```py
+pipeline.run(pipedrive_source().add_limit(max_items=10, max_time=10))
+```
+
:::note
-Note that `add_limit` **does not limit the number of records** but rather the "number of yields". `dlt` will close the iterator/generator that produces data after the limit is reached.
+Note that `add_limit` **does not limit the number of records** but rather the "number of yields". `dlt` will close the iterator/generator that produces data after the limit is reached. Please read in more detail about the `add_limit` on the resource page.
:::
Find more on sampling data [here](resource.md#sample-from-large-data).
+### Rename the source
+`dlt` allows you to rename the source, i.e., to place the source configuration into a custom section or to have many instances
+of the source created side by side. For example:
+```py
+from dlt.sources.sql_database import sql_database
+
+my_db = sql_database.with_args(name="my_db", section="my_db")(table_names=["table_1"])
+print(my_db.name)
+```
+Here we create a renamed version of the `sql_database` and then instantiate it. Such a source will read
+credentials from:
+```toml
+[sources.my_db.my_db.credentials]
+password="..."
+```
+
### Add more resources to existing source
You can add a custom resource to a source after it was created. Imagine that you want to score all the deals with a keras model that will tell you if the deal is a fraud or not. In order to do that, you declare a new [transformer that takes the data from](resource.md#feeding-data-from-one-resource-into-another) `deals` resource and add it to the source.
diff --git a/docs/website/docs/general-usage/state.md b/docs/website/docs/general-usage/state.md
index 46aa1d63ce..d1fb426452 100644
--- a/docs/website/docs/general-usage/state.md
+++ b/docs/website/docs/general-usage/state.md
@@ -123,14 +123,13 @@ def comments(user_id: str):
# on the first pipeline run, the user_comments table does not yet exist so do not check at all
# alternatively, catch DatabaseUndefinedRelation which is raised when an unknown table is selected
if not current_pipeline.first_run:
- with current_pipeline.sql_client() as client:
- # we may get the last user comment or None which we replace with 0
- max_id = (
- client.execute_sql(
- "SELECT MAX(_id) FROM user_comments WHERE user_id=?", user_id
- )[0][0]
- or 0
- )
+ # get user comments table from pipeline dataset
+ user_comments = current_pipeline.dataset().user_comments
+ # get last user comment id with ibis expression, ibis-extras need to be installed
+ max_id_df = user_comments.filter(user_comments.user_id == user_id).select(user_comments["_id"].max()).df()
+ # if there are no comments for the user, max_id will be None, so we replace it with 0
+ max_id = max_id_df[0][0] if len(max_id_df.index) else 0
+
# use max_id to filter our results (we simulate an API query)
yield from [
{"_id": i, "value": letter, "user_id": user_id}
diff --git a/docs/website/docs/intro.md b/docs/website/docs/intro.md
index b20d41c494..bc227b85ad 100644
--- a/docs/website/docs/intro.md
+++ b/docs/website/docs/intro.md
@@ -70,6 +70,10 @@ pipeline = dlt.pipeline(
)
load_info = pipeline.run(source)
+
+# print load info and posts table as dataframe
+print(load_info)
+print(pipeline.dataset().posts.df())
```
Follow the [REST API source tutorial](./tutorial/rest-api) to learn more about the source configuration and pagination methods.
@@ -92,6 +96,10 @@ pipeline = dlt.pipeline(
)
load_info = pipeline.run(source)
+
+# print load info and the "family" table as dataframe
+print(load_info)
+print(pipeline.dataset().family.df())
```
Follow the [SQL source tutorial](./tutorial/sql-database) to learn more about the source configuration and supported databases.
@@ -116,6 +124,10 @@ pipeline = dlt.pipeline(
)
load_info = pipeline.run(resource)
+
+# print load info and the "example" table as dataframe
+print(load_info)
+print(pipeline.dataset().example.df())
```
Follow the [filesystem source tutorial](./tutorial/filesystem) to learn more about the source configuration and supported storage services.
@@ -128,7 +140,7 @@ dlt is able to load data from Python generators or directly from Python data str
```py
import dlt
-@dlt.resource
+@dlt.resource(table_name="foo_data")
def foo():
for i in range(10):
yield {"id": i, "name": f"This is item {i}"}
@@ -139,6 +151,10 @@ pipeline = dlt.pipeline(
)
load_info = pipeline.run(foo)
+
+# print load info and the "foo_data" table as dataframe
+print(load_info)
+print(pipeline.dataset().foo_data.df())
```
Check out the [Python data structures tutorial](./tutorial/load-data-from-an-api) to learn about dlt fundamentals and advanced usage scenarios.
diff --git a/docs/website/docs/reference/command-line-interface.md b/docs/website/docs/reference/command-line-interface.md
index 825d33d548..2af750f43c 100644
--- a/docs/website/docs/reference/command-line-interface.md
+++ b/docs/website/docs/reference/command-line-interface.md
@@ -20,9 +20,22 @@ This command creates a new dlt pipeline script that loads data from `source` to
This command can be used several times in the same folder to add more sources, destinations, and pipelines. It will also update the verified source code to the newest
version if run again with an existing `source` name. You are warned if files will be overwritten or if the `dlt` version needs an upgrade to run a particular pipeline.
+### Ejecting source code of the core sources like `sql_database`
+We merged a few sources to the core library. You can still eject source code and hack them with the `--eject` flag:
+```sh
+dlt init sql_database duckdb --eject
+```
+will copy the source code of `sql_database` to your project. Remember to modify the pipeline example script to import from the local folder!
+
### Specify your own "verified sources" repository
You can use the `--location ` option to specify your own repository with sources. Typically, you would [fork ours](https://github.com/dlt-hub/verified-sources) and start customizing and adding sources, e.g., to use them for your team or organization. You can also specify a branch with `--branch `, e.g., to test a version being developed.
+### Using dlt 0.5.x sources
+Use `--branch 0.5` if you are still on `dlt` `0.5.x`, i.e.:
+```sh
+dlt init