From d9a7b93ca74d237ea6d92a774a017eef1013f3f5 Mon Sep 17 00:00:00 2001 From: Deepyaman Datta Date: Sun, 25 Aug 2024 14:50:49 -0600 Subject: [PATCH 01/10] Correct the library name for mem stats to `psutil` (#1733) --- docs/website/docs/reference/performance.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/website/docs/reference/performance.md b/docs/website/docs/reference/performance.md index 075d351553..0ee62acec7 100644 --- a/docs/website/docs/reference/performance.md +++ b/docs/website/docs/reference/performance.md @@ -62,7 +62,7 @@ Several [text file formats](../dlt-ecosystem/file-formats/) have `gzip` compress Keep in mind load packages are buffered to disk and are left for any troubleshooting, so you can [clear disk space by setting the `delete_completed_jobs` option](../running-in-production/running.md#data-left-behind). ### Observing cpu and memory usage -Please make sure that you have the `psutils` package installed (note that Airflow installs it by default). Then you can dump the stats periodically by setting the [progress](../general-usage/pipeline.md#display-the-loading-progress) to `log` in `config.toml`: +Please make sure that you have the `psutil` package installed (note that Airflow installs it by default). Then you can dump the stats periodically by setting the [progress](../general-usage/pipeline.md#display-the-loading-progress) to `log` in `config.toml`: ```toml progress="log" ``` @@ -258,4 +258,4 @@ DLT_USE_JSON=simplejson ## Using the built in requests wrapper or RESTClient for API calls -Instead of using Python Requests directly, you can use the built-in [requests wrapper](../general-usage/http/requests) or [`RESTClient`](../general-usage/http/rest-client) for API calls. This will make your pipeline more resilient to intermittent network errors and other random glitches. \ No newline at end of file +Instead of using Python Requests directly, you can use the built-in [requests wrapper](../general-usage/http/requests) or [`RESTClient`](../general-usage/http/rest-client) for API calls. This will make your pipeline more resilient to intermittent network errors and other random glitches. From 7d7c14f71d14612f0de873110eaa6d300a4544c2 Mon Sep 17 00:00:00 2001 From: dat-a-man <98139823+dat-a-man@users.noreply.github.com> Date: Mon, 26 Aug 2024 02:23:43 +0530 Subject: [PATCH 02/10] Replaced "full_refresh" with "dev_mode" (#1735) --- docs/technical/general_usage.md | 10 +++++----- .../dlt-ecosystem/verified-sources/sql_database.md | 2 +- .../docs/dlt-ecosystem/verified-sources/stripe.md | 2 +- .../docs/dlt-ecosystem/verified-sources/workable.md | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/technical/general_usage.md b/docs/technical/general_usage.md index 336c892c66..2df903b062 100644 --- a/docs/technical/general_usage.md +++ b/docs/technical/general_usage.md @@ -47,7 +47,7 @@ Pipeline can be explicitly created and configured via `dlt.pipeline()` that retu 4. dataset_name - name of the dataset where the data goes (see later the default names) 5. import_schema_path - default is None 6. export_schema_path - default is None -7. full_refresh - if set to True the pipeline working dir will be erased and the dataset name will get the unique suffix (current timestamp). ie the `my_data` becomes `my_data_20221107164856`. +7. dev_mode - if set to True the pipeline working dir will be erased and the dataset name will get the unique suffix (current timestamp). ie the `my_data` becomes `my_data_20221107164856`. 
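A minimal sketch of how the arguments listed above fit together, assuming a `dlt` version in which `full_refresh` has already been renamed to `dev_mode`; the `duckdb` destination and all names below are placeholders, not part of the patch:

```python
import dlt

pipeline = dlt.pipeline(
    pipeline_name="my_pipeline",
    destination="duckdb",          # placeholder destination
    dataset_name="my_data",
    dev_mode=True,                 # wipes the working dir and suffixes the dataset name
    progress="log",                # periodic CPU/memory stats; needs `psutil` installed
)
# with dev_mode the dataset name is expected to carry a timestamp suffix,
# e.g. my_data_20221107164856
print(pipeline.dataset_name)
```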
> **Achtung** as per `secrets_and_config.md` the arguments passed to `dlt.pipeline` are configurable and if skipped will be injected by the config providers. **the values provided explicitly in the code have a full precedence over all config providers** @@ -101,7 +101,7 @@ In case **there are more schemas in the pipeline**, the data will be loaded into 1. `spotify` tables and `labels` will load into `spotify_data_1` 2. `mel` resource will load into `spotify_data_1_echonest` -The `full_refresh` option: dataset name receives a prefix with the current timestamp: ie the `my_data` becomes `my_data_20221107164856`. This allows a non destructive full refresh. Nothing is being deleted/dropped from the destination. +The `dev_mode` option: dataset name receives a prefix with the current timestamp: ie the `my_data` becomes `my_data_20221107164856`. This allows a non destructive full refresh. Nothing is being deleted/dropped from the destination. ## pipeline working directory and state Another fundamental concept is the pipeline working directory. This directory keeps the following information: @@ -117,7 +117,7 @@ The `restore_from_destination` argument to `dlt.pipeline` let's the user restore The state is being stored in the destination together with other data. So only when all pipeline stages are completed the state is available for restoration. -The pipeline cannot be restored if `full_refresh` flag is set. +The pipeline cannot be restored if `dev_mode` flag is set. The other way to trigger full refresh is to drop destination dataset. `dlt` detects that and resets the pipeline local working folder. @@ -155,8 +155,8 @@ The default json normalizer will convert json documents into tables. All the key ❗ [more here](working_with_schemas.md) -### Full refresh mode -If `full_refresh` flag is passed to `dlt.pipeline` then +### Dev mode mode +If `dev_mode` flag is passed to `dlt.pipeline` then 1. the pipeline working dir is fully wiped out (state, schemas, temp files) 2. dataset name receives a prefix with the current timestamp: ie the `my_data` becomes `my_data_20221107164856`. 3. pipeline will not be restored from the destination diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/sql_database.md b/docs/website/docs/dlt-ecosystem/verified-sources/sql_database.md index eeb717515a..c89a63a524 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/sql_database.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/sql_database.md @@ -652,6 +652,6 @@ resource. Below we show you an example on how to pseudonymize the data before it print(info) ``` -1. Remember to keep the pipeline name and destination dataset name consistent. The pipeline name is crucial for retrieving the [state](https://dlthub.com/docs/general-usage/state) from the last run, which is essential for incremental loading. Altering these names could initiate a "[full_refresh](https://dlthub.com/docs/general-usage/pipeline#do-experiments-with-full-refresh)", interfering with the metadata tracking necessary for [incremental loads](https://dlthub.com/docs/general-usage/incremental-loading). +1. Remember to keep the pipeline name and destination dataset name consistent. The pipeline name is crucial for retrieving the [state](https://dlthub.com/docs/general-usage/state) from the last run, which is essential for incremental loading. 
Altering these names could initiate a "[dev_mode](https://dlthub.com/docs/general-usage/pipeline#do-experiments-with-dev-mode)", interfering with the metadata tracking necessary for [incremental loads](https://dlthub.com/docs/general-usage/incremental-loading). diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/stripe.md b/docs/website/docs/dlt-ecosystem/verified-sources/stripe.md index 8c39a5090e..fdbefeddf1 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/stripe.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/stripe.md @@ -232,6 +232,6 @@ verified source. load_info = pipeline.run(data=[source_single, source_incremental]) print(load_info) ``` - > To load data, maintain the pipeline name and destination dataset name. The pipeline name is vital for accessing the last run's [state](../../general-usage/state), which determines the incremental data load's end date. Altering these names can trigger a [“full_refresh”](../../general-usage/pipeline#do-experiments-with-full-refresh), disrupting the metadata (state) tracking for [incremental data loading](../../general-usage/incremental-loading). + > To load data, maintain the pipeline name and destination dataset name. The pipeline name is vital for accessing the last run's [state](../../general-usage/state), which determines the incremental data load's end date. Altering these names can trigger a [“dev_mode”](../../general-usage/pipeline#do-experiments-with-dev-mode), disrupting the metadata (state) tracking for [incremental data loading](../../general-usage/incremental-loading). diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/workable.md b/docs/website/docs/dlt-ecosystem/verified-sources/workable.md index 472f48a28f..9229ddca7e 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/workable.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/workable.md @@ -272,7 +272,7 @@ To create your data pipeline using single loading and destination dataset names. The pipeline name helps retrieve the [state](https://dlthub.com/docs/general-usage/state) of the last run, essential for incremental data loading. Changing these names might trigger a - [“full_refresh”](https://dlthub.com/docs/general-usage/pipeline#do-experiments-with-full-refresh), + [“dev_mode”](https://dlthub.com/docs/general-usage/pipeline#do-experiments-with-dev-mode), disrupting metadata tracking for [incremental data loading](https://dlthub.com/docs/general-usage/incremental-loading). 
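The notes above all hinge on keeping the pipeline and dataset names stable between runs. A minimal sketch of that pattern, with a made-up `events` resource, placeholder field names, and `duckdb` as an arbitrary destination; run it twice with unchanged names and only state-tracked increments should load on the second run:

```python
import dlt


@dlt.resource(primary_key="id", write_disposition="merge")
def events(updated_at=dlt.sources.incremental("updated_at", initial_value="2024-01-01")):
    # a real source would fetch only records newer than updated_at.last_value
    yield [{"id": 1, "updated_at": "2024-06-01"}]


pipeline = dlt.pipeline(
    pipeline_name="events_pipeline",   # keep stable between runs so state can be restored
    destination="duckdb",              # placeholder destination
    dataset_name="events_data",        # keep stable between runs
)
print(pipeline.run(events()))
```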
From 011d7ff508f3d5a2da666e418a7137fb79acab49 Mon Sep 17 00:00:00 2001 From: rudolfix Date: Sun, 25 Aug 2024 23:07:02 +0200 Subject: [PATCH 03/10] feat/1681 collects load job metrics and adds remote uri (#1708) * collects basic load job metrics in LoadJob * adds remote uri to filesystem copy jobs metrics * adds job id to load package info * adds table name to job metrics * skips run step when serializing trace * adds trace shape test with trace schema * tests job file name too long * docs running pipelines with the same name for different envs * extracts step metrics in common, renames followup jobs * fixes tests * fixes tests * tests delta filesystem for remote_uri * adds exec_info to trace contract test * tests remote_uri for filesystem copy * fixes platform test --- dlt/common/data_writers/__init__.py | 2 - dlt/common/data_writers/buffered.py | 3 +- dlt/common/data_writers/writers.py | 20 +- dlt/common/destination/reference.py | 27 +- dlt/common/metrics.py | 71 ++ dlt/common/pipeline.py | 103 +-- dlt/common/storages/__init__.py | 4 +- dlt/common/storages/data_item_storage.py | 7 +- dlt/common/storages/load_package.py | 51 +- dlt/common/storages/load_storage.py | 8 +- dlt/destinations/impl/athena/athena.py | 10 +- dlt/destinations/impl/bigquery/bigquery.py | 12 +- .../impl/clickhouse/clickhouse.py | 12 +- .../impl/databricks/databricks.py | 12 +- dlt/destinations/impl/dremio/dremio.py | 12 +- dlt/destinations/impl/dummy/configuration.py | 2 +- dlt/destinations/impl/dummy/dummy.py | 33 +- .../impl/filesystem/filesystem.py | 43 +- dlt/destinations/impl/mssql/mssql.py | 8 +- dlt/destinations/impl/postgres/postgres.py | 4 +- dlt/destinations/impl/redshift/redshift.py | 10 +- dlt/destinations/impl/snowflake/snowflake.py | 6 +- dlt/destinations/impl/synapse/synapse.py | 8 +- dlt/destinations/job_client_impl.py | 18 +- dlt/destinations/job_impl.py | 11 +- dlt/destinations/sql_jobs.py | 6 +- dlt/extract/extractors.py | 2 +- dlt/extract/storage.py | 3 +- dlt/load/load.py | 41 +- dlt/load/utils.py | 4 +- dlt/normalize/items_normalizers.py | 2 +- dlt/normalize/normalize.py | 2 +- dlt/normalize/worker.py | 2 +- dlt/pipeline/trace.py | 2 +- docs/website/docs/general-usage/pipeline.md | 13 + .../common/data_writers/test_data_writers.py | 7 +- tests/common/storages/utils.py | 4 +- .../data_writers/test_buffered_writer.py | 2 +- .../data_writers/test_data_item_storage.py | 3 +- .../load/pipeline/test_filesystem_pipeline.py | 58 ++ tests/load/pipeline/test_postgres.py | 15 + tests/load/pipeline/test_stage_loading.py | 16 + tests/load/test_dummy_client.py | 110 ++- tests/load/utils.py | 5 +- .../cases/contracts/trace.schema.yaml | 772 ++++++++++++++++++ tests/pipeline/test_pipeline.py | 53 +- tests/pipeline/test_pipeline_trace.py | 169 +++- tests/pipeline/test_platform_connection.py | 3 +- tests/pipeline/utils.py | 3 + tests/utils.py | 5 +- 50 files changed, 1552 insertions(+), 247 deletions(-) create mode 100644 dlt/common/metrics.py create mode 100644 tests/pipeline/cases/contracts/trace.schema.yaml diff --git a/dlt/common/data_writers/__init__.py b/dlt/common/data_writers/__init__.py index 945e74a37b..9966590c06 100644 --- a/dlt/common/data_writers/__init__.py +++ b/dlt/common/data_writers/__init__.py @@ -1,6 +1,5 @@ from dlt.common.data_writers.writers import ( DataWriter, - DataWriterMetrics, TDataItemFormat, FileWriterSpec, create_import_spec, @@ -22,7 +21,6 @@ "resolve_best_writer_spec", "get_best_writer_spec", "is_native_writer", - "DataWriterMetrics", "TDataItemFormat", "BufferedDataWriter", 
"new_file_id", diff --git a/dlt/common/data_writers/buffered.py b/dlt/common/data_writers/buffered.py index 8077007edb..945fca6580 100644 --- a/dlt/common/data_writers/buffered.py +++ b/dlt/common/data_writers/buffered.py @@ -3,6 +3,7 @@ import contextlib from typing import ClassVar, Iterator, List, IO, Any, Optional, Type, Generic +from dlt.common.metrics import DataWriterMetrics from dlt.common.typing import TDataItem, TDataItems from dlt.common.data_writers.exceptions import ( BufferedDataWriterClosed, @@ -10,7 +11,7 @@ FileImportNotFound, InvalidFileNameTemplateException, ) -from dlt.common.data_writers.writers import TWriter, DataWriter, DataWriterMetrics, FileWriterSpec +from dlt.common.data_writers.writers import TWriter, DataWriter, FileWriterSpec from dlt.common.schema.typing import TTableSchemaColumns from dlt.common.configuration import with_config, known_sections, configspec from dlt.common.configuration.specs import BaseConfiguration diff --git a/dlt/common/data_writers/writers.py b/dlt/common/data_writers/writers.py index d324792a83..abd3343ea1 100644 --- a/dlt/common/data_writers/writers.py +++ b/dlt/common/data_writers/writers.py @@ -34,6 +34,7 @@ TLoaderFileFormat, ALL_SUPPORTED_FILE_FORMATS, ) +from dlt.common.metrics import DataWriterMetrics from dlt.common.schema.typing import TTableSchemaColumns from dlt.common.typing import StrAny @@ -59,25 +60,6 @@ class FileWriterSpec(NamedTuple): supports_compression: bool = False -class DataWriterMetrics(NamedTuple): - file_path: str - items_count: int - file_size: int - created: float - last_modified: float - - def __add__(self, other: Tuple[object, ...], /) -> Tuple[object, ...]: - if isinstance(other, DataWriterMetrics): - return DataWriterMetrics( - "", # path is not known - self.items_count + other.items_count, - self.file_size + other.file_size, - min(self.created, other.created), - max(self.last_modified, other.last_modified), - ) - return NotImplemented - - EMPTY_DATA_WRITER_METRICS = DataWriterMetrics("", 0, 0, 2**32, 0.0) diff --git a/dlt/common/destination/reference.py b/dlt/common/destination/reference.py index 3af7dcff13..b6c7041592 100644 --- a/dlt/common/destination/reference.py +++ b/dlt/common/destination/reference.py @@ -24,10 +24,11 @@ from copy import deepcopy import inspect -from dlt.common import logger +from dlt.common import logger, pendulum from dlt.common.configuration.specs.base_configuration import extract_inner_hint from dlt.common.destination.utils import verify_schema_capabilities from dlt.common.exceptions import TerminalValueError +from dlt.common.metrics import LoadJobMetrics from dlt.common.normalizers.naming import NamingConvention from dlt.common.schema import Schema, TTableSchema, TSchemaTables from dlt.common.schema.utils import ( @@ -284,6 +285,8 @@ def __init__(self, file_path: str) -> None: # NOTE: we only accept a full filepath in the constructor assert self._file_name != self._file_path self._parsed_file_name = ParsedLoadJobFileName.parse(self._file_name) + self._started_at: pendulum.DateTime = None + self._finished_at: pendulum.DateTime = None def job_id(self) -> str: """The job id that is derived from the file name and does not changes during job lifecycle""" @@ -306,6 +309,18 @@ def exception(self) -> str: """The exception associated with failed or retry states""" pass + def metrics(self) -> Optional[LoadJobMetrics]: + """Returns job execution metrics""" + return LoadJobMetrics( + self._parsed_file_name.job_id(), + self._file_path, + self._parsed_file_name.table_name, + 
self._started_at, + self._finished_at, + self.state(), + None, + ) + class RunnableLoadJob(LoadJob, ABC): """Represents a runnable job that loads a single file @@ -361,6 +376,7 @@ def run_managed( # filepath is now moved to running try: self._state = "running" + self._started_at = pendulum.now() self._job_client.prepare_load_job_execution(self) self.run() self._state = "completed" @@ -371,6 +387,7 @@ def run_managed( self._state = "retry" self._exception = e finally: + self._finished_at = pendulum.now() # sanity check assert self._state in ("completed", "retry", "failed") @@ -391,7 +408,7 @@ def exception(self) -> str: return str(self._exception) -class FollowupJob: +class FollowupJobRequest: """Base class for follow up jobs that should be created""" @abstractmethod @@ -403,8 +420,8 @@ def new_file_path(self) -> str: class HasFollowupJobs: """Adds a trait that allows to create single or table chain followup jobs""" - def create_followup_jobs(self, final_state: TLoadJobState) -> List[FollowupJob]: - """Return list of new jobs. `final_state` is state to which this job transits""" + def create_followup_jobs(self, final_state: TLoadJobState) -> List[FollowupJobRequest]: + """Return list of jobs requests for jobs that should be created. `final_state` is state to which this job transits""" return [] @@ -479,7 +496,7 @@ def create_table_chain_completed_followup_jobs( self, table_chain: Sequence[TTableSchema], completed_table_chain_jobs: Optional[Sequence[LoadJobInfo]] = None, - ) -> List[FollowupJob]: + ) -> List[FollowupJobRequest]: """Creates a list of followup jobs that should be executed after a table chain is completed""" return [] diff --git a/dlt/common/metrics.py b/dlt/common/metrics.py new file mode 100644 index 0000000000..5cccee4045 --- /dev/null +++ b/dlt/common/metrics.py @@ -0,0 +1,71 @@ +import datetime # noqa: I251 +from typing import Any, Dict, List, NamedTuple, Optional, Tuple, TypedDict # noqa: 251 + + +class DataWriterMetrics(NamedTuple): + file_path: str + items_count: int + file_size: int + created: float + last_modified: float + + def __add__(self, other: Tuple[object, ...], /) -> Tuple[object, ...]: + if isinstance(other, DataWriterMetrics): + return DataWriterMetrics( + self.file_path if self.file_path == other.file_path else "", + # self.table_name if self.table_name == other.table_name else "", + self.items_count + other.items_count, + self.file_size + other.file_size, + min(self.created, other.created), + max(self.last_modified, other.last_modified), + ) + return NotImplemented + + +class StepMetrics(TypedDict): + """Metrics for particular package processed in particular pipeline step""" + + started_at: datetime.datetime + """Start of package processing""" + finished_at: datetime.datetime + """End of package processing""" + + +class ExtractDataInfo(TypedDict): + name: str + data_type: str + + +class ExtractMetrics(StepMetrics): + schema_name: str + job_metrics: Dict[str, DataWriterMetrics] + """Metrics collected per job id during writing of job file""" + table_metrics: Dict[str, DataWriterMetrics] + """Job metrics aggregated by table""" + resource_metrics: Dict[str, DataWriterMetrics] + """Job metrics aggregated by resource""" + dag: List[Tuple[str, str]] + """A resource dag where elements of the list are graph edges""" + hints: Dict[str, Dict[str, Any]] + """Hints passed to the resources""" + + +class NormalizeMetrics(StepMetrics): + job_metrics: Dict[str, DataWriterMetrics] + """Metrics collected per job id during writing of job file""" + table_metrics: Dict[str, 
DataWriterMetrics] + """Job metrics aggregated by table""" + + +class LoadJobMetrics(NamedTuple): + job_id: str + file_path: str + table_name: str + started_at: datetime.datetime + finished_at: datetime.datetime + state: Optional[str] + remote_uri: Optional[str] + + +class LoadMetrics(StepMetrics): + job_metrics: Dict[str, LoadJobMetrics] diff --git a/dlt/common/pipeline.py b/dlt/common/pipeline.py index 1e1416eb53..8a07ddbd33 100644 --- a/dlt/common/pipeline.py +++ b/dlt/common/pipeline.py @@ -16,7 +16,6 @@ Optional, Protocol, Sequence, - TYPE_CHECKING, Tuple, TypeVar, TypedDict, @@ -36,6 +35,14 @@ from dlt.common.destination import TDestinationReferenceArg, TDestination from dlt.common.destination.exceptions import DestinationHasFailedJobs from dlt.common.exceptions import PipelineStateNotAvailable, SourceSectionNotAvailable +from dlt.common.metrics import ( + DataWriterMetrics, + ExtractDataInfo, + ExtractMetrics, + LoadMetrics, + NormalizeMetrics, + StepMetrics, +) from dlt.common.schema import Schema from dlt.common.schema.typing import ( TColumnNames, @@ -44,11 +51,12 @@ TSchemaContract, ) from dlt.common.source import get_current_pipe_name +from dlt.common.storages.load_package import ParsedLoadJobFileName from dlt.common.storages.load_storage import LoadPackageInfo from dlt.common.time import ensure_pendulum_datetime, precise_time from dlt.common.typing import DictStrAny, REPattern, StrAny, SupportsHumanize from dlt.common.jsonpath import delete_matches, TAnyJsonPath -from dlt.common.data_writers.writers import DataWriterMetrics, TLoaderFileFormat +from dlt.common.data_writers.writers import TLoaderFileFormat from dlt.common.utils import RowCounts, merge_row_counts from dlt.common.versioned_state import TVersionedState @@ -68,15 +76,6 @@ class _StepInfo(NamedTuple): finished_at: datetime.datetime -class StepMetrics(TypedDict): - """Metrics for particular package processed in particular pipeline step""" - - started_at: datetime.datetime - """Start of package processing""" - finished_at: datetime.datetime - """End of package processing""" - - TStepMetricsCo = TypeVar("TStepMetricsCo", bound=StepMetrics, covariant=True) @@ -154,17 +153,20 @@ def _load_packages_asstr(load_packages: List[LoadPackageInfo], verbosity: int) - return msg @staticmethod - def job_metrics_asdict( + def writer_metrics_asdict( job_metrics: Dict[str, DataWriterMetrics], key_name: str = "job_id", extend: StrAny = None ) -> List[DictStrAny]: - jobs = [] - for job_id, metrics in job_metrics.items(): + entities = [] + for entity_id, metrics in job_metrics.items(): d = metrics._asdict() if extend: d.update(extend) - d[key_name] = job_id - jobs.append(d) - return jobs + d[key_name] = entity_id + # add job-level info if known + if metrics.file_path: + d["table_name"] = ParsedLoadJobFileName.parse(metrics.file_path).table_name + entities.append(d) + return entities def _astuple(self) -> _StepInfo: return _StepInfo( @@ -177,25 +179,6 @@ def _astuple(self) -> _StepInfo: ) -class ExtractDataInfo(TypedDict): - name: str - data_type: str - - -class ExtractMetrics(StepMetrics): - schema_name: str - job_metrics: Dict[str, DataWriterMetrics] - """Metrics collected per job id during writing of job file""" - table_metrics: Dict[str, DataWriterMetrics] - """Job metrics aggregated by table""" - resource_metrics: Dict[str, DataWriterMetrics] - """Job metrics aggregated by resource""" - dag: List[Tuple[str, str]] - """A resource dag where elements of the list are graph edges""" - hints: Dict[str, Dict[str, Any]] - """Hints passed to 
the resources""" - - class _ExtractInfo(NamedTuple): """NamedTuple cannot be part of the derivation chain so we must re-declare all fields to use it as mixin later""" @@ -228,16 +211,8 @@ def asdict(self) -> DictStrAny: for load_id, metrics_list in self.metrics.items(): for idx, metrics in enumerate(metrics_list): extend = {"load_id": load_id, "extract_idx": idx} - load_metrics["job_metrics"].extend( - self.job_metrics_asdict(metrics["job_metrics"], extend=extend) - ) - load_metrics["table_metrics"].extend( - self.job_metrics_asdict( - metrics["table_metrics"], key_name="table_name", extend=extend - ) - ) load_metrics["resource_metrics"].extend( - self.job_metrics_asdict( + self.writer_metrics_asdict( metrics["resource_metrics"], key_name="resource_name", extend=extend ) ) @@ -253,6 +228,15 @@ def asdict(self) -> DictStrAny: for name, hints in metrics["hints"].items() ] ) + load_metrics["job_metrics"].extend( + self.writer_metrics_asdict(metrics["job_metrics"], extend=extend) + ) + load_metrics["table_metrics"].extend( + self.writer_metrics_asdict( + metrics["table_metrics"], key_name="table_name", extend=extend + ) + ) + d.update(load_metrics) return d @@ -260,13 +244,6 @@ def asstr(self, verbosity: int = 0) -> str: return self._load_packages_asstr(self.load_packages, verbosity) -class NormalizeMetrics(StepMetrics): - job_metrics: Dict[str, DataWriterMetrics] - """Metrics collected per job id during writing of job file""" - table_metrics: Dict[str, DataWriterMetrics] - """Job metrics aggregated by table""" - - class _NormalizeInfo(NamedTuple): pipeline: "SupportsPipeline" metrics: Dict[str, List[NormalizeMetrics]] @@ -305,10 +282,10 @@ def asdict(self) -> DictStrAny: for idx, metrics in enumerate(metrics_list): extend = {"load_id": load_id, "extract_idx": idx} load_metrics["job_metrics"].extend( - self.job_metrics_asdict(metrics["job_metrics"], extend=extend) + self.writer_metrics_asdict(metrics["job_metrics"], extend=extend) ) load_metrics["table_metrics"].extend( - self.job_metrics_asdict( + self.writer_metrics_asdict( metrics["table_metrics"], key_name="table_name", extend=extend ) ) @@ -326,10 +303,6 @@ def asstr(self, verbosity: int = 0) -> str: return msg -class LoadMetrics(StepMetrics): - pass - - class _LoadInfo(NamedTuple): pipeline: "SupportsPipeline" metrics: Dict[str, List[LoadMetrics]] @@ -354,7 +327,19 @@ class LoadInfo(StepInfo[LoadMetrics], _LoadInfo): # type: ignore[misc] def asdict(self) -> DictStrAny: """A dictionary representation of LoadInfo that can be loaded with `dlt`""" - return super().asdict() + d = super().asdict() + # transform metrics + d.pop("metrics") + load_metrics: Dict[str, List[Any]] = {"job_metrics": []} + for load_id, metrics_list in self.metrics.items(): + # one set of metrics per package id + assert len(metrics_list) == 1 + metrics = metrics_list[0] + for job_metrics in metrics["job_metrics"].values(): + load_metrics["job_metrics"].append({"load_id": load_id, **job_metrics._asdict()}) + + d.update(load_metrics) + return d def asstr(self, verbosity: int = 0) -> str: msg = f"Pipeline {self.pipeline.pipeline_name} load step completed in " diff --git a/dlt/common/storages/__init__.py b/dlt/common/storages/__init__.py index 7bb3c0cf97..50876a01cd 100644 --- a/dlt/common/storages/__init__.py +++ b/dlt/common/storages/__init__.py @@ -8,7 +8,7 @@ LoadJobInfo, LoadPackageInfo, PackageStorage, - TJobState, + TPackageJobState, create_load_id, ) from .data_item_storage import DataItemStorage @@ -40,7 +40,7 @@ "LoadJobInfo", "LoadPackageInfo", "PackageStorage", 
- "TJobState", + "TPackageJobState", "create_load_id", "fsspec_from_config", "fsspec_filesystem", diff --git a/dlt/common/storages/data_item_storage.py b/dlt/common/storages/data_item_storage.py index 29a9da8acf..0f70c04bc5 100644 --- a/dlt/common/storages/data_item_storage.py +++ b/dlt/common/storages/data_item_storage.py @@ -1,14 +1,13 @@ -from pathlib import Path -from typing import Dict, Any, List, Sequence +from typing import Dict, Any, List from abc import ABC, abstractmethod from dlt.common import logger +from dlt.common.metrics import DataWriterMetrics from dlt.common.schema import TTableSchemaColumns -from dlt.common.typing import StrAny, TDataItems +from dlt.common.typing import TDataItems from dlt.common.data_writers import ( BufferedDataWriter, DataWriter, - DataWriterMetrics, FileWriterSpec, ) diff --git a/dlt/common/storages/load_package.py b/dlt/common/storages/load_package.py index b0ed93f734..d569fbe662 100644 --- a/dlt/common/storages/load_package.py +++ b/dlt/common/storages/load_package.py @@ -143,8 +143,8 @@ def create_load_id() -> str: # folders to manage load jobs in a single load package -TJobState = Literal["new_jobs", "failed_jobs", "started_jobs", "completed_jobs"] -WORKING_FOLDERS: Set[TJobState] = set(get_args(TJobState)) +TPackageJobState = Literal["new_jobs", "failed_jobs", "started_jobs", "completed_jobs"] +WORKING_FOLDERS: Set[TPackageJobState] = set(get_args(TPackageJobState)) TLoadPackageStatus = Literal["new", "extracted", "normalized", "loaded", "aborted"] @@ -191,7 +191,7 @@ def __str__(self) -> str: class LoadJobInfo(NamedTuple): - state: TJobState + state: TPackageJobState file_path: str file_size: int created_at: datetime.datetime @@ -204,6 +204,7 @@ def asdict(self) -> DictStrAny: # flatten del d["job_file_info"] d.update(self.job_file_info._asdict()) + d["job_id"] = self.job_file_info.job_id() return d def asstr(self, verbosity: int = 0) -> str: @@ -241,7 +242,7 @@ class _LoadPackageInfo(NamedTuple): schema: Schema schema_update: TSchemaTables completed_at: datetime.datetime - jobs: Dict[TJobState, List[LoadJobInfo]] + jobs: Dict[TPackageJobState, List[LoadJobInfo]] class LoadPackageInfo(SupportsHumanize, _LoadPackageInfo): @@ -298,10 +299,10 @@ def __str__(self) -> str: class PackageStorage: - NEW_JOBS_FOLDER: ClassVar[TJobState] = "new_jobs" - FAILED_JOBS_FOLDER: ClassVar[TJobState] = "failed_jobs" - STARTED_JOBS_FOLDER: ClassVar[TJobState] = "started_jobs" - COMPLETED_JOBS_FOLDER: ClassVar[TJobState] = "completed_jobs" + NEW_JOBS_FOLDER: ClassVar[TPackageJobState] = "new_jobs" + FAILED_JOBS_FOLDER: ClassVar[TPackageJobState] = "failed_jobs" + STARTED_JOBS_FOLDER: ClassVar[TPackageJobState] = "started_jobs" + COMPLETED_JOBS_FOLDER: ClassVar[TPackageJobState] = "completed_jobs" SCHEMA_FILE_NAME: ClassVar[str] = "schema.json" SCHEMA_UPDATES_FILE_NAME = ( # updates to the tables in schema created by normalizer @@ -330,11 +331,11 @@ def get_package_path(self, load_id: str) -> str: """Gets path of the package relative to storage root""" return load_id - def get_job_state_folder_path(self, load_id: str, state: TJobState) -> str: + def get_job_state_folder_path(self, load_id: str, state: TPackageJobState) -> str: """Gets path to the jobs in `state` in package `load_id`, relative to the storage root""" return os.path.join(self.get_package_path(load_id), state) - def get_job_file_path(self, load_id: str, state: TJobState, file_name: str) -> str: + def get_job_file_path(self, load_id: str, state: TPackageJobState, file_name: str) -> str: """Get path to 
job with `file_name` in `state` in package `load_id`, relative to the storage root""" return os.path.join(self.get_job_state_folder_path(load_id, state), file_name) @@ -369,12 +370,12 @@ def list_failed_jobs(self, load_id: str) -> Sequence[str]: def list_job_with_states_for_table( self, load_id: str, table_name: str - ) -> Sequence[Tuple[TJobState, ParsedLoadJobFileName]]: + ) -> Sequence[Tuple[TPackageJobState, ParsedLoadJobFileName]]: return self.filter_jobs_for_table(self.list_all_jobs_with_states(load_id), table_name) def list_all_jobs_with_states( self, load_id: str - ) -> Sequence[Tuple[TJobState, ParsedLoadJobFileName]]: + ) -> Sequence[Tuple[TPackageJobState, ParsedLoadJobFileName]]: info = self.get_load_package_jobs(load_id) state_jobs = [] for state, jobs in info.items(): @@ -413,7 +414,7 @@ def is_package_completed(self, load_id: str) -> bool: # def import_job( - self, load_id: str, job_file_path: str, job_state: TJobState = "new_jobs" + self, load_id: str, job_file_path: str, job_state: TPackageJobState = "new_jobs" ) -> None: """Adds new job by moving the `job_file_path` into `new_jobs` of package `load_id`""" self.storage.atomic_import( @@ -568,12 +569,14 @@ def get_load_package_state_path(self, load_id: str) -> str: # Get package info # - def get_load_package_jobs(self, load_id: str) -> Dict[TJobState, List[ParsedLoadJobFileName]]: + def get_load_package_jobs( + self, load_id: str + ) -> Dict[TPackageJobState, List[ParsedLoadJobFileName]]: """Gets all jobs in a package and returns them as lists assigned to a particular state.""" package_path = self.get_package_path(load_id) if not self.storage.has_folder(package_path): raise LoadPackageNotFound(load_id) - all_jobs: Dict[TJobState, List[ParsedLoadJobFileName]] = {} + all_jobs: Dict[TPackageJobState, List[ParsedLoadJobFileName]] = {} for state in WORKING_FOLDERS: jobs: List[ParsedLoadJobFileName] = [] with contextlib.suppress(FileNotFoundError): @@ -616,7 +619,7 @@ def get_load_package_info(self, load_id: str) -> LoadPackageInfo: schema = Schema.from_dict(self._load_schema(load_id)) # read jobs with all statuses - all_job_infos: Dict[TJobState, List[LoadJobInfo]] = {} + all_job_infos: Dict[TPackageJobState, List[LoadJobInfo]] = {} for state, jobs in package_jobs.items(): all_job_infos[state] = [ self._read_job_file_info(load_id, state, job, package_created_at) for job in jobs @@ -643,7 +646,7 @@ def get_job_failed_message(self, load_id: str, job: ParsedLoadJobFileName) -> st return failed_message def job_to_job_info( - self, load_id: str, state: TJobState, job: ParsedLoadJobFileName + self, load_id: str, state: TPackageJobState, job: ParsedLoadJobFileName ) -> LoadJobInfo: """Creates partial job info by converting job object. 
size, mtime and failed message will not be populated""" full_path = os.path.join( @@ -660,7 +663,11 @@ def job_to_job_info( ) def _read_job_file_info( - self, load_id: str, state: TJobState, job: ParsedLoadJobFileName, now: DateTime = None + self, + load_id: str, + state: TPackageJobState, + job: ParsedLoadJobFileName, + now: DateTime = None, ) -> LoadJobInfo: """Creates job info by reading additional props from storage""" failed_message = None @@ -687,8 +694,8 @@ def _read_job_file_info( def _move_job( self, load_id: str, - source_folder: TJobState, - dest_folder: TJobState, + source_folder: TPackageJobState, + dest_folder: TPackageJobState, file_name: str, new_file_name: str = None, ) -> str: @@ -736,8 +743,8 @@ def _job_elapsed_time_seconds(file_path: str, now_ts: float = None) -> float: @staticmethod def filter_jobs_for_table( - all_jobs: Iterable[Tuple[TJobState, ParsedLoadJobFileName]], table_name: str - ) -> Sequence[Tuple[TJobState, ParsedLoadJobFileName]]: + all_jobs: Iterable[Tuple[TPackageJobState, ParsedLoadJobFileName]], table_name: str + ) -> Sequence[Tuple[TPackageJobState, ParsedLoadJobFileName]]: return [job for job in all_jobs if job[1].table_name == table_name] diff --git a/dlt/common/storages/load_storage.py b/dlt/common/storages/load_storage.py index 00e95fbad9..8ac1d74e9a 100644 --- a/dlt/common/storages/load_storage.py +++ b/dlt/common/storages/load_storage.py @@ -17,7 +17,7 @@ LoadPackageInfo, PackageStorage, ParsedLoadJobFileName, - TJobState, + TPackageJobState, TLoadPackageState, TJobFileFormat, ) @@ -141,16 +141,16 @@ def commit_schema_update(self, load_id: str, applied_update: TSchemaTables) -> N """Marks schema update as processed and stores the update that was applied at the destination""" load_path = self.get_normalized_package_path(load_id) schema_update_file = join(load_path, PackageStorage.SCHEMA_UPDATES_FILE_NAME) - processed_schema_update_file = join( + applied_schema_update_file = join( load_path, PackageStorage.APPLIED_SCHEMA_UPDATES_FILE_NAME ) # delete initial schema update self.storage.delete(schema_update_file) # save applied update - self.storage.save(processed_schema_update_file, json.dumps(applied_update)) + self.storage.save(applied_schema_update_file, json.dumps(applied_update)) def import_new_job( - self, load_id: str, job_file_path: str, job_state: TJobState = "new_jobs" + self, load_id: str, job_file_path: str, job_state: TPackageJobState = "new_jobs" ) -> None: """Adds new job by moving the `job_file_path` into `new_jobs` of package `load_id`""" # TODO: use normalize storage and add file type checks diff --git a/dlt/destinations/impl/athena/athena.py b/dlt/destinations/impl/athena/athena.py index 371c1bae22..1429b28240 100644 --- a/dlt/destinations/impl/athena/athena.py +++ b/dlt/destinations/impl/athena/athena.py @@ -46,7 +46,7 @@ from dlt.common.schema.utils import table_schema_has_type from dlt.common.destination import DestinationCapabilitiesContext from dlt.common.destination.reference import LoadJob -from dlt.common.destination.reference import FollowupJob, SupportsStagingDestination +from dlt.common.destination.reference import FollowupJobRequest, SupportsStagingDestination from dlt.common.data_writers.escape import escape_hive_identifier from dlt.destinations.sql_jobs import SqlStagingCopyFollowupJob, SqlMergeFollowupJob @@ -490,7 +490,7 @@ def create_load_job( def _create_append_followup_jobs( self, table_chain: Sequence[TTableSchema] - ) -> List[FollowupJob]: + ) -> List[FollowupJobRequest]: if 
self._is_iceberg_table(self.prepare_load_table(table_chain[0]["name"])): return [ SqlStagingCopyFollowupJob.from_table_chain( @@ -501,7 +501,7 @@ def _create_append_followup_jobs( def _create_replace_followup_jobs( self, table_chain: Sequence[TTableSchema] - ) -> List[FollowupJob]: + ) -> List[FollowupJobRequest]: if self._is_iceberg_table(self.prepare_load_table(table_chain[0]["name"])): return [ SqlStagingCopyFollowupJob.from_table_chain( @@ -510,7 +510,9 @@ def _create_replace_followup_jobs( ] return super()._create_replace_followup_jobs(table_chain) - def _create_merge_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[FollowupJob]: + def _create_merge_followup_jobs( + self, table_chain: Sequence[TTableSchema] + ) -> List[FollowupJobRequest]: return [AthenaMergeJob.from_table_chain(table_chain, self.sql_client)] def _is_iceberg_table(self, table: TTableSchema) -> bool: diff --git a/dlt/destinations/impl/bigquery/bigquery.py b/dlt/destinations/impl/bigquery/bigquery.py index c6bf2e7654..8291415434 100644 --- a/dlt/destinations/impl/bigquery/bigquery.py +++ b/dlt/destinations/impl/bigquery/bigquery.py @@ -16,7 +16,7 @@ from dlt.common.destination import DestinationCapabilitiesContext from dlt.common.destination.reference import ( HasFollowupJobs, - FollowupJob, + FollowupJobRequest, TLoadJobState, RunnableLoadJob, SupportsStagingDestination, @@ -51,7 +51,7 @@ from dlt.destinations.impl.bigquery.configuration import BigQueryClientConfiguration from dlt.destinations.impl.bigquery.sql_client import BigQuerySqlClient, BQ_TERMINAL_REASONS from dlt.destinations.job_client_impl import SqlJobClientWithStaging -from dlt.destinations.job_impl import ReferenceFollowupJob +from dlt.destinations.job_impl import ReferenceFollowupJobRequest from dlt.destinations.sql_jobs import SqlMergeFollowupJob from dlt.destinations.type_mapping import TypeMapper from dlt.destinations.utils import parse_db_data_type_str_with_precision @@ -234,7 +234,9 @@ def __init__( self.sql_client: BigQuerySqlClient = sql_client # type: ignore self.type_mapper = BigQueryTypeMapper(self.capabilities) - def _create_merge_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[FollowupJob]: + def _create_merge_followup_jobs( + self, table_chain: Sequence[TTableSchema] + ) -> List[FollowupJobRequest]: return [BigQueryMergeJob.from_table_chain(table_chain, self.sql_client)] def create_load_job( @@ -433,8 +435,8 @@ def _create_load_job(self, table: TTableSchema, file_path: str) -> bigquery.Load # determine whether we load from local or uri bucket_path = None ext: str = os.path.splitext(file_path)[1][1:] - if ReferenceFollowupJob.is_reference_job(file_path): - bucket_path = ReferenceFollowupJob.resolve_reference(file_path) + if ReferenceFollowupJobRequest.is_reference_job(file_path): + bucket_path = ReferenceFollowupJobRequest.resolve_reference(file_path) ext = os.path.splitext(bucket_path)[1][1:] # Select a correct source format diff --git a/dlt/destinations/impl/clickhouse/clickhouse.py b/dlt/destinations/impl/clickhouse/clickhouse.py index 5bd34e0e0d..5f17a5a18c 100644 --- a/dlt/destinations/impl/clickhouse/clickhouse.py +++ b/dlt/destinations/impl/clickhouse/clickhouse.py @@ -20,7 +20,7 @@ TLoadJobState, HasFollowupJobs, RunnableLoadJob, - FollowupJob, + FollowupJobRequest, LoadJob, ) from dlt.common.schema import Schema, TColumnSchema @@ -52,7 +52,7 @@ SqlJobClientBase, SqlJobClientWithStaging, ) -from dlt.destinations.job_impl import ReferenceFollowupJob, FinalizedLoadJobWithFollowupJobs +from 
dlt.destinations.job_impl import ReferenceFollowupJobRequest, FinalizedLoadJobWithFollowupJobs from dlt.destinations.sql_jobs import SqlMergeFollowupJob from dlt.destinations.type_mapping import TypeMapper @@ -141,8 +141,8 @@ def run(self) -> None: bucket_path = None file_name = self._file_name - if ReferenceFollowupJob.is_reference_job(self._file_path): - bucket_path = ReferenceFollowupJob.resolve_reference(self._file_path) + if ReferenceFollowupJobRequest.is_reference_job(self._file_path): + bucket_path = ReferenceFollowupJobRequest.resolve_reference(self._file_path) file_name = FileStorage.get_file_name_from_file_path(bucket_path) bucket_url = urlparse(bucket_path) bucket_scheme = bucket_url.scheme @@ -288,7 +288,9 @@ def __init__( self.active_hints = deepcopy(HINT_TO_CLICKHOUSE_ATTR) self.type_mapper = ClickHouseTypeMapper(self.capabilities) - def _create_merge_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[FollowupJob]: + def _create_merge_followup_jobs( + self, table_chain: Sequence[TTableSchema] + ) -> List[FollowupJobRequest]: return [ClickHouseMergeJob.from_table_chain(table_chain, self.sql_client)] def _get_column_def_sql(self, c: TColumnSchema, table_format: TTableFormat = None) -> str: diff --git a/dlt/destinations/impl/databricks/databricks.py b/dlt/destinations/impl/databricks/databricks.py index 0a203c21b6..2f23e88ea0 100644 --- a/dlt/destinations/impl/databricks/databricks.py +++ b/dlt/destinations/impl/databricks/databricks.py @@ -5,7 +5,7 @@ from dlt.common.destination import DestinationCapabilitiesContext from dlt.common.destination.reference import ( HasFollowupJobs, - FollowupJob, + FollowupJobRequest, TLoadJobState, RunnableLoadJob, CredentialsConfiguration, @@ -31,7 +31,7 @@ from dlt.destinations.impl.databricks.configuration import DatabricksClientConfiguration from dlt.destinations.impl.databricks.sql_client import DatabricksSqlClient from dlt.destinations.sql_jobs import SqlMergeFollowupJob -from dlt.destinations.job_impl import ReferenceFollowupJob +from dlt.destinations.job_impl import ReferenceFollowupJobRequest from dlt.destinations.type_mapping import TypeMapper @@ -121,8 +121,8 @@ def run(self) -> None: staging_credentials = self._staging_config.credentials # extract and prepare some vars bucket_path = orig_bucket_path = ( - ReferenceFollowupJob.resolve_reference(self._file_path) - if ReferenceFollowupJob.is_reference_job(self._file_path) + ReferenceFollowupJobRequest.resolve_reference(self._file_path) + if ReferenceFollowupJobRequest.is_reference_job(self._file_path) else "" ) file_name = ( @@ -279,7 +279,9 @@ def create_load_job( ) return job - def _create_merge_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[FollowupJob]: + def _create_merge_followup_jobs( + self, table_chain: Sequence[TTableSchema] + ) -> List[FollowupJobRequest]: return [DatabricksMergeJob.from_table_chain(table_chain, self.sql_client)] def _make_add_column_sql( diff --git a/dlt/destinations/impl/dremio/dremio.py b/dlt/destinations/impl/dremio/dremio.py index 3611665f6c..68a3fedc31 100644 --- a/dlt/destinations/impl/dremio/dremio.py +++ b/dlt/destinations/impl/dremio/dremio.py @@ -7,7 +7,7 @@ TLoadJobState, RunnableLoadJob, SupportsStagingDestination, - FollowupJob, + FollowupJobRequest, LoadJob, ) from dlt.common.schema import TColumnSchema, Schema, TTableSchemaColumns @@ -19,7 +19,7 @@ from dlt.destinations.impl.dremio.sql_client import DremioSqlClient from dlt.destinations.job_client_impl import SqlJobClientWithStaging from dlt.destinations.job_impl 
import FinalizedLoadJobWithFollowupJobs -from dlt.destinations.job_impl import ReferenceFollowupJob +from dlt.destinations.job_impl import ReferenceFollowupJobRequest from dlt.destinations.sql_jobs import SqlMergeFollowupJob from dlt.destinations.type_mapping import TypeMapper from dlt.destinations.sql_client import SqlClientBase @@ -101,8 +101,8 @@ def run(self) -> None: # extract and prepare some vars bucket_path = ( - ReferenceFollowupJob.resolve_reference(self._file_path) - if ReferenceFollowupJob.is_reference_job(self._file_path) + ReferenceFollowupJobRequest.resolve_reference(self._file_path) + if ReferenceFollowupJobRequest.is_reference_job(self._file_path) else "" ) @@ -201,7 +201,9 @@ def _get_column_def_sql(self, c: TColumnSchema, table_format: TTableFormat = Non f"{name} {self.type_mapper.to_db_type(c)} {self._gen_not_null(c.get('nullable', True))}" ) - def _create_merge_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[FollowupJob]: + def _create_merge_followup_jobs( + self, table_chain: Sequence[TTableSchema] + ) -> List[FollowupJobRequest]: return [DremioMergeJob.from_table_chain(table_chain, self.sql_client)] def _make_add_column_sql( diff --git a/dlt/destinations/impl/dummy/configuration.py b/dlt/destinations/impl/dummy/configuration.py index 7bc1d9e943..023b88e51a 100644 --- a/dlt/destinations/impl/dummy/configuration.py +++ b/dlt/destinations/impl/dummy/configuration.py @@ -25,7 +25,7 @@ class DummyClientConfiguration(DestinationClientConfiguration): retry_prob: float = 0.0 """probability of job retry""" completed_prob: float = 0.0 - """probablibitly of successful job completion""" + """probability of successful job completion""" exception_prob: float = 0.0 """probability of exception transient exception when running job""" timeout: float = 10.0 diff --git a/dlt/destinations/impl/dummy/dummy.py b/dlt/destinations/impl/dummy/dummy.py index 7d406c969f..49b55ec65d 100644 --- a/dlt/destinations/impl/dummy/dummy.py +++ b/dlt/destinations/impl/dummy/dummy.py @@ -14,6 +14,7 @@ ) import os import time +from dlt.common.metrics import LoadJobMetrics from dlt.common.pendulum import pendulum from dlt.common.schema import Schema, TTableSchema, TSchemaTables from dlt.common.storages import FileStorage @@ -25,7 +26,7 @@ ) from dlt.common.destination.reference import ( HasFollowupJobs, - FollowupJob, + FollowupJobRequest, SupportsStagingDestination, TLoadJobState, RunnableLoadJob, @@ -37,10 +38,9 @@ from dlt.destinations.exceptions import ( LoadJobNotExistsException, - LoadJobInvalidStateTransitionException, ) from dlt.destinations.impl.dummy.configuration import DummyClientConfiguration -from dlt.destinations.job_impl import ReferenceFollowupJob +from dlt.destinations.job_impl import ReferenceFollowupJobRequest class LoadDummyBaseJob(RunnableLoadJob): @@ -78,18 +78,25 @@ def run(self) -> None: c_r = random.random() if self.config.retry_prob >= c_r: # this will make the job go to a retry state - raise DestinationTransientException("a random retry occured") + raise DestinationTransientException("a random retry occurred") # fail prob c_r = random.random() if self.config.fail_prob >= c_r: # this will make the the job go to a failed state - raise DestinationTerminalException("a random fail occured") + raise DestinationTerminalException("a random fail occurred") time.sleep(0.1) + def metrics(self) -> Optional[LoadJobMetrics]: + m = super().metrics() + # add remote uri if there's followup job + if self.config.create_followup_jobs: + m = m._replace(remote_uri=self._file_name) + 
return m -class DummyFollowupJob(ReferenceFollowupJob): + +class DummyFollowupJobRequest(ReferenceFollowupJobRequest): def __init__( self, original_file_name: str, remote_paths: List[str], config: DummyClientConfiguration ) -> None: @@ -100,9 +107,9 @@ def __init__( class LoadDummyJob(LoadDummyBaseJob, HasFollowupJobs): - def create_followup_jobs(self, final_state: TLoadJobState) -> List[FollowupJob]: + def create_followup_jobs(self, final_state: TLoadJobState) -> List[FollowupJobRequest]: if self.config.create_followup_jobs and final_state == "completed": - new_job = DummyFollowupJob( + new_job = DummyFollowupJobRequest( original_file_name=self.file_name(), remote_paths=[self._file_name], config=self.config, @@ -113,8 +120,8 @@ def create_followup_jobs(self, final_state: TLoadJobState) -> List[FollowupJob]: JOBS: Dict[str, LoadDummyBaseJob] = {} -CREATED_FOLLOWUP_JOBS: Dict[str, FollowupJob] = {} -CREATED_TABLE_CHAIN_FOLLOWUP_JOBS: Dict[str, FollowupJob] = {} +CREATED_FOLLOWUP_JOBS: Dict[str, FollowupJobRequest] = {} +CREATED_TABLE_CHAIN_FOLLOWUP_JOBS: Dict[str, FollowupJobRequest] = {} RETRIED_JOBS: Dict[str, LoadDummyBaseJob] = {} @@ -173,7 +180,7 @@ def create_table_chain_completed_followup_jobs( self, table_chain: Sequence[TTableSchema], completed_table_chain_jobs: Optional[Sequence[LoadJobInfo]] = None, - ) -> List[FollowupJob]: + ) -> List[FollowupJobRequest]: """Creates a list of followup jobs that should be executed after a table chain is completed""" # if sql job follow up is configure we schedule a merge job that will always fail @@ -184,7 +191,7 @@ def create_table_chain_completed_followup_jobs( if self.config.create_followup_table_chain_reference_jobs: table_job_paths = [job.file_path for job in completed_table_chain_jobs] file_name = FileStorage.get_file_name_from_file_path(table_job_paths[0]) - job = ReferenceFollowupJob(file_name, table_job_paths) + job = ReferenceFollowupJobRequest(file_name, table_job_paths) CREATED_TABLE_CHAIN_FOLLOWUP_JOBS[job.job_id()] = job return [job] return [] @@ -212,7 +219,7 @@ def __exit__( pass def _create_job(self, job_id: str) -> LoadDummyBaseJob: - if ReferenceFollowupJob.is_reference_job(job_id): + if ReferenceFollowupJobRequest.is_reference_job(job_id): return LoadDummyBaseJob(job_id, config=self.config) else: return LoadDummyJob(job_id, config=self.config) diff --git a/dlt/destinations/impl/filesystem/filesystem.py b/dlt/destinations/impl/filesystem/filesystem.py index f2466f25a2..2e09871ba9 100644 --- a/dlt/destinations/impl/filesystem/filesystem.py +++ b/dlt/destinations/impl/filesystem/filesystem.py @@ -9,6 +9,7 @@ import dlt from dlt.common import logger, time, json, pendulum +from dlt.common.metrics import LoadJobMetrics from dlt.common.storages.fsspec_filesystem import glob_files from dlt.common.typing import DictStrAny from dlt.common.schema import Schema, TSchemaTables, TTableSchema @@ -21,7 +22,7 @@ ) from dlt.common.destination import DestinationCapabilitiesContext from dlt.common.destination.reference import ( - FollowupJob, + FollowupJobRequest, TLoadJobState, RunnableLoadJob, JobClientBase, @@ -34,7 +35,7 @@ ) from dlt.common.destination.exceptions import DestinationUndefinedEntity from dlt.destinations.job_impl import ( - ReferenceFollowupJob, + ReferenceFollowupJobRequest, FinalizedLoadJob, FinalizedLoadJobWithFollowupJobs, ) @@ -87,6 +88,13 @@ def make_remote_path(self) -> str: path_utils.normalize_path_sep(self.pathlib, self.destination_file_name), ) + def make_remote_uri(self) -> str: + return 
self._job_client.make_remote_uri(self.make_remote_path()) + + def metrics(self) -> Optional[LoadJobMetrics]: + m = super().metrics() + return m._replace(remote_uri=self.make_remote_uri()) + class DeltaLoadFilesystemJob(FilesystemLoadJob): def __init__(self, file_path: str) -> None: @@ -95,6 +103,15 @@ def __init__(self, file_path: str) -> None: ) def run(self) -> None: + # pick local filesystem pathlib or posix for buckets + # TODO: since we pass _job_client via run_managed and not set_env_vars it is hard + # to write a handler with those two line below only in FilesystemLoadJob + self.is_local_filesystem = self._job_client.config.protocol == "file" + self.pathlib = os.path if self.is_local_filesystem else posixpath + self.destination_file_name = self._job_client.make_remote_uri( + self._job_client.get_table_dir(self.load_table_name) + ) + from dlt.common.libs.pyarrow import pyarrow as pa from dlt.common.libs.deltalake import ( DeltaTable, @@ -105,15 +122,13 @@ def run(self) -> None: ) # create Arrow dataset from Parquet files - file_paths = ReferenceFollowupJob.resolve_references(self._file_path) + file_paths = ReferenceFollowupJobRequest.resolve_references(self._file_path) arrow_ds = pa.dataset.dataset(file_paths) # create Delta table object - dt_path = self._job_client.make_remote_uri( - self._job_client.get_table_dir(self.load_table_name) - ) + storage_options = _deltalake_storage_options(self._job_client.config) - dt = try_get_deltatable(dt_path, storage_options=storage_options) + dt = try_get_deltatable(self.destination_file_name, storage_options=storage_options) # get partition columns part_cols = get_columns_names_with_prop(self._load_table, "partition") @@ -124,7 +139,7 @@ def run(self) -> None: if dt is None: # create new empty Delta table with schema from Arrow table DeltaTable.create( - table_uri=dt_path, + table_uri=self.destination_file_name, schema=ensure_delta_compatible_arrow_schema(arrow_ds.schema), mode="overwrite", partition_by=part_cols, @@ -160,7 +175,7 @@ def run(self) -> None: else: write_delta_table( - table_or_uri=dt_path if dt is None else dt, + table_or_uri=self.destination_file_name if dt is None else dt, data=arrow_rbr, write_disposition=self._load_table["write_disposition"], partition_by=part_cols, @@ -169,13 +184,13 @@ def run(self) -> None: class FilesystemLoadJobWithFollowup(HasFollowupJobs, FilesystemLoadJob): - def create_followup_jobs(self, final_state: TLoadJobState) -> List[FollowupJob]: + def create_followup_jobs(self, final_state: TLoadJobState) -> List[FollowupJobRequest]: jobs = super().create_followup_jobs(final_state) if self._load_table.get("table_format") == "delta": # delta table jobs only require table chain followup jobs pass elif final_state == "completed": - ref_job = ReferenceFollowupJob( + ref_job = ReferenceFollowupJobRequest( original_file_name=self.file_name(), remote_paths=[self._job_client.make_remote_uri(self.make_remote_path())], ) @@ -369,7 +384,7 @@ def create_load_job( import dlt.common.libs.deltalake # assert dependencies are installed # a reference job for a delta table indicates a table chain followup job - if ReferenceFollowupJob.is_reference_job(file_path): + if ReferenceFollowupJobRequest.is_reference_job(file_path): return DeltaLoadFilesystemJob(file_path) # otherwise just continue return FinalizedLoadJobWithFollowupJobs(file_path) @@ -578,7 +593,7 @@ def create_table_chain_completed_followup_jobs( self, table_chain: Sequence[TTableSchema], completed_table_chain_jobs: Optional[Sequence[LoadJobInfo]] = None, - ) -> 
List[FollowupJob]: + ) -> List[FollowupJobRequest]: assert completed_table_chain_jobs is not None jobs = super().create_table_chain_completed_followup_jobs( table_chain, completed_table_chain_jobs @@ -591,5 +606,5 @@ def create_table_chain_completed_followup_jobs( if job.job_file_info.table_name == table["name"] ] file_name = FileStorage.get_file_name_from_file_path(table_job_paths[0]) - jobs.append(ReferenceFollowupJob(file_name, table_job_paths)) + jobs.append(ReferenceFollowupJobRequest(file_name, table_job_paths)) return jobs diff --git a/dlt/destinations/impl/mssql/mssql.py b/dlt/destinations/impl/mssql/mssql.py index a67423a873..750dc93a10 100644 --- a/dlt/destinations/impl/mssql/mssql.py +++ b/dlt/destinations/impl/mssql/mssql.py @@ -1,7 +1,7 @@ from typing import Dict, Optional, Sequence, List, Any from dlt.common.exceptions import TerminalValueError -from dlt.common.destination.reference import FollowupJob +from dlt.common.destination.reference import FollowupJobRequest from dlt.common.destination import DestinationCapabilitiesContext from dlt.common.schema import TColumnSchema, TColumnHint, Schema from dlt.common.schema.typing import TTableSchema, TColumnType, TTableFormat @@ -160,7 +160,9 @@ def __init__( self.active_hints = HINT_TO_MSSQL_ATTR if self.config.create_indexes else {} self.type_mapper = MsSqlTypeMapper(self.capabilities) - def _create_merge_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[FollowupJob]: + def _create_merge_followup_jobs( + self, table_chain: Sequence[TTableSchema] + ) -> List[FollowupJobRequest]: return [MsSqlMergeJob.from_table_chain(table_chain, self.sql_client)] def _make_add_column_sql( @@ -189,7 +191,7 @@ def _get_column_def_sql(self, c: TColumnSchema, table_format: TTableFormat = Non def _create_replace_followup_jobs( self, table_chain: Sequence[TTableSchema] - ) -> List[FollowupJob]: + ) -> List[FollowupJobRequest]: if self.config.replace_strategy == "staging-optimized": return [MsSqlStagingCopyJob.from_table_chain(table_chain, self.sql_client)] return super()._create_replace_followup_jobs(table_chain) diff --git a/dlt/destinations/impl/postgres/postgres.py b/dlt/destinations/impl/postgres/postgres.py index 5ae5f27a6e..a832bfe07f 100644 --- a/dlt/destinations/impl/postgres/postgres.py +++ b/dlt/destinations/impl/postgres/postgres.py @@ -9,7 +9,7 @@ from dlt.common.destination.reference import ( HasFollowupJobs, RunnableLoadJob, - FollowupJob, + FollowupJobRequest, LoadJob, TLoadJobState, ) @@ -246,7 +246,7 @@ def _get_column_def_sql(self, c: TColumnSchema, table_format: TTableFormat = Non def _create_replace_followup_jobs( self, table_chain: Sequence[TTableSchema] - ) -> List[FollowupJob]: + ) -> List[FollowupJobRequest]: if self.config.replace_strategy == "staging-optimized": return [PostgresStagingCopyJob.from_table_chain(table_chain, self.sql_client)] return super()._create_replace_followup_jobs(table_chain) diff --git a/dlt/destinations/impl/redshift/redshift.py b/dlt/destinations/impl/redshift/redshift.py index 81abd57803..93827c8163 100644 --- a/dlt/destinations/impl/redshift/redshift.py +++ b/dlt/destinations/impl/redshift/redshift.py @@ -14,7 +14,7 @@ from dlt.common.destination.reference import ( - FollowupJob, + FollowupJobRequest, CredentialsConfiguration, SupportsStagingDestination, LoadJob, @@ -33,7 +33,7 @@ from dlt.destinations.job_client_impl import CopyRemoteFileLoadJob from dlt.destinations.impl.postgres.sql_client import Psycopg2SqlClient from dlt.destinations.impl.redshift.configuration import 
RedshiftClientConfiguration -from dlt.destinations.job_impl import ReferenceFollowupJob +from dlt.destinations.job_impl import ReferenceFollowupJobRequest from dlt.destinations.sql_client import SqlClientBase from dlt.destinations.type_mapping import TypeMapper @@ -238,7 +238,9 @@ def __init__( self.config: RedshiftClientConfiguration = config self.type_mapper = RedshiftTypeMapper(self.capabilities) - def _create_merge_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[FollowupJob]: + def _create_merge_followup_jobs( + self, table_chain: Sequence[TTableSchema] + ) -> List[FollowupJobRequest]: return [RedshiftMergeJob.from_table_chain(table_chain, self.sql_client)] def _get_column_def_sql(self, c: TColumnSchema, table_format: TTableFormat = None) -> str: @@ -258,7 +260,7 @@ def create_load_job( """Starts SqlLoadJob for files ending with .sql or returns None to let derived classes to handle their specific jobs""" job = super().create_load_job(table, file_path, load_id, restore) if not job: - assert ReferenceFollowupJob.is_reference_job( + assert ReferenceFollowupJobRequest.is_reference_job( file_path ), "Redshift must use staging to load files" job = RedshiftCopyFileLoadJob( diff --git a/dlt/destinations/impl/snowflake/snowflake.py b/dlt/destinations/impl/snowflake/snowflake.py index 904b524791..8b4eabc961 100644 --- a/dlt/destinations/impl/snowflake/snowflake.py +++ b/dlt/destinations/impl/snowflake/snowflake.py @@ -29,7 +29,7 @@ from dlt.destinations.impl.snowflake.configuration import SnowflakeClientConfiguration from dlt.destinations.impl.snowflake.sql_client import SnowflakeSqlClient from dlt.destinations.impl.snowflake.sql_client import SnowflakeSqlClient -from dlt.destinations.job_impl import ReferenceFollowupJob +from dlt.destinations.job_impl import ReferenceFollowupJobRequest from dlt.destinations.type_mapping import TypeMapper @@ -98,11 +98,11 @@ def run(self) -> None: self._sql_client = self._job_client.sql_client # resolve reference - is_local_file = not ReferenceFollowupJob.is_reference_job(self._file_path) + is_local_file = not ReferenceFollowupJobRequest.is_reference_job(self._file_path) file_url = ( self._file_path if is_local_file - else ReferenceFollowupJob.resolve_reference(self._file_path) + else ReferenceFollowupJobRequest.resolve_reference(self._file_path) ) # take file name file_name = FileStorage.get_file_name_from_file_path(file_url) diff --git a/dlt/destinations/impl/synapse/synapse.py b/dlt/destinations/impl/synapse/synapse.py index d1b38f73bd..e43e2a6dfa 100644 --- a/dlt/destinations/impl/synapse/synapse.py +++ b/dlt/destinations/impl/synapse/synapse.py @@ -5,7 +5,7 @@ from urllib.parse import urlparse, urlunparse from dlt.common.destination import DestinationCapabilitiesContext -from dlt.common.destination.reference import SupportsStagingDestination, FollowupJob, LoadJob +from dlt.common.destination.reference import SupportsStagingDestination, FollowupJobRequest, LoadJob from dlt.common.schema import TTableSchema, TColumnSchema, Schema, TColumnHint from dlt.common.schema.utils import ( @@ -19,7 +19,7 @@ AzureServicePrincipalCredentialsWithoutDefaults, ) -from dlt.destinations.job_impl import ReferenceFollowupJob +from dlt.destinations.job_impl import ReferenceFollowupJobRequest from dlt.destinations.sql_client import SqlClientBase from dlt.destinations.job_client_impl import ( SqlJobClientBase, @@ -131,7 +131,7 @@ def _get_columstore_valid_column(self, c: TColumnSchema) -> TColumnSchema: def _create_replace_followup_jobs( self, table_chain: 
Sequence[TTableSchema] - ) -> List[FollowupJob]: + ) -> List[FollowupJobRequest]: return SqlJobClientBase._create_replace_followup_jobs(self, table_chain) def prepare_load_table(self, table_name: str, staging: bool = False) -> TTableSchema: @@ -163,7 +163,7 @@ def create_load_job( ) -> LoadJob: job = super().create_load_job(table, file_path, load_id, restore) if not job: - assert ReferenceFollowupJob.is_reference_job( + assert ReferenceFollowupJobRequest.is_reference_job( file_path ), "Synapse must use staging to load files" job = SynapseCopyFileLoadJob( diff --git a/dlt/destinations/job_client_impl.py b/dlt/destinations/job_client_impl.py index 7fdd979c5d..92132dd751 100644 --- a/dlt/destinations/job_client_impl.py +++ b/dlt/destinations/job_client_impl.py @@ -42,7 +42,7 @@ WithStateSync, DestinationClientConfiguration, DestinationClientDwhConfiguration, - FollowupJob, + FollowupJobRequest, WithStagingDataset, RunnableLoadJob, LoadJob, @@ -53,7 +53,7 @@ from dlt.destinations.exceptions import DatabaseUndefinedRelation from dlt.destinations.job_impl import ( - ReferenceFollowupJob, + ReferenceFollowupJobRequest, ) from dlt.destinations.sql_jobs import SqlMergeFollowupJob, SqlStagingCopyFollowupJob from dlt.destinations.typing import TNativeConn @@ -118,7 +118,7 @@ def __init__( super().__init__(file_path) self._job_client: "SqlJobClientBase" = None self._staging_credentials = staging_credentials - self._bucket_path = ReferenceFollowupJob.resolve_reference(file_path) + self._bucket_path = ReferenceFollowupJobRequest.resolve_reference(file_path) class SqlJobClientBase(JobClientBase, WithStateSync): @@ -216,16 +216,18 @@ def should_truncate_table_before_load(self, table: TTableSchema) -> bool: def _create_append_followup_jobs( self, table_chain: Sequence[TTableSchema] - ) -> List[FollowupJob]: + ) -> List[FollowupJobRequest]: return [] - def _create_merge_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[FollowupJob]: + def _create_merge_followup_jobs( + self, table_chain: Sequence[TTableSchema] + ) -> List[FollowupJobRequest]: return [SqlMergeFollowupJob.from_table_chain(table_chain, self.sql_client)] def _create_replace_followup_jobs( self, table_chain: Sequence[TTableSchema] - ) -> List[FollowupJob]: - jobs: List[FollowupJob] = [] + ) -> List[FollowupJobRequest]: + jobs: List[FollowupJobRequest] = [] if self.config.replace_strategy in ["insert-from-staging", "staging-optimized"]: jobs.append( SqlStagingCopyFollowupJob.from_table_chain( @@ -238,7 +240,7 @@ def create_table_chain_completed_followup_jobs( self, table_chain: Sequence[TTableSchema], completed_table_chain_jobs: Optional[Sequence[LoadJobInfo]] = None, - ) -> List[FollowupJob]: + ) -> List[FollowupJobRequest]: """Creates a list of followup jobs for merge write disposition and staging replace strategies""" jobs = super().create_table_chain_completed_followup_jobs( table_chain, completed_table_chain_jobs diff --git a/dlt/destinations/job_impl.py b/dlt/destinations/job_impl.py index 41c939f482..1f54913064 100644 --- a/dlt/destinations/job_impl.py +++ b/dlt/destinations/job_impl.py @@ -1,7 +1,7 @@ from abc import ABC, abstractmethod import os import tempfile # noqa: 251 -from typing import Dict, Iterable, List +from typing import Dict, Iterable, List, Optional from dlt.common.json import json from dlt.common.destination.reference import ( @@ -9,9 +9,10 @@ TLoadJobState, RunnableLoadJob, JobClientBase, - FollowupJob, + FollowupJobRequest, LoadJob, ) +from dlt.common.metrics import LoadJobMetrics from 
dlt.common.storages.load_package import commit_load_package_state from dlt.common.schema import Schema, TTableSchema from dlt.common.storages import FileStorage @@ -56,7 +57,7 @@ class FinalizedLoadJobWithFollowupJobs(FinalizedLoadJob, HasFollowupJobs): pass -class FollowupJobImpl(FollowupJob): +class FollowupJobRequestImpl(FollowupJobRequest): """ Class to create a new loadjob, not stateful and not runnable """ @@ -79,7 +80,7 @@ def job_id(self) -> str: return self._parsed_file_name.job_id() -class ReferenceFollowupJob(FollowupJobImpl): +class ReferenceFollowupJobRequest(FollowupJobRequestImpl): def __init__(self, original_file_name: str, remote_paths: List[str]) -> None: file_name = os.path.splitext(original_file_name)[0] + "." + "reference" self._remote_paths = remote_paths @@ -98,7 +99,7 @@ def resolve_references(file_path: str) -> List[str]: @staticmethod def resolve_reference(file_path: str) -> str: - refs = ReferenceFollowupJob.resolve_references(file_path) + refs = ReferenceFollowupJobRequest.resolve_references(file_path) assert len(refs) == 1 return refs[0] diff --git a/dlt/destinations/sql_jobs.py b/dlt/destinations/sql_jobs.py index a1e38a2c20..d5f005ee9a 100644 --- a/dlt/destinations/sql_jobs.py +++ b/dlt/destinations/sql_jobs.py @@ -21,7 +21,7 @@ from dlt.common.utils import uniq_id from dlt.common.destination.capabilities import DestinationCapabilitiesContext from dlt.destinations.exceptions import MergeDispositionException -from dlt.destinations.job_impl import FollowupJobImpl +from dlt.destinations.job_impl import FollowupJobRequestImpl from dlt.destinations.sql_client import SqlClientBase from dlt.common.destination.exceptions import DestinationTransientException @@ -45,7 +45,7 @@ def __init__(self, original_exception: Exception, table_chain: Sequence[TTableSc ) -class SqlFollowupJob(FollowupJobImpl): +class SqlFollowupJob(FollowupJobRequestImpl): """Sql base job for jobs that rely on the whole tablechain""" @classmethod @@ -54,7 +54,7 @@ def from_table_chain( table_chain: Sequence[TTableSchema], sql_client: SqlClientBase[Any], params: Optional[SqlJobParams] = None, - ) -> FollowupJobImpl: + ) -> FollowupJobRequestImpl: """Generates a list of sql statements, that will be executed by the sql client when the job is executed in the loader. The `table_chain` contains a list schemas of a tables with parent-child relationship, ordered by the ancestry (the root of the tree is first on the list). 
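To keep the `FollowupJob` → `FollowupJobRequest` rename above easy to follow: a reference followup job request is essentially a small descriptor file (ending in `.reference`) that lists the remote paths an earlier job produced, so the loader can pick that work up later. Below is a minimal, self-contained sketch of that pattern in plain Python with hypothetical helper names (`write_reference_job`, `resolve_reference`); it only illustrates the idea and is not dlt's actual implementation.

```py
# Simplified sketch of the "reference followup job" pattern (illustrative only).
import os
import tempfile
from typing import List


def write_reference_job(original_file_name: str, remote_paths: List[str], folder: str) -> str:
    # mirror the naming scheme above: swap the extension for ".reference"
    file_name = os.path.splitext(original_file_name)[0] + ".reference"
    file_path = os.path.join(folder, file_name)
    with open(file_path, "w", encoding="utf-8") as f:
        f.write("\n".join(remote_paths))
    return file_path


def resolve_references(file_path: str) -> List[str]:
    # read back all remote paths stored in the reference file
    with open(file_path, "r", encoding="utf-8") as f:
        return f.read().splitlines()


def resolve_reference(file_path: str) -> str:
    # single-reference variant: exactly one remote path is expected
    refs = resolve_references(file_path)
    assert len(refs) == 1
    return refs[0]


if __name__ == "__main__":
    folder = tempfile.mkdtemp()
    path = write_reference_job(
        "issues.1234.0.parquet", ["s3://bucket/dataset/issues/1234.parquet"], folder
    )
    print(resolve_reference(path))  # s3://bucket/dataset/issues/1234.parquet
```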
diff --git a/dlt/extract/extractors.py b/dlt/extract/extractors.py index 4a1de2517d..8a91dd7477 100644 --- a/dlt/extract/extractors.py +++ b/dlt/extract/extractors.py @@ -4,9 +4,9 @@ from dlt.common.configuration import known_sections, resolve_configuration, with_config from dlt.common import logger from dlt.common.configuration.specs import BaseConfiguration, configspec -from dlt.common.data_writers import DataWriterMetrics from dlt.common.destination.capabilities import DestinationCapabilitiesContext from dlt.common.exceptions import MissingDependencyException +from dlt.common.metrics import DataWriterMetrics from dlt.common.runtime.collector import Collector, NULL_COLLECTOR from dlt.common.typing import TDataItems, TDataItem, TLoaderFileFormat from dlt.common.schema import Schema, utils diff --git a/dlt/extract/storage.py b/dlt/extract/storage.py index de777ad60e..395366b09e 100644 --- a/dlt/extract/storage.py +++ b/dlt/extract/storage.py @@ -1,7 +1,8 @@ import os from typing import Dict, List -from dlt.common.data_writers import TDataItemFormat, DataWriterMetrics, DataWriter, FileWriterSpec +from dlt.common.data_writers import TDataItemFormat, DataWriter, FileWriterSpec +from dlt.common.metrics import DataWriterMetrics from dlt.common.schema import Schema from dlt.common.storages import ( NormalizeStorageConfiguration, diff --git a/dlt/load/load.py b/dlt/load/load.py index 99a12d69ee..f084c9d3d9 100644 --- a/dlt/load/load.py +++ b/dlt/load/load.py @@ -5,12 +5,17 @@ import os from dlt.common import logger +from dlt.common.metrics import LoadJobMetrics from dlt.common.runtime.signals import sleep from dlt.common.configuration import with_config, known_sections from dlt.common.configuration.accessors import config from dlt.common.pipeline import LoadInfo, LoadMetrics, SupportsPipeline, WithStepInfo from dlt.common.schema.utils import get_top_level_table -from dlt.common.storages.load_storage import LoadPackageInfo, ParsedLoadJobFileName, TJobState +from dlt.common.storages.load_storage import ( + LoadPackageInfo, + ParsedLoadJobFileName, + TPackageJobState, +) from dlt.common.storages.load_package import ( LoadPackageStateInjectableContext, load_package as current_load_package, @@ -29,7 +34,7 @@ Destination, RunnableLoadJob, LoadJob, - FollowupJob, + FollowupJobRequest, TLoadJobState, DestinationClientConfiguration, SupportsStagingDestination, @@ -84,6 +89,7 @@ def __init__( self.pool = NullExecutor() self.load_storage: LoadStorage = self.create_storage(is_storage_owner) self._loaded_packages: List[LoadPackageInfo] = [] + self._job_metrics: Dict[str, LoadJobMetrics] = {} self._run_loop_sleep_duration: float = ( 1.0 # amount of time to sleep between querying completed jobs ) @@ -308,7 +314,7 @@ def create_followup_jobs( where they will be picked up for execution """ - jobs: List[FollowupJob] = [] + jobs: List[FollowupJobRequest] = [] if isinstance(starting_job, HasFollowupJobs): # check for merge jobs only for jobs executing on the destination, the staging destination jobs must be excluded # NOTE: we may move that logic to the interface @@ -392,6 +398,11 @@ def complete_jobs( # create followup jobs self.create_followup_jobs(load_id, state, job, schema) + # preserve metrics + metrics = job.metrics() + if metrics: + self._job_metrics[job.job_id()] = metrics + # try to get exception message from job failed_message = job.exception() self.load_storage.normalized_packages.fail_job( @@ -423,7 +434,7 @@ def complete_jobs( if r_c > 0 and r_c % self.config.raise_on_max_retries == 0: 
pending_exception = LoadClientJobRetry( load_id, - job.job_file_info().job_id(), + job.job_id(), r_c, self.config.raise_on_max_retries, retry_message=retry_message, @@ -431,6 +442,15 @@ def complete_jobs( elif state == "completed": # create followup jobs self.create_followup_jobs(load_id, state, job, schema) + + # preserve metrics + # TODO: metrics should be persisted. this is different vs. all other steps because load step + # may be restarted in the middle of execution + # NOTE: we could use package state but cases with 100k jobs must be tested + metrics = job.metrics() + if metrics: + self._job_metrics[job.job_id()] = metrics + # move to completed folder after followup jobs are created # in case of exception when creating followup job, the loader will retry operation and try to complete again self.load_storage.normalized_packages.complete_job(load_id, job.file_name()) @@ -464,14 +484,18 @@ def complete_package(self, load_id: str, schema: Schema, aborted: bool = False) self.load_storage.complete_load_package(load_id, aborted) # collect package info self._loaded_packages.append(self.load_storage.get_load_package_info(load_id)) - self._step_info_complete_load_id(load_id, metrics={"started_at": None, "finished_at": None}) + # TODO: job metrics must be persisted + self._step_info_complete_load_id( + load_id, + metrics={"started_at": None, "finished_at": None, "job_metrics": self._job_metrics}, + ) # delete jobs only now self.load_storage.maybe_remove_completed_jobs(load_id) logger.info( f"All jobs completed, archiving package {load_id} with aborted set to {aborted}" ) - def update_load_package_info(self, load_id: str) -> None: + def init_jobs_counter(self, load_id: str) -> None: # update counter we only care about the jobs that are scheduled to be loaded package_jobs = self.load_storage.normalized_packages.get_load_package_jobs(load_id) total_jobs = reduce(lambda p, c: p + len(c), package_jobs.values(), 0) @@ -492,7 +516,7 @@ def load_single_package(self, load_id: str, schema: Schema) -> None: dropped_tables = current_load_package()["state"].get("dropped_tables", []) truncated_tables = current_load_package()["state"].get("truncated_tables", []) - self.update_load_package_info(load_id) + self.init_jobs_counter(load_id) # initialize analytical storage ie. 
create dataset required by passed schema with self.get_destination_client(schema) as job_client: @@ -606,7 +630,8 @@ def run(self, pool: Optional[Executor]) -> TRunMetrics: ) ): # the same load id may be processed across multiple runs - if not self.current_load_id: + if self.current_load_id is None: + self._job_metrics = {} self._step_info_start_load_id(load_id) self.load_single_package(load_id, schema) diff --git a/dlt/load/utils.py b/dlt/load/utils.py index 9750f89d4b..741c01f249 100644 --- a/dlt/load/utils.py +++ b/dlt/load/utils.py @@ -2,7 +2,7 @@ from itertools import groupby from dlt.common import logger -from dlt.common.storages.load_package import LoadJobInfo, PackageStorage, TJobState +from dlt.common.storages.load_package import LoadJobInfo, PackageStorage, TPackageJobState from dlt.common.schema.utils import ( fill_hints_from_parent_and_clone_table, get_child_tables, @@ -19,7 +19,7 @@ def get_completed_table_chain( schema: Schema, - all_jobs: Iterable[Tuple[TJobState, ParsedLoadJobFileName]], + all_jobs: Iterable[Tuple[TPackageJobState, ParsedLoadJobFileName]], top_merged_table: TTableSchema, being_completed_job_id: str = None, ) -> List[TTableSchema]: diff --git a/dlt/normalize/items_normalizers.py b/dlt/normalize/items_normalizers.py index 5f84d57d7a..650d10c268 100644 --- a/dlt/normalize/items_normalizers.py +++ b/dlt/normalize/items_normalizers.py @@ -3,9 +3,9 @@ from dlt.common import logger from dlt.common.json import json -from dlt.common.data_writers import DataWriterMetrics from dlt.common.data_writers.writers import ArrowToObjectAdapter from dlt.common.json import custom_pua_decode, may_have_pua +from dlt.common.metrics import DataWriterMetrics from dlt.common.normalizers.json.relational import DataItemNormalizer as RelationalNormalizer from dlt.common.runtime import signals from dlt.common.schema.typing import TSchemaEvolutionMode, TTableSchemaColumns, TSchemaContractDict diff --git a/dlt/normalize/normalize.py b/dlt/normalize/normalize.py index e80931605c..3df060b141 100644 --- a/dlt/normalize/normalize.py +++ b/dlt/normalize/normalize.py @@ -4,10 +4,10 @@ from concurrent.futures import Future, Executor from dlt.common import logger +from dlt.common.metrics import DataWriterMetrics from dlt.common.runtime.signals import sleep from dlt.common.configuration import with_config, known_sections from dlt.common.configuration.accessors import config -from dlt.common.data_writers import DataWriterMetrics from dlt.common.data_writers.writers import EMPTY_DATA_WRITER_METRICS from dlt.common.runners import TRunMetrics, Runnable, NullExecutor from dlt.common.runtime import signals diff --git a/dlt/normalize/worker.py b/dlt/normalize/worker.py index 10d0a00eb1..b8969f64a3 100644 --- a/dlt/normalize/worker.py +++ b/dlt/normalize/worker.py @@ -4,12 +4,12 @@ from dlt.common.configuration.container import Container from dlt.common.data_writers import ( DataWriter, - DataWriterMetrics, create_import_spec, resolve_best_writer_spec, get_best_writer_spec, is_native_writer, ) +from dlt.common.metrics import DataWriterMetrics from dlt.common.utils import chunks from dlt.common.schema.typing import TStoredSchema, TTableSchema from dlt.common.storages import ( diff --git a/dlt/pipeline/trace.py b/dlt/pipeline/trace.py index 29770966a6..2f857e5fd5 100644 --- a/dlt/pipeline/trace.py +++ b/dlt/pipeline/trace.py @@ -168,7 +168,7 @@ def asdict(self) -> DictStrAny: """A dictionary representation of PipelineTrace that can be loaded with `dlt`""" d = self._asdict() # run step is the same as load 
step
-        d["steps"] = [step.asdict() for step in self.steps]  # if step.step != "run"
+        d["steps"] = [step.asdict() for step in self.steps if step.step != "run"]
         return d
 
     @property
diff --git a/docs/website/docs/general-usage/pipeline.md b/docs/website/docs/general-usage/pipeline.md
index f21d6f0686..40f9419bc2 100644
--- a/docs/website/docs/general-usage/pipeline.md
+++ b/docs/website/docs/general-usage/pipeline.md
@@ -85,6 +85,20 @@ You can inspect stored artifacts using the command
 > 💡 You can attach `Pipeline` instance to an existing working folder, without creating a new
 > pipeline with `dlt.attach`.
 
+### Separate working environments with `pipelines_dir`
+You can run several pipelines with the same name but with different configurations, for example to target development / staging / production environments.
+Set the `pipelines_dir` argument to store all the working folders in a specific place. For example:
+```py
+import os
+import dlt
+from dlt.common.pipeline import get_dlt_pipelines_dir
+
+dev_pipelines_dir = os.path.join(get_dlt_pipelines_dir(), "dev")
+pipeline = dlt.pipeline(destination="duckdb", dataset_name="sequence", pipelines_dir=dev_pipelines_dir)
+```
+This stores the pipeline working folder in `~/.dlt/pipelines/dev/`. Note that you need to pass this `~/.dlt/pipelines/dev/`
+path to all CLI commands to get info/trace for that pipeline.
+
 ## Do experiments with dev mode
 
 If you [create a new pipeline script](../walkthroughs/create-a-pipeline.md) you will be
diff --git a/tests/common/data_writers/test_data_writers.py b/tests/common/data_writers/test_data_writers.py
index 9b4e61a2f7..03723b7b55 100644
--- a/tests/common/data_writers/test_data_writers.py
+++ b/tests/common/data_writers/test_data_writers.py
@@ -5,6 +5,7 @@
 
 from dlt.common import pendulum, json
 from dlt.common.data_writers.exceptions import DataWriterNotFound, SpecLookupFailed
+from dlt.common.metrics import DataWriterMetrics
 from dlt.common.typing import AnyFun
 
 from dlt.common.data_writers.escape import (
@@ -25,7 +26,6 @@
     ArrowToTypedJsonlListWriter,
     CsvWriter,
     DataWriter,
-    DataWriterMetrics,
     EMPTY_DATA_WRITER_METRICS,
     ImportFileWriter,
     InsertValuesWriter,
@@ -180,12 +180,13 @@ def test_data_writer_metrics_add() -> None:
     metrics = DataWriterMetrics("file", 10, 100, now, now + 10)
     add_m: DataWriterMetrics = metrics + EMPTY_DATA_WRITER_METRICS  # type: ignore[assignment]
     assert add_m == DataWriterMetrics("", 10, 100, now, now + 10)
-    assert metrics + metrics == DataWriterMetrics("", 20, 200, now, now + 10)
+    # will keep "file" because it is in both
+    assert metrics + metrics == DataWriterMetrics("file", 20, 200, now, now + 10)
     assert sum((metrics, metrics, metrics), EMPTY_DATA_WRITER_METRICS) == DataWriterMetrics(
         "", 30, 300, now, now + 10
     )
     # time range extends when added
-    add_m = metrics + DataWriterMetrics("file", 99, 120, now - 10, now + 20)  # type: ignore[assignment]
+    add_m = metrics + DataWriterMetrics("fileX", 99, 120, now - 10, now + 20)  # type: ignore[assignment]
     assert add_m == DataWriterMetrics("", 109, 220, now - 10, now + 20)
 
 
diff --git a/tests/common/storages/utils.py b/tests/common/storages/utils.py
index baac3b7af5..a1334ba1da 100644
--- a/tests/common/storages/utils.py
+++ b/tests/common/storages/utils.py
@@ -16,7 +16,7 @@
     LoadStorageConfiguration,
     FilesystemConfiguration,
     LoadPackageInfo,
-    TJobState,
+    TPackageJobState,
     LoadStorage,
 )
 from dlt.common.storages import DataItemStorage, FileStorage
@@ -195,7 +195,7 @@ def assert_package_info(
     storage: LoadStorage,
     load_id: str,
     package_state: str,
-    job_state: TJobState,
+    
job_state: TPackageJobState, jobs_count: int = 1, ) -> LoadPackageInfo: package_info = storage.get_load_package_info(load_id) diff --git a/tests/extract/data_writers/test_buffered_writer.py b/tests/extract/data_writers/test_buffered_writer.py index 5cad5a35b9..205e3f83dc 100644 --- a/tests/extract/data_writers/test_buffered_writer.py +++ b/tests/extract/data_writers/test_buffered_writer.py @@ -7,12 +7,12 @@ from dlt.common.data_writers.exceptions import BufferedDataWriterClosed from dlt.common.data_writers.writers import ( DataWriter, - DataWriterMetrics, InsertValuesWriter, JsonlWriter, ALL_WRITERS, ) from dlt.common.destination.capabilities import TLoaderFileFormat, DestinationCapabilitiesContext +from dlt.common.metrics import DataWriterMetrics from dlt.common.schema.utils import new_column from dlt.common.storages.file_storage import FileStorage diff --git a/tests/extract/data_writers/test_data_item_storage.py b/tests/extract/data_writers/test_data_item_storage.py index feda51c229..558eeec79e 100644 --- a/tests/extract/data_writers/test_data_item_storage.py +++ b/tests/extract/data_writers/test_data_item_storage.py @@ -3,8 +3,9 @@ import pytest from dlt.common.configuration.container import Container -from dlt.common.data_writers.writers import DataWriterMetrics, DataWriter +from dlt.common.data_writers.writers import DataWriter from dlt.common.destination.capabilities import DestinationCapabilitiesContext +from dlt.common.metrics import DataWriterMetrics from dlt.common.schema.utils import new_column from dlt.common.storages.data_item_storage import DataItemStorage diff --git a/tests/load/pipeline/test_filesystem_pipeline.py b/tests/load/pipeline/test_filesystem_pipeline.py index 759f443546..4b8707e989 100644 --- a/tests/load/pipeline/test_filesystem_pipeline.py +++ b/tests/load/pipeline/test_filesystem_pipeline.py @@ -12,6 +12,7 @@ from dlt.common import json from dlt.common import pendulum +from dlt.common.storages.configuration import FilesystemConfiguration from dlt.common.storages.load_package import ParsedLoadJobFileName from dlt.common.utils import uniq_id from dlt.common.exceptions import DependencyVersionException @@ -299,6 +300,17 @@ def data_types(): assert len(rows) == 10 assert_all_data_types_row(rows[0], schema=column_schemas) + # make sure remote_uri is in metrics + metrics = info.metrics[info.loads_ids[0]][0] + # TODO: only final copy job has remote_uri. 
not the initial (empty) job for particular files + # we could implement an empty job for delta that generates correct remote_uri + remote_uri = list(metrics["job_metrics"].values())[-1].remote_uri + assert remote_uri.endswith("data_types") + bucket_uri = destination_config.bucket_url + if FilesystemConfiguration.is_local_path(bucket_uri): + bucket_uri = FilesystemConfiguration.make_file_uri(bucket_uri) + assert remote_uri.startswith(bucket_uri) + # another run should append rows to the table info = pipeline.run(data_types()) assert_load_info(info) @@ -567,6 +579,7 @@ def two_part(): assert dt.metadata().partition_columns == [] +@pytest.mark.essential @pytest.mark.parametrize( "destination_config", destinations_configs( @@ -798,6 +811,51 @@ def parent_delta(): get_delta_tables(pipeline, "non_existing_table") +@pytest.mark.parametrize( + "destination_config", + destinations_configs( + table_format_filesystem_configs=True, + table_format="delta", + bucket_subset=(FILE_BUCKET,), + ), + ids=lambda x: x.name, +) +def test_parquet_to_delta_upgrade(destination_config: DestinationTestConfiguration): + # change the resource to start creating delta tables + from dlt.common.libs.deltalake import get_delta_tables + + @dlt.resource() + def foo(): + yield [{"foo": 1}, {"foo": 2}] + + pipeline = destination_config.setup_pipeline("fs_pipe") + + info = pipeline.run(foo()) + assert_load_info(info) + delta_tables = get_delta_tables(pipeline) + assert set(delta_tables.keys()) == set() + + # drop the pipeline + pipeline.deactivate() + + # redefine the resource + + @dlt.resource(table_format="delta") # type: ignore + def foo(): + yield [{"foo": 1}, {"foo": 2}] + + pipeline = destination_config.setup_pipeline("fs_pipe") + + info = pipeline.run(foo()) + assert_load_info(info) + delta_tables = get_delta_tables(pipeline) + assert set(delta_tables.keys()) == {"foo"} + + # optimize all delta tables to make sure storage is there + for table in delta_tables.values(): + table.vacuum() + + TEST_LAYOUTS = ( "{schema_name}/{table_name}/{load_id}.{file_id}.{ext}", "{schema_name}.{table_name}.{load_id}.{file_id}.{ext}", diff --git a/tests/load/pipeline/test_postgres.py b/tests/load/pipeline/test_postgres.py index a4001b7faa..5cadf701a2 100644 --- a/tests/load/pipeline/test_postgres.py +++ b/tests/load/pipeline/test_postgres.py @@ -42,3 +42,18 @@ def test_postgres_encoded_binary( # print(bytes(data["table"][0]["hash"])) # data in postgres equals unencoded blob assert data["table"][0]["hash"].tobytes() == blob + + +# TODO: uncomment and finalize when we implement encoding for psycopg2 +# @pytest.mark.parametrize( +# "destination_config", +# destinations_configs(default_sql_configs=True, subset=["postgres"]), +# ids=lambda x: x.name, +# ) +# def test_postgres_encoding(destination_config: DestinationTestConfiguration): +# from dlt.destinations.impl.postgres.sql_client import Psycopg2SqlClient +# pipeline = destination_config.setup_pipeline("postgres_" + uniq_id(), dev_mode=True) +# client: Psycopg2SqlClient = pipeline.sql_client() +# # client.credentials.query["encoding"] = "ru" +# with client: +# print(client.native_connection.encoding) diff --git a/tests/load/pipeline/test_stage_loading.py b/tests/load/pipeline/test_stage_loading.py index 7f1427f20f..a760c86526 100644 --- a/tests/load/pipeline/test_stage_loading.py +++ b/tests/load/pipeline/test_stage_loading.py @@ -4,6 +4,7 @@ import dlt, os from dlt.common import json, sleep from copy import deepcopy +from dlt.common.storages.configuration import FilesystemConfiguration from 
dlt.common.utils import uniq_id from dlt.common.schema.typing import TDataType @@ -16,6 +17,9 @@ ) from tests.cases import table_update_and_row +# mark all tests as essential, do not remove +pytestmark = pytest.mark.essential + @dlt.resource( table_name="issues", write_disposition="merge", primary_key="id", merge_key=("node_id", "url") @@ -46,6 +50,18 @@ def test_staging_load(destination_config: DestinationTestConfiguration) -> None: info = pipeline.run(github(), loader_file_format=destination_config.file_format) assert_load_info(info) + # checks if remote_uri is set correctly on copy jobs + metrics = info.metrics[info.loads_ids[0]][0] + for job_metrics in metrics["job_metrics"].values(): + remote_uri = job_metrics.remote_uri + job_ext = os.path.splitext(job_metrics.job_id)[1] + if job_ext not in (".reference", ".sql"): + assert remote_uri.endswith(job_ext) + bucket_uri = destination_config.bucket_url + if FilesystemConfiguration.is_local_path(bucket_uri): + bucket_uri = FilesystemConfiguration.make_file_uri(bucket_uri) + assert remote_uri.startswith(bucket_uri) + package_info = pipeline.get_load_package_info(info.loads_ids[0]) assert package_info.state == "loaded" diff --git a/tests/load/test_dummy_client.py b/tests/load/test_dummy_client.py index b55f4ceece..9f0bca6ac5 100644 --- a/tests/load/test_dummy_client.py +++ b/tests/load/test_dummy_client.py @@ -8,7 +8,8 @@ from dlt.common.exceptions import TerminalException, TerminalValueError from dlt.common.storages import FileStorage, PackageStorage, ParsedLoadJobFileName -from dlt.common.storages.load_package import LoadJobInfo, TJobState +from dlt.common.storages.configuration import FilesystemConfiguration +from dlt.common.storages.load_package import LoadJobInfo, TPackageJobState from dlt.common.storages.load_storage import JobFileFormatUnsupported from dlt.common.destination.reference import RunnableLoadJob, TDestination from dlt.common.schema.utils import ( @@ -32,6 +33,7 @@ from dlt.load.utils import get_completed_table_chain, init_client, _extend_tables_with_table_chain from tests.utils import ( + MockPipeline, clean_test_storage, init_test_logging, TEST_DICT_CONFIG_PROVIDER, @@ -78,10 +80,14 @@ def test_spool_job_started() -> None: load_id, PackageStorage.STARTED_JOBS_FOLDER, job.file_name() ) ) + assert_job_metrics(job, "completed") jobs.append(job) remaining_jobs, finalized_jobs, _ = load.complete_jobs(load_id, jobs, schema) assert len(remaining_jobs) == 0 assert len(finalized_jobs) == 2 + assert len(load._job_metrics) == 2 + for job in jobs: + assert load._job_metrics[job.job_id()] == job.metrics() def test_unsupported_writer_type() -> None: @@ -199,7 +205,9 @@ def test_spool_job_failed() -> None: load_id, PackageStorage.STARTED_JOBS_FOLDER, job.file_name() ) ) + assert_job_metrics(job, "failed") jobs.append(job) + assert len(jobs) == 2 # complete files remaining_jobs, finalized_jobs, _ = load.complete_jobs(load_id, jobs, schema) assert len(remaining_jobs) == 0 @@ -215,6 +223,8 @@ def test_spool_job_failed() -> None: load_id, PackageStorage.FAILED_JOBS_FOLDER, job.file_name() + ".exception" ) ) + # load should collect two jobs + assert load._job_metrics[job.job_id()] == job.metrics() started_files = load.load_storage.normalized_packages.list_started_jobs(load_id) assert len(started_files) == 0 @@ -226,6 +236,13 @@ def test_spool_job_failed() -> None: assert package_info.state == "loaded" # all jobs failed assert len(package_info.jobs["failed_jobs"]) == 2 + # check metrics + load_info = load.get_step_info(MockPipeline("pipe", 
True)) # type: ignore[abstract] + metrics = load_info.metrics[load_id][0]["job_metrics"] + assert len(metrics) == 2 + for job in jobs: + assert job.job_id() in metrics + assert metrics[job.job_id()].state == "failed" def test_spool_job_failed_terminally_exception_init() -> None: @@ -244,6 +261,11 @@ def test_spool_job_failed_terminally_exception_init() -> None: assert len(package_info.jobs["started_jobs"]) == 0 # load id was never committed complete_load.assert_not_called() + # metrics can be gathered + assert len(load._job_metrics) == 2 + load_info = load.get_step_info(MockPipeline("pipe", True)) # type: ignore[abstract] + metrics = load_info.metrics[load_id][0]["job_metrics"] + assert len(metrics) == 2 def test_spool_job_failed_transiently_exception_init() -> None: @@ -264,6 +286,10 @@ def test_spool_job_failed_transiently_exception_init() -> None: # load id was never committed complete_load.assert_not_called() + # no metrics were gathered + assert len(load._job_metrics) == 0 + load_info = load.get_step_info(MockPipeline("pipe", True)) # type: ignore[abstract] + assert len(load_info.metrics) == 0 def test_spool_job_failed_exception_complete() -> None: @@ -279,6 +305,11 @@ def test_spool_job_failed_exception_complete() -> None: # both failed - we wait till the current loop is completed and then raise assert len(package_info.jobs["failed_jobs"]) == 2 assert len(package_info.jobs["started_jobs"]) == 0 + # metrics can be gathered + assert len(load._job_metrics) == 2 + load_info = load.get_step_info(MockPipeline("pipe", True)) # type: ignore[abstract] + metrics = load_info.metrics[load_id][0]["job_metrics"] + assert len(metrics) == 2 def test_spool_job_retry_new() -> None: @@ -328,6 +359,7 @@ def test_spool_job_retry_started() -> None: remaining_jobs, finalized_jobs, _ = load.complete_jobs(load_id, jobs, schema) assert len(remaining_jobs) == 0 assert len(finalized_jobs) == 0 + assert len(load._job_metrics) == 0 # clear retry flag dummy_impl.JOBS = {} files = load.load_storage.normalized_packages.list_new_jobs(load_id) @@ -407,6 +439,8 @@ def test_failing_followup_jobs() -> None: assert len(dummy_impl.JOBS) == 2 assert len(dummy_impl.RETRIED_JOBS) == 0 assert len(dummy_impl.CREATED_FOLLOWUP_JOBS) == 0 + # no metrics were collected + assert len(load._job_metrics) == 0 # now we can retry the same load, it will restart the two jobs and successfully create the followup jobs load.initial_client_config.fail_followup_job_creation = False # type: ignore @@ -436,6 +470,8 @@ def test_failing_table_chain_followup_jobs() -> None: assert len(dummy_impl.JOBS) == 2 assert len(dummy_impl.RETRIED_JOBS) == 0 assert len(dummy_impl.CREATED_FOLLOWUP_JOBS) == 0 + # no metrics were collected + assert len(load._job_metrics) == 0 # now we can retry the same load, it will restart the two jobs and successfully create the table chain followup jobs load.initial_client_config.fail_table_chain_followup_job_creation = False # type: ignore @@ -662,11 +698,11 @@ def test_get_completed_table_chain_cases() -> None: # child completed, parent not event_user = schema.get_table("event_user") event_user_entities = schema.get_table("event_user__parse_data__entities") - event_user_job: Tuple[TJobState, ParsedLoadJobFileName] = ( + event_user_job: Tuple[TPackageJobState, ParsedLoadJobFileName] = ( "started_jobs", ParsedLoadJobFileName("event_user", "event_user_id", 0, "jsonl"), ) - event_user_entities_job: Tuple[TJobState, ParsedLoadJobFileName] = ( + event_user_entities_job: Tuple[TPackageJobState, ParsedLoadJobFileName] = ( 
"completed_jobs", ParsedLoadJobFileName( "event_user__parse_data__entities", "event_user__parse_data__entities_id", 0, "jsonl" @@ -857,6 +893,33 @@ def test_dummy_staging_filesystem() -> None: assert len(dummy_impl.CREATED_FOLLOWUP_JOBS) == 0 +def test_load_multiple_packages() -> None: + load = setup_loader(client_config=DummyClientConfiguration(completed_prob=1.0)) + load.config.pool_type = "none" + load_id_1, _ = prepare_load_package(load.load_storage, NORMALIZED_FILES) + sleep(0.1) + load_id_2, _ = prepare_load_package(load.load_storage, NORMALIZED_FILES) + run_metrics = load.run(None) + assert run_metrics.pending_items == 1 + # assert load._current_load_id is None + metrics_id_1 = load._job_metrics + assert len(metrics_id_1) == 2 + assert load._step_info_metrics(load_id_1)[0]["job_metrics"] == metrics_id_1 + run_metrics = load.run(None) + assert run_metrics.pending_items == 0 + metrics_id_2 = load._job_metrics + assert len(metrics_id_2) == 2 + assert load._step_info_metrics(load_id_2)[0]["job_metrics"] == metrics_id_2 + load_info = load.get_step_info(MockPipeline("pipe", True)) # type: ignore[abstract] + assert load_id_1 in load_info.metrics + assert load_id_2 in load_info.metrics + assert load_info.metrics[load_id_1][0]["job_metrics"] == metrics_id_1 + assert load_info.metrics[load_id_2][0]["job_metrics"] == metrics_id_2 + # execute empty run + load.run(None) + assert len(load_info.metrics) == 2 + + def test_terminal_exceptions() -> None: try: raise TerminalValueError("a") @@ -866,6 +929,15 @@ def test_terminal_exceptions() -> None: raise AssertionError() +def assert_job_metrics(job: RunnableLoadJob, expected_state: str) -> None: + metrics = job.metrics() + assert metrics.state == expected_state + assert metrics.started_at <= metrics.finished_at + assert metrics.job_id == job.job_id() + assert metrics.table_name == job._parsed_file_name.table_name + assert metrics.file_path == job._file_path + + def assert_complete_job( load: Load, should_delete_completed: bool = False, load_id: str = None, jobs_per_case: int = 1 ) -> None: @@ -910,6 +982,32 @@ def assert_complete_job( assert load.load_storage.loaded_packages.storage.has_folder(completed_path) # complete load on client was called complete_load.assert_called_once_with(load_id) + # assert if all jobs in final state have metrics + metrics = load.get_step_info(MockPipeline("pipe", True)).metrics[load_id][0] # type: ignore[abstract] + package_info = load.load_storage.loaded_packages.get_load_package_jobs(load_id) + for state, jobs in package_info.items(): + for job in jobs: + job_metrics = metrics["job_metrics"].get(job.job_id()) + if state in ("failed_jobs", "completed_jobs"): + assert job_metrics is not None + assert ( + metrics["job_metrics"][job.job_id()].state == "failed" + if state == "failed_jobs" + else "completed" + ) + remote_uri = job_metrics.remote_uri + if load.initial_client_config.create_followup_jobs: # type: ignore + assert remote_uri.endswith(job.file_name()) + elif load.is_staging_destination_job(job.file_name()): + # staging destination should contain reference to remote filesystem + assert ( + FilesystemConfiguration.make_file_uri(REMOTE_FILESYSTEM) + in remote_uri + ) + else: + assert remote_uri is None + else: + assert job_metrics is None def run_all(load: Load) -> None: @@ -941,9 +1039,9 @@ def setup_loader( staging = None if filesystem_staging: # do not accept jsonl to not conflict with filesystem destination - client_config = client_config or DummyClientConfiguration( - loader_file_format="reference", 
completed_prob=1 - ) + # client_config = client_config or DummyClientConfiguration( + # loader_file_format="reference", completed_prob=1 + # ) staging_system_config = FilesystemDestinationClientConfiguration()._bind_dataset_name( dataset_name="dummy" ) diff --git a/tests/load/utils.py b/tests/load/utils.py index d649343c63..086109de8b 100644 --- a/tests/load/utils.py +++ b/tests/load/utils.py @@ -45,6 +45,7 @@ from dlt.common.storages import SchemaStorage, FileStorage, SchemaStorageConfiguration from dlt.common.schema.utils import new_table, normalize_table_identifiers from dlt.common.storages import ParsedLoadJobFileName, LoadStorage, PackageStorage +from dlt.common.storages.load_package import create_load_id from dlt.common.typing import StrAny from dlt.common.utils import uniq_id @@ -712,7 +713,7 @@ def expect_load_file( query = query.encode("utf-8") # type: ignore[assignment] file_storage.save(file_name, query) table = client.prepare_load_table(table_name) - load_id = uniq_id() + load_id = create_load_id() job = client.create_load_job(table, file_storage.make_full_path(file_name), load_id) if isinstance(job, RunnableLoadJob): @@ -873,7 +874,7 @@ def prepare_load_package( Create a load package with explicitely provided files job_per_case multiplies the amount of load jobs, for big packages use small files """ - load_id = uniq_id() + load_id = create_load_id() load_storage.new_packages.create_package(load_id) for case in cases: path = f"./tests/load/cases/loading/{case}" diff --git a/tests/pipeline/cases/contracts/trace.schema.yaml b/tests/pipeline/cases/contracts/trace.schema.yaml new file mode 100644 index 0000000000..89831977c0 --- /dev/null +++ b/tests/pipeline/cases/contracts/trace.schema.yaml @@ -0,0 +1,772 @@ +version: 4 +version_hash: JE62zVwqT2T/qHTi2Qdnn2d1A/JzCzyGtDwc+qUmbTs= +engine_version: 9 +name: trace +tables: + _dlt_version: + columns: + version: + data_type: bigint + nullable: false + engine_version: + data_type: bigint + nullable: false + inserted_at: + data_type: timestamp + nullable: false + schema_name: + data_type: text + nullable: false + version_hash: + data_type: text + nullable: false + schema: + data_type: text + nullable: false + write_disposition: skip + description: Created by DLT. Tracks schema updates + _dlt_loads: + columns: + load_id: + data_type: text + nullable: false + schema_name: + data_type: text + nullable: true + status: + data_type: bigint + nullable: false + inserted_at: + data_type: timestamp + nullable: false + schema_version_hash: + data_type: text + nullable: true + write_disposition: skip + description: Created by DLT. 
Tracks completed loads + trace: + columns: + transaction_id: + data_type: text + nullable: true + pipeline_name: + data_type: text + nullable: true + execution_context__ci_run: + data_type: bool + nullable: true + execution_context__python: + data_type: text + nullable: true + execution_context__cpu: + data_type: bigint + nullable: true + execution_context__os__name: + data_type: text + nullable: true + execution_context__os__version: + data_type: text + nullable: true + execution_context__library__name: + data_type: text + nullable: true + execution_context__library__version: + data_type: text + nullable: true + started_at: + data_type: timestamp + nullable: true + finished_at: + data_type: timestamp + nullable: true + engine_version: + data_type: bigint + nullable: true + _dlt_load_id: + data_type: text + nullable: false + _dlt_id: + data_type: text + nullable: false + unique: true + write_disposition: append + trace__execution_context__exec_info: + columns: + value: + data_type: text + nullable: true + _dlt_id: + data_type: text + nullable: false + unique: true + _dlt_parent_id: + data_type: text + nullable: false + foreign_key: true + _dlt_list_idx: + data_type: bigint + nullable: false + parent: trace + trace__steps: + columns: + span_id: + data_type: text + nullable: true + step: + data_type: text + nullable: true + started_at: + data_type: timestamp + nullable: true + finished_at: + data_type: timestamp + nullable: true + step_info__pipeline__pipeline_name: + data_type: text + nullable: true + step_info__first_run: + data_type: bool + nullable: true + step_info__started_at: + data_type: timestamp + nullable: true + step_info__finished_at: + data_type: timestamp + nullable: true + _dlt_parent_id: + data_type: text + nullable: false + foreign_key: true + _dlt_list_idx: + data_type: bigint + nullable: false + _dlt_id: + data_type: text + nullable: false + unique: true + load_info__destination_type: + data_type: text + nullable: true + load_info__destination_displayable_credentials: + data_type: text + nullable: true + load_info__destination_name: + data_type: text + nullable: true + load_info__staging_type: + data_type: text + nullable: true + load_info__staging_name: + data_type: text + nullable: true + load_info__staging_displayable_credentials: + data_type: text + nullable: true + load_info__destination_fingerprint: + data_type: text + nullable: true + step_exception: + data_type: text + nullable: true + parent: trace + trace__steps__extract_info__job_metrics: + columns: + file_path: + data_type: text + nullable: true + items_count: + data_type: bigint + nullable: true + file_size: + data_type: bigint + nullable: true + created: + data_type: double + nullable: true + last_modified: + data_type: double + nullable: true + load_id: + data_type: text + nullable: true + extract_idx: + data_type: bigint + nullable: true + job_id: + data_type: text + nullable: true + table_name: + data_type: text + nullable: true + _dlt_parent_id: + data_type: text + nullable: false + foreign_key: true + _dlt_list_idx: + data_type: bigint + nullable: false + _dlt_id: + data_type: text + nullable: false + unique: true + parent: trace__steps + trace__steps__extract_info__table_metrics: + columns: + file_path: + data_type: text + nullable: true + items_count: + data_type: bigint + nullable: true + file_size: + data_type: bigint + nullable: true + created: + data_type: double + nullable: true + last_modified: + data_type: double + nullable: true + load_id: + data_type: text + nullable: true + extract_idx: + 
data_type: bigint + nullable: true + table_name: + data_type: text + nullable: true + _dlt_parent_id: + data_type: text + nullable: false + foreign_key: true + _dlt_list_idx: + data_type: bigint + nullable: false + _dlt_id: + data_type: text + nullable: false + unique: true + parent: trace__steps + trace__steps__extract_info__resource_metrics: + columns: + file_path: + data_type: text + nullable: true + items_count: + data_type: bigint + nullable: true + file_size: + data_type: bigint + nullable: true + created: + data_type: double + nullable: true + last_modified: + data_type: double + nullable: true + load_id: + data_type: text + nullable: true + extract_idx: + data_type: bigint + nullable: true + resource_name: + data_type: text + nullable: true + _dlt_parent_id: + data_type: text + nullable: false + foreign_key: true + _dlt_list_idx: + data_type: bigint + nullable: false + _dlt_id: + data_type: text + nullable: false + unique: true + parent: trace__steps + trace__steps__extract_info__dag: + columns: + load_id: + data_type: text + nullable: true + extract_idx: + data_type: bigint + nullable: true + parent_name: + data_type: text + nullable: true + resource_name: + data_type: text + nullable: true + _dlt_parent_id: + data_type: text + nullable: false + foreign_key: true + _dlt_list_idx: + data_type: bigint + nullable: false + _dlt_id: + data_type: text + nullable: false + unique: true + parent: trace__steps + trace__steps__extract_info__hints: + columns: + load_id: + data_type: text + nullable: true + extract_idx: + data_type: bigint + nullable: true + resource_name: + data_type: text + nullable: true + columns: + data_type: text + nullable: true + write_disposition: + data_type: text + nullable: true + schema_contract: + data_type: text + nullable: true + table_format: + data_type: text + nullable: true + file_format: + data_type: text + nullable: true + original_columns: + data_type: text + nullable: true + primary_key: + data_type: text + nullable: true + _dlt_parent_id: + data_type: text + nullable: false + foreign_key: true + _dlt_list_idx: + data_type: bigint + nullable: false + _dlt_id: + data_type: text + nullable: false + unique: true + parent: trace__steps + trace__steps__step_info__loads_ids: + columns: + value: + data_type: text + nullable: true + _dlt_id: + data_type: text + nullable: false + unique: true + _dlt_parent_id: + data_type: text + nullable: false + foreign_key: true + _dlt_list_idx: + data_type: bigint + nullable: false + parent: trace__steps + trace__steps__step_info__load_packages: + columns: + load_id: + data_type: text + nullable: true + package_path: + data_type: text + nullable: true + state: + data_type: text + nullable: true + schema_hash: + data_type: text + nullable: true + schema_name: + data_type: text + nullable: true + _dlt_parent_id: + data_type: text + nullable: false + foreign_key: true + _dlt_list_idx: + data_type: bigint + nullable: false + _dlt_id: + data_type: text + nullable: false + unique: true + completed_at: + data_type: timestamp + nullable: true + parent: trace__steps + trace__steps__step_info__load_packages__jobs: + columns: + state: + data_type: text + nullable: true + file_path: + data_type: text + nullable: true + file_size: + data_type: bigint + nullable: true + created_at: + data_type: timestamp + nullable: true + elapsed: + data_type: double + nullable: true + table_name: + data_type: text + nullable: true + file_id: + data_type: text + nullable: true + retry_count: + data_type: bigint + nullable: true + file_format: + 
data_type: text + nullable: true + job_id: + data_type: text + nullable: true + _dlt_parent_id: + data_type: text + nullable: false + foreign_key: true + _dlt_list_idx: + data_type: bigint + nullable: false + _dlt_id: + data_type: text + nullable: false + unique: true + parent: trace__steps__step_info__load_packages + trace__steps__normalize_info__job_metrics: + columns: + file_path: + data_type: text + nullable: true + items_count: + data_type: bigint + nullable: true + file_size: + data_type: bigint + nullable: true + created: + data_type: double + nullable: true + last_modified: + data_type: double + nullable: true + load_id: + data_type: text + nullable: true + extract_idx: + data_type: bigint + nullable: true + job_id: + data_type: text + nullable: true + table_name: + data_type: text + nullable: true + _dlt_parent_id: + data_type: text + nullable: false + foreign_key: true + _dlt_list_idx: + data_type: bigint + nullable: false + _dlt_id: + data_type: text + nullable: false + unique: true + parent: trace__steps + trace__steps__normalize_info__table_metrics: + columns: + file_path: + data_type: text + nullable: true + items_count: + data_type: bigint + nullable: true + file_size: + data_type: bigint + nullable: true + created: + data_type: double + nullable: true + last_modified: + data_type: double + nullable: true + load_id: + data_type: text + nullable: true + extract_idx: + data_type: bigint + nullable: true + table_name: + data_type: text + nullable: true + _dlt_parent_id: + data_type: text + nullable: false + foreign_key: true + _dlt_list_idx: + data_type: bigint + nullable: false + _dlt_id: + data_type: text + nullable: false + unique: true + parent: trace__steps + trace__steps__load_info__job_metrics: + columns: + load_id: + data_type: text + nullable: true + job_id: + data_type: text + nullable: true + file_path: + data_type: text + nullable: true + table_name: + data_type: text + nullable: true + state: + data_type: text + nullable: true + _dlt_parent_id: + data_type: text + nullable: false + foreign_key: true + _dlt_list_idx: + data_type: bigint + nullable: false + _dlt_id: + data_type: text + nullable: false + unique: true + started_at: + data_type: timestamp + nullable: true + finished_at: + data_type: timestamp + nullable: true + remote_uri: + data_type: text + nullable: true + parent: trace__steps + trace__steps__step_info__load_packages__tables: + columns: + write_disposition: + data_type: text + nullable: true + schema_contract: + data_type: text + nullable: true + table_format: + data_type: text + nullable: true + file_format: + data_type: text + nullable: true + name: + data_type: text + nullable: true + resource: + data_type: text + nullable: true + schema_name: + data_type: text + nullable: true + load_id: + data_type: text + nullable: true + _dlt_parent_id: + data_type: text + nullable: false + foreign_key: true + _dlt_list_idx: + data_type: bigint + nullable: false + _dlt_id: + data_type: text + nullable: false + unique: true + parent: + data_type: text + nullable: true + x_normalizer__seen_data: + data_type: bool + nullable: true + parent: trace__steps__step_info__load_packages + trace__steps__step_info__load_packages__tables__columns: + columns: + name: + data_type: text + nullable: true + data_type: + data_type: text + nullable: true + nullable: + data_type: bool + nullable: true + primary_key: + data_type: bool + nullable: true + table_name: + data_type: text + nullable: true + schema_name: + data_type: text + nullable: true + load_id: + data_type: text + 
nullable: true + _dlt_parent_id: + data_type: text + nullable: false + foreign_key: true + _dlt_list_idx: + data_type: bigint + nullable: false + _dlt_id: + data_type: text + nullable: false + unique: true + unique: + data_type: bool + nullable: true + foreign_key: + data_type: bool + nullable: true + parent: trace__steps__step_info__load_packages__tables + trace__resolved_config_values: + columns: + key: + data_type: text + nullable: true + is_secret_hint: + data_type: bool + nullable: true + provider_name: + data_type: text + nullable: true + config_type_name: + data_type: text + nullable: true + _dlt_parent_id: + data_type: text + nullable: false + foreign_key: true + _dlt_list_idx: + data_type: bigint + nullable: false + _dlt_id: + data_type: text + nullable: false + unique: true + parent: trace + trace__resolved_config_values__sections: + columns: + value: + data_type: text + nullable: true + _dlt_id: + data_type: text + nullable: false + unique: true + _dlt_parent_id: + data_type: text + nullable: false + foreign_key: true + _dlt_list_idx: + data_type: bigint + nullable: false + parent: trace__resolved_config_values + trace__steps__exception_traces: + columns: + message: + data_type: text + nullable: true + exception_type: + data_type: text + nullable: true + is_terminal: + data_type: bool + nullable: true + docstring: + data_type: text + nullable: true + load_id: + data_type: text + nullable: true + pipeline_name: + data_type: text + nullable: true + exception_attrs: + data_type: text + nullable: true + _dlt_parent_id: + data_type: text + nullable: false + foreign_key: true + _dlt_list_idx: + data_type: bigint + nullable: false + _dlt_id: + data_type: text + nullable: false + unique: true + parent: trace__steps + trace__steps__exception_traces__stack_trace: + columns: + value: + data_type: text + nullable: true + _dlt_id: + data_type: text + nullable: false + unique: true + _dlt_parent_id: + data_type: text + nullable: false + foreign_key: true + _dlt_list_idx: + data_type: bigint + nullable: false + parent: trace__steps__exception_traces +settings: + detections: + - iso_timestamp + default_hints: + not_null: + - _dlt_id + - _dlt_root_id + - _dlt_parent_id + - _dlt_list_idx + - _dlt_load_id + foreign_key: + - _dlt_parent_id + root_key: + - _dlt_root_id + unique: + - _dlt_id +normalizers: + names: snake_case + json: + module: dlt.common.normalizers.json.relational +previous_hashes: +- 9Ysjq/W0xpxkI/vBiYm8Qbr2nDP3JMt6KvGKUS/FCyI= +- NYeAxJ2r+T+dKFnXFhBEPzBP6SO+ORdhOfgQRo/XqBU= +- RV9jvZSD5dM+ZGjEL3HqokLvtf22K4zMNc3zWRahEw4= diff --git a/tests/pipeline/test_pipeline.py b/tests/pipeline/test_pipeline.py index 0ab1f61d72..b6a7feffc1 100644 --- a/tests/pipeline/test_pipeline.py +++ b/tests/pipeline/test_pipeline.py @@ -39,7 +39,7 @@ from dlt.common.utils import uniq_id from dlt.common.schema import Schema -from dlt.destinations import filesystem, redshift, dummy +from dlt.destinations import filesystem, redshift, dummy, duckdb from dlt.destinations.impl.filesystem.filesystem import INIT_FILE_NAME from dlt.extract.exceptions import InvalidResourceDataTypeBasic, PipeGenInvalid, SourceExhausted from dlt.extract.extract import ExtractStorage @@ -2637,6 +2637,57 @@ def comments(user_id: str): assert pipeline.last_trace.last_normalize_info.row_counts["user_comments"] == 3 +def test_exceed_job_file_name_length() -> None: + # use very long table name both for parent and for a child + data = { + "id": 1, + "child use very long table name both for parent and for a child use very long table name 
both for parent and for a child use very long table name both for parent and for a child use very long table name both for parent and for a child use very long table name both for parent and for a child": [ + 1, + 2, + 3, + ], + "col use very long table name both for parent and for a child use very long table name both for parent and for a child use very long table name both for parent and for a child use very long table name both for parent and for a child use very long table name both for parent and for a child": ( + "data" + ), + } + + table_name = ( + "parent use very long table name both for parent and for a child use very long table name" + " both for parent and for a child use very long table name both for parent and for a child" + " use very long table name both for parent and for a child use very long table name both" + " for parent and for a child use very long table name both for parent and for a child " + ) + + pipeline = dlt.pipeline( + pipeline_name="test_exceed_job_file_name_length", + destination="duckdb", + ) + # path too long + with pytest.raises(PipelineStepFailed) as os_err: + pipeline.run([data], table_name=table_name) + assert isinstance(os_err.value.__cause__, OSError) + + # fit into 255 + 1 + suffix_len = len(".b61d3af76c.0.insert-values") + pipeline = dlt.pipeline( + pipeline_name="test_exceed_job_file_name_length", + destination=duckdb( + max_identifier_length=255 - suffix_len + 1, + ), + ) + # path too long + with pytest.raises(PipelineStepFailed): + pipeline.run([data], table_name=table_name) + + pipeline = dlt.pipeline( + pipeline_name="test_exceed_job_file_name_length", + destination=duckdb( + max_identifier_length=255 - suffix_len, + ), + ) + pipeline.run([data], table_name=table_name) + + def assert_imported_file( pipeline: Pipeline, table_name: str, diff --git a/tests/pipeline/test_pipeline_trace.py b/tests/pipeline/test_pipeline_trace.py index 3239e01bab..69c0f01b8b 100644 --- a/tests/pipeline/test_pipeline_trace.py +++ b/tests/pipeline/test_pipeline_trace.py @@ -7,6 +7,7 @@ from unittest.mock import patch import pytest import requests_mock +import yaml import dlt @@ -19,6 +20,8 @@ from dlt.common.typing import DictStrAny, StrStr, DictStrStr, TSecretValue from dlt.common.utils import digest128 +from dlt.destinations import dummy, filesystem + from dlt.pipeline.exceptions import PipelineStepFailed from dlt.pipeline.pipeline import Pipeline from dlt.pipeline.trace import ( @@ -31,7 +34,8 @@ from dlt.extract.extract import describe_extract_data from dlt.extract.pipe import Pipe -from tests.utils import start_test_telemetry +from tests.pipeline.utils import PIPELINE_TEST_CASES_PATH +from tests.utils import TEST_STORAGE_ROOT, start_test_telemetry from tests.common.configuration.utils import toml_providers, environment @@ -122,7 +126,7 @@ def data(): resolved = _find_resolved_value(trace.resolved_config_values, "credentials", ["databricks"]) assert resolved.is_secret_hint is True assert resolved.value == databricks_creds - assert_trace_printable(trace) + assert_trace_serializable(trace) # activate pipeline because other was running in assert trace p.activate() @@ -153,7 +157,7 @@ def data(): assert isinstance(step.step_info, ExtractInfo) assert len(step.exception_traces) > 0 assert step.step_info.extract_data_info == [{"name": "async_exception", "data_type": "source"}] - assert_trace_printable(trace) + assert_trace_serializable(trace) extract_info = step.step_info # only new (unprocessed) package is present, all other metrics are empty, state won't be extracted 
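To make the metrics plumbing exercised by these trace tests easier to follow: each load job emits an immutable metrics record, the loader keeps it in a dict keyed by `job_id`, and the trace later flattens those records into rows (compare the `trace__steps__load_info__job_metrics` columns in the schema above). The sketch below mimics that flow with a stand-in `NamedTuple` whose fields are modeled on that schema and filled with made-up values; it is a simplified illustration, not dlt's real `LoadJobMetrics` type.

```py
# Simplified sketch of the job-metrics flow (illustrative only, not dlt internals).
from datetime import datetime, timezone
from typing import Dict, NamedTuple, Optional


class JobMetrics(NamedTuple):
    job_id: str
    file_path: str
    table_name: str
    started_at: datetime
    finished_at: Optional[datetime]
    state: Optional[str]
    remote_uri: Optional[str]


def finalize(m: JobMetrics, remote_uri: str) -> JobMetrics:
    # same trick as `m._replace(remote_uri=...)` in the filesystem job earlier in this patch:
    # the record is immutable, so enrichment returns a new tuple
    return m._replace(remote_uri=remote_uri, finished_at=datetime.now(timezone.utc))


# what the loader keeps per load package, keyed by job id
job_metrics: Dict[str, JobMetrics] = {}

m = JobMetrics(
    job_id="issues.1234.0.parquet",
    file_path="/local/load_package/issues.1234.0.parquet",
    table_name="issues",
    started_at=datetime.now(timezone.utc),
    finished_at=None,
    state="completed",
    remote_uri=None,
)
job_metrics[m.job_id] = finalize(m, "s3://bucket/dataset/issues/1234.parquet")

# flattening into trace-style rows, one dict per job (like the job_metrics child table)
rows = [jm._asdict() for jm in job_metrics.values()]
print(rows[0]["remote_uri"])
```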
@@ -174,7 +178,7 @@ def data(): step = trace.steps[2] assert step.step == "normalize" assert step.step_info is norm_info - assert_trace_printable(trace) + assert_trace_serializable(trace) assert isinstance(p.last_trace.last_normalize_info, NormalizeInfo) assert p.last_trace.last_normalize_info.row_counts == {"_dlt_pipeline_state": 1, "data": 3} @@ -216,7 +220,7 @@ def data(): assert resolved.is_secret_hint is False assert resolved.value == "1.0" assert resolved.config_type_name == "DummyClientConfiguration" - assert_trace_printable(trace) + assert_trace_serializable(trace) assert isinstance(p.last_trace.last_load_info, LoadInfo) p.activate() @@ -234,12 +238,157 @@ def data(): assert step.step == "load" assert step.step_info is load_info # same load info assert trace.steps[0].step_info is not extract_info - assert_trace_printable(trace) + assert_trace_serializable(trace) assert isinstance(p.last_trace.last_load_info, LoadInfo) assert isinstance(p.last_trace.last_normalize_info, NormalizeInfo) assert isinstance(p.last_trace.last_extract_info, ExtractInfo) +def test_trace_schema() -> None: + os.environ["DATA_WRITER__DISABLE_COMPRESSION"] = "True" + os.environ["RESTORE_FROM_DESTINATION"] = "False" + + # mock runtime env + os.environ["CIRCLECI"] = "1" + os.environ["AWS_LAMBDA_FUNCTION_NAME"] = "lambda" + + @dlt.source(section="many_hints") + def many_hints( + api_type=dlt.config.value, + credentials: str = dlt.secrets.value, + secret_value: TSecretValue = TSecretValue("123"), # noqa: B008 + ): + # TODO: create table / column schema from typed dicts, not explicitly + @dlt.resource( + write_disposition="replace", + primary_key="id", + table_format="delta", + file_format="jsonl", + schema_contract="evolve", + columns=[ + { + "name": "multi", + "data_type": "decimal", + "nullable": True, + "cluster": True, + "description": "unknown", + "merge_key": True, + "precision": 9, + "scale": 3, + "sort": True, + "variant": True, + "partition": True, + } + ], + ) + def data(): + yield [{"id": 1, "multi": "1.2"}, {"id": 2}, {"id": 3}] + + return data() + + @dlt.source + def github(): + @dlt.resource + def get_shuffled_events(): + for _ in range(1): + with open( + "tests/normalize/cases/github.events.load_page_1_duck.json", + "r", + encoding="utf-8", + ) as f: + issues = json.load(f) + yield issues + + return get_shuffled_events() + + @dlt.source + def async_exception(max_range=1): + async def get_val(v): + await asyncio.sleep(0.1) + if v % 3 == 0: + raise ValueError(v) + return v + + @dlt.resource + def data(): + yield from [get_val(v) for v in range(1, max_range)] + + return data() + + # create pipeline with staging to get remote_uri in load step job_metrics + dummy_dest = dummy(completed_prob=1.0) + pipeline = dlt.pipeline( + pipeline_name="test_trace_schema", + destination=dummy_dest, + staging=filesystem(os.path.abspath(os.path.join(TEST_STORAGE_ROOT, "_remote_filesystem"))), + dataset_name="various", + ) + + # mock config + os.environ["API_TYPE"] = "REST" + os.environ["SOURCES__MANY_HINTS__CREDENTIALS"] = "CREDS" + + info = pipeline.run([many_hints(), github()]) + info.raise_on_failed_jobs() + + trace = pipeline.last_trace + pipeline._schema_storage.storage.save("trace.json", json.dumps(trace, pretty=True)) + + schema = dlt.Schema("trace") + trace_pipeline = dlt.pipeline( + pipeline_name="test_trace_schema_traces", destination=dummy(completed_prob=1.0) + ) + info = trace_pipeline.run([trace], table_name="trace", schema=schema) + info.raise_on_failed_jobs() + + # add exception trace + with 
pytest.raises(PipelineStepFailed): + pipeline.extract(async_exception(max_range=4)) + + trace_exception = pipeline.last_trace + pipeline._schema_storage.storage.save( + "trace_exception.json", json.dumps(trace_exception, pretty=True) + ) + + info = trace_pipeline.run([trace_exception], table_name="trace") + info.raise_on_failed_jobs() + inferred_trace_contract = trace_pipeline.schemas["trace"] + inferred_contract_str = inferred_trace_contract.to_pretty_yaml(remove_processing_hints=True) + + # NOTE: this saves actual inferred contract (schema) to schema storage, move it to test cases if you update + # trace shapes + # TODO: create a proper schema for dlt trace and tables/columns + pipeline._schema_storage.storage.save("trace.schema.yaml", inferred_contract_str) + # print(pipeline._schema_storage.storage.storage_path) + + # load the schema and use it as contract + with open(f"{PIPELINE_TEST_CASES_PATH}/contracts/trace.schema.yaml", encoding="utf-8") as f: + imported_schema = yaml.safe_load(f) + trace_contract = Schema.from_dict(imported_schema, remove_processing_hints=True) + # compare pretty forms of the schemas, they must be identical + # NOTE: if this fails you can comment this out and use contract run below to find first offending difference + # assert trace_contract.to_pretty_yaml() == inferred_contract_str + + # use trace contract to load data again + contract_trace_pipeline = dlt.pipeline( + pipeline_name="test_trace_schema_traces_contract", destination=dummy(completed_prob=1.0) + ) + info = contract_trace_pipeline.run( + [trace_exception, trace], + table_name="trace", + schema=trace_contract, + schema_contract="freeze", + ) + + # assert inferred_trace_contract.version_hash == trace_contract.version_hash + + # print(trace_pipeline.schemas["trace"].to_pretty_yaml()) + # print(pipeline._schema_storage.storage.storage_path) + + +# def test_trace_schema_contract() -> None: + + def test_save_load_trace() -> None: os.environ["COMPLETED_PROB"] = "1.0" info = dlt.pipeline().run([1, 2, 3], table_name="data", destination="dummy") @@ -255,7 +404,7 @@ def test_save_load_trace() -> None: assert resolved.is_secret_hint is False assert resolved.value == "1.0" assert resolved.config_type_name == "DummyClientConfiguration" - assert_trace_printable(trace) + assert_trace_serializable(trace) # check row counts assert pipeline.last_trace.last_normalize_info.row_counts == { "_dlt_pipeline_state": 1, @@ -296,7 +445,7 @@ def data(): assert run_step.step == "run" assert run_step.step_exception is not None assert step.step_exception == run_step.step_exception - assert_trace_printable(trace) + assert_trace_serializable(trace) assert pipeline.last_trace.last_normalize_info is None @@ -306,7 +455,7 @@ def test_save_load_empty_trace() -> None: pipeline = dlt.pipeline() pipeline.run([], table_name="data", destination="dummy") trace = pipeline.last_trace - assert_trace_printable(trace) + assert_trace_serializable(trace) assert len(trace.steps) == 4 pipeline.activate() @@ -529,7 +678,7 @@ def _mock_sentry_before_send(event: DictStrAny, _unused_hint: Any = None) -> Dic return event -def assert_trace_printable(trace: PipelineTrace) -> None: +def assert_trace_serializable(trace: PipelineTrace) -> None: str(trace) trace.asstr(0) trace.asstr(1) diff --git a/tests/pipeline/test_platform_connection.py b/tests/pipeline/test_platform_connection.py index fa5b143ff5..aa46019382 100644 --- a/tests/pipeline/test_platform_connection.py +++ b/tests/pipeline/test_platform_connection.py @@ -65,7 +65,8 @@ def data(): # basic check of 
trace result assert trace_result, "no trace" assert trace_result["pipeline_name"] == "platform_test_pipeline" - assert len(trace_result["steps"]) == 4 + # just extract, normalize and load steps. run step is not serialized to trace (it was just a copy of load) + assert len(trace_result["steps"]) == 3 assert trace_result["execution_context"]["library"]["name"] == "dlt" # basic check of state result diff --git a/tests/pipeline/utils.py b/tests/pipeline/utils.py index dfdb9c8e40..d3d87f0e0b 100644 --- a/tests/pipeline/utils.py +++ b/tests/pipeline/utils.py @@ -98,6 +98,9 @@ def users_materialize_table_schema(): def assert_load_info(info: LoadInfo, expected_load_packages: int = 1) -> None: """Asserts that expected number of packages was loaded and there are no failed jobs""" + # make sure we can serialize + info.asstr(verbosity=2) + info.asdict() assert len(info.loads_ids) == expected_load_packages # all packages loaded assert all(p.completed_at is not None for p in info.load_packages) is True diff --git a/tests/utils.py b/tests/utils.py index 976a623c0b..1b81881470 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -189,8 +189,9 @@ def wipe_pipeline(preserve_environ) -> Iterator[None]: yield if container[PipelineContext].is_active(): # take existing pipeline - p = dlt.pipeline() - p._wipe_working_folder() + # NOTE: no more needed. test storage is wiped fully when test starts + # p = dlt.pipeline() + # p._wipe_working_folder() # deactivate context container[PipelineContext].deactivate() From 2788235572de105ff01aaf5c1ebcbe4ea40b249b Mon Sep 17 00:00:00 2001 From: Akela Drissner-Schmid <32450038+akelad@users.noreply.github.com> Date: Mon, 26 Aug 2024 16:32:22 +0200 Subject: [PATCH 04/10] Update snowflake.md --- docs/website/docs/dlt-ecosystem/destinations/snowflake.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/docs/website/docs/dlt-ecosystem/destinations/snowflake.md b/docs/website/docs/dlt-ecosystem/destinations/snowflake.md index 181d024a2f..d08578c5a2 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/snowflake.md +++ b/docs/website/docs/dlt-ecosystem/destinations/snowflake.md @@ -136,7 +136,12 @@ If you set the [`replace` strategy](../../general-usage/full-loading.md) to `sta recreated with a [clone command](https://docs.snowflake.com/en/sql-reference/sql/create-clone) from the staging tables. ## Data loading -The data is loaded using an internal Snowflake stage. We use the `PUT` command and per-table built-in stages by default. Stage files are immediately removed (if not specified otherwise). +The data is loaded using an internal Snowflake stage. We use the `PUT` command and per-table built-in stages by default. 
Stage files are kept by default, unless specified otherwise via the `keep_staged_files` parameter: + +```toml +[destination.snowflake] +keep_staged_files = false +``` ## Supported file formats * [insert-values](../file-formats/insert-format.md) is used by default From 935dc09efd067549fbcb87b906ccb560d945bd26 Mon Sep 17 00:00:00 2001 From: rudolfix Date: Tue, 27 Aug 2024 00:20:06 +0200 Subject: [PATCH 05/10] Feat/1711 create with not exists dlt tables (#1740) * uses normalized column names when linking tables in relational * destination cap if create table if not exits supported * generates IF NOT EXISTS for dlt tables * adds logging for terminal and retry exception in run_managed of load job * passes schema update to be collected in trace in filesystem * fixes job log exception message --- dlt/common/destination/capabilities.py | 1 + dlt/common/destination/reference.py | 4 ++++ dlt/common/normalizers/json/relational.py | 12 +++++------- dlt/destinations/impl/athena/athena.py | 2 +- .../impl/filesystem/filesystem.py | 5 ++++- dlt/destinations/impl/mssql/factory.py | 1 + dlt/destinations/impl/synapse/factory.py | 4 ++++ dlt/destinations/job_client_impl.py | 19 ++++++++++++++----- .../parent_child_relationship.py | 9 ++++----- .../test_parent_child_relationship.py | 10 ++++------ tests/load/mssql/test_mssql_table_builder.py | 12 ++++++++++-- .../postgres/test_postgres_table_builder.py | 11 ++++++++++- tests/pipeline/test_pipeline_trace.py | 2 +- 13 files changed, 63 insertions(+), 29 deletions(-) diff --git a/dlt/common/destination/capabilities.py b/dlt/common/destination/capabilities.py index be71cb50e9..52e7d74833 100644 --- a/dlt/common/destination/capabilities.py +++ b/dlt/common/destination/capabilities.py @@ -76,6 +76,7 @@ class DestinationCapabilitiesContext(ContainerInjectableContext): # use naming convention in the schema naming_convention: TNamingConventionReferenceArg = None alter_add_multi_column: bool = True + supports_create_table_if_not_exists: bool = True supports_truncate_command: bool = True schema_supports_numeric_precision: bool = True timestamp_precision: int = 6 diff --git a/dlt/common/destination/reference.py b/dlt/common/destination/reference.py index b6c7041592..744cbbd1f5 100644 --- a/dlt/common/destination/reference.py +++ b/dlt/common/destination/reference.py @@ -383,9 +383,13 @@ def run_managed( except (DestinationTerminalException, TerminalValueError) as e: self._state = "failed" self._exception = e + logger.exception(f"Terminal exception in job {self.job_id()} in file {self._file_path}") except (DestinationTransientException, Exception) as e: self._state = "retry" self._exception = e + logger.exception( + f"Transient exception in job {self.job_id()} in file {self._file_path}" + ) finally: self._finished_at = pendulum.now() # sanity check diff --git a/dlt/common/normalizers/json/relational.py b/dlt/common/normalizers/json/relational.py index 8e296445eb..1dbcec4bff 100644 --- a/dlt/common/normalizers/json/relational.py +++ b/dlt/common/normalizers/json/relational.py @@ -184,11 +184,10 @@ def _get_child_row_hash(parent_row_id: str, child_table: str, list_idx: int) -> # and all child tables must be lists return digest128(f"{parent_row_id}_{child_table}_{list_idx}", DLT_ID_LENGTH_BYTES) - @staticmethod - def _link_row(row: DictStrAny, parent_row_id: str, list_idx: int) -> DictStrAny: + def _link_row(self, row: DictStrAny, parent_row_id: str, list_idx: int) -> DictStrAny: assert parent_row_id - row["_dlt_parent_id"] = parent_row_id - row["_dlt_list_idx"] = list_idx + 
row[self.c_dlt_parent_id] = parent_row_id + row[self.c_dlt_list_idx] = list_idx return row @@ -227,7 +226,7 @@ def _add_row_id( if row_id_type == "row_hash": row_id = DataItemNormalizer._get_child_row_hash(parent_row_id, table, pos) # link to parent table - DataItemNormalizer._link_row(flattened_row, parent_row_id, pos) + self._link_row(flattened_row, parent_row_id, pos) flattened_row[self.c_dlt_id] = row_id return row_id @@ -260,7 +259,6 @@ def _normalize_list( parent_row_id: Optional[str] = None, _r_lvl: int = 0, ) -> TNormalizedRowIterator: - v: DictStrAny = None table = self.schema.naming.shorten_fragments(*parent_path, *ident_path) for idx, v in enumerate(seq): @@ -285,7 +283,7 @@ def _normalize_list( child_row_hash = DataItemNormalizer._get_child_row_hash(parent_row_id, table, idx) wrap_v = wrap_in_dict(v) wrap_v[self.c_dlt_id] = child_row_hash - e = DataItemNormalizer._link_row(wrap_v, parent_row_id, idx) + e = self._link_row(wrap_v, parent_row_id, idx) DataItemNormalizer._extend_row(extend, e) yield (table, self.schema.naming.shorten_fragments(*parent_path)), e diff --git a/dlt/destinations/impl/athena/athena.py b/dlt/destinations/impl/athena/athena.py index 1429b28240..0c90d171a3 100644 --- a/dlt/destinations/impl/athena/athena.py +++ b/dlt/destinations/impl/athena/athena.py @@ -452,7 +452,7 @@ def _get_table_update_sql( partition_clause = self._iceberg_partition_clause( cast(Optional[Dict[str, str]], table.get(PARTITION_HINT)) ) - sql.append(f"""CREATE TABLE {qualified_table_name} + sql.append(f"""{self._make_create_table(qualified_table_name, table)} ({columns}) {partition_clause} LOCATION '{location.rstrip('/')}' diff --git a/dlt/destinations/impl/filesystem/filesystem.py b/dlt/destinations/impl/filesystem/filesystem.py index 2e09871ba9..5445fd2ae9 100644 --- a/dlt/destinations/impl/filesystem/filesystem.py +++ b/dlt/destinations/impl/filesystem/filesystem.py @@ -303,6 +303,7 @@ def update_stored_schema( only_tables: Iterable[str] = None, expected_update: TSchemaTables = None, ) -> TSchemaTables: + applied_update = super().update_stored_schema(only_tables, expected_update) # create destination dirs for all tables table_names = only_tables or self.schema.tables.keys() dirs_to_create = self.get_table_dirs(table_names) @@ -316,7 +317,9 @@ def update_stored_schema( if not self.config.as_staging: self._store_current_schema() - return expected_update + # we assume that expected_update == applied_update so table schemas in dest were not + # externally changed + return applied_update def get_table_dir(self, table_name: str, remote: bool = False) -> str: # dlt tables do not respect layout (for now) diff --git a/dlt/destinations/impl/mssql/factory.py b/dlt/destinations/impl/mssql/factory.py index 85c94c21b7..f1a8bb136a 100644 --- a/dlt/destinations/impl/mssql/factory.py +++ b/dlt/destinations/impl/mssql/factory.py @@ -37,6 +37,7 @@ def _raw_capabilities(self) -> DestinationCapabilitiesContext: caps.max_text_data_type_length = 2**30 - 1 caps.is_max_text_data_type_length_in_bytes = False caps.supports_ddl_transactions = True + caps.supports_create_table_if_not_exists = False # IF NOT EXISTS not supported caps.max_rows_per_insert = 1000 caps.timestamp_precision = 7 caps.supported_merge_strategies = ["delete-insert", "upsert", "scd2"] diff --git a/dlt/destinations/impl/synapse/factory.py b/dlt/destinations/impl/synapse/factory.py index bb117e48d2..d5a0281bec 100644 --- a/dlt/destinations/impl/synapse/factory.py +++ b/dlt/destinations/impl/synapse/factory.py @@ -63,6 +63,10 @@ def 
_raw_capabilities(self) -> DestinationCapabilitiesContext: caps.supports_transactions = True caps.supports_ddl_transactions = False + caps.supports_create_table_if_not_exists = ( + False # IF NOT EXISTS on CREATE TABLE not supported + ) + # Synapse throws "Some part of your SQL statement is nested too deeply. Rewrite the query or break it up into smaller queries." # if number of records exceeds a certain number. Which exact number that is seems not deterministic: # in tests, I've seen a query with 12230 records run succesfully on one run, but fail on a subsequent run, while the query remained exactly the same. diff --git a/dlt/destinations/job_client_impl.py b/dlt/destinations/job_client_impl.py index 92132dd751..1d6403a2c8 100644 --- a/dlt/destinations/job_client_impl.py +++ b/dlt/destinations/job_client_impl.py @@ -522,22 +522,31 @@ def _make_add_column_sql( """Make one or more ADD COLUMN sql clauses to be joined in ALTER TABLE statement(s)""" return [f"ADD COLUMN {self._get_column_def_sql(c, table_format)}" for c in new_columns] + def _make_create_table(self, qualified_name: str, table: TTableSchema) -> str: + not_exists_clause = " " + if ( + table["name"] in self.schema.dlt_table_names() + and self.capabilities.supports_create_table_if_not_exists + ): + not_exists_clause = " IF NOT EXISTS " + return f"CREATE TABLE{not_exists_clause}{qualified_name}" + def _get_table_update_sql( self, table_name: str, new_columns: Sequence[TColumnSchema], generate_alter: bool ) -> List[str]: # build sql - canonical_name = self.sql_client.make_qualified_table_name(table_name) + qualified_name = self.sql_client.make_qualified_table_name(table_name) table = self.prepare_load_table(table_name) table_format = table.get("table_format") sql_result: List[str] = [] if not generate_alter: # build CREATE - sql = f"CREATE TABLE {canonical_name} (\n" + sql = self._make_create_table(qualified_name, table) + " (\n" sql += ",\n".join([self._get_column_def_sql(c, table_format) for c in new_columns]) sql += ")" sql_result.append(sql) else: - sql_base = f"ALTER TABLE {canonical_name}\n" + sql_base = f"ALTER TABLE {qualified_name}\n" add_column_statements = self._make_add_column_sql(new_columns, table_format) if self.capabilities.alter_add_multi_column: column_sql = ",\n" @@ -561,13 +570,13 @@ def _get_table_update_sql( if hint == "not_null": logger.warning( f"Column(s) {hint_columns} with NOT NULL are being added to existing" - f" table {canonical_name}. If there's data in the table the operation" + f" table {qualified_name}. If there's data in the table the operation" " will fail." ) else: logger.warning( f"Column(s) {hint_columns} with hint {hint} are being added to existing" - f" table {canonical_name}. Several hint types may not be added to" + f" table {qualified_name}. Several hint types may not be added to" " existing tables." 
) return sql_result diff --git a/docs/examples/parent_child_relationship/parent_child_relationship.py b/docs/examples/parent_child_relationship/parent_child_relationship.py index 39c9f577cc..6de00ffb28 100644 --- a/docs/examples/parent_child_relationship/parent_child_relationship.py +++ b/docs/examples/parent_child_relationship/parent_child_relationship.py @@ -22,6 +22,7 @@ from typing import List, Dict, Any, Generator import dlt + # Define a dlt resource with write disposition to 'merge' @dlt.resource(name="parent_with_children", write_disposition={"disposition": "merge"}) def data_source() -> Generator[List[Dict[str, Any]], None, None]: @@ -44,6 +45,7 @@ def data_source() -> Generator[List[Dict[str, Any]], None, None]: yield data + # Function to add parent_id to each child record within a parent record def add_parent_id(record: Dict[str, Any]) -> Dict[str, Any]: parent_id_key = "parent_id" @@ -51,6 +53,7 @@ def add_parent_id(record: Dict[str, Any]) -> Dict[str, Any]: child[parent_id_key] = record[parent_id_key] return record + if __name__ == "__main__": # Create and configure the dlt pipeline pipeline = dlt.pipeline( @@ -60,10 +63,6 @@ def add_parent_id(record: Dict[str, Any]) -> Dict[str, Any]: ) # Run the pipeline - load_info = pipeline.run( - data_source() - .add_map(add_parent_id), - primary_key="parent_id" - ) + load_info = pipeline.run(data_source().add_map(add_parent_id), primary_key="parent_id") # Output the load information after pipeline execution print(load_info) diff --git a/docs/examples/parent_child_relationship/test_parent_child_relationship.py b/docs/examples/parent_child_relationship/test_parent_child_relationship.py index f671040823..95d1bade97 100644 --- a/docs/examples/parent_child_relationship/test_parent_child_relationship.py +++ b/docs/examples/parent_child_relationship/test_parent_child_relationship.py @@ -1,4 +1,3 @@ - import pytest from tests.utils import skipifgithubfork @@ -29,6 +28,7 @@ from typing import List, Dict, Any, Generator import dlt + # Define a dlt resource with write disposition to 'merge' @dlt.resource(name="parent_with_children", write_disposition={"disposition": "merge"}) def data_source() -> Generator[List[Dict[str, Any]], None, None]: @@ -51,6 +51,7 @@ def data_source() -> Generator[List[Dict[str, Any]], None, None]: yield data + # Function to add parent_id to each child record within a parent record def add_parent_id(record: Dict[str, Any]) -> Dict[str, Any]: parent_id_key = "parent_id" @@ -58,6 +59,7 @@ def add_parent_id(record: Dict[str, Any]) -> Dict[str, Any]: child[parent_id_key] = record[parent_id_key] return record + @skipifgithubfork @pytest.mark.forked def test_parent_child_relationship(): @@ -69,10 +71,6 @@ def test_parent_child_relationship(): ) # Run the pipeline - load_info = pipeline.run( - data_source() - .add_map(add_parent_id), - primary_key="parent_id" - ) + load_info = pipeline.run(data_source().add_map(add_parent_id), primary_key="parent_id") # Output the load information after pipeline execution print(load_info) diff --git a/tests/load/mssql/test_mssql_table_builder.py b/tests/load/mssql/test_mssql_table_builder.py index d6cf3ec3e8..3f3896de6c 100644 --- a/tests/load/mssql/test_mssql_table_builder.py +++ b/tests/load/mssql/test_mssql_table_builder.py @@ -55,8 +55,8 @@ def test_alter_table(client: MsSqlJobClient) -> None: # existing table has no columns sql = client._get_table_update_sql("event_test_table", TABLE_UPDATE, True)[0] sqlfluff.parse(sql, dialect="tsql") - canonical_name = 
client.sql_client.make_qualified_table_name("event_test_table") - assert sql.count(f"ALTER TABLE {canonical_name}\nADD") == 1 + qualified_name = client.sql_client.make_qualified_table_name("event_test_table") + assert sql.count(f"ALTER TABLE {qualified_name}\nADD") == 1 assert "event_test_table" in sql assert '"col1" bigint NOT NULL' in sql assert '"col2" float NOT NULL' in sql @@ -75,3 +75,11 @@ def test_alter_table(client: MsSqlJobClient) -> None: assert '"col6_precision" decimal(6,2) NOT NULL' in sql assert '"col7_precision" varbinary(19)' in sql assert '"col11_precision" time(3) NOT NULL' in sql + + +def test_create_dlt_table(client: MsSqlJobClient) -> None: + # non existing table + sql = client._get_table_update_sql("_dlt_version", TABLE_UPDATE, False)[0] + sqlfluff.parse(sql, dialect="tsql") + qualified_name = client.sql_client.make_qualified_table_name("_dlt_version") + assert f"CREATE TABLE {qualified_name}" in sql diff --git a/tests/load/postgres/test_postgres_table_builder.py b/tests/load/postgres/test_postgres_table_builder.py index 86bd67db9a..28fd4eec9d 100644 --- a/tests/load/postgres/test_postgres_table_builder.py +++ b/tests/load/postgres/test_postgres_table_builder.py @@ -57,7 +57,8 @@ def test_create_table(client: PostgresClient) -> None: # non existing table sql = client._get_table_update_sql("event_test_table", TABLE_UPDATE, False)[0] sqlfluff.parse(sql, dialect="postgres") - assert "event_test_table" in sql + qualified_name = client.sql_client.make_qualified_table_name("event_test_table") + assert f"CREATE TABLE {qualified_name}" in sql assert '"col1" bigint NOT NULL' in sql assert '"col2" double precision NOT NULL' in sql assert '"col3" boolean NOT NULL' in sql @@ -173,3 +174,11 @@ def test_create_table_case_sensitive(cs_client: PostgresClient) -> None: # every line starts with "Col" for line in sql.split("\n")[1:]: assert line.startswith('"Col') + + +def test_create_dlt_table(client: PostgresClient) -> None: + # non existing table + sql = client._get_table_update_sql("_dlt_version", TABLE_UPDATE, False)[0] + sqlfluff.parse(sql, dialect="postgres") + qualified_name = client.sql_client.make_qualified_table_name("_dlt_version") + assert f"CREATE TABLE IF NOT EXISTS {qualified_name}" in sql diff --git a/tests/pipeline/test_pipeline_trace.py b/tests/pipeline/test_pipeline_trace.py index 69c0f01b8b..4e52d2aa29 100644 --- a/tests/pipeline/test_pipeline_trace.py +++ b/tests/pipeline/test_pipeline_trace.py @@ -551,7 +551,7 @@ def test_trace_telemetry() -> None: for item in SENTRY_SENT_ITEMS: # print(item) print(item["logentry"]["message"]) - assert len(SENTRY_SENT_ITEMS) == 2 + assert len(SENTRY_SENT_ITEMS) == 4 # trace with exception @dlt.resource From 08e5e7afca0f328da107d6e8eda7ca3c01366d33 Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink <47451109+jorritsandbrink@users.noreply.github.com> Date: Tue, 27 Aug 2024 11:59:43 +0400 Subject: [PATCH 06/10] Enable schema evolution for `merge` write disposition with `delta` table format (#1742) * black format * increase minimum deltalake version dependency * enable schema evolution for delta table merge * extract delta table merge logic into separate function * remove big decimal exclusion due to upstream bugfix * evolve delta table schema in empty source case * refactor DeltaLoadFilesystemJob * uses right table path format in delta lake load job * allows to pass schema name when getting delta tables and computing table counts * cleansup usage of remote paths and uris in filesystem load jobs * removes tempfile from file_storage 
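
For context, a minimal usage sketch of the behaviour this patch enables (the `delta_demo` pipeline, the local bucket path and the `items` resource are illustrative assumptions, not part of the change):

```python
import dlt
from dlt.destinations import filesystem
from dlt.common.libs.deltalake import get_delta_tables


# a resource materialized as a Delta table, merged with the upsert strategy
@dlt.resource(
    table_format="delta",
    write_disposition={"disposition": "merge", "strategy": "upsert"},
    primary_key="id",
)
def items():
    # a later run may yield additional columns; the merge path now evolves the Delta schema
    yield [{"id": 1, "name": "a"}, {"id": 2, "name": "b"}]


pipeline = dlt.pipeline(
    pipeline_name="delta_demo",
    destination=filesystem("/tmp/delta_lake_demo"),  # assumed local bucket for the sketch
    dataset_name="lake",
)
pipeline.run(items())

# inspect the resulting Delta tables from the default schema...
delta_tables = get_delta_tables(pipeline, "items")
print(delta_tables["items"].to_pandas())
# ...or target another schema explicitly via the new `schema_name` argument:
# get_delta_tables(pipeline, "items", schema_name="delta_demo")
```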
--------- Co-authored-by: Marcin Rudolf --- dlt/common/libs/deltalake.py | 77 ++++++-- dlt/common/storages/file_storage.py | 17 +- dlt/destinations/fs_client.py | 3 + .../impl/filesystem/filesystem.py | 166 +++++++++--------- poetry.lock | 162 ++++++++--------- pyproject.toml | 2 +- tests/libs/test_deltalake.py | 14 +- .../load/pipeline/test_filesystem_pipeline.py | 133 ++++++++++++-- tests/pipeline/utils.py | 17 +- 9 files changed, 358 insertions(+), 233 deletions(-) diff --git a/dlt/common/libs/deltalake.py b/dlt/common/libs/deltalake.py index d98795d07c..d4cb46c600 100644 --- a/dlt/common/libs/deltalake.py +++ b/dlt/common/libs/deltalake.py @@ -5,13 +5,15 @@ from dlt.common import logger from dlt.common.libs.pyarrow import pyarrow as pa from dlt.common.libs.pyarrow import cast_arrow_schema_types -from dlt.common.schema.typing import TWriteDisposition +from dlt.common.schema.typing import TWriteDisposition, TTableSchema +from dlt.common.schema.utils import get_first_column_name_with_prop, get_columns_names_with_prop from dlt.common.exceptions import MissingDependencyException from dlt.common.storages import FilesystemConfiguration from dlt.common.utils import assert_min_pkg_version from dlt.destinations.impl.filesystem.filesystem import FilesystemClient try: + import deltalake from deltalake import write_deltalake, DeltaTable from deltalake.writer import try_get_deltatable except ModuleNotFoundError: @@ -74,7 +76,7 @@ def write_delta_table( partition_by: Optional[Union[List[str], str]] = None, storage_options: Optional[Dict[str, str]] = None, ) -> None: - """Writes in-memory Arrow table to on-disk Delta table. + """Writes in-memory Arrow data to on-disk Delta table. Thin wrapper around `deltalake.write_deltalake`. """ @@ -93,31 +95,73 @@ def write_delta_table( ) -def get_delta_tables(pipeline: Pipeline, *tables: str) -> Dict[str, DeltaTable]: - """Returns Delta tables in `pipeline.default_schema` as `deltalake.DeltaTable` objects. +def merge_delta_table( + table: DeltaTable, + data: Union[pa.Table, pa.RecordBatchReader], + schema: TTableSchema, +) -> None: + """Merges in-memory Arrow data into on-disk Delta table.""" + + strategy = schema["x-merge-strategy"] # type: ignore[typeddict-item] + if strategy == "upsert": + # `DeltaTable.merge` does not support automatic schema evolution + # https://github.com/delta-io/delta-rs/issues/2282 + _evolve_delta_table_schema(table, data.schema) + + if "parent" in schema: + unique_column = get_first_column_name_with_prop(schema, "unique") + predicate = f"target.{unique_column} = source.{unique_column}" + else: + primary_keys = get_columns_names_with_prop(schema, "primary_key") + predicate = " AND ".join([f"target.{c} = source.{c}" for c in primary_keys]) + + qry = ( + table.merge( + source=ensure_delta_compatible_arrow_data(data), + predicate=predicate, + source_alias="source", + target_alias="target", + ) + .when_matched_update_all() + .when_not_matched_insert_all() + ) + + qry.execute() + else: + ValueError(f'Merge strategy "{strategy}" not supported.') + + +def get_delta_tables( + pipeline: Pipeline, *tables: str, schema_name: str = None +) -> Dict[str, DeltaTable]: + """Returns Delta tables in `pipeline.default_schema (default)` as `deltalake.DeltaTable` objects. Returned object is a dictionary with table names as keys and `DeltaTable` objects as values. Optionally filters dictionary by table names specified as `*tables*`. - Raises ValueError if table name specified as `*tables` is not found. 
+ Raises ValueError if table name specified as `*tables` is not found. You may try to switch to other + schemas via `schema_name` argument. """ from dlt.common.schema.utils import get_table_format - with pipeline.destination_client() as client: + with pipeline.destination_client(schema_name=schema_name) as client: assert isinstance( client, FilesystemClient ), "The `get_delta_tables` function requires a `filesystem` destination." schema_delta_tables = [ t["name"] - for t in pipeline.default_schema.tables.values() - if get_table_format(pipeline.default_schema.tables, t["name"]) == "delta" + for t in client.schema.tables.values() + if get_table_format(client.schema.tables, t["name"]) == "delta" ] if len(tables) > 0: invalid_tables = set(tables) - set(schema_delta_tables) if len(invalid_tables) > 0: + available_schemas = "" + if len(pipeline.schema_names) > 1: + available_schemas = f" Available schemas are {pipeline.schema_names}" raise ValueError( - "Schema does not contain Delta tables with these names: " - f"{', '.join(invalid_tables)}." + f"Schema {client.schema.name} does not contain Delta tables with these names: " + f"{', '.join(invalid_tables)}.{available_schemas}" ) schema_delta_tables = [t for t in schema_delta_tables if t in tables] table_dirs = client.get_table_dirs(schema_delta_tables, remote=True) @@ -145,3 +189,16 @@ def _deltalake_storage_options(config: FilesystemConfiguration) -> Dict[str, str + ". dlt will use the values in `deltalake_storage_options`." ) return {**creds, **extra_options} + + +def _evolve_delta_table_schema(delta_table: DeltaTable, arrow_schema: pa.Schema) -> None: + """Evolves `delta_table` schema if different from `arrow_schema`. + + Adds column(s) to `delta_table` present in `arrow_schema` but not in `delta_table`. 
+ """ + new_fields = [ + deltalake.Field.from_pyarrow(field) + for field in ensure_delta_compatible_arrow_schema(arrow_schema) + if field not in delta_table.to_pyarrow_dataset().schema + ] + delta_table.alter.add_columns(new_fields) diff --git a/dlt/common/storages/file_storage.py b/dlt/common/storages/file_storage.py index 7d14b8f7f7..f26cc060a3 100644 --- a/dlt/common/storages/file_storage.py +++ b/dlt/common/storages/file_storage.py @@ -3,7 +3,6 @@ import re import stat import errno -import tempfile import shutil import pathvalidate from typing import IO, Any, Optional, List, cast @@ -29,10 +28,8 @@ def save(self, relative_path: str, data: Any) -> str: @staticmethod def save_atomic(storage_path: str, relative_path: str, data: Any, file_type: str = "t") -> str: mode = "w" + file_type - with tempfile.NamedTemporaryFile( - dir=storage_path, mode=mode, delete=False, encoding=encoding_for_mode(mode) - ) as f: - tmp_path = f.name + tmp_path = os.path.join(storage_path, uniq_id(8)) + with open(tmp_path, mode=mode, encoding=encoding_for_mode(mode)) as f: f.write(data) try: dest_path = os.path.join(storage_path, relative_path) @@ -116,11 +113,11 @@ def open_file(self, relative_path: str, mode: str = "r") -> IO[Any]: return FileStorage.open_zipsafe_ro(self.make_full_path(relative_path), mode) return open(self.make_full_path(relative_path), mode, encoding=encoding_for_mode(mode)) - def open_temp(self, delete: bool = False, mode: str = "w", file_type: str = None) -> IO[Any]: - mode = mode + file_type or self.file_type - return tempfile.NamedTemporaryFile( - dir=self.storage_path, mode=mode, delete=delete, encoding=encoding_for_mode(mode) - ) + # def open_temp(self, delete: bool = False, mode: str = "w", file_type: str = None) -> IO[Any]: + # mode = mode + file_type or self.file_type + # return tempfile.NamedTemporaryFile( + # dir=self.storage_path, mode=mode, delete=delete, encoding=encoding_for_mode(mode) + # ) def has_file(self, relative_path: str) -> bool: return os.path.isfile(self.make_full_path(relative_path)) diff --git a/dlt/destinations/fs_client.py b/dlt/destinations/fs_client.py index 3233446594..14e77b6b4e 100644 --- a/dlt/destinations/fs_client.py +++ b/dlt/destinations/fs_client.py @@ -3,9 +3,12 @@ from abc import ABC, abstractmethod from fsspec import AbstractFileSystem +from dlt.common.schema import Schema + class FSClientBase(ABC): fs_client: AbstractFileSystem + schema: Schema @property @abstractmethod diff --git a/dlt/destinations/impl/filesystem/filesystem.py b/dlt/destinations/impl/filesystem/filesystem.py index 5445fd2ae9..05261ccb1b 100644 --- a/dlt/destinations/impl/filesystem/filesystem.py +++ b/dlt/destinations/impl/filesystem/filesystem.py @@ -3,7 +3,7 @@ import base64 from types import TracebackType -from typing import ClassVar, List, Type, Iterable, Iterator, Optional, Tuple, Sequence, cast +from typing import Dict, List, Type, Iterable, Iterator, Optional, Tuple, Sequence, cast from fsspec import AbstractFileSystem from contextlib import contextmanager @@ -13,7 +13,7 @@ from dlt.common.storages.fsspec_filesystem import glob_files from dlt.common.typing import DictStrAny from dlt.common.schema import Schema, TSchemaTables, TTableSchema -from dlt.common.schema.utils import get_first_column_name_with_prop, get_columns_names_with_prop +from dlt.common.schema.utils import get_columns_names_with_prop from dlt.common.storages import FileStorage, fsspec_from_config from dlt.common.storages.load_package import ( LoadJobInfo, @@ -56,36 +56,36 @@ def __init__( self._job_client: 
FilesystemClient = None def run(self) -> None: - # pick local filesystem pathlib or posix for buckets - self.is_local_filesystem = self._job_client.config.protocol == "file" - self.pathlib = os.path if self.is_local_filesystem else posixpath - - self.destination_file_name = path_utils.create_path( - self._job_client.config.layout, - self._file_name, - self._job_client.schema.name, - self._load_id, - current_datetime=self._job_client.config.current_datetime, - load_package_timestamp=dlt.current.load_package()["state"]["created_at"], - extra_placeholders=self._job_client.config.extra_placeholders, - ) + self.__is_local_filesystem = self._job_client.config.protocol == "file" # We would like to avoid failing for local filesystem where # deeply nested directory will not exist before writing a file. # It `auto_mkdir` is disabled by default in fsspec so we made some # trade offs between different options and decided on this. # remote_path = f"{client.config.protocol}://{posixpath.join(dataset_path, destination_file_name)}" remote_path = self.make_remote_path() - if self.is_local_filesystem: - self._job_client.fs_client.makedirs(self.pathlib.dirname(remote_path), exist_ok=True) + if self.__is_local_filesystem: + # use os.path for local file name + self._job_client.fs_client.makedirs(os.path.dirname(remote_path), exist_ok=True) self._job_client.fs_client.put_file(self._file_path, remote_path) def make_remote_path(self) -> str: """Returns path on the remote filesystem to which copy the file, without scheme. For local filesystem a native path is used""" + destination_file_name = path_utils.create_path( + self._job_client.config.layout, + self._file_name, + self._job_client.schema.name, + self._load_id, + current_datetime=self._job_client.config.current_datetime, + load_package_timestamp=dlt.current.load_package()["state"]["created_at"], + extra_placeholders=self._job_client.config.extra_placeholders, + ) + # pick local filesystem pathlib or posix for buckets + pathlib = os.path if self.__is_local_filesystem else posixpath # path.join does not normalize separators and available # normalization functions are very invasive and may string the trailing separator - return self.pathlib.join( # type: ignore[no-any-return] + return pathlib.join( # type: ignore[no-any-return] self._job_client.dataset_path, - path_utils.normalize_path_sep(self.pathlib, self.destination_file_name), + path_utils.normalize_path_sep(pathlib, destination_file_name), ) def make_remote_uri(self) -> str: @@ -98,89 +98,81 @@ def metrics(self) -> Optional[LoadJobMetrics]: class DeltaLoadFilesystemJob(FilesystemLoadJob): def __init__(self, file_path: str) -> None: - super().__init__( - file_path=file_path, - ) - - def run(self) -> None: - # pick local filesystem pathlib or posix for buckets - # TODO: since we pass _job_client via run_managed and not set_env_vars it is hard - # to write a handler with those two line below only in FilesystemLoadJob - self.is_local_filesystem = self._job_client.config.protocol == "file" - self.pathlib = os.path if self.is_local_filesystem else posixpath - self.destination_file_name = self._job_client.make_remote_uri( - self._job_client.get_table_dir(self.load_table_name) - ) + super().__init__(file_path=file_path) + # create Arrow dataset from Parquet files from dlt.common.libs.pyarrow import pyarrow as pa - from dlt.common.libs.deltalake import ( - DeltaTable, - write_delta_table, - ensure_delta_compatible_arrow_schema, - _deltalake_storage_options, - try_get_deltatable, - ) - # create Arrow dataset from 
Parquet files - file_paths = ReferenceFollowupJobRequest.resolve_references(self._file_path) - arrow_ds = pa.dataset.dataset(file_paths) + self.file_paths = ReferenceFollowupJobRequest.resolve_references(self._file_path) + self.arrow_ds = pa.dataset.dataset(self.file_paths) - # create Delta table object + def make_remote_path(self) -> str: + # remote path is table dir - delta will create its file structure inside it + return self._job_client.get_table_dir(self.load_table_name) - storage_options = _deltalake_storage_options(self._job_client.config) - dt = try_get_deltatable(self.destination_file_name, storage_options=storage_options) + def run(self) -> None: + logger.info(f"Will copy file(s) {self.file_paths} to delta table {self.make_remote_uri()}") - # get partition columns - part_cols = get_columns_names_with_prop(self._load_table, "partition") + from dlt.common.libs.deltalake import write_delta_table, merge_delta_table # explicitly check if there is data # (https://github.com/delta-io/delta-rs/issues/2686) - if arrow_ds.head(1).num_rows == 0: - if dt is None: - # create new empty Delta table with schema from Arrow table - DeltaTable.create( - table_uri=self.destination_file_name, - schema=ensure_delta_compatible_arrow_schema(arrow_ds.schema), - mode="overwrite", - partition_by=part_cols, - storage_options=storage_options, - ) + if self.arrow_ds.head(1).num_rows == 0: + self._create_or_evolve_delta_table() return - arrow_rbr = arrow_ds.scanner().to_reader() # RecordBatchReader - - if self._load_table["write_disposition"] == "merge" and dt is not None: - assert self._load_table["x-merge-strategy"] in self._job_client.capabilities.supported_merge_strategies # type: ignore[typeddict-item] - - if self._load_table["x-merge-strategy"] == "upsert": # type: ignore[typeddict-item] - if "parent" in self._load_table: - unique_column = get_first_column_name_with_prop(self._load_table, "unique") - predicate = f"target.{unique_column} = source.{unique_column}" - else: - primary_keys = get_columns_names_with_prop(self._load_table, "primary_key") - predicate = " AND ".join([f"target.{c} = source.{c}" for c in primary_keys]) - - qry = ( - dt.merge( - source=arrow_rbr, - predicate=predicate, - source_alias="source", - target_alias="target", - ) - .when_matched_update_all() - .when_not_matched_insert_all() + with self.arrow_ds.scanner().to_reader() as arrow_rbr: # RecordBatchReader + if self._load_table["write_disposition"] == "merge" and self._delta_table is not None: + assert self._load_table["x-merge-strategy"] in self._job_client.capabilities.supported_merge_strategies # type: ignore[typeddict-item] + merge_delta_table( + table=self._delta_table, + data=arrow_rbr, + schema=self._load_table, + ) + else: + write_delta_table( + table_or_uri=( + self.make_remote_uri() if self._delta_table is None else self._delta_table + ), + data=arrow_rbr, + write_disposition=self._load_table["write_disposition"], + partition_by=self._partition_columns, + storage_options=self._storage_options, ) - qry.execute() + @property + def _storage_options(self) -> Dict[str, str]: + from dlt.common.libs.deltalake import _deltalake_storage_options + + return _deltalake_storage_options(self._job_client.config) - else: - write_delta_table( - table_or_uri=self.destination_file_name if dt is None else dt, - data=arrow_rbr, - write_disposition=self._load_table["write_disposition"], - partition_by=part_cols, - storage_options=storage_options, + @property + def _delta_table(self) -> Optional["DeltaTable"]: # type: ignore[name-defined] # 
noqa: F821 + from dlt.common.libs.deltalake import try_get_deltatable + + return try_get_deltatable(self.make_remote_uri(), storage_options=self._storage_options) + + @property + def _partition_columns(self) -> List[str]: + return get_columns_names_with_prop(self._load_table, "partition") + + def _create_or_evolve_delta_table(self) -> None: + from dlt.common.libs.deltalake import ( + DeltaTable, + ensure_delta_compatible_arrow_schema, + _evolve_delta_table_schema, + ) + + if self._delta_table is None: + DeltaTable.create( + table_uri=self.make_remote_uri(), + schema=ensure_delta_compatible_arrow_schema(self.arrow_ds.schema), + mode="overwrite", + partition_by=self._partition_columns, + storage_options=self._storage_options, ) + else: + _evolve_delta_table_schema(self._delta_table, self.arrow_ds.schema) class FilesystemLoadJobWithFollowup(HasFollowupJobs, FilesystemLoadJob): diff --git a/poetry.lock b/poetry.lock index d54a73a2ef..230b354b97 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. [[package]] name = "about-time" @@ -2102,27 +2102,27 @@ typing-extensions = ">=3.10.0" [[package]] name = "databricks-sql-connector" -version = "3.1.2" +version = "3.3.0" description = "Databricks SQL Connector for Python" optional = true python-versions = "<4.0.0,>=3.8.0" files = [ - {file = "databricks_sql_connector-3.1.2-py3-none-any.whl", hash = "sha256:5292bc25b4d8d58d301079b55086331764f067e24862c9365698b2eeddedb737"}, - {file = "databricks_sql_connector-3.1.2.tar.gz", hash = "sha256:da0df114e0824d49ccfea36c4679c95689fe359191b056ad516446a058307c37"}, + {file = "databricks_sql_connector-3.3.0-py3-none-any.whl", hash = "sha256:55ee5a4a11291bf91a235ac76e41b419ddd66a9a321065a8bfaf119acbb26d6b"}, + {file = "databricks_sql_connector-3.3.0.tar.gz", hash = "sha256:19e82965da4c86574adfe9f788c17b4494d98eb8075ba4fd4306573d2edbf194"}, ] [package.dependencies] lz4 = ">=4.0.2,<5.0.0" numpy = [ - {version = ">=1.16.6", markers = "python_version >= \"3.8\" and python_version < \"3.11\""}, - {version = ">=1.23.4", markers = "python_version >= \"3.11\""}, + {version = ">=1.16.6,<2.0.0", markers = "python_version >= \"3.8\" and python_version < \"3.11\""}, + {version = ">=1.23.4,<2.0.0", markers = "python_version >= \"3.11\""}, ] oauthlib = ">=3.1.0,<4.0.0" openpyxl = ">=3.0.10,<4.0.0" pandas = {version = ">=1.2.5,<2.2.0", markers = "python_version >= \"3.8\""} -pyarrow = ">=14.0.1,<15.0.0" +pyarrow = ">=14.0.1,<17" requests = ">=2.18.1,<3.0.0" -thrift = ">=0.16.0,<0.17.0" +thrift = ">=0.16.0,<0.21.0" urllib3 = ">=1.26" [package.extras] @@ -2377,25 +2377,24 @@ files = [ [[package]] name = "deltalake" -version = "0.17.4" +version = "0.19.1" description = "Native Delta Lake Python binding based on delta-rs with Pandas integration" optional = true python-versions = ">=3.8" files = [ - {file = "deltalake-0.17.4-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:3f048bd4cdd3500fbb0d1b34046966ca4b7cefd1e9df71460b881ee8ad7f844a"}, - {file = "deltalake-0.17.4-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:b539265d8293794872e1dc3b2daad50abe05ab425e961824b3ac1155bb294604"}, - {file = "deltalake-0.17.4-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:55e6be5f5ab8d5d34d2ea58d86e93eec2da5d2476e3c15e9520239457618bca4"}, - {file = "deltalake-0.17.4-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:94dde6c2d0a07e9ce47be367d016541d3a499839350852205819353441e1a9c1"}, - {file = "deltalake-0.17.4-cp38-abi3-win_amd64.whl", hash = "sha256:f51f499d50dad88bdc18c5ed7c2319114759f3220f83aa2d32166c19accee4ce"}, - {file = "deltalake-0.17.4.tar.gz", hash = "sha256:c3c10577afc46d4b10ed16246d814a8c40b3663099066681eeba89f908373814"}, + {file = "deltalake-0.19.1-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:ddaaaa9c85a17791c3997cf320ac11dc1725d16cf4b6f0ff1b130853e7b56cd0"}, + {file = "deltalake-0.19.1-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:e0184d5a3f0d4f4f1fb992c3bdc8736329b78b6a4faf1a278109ec35d9945c1d"}, + {file = "deltalake-0.19.1-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ec9d117fcf6c198f3d554be2f3a6291ca3838530650db236741ff48d4d47abb4"}, + {file = "deltalake-0.19.1-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:447ef721319ed15f7b5f6da507efd5fed0e6172e5ae55ac044d5b8fc9b812e47"}, + {file = "deltalake-0.19.1-cp38-abi3-win_amd64.whl", hash = "sha256:b15bc343a9f8f3de80fbedcebd5d9472b539eb0f538a71739c7fcf699089127e"}, + {file = "deltalake-0.19.1.tar.gz", hash = "sha256:5e09fabb221fb81e989c283c16278eaffb6e85706d98364abcda5c0c6ca73598"}, ] [package.dependencies] -pyarrow = ">=8" -pyarrow-hotfix = "*" +pyarrow = ">=16" [package.extras] -devel = ["mypy (>=1.8.0,<1.9.0)", "packaging (>=20)", "pytest", "pytest-benchmark", "pytest-cov", "pytest-mock", "pytest-timeout", "ruff (>=0.3.0,<0.4.0)", "sphinx (<=4.5)", "sphinx-rtd-theme", "toml", "wheel"] +devel = ["azure-storage-blob (==12.20.0)", "mypy (==1.10.1)", "packaging (>=20)", "pytest", "pytest-benchmark", "pytest-cov", "pytest-mock", "pytest-timeout", "ruff (==0.5.2)", "sphinx (<=4.5)", "sphinx-rtd-theme", "toml", "wheel"] pandas = ["pandas"] pyspark = ["delta-spark", "numpy (==1.22.2)", "pyspark"] @@ -4567,17 +4566,17 @@ testing = ["pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", [[package]] name = "lancedb" -version = "0.9.0" +version = "0.13.0b1" description = "lancedb" optional = false -python-versions = ">=3.8" +python-versions = ">=3.9" files = [ - {file = "lancedb-0.9.0-cp38-abi3-macosx_10_15_x86_64.whl", hash = "sha256:b1ca08797c72c93ae512aa1078f1891756da157d910fbae8e194fac3528fc1ac"}, - {file = "lancedb-0.9.0-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:15129791f03c2c04b95f914ced2c1556b43d73a24710207b9af77b6e4008bdeb"}, - {file = "lancedb-0.9.0-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3f093d89447a2039b820d2540a0b64df3024e4549b6808ebd26b44fbe0345cc6"}, - {file = "lancedb-0.9.0-cp38-abi3-manylinux_2_24_aarch64.whl", hash = "sha256:a8c1f6777e217d2277451038866d280fa5fb38bd161795e51703b043c26dd345"}, - {file = "lancedb-0.9.0-cp38-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:78dd5800a1148f89d33b7e98d1c8b1c42dee146f03580abc1ca83cb05273ff7f"}, - {file = "lancedb-0.9.0-cp38-abi3-win_amd64.whl", hash = "sha256:ba5bdc727d3bc131f17414f42372acde5817073feeb553793a3d20003caa1658"}, + {file = "lancedb-0.13.0b1-cp38-abi3-macosx_10_15_x86_64.whl", hash = "sha256:687b9a08be55e6fa9520255b1b06dcd2e6ba6c64c947410821e9a3a52b2f48ec"}, + {file = "lancedb-0.13.0b1-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:ac00684f7e90ffc1b386298670e2c4ddaea8c0b61b6eb1b51dbd4e74feb87a86"}, + {file = "lancedb-0.13.0b1-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bbe8fc15bfeec89b6b2a4a42b4b919b6d3e138cf8684af35f77f361d73fe90cd"}, + {file = "lancedb-0.13.0b1-cp38-abi3-manylinux_2_24_aarch64.whl", hash = 
"sha256:231e1f00d724c468922f7951d902622d4ccb21c2db2a148b845beaebee5d35b3"}, + {file = "lancedb-0.13.0b1-cp38-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:fecdd71f137e52193bfb5843610f32fe025a60a1edf5f80530704de879706c6b"}, + {file = "lancedb-0.13.0b1-cp38-abi3-win_amd64.whl", hash = "sha256:7852d9c04a4402407af06bbbf78bf339a169f1df2bf5c70da586ca733ec40a68"}, ] [package.dependencies] @@ -4587,7 +4586,7 @@ deprecation = "*" overrides = ">=0.7" packaging = "*" pydantic = ">=1.10" -pylance = "0.13.0" +pylance = "0.16.1" ratelimiter = ">=1.0,<2.0" requests = ">=2.31.0" retry = ">=0.9.2" @@ -4598,8 +4597,8 @@ azure = ["adlfs (>=2024.2.0)"] clip = ["open-clip", "pillow", "torch"] dev = ["pre-commit", "ruff"] docs = ["mkdocs", "mkdocs-jupyter", "mkdocs-material", "mkdocstrings[python]"] -embeddings = ["awscli (>=1.29.57)", "boto3 (>=1.28.57)", "botocore (>=1.31.57)", "cohere", "google-generativeai", "huggingface-hub", "instructorembedding", "ollama", "open-clip-torch", "openai (>=1.6.1)", "pillow", "sentence-transformers", "torch"] -tests = ["aiohttp", "boto3", "duckdb", "pandas (>=1.4)", "polars (>=0.19)", "pytest", "pytest-asyncio", "pytest-mock", "pytz", "tantivy"] +embeddings = ["awscli (>=1.29.57)", "boto3 (>=1.28.57)", "botocore (>=1.31.57)", "cohere", "google-generativeai", "huggingface-hub", "ibm-watsonx-ai (>=1.1.2)", "instructorembedding", "ollama", "open-clip-torch", "openai (>=1.6.1)", "pillow", "sentence-transformers", "torch"] +tests = ["aiohttp", "boto3", "duckdb", "pandas (>=1.4)", "polars (>=0.19,<=1.3.0)", "pytest", "pytest-asyncio", "pytest-mock", "pytz", "tantivy"] [[package]] name = "lazy-object-proxy" @@ -6660,63 +6659,52 @@ files = [ [[package]] name = "pyarrow" -version = "14.0.2" +version = "16.1.0" description = "Python library for Apache Arrow" optional = false python-versions = ">=3.8" files = [ - {file = "pyarrow-14.0.2-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:ba9fe808596c5dbd08b3aeffe901e5f81095baaa28e7d5118e01354c64f22807"}, - {file = "pyarrow-14.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:22a768987a16bb46220cef490c56c671993fbee8fd0475febac0b3e16b00a10e"}, - {file = "pyarrow-14.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2dbba05e98f247f17e64303eb876f4a80fcd32f73c7e9ad975a83834d81f3fda"}, - {file = "pyarrow-14.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a898d134d00b1eca04998e9d286e19653f9d0fcb99587310cd10270907452a6b"}, - {file = "pyarrow-14.0.2-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:87e879323f256cb04267bb365add7208f302df942eb943c93a9dfeb8f44840b1"}, - {file = "pyarrow-14.0.2-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:76fc257559404ea5f1306ea9a3ff0541bf996ff3f7b9209fc517b5e83811fa8e"}, - {file = "pyarrow-14.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:b0c4a18e00f3a32398a7f31da47fefcd7a927545b396e1f15d0c85c2f2c778cd"}, - {file = "pyarrow-14.0.2-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:87482af32e5a0c0cce2d12eb3c039dd1d853bd905b04f3f953f147c7a196915b"}, - {file = "pyarrow-14.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:059bd8f12a70519e46cd64e1ba40e97eae55e0cbe1695edd95384653d7626b23"}, - {file = "pyarrow-14.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3f16111f9ab27e60b391c5f6d197510e3ad6654e73857b4e394861fc79c37200"}, - {file = "pyarrow-14.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:06ff1264fe4448e8d02073f5ce45a9f934c0f3db0a04460d0b01ff28befc3696"}, - 
{file = "pyarrow-14.0.2-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:6dd4f4b472ccf4042f1eab77e6c8bce574543f54d2135c7e396f413046397d5a"}, - {file = "pyarrow-14.0.2-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:32356bfb58b36059773f49e4e214996888eeea3a08893e7dbde44753799b2a02"}, - {file = "pyarrow-14.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:52809ee69d4dbf2241c0e4366d949ba035cbcf48409bf404f071f624ed313a2b"}, - {file = "pyarrow-14.0.2-cp312-cp312-macosx_10_14_x86_64.whl", hash = "sha256:c87824a5ac52be210d32906c715f4ed7053d0180c1060ae3ff9b7e560f53f944"}, - {file = "pyarrow-14.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a25eb2421a58e861f6ca91f43339d215476f4fe159eca603c55950c14f378cc5"}, - {file = "pyarrow-14.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5c1da70d668af5620b8ba0a23f229030a4cd6c5f24a616a146f30d2386fec422"}, - {file = "pyarrow-14.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2cc61593c8e66194c7cdfae594503e91b926a228fba40b5cf25cc593563bcd07"}, - {file = "pyarrow-14.0.2-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:78ea56f62fb7c0ae8ecb9afdd7893e3a7dbeb0b04106f5c08dbb23f9c0157591"}, - {file = "pyarrow-14.0.2-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:37c233ddbce0c67a76c0985612fef27c0c92aef9413cf5aa56952f359fcb7379"}, - {file = "pyarrow-14.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:e4b123ad0f6add92de898214d404e488167b87b5dd86e9a434126bc2b7a5578d"}, - {file = "pyarrow-14.0.2-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:e354fba8490de258be7687f341bc04aba181fc8aa1f71e4584f9890d9cb2dec2"}, - {file = "pyarrow-14.0.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:20e003a23a13da963f43e2b432483fdd8c38dc8882cd145f09f21792e1cf22a1"}, - {file = "pyarrow-14.0.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fc0de7575e841f1595ac07e5bc631084fd06ca8b03c0f2ecece733d23cd5102a"}, - {file = "pyarrow-14.0.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:66e986dc859712acb0bd45601229021f3ffcdfc49044b64c6d071aaf4fa49e98"}, - {file = "pyarrow-14.0.2-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:f7d029f20ef56673a9730766023459ece397a05001f4e4d13805111d7c2108c0"}, - {file = "pyarrow-14.0.2-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:209bac546942b0d8edc8debda248364f7f668e4aad4741bae58e67d40e5fcf75"}, - {file = "pyarrow-14.0.2-cp38-cp38-win_amd64.whl", hash = "sha256:1e6987c5274fb87d66bb36816afb6f65707546b3c45c44c28e3c4133c010a881"}, - {file = "pyarrow-14.0.2-cp39-cp39-macosx_10_14_x86_64.whl", hash = "sha256:a01d0052d2a294a5f56cc1862933014e696aa08cc7b620e8c0cce5a5d362e976"}, - {file = "pyarrow-14.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a51fee3a7db4d37f8cda3ea96f32530620d43b0489d169b285d774da48ca9785"}, - {file = "pyarrow-14.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:64df2bf1ef2ef14cee531e2dfe03dd924017650ffaa6f9513d7a1bb291e59c15"}, - {file = "pyarrow-14.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3c0fa3bfdb0305ffe09810f9d3e2e50a2787e3a07063001dcd7adae0cee3601a"}, - {file = "pyarrow-14.0.2-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:c65bf4fd06584f058420238bc47a316e80dda01ec0dfb3044594128a6c2db794"}, - {file = "pyarrow-14.0.2-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:63ac901baec9369d6aae1cbe6cca11178fb018a8d45068aaf5bb54f94804a866"}, - {file = "pyarrow-14.0.2-cp39-cp39-win_amd64.whl", hash = 
"sha256:75ee0efe7a87a687ae303d63037d08a48ef9ea0127064df18267252cfe2e9541"}, - {file = "pyarrow-14.0.2.tar.gz", hash = "sha256:36cef6ba12b499d864d1def3e990f97949e0b79400d08b7cf74504ffbd3eb025"}, + {file = "pyarrow-16.1.0-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:17e23b9a65a70cc733d8b738baa6ad3722298fa0c81d88f63ff94bf25eaa77b9"}, + {file = "pyarrow-16.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:4740cc41e2ba5d641071d0ab5e9ef9b5e6e8c7611351a5cb7c1d175eaf43674a"}, + {file = "pyarrow-16.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:98100e0268d04e0eec47b73f20b39c45b4006f3c4233719c3848aa27a03c1aef"}, + {file = "pyarrow-16.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f68f409e7b283c085f2da014f9ef81e885d90dcd733bd648cfba3ef265961848"}, + {file = "pyarrow-16.1.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:a8914cd176f448e09746037b0c6b3a9d7688cef451ec5735094055116857580c"}, + {file = "pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:48be160782c0556156d91adbdd5a4a7e719f8d407cb46ae3bb4eaee09b3111bd"}, + {file = "pyarrow-16.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:9cf389d444b0f41d9fe1444b70650fea31e9d52cfcb5f818b7888b91b586efff"}, + {file = "pyarrow-16.1.0-cp311-cp311-macosx_10_15_x86_64.whl", hash = "sha256:d0ebea336b535b37eee9eee31761813086d33ed06de9ab6fc6aaa0bace7b250c"}, + {file = "pyarrow-16.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2e73cfc4a99e796727919c5541c65bb88b973377501e39b9842ea71401ca6c1c"}, + {file = "pyarrow-16.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bf9251264247ecfe93e5f5a0cd43b8ae834f1e61d1abca22da55b20c788417f6"}, + {file = "pyarrow-16.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ddf5aace92d520d3d2a20031d8b0ec27b4395cab9f74e07cc95edf42a5cc0147"}, + {file = "pyarrow-16.1.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:25233642583bf658f629eb230b9bb79d9af4d9f9229890b3c878699c82f7d11e"}, + {file = "pyarrow-16.1.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:a33a64576fddfbec0a44112eaf844c20853647ca833e9a647bfae0582b2ff94b"}, + {file = "pyarrow-16.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:185d121b50836379fe012753cf15c4ba9638bda9645183ab36246923875f8d1b"}, + {file = "pyarrow-16.1.0-cp312-cp312-macosx_10_15_x86_64.whl", hash = "sha256:2e51ca1d6ed7f2e9d5c3c83decf27b0d17bb207a7dea986e8dc3e24f80ff7d6f"}, + {file = "pyarrow-16.1.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:06ebccb6f8cb7357de85f60d5da50e83507954af617d7b05f48af1621d331c9a"}, + {file = "pyarrow-16.1.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b04707f1979815f5e49824ce52d1dceb46e2f12909a48a6a753fe7cafbc44a0c"}, + {file = "pyarrow-16.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0d32000693deff8dc5df444b032b5985a48592c0697cb6e3071a5d59888714e2"}, + {file = "pyarrow-16.1.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:8785bb10d5d6fd5e15d718ee1d1f914fe768bf8b4d1e5e9bf253de8a26cb1628"}, + {file = "pyarrow-16.1.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:e1369af39587b794873b8a307cc6623a3b1194e69399af0efd05bb202195a5a7"}, + {file = "pyarrow-16.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:febde33305f1498f6df85e8020bca496d0e9ebf2093bab9e0f65e2b4ae2b3444"}, + {file = "pyarrow-16.1.0-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:b5f5705ab977947a43ac83b52ade3b881eb6e95fcc02d76f501d549a210ba77f"}, + 
{file = "pyarrow-16.1.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:0d27bf89dfc2576f6206e9cd6cf7a107c9c06dc13d53bbc25b0bd4556f19cf5f"}, + {file = "pyarrow-16.1.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0d07de3ee730647a600037bc1d7b7994067ed64d0eba797ac74b2bc77384f4c2"}, + {file = "pyarrow-16.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fbef391b63f708e103df99fbaa3acf9f671d77a183a07546ba2f2c297b361e83"}, + {file = "pyarrow-16.1.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:19741c4dbbbc986d38856ee7ddfdd6a00fc3b0fc2d928795b95410d38bb97d15"}, + {file = "pyarrow-16.1.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:f2c5fb249caa17b94e2b9278b36a05ce03d3180e6da0c4c3b3ce5b2788f30eed"}, + {file = "pyarrow-16.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:e6b6d3cd35fbb93b70ade1336022cc1147b95ec6af7d36906ca7fe432eb09710"}, + {file = "pyarrow-16.1.0-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:18da9b76a36a954665ccca8aa6bd9f46c1145f79c0bb8f4f244f5f8e799bca55"}, + {file = "pyarrow-16.1.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:99f7549779b6e434467d2aa43ab2b7224dd9e41bdde486020bae198978c9e05e"}, + {file = "pyarrow-16.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f07fdffe4fd5b15f5ec15c8b64584868d063bc22b86b46c9695624ca3505b7b4"}, + {file = "pyarrow-16.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ddfe389a08ea374972bd4065d5f25d14e36b43ebc22fc75f7b951f24378bf0b5"}, + {file = "pyarrow-16.1.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:3b20bd67c94b3a2ea0a749d2a5712fc845a69cb5d52e78e6449bbd295611f3aa"}, + {file = "pyarrow-16.1.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:ba8ac20693c0bb0bf4b238751d4409e62852004a8cf031c73b0e0962b03e45e3"}, + {file = "pyarrow-16.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:31a1851751433d89a986616015841977e0a188662fcffd1a5677453f1df2de0a"}, + {file = "pyarrow-16.1.0.tar.gz", hash = "sha256:15fbb22ea96d11f0b5768504a3f961edab25eaf4197c341720c4a387f6c60315"}, ] [package.dependencies] numpy = ">=1.16.6" -[[package]] -name = "pyarrow-hotfix" -version = "0.6" -description = "" -optional = true -python-versions = ">=3.5" -files = [ - {file = "pyarrow_hotfix-0.6-py3-none-any.whl", hash = "sha256:dcc9ae2d220dff0083be6a9aa8e0cdee5182ad358d4931fce825c545e5c89178"}, - {file = "pyarrow_hotfix-0.6.tar.gz", hash = "sha256:79d3e030f7ff890d408a100ac16d6f00b14d44a502d7897cd9fc3e3a534e9945"}, -] - [[package]] name = "pyasn1" version = "0.5.0" @@ -6993,22 +6981,22 @@ tests = ["coverage[toml] (==5.0.4)", "pytest (>=6.0.0,<7.0.0)"] [[package]] name = "pylance" -version = "0.13.0" +version = "0.16.1" description = "python wrapper for Lance columnar format" optional = false python-versions = ">=3.9" files = [ - {file = "pylance-0.13.0-cp39-abi3-macosx_10_15_x86_64.whl", hash = "sha256:2f3d6f9eec1f59f45dccb01075ba79868b8d37c8371d6210bcf6418217a0dd8b"}, - {file = "pylance-0.13.0-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:f4861ab466c94b0f9a4b4e6de6e1dfa02f40e7242d8db87447bc7bb7d89606ac"}, - {file = "pylance-0.13.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e3cb92547e145f5bfb0ea7d6f483953913b9bdd44c45bea84fc95a18da9f5853"}, - {file = "pylance-0.13.0-cp39-abi3-manylinux_2_24_aarch64.whl", hash = "sha256:d1ddd7700924bc6b6b0774ea63d2aa23f9210a86cd6d6af0cdfa987df776d50d"}, - {file = "pylance-0.13.0-cp39-abi3-manylinux_2_28_x86_64.whl", hash = 
"sha256:c51d4b6e59cf4dc97c11a35b299f11e80dbdf392e2d8dc498573c26474a3c19e"}, - {file = "pylance-0.13.0-cp39-abi3-win_amd64.whl", hash = "sha256:4018ba016f1445874960a4ba2ad5c80cb380f3116683282ee8beabd38fa8989d"}, + {file = "pylance-0.16.1-cp39-abi3-macosx_10_15_x86_64.whl", hash = "sha256:7092303ae21bc162edd98e20fc39785fa1ec6b67f04132977ac0fd63110ba16f"}, + {file = "pylance-0.16.1-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:7c2ebdf89928c68f053ab9e369a5477da0a2ba70d47c00075dc10a37039d9e90"}, + {file = "pylance-0.16.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b4525c2fd8095830b753a3efb7285f358b016836086683fe977f9f1de8e6866c"}, + {file = "pylance-0.16.1-cp39-abi3-manylinux_2_24_aarch64.whl", hash = "sha256:645f0ab338bc4bd42bf3321bbb4053261979117aefd8477c2192ba624de27778"}, + {file = "pylance-0.16.1-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:3a7464d60aca51e89196a79c638bcbff0bddb77158946e2ea6b5fcbc6cfc63e1"}, + {file = "pylance-0.16.1-cp39-abi3-win_amd64.whl", hash = "sha256:d12c628dfbd49efde15a5512247065341f3efb29989dd08fb5a7023f013471ee"}, ] [package.dependencies] -numpy = ">=1.22" -pyarrow = ">=12,<15.0.1" +numpy = ">=1.22,<2" +pyarrow = ">=12" [package.extras] benchmarks = ["pytest-benchmark"] @@ -9696,4 +9684,4 @@ weaviate = ["weaviate-client"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<3.13" -content-hash = "a64fdd2845d27c9abc344809be68cba08f46641aabdc07416c37c802450fe4f3" +content-hash = "2b8d00f91f33a380b2399989dcac0d1d106d0bd2cd8865c5b7e27a19885753b5" diff --git a/pyproject.toml b/pyproject.toml index f33bbbefcf..74161f5ccc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -80,7 +80,7 @@ databricks-sql-connector = {version = ">=2.9.3", optional = true} clickhouse-driver = { version = ">=0.2.7", optional = true } clickhouse-connect = { version = ">=0.7.7", optional = true } lancedb = { version = ">=0.8.2", optional = true, markers = "python_version >= '3.9'", allow-prereleases = true } -deltalake = { version = ">=0.17.4", optional = true } +deltalake = { version = ">=0.19.0", optional = true } [tool.poetry.extras] gcp = ["grpcio", "google-cloud-bigquery", "db-dtypes", "gcsfs"] diff --git a/tests/libs/test_deltalake.py b/tests/libs/test_deltalake.py index 3e2d7cc3f6..dc5586eb32 100644 --- a/tests/libs/test_deltalake.py +++ b/tests/libs/test_deltalake.py @@ -95,21 +95,9 @@ def arrow_data( # type: ignore[return] client = cast(FilesystemClient, client) storage_options = _deltalake_storage_options(client.config) - with pytest.raises(Exception): - # bug in `delta-rs` causes error when writing big decimal values - # https://github.com/delta-io/delta-rs/issues/2510 - # if this test fails, the bug has been fixed and we should remove this - # note from the docs: - write_delta_table( - remote_dir + "/corrupt_delta_table", - arrow_table_all_data_types("arrow-table", include_decimal_default_precision=True)[0], - write_disposition="append", - storage_options=storage_options, - ) - arrow_table = arrow_table_all_data_types( "arrow-table", - include_decimal_default_precision=False, + include_decimal_default_precision=True, include_decimal_arrow_max_precision=True, num_rows=2, )[0] diff --git a/tests/load/pipeline/test_filesystem_pipeline.py b/tests/load/pipeline/test_filesystem_pipeline.py index 4b8707e989..d88eba7c06 100644 --- a/tests/load/pipeline/test_filesystem_pipeline.py +++ b/tests/load/pipeline/test_filesystem_pipeline.py @@ -15,7 +15,7 @@ from dlt.common.storages.configuration import FilesystemConfiguration from 
dlt.common.storages.load_package import ParsedLoadJobFileName from dlt.common.utils import uniq_id -from dlt.common.exceptions import DependencyVersionException +from dlt.common.schema.typing import TWriteDisposition from dlt.destinations import filesystem from dlt.destinations.impl.filesystem.filesystem import FilesystemClient from dlt.destinations.impl.filesystem.typing import TExtraPlaceholders @@ -580,6 +580,103 @@ def two_part(): @pytest.mark.essential +@pytest.mark.parametrize( + "destination_config", + destinations_configs( + table_format_filesystem_configs=True, + table_format="delta", + bucket_subset=(FILE_BUCKET), + ), + ids=lambda x: x.name, +) +@pytest.mark.parametrize( + "write_disposition", + ( + "append", + "replace", + pytest.param({"disposition": "merge", "strategy": "upsert"}, id="upsert"), + ), +) +def test_delta_table_schema_evolution( + destination_config: DestinationTestConfiguration, + write_disposition: TWriteDisposition, +) -> None: + """Tests schema evolution (adding new columns) for `delta` table format.""" + from dlt.common.libs.deltalake import get_delta_tables, ensure_delta_compatible_arrow_data + from dlt.common.libs.pyarrow import pyarrow + + @dlt.resource( + write_disposition=write_disposition, + primary_key="pk", + table_format="delta", + ) + def delta_table(data): + yield data + + pipeline = destination_config.setup_pipeline("fs_pipe", dev_mode=True) + + # create Arrow table with one column, one row + pk_field = pyarrow.field("pk", pyarrow.int64(), nullable=False) + schema = pyarrow.schema([pk_field]) + arrow_table = pyarrow.Table.from_pydict({"pk": [1]}, schema=schema) + assert arrow_table.shape == (1, 1) + + # initial load + info = pipeline.run(delta_table(arrow_table)) + assert_load_info(info) + dt = get_delta_tables(pipeline, "delta_table")["delta_table"] + expected = ensure_delta_compatible_arrow_data(arrow_table) + actual = dt.to_pyarrow_table() + assert actual.equals(expected) + + # create Arrow table with many columns, two rows + arrow_table = arrow_table_all_data_types( + "arrow-table", + include_decimal_default_precision=True, + include_decimal_arrow_max_precision=True, + include_not_normalized_name=False, + include_null=False, + num_rows=2, + )[0] + arrow_table = arrow_table.add_column(0, pk_field, [[1, 2]]) + + # second load — this should evolve the schema (i.e. 
add the new columns) + info = pipeline.run(delta_table(arrow_table)) + assert_load_info(info) + dt = get_delta_tables(pipeline, "delta_table")["delta_table"] + actual = dt.to_pyarrow_table() + expected = ensure_delta_compatible_arrow_data(arrow_table) + if write_disposition == "append": + # just check shape and schema for `append`, because table comparison is + # more involved than with the other dispositions + assert actual.num_rows == 3 + actual.schema.equals(expected.schema) + else: + assert actual.sort_by("pk").equals(expected.sort_by("pk")) + + # create empty Arrow table with additional column + arrow_table = arrow_table.append_column( + pyarrow.field("another_new_column", pyarrow.string()), + [["foo", "foo"]], + ) + empty_arrow_table = arrow_table.schema.empty_table() + + # load 3 — this should evolve the schema without changing data + info = pipeline.run(delta_table(empty_arrow_table)) + assert_load_info(info) + dt = get_delta_tables(pipeline, "delta_table")["delta_table"] + actual = dt.to_pyarrow_table() + expected_schema = ensure_delta_compatible_arrow_data(arrow_table).schema + assert actual.schema.equals(expected_schema) + expected_num_rows = 3 if write_disposition == "append" else 2 + assert actual.num_rows == expected_num_rows + # new column should have NULLs only + assert ( + actual.column("another_new_column").combine_chunks().to_pylist() + == [None] * expected_num_rows + ) + + @pytest.mark.parametrize( "destination_config", destinations_configs( @@ -607,7 +704,7 @@ def delta_table(data): # create empty Arrow table with schema arrow_table = arrow_table_all_data_types( "arrow-table", - include_decimal_default_precision=False, + include_decimal_default_precision=True, include_decimal_arrow_max_precision=True, include_not_normalized_name=False, include_null=False, @@ -643,22 +740,6 @@ def delta_table(data): ensure_delta_compatible_arrow_data(empty_arrow_table).schema ) - # run 3: empty Arrow table with different schema - # this should not alter the Delta table - empty_arrow_table_2 = pa.schema( - [pa.field("foo", pa.int64()), pa.field("bar", pa.string())] - ).empty_table() - - info = pipeline.run(delta_table(empty_arrow_table_2)) - assert_load_info(info) - dt = get_delta_tables(pipeline, "delta_table")["delta_table"] - assert dt.version() == 1 # still 1, no new commit was done - dt_arrow_table = dt.to_pyarrow_table() - assert dt_arrow_table.shape == (2, empty_arrow_table.num_columns) # shape did not change - assert dt_arrow_table.schema.equals( # schema did not change - ensure_delta_compatible_arrow_data(empty_arrow_table).schema - ) - # test `dlt.mark.materialize_table_schema()` users_materialize_table_schema.apply_hints(table_format="delta") info = pipeline.run(users_materialize_table_schema()) @@ -810,6 +891,22 @@ def parent_delta(): with pytest.raises(ValueError): get_delta_tables(pipeline, "non_existing_table") + # test unknown schema + with pytest.raises(FileNotFoundError): + get_delta_tables(pipeline, "non_existing_table", schema_name="aux_2") + + # load to a new schema and under new name + aux_schema = dlt.Schema("aux_2") + # NOTE: you cannot have a file with name + info = pipeline.run(parent_delta().with_name("aux_delta"), schema=aux_schema) + # also state in seprate package + assert_load_info(info, expected_load_packages=2) + delta_tables = get_delta_tables(pipeline, schema_name="aux_2") + assert "aux_delta__child" in delta_tables.keys() + get_delta_tables(pipeline, "aux_delta", schema_name="aux_2") + with pytest.raises(ValueError): + get_delta_tables(pipeline, 
"aux_delta") + @pytest.mark.parametrize( "destination_config", diff --git a/tests/pipeline/utils.py b/tests/pipeline/utils.py index d3d87f0e0b..dfb5f3f82d 100644 --- a/tests/pipeline/utils.py +++ b/tests/pipeline/utils.py @@ -177,24 +177,27 @@ def _load_file(client: FSClientBase, filepath) -> List[Dict[str, Any]]: # -def _load_tables_to_dicts_fs(p: dlt.Pipeline, *table_names: str) -> Dict[str, List[Dict[str, Any]]]: +def _load_tables_to_dicts_fs( + p: dlt.Pipeline, *table_names: str, schema_name: str = None +) -> Dict[str, List[Dict[str, Any]]]: """For now this will expect the standard layout in the filesystem destination, if changed the results will not be correct""" - client = p._fs_client() + client = p._fs_client(schema_name=schema_name) + assert isinstance(client, FilesystemClient) + result: Dict[str, Any] = {} delta_table_names = [ table_name for table_name in table_names - if get_table_format(p.default_schema.tables, table_name) == "delta" + if get_table_format(client.schema.tables, table_name) == "delta" ] if len(delta_table_names) > 0: from dlt.common.libs.deltalake import get_delta_tables - delta_tables = get_delta_tables(p, *table_names) + delta_tables = get_delta_tables(p, *table_names, schema_name=schema_name) for table_name in table_names: - if table_name in p.default_schema.data_table_names() and table_name in delta_table_names: - assert isinstance(client, FilesystemClient) + if table_name in client.schema.data_table_names() and table_name in delta_table_names: dt = delta_tables[table_name] result[table_name] = dt.to_pyarrow_table().to_pylist() else: @@ -244,7 +247,7 @@ def _sort_list_of_dicts(list_: List[Dict[str, Any]], sortkey: str) -> List[Dict[ return sorted(list_, key=lambda d: d[sortkey]) if _is_filesystem(p): - result = _load_tables_to_dicts_fs(p, *table_names) + result = _load_tables_to_dicts_fs(p, *table_names, schema_name=schema_name) else: result = _load_tables_to_dicts_sql(p, *table_names, schema_name=schema_name) From e337cca079ab21742339e097eb381635eafc5de5 Mon Sep 17 00:00:00 2001 From: rudolfix Date: Tue, 27 Aug 2024 18:32:07 +0200 Subject: [PATCH 07/10] provides detail exception messages when cursor stored value cannot be coerced to data in incremental (#1748) --- .../impl/filesystem/filesystem.py | 1 + dlt/extract/incremental/exceptions.py | 26 ++++++++ dlt/extract/incremental/transform.py | 63 ++++++++++++++++--- tests/extract/test_incremental.py | 21 ++++++- 4 files changed, 101 insertions(+), 10 deletions(-) diff --git a/dlt/destinations/impl/filesystem/filesystem.py b/dlt/destinations/impl/filesystem/filesystem.py index 05261ccb1b..62263a10b9 100644 --- a/dlt/destinations/impl/filesystem/filesystem.py +++ b/dlt/destinations/impl/filesystem/filesystem.py @@ -89,6 +89,7 @@ def make_remote_path(self) -> str: ) def make_remote_uri(self) -> str: + """Returns path on a remote filesystem as a full uri including scheme.""" return self._job_client.make_remote_uri(self.make_remote_path()) def metrics(self) -> Optional[LoadJobMetrics]: diff --git a/dlt/extract/incremental/exceptions.py b/dlt/extract/incremental/exceptions.py index e318a028dc..a5f94c2974 100644 --- a/dlt/extract/incremental/exceptions.py +++ b/dlt/extract/incremental/exceptions.py @@ -1,3 +1,5 @@ +from typing import Any + from dlt.extract.exceptions import PipeException from dlt.common.typing import TDataItem @@ -13,6 +15,30 @@ def __init__(self, pipe_name: str, json_path: str, item: TDataItem, msg: str = N super().__init__(pipe_name, msg) +class IncrementalCursorInvalidCoercion(PipeException): + 
def __init__( + self, + pipe_name: str, + cursor_path: str, + cursor_value: TDataItem, + cursor_value_type: str, + item: TDataItem, + item_type: Any, + details: str, + ) -> None: + self.cursor_path = cursor_path + self.cursor_value = cursor_value + self.cursor_value_type = cursor_value_type + self.item = item + msg = ( + f"Could not coerce {cursor_value_type} with value {cursor_value} and type" + f" {type(cursor_value)} to actual data item {item} at path {cursor_path} with type" + f" {item_type}: {details}. You need to use different data type for" + f" {cursor_value_type} or cast your data ie. by using `add_map` on this resource." + ) + super().__init__(pipe_name, msg) + + class IncrementalPrimaryKeyMissing(PipeException): def __init__(self, pipe_name: str, primary_key_column: str, item: TDataItem) -> None: self.primary_key_column = primary_key_column diff --git a/dlt/extract/incremental/transform.py b/dlt/extract/incremental/transform.py index 947e21f7b8..0ac9fdf520 100644 --- a/dlt/extract/incremental/transform.py +++ b/dlt/extract/incremental/transform.py @@ -8,6 +8,7 @@ from dlt.common.typing import TDataItem from dlt.common.jsonpath import find_values, JSONPathFields, compile_path from dlt.extract.incremental.exceptions import ( + IncrementalCursorInvalidCoercion, IncrementalCursorPathMissing, IncrementalPrimaryKeyMissing, ) @@ -158,14 +159,36 @@ def __call__( # Check whether end_value has been reached # Filter end value ranges exclusively, so in case of "max" function we remove values >= end_value - if self.end_value is not None and ( - last_value_func((row_value, self.end_value)) != self.end_value - or last_value_func((row_value,)) == self.end_value - ): - return None, False, True - + if self.end_value is not None: + try: + if ( + last_value_func((row_value, self.end_value)) != self.end_value + or last_value_func((row_value,)) == self.end_value + ): + return None, False, True + except Exception as ex: + raise IncrementalCursorInvalidCoercion( + self.resource_name, + self.cursor_path, + self.end_value, + "end_value", + row_value, + type(row_value).__name__, + str(ex), + ) from ex check_values = (row_value,) + ((last_value,) if last_value is not None else ()) - new_value = last_value_func(check_values) + try: + new_value = last_value_func(check_values) + except Exception as ex: + raise IncrementalCursorInvalidCoercion( + self.resource_name, + self.cursor_path, + last_value, + "start_value/initial_value", + row_value, + type(row_value).__name__, + str(ex), + ) from ex # new_value is "less" or equal to last_value (the actual max) if last_value == new_value: # use func to compute row_value into last_value compatible @@ -294,14 +317,36 @@ def __call__( # If end_value is provided, filter to include table rows that are "less" than end_value if self.end_value is not None: - end_value_scalar = to_arrow_scalar(self.end_value, cursor_data_type) + try: + end_value_scalar = to_arrow_scalar(self.end_value, cursor_data_type) + except Exception as ex: + raise IncrementalCursorInvalidCoercion( + self.resource_name, + cursor_path, + self.end_value, + "end_value", + "", + cursor_data_type, + str(ex), + ) from ex tbl = tbl.filter(end_compare(tbl[cursor_path], end_value_scalar)) # Is max row value higher than end value? # NOTE: pyarrow bool *always* evaluates to python True. 
`as_py()` is necessary end_out_of_range = not end_compare(row_value_scalar, end_value_scalar).as_py() if self.start_value is not None: - start_value_scalar = to_arrow_scalar(self.start_value, cursor_data_type) + try: + start_value_scalar = to_arrow_scalar(self.start_value, cursor_data_type) + except Exception as ex: + raise IncrementalCursorInvalidCoercion( + self.resource_name, + cursor_path, + self.start_value, + "start_value/initial_value", + "", + cursor_data_type, + str(ex), + ) from ex # Remove rows lower or equal than the last start value keep_filter = last_value_compare(tbl[cursor_path], start_value_scalar) start_out_of_range = bool(pa.compute.any(pa.compute.invert(keep_filter)).as_py()) diff --git a/tests/extract/test_incremental.py b/tests/extract/test_incremental.py index f4082a7d86..c401552fb2 100644 --- a/tests/extract/test_incremental.py +++ b/tests/extract/test_incremental.py @@ -30,6 +30,7 @@ from dlt.sources.helpers.transform import take_first from dlt.extract.incremental import IncrementalResourceWrapper, Incremental from dlt.extract.incremental.exceptions import ( + IncrementalCursorInvalidCoercion, IncrementalCursorPathMissing, IncrementalPrimaryKeyMissing, ) @@ -1303,7 +1304,7 @@ def some_data( ) # will cause invalid comparison if item_type == "object": - with pytest.raises(InvalidStepFunctionArguments): + with pytest.raises(IncrementalCursorInvalidCoercion): list(resource) else: data = data_item_to_list(item_type, list(resource)) @@ -2065,3 +2066,21 @@ def test_source(): incremental_steps = test_source_incremental().table_name._pipe._steps assert isinstance(incremental_steps[-2], ValidateItem) assert isinstance(incremental_steps[-1], IncrementalResourceWrapper) + + +@pytest.mark.parametrize("item_type", ALL_TEST_DATA_ITEM_FORMATS) +def test_cursor_date_coercion(item_type: TestDataItemFormat) -> None: + today = datetime.today().date() + + @dlt.resource() + def updated_is_int(updated_at=dlt.sources.incremental("updated_at", initial_value=today)): + data = [{"updated_at": d} for d in [1, 2, 3]] + yield data_to_item_format(item_type, data) + + pip_1_name = "test_pydantic_columns_validator_" + uniq_id() + pipeline = dlt.pipeline(pipeline_name=pip_1_name, destination="duckdb") + + with pytest.raises(PipelineStepFailed) as pip_ex: + pipeline.run(updated_is_int()) + assert isinstance(pip_ex.value.__cause__, IncrementalCursorInvalidCoercion) + assert pip_ex.value.__cause__.cursor_path == "updated_at" From 98ca505fd06b8146a4355c6355174abe8b45ef66 Mon Sep 17 00:00:00 2001 From: VioletM Date: Wed, 28 Aug 2024 06:28:50 -0400 Subject: [PATCH 08/10] Expose staging tables truncation to config (#1717) * Expose staging tables truncation to config * Fix comments, add tests * Fix tests * Move implementation from mixing, add tests * Fix docs grammar --- dlt/common/destination/reference.py | 8 ++- dlt/destinations/impl/athena/athena.py | 2 +- dlt/destinations/impl/bigquery/bigquery.py | 3 + .../impl/clickhouse/clickhouse.py | 3 + .../impl/databricks/databricks.py | 3 + dlt/destinations/impl/dremio/dremio.py | 3 + dlt/destinations/impl/dummy/configuration.py | 2 + dlt/destinations/impl/dummy/dummy.py | 3 + dlt/destinations/impl/redshift/redshift.py | 3 + dlt/destinations/impl/snowflake/snowflake.py | 3 + dlt/destinations/impl/synapse/synapse.py | 3 + dlt/load/utils.py | 7 +- docs/website/docs/dlt-ecosystem/staging.md | 72 ++++++++++++------- tests/load/pipeline/test_stage_loading.py | 57 ++++++++++++++- tests/load/test_dummy_client.py | 17 +++++ 15 files changed, 152 insertions(+), 37 
deletions(-) diff --git a/dlt/common/destination/reference.py b/dlt/common/destination/reference.py index 744cbbd1f5..0944b03bea 100644 --- a/dlt/common/destination/reference.py +++ b/dlt/common/destination/reference.py @@ -269,6 +269,8 @@ class DestinationClientDwhWithStagingConfiguration(DestinationClientDwhConfigura staging_config: Optional[DestinationClientStagingConfiguration] = None """configuration of the staging, if present, injected at runtime""" + truncate_tables_on_staging_destination_before_load: bool = True + """If dlt should truncate the tables on staging destination before loading data.""" TLoadJobState = Literal["ready", "running", "failed", "retry", "completed"] @@ -578,7 +580,7 @@ def with_staging_dataset(self) -> ContextManager["JobClientBase"]: return self # type: ignore -class SupportsStagingDestination: +class SupportsStagingDestination(ABC): """Adds capability to support a staging destination for the load""" def should_load_data_to_staging_dataset_on_staging_destination( @@ -586,9 +588,9 @@ def should_load_data_to_staging_dataset_on_staging_destination( ) -> bool: return False + @abstractmethod def should_truncate_table_before_load_on_staging_destination(self, table: TTableSchema) -> bool: - # the default is to truncate the tables on the staging destination... - return True + pass # TODO: type Destination properly diff --git a/dlt/destinations/impl/athena/athena.py b/dlt/destinations/impl/athena/athena.py index 0c90d171a3..b28309b930 100644 --- a/dlt/destinations/impl/athena/athena.py +++ b/dlt/destinations/impl/athena/athena.py @@ -531,7 +531,7 @@ def should_truncate_table_before_load_on_staging_destination(self, table: TTable if table["write_disposition"] == "replace" and not self._is_iceberg_table( self.prepare_load_table(table["name"]) ): - return True + return self.config.truncate_tables_on_staging_destination_before_load return False def should_load_data_to_staging_dataset_on_staging_destination( diff --git a/dlt/destinations/impl/bigquery/bigquery.py b/dlt/destinations/impl/bigquery/bigquery.py index 8291415434..11326cf3ed 100644 --- a/dlt/destinations/impl/bigquery/bigquery.py +++ b/dlt/destinations/impl/bigquery/bigquery.py @@ -503,6 +503,9 @@ def _should_autodetect_schema(self, table_name: str) -> bool: self.schema._schema_tables, table_name, AUTODETECT_SCHEMA_HINT, allow_none=True ) or (self.config.autodetect_schema and table_name not in self.schema.dlt_table_names()) + def should_truncate_table_before_load_on_staging_destination(self, table: TTableSchema) -> bool: + return self.config.truncate_tables_on_staging_destination_before_load + def _streaming_load( items: List[Dict[Any, Any]], table: Dict[str, Any], job_client: BigQueryClient diff --git a/dlt/destinations/impl/clickhouse/clickhouse.py b/dlt/destinations/impl/clickhouse/clickhouse.py index 5f17a5a18c..282fbaf338 100644 --- a/dlt/destinations/impl/clickhouse/clickhouse.py +++ b/dlt/destinations/impl/clickhouse/clickhouse.py @@ -372,3 +372,6 @@ def _from_db_type( self, ch_t: str, precision: Optional[int], scale: Optional[int] ) -> TColumnType: return self.type_mapper.from_db_type(ch_t, precision, scale) + + def should_truncate_table_before_load_on_staging_destination(self, table: TTableSchema) -> bool: + return self.config.truncate_tables_on_staging_destination_before_load diff --git a/dlt/destinations/impl/databricks/databricks.py b/dlt/destinations/impl/databricks/databricks.py index 2f23e88ea0..38412b2608 100644 --- a/dlt/destinations/impl/databricks/databricks.py +++ 
b/dlt/destinations/impl/databricks/databricks.py @@ -325,3 +325,6 @@ def _get_storage_table_query_columns(self) -> List[str]: "full_data_type" ) return fields + + def should_truncate_table_before_load_on_staging_destination(self, table: TTableSchema) -> bool: + return self.config.truncate_tables_on_staging_destination_before_load diff --git a/dlt/destinations/impl/dremio/dremio.py b/dlt/destinations/impl/dremio/dremio.py index 68a3fedc31..149d106dcd 100644 --- a/dlt/destinations/impl/dremio/dremio.py +++ b/dlt/destinations/impl/dremio/dremio.py @@ -210,3 +210,6 @@ def _make_add_column_sql( self, new_columns: Sequence[TColumnSchema], table_format: TTableFormat = None ) -> List[str]: return ["ADD COLUMNS (" + ", ".join(self._get_column_def_sql(c) for c in new_columns) + ")"] + + def should_truncate_table_before_load_on_staging_destination(self, table: TTableSchema) -> bool: + return self.config.truncate_tables_on_staging_destination_before_load diff --git a/dlt/destinations/impl/dummy/configuration.py b/dlt/destinations/impl/dummy/configuration.py index 023b88e51a..a066479294 100644 --- a/dlt/destinations/impl/dummy/configuration.py +++ b/dlt/destinations/impl/dummy/configuration.py @@ -34,6 +34,8 @@ class DummyClientConfiguration(DestinationClientConfiguration): """raise terminal exception in job init""" fail_transiently_in_init: bool = False """raise transient exception in job init""" + truncate_tables_on_staging_destination_before_load: bool = True + """truncate tables on staging destination""" # new jobs workflows create_followup_jobs: bool = False diff --git a/dlt/destinations/impl/dummy/dummy.py b/dlt/destinations/impl/dummy/dummy.py index 49b55ec65d..feb09369dc 100644 --- a/dlt/destinations/impl/dummy/dummy.py +++ b/dlt/destinations/impl/dummy/dummy.py @@ -202,6 +202,9 @@ def complete_load(self, load_id: str) -> None: def should_load_data_to_staging_dataset(self, table: TTableSchema) -> bool: return super().should_load_data_to_staging_dataset(table) + def should_truncate_table_before_load_on_staging_destination(self, table: TTableSchema) -> bool: + return self.config.truncate_tables_on_staging_destination_before_load + @contextmanager def with_staging_dataset(self) -> Iterator[JobClientBase]: try: diff --git a/dlt/destinations/impl/redshift/redshift.py b/dlt/destinations/impl/redshift/redshift.py index 93827c8163..0e201dc4e0 100644 --- a/dlt/destinations/impl/redshift/redshift.py +++ b/dlt/destinations/impl/redshift/redshift.py @@ -274,3 +274,6 @@ def _from_db_type( self, pq_t: str, precision: Optional[int], scale: Optional[int] ) -> TColumnType: return self.type_mapper.from_db_type(pq_t, precision, scale) + + def should_truncate_table_before_load_on_staging_destination(self, table: TTableSchema) -> bool: + return self.config.truncate_tables_on_staging_destination_before_load diff --git a/dlt/destinations/impl/snowflake/snowflake.py b/dlt/destinations/impl/snowflake/snowflake.py index 8b4eabc961..6688b5bc17 100644 --- a/dlt/destinations/impl/snowflake/snowflake.py +++ b/dlt/destinations/impl/snowflake/snowflake.py @@ -325,3 +325,6 @@ def _get_column_def_sql(self, c: TColumnSchema, table_format: TTableFormat = Non return ( f"{name} {self.type_mapper.to_db_type(c)} {self._gen_not_null(c.get('nullable', True))}" ) + + def should_truncate_table_before_load_on_staging_destination(self, table: TTableSchema) -> bool: + return self.config.truncate_tables_on_staging_destination_before_load diff --git a/dlt/destinations/impl/synapse/synapse.py b/dlt/destinations/impl/synapse/synapse.py index 
e43e2a6dfa..750a4895f0 100644 --- a/dlt/destinations/impl/synapse/synapse.py +++ b/dlt/destinations/impl/synapse/synapse.py @@ -173,6 +173,9 @@ def create_load_job( ) return job + def should_truncate_table_before_load_on_staging_destination(self, table: TTableSchema) -> bool: + return self.config.truncate_tables_on_staging_destination_before_load + class SynapseCopyFileLoadJob(CopyRemoteFileLoadJob): def __init__( diff --git a/dlt/load/utils.py b/dlt/load/utils.py index 741c01f249..e3a2ebcd79 100644 --- a/dlt/load/utils.py +++ b/dlt/load/utils.py @@ -179,9 +179,10 @@ def _init_dataset_and_update_schema( applied_update = job_client.update_stored_schema( only_tables=update_tables, expected_update=expected_update ) - logger.info( - f"Client for {job_client.config.destination_type} will truncate tables {staging_text}" - ) + if truncate_tables: + logger.info( + f"Client for {job_client.config.destination_type} will truncate tables {staging_text}" + ) job_client.initialize_storage(truncate_tables=truncate_tables) return applied_update diff --git a/docs/website/docs/dlt-ecosystem/staging.md b/docs/website/docs/dlt-ecosystem/staging.md index 05e31a574b..789189b7dd 100644 --- a/docs/website/docs/dlt-ecosystem/staging.md +++ b/docs/website/docs/dlt-ecosystem/staging.md @@ -1,36 +1,33 @@ --- title: Staging -description: Configure an s3 or gcs bucket for staging before copying into the destination +description: Configure an S3 or GCS bucket for staging before copying into the destination keywords: [staging, destination] --- # Staging -The goal of staging is to bring the data closer to the database engine so the modification of the destination (final) dataset happens faster and without errors. `dlt`, when asked, creates two -staging areas: +The goal of staging is to bring the data closer to the database engine so that the modification of the destination (final) dataset happens faster and without errors. `dlt`, when asked, creates two staging areas: 1. A **staging dataset** used by the [merge and replace loads](../general-usage/incremental-loading.md#merge-incremental_loading) to deduplicate and merge data with the destination. -2. A **staging storage** which is typically a s3/gcp bucket where [loader files](file-formats/) are copied before they are loaded by the destination. +2. A **staging storage** which is typically an S3/GCP bucket where [loader files](file-formats/) are copied before they are loaded by the destination. ## Staging dataset -`dlt` creates a staging dataset when write disposition of any of the loaded resources requires it. It creates and migrates required tables exactly like for the -main dataset. Data in staging tables is truncated when load step begins and only for tables that will participate in it. -Such staging dataset has the same name as the dataset passed to `dlt.pipeline` but with `_staging` suffix in the name. Alternatively, you can provide your own staging dataset pattern or use a fixed name, identical for all the -configured datasets. +`dlt` creates a staging dataset when the write disposition of any of the loaded resources requires it. It creates and migrates required tables exactly like for the main dataset. Data in staging tables is truncated when the load step begins and only for tables that will participate in it. +Such a staging dataset has the same name as the dataset passed to `dlt.pipeline` but with a `_staging` suffix in the name. Alternatively, you can provide your own staging dataset pattern or use a fixed name, identical for all the configured datasets. 
```toml [destination.postgres] staging_dataset_name_layout="staging_%s" ``` -Entry above switches the pattern to `staging_` prefix and for example for dataset with name **github_data** `dlt` will create **staging_github_data**. +The entry above switches the pattern to `staging_` prefix and for example, for a dataset with the name **github_data**, `dlt` will create **staging_github_data**. -To configure static staging dataset name, you can do the following (we use destination factory) +To configure a static staging dataset name, you can do the following (we use the destination factory) ```py import dlt dest_ = dlt.destinations.postgres(staging_dataset_name_layout="_dlt_staging") ``` -All pipelines using `dest_` as destination will use **staging_dataset** to store staging tables. Make sure that your pipelines are not overwriting each other's tables. +All pipelines using `dest_` as the destination will use the **staging_dataset** to store staging tables. Make sure that your pipelines are not overwriting each other's tables. -### Cleanup up staging dataset automatically -`dlt` does not truncate tables in staging dataset at the end of the load. Data that is left after contains all the extracted data and may be useful for debugging. +### Cleanup staging dataset automatically +`dlt` does not truncate tables in the staging dataset at the end of the load. Data that is left after contains all the extracted data and may be useful for debugging. If you prefer to truncate it, put the following line in `config.toml`: ```toml @@ -39,19 +36,23 @@ truncate_staging_dataset=true ``` ## Staging storage -`dlt` allows to chain destinations where the first one (`staging`) is responsible for uploading the files from local filesystem to the remote storage. It then generates followup jobs for the second destination that (typically) copy the files from remote storage into destination. +`dlt` allows chaining destinations where the first one (`staging`) is responsible for uploading the files from the local filesystem to the remote storage. It then generates follow-up jobs for the second destination that (typically) copy the files from remote storage into the destination. -Currently, only one destination the [filesystem](destinations/filesystem.md) can be used as a staging. Following destinations can copy remote files: -1. [Redshift.](destinations/redshift.md#staging-support) -2. [Bigquery.](destinations/bigquery.md#staging-support) -3. [Snowflake.](destinations/snowflake.md#staging-support) +Currently, only one destination, the [filesystem](destinations/filesystem.md), can be used as staging. The following destinations can copy remote files: + +1. [Azure Synapse](destinations/synapse#staging-support) +1. [Athena](destinations/athena#staging-support) +1. [Bigquery](destinations/bigquery.md#staging-support) +1. [Dremio](destinations/dremio#staging-support) +1. [Redshift](destinations/redshift.md#staging-support) +1. [Snowflake](destinations/snowflake.md#staging-support) ### How to use -In essence, you need to set up two destinations and then pass them to `dlt.pipeline`. Below we'll use `filesystem` staging with `parquet` files to load into `Redshift` destination. +In essence, you need to set up two destinations and then pass them to `dlt.pipeline`. Below we'll use `filesystem` staging with `parquet` files to load into the `Redshift` destination. -1. **Set up the s3 bucket and filesystem staging.** +1. 
**Set up the S3 bucket and filesystem staging.** - Please follow our guide in [filesystem destination documentation](destinations/filesystem.md). Test the staging as standalone destination to make sure that files go where you want them. In your `secrets.toml` you should now have a working `filesystem` configuration: + Please follow our guide in the [filesystem destination documentation](destinations/filesystem.md). Test the staging as a standalone destination to make sure that files go where you want them. In your `secrets.toml`, you should now have a working `filesystem` configuration: ```toml [destination.filesystem] bucket_url = "s3://[your_bucket_name]" # replace with your bucket name, @@ -63,15 +64,15 @@ In essence, you need to set up two destinations and then pass them to `dlt.pipel 2. **Set up the Redshift destination.** - Please follow our guide in [redshift destination documentation](destinations/redshift.md). In your `secrets.toml` you added: + Please follow our guide in the [redshift destination documentation](destinations/redshift.md). In your `secrets.toml`, you added: ```toml # keep it at the top of your toml file! before any section starts destination.redshift.credentials="redshift://loader:@localhost/dlt_data?connect_timeout=15" ``` -3. **Authorize Redshift cluster to access the staging bucket.** +3. **Authorize the Redshift cluster to access the staging bucket.** - By default `dlt` will forward the credentials configured for `filesystem` to the `Redshift` COPY command. If you are fine with this, move to the next step. + By default, `dlt` will forward the credentials configured for `filesystem` to the `Redshift` COPY command. If you are fine with this, move to the next step. 4. **Chain staging to destination and request `parquet` file format.** @@ -79,7 +80,7 @@ In essence, you need to set up two destinations and then pass them to `dlt.pipel ```py # Create a dlt pipeline that will load # chess player data to the redshift destination - # via staging on s3 + # via staging on S3 pipeline = dlt.pipeline( pipeline_name='chess_pipeline', destination='redshift', @@ -87,7 +88,7 @@ In essence, you need to set up two destinations and then pass them to `dlt.pipel dataset_name='player_data' ) ``` - `dlt` will automatically select an appropriate loader file format for the staging files. Below we explicitly specify `parquet` file format (just to demonstrate how to do it): + `dlt` will automatically select an appropriate loader file format for the staging files. Below we explicitly specify the `parquet` file format (just to demonstrate how to do it): ```py info = pipeline.run(chess(), loader_file_format="parquet") ``` @@ -96,4 +97,21 @@ In essence, you need to set up two destinations and then pass them to `dlt.pipel Run the pipeline script as usual. -> 💡 Please note that `dlt` does not delete loaded files from the staging storage after the load is complete. +:::tip +Please note that `dlt` does not delete loaded files from the staging storage after the load is complete, but it truncates previously loaded files. +::: + +### How to prevent staging files truncation + +Before `dlt` loads data to the staging storage, it truncates previously loaded files. To prevent it and keep the whole history +of loaded files, you can use the following parameter: + +```toml +[destination.redshift] +truncate_table_before_load_on_staging_destination=false +``` + +:::caution +The [Athena](destinations/athena#staging-support) destination only truncates not iceberg tables with `replace` merge_disposition. 
+Therefore, the parameter `truncate_table_before_load_on_staging_destination` only controls the truncation of corresponding files for these tables. +::: diff --git a/tests/load/pipeline/test_stage_loading.py b/tests/load/pipeline/test_stage_loading.py index a760c86526..f216fa3c05 100644 --- a/tests/load/pipeline/test_stage_loading.py +++ b/tests/load/pipeline/test_stage_loading.py @@ -1,12 +1,12 @@ import pytest -from typing import Dict, Any, List +from typing import List import dlt, os -from dlt.common import json, sleep -from copy import deepcopy +from dlt.common import json from dlt.common.storages.configuration import FilesystemConfiguration from dlt.common.utils import uniq_id from dlt.common.schema.typing import TDataType +from dlt.destinations.impl.filesystem.filesystem import FilesystemClient from tests.load.pipeline.test_merge_disposition import github from tests.pipeline.utils import load_table_counts, assert_load_info @@ -40,6 +40,13 @@ def load_modified_issues(): yield from issues +@dlt.resource(table_name="events", write_disposition="append", primary_key="timestamp") +def event_many_load_2(): + with open("tests/normalize/cases/event.event.many_load_2.json", "r", encoding="utf-8") as f: + events = json.load(f) + yield from events + + @pytest.mark.parametrize( "destination_config", destinations_configs(all_staging_configs=True), ids=lambda x: x.name ) @@ -183,6 +190,50 @@ def test_staging_load(destination_config: DestinationTestConfiguration) -> None: assert replace_counts == initial_counts +@pytest.mark.parametrize( + "destination_config", destinations_configs(all_staging_configs=True), ids=lambda x: x.name +) +def test_truncate_staging_dataset(destination_config: DestinationTestConfiguration) -> None: + """This test checks if tables truncation on staging destination done according to the configuration. 
+ + Test loads data to the destination three times: + * with truncation + * without truncation (after this 2 staging files should be left) + * with truncation (after this 1 staging file should be left) + """ + pipeline = destination_config.setup_pipeline( + pipeline_name="test_stage_loading", dataset_name="test_staging_load" + uniq_id() + ) + resource = event_many_load_2() + table_name: str = resource.table_name # type: ignore[assignment] + + # load the data, files stay on the stage after the load + info = pipeline.run(resource) + assert_load_info(info) + + # load the data without truncating of the staging, should see two files on staging + pipeline.destination.config_params["truncate_tables_on_staging_destination_before_load"] = False + info = pipeline.run(resource) + assert_load_info(info) + # check there are two staging files + _, staging_client = pipeline._get_destination_clients(pipeline.default_schema) + with staging_client: + assert len(staging_client.list_table_files(table_name)) == 2 # type: ignore[attr-defined] + + # load the data with truncating, so only new file is on the staging + pipeline.destination.config_params["truncate_tables_on_staging_destination_before_load"] = True + info = pipeline.run(resource) + assert_load_info(info) + # check that table exists in the destination + with pipeline.sql_client() as sql_client: + qual_name = sql_client.make_qualified_table_name + assert len(sql_client.execute_sql(f"SELECT * from {qual_name(table_name)}")) > 4 + # check there is only one staging file + _, staging_client = pipeline._get_destination_clients(pipeline.default_schema) + with staging_client: + assert len(staging_client.list_table_files(table_name)) == 1 # type: ignore[attr-defined] + + @pytest.mark.parametrize( "destination_config", destinations_configs(all_staging_configs=True), ids=lambda x: x.name ) diff --git a/tests/load/test_dummy_client.py b/tests/load/test_dummy_client.py index 9f0bca6ac5..59b7acac15 100644 --- a/tests/load/test_dummy_client.py +++ b/tests/load/test_dummy_client.py @@ -548,6 +548,23 @@ def test_completed_loop_with_delete_completed() -> None: assert_complete_job(load, should_delete_completed=True) +@pytest.mark.parametrize("to_truncate", [True, False]) +def test_truncate_table_before_load_on_stanging(to_truncate) -> None: + load = setup_loader( + client_config=DummyClientConfiguration( + truncate_tables_on_staging_destination_before_load=to_truncate + ) + ) + load_id, schema = prepare_load_package(load.load_storage, NORMALIZED_FILES) + destination_client = load.get_destination_client(schema) + assert ( + destination_client.should_truncate_table_before_load_on_staging_destination( # type: ignore + schema.tables["_dlt_version"] + ) + == to_truncate + ) + + def test_retry_on_new_loop() -> None: # test job that retries sitting in new jobs load = setup_loader(client_config=DummyClientConfiguration(retry_prob=1.0)) From 4e1c6077c7ed4bbaf127e34a2cbc7d87fe48d924 Mon Sep 17 00:00:00 2001 From: rudolfix Date: Wed, 28 Aug 2024 13:17:11 +0200 Subject: [PATCH 09/10] enables external location and named credential in databricks (#1755) * allows to configure external location and named credential for databricks * fixes #1703 * normalizes 'value' when wrapping simple objects in relational, fixes #1754 * simplifies fsspec globbing and allows various url formats that are preserved when reconstituting full url, allows abfss databricks format * adds info on partially loaded packages to docs * renames remote_uri to remote_url in traces * fixes delta for abfss * adds nested 
tables dlt columns collision test --- .github/workflows/test_destinations.yml | 1 + .../configuration/specs/azure_credentials.py | 2 + dlt/common/libs/deltalake.py | 3 +- dlt/common/metrics.py | 2 +- dlt/common/normalizers/json/__init__.py | 4 +- dlt/common/normalizers/json/relational.py | 4 +- dlt/common/storages/configuration.py | 119 +++++++++++++----- dlt/common/storages/fsspec_filesystem.py | 58 +++++---- dlt/destinations/impl/athena/athena.py | 1 - dlt/destinations/impl/bigquery/bigquery.py | 2 +- .../impl/databricks/configuration.py | 4 + .../impl/databricks/databricks.py | 108 ++++++++++------ dlt/destinations/impl/databricks/factory.py | 6 + dlt/destinations/impl/dummy/dummy.py | 4 +- .../impl/filesystem/filesystem.py | 32 ++--- .../dlt-ecosystem/destinations/databricks.md | 33 ++++- .../dlt-ecosystem/destinations/snowflake.md | 2 +- .../docs/running-in-production/running.md | 16 ++- tests/.dlt/config.toml | 3 +- tests/common/cases/normalizers/sql_upper.py | 2 - .../common/storages/test_local_filesystem.py | 10 +- .../test_destination_name_and_config.py | 4 +- .../test_databricks_configuration.py | 50 +++++++- .../load/filesystem/test_filesystem_common.py | 54 +++++--- .../load/pipeline/test_databricks_pipeline.py | 85 +++++++++++++ .../load/pipeline/test_filesystem_pipeline.py | 18 +-- tests/load/pipeline/test_stage_loading.py | 10 +- tests/load/test_dummy_client.py | 10 +- tests/load/utils.py | 12 +- .../cases/contracts/trace.schema.yaml | 2 +- tests/pipeline/test_pipeline.py | 14 +++ tests/pipeline/test_pipeline_trace.py | 2 +- 32 files changed, 510 insertions(+), 167 deletions(-) create mode 100644 tests/load/pipeline/test_databricks_pipeline.py diff --git a/.github/workflows/test_destinations.yml b/.github/workflows/test_destinations.yml index a034ac7eb0..7fae69ff9e 100644 --- a/.github/workflows/test_destinations.yml +++ b/.github/workflows/test_destinations.yml @@ -29,6 +29,7 @@ env: # Test redshift and filesystem with all buckets # postgres runs again here so we can test on mac/windows ACTIVE_DESTINATIONS: "[\"redshift\", \"postgres\", \"duckdb\", \"filesystem\", \"dummy\"]" + # note that all buckets are enabled for testing jobs: get_docs_changes: diff --git a/dlt/common/configuration/specs/azure_credentials.py b/dlt/common/configuration/specs/azure_credentials.py index 7fa34fa00f..6794b581ce 100644 --- a/dlt/common/configuration/specs/azure_credentials.py +++ b/dlt/common/configuration/specs/azure_credentials.py @@ -32,6 +32,8 @@ def to_object_store_rs_credentials(self) -> Dict[str, str]: creds = self.to_adlfs_credentials() if creds["sas_token"] is None: creds.pop("sas_token") + if creds["account_key"] is None: + creds.pop("account_key") return creds def create_sas_token(self) -> None: diff --git a/dlt/common/libs/deltalake.py b/dlt/common/libs/deltalake.py index d4cb46c600..38b23ea27a 100644 --- a/dlt/common/libs/deltalake.py +++ b/dlt/common/libs/deltalake.py @@ -176,7 +176,8 @@ def _deltalake_storage_options(config: FilesystemConfiguration) -> Dict[str, str """Returns dict that can be passed as `storage_options` in `deltalake` library.""" creds = {} extra_options = {} - if config.protocol in ("az", "gs", "s3"): + # TODO: create a mixin with to_object_store_rs_credentials for a proper discovery + if hasattr(config.credentials, "to_object_store_rs_credentials"): creds = config.credentials.to_object_store_rs_credentials() if config.deltalake_storage_options is not None: extra_options = config.deltalake_storage_options diff --git a/dlt/common/metrics.py 
b/dlt/common/metrics.py index 5cccee4045..d6acf19d0d 100644 --- a/dlt/common/metrics.py +++ b/dlt/common/metrics.py @@ -64,7 +64,7 @@ class LoadJobMetrics(NamedTuple): started_at: datetime.datetime finished_at: datetime.datetime state: Optional[str] - remote_uri: Optional[str] + remote_url: Optional[str] class LoadMetrics(StepMetrics): diff --git a/dlt/common/normalizers/json/__init__.py b/dlt/common/normalizers/json/__init__.py index a13bab15f4..725f6a8355 100644 --- a/dlt/common/normalizers/json/__init__.py +++ b/dlt/common/normalizers/json/__init__.py @@ -54,9 +54,9 @@ class SupportsDataItemNormalizer(Protocol): """A class with a name DataItemNormalizer deriving from normalizers.json.DataItemNormalizer""" -def wrap_in_dict(item: Any) -> DictStrAny: +def wrap_in_dict(label: str, item: Any) -> DictStrAny: """Wraps `item` that is not a dictionary into dictionary that can be json normalized""" - return {"value": item} + return {label: item} __all__ = [ diff --git a/dlt/common/normalizers/json/relational.py b/dlt/common/normalizers/json/relational.py index 1dbcec4bff..33184640f0 100644 --- a/dlt/common/normalizers/json/relational.py +++ b/dlt/common/normalizers/json/relational.py @@ -281,7 +281,7 @@ def _normalize_list( else: # list of simple types child_row_hash = DataItemNormalizer._get_child_row_hash(parent_row_id, table, idx) - wrap_v = wrap_in_dict(v) + wrap_v = wrap_in_dict(self.c_value, v) wrap_v[self.c_dlt_id] = child_row_hash e = self._link_row(wrap_v, parent_row_id, idx) DataItemNormalizer._extend_row(extend, e) @@ -387,7 +387,7 @@ def normalize_data_item( ) -> TNormalizedRowIterator: # wrap items that are not dictionaries in dictionary, otherwise they cannot be processed by the JSON normalizer if not isinstance(item, dict): - item = wrap_in_dict(item) + item = wrap_in_dict(self.c_value, item) # we will extend event with all the fields necessary to load it as root row row = cast(DictStrAny, item) # identify load id if loaded data must be processed after loading incrementally diff --git a/dlt/common/storages/configuration.py b/dlt/common/storages/configuration.py index b2bdb3a7b6..04780528c4 100644 --- a/dlt/common/storages/configuration.py +++ b/dlt/common/storages/configuration.py @@ -1,7 +1,7 @@ import os import pathlib from typing import Any, Literal, Optional, Type, get_args, ClassVar, Dict, Union -from urllib.parse import urlparse, unquote +from urllib.parse import urlparse, unquote, urlunparse from dlt.common.configuration import configspec, resolve_type from dlt.common.configuration.exceptions import ConfigurationValueError @@ -52,6 +52,53 @@ class LoadStorageConfiguration(BaseConfiguration): ] +def _make_az_url(scheme: str, fs_path: str, bucket_url: str) -> str: + parsed_bucket_url = urlparse(bucket_url) + if parsed_bucket_url.username: + # az://@.dfs.core.windows.net/ + # fs_path always starts with container + split_path = fs_path.split("/", maxsplit=1) + if len(split_path) == 1: + split_path.append("") + container, path = split_path + netloc = f"{container}@{parsed_bucket_url.hostname}" + return urlunparse(parsed_bucket_url._replace(path=path, scheme=scheme, netloc=netloc)) + return f"{scheme}://{fs_path}" + + +def _make_file_url(scheme: str, fs_path: str, bucket_url: str) -> str: + """Creates a normalized file:// url from a local path + + netloc is never set. 
UNC paths are represented as file://host/path + """ + p_ = pathlib.Path(fs_path) + p_ = p_.expanduser().resolve() + return p_.as_uri() + + +MAKE_URI_DISPATCH = {"az": _make_az_url, "file": _make_file_url} + +MAKE_URI_DISPATCH["adl"] = MAKE_URI_DISPATCH["az"] +MAKE_URI_DISPATCH["abfs"] = MAKE_URI_DISPATCH["az"] +MAKE_URI_DISPATCH["azure"] = MAKE_URI_DISPATCH["az"] +MAKE_URI_DISPATCH["abfss"] = MAKE_URI_DISPATCH["az"] +MAKE_URI_DISPATCH["local"] = MAKE_URI_DISPATCH["file"] + + +def make_fsspec_url(scheme: str, fs_path: str, bucket_url: str) -> str: + """Creates url from `fs_path` and `scheme` using bucket_url as an `url` template + + Args: + scheme (str): scheme of the resulting url + fs_path (str): kind of absolute path that fsspec uses to locate resources for particular filesystem. + bucket_url (str): an url template. the structure of url will be preserved if possible + """ + _maker = MAKE_URI_DISPATCH.get(scheme) + if _maker: + return _maker(scheme, fs_path, bucket_url) + return f"{scheme}://{fs_path}" + + @configspec class FilesystemConfiguration(BaseConfiguration): """A configuration defining filesystem location and access credentials. @@ -59,7 +106,7 @@ class FilesystemConfiguration(BaseConfiguration): When configuration is resolved, `bucket_url` is used to extract a protocol and request corresponding credentials class. * s3 * gs, gcs - * az, abfs, adl + * az, abfs, adl, abfss, azure * file, memory * gdrive """ @@ -72,6 +119,8 @@ class FilesystemConfiguration(BaseConfiguration): "az": AnyAzureCredentials, "abfs": AnyAzureCredentials, "adl": AnyAzureCredentials, + "abfss": AnyAzureCredentials, + "azure": AnyAzureCredentials, } bucket_url: str = None @@ -93,17 +142,21 @@ def protocol(self) -> str: else: return urlparse(self.bucket_url).scheme + @property + def is_local_filesystem(self) -> bool: + return self.protocol == "file" + def on_resolved(self) -> None: - uri = urlparse(self.bucket_url) - if not uri.path and not uri.netloc: + url = urlparse(self.bucket_url) + if not url.path and not url.netloc: raise ConfigurationValueError( "File path and netloc are missing. Field bucket_url of" - " FilesystemClientConfiguration must contain valid uri with a path or host:password" + " FilesystemClientConfiguration must contain valid url with a path or host:password" " component." ) # this is just a path in a local file system if self.is_local_path(self.bucket_url): - self.bucket_url = self.make_file_uri(self.bucket_url) + self.bucket_url = self.make_file_url(self.bucket_url) @resolve_type("credentials") def resolve_credentials_type(self) -> Type[CredentialsConfiguration]: @@ -122,44 +175,50 @@ def fingerprint(self) -> str: if self.is_local_path(self.bucket_url): return digest128("") - uri = urlparse(self.bucket_url) - return digest128(self.bucket_url.replace(uri.path, "")) + url = urlparse(self.bucket_url) + return digest128(self.bucket_url.replace(url.path, "")) + + def make_url(self, fs_path: str) -> str: + """Makes a full url (with scheme) form fs_path which is kind-of absolute path used by fsspec to identify resources. + This method will use `bucket_url` to infer the original form of the url. 
+ """ + return make_fsspec_url(self.protocol, fs_path, self.bucket_url) def __str__(self) -> str: """Return displayable destination location""" - uri = urlparse(self.bucket_url) + url = urlparse(self.bucket_url) # do not show passwords - if uri.password: - new_netloc = f"{uri.username}:****@{uri.hostname}" - if uri.port: - new_netloc += f":{uri.port}" - return uri._replace(netloc=new_netloc).geturl() + if url.password: + new_netloc = f"{url.username}:****@{url.hostname}" + if url.port: + new_netloc += f":{url.port}" + return url._replace(netloc=new_netloc).geturl() return self.bucket_url @staticmethod - def is_local_path(uri: str) -> bool: - """Checks if `uri` is a local path, without a schema""" - uri_parsed = urlparse(uri) + def is_local_path(url: str) -> bool: + """Checks if `url` is a local path, without a schema""" + url_parsed = urlparse(url) # this prevents windows absolute paths to be recognized as schemas - return not uri_parsed.scheme or os.path.isabs(uri) + return not url_parsed.scheme or os.path.isabs(url) @staticmethod - def make_local_path(file_uri: str) -> str: + def make_local_path(file_url: str) -> str: """Gets a valid local filesystem path from file:// scheme. Supports POSIX/Windows/UNC paths Returns: str: local filesystem path """ - uri = urlparse(file_uri) - if uri.scheme != "file": - raise ValueError(f"Must be file scheme but is {uri.scheme}") - if not uri.path and not uri.netloc: + url = urlparse(file_url) + if url.scheme != "file": + raise ValueError(f"Must be file scheme but is {url.scheme}") + if not url.path and not url.netloc: raise ConfigurationValueError("File path and netloc are missing.") - local_path = unquote(uri.path) - if uri.netloc: + local_path = unquote(url.path) + if url.netloc: # or UNC file://localhost/path - local_path = "//" + unquote(uri.netloc) + local_path + local_path = "//" + unquote(url.netloc) + local_path else: # if we are on windows, strip the POSIX root from path which is always absolute if os.path.sep != local_path[0]: @@ -172,11 +231,9 @@ def make_local_path(file_uri: str) -> str: return str(pathlib.Path(local_path)) @staticmethod - def make_file_uri(local_path: str) -> str: - """Creates a normalized file:// uri from a local path + def make_file_url(local_path: str) -> str: + """Creates a normalized file:// url from a local path netloc is never set. 
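A standalone sketch of the `file://` normalization that `_make_file_url` / `make_file_url` perform, using only `pathlib`; the example paths and printed URLs are illustrative.

```py
# Sketch of the file:// normalization: expand the user directory, resolve to an
# absolute path, then emit a file URL. Example values assume a POSIX filesystem.
import pathlib


def make_file_url(local_path: str) -> str:
    p_ = pathlib.Path(local_path).expanduser().resolve()
    return p_.as_uri()


print(make_file_url("~/data"))          # e.g. file:///home/user/data
print(make_file_url("_storage/files"))  # relative paths resolve against the CWD
```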
UNC paths are represented as file://host/path """ - p_ = pathlib.Path(local_path) - p_ = p_.expanduser().resolve() - return p_.as_uri() + return make_fsspec_url("file", local_path, None) diff --git a/dlt/common/storages/fsspec_filesystem.py b/dlt/common/storages/fsspec_filesystem.py index be9ae2bbb1..7da5ebabef 100644 --- a/dlt/common/storages/fsspec_filesystem.py +++ b/dlt/common/storages/fsspec_filesystem.py @@ -21,7 +21,7 @@ ) from urllib.parse import urlparse -from fsspec import AbstractFileSystem, register_implementation +from fsspec import AbstractFileSystem, register_implementation, get_filesystem_class from fsspec.core import url_to_fs from dlt import version @@ -32,7 +32,11 @@ AzureCredentials, ) from dlt.common.exceptions import MissingDependencyException -from dlt.common.storages.configuration import FileSystemCredentials, FilesystemConfiguration +from dlt.common.storages.configuration import ( + FileSystemCredentials, + FilesystemConfiguration, + make_fsspec_url, +) from dlt.common.time import ensure_pendulum_datetime from dlt.common.typing import DictStrAny @@ -65,18 +69,20 @@ class FileItem(TypedDict, total=False): MTIME_DISPATCH["gs"] = MTIME_DISPATCH["gcs"] MTIME_DISPATCH["s3a"] = MTIME_DISPATCH["s3"] MTIME_DISPATCH["abfs"] = MTIME_DISPATCH["az"] +MTIME_DISPATCH["abfss"] = MTIME_DISPATCH["az"] # Map of protocol to a filesystem type CREDENTIALS_DISPATCH: Dict[str, Callable[[FilesystemConfiguration], DictStrAny]] = { "s3": lambda config: cast(AwsCredentials, config.credentials).to_s3fs_credentials(), - "adl": lambda config: cast(AzureCredentials, config.credentials).to_adlfs_credentials(), "az": lambda config: cast(AzureCredentials, config.credentials).to_adlfs_credentials(), - "gcs": lambda config: cast(GcpCredentials, config.credentials).to_gcs_credentials(), "gs": lambda config: cast(GcpCredentials, config.credentials).to_gcs_credentials(), "gdrive": lambda config: {"credentials": cast(GcpCredentials, config.credentials)}, - "abfs": lambda config: cast(AzureCredentials, config.credentials).to_adlfs_credentials(), - "azure": lambda config: cast(AzureCredentials, config.credentials).to_adlfs_credentials(), } +CREDENTIALS_DISPATCH["adl"] = CREDENTIALS_DISPATCH["az"] +CREDENTIALS_DISPATCH["abfs"] = CREDENTIALS_DISPATCH["az"] +CREDENTIALS_DISPATCH["azure"] = CREDENTIALS_DISPATCH["az"] +CREDENTIALS_DISPATCH["abfss"] = CREDENTIALS_DISPATCH["az"] +CREDENTIALS_DISPATCH["gcs"] = CREDENTIALS_DISPATCH["gs"] def fsspec_filesystem( @@ -90,7 +96,7 @@ def fsspec_filesystem( Please supply credentials instance corresponding to the protocol. 
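For context, `fsspec_from_config` ultimately delegates to fsspec's `url_to_fs`, which resolves the scheme to a filesystem class and returns the instance together with the stripped path. A hedged illustration with the dependency-free `memory` protocol; the paths shown are examples only.

```py
# Illustration of what url_to_fs returns, using the "memory" protocol so no
# cloud credentials or extras are required.
from fsspec.core import url_to_fs

fs, fs_path = url_to_fs("memory:///m/standard_source")
fs.makedirs(fs_path, exist_ok=True)
with fs.open(f"{fs_path}/sample.txt", "wb") as f:
    f.write(b"hello")

print(type(fs).__name__, fs_path)    # e.g. MemoryFileSystem /m/standard_source
print(fs.ls(fs_path, detail=False))  # the file we just wrote
```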
The `protocol` is just the code name of the filesystem i.e.: * s3 - * az, abfs + * az, abfs, abfss, adl, azure * gcs, gs also see filesystem_from_config @@ -136,7 +142,7 @@ def fsspec_from_config(config: FilesystemConfiguration) -> Tuple[AbstractFileSys Authenticates following filesystems: * s3 - * az, abfs + * az, abfs, abfss, adl, azure * gcs, gs All other filesystems are not authenticated @@ -146,8 +152,14 @@ def fsspec_from_config(config: FilesystemConfiguration) -> Tuple[AbstractFileSys fs_kwargs = prepare_fsspec_args(config) try: + # first get the class to check the protocol + fs_cls = get_filesystem_class(config.protocol) + if fs_cls.protocol == "abfs": + # if storage account is present in bucket_url and in credentials, az fsspec will fail + if urlparse(config.bucket_url).username: + fs_kwargs.pop("account_name") return url_to_fs(config.bucket_url, **fs_kwargs) # type: ignore - except ModuleNotFoundError as e: + except ImportError as e: raise MissingDependencyException( "filesystem", [f"{version.DLT_PKG_NAME}[{config.protocol}]"] ) from e @@ -291,10 +303,8 @@ def glob_files( """ is_local_fs = "file" in fs_client.protocol if is_local_fs and FilesystemConfiguration.is_local_path(bucket_url): - bucket_url = FilesystemConfiguration.make_file_uri(bucket_url) - bucket_url_parsed = urlparse(bucket_url) - else: - bucket_url_parsed = urlparse(bucket_url) + bucket_url = FilesystemConfiguration.make_file_url(bucket_url) + bucket_url_parsed = urlparse(bucket_url) if is_local_fs: root_dir = FilesystemConfiguration.make_local_path(bucket_url) @@ -302,7 +312,8 @@ def glob_files( files = glob.glob(str(pathlib.Path(root_dir).joinpath(file_glob)), recursive=True) glob_result = {file: fs_client.info(file) for file in files} else: - root_dir = bucket_url_parsed._replace(scheme="", query="").geturl().lstrip("/") + # convert to fs_path + root_dir = fs_client._strip_protocol(bucket_url) filter_url = posixpath.join(root_dir, file_glob) glob_result = fs_client.glob(filter_url, detail=True) if isinstance(glob_result, list): @@ -314,20 +325,23 @@ def glob_files( for file, md in glob_result.items(): if md["type"] != "file": continue + scheme = bucket_url_parsed.scheme + # relative paths are always POSIX if is_local_fs: - rel_path = pathlib.Path(file).relative_to(root_dir).as_posix() - file_url = FilesystemConfiguration.make_file_uri(file) + # use OS pathlib for local paths + loc_path = pathlib.Path(file) + file_name = loc_path.name + rel_path = loc_path.relative_to(root_dir).as_posix() + file_url = FilesystemConfiguration.make_file_url(file) else: - rel_path = posixpath.relpath(file.lstrip("/"), root_dir) - file_url = bucket_url_parsed._replace( - path=posixpath.join(bucket_url_parsed.path, rel_path) - ).geturl() + file_name = posixpath.basename(file) + rel_path = posixpath.relpath(file, root_dir) + file_url = make_fsspec_url(scheme, file, bucket_url) - scheme = bucket_url_parsed.scheme mime_type, encoding = guess_mime_type(rel_path) yield FileItem( - file_name=posixpath.basename(rel_path), + file_name=file_name, relative_path=rel_path, file_url=file_url, mime_type=mime_type, diff --git a/dlt/destinations/impl/athena/athena.py b/dlt/destinations/impl/athena/athena.py index b28309b930..b3b2fbcf0f 100644 --- a/dlt/destinations/impl/athena/athena.py +++ b/dlt/destinations/impl/athena/athena.py @@ -34,7 +34,6 @@ from dlt.common import logger from dlt.common.exceptions import TerminalValueError -from dlt.common.storages.fsspec_filesystem import fsspec_from_config from dlt.common.utils import uniq_id, without_none 
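The glob rework above stops hand-parsing the bucket URL and instead lets the filesystem class strip its own protocol, then reconstitutes full URLs with `make_fsspec_url`. A small sketch of that round trip using fsspec's registry; `_strip_protocol` is an fsspec classmethod and its exact output may vary slightly between fsspec versions.

```py
# Sketch of the strip/reconstitute round trip; the az/abfss handling in dlt
# additionally preserves the netloc template from bucket_url.
from urllib.parse import urlparse

from fsspec import get_filesystem_class

bucket_url = "memory:///m/standard_source/samples"
scheme = urlparse(bucket_url).scheme           # "memory"
fs_cls = get_filesystem_class(scheme)
fs_path = fs_cls._strip_protocol(bucket_url)   # typically "/m/standard_source/samples"

# naive reconstitution; schemes that carry a netloc (e.g. abfss) need the
# bucket_url-aware make_fsspec_url above instead
print(f"{scheme}://{fs_path}")
```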
from dlt.common.schema import TColumnSchema, Schema, TTableSchema from dlt.common.schema.typing import ( diff --git a/dlt/destinations/impl/bigquery/bigquery.py b/dlt/destinations/impl/bigquery/bigquery.py index 11326cf3ed..1dd4c727be 100644 --- a/dlt/destinations/impl/bigquery/bigquery.py +++ b/dlt/destinations/impl/bigquery/bigquery.py @@ -432,7 +432,7 @@ def _create_load_job(self, table: TTableSchema, file_path: str) -> bigquery.Load # append to table for merge loads (append to stage) and regular appends. table_name = table["name"] - # determine whether we load from local or uri + # determine whether we load from local or url bucket_path = None ext: str = os.path.splitext(file_path)[1][1:] if ReferenceFollowupJobRequest.is_reference_job(file_path): diff --git a/dlt/destinations/impl/databricks/configuration.py b/dlt/destinations/impl/databricks/configuration.py index 3bd2d12a5a..789dbedae9 100644 --- a/dlt/destinations/impl/databricks/configuration.py +++ b/dlt/destinations/impl/databricks/configuration.py @@ -43,6 +43,10 @@ def to_connector_params(self) -> Dict[str, Any]: class DatabricksClientConfiguration(DestinationClientDwhWithStagingConfiguration): destination_type: Final[str] = dataclasses.field(default="databricks", init=False, repr=False, compare=False) # type: ignore[misc] credentials: DatabricksCredentials = None + staging_credentials_name: Optional[str] = None + "If set, credentials with given name will be used in copy command" + is_staging_external_location: bool = False + """If true, the temporary credentials are not propagated to the COPY command""" def __str__(self) -> str: """Return displayable destination location""" diff --git a/dlt/destinations/impl/databricks/databricks.py b/dlt/destinations/impl/databricks/databricks.py index 38412b2608..614e6e97c5 100644 --- a/dlt/destinations/impl/databricks/databricks.py +++ b/dlt/destinations/impl/databricks/databricks.py @@ -1,4 +1,4 @@ -from typing import ClassVar, Dict, Optional, Sequence, Tuple, List, Any, Iterable, Type, cast +from typing import Optional, Sequence, List, cast from urllib.parse import urlparse, urlunparse from dlt import config @@ -6,20 +6,17 @@ from dlt.common.destination.reference import ( HasFollowupJobs, FollowupJobRequest, - TLoadJobState, RunnableLoadJob, - CredentialsConfiguration, SupportsStagingDestination, LoadJob, ) from dlt.common.configuration.specs import ( AwsCredentialsWithoutDefaults, - AzureCredentials, AzureCredentialsWithoutDefaults, ) from dlt.common.exceptions import TerminalValueError from dlt.common.storages.file_storage import FileStorage -from dlt.common.schema import TColumnSchema, Schema, TTableSchemaColumns +from dlt.common.schema import TColumnSchema, Schema from dlt.common.schema.typing import TTableSchema, TColumnType, TSchemaTables, TTableFormat from dlt.common.schema.utils import table_schema_has_type from dlt.common.storages import FilesystemConfiguration, fsspec_from_config @@ -35,6 +32,9 @@ from dlt.destinations.type_mapping import TypeMapper +AZURE_BLOB_STORAGE_PROTOCOLS = ["az", "abfss", "abfs"] + + class DatabricksTypeMapper(TypeMapper): sct_to_unbound_dbt = { "complex": "STRING", # Databricks supports complex types like ARRAY @@ -137,41 +137,51 @@ def run(self) -> None: if bucket_path: bucket_url = urlparse(bucket_path) bucket_scheme = bucket_url.scheme - # referencing an staged files via a bucket URL requires explicit AWS credentials - if bucket_scheme == "s3" and isinstance( - staging_credentials, AwsCredentialsWithoutDefaults - ): - s3_creds = 
staging_credentials.to_session_credentials() - credentials_clause = f"""WITH(CREDENTIAL( - AWS_ACCESS_KEY='{s3_creds["aws_access_key_id"]}', - AWS_SECRET_KEY='{s3_creds["aws_secret_access_key"]}', - - AWS_SESSION_TOKEN='{s3_creds["aws_session_token"]}' - )) - """ - from_clause = f"FROM '{bucket_path}'" - elif bucket_scheme in ["az", "abfs"] and isinstance( - staging_credentials, AzureCredentialsWithoutDefaults - ): - # Explicit azure credentials are needed to load from bucket without a named stage - credentials_clause = f"""WITH(CREDENTIAL(AZURE_SAS_TOKEN='{staging_credentials.azure_storage_sas_token}'))""" - # Converts an az:/// to abfss://@.dfs.core.windows.net/ - # as required by snowflake - _path = bucket_url.path - bucket_path = urlunparse( - bucket_url._replace( - scheme="abfss", - netloc=f"{bucket_url.netloc}@{staging_credentials.azure_storage_account_name}.dfs.core.windows.net", - path=_path, - ) - ) - from_clause = f"FROM '{bucket_path}'" - else: + + if bucket_scheme not in AZURE_BLOB_STORAGE_PROTOCOLS + ["s3"]: raise LoadJobTerminalException( self._file_path, f"Databricks cannot load data from staging bucket {bucket_path}. Only s3 and" " azure buckets are supported", ) + + if self._job_client.config.is_staging_external_location: + # just skip the credentials clause for external location + # https://docs.databricks.com/en/sql/language-manual/sql-ref-external-locations.html#external-location + pass + elif self._job_client.config.staging_credentials_name: + # add named credentials + credentials_clause = ( + f"WITH(CREDENTIAL {self._job_client.config.staging_credentials_name} )" + ) + else: + # referencing an staged files via a bucket URL requires explicit AWS credentials + if bucket_scheme == "s3": + assert isinstance(staging_credentials, AwsCredentialsWithoutDefaults) + s3_creds = staging_credentials.to_session_credentials() + credentials_clause = f"""WITH(CREDENTIAL( + AWS_ACCESS_KEY='{s3_creds["aws_access_key_id"]}', + AWS_SECRET_KEY='{s3_creds["aws_secret_access_key"]}', + + AWS_SESSION_TOKEN='{s3_creds["aws_session_token"]}' + )) + """ + elif bucket_scheme in AZURE_BLOB_STORAGE_PROTOCOLS: + assert isinstance(staging_credentials, AzureCredentialsWithoutDefaults) + # Explicit azure credentials are needed to load from bucket without a named stage + credentials_clause = f"""WITH(CREDENTIAL(AZURE_SAS_TOKEN='{staging_credentials.azure_storage_sas_token}'))""" + bucket_path = self.ensure_databricks_abfss_url( + bucket_path, staging_credentials.azure_storage_account_name + ) + + if bucket_scheme in AZURE_BLOB_STORAGE_PROTOCOLS: + assert isinstance(staging_credentials, AzureCredentialsWithoutDefaults) + bucket_path = self.ensure_databricks_abfss_url( + bucket_path, staging_credentials.azure_storage_account_name + ) + + # always add FROM clause + from_clause = f"FROM '{bucket_path}'" else: raise LoadJobTerminalException( self._file_path, @@ -231,6 +241,34 @@ def run(self) -> None: """ self._sql_client.execute_sql(statement) + @staticmethod + def ensure_databricks_abfss_url( + bucket_path: str, azure_storage_account_name: str = None + ) -> str: + bucket_url = urlparse(bucket_path) + # Converts an az:/// to abfss://@.dfs.core.windows.net/ + if bucket_url.username: + # has the right form, ensure abfss schema + return urlunparse(bucket_url._replace(scheme="abfss")) + + if not azure_storage_account_name: + raise TerminalValueError( + f"Could not convert azure blob storage url {bucket_path} into form required by" + " Databricks" + " (abfss://@.dfs.core.windows.net/)" + " because storage 
account name is not known. Please use Databricks abfss://" + " canonical url as bucket_url in staging credentials" + ) + # as required by databricks + _path = bucket_url.path + return urlunparse( + bucket_url._replace( + scheme="abfss", + netloc=f"{bucket_url.netloc}@{azure_storage_account_name}.dfs.core.windows.net", + path=_path, + ) + ) + class DatabricksMergeJob(SqlMergeFollowupJob): @classmethod diff --git a/dlt/destinations/impl/databricks/factory.py b/dlt/destinations/impl/databricks/factory.py index 409d3bc4be..6108b69da9 100644 --- a/dlt/destinations/impl/databricks/factory.py +++ b/dlt/destinations/impl/databricks/factory.py @@ -54,6 +54,8 @@ def client_class(self) -> t.Type["DatabricksClient"]: def __init__( self, credentials: t.Union[DatabricksCredentials, t.Dict[str, t.Any], str] = None, + is_staging_external_location: t.Optional[bool] = False, + staging_credentials_name: t.Optional[str] = None, destination_name: t.Optional[str] = None, environment: t.Optional[str] = None, **kwargs: t.Any, @@ -65,10 +67,14 @@ def __init__( Args: credentials: Credentials to connect to the databricks database. Can be an instance of `DatabricksCredentials` or a connection string in the format `databricks://user:password@host:port/database` + is_staging_external_location: If true, the temporary credentials are not propagated to the COPY command + staging_credentials_name: If set, credentials with given name will be used in copy command **kwargs: Additional arguments passed to the destination config """ super().__init__( credentials=credentials, + is_staging_external_location=is_staging_external_location, + staging_credentials_name=staging_credentials_name, destination_name=destination_name, environment=environment, **kwargs, diff --git a/dlt/destinations/impl/dummy/dummy.py b/dlt/destinations/impl/dummy/dummy.py index feb09369dc..fc87faaf5a 100644 --- a/dlt/destinations/impl/dummy/dummy.py +++ b/dlt/destinations/impl/dummy/dummy.py @@ -90,9 +90,9 @@ def run(self) -> None: def metrics(self) -> Optional[LoadJobMetrics]: m = super().metrics() - # add remote uri if there's followup job + # add remote url if there's followup job if self.config.create_followup_jobs: - m = m._replace(remote_uri=self._file_name) + m = m._replace(remote_url=self._file_name) return m diff --git a/dlt/destinations/impl/filesystem/filesystem.py b/dlt/destinations/impl/filesystem/filesystem.py index 62263a10b9..ac5ffb9ef3 100644 --- a/dlt/destinations/impl/filesystem/filesystem.py +++ b/dlt/destinations/impl/filesystem/filesystem.py @@ -56,7 +56,7 @@ def __init__( self._job_client: FilesystemClient = None def run(self) -> None: - self.__is_local_filesystem = self._job_client.config.protocol == "file" + self.__is_local_filesystem = self._job_client.config.is_local_filesystem # We would like to avoid failing for local filesystem where # deeply nested directory will not exist before writing a file. 
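To make the conversion above concrete, here is a standalone sketch of the same `az://` to `abfss://` rewrite using only `urllib.parse`; `my_account` is a placeholder storage account name and `to_abfss` is just an illustrative helper, not part of dlt.

```py
# Standalone sketch of the az:// -> abfss:// rewrite shown above.
from urllib.parse import urlparse, urlunparse


def to_abfss(bucket_path: str, storage_account_name: str) -> str:
    url = urlparse(bucket_path)
    if url.username:  # already abfss://container@account.dfs.core.windows.net/path
        return urlunparse(url._replace(scheme="abfss"))
    netloc = f"{url.netloc}@{storage_account_name}.dfs.core.windows.net"
    return urlunparse(url._replace(scheme="abfss", netloc=netloc))


print(to_abfss("az://dlt-ci-test-bucket/path/to/file.parquet", "my_account"))
# abfss://dlt-ci-test-bucket@my_account.dfs.core.windows.net/path/to/file.parquet
```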
# It `auto_mkdir` is disabled by default in fsspec so we made some @@ -88,13 +88,13 @@ def make_remote_path(self) -> str: path_utils.normalize_path_sep(pathlib, destination_file_name), ) - def make_remote_uri(self) -> str: - """Returns path on a remote filesystem as a full uri including scheme.""" - return self._job_client.make_remote_uri(self.make_remote_path()) + def make_remote_url(self) -> str: + """Returns path on a remote filesystem as a full url including scheme.""" + return self._job_client.make_remote_url(self.make_remote_path()) def metrics(self) -> Optional[LoadJobMetrics]: m = super().metrics() - return m._replace(remote_uri=self.make_remote_uri()) + return m._replace(remote_url=self.make_remote_url()) class DeltaLoadFilesystemJob(FilesystemLoadJob): @@ -112,7 +112,7 @@ def make_remote_path(self) -> str: return self._job_client.get_table_dir(self.load_table_name) def run(self) -> None: - logger.info(f"Will copy file(s) {self.file_paths} to delta table {self.make_remote_uri()}") + logger.info(f"Will copy file(s) {self.file_paths} to delta table {self.make_remote_url()}") from dlt.common.libs.deltalake import write_delta_table, merge_delta_table @@ -133,7 +133,7 @@ def run(self) -> None: else: write_delta_table( table_or_uri=( - self.make_remote_uri() if self._delta_table is None else self._delta_table + self.make_remote_url() if self._delta_table is None else self._delta_table ), data=arrow_rbr, write_disposition=self._load_table["write_disposition"], @@ -151,7 +151,7 @@ def _storage_options(self) -> Dict[str, str]: def _delta_table(self) -> Optional["DeltaTable"]: # type: ignore[name-defined] # noqa: F821 from dlt.common.libs.deltalake import try_get_deltatable - return try_get_deltatable(self.make_remote_uri(), storage_options=self._storage_options) + return try_get_deltatable(self.make_remote_url(), storage_options=self._storage_options) @property def _partition_columns(self) -> List[str]: @@ -166,7 +166,7 @@ def _create_or_evolve_delta_table(self) -> None: if self._delta_table is None: DeltaTable.create( - table_uri=self.make_remote_uri(), + table_uri=self.make_remote_url(), schema=ensure_delta_compatible_arrow_schema(self.arrow_ds.schema), mode="overwrite", partition_by=self._partition_columns, @@ -185,7 +185,7 @@ def create_followup_jobs(self, final_state: TLoadJobState) -> List[FollowupJobRe elif final_state == "completed": ref_job = ReferenceFollowupJobRequest( original_file_name=self.file_name(), - remote_paths=[self._job_client.make_remote_uri(self.make_remote_path())], + remote_paths=[self._job_client.make_remote_url(self.make_remote_path())], ) jobs.append(ref_job) return jobs @@ -208,7 +208,7 @@ def __init__( ) -> None: super().__init__(schema, config, capabilities) self.fs_client, fs_path = fsspec_from_config(config) - self.is_local_filesystem = config.protocol == "file" + self.is_local_filesystem = config.is_local_filesystem self.bucket_path = ( config.make_local_path(config.bucket_url) if self.is_local_filesystem else fs_path ) @@ -319,7 +319,7 @@ def get_table_dir(self, table_name: str, remote: bool = False) -> str: table_prefix = self.get_table_prefix(table_name) table_dir: str = self.pathlib.dirname(table_prefix) if remote: - table_dir = self.make_remote_uri(table_dir) + table_dir = self.make_remote_url(table_dir) return table_dir def get_table_prefix(self, table_name: str) -> str: @@ -353,7 +353,7 @@ def list_files_with_prefixes(self, table_dir: str, prefixes: List[str]) -> List[ # we fallback to our own glob implementation that is tested to return consistent 
results for # filesystems we support. we were not able to use `find` or `walk` because they were selecting # files wrongly (on azure walk on path1/path2/ would also select files from path1/path2_v2/ but returning wrong dirs) - for details in glob_files(self.fs_client, self.make_remote_uri(table_dir), "**"): + for details in glob_files(self.fs_client, self.make_remote_url(table_dir), "**"): file = details["file_name"] filepath = self.pathlib.join(table_dir, details["relative_path"]) # skip INIT files @@ -388,12 +388,12 @@ def create_load_job( cls = FilesystemLoadJobWithFollowup if self.config.as_staging else FilesystemLoadJob return cls(file_path) - def make_remote_uri(self, remote_path: str) -> str: + def make_remote_url(self, remote_path: str) -> str: """Returns uri to the remote filesystem to which copy the file""" if self.is_local_filesystem: - return self.config.make_file_uri(remote_path) + return self.config.make_file_url(remote_path) else: - return f"{self.config.protocol}://{remote_path}" + return self.config.make_url(remote_path) def __enter__(self) -> "FilesystemClient": return self diff --git a/docs/website/docs/dlt-ecosystem/destinations/databricks.md b/docs/website/docs/dlt-ecosystem/destinations/databricks.md index 6cd5767dcb..ddb82c95b2 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/databricks.md +++ b/docs/website/docs/dlt-ecosystem/destinations/databricks.md @@ -117,6 +117,8 @@ access_token = "MY_ACCESS_TOKEN" catalog = "my_catalog" ``` +See [staging support](#staging-support) for authentication options when `dlt` copies files from buckets. + ## Write disposition All write dispositions are supported @@ -166,6 +168,11 @@ pipeline = dlt.pipeline( Refer to the [Azure Blob Storage filesystem documentation](./filesystem.md#azure-blob-storage) for details on connecting your Azure Blob Storage container with the bucket_url and credentials. +Databricks requires that you use ABFS urls in following format: +**abfss://container_name@storage_account_name.dfs.core.windows.net/path** + +`dlt` is able to adapt the other representation (ie **az://container-name/path**') still we recommend that you use the correct form. + Example to set up Databricks with Azure as a staging destination: ```py @@ -175,10 +182,34 @@ Example to set up Databricks with Azure as a staging destination: pipeline = dlt.pipeline( pipeline_name='chess_pipeline', destination='databricks', - staging=dlt.destinations.filesystem('az://your-container-name'), # add this to activate the staging location + staging=dlt.destinations.filesystem('abfss://dlt-ci-data@dltdata.dfs.core.windows.net'), # add this to activate the staging location dataset_name='player_data' ) + ``` + +### Use external locations and stored credentials +`dlt` forwards bucket credentials to `COPY INTO` SQL command by default. You may prefer to use [external locations or stored credentials instead](https://docs.databricks.com/en/sql/language-manual/sql-ref-external-locations.html#external-location) that are stored on the Databricks side. + +If you set up external location for your staging path, you can tell `dlt` to use it: +```toml +[destination.databricks] +is_staging_external_location=true +``` + +If you set up Databricks credential named ie. 
**credential_x**, you can tell `dlt` to use it: +```toml +[destination.databricks] +staging_credentials_name="credential_x" +``` + +Both options are available from code: +```py +import dlt + +bricks = dlt.destinations.databricks(staging_credentials_name="credential_x") +``` + ### dbt support This destination [integrates with dbt](../transformations/dbt/dbt.md) via [dbt-databricks](https://github.com/databricks/dbt-databricks) diff --git a/docs/website/docs/dlt-ecosystem/destinations/snowflake.md b/docs/website/docs/dlt-ecosystem/destinations/snowflake.md index d08578c5a2..57e6db311d 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/snowflake.md +++ b/docs/website/docs/dlt-ecosystem/destinations/snowflake.md @@ -176,7 +176,7 @@ Note that we ignore missing columns `ERROR_ON_COLUMN_COUNT_MISMATCH = FALSE` and Snowflake supports the following [column hints](https://dlthub.com/docs/general-usage/schema#tables-and-columns): * `cluster` - creates a cluster column(s). Many columns per table are supported and only when a new table is created. -### Table and column identifiers +## Table and column identifiers Snowflake supports both case sensitive and case insensitive identifiers. All unquoted and uppercase identifiers resolve case-insensitively in SQL statements. Case insensitive [naming conventions](../../general-usage/naming-convention.md#case-sensitive-and-insensitive-destinations) like the default **snake_case** will generate case insensitive identifiers. Case sensitive (like **sql_cs_v1**) will generate case sensitive identifiers that must be quoted in SQL statements. diff --git a/docs/website/docs/running-in-production/running.md b/docs/website/docs/running-in-production/running.md index 3b5762612c..cc089a1393 100644 --- a/docs/website/docs/running-in-production/running.md +++ b/docs/website/docs/running-in-production/running.md @@ -271,7 +271,7 @@ load_info.raise_on_failed_jobs() ``` You may also abort the load package with `LoadClientJobFailed` (terminal exception) on a first -failed job. Such package is immediately moved to completed but its load id is not added to the +failed job. Such package is will be completed but its load id is not added to the `_dlt_loads` table. All the jobs that were running in parallel are completed before raising. The dlt state, if present, will not be visible to `dlt`. Here's example `config.toml` to enable this option: @@ -282,6 +282,20 @@ load.workers=1 load.raise_on_failed_jobs=true ``` +:::caution +Note that certain write dispositions will irreversibly modify your data +1. `replace` write disposition with the default `truncate-and-insert` [strategy](../general-usage/full-loading.md) will truncate tables before loading. +2. `merge` write disposition will merge staging dataset tables into the destination dataset. This will happen only when all data for this table (and nested tables) got loaded. + +Here's what you can do to deal with partially loaded packages: +1. Retry the load step in case of transient errors +2. Use replace strategy with staging dataset so replace happens only when data for the table (and all nested tables) was fully loaded and is atomic operation (if possible) +3. Use only "append" write disposition. When your load package fails you are able to use `_dlt_load_id` to remove all unprocessed data. +4. Use "staging append" (`merge` disposition without primary key and merge key defined). 
+ +::: + + ### What `run` does inside Before adding retry to pipeline steps, note how `run` method actually works: diff --git a/tests/.dlt/config.toml b/tests/.dlt/config.toml index ba86edf417..292175569b 100644 --- a/tests/.dlt/config.toml +++ b/tests/.dlt/config.toml @@ -6,7 +6,8 @@ bucket_url_gs="gs://ci-test-bucket" bucket_url_s3="s3://dlt-ci-test-bucket" bucket_url_file="_storage" bucket_url_az="az://dlt-ci-test-bucket" +bucket_url_abfss="abfss://dlt-ci-test-bucket@dltdata.dfs.core.windows.net" bucket_url_r2="s3://dlt-ci-test-bucket" # use "/" as root path bucket_url_gdrive="gdrive://15eC3e5MNew2XAIefWNlG8VlEa0ISnnaG" -memory="memory://m" \ No newline at end of file +memory="memory:///m" \ No newline at end of file diff --git a/tests/common/cases/normalizers/sql_upper.py b/tests/common/cases/normalizers/sql_upper.py index f2175f06ad..eb88775f95 100644 --- a/tests/common/cases/normalizers/sql_upper.py +++ b/tests/common/cases/normalizers/sql_upper.py @@ -1,5 +1,3 @@ -from typing import Any, Sequence - from dlt.common.normalizers.naming.naming import NamingConvention as BaseNamingConvention diff --git a/tests/common/storages/test_local_filesystem.py b/tests/common/storages/test_local_filesystem.py index 14e3cc23d4..1bfe6c0b5b 100644 --- a/tests/common/storages/test_local_filesystem.py +++ b/tests/common/storages/test_local_filesystem.py @@ -45,7 +45,7 @@ ) def test_local_path_win_configuration(bucket_url: str, file_url: str) -> None: assert FilesystemConfiguration.is_local_path(bucket_url) is True - assert FilesystemConfiguration.make_file_uri(bucket_url) == file_url + assert FilesystemConfiguration.make_file_url(bucket_url) == file_url c = resolve_configuration(FilesystemConfiguration(bucket_url)) assert c.protocol == "file" @@ -66,7 +66,7 @@ def test_local_path_win_configuration(bucket_url: str, file_url: str) -> None: def test_local_user_win_path_configuration(bucket_url: str) -> None: file_url = "file:///" + pathlib.Path(bucket_url).expanduser().as_posix().lstrip("/") assert FilesystemConfiguration.is_local_path(bucket_url) is True - assert FilesystemConfiguration.make_file_uri(bucket_url) == file_url + assert FilesystemConfiguration.make_file_url(bucket_url) == file_url c = resolve_configuration(FilesystemConfiguration(bucket_url)) assert c.protocol == "file" @@ -99,7 +99,7 @@ def test_file_win_configuration() -> None: ) def test_file_posix_configuration(bucket_url: str, file_url: str) -> None: assert FilesystemConfiguration.is_local_path(bucket_url) is True - assert FilesystemConfiguration.make_file_uri(bucket_url) == file_url + assert FilesystemConfiguration.make_file_url(bucket_url) == file_url c = resolve_configuration(FilesystemConfiguration(bucket_url)) assert c.protocol == "file" @@ -117,7 +117,7 @@ def test_file_posix_configuration(bucket_url: str, file_url: str) -> None: def test_local_user_posix_path_configuration(bucket_url: str) -> None: file_url = "file:///" + pathlib.Path(bucket_url).expanduser().as_posix().lstrip("/") assert FilesystemConfiguration.is_local_path(bucket_url) is True - assert FilesystemConfiguration.make_file_uri(bucket_url) == file_url + assert FilesystemConfiguration.make_file_url(bucket_url) == file_url c = resolve_configuration(FilesystemConfiguration(bucket_url)) assert c.protocol == "file" @@ -166,7 +166,7 @@ def test_file_filesystem_configuration( assert FilesystemConfiguration.make_local_path(bucket_url) == str( pathlib.Path(local_path).resolve() ) - assert FilesystemConfiguration.make_file_uri(local_path) == norm_bucket_url + assert 
FilesystemConfiguration.make_file_url(local_path) == norm_bucket_url if local_path == "": with pytest.raises(ConfigurationValueError): diff --git a/tests/destinations/test_destination_name_and_config.py b/tests/destinations/test_destination_name_and_config.py index 11de706722..1e432a7803 100644 --- a/tests/destinations/test_destination_name_and_config.py +++ b/tests/destinations/test_destination_name_and_config.py @@ -60,7 +60,7 @@ def test_set_name_and_environment() -> None: def test_preserve_destination_instance() -> None: dummy1 = dummy(destination_name="dummy1", environment="dev/null/1") filesystem1 = filesystem( - FilesystemConfiguration.make_file_uri(TEST_STORAGE_ROOT), + FilesystemConfiguration.make_file_url(TEST_STORAGE_ROOT), destination_name="local_fs", environment="devel", ) @@ -210,7 +210,7 @@ def test_destination_config_in_name(environment: DictStrStr) -> None: with pytest.raises(ConfigFieldMissingException): p.destination_client() - environment["DESTINATION__FILESYSTEM-PROD__BUCKET_URL"] = FilesystemConfiguration.make_file_uri( + environment["DESTINATION__FILESYSTEM-PROD__BUCKET_URL"] = FilesystemConfiguration.make_file_url( "_storage" ) assert p._fs_client().dataset_path.endswith(p.dataset_name) diff --git a/tests/load/databricks/test_databricks_configuration.py b/tests/load/databricks/test_databricks_configuration.py index f6a06180c9..bb989a887c 100644 --- a/tests/load/databricks/test_databricks_configuration.py +++ b/tests/load/databricks/test_databricks_configuration.py @@ -3,9 +3,12 @@ pytest.importorskip("databricks") +from dlt.common.exceptions import TerminalValueError +from dlt.destinations.impl.databricks.databricks import DatabricksLoadJob +from dlt.common.configuration import resolve_configuration +from dlt.destinations import databricks from dlt.destinations.impl.databricks.configuration import DatabricksClientConfiguration -from dlt.common.configuration import resolve_configuration # mark all tests as essential, do not remove pytestmark = pytest.mark.essential @@ -34,3 +37,48 @@ def test_databricks_credentials_to_connector_params(): assert params["extra_a"] == "a" assert params["extra_b"] == "b" assert params["_socket_timeout"] == credentials.socket_timeout + + +def test_databricks_configuration() -> None: + bricks = databricks() + config = bricks.configuration(None, accept_partial=True) + assert config.is_staging_external_location is False + assert config.staging_credentials_name is None + + os.environ["IS_STAGING_EXTERNAL_LOCATION"] = "true" + os.environ["STAGING_CREDENTIALS_NAME"] = "credential" + config = bricks.configuration(None, accept_partial=True) + assert config.is_staging_external_location is True + assert config.staging_credentials_name == "credential" + + # explicit params + bricks = databricks(is_staging_external_location=None, staging_credentials_name="credential2") + config = bricks.configuration(None, accept_partial=True) + assert config.staging_credentials_name == "credential2" + assert config.is_staging_external_location is None + + +def test_databricks_abfss_converter() -> None: + with pytest.raises(TerminalValueError): + DatabricksLoadJob.ensure_databricks_abfss_url("az://dlt-ci-test-bucket") + + abfss_url = DatabricksLoadJob.ensure_databricks_abfss_url( + "az://dlt-ci-test-bucket", "my_account" + ) + assert abfss_url == "abfss://dlt-ci-test-bucket@my_account.dfs.core.windows.net" + + abfss_url = DatabricksLoadJob.ensure_databricks_abfss_url( + "az://dlt-ci-test-bucket/path/to/file.parquet", "my_account" + ) + assert ( + abfss_url + == 
"abfss://dlt-ci-test-bucket@my_account.dfs.core.windows.net/path/to/file.parquet" + ) + + abfss_url = DatabricksLoadJob.ensure_databricks_abfss_url( + "az://dlt-ci-test-bucket@my_account.dfs.core.windows.net/path/to/file.parquet" + ) + assert ( + abfss_url + == "abfss://dlt-ci-test-bucket@my_account.dfs.core.windows.net/path/to/file.parquet" + ) diff --git a/tests/load/filesystem/test_filesystem_common.py b/tests/load/filesystem/test_filesystem_common.py index 3cad7dda2c..29ca1a2b57 100644 --- a/tests/load/filesystem/test_filesystem_common.py +++ b/tests/load/filesystem/test_filesystem_common.py @@ -3,8 +3,8 @@ from typing import Tuple, Union, Dict from urllib.parse import urlparse - -from fsspec import AbstractFileSystem +from fsspec import AbstractFileSystem, get_filesystem_class, register_implementation +from fsspec.core import filesystem as fs_filesystem import pytest from tenacity import retry, stop_after_attempt, wait_fixed @@ -15,6 +15,7 @@ from dlt.common.configuration.inject import with_config from dlt.common.configuration.specs import AnyAzureCredentials from dlt.common.storages import fsspec_from_config, FilesystemConfiguration +from dlt.common.storages.configuration import make_fsspec_url from dlt.common.storages.fsspec_filesystem import MTIME_DISPATCH, glob_files from dlt.common.utils import custom_environ, uniq_id from dlt.destinations import filesystem @@ -22,11 +23,12 @@ FilesystemDestinationClientConfiguration, ) from dlt.destinations.impl.filesystem.typing import TExtraPlaceholders + +from tests.common.configuration.utils import environment from tests.common.storages.utils import TEST_SAMPLE_FILES, assert_sample_files -from tests.load.utils import ALL_FILESYSTEM_DRIVERS, AWS_BUCKET +from tests.load.utils import ALL_FILESYSTEM_DRIVERS, AWS_BUCKET, WITH_GDRIVE_BUCKETS from tests.utils import autouse_test_storage -from .utils import self_signed_cert -from tests.common.configuration.utils import environment +from tests.load.filesystem.utils import self_signed_cert # mark all tests as essential, do not remove @@ -53,6 +55,24 @@ def test_filesystem_configuration() -> None: } +@pytest.mark.parametrize("bucket_url", WITH_GDRIVE_BUCKETS) +def test_remote_url(bucket_url: str) -> None: + # make absolute urls out of paths + scheme = urlparse(bucket_url).scheme + if not scheme: + scheme = "file" + bucket_url = FilesystemConfiguration.make_file_url(bucket_url) + if scheme == "gdrive": + from dlt.common.storages.fsspecs.google_drive import GoogleDriveFileSystem + + register_implementation("gdrive", GoogleDriveFileSystem, "GoogleDriveFileSystem") + + fs_class = get_filesystem_class(scheme) + fs_path = fs_class._strip_protocol(bucket_url) + # reconstitute url + assert make_fsspec_url(scheme, fs_path, bucket_url) == bucket_url + + def test_filesystem_instance(with_gdrive_buckets_env: str) -> None: @retry(stop=stop_after_attempt(10), wait=wait_fixed(1), reraise=True) def check_file_exists(filedir_: str, file_url_: str): @@ -72,10 +92,8 @@ def check_file_changed(file_url_: str): bucket_url = os.environ["DESTINATION__FILESYSTEM__BUCKET_URL"] config = get_config() # we do not add protocol to bucket_url (we need relative path) - assert bucket_url.startswith(config.protocol) or config.protocol == "file" + assert bucket_url.startswith(config.protocol) or config.is_local_filesystem filesystem, url = fsspec_from_config(config) - if config.protocol != "file": - assert bucket_url.endswith(url) # do a few file ops now = pendulum.now() filename = f"filesystem_common_{uniq_id()}" @@ -113,7 +131,9 @@ def 
test_glob_overlapping_path_files(with_gdrive_buckets_env: str) -> None: # "standard_source/sample" overlaps with a real existing "standard_source/samples". walk operation on azure # will return all files from "standard_source/samples" and report the wrong "standard_source/sample" path to the user # here we test we do not have this problem with out glob - bucket_url, _, filesystem = glob_test_setup(bucket_url, "standard_source/sample") + bucket_url, config, filesystem = glob_test_setup(bucket_url, "standard_source/sample") + if config.protocol in ["file"]: + pytest.skip(f"{config.protocol} not supported in this test") # use glob to get data all_file_items = list(glob_files(filesystem, bucket_url)) assert len(all_file_items) == 0 @@ -272,18 +292,18 @@ def glob_test_setup( config = get_config() # enable caches config.read_only = True - if config.protocol in ["file"]: - pytest.skip(f"{config.protocol} not supported in this test") # may contain query string - bucket_url_parsed = urlparse(bucket_url) - bucket_url = bucket_url_parsed._replace( - path=posixpath.join(bucket_url_parsed.path, glob_folder) - ).geturl() - filesystem, _ = fsspec_from_config(config) + filesystem, fs_path = fsspec_from_config(config) + bucket_url = make_fsspec_url(config.protocol, posixpath.join(fs_path, glob_folder), bucket_url) if config.protocol == "memory": - mem_path = os.path.join("m", "standard_source") + mem_path = os.path.join("/m", "standard_source") if not filesystem.isdir(mem_path): filesystem.mkdirs(mem_path) filesystem.upload(TEST_SAMPLE_FILES, mem_path, recursive=True) + if config.protocol == "file": + file_path = os.path.join("_storage", "standard_source") + if not filesystem.isdir(file_path): + filesystem.mkdirs(file_path) + filesystem.upload(TEST_SAMPLE_FILES, file_path, recursive=True) return bucket_url, config, filesystem diff --git a/tests/load/pipeline/test_databricks_pipeline.py b/tests/load/pipeline/test_databricks_pipeline.py new file mode 100644 index 0000000000..5f8641f9fa --- /dev/null +++ b/tests/load/pipeline/test_databricks_pipeline.py @@ -0,0 +1,85 @@ +import pytest +import os + +from dlt.common.utils import uniq_id +from tests.load.utils import DestinationTestConfiguration, destinations_configs, AZ_BUCKET +from tests.pipeline.utils import assert_load_info + + +# mark all tests as essential, do not remove +pytestmark = pytest.mark.essential + + +@pytest.mark.parametrize( + "destination_config", + destinations_configs( + default_sql_configs=True, bucket_subset=(AZ_BUCKET), subset=("databricks",) + ), + ids=lambda x: x.name, +) +def test_databricks_external_location(destination_config: DestinationTestConfiguration) -> None: + # do not interfere with state + os.environ["RESTORE_FROM_DESTINATION"] = "False" + dataset_name = "test_databricks_external_location" + uniq_id() + + from dlt.destinations import databricks, filesystem + from dlt.destinations.impl.databricks.databricks import DatabricksLoadJob + + abfss_bucket_url = DatabricksLoadJob.ensure_databricks_abfss_url(AZ_BUCKET, "dltdata") + stage = filesystem(abfss_bucket_url) + + # should load abfss formatted url just fine + bricks = databricks(is_staging_external_location=False) + pipeline = destination_config.setup_pipeline( + "test_databricks_external_location", + dataset_name=dataset_name, + destination=bricks, + staging=stage, + ) + info = pipeline.run([1, 2, 3], table_name="digits") + assert_load_info(info) + # get metrics + metrics = info.metrics[info.loads_ids[0]][0] + remote_url = list(metrics["job_metrics"].values())[0].remote_url + 
# abfss form was preserved + assert remote_url.startswith(abfss_bucket_url) + + # should fail on internal config error as external location is not configured + bricks = databricks(is_staging_external_location=True) + pipeline = destination_config.setup_pipeline( + "test_databricks_external_location", + dataset_name=dataset_name, + destination=bricks, + staging=stage, + ) + info = pipeline.run([1, 2, 3], table_name="digits") + assert info.has_failed_jobs is True + assert ( + "Invalid configuration value detected" + in pipeline.list_failed_jobs_in_package(info.loads_ids[0])[0].failed_message + ) + + # should fail on non existing stored credentials + bricks = databricks(is_staging_external_location=False, staging_credentials_name="CREDENTIAL_X") + pipeline = destination_config.setup_pipeline( + "test_databricks_external_location", + dataset_name=dataset_name, + destination=bricks, + staging=stage, + ) + info = pipeline.run([1, 2, 3], table_name="digits") + assert info.has_failed_jobs is True + assert ( + "credential_x" in pipeline.list_failed_jobs_in_package(info.loads_ids[0])[0].failed_message + ) + + # should fail on non existing stored credentials + # auto stage with regular az:// used + pipeline = destination_config.setup_pipeline( + "test_databricks_external_location", dataset_name=dataset_name, destination=bricks + ) + info = pipeline.run([1, 2, 3], table_name="digits") + assert info.has_failed_jobs is True + assert ( + "credential_x" in pipeline.list_failed_jobs_in_package(info.loads_ids[0])[0].failed_message + ) diff --git a/tests/load/pipeline/test_filesystem_pipeline.py b/tests/load/pipeline/test_filesystem_pipeline.py index d88eba7c06..bc6cbd9848 100644 --- a/tests/load/pipeline/test_filesystem_pipeline.py +++ b/tests/load/pipeline/test_filesystem_pipeline.py @@ -300,16 +300,16 @@ def data_types(): assert len(rows) == 10 assert_all_data_types_row(rows[0], schema=column_schemas) - # make sure remote_uri is in metrics + # make sure remote_url is in metrics metrics = info.metrics[info.loads_ids[0]][0] - # TODO: only final copy job has remote_uri. not the initial (empty) job for particular files - # we could implement an empty job for delta that generates correct remote_uri - remote_uri = list(metrics["job_metrics"].values())[-1].remote_uri - assert remote_uri.endswith("data_types") - bucket_uri = destination_config.bucket_url - if FilesystemConfiguration.is_local_path(bucket_uri): - bucket_uri = FilesystemConfiguration.make_file_uri(bucket_uri) - assert remote_uri.startswith(bucket_uri) + # TODO: only final copy job has remote_url. 
not the initial (empty) job for particular files + # we could implement an empty job for delta that generates correct remote_url + remote_url = list(metrics["job_metrics"].values())[-1].remote_url + assert remote_url.endswith("data_types") + bucket_url = destination_config.bucket_url + if FilesystemConfiguration.is_local_path(bucket_url): + bucket_url = FilesystemConfiguration.make_file_url(bucket_url) + assert remote_url.startswith(bucket_url) # another run should append rows to the table info = pipeline.run(data_types()) diff --git a/tests/load/pipeline/test_stage_loading.py b/tests/load/pipeline/test_stage_loading.py index f216fa3c05..42dee5fc8f 100644 --- a/tests/load/pipeline/test_stage_loading.py +++ b/tests/load/pipeline/test_stage_loading.py @@ -57,17 +57,17 @@ def test_staging_load(destination_config: DestinationTestConfiguration) -> None: info = pipeline.run(github(), loader_file_format=destination_config.file_format) assert_load_info(info) - # checks if remote_uri is set correctly on copy jobs + # checks if remote_url is set correctly on copy jobs metrics = info.metrics[info.loads_ids[0]][0] for job_metrics in metrics["job_metrics"].values(): - remote_uri = job_metrics.remote_uri + remote_url = job_metrics.remote_url job_ext = os.path.splitext(job_metrics.job_id)[1] if job_ext not in (".reference", ".sql"): - assert remote_uri.endswith(job_ext) + assert remote_url.endswith(job_ext) bucket_uri = destination_config.bucket_url if FilesystemConfiguration.is_local_path(bucket_uri): - bucket_uri = FilesystemConfiguration.make_file_uri(bucket_uri) - assert remote_uri.startswith(bucket_uri) + bucket_uri = FilesystemConfiguration.make_file_url(bucket_uri) + assert remote_url.startswith(bucket_uri) package_info = pipeline.get_load_package_info(info.loads_ids[0]) assert package_info.state == "loaded" diff --git a/tests/load/test_dummy_client.py b/tests/load/test_dummy_client.py index 59b7acac15..72c5772668 100644 --- a/tests/load/test_dummy_client.py +++ b/tests/load/test_dummy_client.py @@ -1012,17 +1012,17 @@ def assert_complete_job( if state == "failed_jobs" else "completed" ) - remote_uri = job_metrics.remote_uri + remote_url = job_metrics.remote_url if load.initial_client_config.create_followup_jobs: # type: ignore - assert remote_uri.endswith(job.file_name()) + assert remote_url.endswith(job.file_name()) elif load.is_staging_destination_job(job.file_name()): # staging destination should contain reference to remote filesystem assert ( - FilesystemConfiguration.make_file_uri(REMOTE_FILESYSTEM) - in remote_uri + FilesystemConfiguration.make_file_url(REMOTE_FILESYSTEM) + in remote_url ) else: - assert remote_uri is None + assert remote_url is None else: assert job_metrics is None diff --git a/tests/load/utils.py b/tests/load/utils.py index 086109de8b..15b1e1575e 100644 --- a/tests/load/utils.py +++ b/tests/load/utils.py @@ -70,6 +70,7 @@ AWS_BUCKET = dlt.config.get("tests.bucket_url_s3", str) GCS_BUCKET = dlt.config.get("tests.bucket_url_gs", str) AZ_BUCKET = dlt.config.get("tests.bucket_url_az", str) +ABFS_BUCKET = dlt.config.get("tests.bucket_url_abfss", str) GDRIVE_BUCKET = dlt.config.get("tests.bucket_url_gdrive", str) FILE_BUCKET = dlt.config.get("tests.bucket_url_file", str) R2_BUCKET = dlt.config.get("tests.bucket_url_r2", str) @@ -79,6 +80,7 @@ "s3", "gs", "az", + "abfss", "gdrive", "file", "memory", @@ -86,7 +88,15 @@ ] # Filter out buckets not in all filesystem drivers -WITH_GDRIVE_BUCKETS = [GCS_BUCKET, AWS_BUCKET, FILE_BUCKET, MEMORY_BUCKET, AZ_BUCKET, GDRIVE_BUCKET] 
+WITH_GDRIVE_BUCKETS = [ + GCS_BUCKET, + AWS_BUCKET, + FILE_BUCKET, + MEMORY_BUCKET, + ABFS_BUCKET, + AZ_BUCKET, + GDRIVE_BUCKET, +] WITH_GDRIVE_BUCKETS = [ bucket for bucket in WITH_GDRIVE_BUCKETS diff --git a/tests/pipeline/cases/contracts/trace.schema.yaml b/tests/pipeline/cases/contracts/trace.schema.yaml index 89831977c0..c324818338 100644 --- a/tests/pipeline/cases/contracts/trace.schema.yaml +++ b/tests/pipeline/cases/contracts/trace.schema.yaml @@ -562,7 +562,7 @@ tables: finished_at: data_type: timestamp nullable: true - remote_uri: + remote_url: data_type: text nullable: true parent: trace__steps diff --git a/tests/pipeline/test_pipeline.py b/tests/pipeline/test_pipeline.py index b6a7feffc1..027a2b4e72 100644 --- a/tests/pipeline/test_pipeline.py +++ b/tests/pipeline/test_pipeline.py @@ -2600,6 +2600,20 @@ def ids(_id=dlt.sources.incremental("_id", initial_value=2)): assert pipeline.last_trace.last_normalize_info.row_counts["_ids"] == 2 +def test_dlt_columns_nested_table_collisions() -> None: + # we generate all identifiers in upper case to test for a bug where dlt columns for nested tables were hardcoded to + # small caps. they got normalized to upper case after the first run and then added again as small caps + # generating duplicate columns and raising collision exception as duckdb is ci destination + duck = duckdb(naming_convention="tests.common.cases.normalizers.sql_upper") + pipeline = dlt.pipeline("test_dlt_columns_child_table_collisions", destination=duck) + customers = [ + {"id": 1, "name": "dave", "orders": [1, 2, 3]}, + ] + assert_load_info(pipeline.run(customers, table_name="CUSTOMERS")) + # this one would fail without bugfix + assert_load_info(pipeline.run(customers, table_name="CUSTOMERS")) + + def test_access_pipeline_in_resource() -> None: pipeline = dlt.pipeline("test_access_pipeline_in_resource", destination="duckdb") diff --git a/tests/pipeline/test_pipeline_trace.py b/tests/pipeline/test_pipeline_trace.py index 4e52d2aa29..d2bb035a17 100644 --- a/tests/pipeline/test_pipeline_trace.py +++ b/tests/pipeline/test_pipeline_trace.py @@ -315,7 +315,7 @@ def data(): return data() - # create pipeline with staging to get remote_uri in load step job_metrics + # create pipeline with staging to get remote_url in load step job_metrics dummy_dest = dummy(completed_prob=1.0) pipeline = dlt.pipeline( pipeline_name="test_trace_schema", From 63f89542678c7af51089f94365aa6834ccca90e7 Mon Sep 17 00:00:00 2001 From: Marcin Rudolf Date: Wed, 28 Aug 2024 13:20:16 +0200 Subject: [PATCH 10/10] bumps dlt version to 0.5.4 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 74161f5ccc..d32285572f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "dlt" -version = "0.5.4a0" +version = "0.5.4" description = "dlt is an open-source python-first scalable data loading library that does not require any backend to run." authors = ["dltHub Inc. "] maintainers = [ "Marcin Rudolf ", "Adrian Brudaru ", "Anton Burnashev ", "David Scharf " ]
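The `remote_uri` to `remote_url` rename is user-visible through load metrics. A hedged sketch of inspecting it after a run, mirroring the tests above and using the `dummy` destination so no cloud credentials are needed; the pipeline name is illustrative, and the field stays `None` unless a followup/copy job sets it.

```py
# Sketch: read the renamed remote_url field from job metrics after a load.
import dlt
from dlt.destinations import dummy

pipeline = dlt.pipeline("metrics_demo", destination=dummy(completed_prob=1.0))
info = pipeline.run([{"id": 1}, {"id": 2}], table_name="digits")

metrics = info.metrics[info.loads_ids[0]][0]
for job_metrics in metrics["job_metrics"].values():
    # remote_url is None here unless the destination created a followup job
    print(job_metrics.job_id, job_metrics.remote_url)
```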