From 24181da87e98afcf97f3cf00030a8c0e8e7a71e3 Mon Sep 17 00:00:00 2001
From: Steinthor Palsson
Date: Fri, 1 Mar 2024 18:46:11 -0500
Subject: [PATCH] Init experimental refresh = full with drop command

---
 dlt/pipeline/__init__.py      |  6 ++++-
 dlt/pipeline/configuration.py |  5 ++++
 dlt/pipeline/helpers.py       | 48 +++++++++++++++++++++++++++++++----
 dlt/pipeline/pipeline.py      | 11 ++++++--
 dlt/pipeline/typing.py        |  2 ++
 5 files changed, 64 insertions(+), 8 deletions(-)

diff --git a/dlt/pipeline/__init__.py b/dlt/pipeline/__init__.py
index cfc1ac17d3..0281ce31ef 100644
--- a/dlt/pipeline/__init__.py
+++ b/dlt/pipeline/__init__.py
@@ -1,4 +1,4 @@
-from typing import Sequence, cast, overload
+from typing import Sequence, cast, overload, Optional
 
 from dlt.common.schema import Schema
 from dlt.common.schema.typing import TColumnSchema, TWriteDisposition, TSchemaContract
@@ -14,6 +14,7 @@
 from dlt.pipeline.pipeline import Pipeline
 from dlt.pipeline.progress import _from_name as collector_from_name, TCollectorArg, _NULL_COLLECTOR
 from dlt.pipeline.warnings import credentials_argument_deprecated, full_refresh_argument_deprecated
+from dlt.pipeline.typing import TRefreshMode
 
 
 @overload
@@ -28,6 +29,7 @@ def pipeline(
     export_schema_path: str = None,
     full_refresh: bool = False,
     dev_mode: bool = False,
+    refresh: Optional[TRefreshMode] = None,
     credentials: Any = None,
     progress: TCollectorArg = _NULL_COLLECTOR,
 ) -> Pipeline:
@@ -97,7 +99,8 @@ def pipeline(
     export_schema_path: str = None,
     full_refresh: bool = False,
     dev_mode: bool = False,
+    refresh: Optional[TRefreshMode] = None,
     credentials: Any = None,
     progress: TCollectorArg = _NULL_COLLECTOR,
     **kwargs: Any,
 ) -> Pipeline:
@@ -147,6 +150,7 @@ def pipeline(
         False,
         last_config(**kwargs),
         kwargs["runtime"],
+        refresh=refresh,
     )
     # set it as current pipeline
     p.activate()
diff --git a/dlt/pipeline/configuration.py b/dlt/pipeline/configuration.py
index 9b602ed9f4..361f627dfc 100644
--- a/dlt/pipeline/configuration.py
+++ b/dlt/pipeline/configuration.py
@@ -5,6 +5,7 @@
 from dlt.common.typing import AnyFun, TSecretValue
 from dlt.common.utils import digest256
 from dlt.common.data_writers import TLoaderFileFormat
+from dlt.pipeline.typing import TRefreshMode
 
 
 @configspec
@@ -30,6 +31,10 @@ class PipelineConfiguration(BaseConfiguration):
     """When set to True, each instance of the pipeline with the `pipeline_name` starts from scratch when run and loads the data to a separate dataset."""
     progress: Optional[str] = None
     runtime: RunConfiguration
+    refresh: Optional[TRefreshMode] = None
+    """Refresh mode for the pipeline; use with care. `full` completely wipes the pipeline state and data before each run.
+    `replace` wipes only the state and data of the resources selected to run. The default is `None`, which means no refresh.
+    """

     def on_resolved(self) -> None:
         if not self.pipeline_name:
diff --git a/dlt/pipeline/helpers.py b/dlt/pipeline/helpers.py
index 7bba5f84e7..e5c2e25cf8 100644
--- a/dlt/pipeline/helpers.py
+++ b/dlt/pipeline/helpers.py
@@ -1,5 +1,17 @@
 import contextlib
-from typing import Callable, Sequence, Iterable, Optional, Any, List, Dict, Tuple, Union, TypedDict
+from typing import (
+    Callable,
+    Sequence,
+    Iterable,
+    Optional,
+    Any,
+    List,
+    Dict,
+    Tuple,
+    Union,
+    TypedDict,
+    TYPE_CHECKING,
+)
 from itertools import chain
 
 from dlt.common.jsonpath import resolve_paths, TAnyJsonPath, compile_paths
@@ -17,6 +29,8 @@
     _sources_state,
     _delete_source_state_keys,
     _get_matching_resources,
+    StateInjectableContext,
+    Container,
 )
 from dlt.common.destination.reference import WithStagingDataset
 
@@ -27,7 +41,10 @@
     PipelineHasPendingDataException,
 )
 from dlt.pipeline.typing import TPipelineStep
-from dlt.pipeline import Pipeline
+from dlt.common.configuration.exceptions import ContextDefaultCannotBeCreated
+
+if TYPE_CHECKING:
+    from dlt.pipeline import Pipeline
 
 
 def retry_load(
@@ -77,13 +94,22 @@ class _DropInfo(TypedDict):
 class DropCommand:
     def __init__(
         self,
-        pipeline: Pipeline,
+        pipeline: "Pipeline",
         resources: Union[Iterable[Union[str, TSimpleRegex]], Union[str, TSimpleRegex]] = (),
         schema_name: Optional[str] = None,
         state_paths: TAnyJsonPath = (),
         drop_all: bool = False,
         state_only: bool = False,
     ) -> None:
+        """
+        Args:
+            pipeline: Pipeline to drop tables and state from
+            resources: List of resources to drop. If empty, no resources are dropped unless `drop_all` is True
+            schema_name: Name of the schema to drop tables from. If not specified, the default schema is used
+            state_paths: JSON path(s) relative to the source state to drop
+            drop_all: Drop all resources and tables in the schema (supersedes the `resources` list)
+            state_only: Drop only the state, not the tables
+        """
         self.pipeline = pipeline
         if isinstance(resources, str):
             resources = [resources]
@@ -187,7 +213,10 @@ def _create_modified_state(self) -> Dict[str, Any]:
                 self.info["resource_states"].append(key)
                 reset_resource_state(key, source_state)
             # drop additional state paths
-            resolved_paths = resolve_paths(self.state_paths_to_drop, source_state)
+            # Don't drop the 'resources' key if the JSON path is a wildcard
+            resolved_paths = [
+                p for p in resolve_paths(self.state_paths_to_drop, source_state) if p != "resources"
+            ]
             if self.state_paths_to_drop and not resolved_paths:
                 self.info["warnings"].append(
                     f"State paths {self.state_paths_to_drop} did not select any paths in source"
@@ -202,6 +231,15 @@ def _drop_state_keys(self) -> None:
         with self.pipeline.managed_state(extract_state=True) as state:  # type: ignore[assignment]
             state.clear()
             state.update(self._new_state)
+        try:
+            # Also update the state in the current context, if one is active,
+            # so that the pipeline can run directly after the drop in the same process
+            ctx = Container()[StateInjectableContext]
+            state = ctx.state  # type: ignore[assignment]
+            state.clear()
+            state.update(self._new_state)
+        except ContextDefaultCannotBeCreated:
+            pass
 
     def __call__(self) -> None:
         if (
@@ -236,7 +274,7 @@ def __call__(self) -> None:
 
 
 def drop(
-    pipeline: Pipeline,
+    pipeline: "Pipeline",
     resources: Union[Iterable[str], str] = (),
     schema_name: str = None,
     state_paths: TAnyJsonPath = (),
diff --git a/dlt/pipeline/pipeline.py b/dlt/pipeline/pipeline.py
index 46d1c590f7..87cdb33727 100644
--- a/dlt/pipeline/pipeline.py
+++ b/dlt/pipeline/pipeline.py
@@ -124,7 +124,7 @@
     end_trace_step,
     end_trace,
 )
-from dlt.pipeline.typing import TPipelineStep
+from dlt.pipeline.typing import TPipelineStep, TRefreshMode
 from dlt.pipeline.state_sync import (
     STATE_ENGINE_VERSION,
     bump_version_if_modified,
@@ -135,6 +135,7 @@
     json_decode_state,
 )
 from dlt.pipeline.warnings import credentials_argument_deprecated
+from dlt.pipeline.helpers import drop as drop_command
 
 
 def with_state_sync(may_extract_state: bool = False) -> Callable[[TFun], TFun]:
@@ -291,6 +292,7 @@ class Pipeline(SupportsPipeline):
     collector: _Collector
     config: PipelineConfiguration
     runtime_config: RunConfiguration
+    refresh: Optional[TRefreshMode] = None
 
     def __init__(
         self,
@@ -308,6 +310,7 @@ def __init__(
         must_attach_to_local_pipeline: bool,
         config: PipelineConfiguration,
         runtime: RunConfiguration,
+        refresh: Optional[TRefreshMode] = None,
     ) -> None:
         """Initializes the Pipeline class which implements `dlt` pipeline. Please use `pipeline` function in `dlt` module to create a new Pipeline instance."""
         self.pipeline_salt = pipeline_salt
@@ -317,6 +320,7 @@ def __init__(
         self.collector = progress or _NULL_COLLECTOR
         self.destination = None
         self.staging = None
+        self.refresh = refresh
 
         self._container = Container()
         self._pipeline_instance_id = self._create_pipeline_instance_id()
@@ -386,6 +390,9 @@ def extract(
         schema_contract: TSchemaContract = None,
     ) -> ExtractInfo:
         """Extracts the `data` and prepare it for the normalization. Does not require destination or credentials to be configured. See `run` method for the arguments' description."""
+        if self.refresh == "full":
+            drop_command(self, drop_all=True, state_paths="*")
+
         # create extract storage to which all the sources will be extracted
         extract_step = Extract(
             self._schema_storage,
@@ -1101,7 +1108,7 @@ def _get_destination_client_initial_config(
         if issubclass(client_spec, DestinationClientDwhConfiguration):
             if not self.dataset_name and self.dev_mode:
                 logger.warning(
-                    "Full refresh may not work if dataset name is not set. Please set the"
+                    "Dev mode may not work if dataset name is not set. Please set the"
                     " dataset_name argument in dlt.pipeline or run method"
                 )
             # set default schema name to load all incoming data to a single dataset, no matter what is the current schema name
diff --git a/dlt/pipeline/typing.py b/dlt/pipeline/typing.py
index f0192a504d..ec0eca4685 100644
--- a/dlt/pipeline/typing.py
+++ b/dlt/pipeline/typing.py
@@ -1,3 +1,5 @@
 from typing import Literal
 
 TPipelineStep = Literal["sync", "extract", "normalize", "load"]
+
+TRefreshMode = Literal["full", "replace"]
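
Usage note: with this patch, passing `refresh="full"` to `dlt.pipeline()` makes
`Pipeline.extract` invoke the drop command with `drop_all=True` and `state_paths="*"`
before extraction, so every run starts from wiped state and tables. A minimal sketch
(not part of the diff above), assuming the `duckdb` destination and a hypothetical
`demo_numbers` resource:

    import dlt

    @dlt.resource
    def demo_numbers():
        # hypothetical resource, for illustration only
        yield from [{"n": 1}, {"n": 2}]

    # refresh="full" wipes pipeline state and previously loaded data on every run
    pipeline = dlt.pipeline(
        pipeline_name="refresh_full_demo",
        destination="duckdb",
        refresh="full",
    )
    pipeline.run(demo_numbers())

The same wipe can also be triggered manually through the reworked helper, e.g.
`drop(pipeline, drop_all=True, state_paths="*")` from `dlt.pipeline.helpers`.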