diff --git a/dlt/common/destination/exceptions.py b/dlt/common/destination/exceptions.py new file mode 100644 index 0000000000..1b5423ff02 --- /dev/null +++ b/dlt/common/destination/exceptions.py @@ -0,0 +1,126 @@ +from typing import Any, Iterable, List + +from dlt.common.exceptions import DltException, TerminalException, TransientException + + +class DestinationException(DltException): + pass + + +class UnknownDestinationModule(DestinationException): + def __init__(self, destination_module: str) -> None: + self.destination_module = destination_module + if "." in destination_module: + msg = f"Destination module {destination_module} could not be found and imported" + else: + msg = f"Destination {destination_module} is not one of the standard dlt destinations" + super().__init__(msg) + + +class InvalidDestinationReference(DestinationException): + def __init__(self, destination_module: Any) -> None: + self.destination_module = destination_module + msg = f"Destination {destination_module} is not a valid destination module." + super().__init__(msg) + + +class DestinationTerminalException(DestinationException, TerminalException): + pass + + +class DestinationUndefinedEntity(DestinationTerminalException): + pass + + +class DestinationTransientException(DestinationException, TransientException): + pass + + +class DestinationLoadingViaStagingNotSupported(DestinationTerminalException): + def __init__(self, destination: str) -> None: + self.destination = destination + super().__init__(f"Destination {destination} does not support loading via staging.") + + +class DestinationLoadingWithoutStagingNotSupported(DestinationTerminalException): + def __init__(self, destination: str) -> None: + self.destination = destination + super().__init__(f"Destination {destination} does not support loading without staging.") + + +class DestinationNoStagingMode(DestinationTerminalException): + def __init__(self, destination: str) -> None: + self.destination = destination + super().__init__(f"Destination {destination} cannot be used as a staging") + + +class DestinationIncompatibleLoaderFileFormatException(DestinationTerminalException): + def __init__( + self, destination: str, staging: str, file_format: str, supported_formats: Iterable[str] + ) -> None: + self.destination = destination + self.staging = staging + self.file_format = file_format + self.supported_formats = supported_formats + supported_formats_str = ", ".join(supported_formats) + if self.staging: + if not supported_formats: + msg = ( + f"Staging {staging} cannot be used with destination {destination} because they" + " have no file formats in common." + ) + else: + msg = ( + f"Unsupported file format {file_format} for destination {destination} in" + f" combination with staging destination {staging}. Supported formats:" + f" {supported_formats_str}" + ) + else: + msg = ( + f"Unsupported file format {file_format} destination {destination}. Supported" + f" formats: {supported_formats_str}. Check the staging option in the dlt.pipeline" + " for additional formats." 
+ ) + super().__init__(msg) + + +class IdentifierTooLongException(DestinationTerminalException): + def __init__( + self, + destination_name: str, + identifier_type: str, + identifier_name: str, + max_identifier_length: int, + ) -> None: + self.destination_name = destination_name + self.identifier_type = identifier_type + self.identifier_name = identifier_name + self.max_identifier_length = max_identifier_length + super().__init__( + f"The length of {identifier_type} {identifier_name} exceeds" + f" {max_identifier_length} allowed for {destination_name}" + ) + + +class DestinationHasFailedJobs(DestinationTerminalException): + def __init__(self, destination_name: str, load_id: str, failed_jobs: List[Any]) -> None: + self.destination_name = destination_name + self.load_id = load_id + self.failed_jobs = failed_jobs + super().__init__( + f"Destination {destination_name} has failed jobs in load package {load_id}" + ) + + +class DestinationSchemaTampered(DestinationTerminalException): + def __init__(self, schema_name: str, version_hash: str, stored_version_hash: str) -> None: + self.version_hash = version_hash + self.stored_version_hash = stored_version_hash + super().__init__( + f"Schema {schema_name} content was changed - by a loader or by destination code - from" + " the moment it was retrieved by load package. Such schema cannot reliably be updated" + f" nor saved. Current version hash: {version_hash} != stored version hash" + f" {stored_version_hash}. If you are using destination client directly, without storing" + " schema in load package, you should first save it into schema storage. You can also" + " use schema._bump_version() in test code to remove modified flag." + ) diff --git a/dlt/common/destination/reference.py b/dlt/common/destination/reference.py index 258efd80be..738c07bdc7 100644 --- a/dlt/common/destination/reference.py +++ b/dlt/common/destination/reference.py @@ -25,11 +25,6 @@ import inspect from dlt.common import logger -from dlt.common.exceptions import ( - IdentifierTooLongException, - InvalidDestinationReference, - UnknownDestinationModule, -) from dlt.common.schema import Schema, TTableSchema, TSchemaTables from dlt.common.schema.exceptions import SchemaException from dlt.common.schema.utils import ( @@ -43,13 +38,18 @@ from dlt.common.configuration.specs import BaseConfiguration, CredentialsConfiguration from dlt.common.configuration.accessors import config from dlt.common.destination.capabilities import DestinationCapabilitiesContext +from dlt.common.destination.exceptions import ( + IdentifierTooLongException, + InvalidDestinationReference, + UnknownDestinationModule, + DestinationSchemaTampered, +) from dlt.common.schema.utils import is_complete_column from dlt.common.schema.exceptions import UnknownTableException from dlt.common.storages import FileStorage from dlt.common.storages.load_storage import ParsedLoadJobFileName from dlt.common.configuration.specs import GcpCredentials, AwsCredentialsWithoutDefaults - TLoaderReplaceStrategy = Literal["truncate-and-insert", "insert-from-staging", "staging-optimized"] TDestinationConfig = TypeVar("TDestinationConfig", bound="DestinationClientConfiguration") TDestinationClient = TypeVar("TDestinationClient", bound="JobClientBase") @@ -318,6 +318,12 @@ def update_stored_schema( Optional[TSchemaTables]: Returns an update that was applied at the destination. 
""" self._verify_schema() + # make sure that schema being saved was not modified from the moment it was loaded from storage + version_hash = self.schema.version_hash + if self.schema.is_modified: + raise DestinationSchemaTampered( + self.schema.name, version_hash, self.schema.stored_version_hash + ) return expected_update @abstractmethod diff --git a/dlt/common/exceptions.py b/dlt/common/exceptions.py index c14a743f33..fe526c53dc 100644 --- a/dlt/common/exceptions.py +++ b/dlt/common/exceptions.py @@ -133,115 +133,6 @@ class SystemConfigurationException(DltException): pass -class DestinationException(DltException): - pass - - -class UnknownDestinationModule(DestinationException): - def __init__(self, destination_module: str) -> None: - self.destination_module = destination_module - if "." in destination_module: - msg = f"Destination module {destination_module} could not be found and imported" - else: - msg = f"Destination {destination_module} is not one of the standard dlt destinations" - super().__init__(msg) - - -class InvalidDestinationReference(DestinationException): - def __init__(self, destination_module: Any) -> None: - self.destination_module = destination_module - msg = f"Destination {destination_module} is not a valid destination module." - super().__init__(msg) - - -class DestinationTerminalException(DestinationException, TerminalException): - pass - - -class DestinationUndefinedEntity(DestinationTerminalException): - pass - - -class DestinationTransientException(DestinationException, TransientException): - pass - - -class DestinationLoadingViaStagingNotSupported(DestinationTerminalException): - def __init__(self, destination: str) -> None: - self.destination = destination - super().__init__(f"Destination {destination} does not support loading via staging.") - - -class DestinationLoadingWithoutStagingNotSupported(DestinationTerminalException): - def __init__(self, destination: str) -> None: - self.destination = destination - super().__init__(f"Destination {destination} does not support loading without staging.") - - -class DestinationNoStagingMode(DestinationTerminalException): - def __init__(self, destination: str) -> None: - self.destination = destination - super().__init__(f"Destination {destination} cannot be used as a staging") - - -class DestinationIncompatibleLoaderFileFormatException(DestinationTerminalException): - def __init__( - self, destination: str, staging: str, file_format: str, supported_formats: Iterable[str] - ) -> None: - self.destination = destination - self.staging = staging - self.file_format = file_format - self.supported_formats = supported_formats - supported_formats_str = ", ".join(supported_formats) - if self.staging: - if not supported_formats: - msg = ( - f"Staging {staging} cannot be used with destination {destination} because they" - " have no file formats in common." - ) - else: - msg = ( - f"Unsupported file format {file_format} for destination {destination} in" - f" combination with staging destination {staging}. Supported formats:" - f" {supported_formats_str}" - ) - else: - msg = ( - f"Unsupported file format {file_format} destination {destination}. Supported" - f" formats: {supported_formats_str}. Check the staging option in the dlt.pipeline" - " for additional formats." 
- ) - super().__init__(msg) - - -class IdentifierTooLongException(DestinationTerminalException): - def __init__( - self, - destination_name: str, - identifier_type: str, - identifier_name: str, - max_identifier_length: int, - ) -> None: - self.destination_name = destination_name - self.identifier_type = identifier_type - self.identifier_name = identifier_name - self.max_identifier_length = max_identifier_length - super().__init__( - f"The length of {identifier_type} {identifier_name} exceeds" - f" {max_identifier_length} allowed for {destination_name}" - ) - - -class DestinationHasFailedJobs(DestinationTerminalException): - def __init__(self, destination_name: str, load_id: str, failed_jobs: List[Any]) -> None: - self.destination_name = destination_name - self.load_id = load_id - self.failed_jobs = failed_jobs - super().__init__( - f"Destination {destination_name} has failed jobs in load package {load_id}" - ) - - class PipelineException(DltException): def __init__(self, pipeline_name: str, msg: str) -> None: """Base class for all pipeline exceptions. Should not be raised.""" diff --git a/dlt/common/pipeline.py b/dlt/common/pipeline.py index 3cbaafefbe..57dda11c39 100644 --- a/dlt/common/pipeline.py +++ b/dlt/common/pipeline.py @@ -32,24 +32,18 @@ from dlt.common.configuration.paths import get_dlt_data_dir from dlt.common.configuration.specs import RunConfiguration from dlt.common.destination import TDestinationReferenceArg, TDestination -from dlt.common.exceptions import ( - DestinationHasFailedJobs, - PipelineStateNotAvailable, - SourceSectionNotAvailable, -) +from dlt.common.destination.exceptions import DestinationHasFailedJobs +from dlt.common.exceptions import PipelineStateNotAvailable, SourceSectionNotAvailable from dlt.common.schema import Schema from dlt.common.schema.typing import TColumnNames, TColumnSchema, TWriteDisposition, TSchemaContract from dlt.common.source import get_current_pipe_name from dlt.common.storages.load_storage import LoadPackageInfo -from dlt.common.storages.load_package import PackageStorage - from dlt.common.time import ensure_pendulum_datetime, precise_time from dlt.common.typing import DictStrAny, REPattern, StrAny, SupportsHumanize from dlt.common.jsonpath import delete_matches, TAnyJsonPath from dlt.common.data_writers.writers import DataWriterMetrics, TLoaderFileFormat from dlt.common.utils import RowCounts, merge_row_counts from dlt.common.versioned_state import TVersionedState -from dlt.common.storages.load_package import TLoadPackageState class _StepInfo(NamedTuple): diff --git a/dlt/common/schema/schema.py b/dlt/common/schema/schema.py index 92598fff44..c738f1753e 100644 --- a/dlt/common/schema/schema.py +++ b/dlt/common/schema/schema.py @@ -1,4 +1,3 @@ -import yaml from copy import copy, deepcopy from typing import ClassVar, Dict, List, Mapping, Optional, Sequence, Tuple, Any, cast, Literal from dlt.common import json @@ -76,8 +75,8 @@ class Schema: _schema_name: str _dlt_tables_prefix: str - _stored_version: int # version at load/creation time - _stored_version_hash: str # version hash at load/creation time + _stored_version: int # version at load time + _stored_version_hash: str # version hash at load time _stored_previous_hashes: Optional[List[str]] # list of ancestor hashes of the schema _imported_version_hash: str # version hash of recently imported schema _schema_description: str # optional schema description @@ -136,12 +135,10 @@ def replace_schema_content( # do not bump version so hash from `schema` is preserved stored_schema = 
schema.to_dict(bump_version=False) if link_to_replaced_schema: - replaced_version_hash = self.stored_version_hash - assert replaced_version_hash is not None + replaced_version_hash = self.version_hash # do not store hash if the replaced schema is identical - if stored_schema["version_hash"] != replaced_version_hash: + if schema.version_hash != replaced_version_hash: utils.store_prev_hash(stored_schema, replaced_version_hash) - stored_schema["version_hash"] = replaced_version_hash self._reset_schema(schema.name, schema._normalizers_config) self._from_stored_schema(stored_schema) @@ -426,7 +423,7 @@ def update_table(self, partial_table: TPartialTableSchema) -> TPartialTableSchem self._schema_tables[table_name] = partial_table else: # merge tables performing additional checks - partial_table = utils.merge_tables(table, partial_table) + partial_table = utils.merge_table(table, partial_table) self.data_item_normalizer.extend_table(table_name) return partial_table @@ -442,19 +439,6 @@ def update_schema(self, schema: "Schema") -> None: self._settings = deepcopy(schema.settings) self._compile_settings() - def bump_version(self) -> Tuple[int, str]: - """Computes schema hash in order to check if schema content was modified. In such case the schema ``stored_version`` and ``stored_version_hash`` are updated. - - Should not be used in production code. The method ``to_dict`` will generate TStoredSchema with correct value, only once before persisting schema to storage. - - Returns: - Tuple[int, str]: Current (``stored_version``, ``stored_version_hash``) tuple - """ - self._stored_version, self._stored_version_hash, _, _ = utils.bump_version_if_modified( - self.to_dict(bump_version=False) - ) - return self._stored_version, self._stored_version_hash - def filter_row_with_hint(self, table_name: str, hint_type: TColumnHint, row: StrAny) -> StrAny: rv_row: DictStrAny = {} column_prop: TColumnProp = utils.hint_to_column_prop(hint_type) @@ -515,7 +499,7 @@ def normalize_table_identifiers(self, table: TTableSchema) -> TTableSchema: # re-index columns as the name changed, if name space was reduced then # some columns now clash with each other. 
so make sure that we merge columns that are already there if new_col_name in new_columns: - new_columns[new_col_name] = utils.merge_columns( + new_columns[new_col_name] = utils.merge_column( new_columns[new_col_name], c, merge_defaults=False ) else: @@ -631,6 +615,19 @@ def stored_version_hash(self) -> str: """Version hash of the schema content form the time of schema loading/creation.""" return self._stored_version_hash + @property + def is_modified(self) -> bool: + """Checks if schema was modified from the time it was saved or if this is a new schema + + A current version hash is computed and compared with stored version hash + """ + return self.version_hash != self._stored_version_hash + + @property + def is_new(self) -> bool: + """Checks if schema was ever saved""" + return self._stored_version_hash is None + @property def name(self) -> str: return self._schema_name @@ -646,22 +643,24 @@ def settings(self) -> TSchemaSettings: def to_pretty_json(self, remove_defaults: bool = True) -> str: d = self.to_dict(remove_defaults=remove_defaults) - return json.dumps(d, pretty=True) + return utils.to_pretty_json(d) def to_pretty_yaml(self, remove_defaults: bool = True) -> str: d = self.to_dict(remove_defaults=remove_defaults) - return yaml.dump(d, allow_unicode=True, default_flow_style=False, sort_keys=False) + return utils.to_pretty_yaml(d) def clone(self, with_name: str = None, update_normalizers: bool = False) -> "Schema": """Make a deep copy of the schema, optionally changing the name, and updating normalizers and identifiers in the schema if `update_normalizers` is True - Note that changing of name will break the previous version chain + Note that changing of name will set the schema as new """ - d = deepcopy(self.to_dict()) + d = deepcopy(self.to_dict(bump_version=False)) if with_name is not None: + d["version"] = d["version_hash"] = None + d.pop("imported_version_hash", None) d["name"] = with_name d["previous_hashes"] = [] - schema = Schema.from_dict(d) # type: ignore + schema = Schema.from_stored_schema(d) # update normalizers and possibly all schema identifiers if update_normalizers: schema.update_normalizers() @@ -782,7 +781,7 @@ def _coerce_non_null_value( # if there's incomplete new_column then merge it with inferred column if new_column: # use all values present in incomplete column to override inferred column - also the defaults - new_column = utils.merge_columns(inferred_column, new_column) + new_column = utils.merge_column(inferred_column, new_column) else: new_column = inferred_column @@ -807,6 +806,28 @@ def _infer_hint(self, hint_type: TColumnHint, _: Any, col_name: str) -> bool: else: return False + def _bump_version(self) -> Tuple[int, str]: + """Computes schema hash in order to check if schema content was modified. In such case the schema ``stored_version`` and ``stored_version_hash`` are updated. + + Should not be used directly. The method ``to_dict`` will generate TStoredSchema with correct value, only once before persisting schema to storage. 
+ + Returns: + Tuple[int, str]: Current (``stored_version``, ``stored_version_hash``) tuple + """ + self._stored_version, self._stored_version_hash, _, _ = utils.bump_version_if_modified( + self.to_dict(bump_version=False) + ) + return self._stored_version, self._stored_version_hash + + def _drop_version(self) -> None: + """Stores first prev hash as stored hash and decreases numeric version""" + if len(self.previous_hashes) == 0 or self._stored_version is None: + self._stored_version = None + self._stored_version_hash = None + else: + self._stored_version -= 1 + self._stored_version_hash = self._stored_previous_hashes.pop(0) + def _add_standard_tables(self) -> None: self._schema_tables[self.version_table_name] = self.normalize_table_identifiers( utils.version_table() @@ -849,7 +870,7 @@ def _configure_normalizers(self, normalizers: TNormalizersConfig) -> None: def _reset_schema(self, name: str, normalizers: TNormalizersConfig = None) -> None: self._schema_tables: TSchemaTables = {} self._schema_name: str = None - self._stored_version = 1 + self._stored_version = None self._stored_version_hash: str = None self._imported_version_hash: str = None self._schema_description: str = None @@ -878,8 +899,6 @@ def _reset_schema(self, name: str, normalizers: TNormalizersConfig = None) -> No self._add_standard_tables() # compile all known regexes self._compile_settings() - # set initial version hash - self._stored_version_hash = self.version_hash def _from_stored_schema(self, stored_schema: TStoredSchema) -> None: self._schema_tables = stored_schema.get("tables") or {} diff --git a/dlt/common/schema/utils.py b/dlt/common/schema/utils.py index 4f2a4aa22d..0a4e00759d 100644 --- a/dlt/common/schema/utils.py +++ b/dlt/common/schema/utils.py @@ -1,7 +1,7 @@ import re import base64 import hashlib - +import yaml from copy import deepcopy, copy from typing import Dict, List, Sequence, Tuple, Type, Any, cast, Iterable, Optional, Union @@ -164,9 +164,11 @@ def bump_version_if_modified(stored_schema: TStoredSchema) -> Tuple[int, str, st """Bumps the `stored_schema` version and version hash if content modified, returns (new version, new hash, old hash, 10 last hashes) tuple""" hash_ = generate_version_hash(stored_schema) previous_hash = stored_schema.get("version_hash") + previous_version = stored_schema.get("version") if not previous_hash: # if hash was not set, set it without bumping the version, that's initial schema - pass + # previous_version may not be None for migrating schemas + stored_schema["version"] = previous_version or 1 elif hash_ != previous_hash: stored_schema["version"] += 1 store_prev_hash(stored_schema, previous_hash) @@ -340,7 +342,7 @@ def compare_complete_columns(a: TColumnSchema, b: TColumnSchema) -> bool: return a["data_type"] == b["data_type"] and a["name"] == b["name"] -def merge_columns( +def merge_column( col_a: TColumnSchema, col_b: TColumnSchema, merge_defaults: bool = True ) -> TColumnSchema: """Merges `col_b` into `col_a`. if `merge_defaults` is True, only hints from `col_b` that are not default in `col_a` will be set. @@ -354,7 +356,7 @@ def merge_columns( return col_a -def diff_tables(tab_a: TTableSchema, tab_b: TPartialTableSchema) -> TPartialTableSchema: +def diff_table(tab_a: TTableSchema, tab_b: TPartialTableSchema) -> TPartialTableSchema: """Creates a partial table that contains properties found in `tab_b` that are not present or different in `tab_a`. The name is always present in returned partial. 
It returns new columns (not present in tab_a) and merges columns from tab_b into tab_a (overriding non-default hint values). @@ -389,7 +391,7 @@ def diff_tables(tab_a: TTableSchema, tab_b: TPartialTableSchema) -> TPartialTabl None, ) # all other properties can change - merged_column = merge_columns(copy(col_a), col_b) + merged_column = merge_column(copy(col_a), col_b) if merged_column != col_a: new_columns.append(merged_column) else: @@ -428,11 +430,12 @@ def diff_tables(tab_a: TTableSchema, tab_b: TPartialTableSchema) -> TPartialTabl # return False -def merge_tables(table: TTableSchema, partial_table: TPartialTableSchema) -> TPartialTableSchema: +def merge_table(table: TTableSchema, partial_table: TPartialTableSchema) -> TPartialTableSchema: """Merges "partial_table" into "table". `table` is merged in place. Returns the diff partial table. `table` and `partial_table` names must be identical. A table diff is generated and applied to `table`: * new columns are added, updated columns are replaced from diff + * incomplete columns in `table` that got completed in `partial_table` are removed to preserve order * table hints are added or replaced from diff * nothing gets deleted """ @@ -441,14 +444,20 @@ def merge_tables(table: TTableSchema, partial_table: TPartialTableSchema) -> TPa raise TablePropertiesConflictException( table["name"], "name", table["name"], partial_table["name"] ) - diff_table = diff_tables(table, partial_table) + diff = diff_table(table, partial_table) + # remove incomplete columns in table that are complete in diff table + for col_name, column in diff["columns"].items(): + if is_complete_column(column): + table_column = table["columns"].get(col_name) + if table_column and not is_complete_column(table_column): + table["columns"].pop(col_name) # add new columns when all checks passed - table["columns"].update(diff_table["columns"]) + table["columns"].update(diff["columns"]) updated_columns = table["columns"] - table.update(diff_table) + table.update(diff) table["columns"] = updated_columns - return diff_table + return diff def has_table_seen_data(table: TTableSchema) -> bool: @@ -725,3 +734,11 @@ def standard_hints() -> Dict[TColumnHint, List[TSimpleRegex]]: def standard_type_detections() -> List[TTypeDetections]: return ["iso_timestamp"] + + +def to_pretty_json(stored_schema: TStoredSchema) -> str: + return json.dumps(stored_schema, pretty=True) + + +def to_pretty_yaml(stored_schema: TStoredSchema) -> str: + return yaml.dump(stored_schema, allow_unicode=True, default_flow_style=False, sort_keys=False) diff --git a/dlt/common/storages/live_schema_storage.py b/dlt/common/storages/live_schema_storage.py index d3d5f14fe5..fb94a21b7a 100644 --- a/dlt/common/storages/live_schema_storage.py +++ b/dlt/common/storages/live_schema_storage.py @@ -17,22 +17,17 @@ def __init__( def __getitem__(self, name: str) -> Schema: if name in self.live_schemas: schema = self.live_schemas[name] - else: - # return new schema instance - schema = super().load_schema(name) - self.update_live_schema(schema) - + if not self.is_live_schema_committed(name): + return schema + # return new schema instance + schema = self.load_schema(name) + schema = self.set_live_schema(schema) return schema - # def load_schema(self, name: str) -> Schema: - # self.commit_live_schema(name) - # # now live schema is saved so we can load it with the changes - # return super().load_schema(name) - def save_schema(self, schema: Schema) -> str: - rv = super().save_schema(schema) # update the live schema with schema being saved, if no 
live schema exist, create one to be available for a getter - self.update_live_schema(schema) + schema = self.set_live_schema(schema) + rv = super().save_schema(schema) return rv def remove_schema(self, name: str) -> None: @@ -40,44 +35,47 @@ def remove_schema(self, name: str) -> None: # also remove the live schema self.live_schemas.pop(name, None) - def save_import_schema_if_not_exists(self, schema: Schema) -> None: + def save_import_schema_if_not_exists(self, schema: Schema) -> bool: + """Saves import schema, if not exists. If schema was saved, link itself as imported from""" if self.config.import_schema_path: try: self._load_import_schema(schema.name) except FileNotFoundError: # save import schema only if it not exist self._export_schema(schema, self.config.import_schema_path) + # if import schema got saved then add own version hash as import version hash + schema._imported_version_hash = schema.version_hash + return True - def commit_live_schema(self, name: str) -> Schema: - # if live schema exists and is modified then it must be used as an import schema - live_schema = self.live_schemas.get(name) - if live_schema and live_schema.stored_version_hash != live_schema.version_hash: - live_schema.bump_version() - self._save_schema(live_schema) - return live_schema + return False + + def commit_live_schema(self, name: str) -> str: + """Saves live schema in storage if it was modified""" + if not self.is_live_schema_committed(name): + live_schema = self.live_schemas[name] + return self._save_schema(live_schema) + # not saved + return None def is_live_schema_committed(self, name: str) -> bool: """Checks if live schema is present in storage and have same hash""" live_schema = self.live_schemas.get(name) if live_schema is None: raise SchemaNotFoundError(name, f"live-schema://{name}") - try: - stored_schema_json = self._load_schema_json(name) - return live_schema.version_hash == cast(str, stored_schema_json.get("version_hash")) - except FileNotFoundError: - return False + return not live_schema.is_modified - def update_live_schema(self, schema: Schema, can_create_new: bool = True) -> None: - """Will update live schema content without writing to storage. 
Optionally allows to create a new live schema""" + def set_live_schema(self, schema: Schema) -> Schema: + """Will add or update live schema content without writing to storage.""" live_schema = self.live_schemas.get(schema.name) if live_schema: if id(live_schema) != id(schema): # replace content without replacing instance # print(f"live schema {live_schema} updated in place") live_schema.replace_schema_content(schema, link_to_replaced_schema=True) - elif can_create_new: + else: # print(f"live schema {schema.name} created from schema") - self.live_schemas[schema.name] = schema + live_schema = self.live_schemas[schema.name] = schema + return live_schema def list_schemas(self) -> List[str]: names = list(set(super().list_schemas()) | set(self.live_schemas.keys())) diff --git a/dlt/common/storages/load_package.py b/dlt/common/storages/load_package.py index bb66e28671..b9c36143ac 100644 --- a/dlt/common/storages/load_package.py +++ b/dlt/common/storages/load_package.py @@ -210,7 +210,7 @@ def schema_name(self) -> str: @property def schema_hash(self) -> str: - return self.schema.stored_version_hash + return self.schema.version_hash def asdict(self) -> DictStrAny: d = self._asdict() diff --git a/dlt/common/storages/schema_storage.py b/dlt/common/storages/schema_storage.py index 4745d50dcc..23b695b839 100644 --- a/dlt/common/storages/schema_storage.py +++ b/dlt/common/storages/schema_storage.py @@ -4,6 +4,7 @@ from dlt.common import json, logger from dlt.common.configuration import with_config from dlt.common.configuration.accessors import config +from dlt.common.schema.utils import to_pretty_json, to_pretty_yaml from dlt.common.storages.configuration import ( SchemaStorageConfiguration, TSchemaFileFormat, @@ -106,32 +107,33 @@ def _maybe_import_schema(self, name: str, storage_schema: DictStrAny = None) -> if storage_schema is None: # import schema when no schema in storage rv_schema = Schema.from_dict(imported_schema) - # if schema was imported, overwrite storage schema + # store import hash to self to track changes rv_schema._imported_version_hash = rv_schema.version_hash - self._save_schema(rv_schema) logger.info( f"Schema {name} not present in {self.storage.storage_path} and got imported" f" with version {rv_schema.stored_version} and imported hash" f" {rv_schema._imported_version_hash}" ) + # if schema was imported, overwrite storage schema + self._save_schema(rv_schema) + if self.config.export_schema_path: + self._export_schema(rv_schema, self.config.export_schema_path) else: # import schema when imported schema was modified from the last import - sc = Schema.from_dict(storage_schema) - rv_schema = Schema.from_dict(imported_schema) - if rv_schema.version_hash != sc._imported_version_hash: - # use imported schema but version must be bumped and imported hash set - rv_schema._stored_version = sc.stored_version + 1 - rv_schema._imported_version_hash = rv_schema.version_hash - # if schema was imported, overwrite storage schema - self._save_schema(rv_schema) + rv_schema = Schema.from_dict(storage_schema) + i_s = Schema.from_dict(imported_schema) + if i_s.version_hash != rv_schema._imported_version_hash: + rv_schema.replace_schema_content(i_s, link_to_replaced_schema=True) + rv_schema._imported_version_hash = i_s.version_hash logger.info( f"Schema {name} was present in {self.storage.storage_path} but is" - f" overwritten with imported schema version {rv_schema.stored_version} and" - f" imported hash {rv_schema._imported_version_hash}" + f" overwritten with imported schema version {i_s.version} and" + 
f" imported hash {i_s.version_hash}" ) - else: - # use storage schema as nothing changed - rv_schema = sc + # if schema was imported, overwrite storage schema + self._save_schema(rv_schema) + if self.config.export_schema_path: + self._export_schema(rv_schema, self.config.export_schema_path) except FileNotFoundError: # no schema to import -> skip silently and return the original if storage_schema is None: @@ -154,14 +156,11 @@ def _load_import_schema(self, name: str) -> DictStrAny: ) def _export_schema(self, schema: Schema, export_path: str) -> None: + stored_schema = schema.to_dict(remove_defaults=True) if self.config.external_schema_format == "json": - exported_schema_s = schema.to_pretty_json( - remove_defaults=self.config.external_schema_format_remove_defaults - ) + exported_schema_s = to_pretty_json(stored_schema) elif self.config.external_schema_format == "yaml": - exported_schema_s = schema.to_pretty_yaml( - remove_defaults=self.config.external_schema_format_remove_defaults - ) + exported_schema_s = to_pretty_yaml(stored_schema) else: raise ValueError(self.config.external_schema_format) @@ -170,13 +169,19 @@ def _export_schema(self, schema: Schema, export_path: str) -> None: export_storage.save(schema_file, exported_schema_s) logger.info( f"Schema {schema.name} exported to {export_path} with version" - f" {schema.stored_version} as {self.config.external_schema_format}" + f" {stored_schema['version']}:{stored_schema['version_hash']} as" + f" {self.config.external_schema_format}" ) def _save_schema(self, schema: Schema) -> str: # save a schema to schema store schema_file = self._file_name_in_store(schema.name, "json") - return self.storage.save(schema_file, schema.to_pretty_json(remove_defaults=False)) + stored_schema = schema.to_dict() + saved_path = self.storage.save(schema_file, to_pretty_json(stored_schema)) + # this should be the only place where this function is called. we bump a version and + # clean modified status + schema._bump_version() + return saved_path @staticmethod def load_schema_file( diff --git a/dlt/destinations/exceptions.py b/dlt/destinations/exceptions.py index cc4d4fd836..5e6adb007d 100644 --- a/dlt/destinations/exceptions.py +++ b/dlt/destinations/exceptions.py @@ -1,5 +1,6 @@ from typing import Sequence -from dlt.common.exceptions import ( + +from dlt.common.destination.exceptions import ( DestinationTerminalException, DestinationTransientException, DestinationUndefinedEntity, @@ -63,18 +64,6 @@ def __init__(self, table_name: str, columns: Sequence[str], msg: str) -> None: ) -class DestinationSchemaTampered(DestinationTerminalException): - def __init__(self, schema_name: str, version_hash: str, stored_version_hash: str) -> None: - self.version_hash = version_hash - self.stored_version_hash = stored_version_hash - super().__init__( - f"Schema {schema_name} content was changed - by a loader or by destination code - from" - " the moment it was retrieved by load package. Such schema cannot reliably be updated" - f" or saved. 
Current version hash: {version_hash} != stored version hash" - f" {stored_version_hash}" - ) - - class LoadJobNotExistsException(DestinationTerminalException): def __init__(self, job_id: str) -> None: super().__init__(f"Job with id/file name {job_id} not found") diff --git a/dlt/destinations/impl/dummy/dummy.py b/dlt/destinations/impl/dummy/dummy.py index c46e329819..0d91220d88 100644 --- a/dlt/destinations/impl/dummy/dummy.py +++ b/dlt/destinations/impl/dummy/dummy.py @@ -16,9 +16,12 @@ from dlt.common import pendulum from dlt.common.schema import Schema, TTableSchema, TSchemaTables -from dlt.common.schema.typing import TWriteDisposition from dlt.common.storages import FileStorage from dlt.common.destination import DestinationCapabilitiesContext +from dlt.common.destination.exceptions import ( + DestinationTerminalException, + DestinationTransientException, +) from dlt.common.destination.reference import ( FollowupJob, NewLoadJob, @@ -32,10 +35,7 @@ from dlt.destinations.exceptions import ( LoadJobNotExistsException, LoadJobInvalidStateTransitionException, - DestinationTerminalException, - DestinationTransientException, ) - from dlt.destinations.impl.dummy import capabilities from dlt.destinations.impl.dummy.configuration import DummyClientConfiguration from dlt.destinations.job_impl import NewReferenceJob diff --git a/dlt/destinations/impl/motherduck/configuration.py b/dlt/destinations/impl/motherduck/configuration.py index f4ab571e5c..35f02f709a 100644 --- a/dlt/destinations/impl/motherduck/configuration.py +++ b/dlt/destinations/impl/motherduck/configuration.py @@ -2,7 +2,7 @@ from dlt.common.configuration import configspec from dlt.common.destination.reference import DestinationClientDwhWithStagingConfiguration -from dlt.common.exceptions import DestinationTerminalException +from dlt.common.destination.exceptions import DestinationTerminalException from dlt.common.typing import TSecretValue from dlt.common.utils import digest128 from dlt.common.configuration.exceptions import ConfigurationValueError diff --git a/dlt/destinations/impl/qdrant/qdrant_client.py b/dlt/destinations/impl/qdrant/qdrant_client.py index 2df3023d86..febfe38ec9 100644 --- a/dlt/destinations/impl/qdrant/qdrant_client.py +++ b/dlt/destinations/impl/qdrant/qdrant_client.py @@ -283,6 +283,7 @@ def _delete_sentinel_collection(self) -> None: def update_stored_schema( self, only_tables: Iterable[str] = None, expected_update: TSchemaTables = None ) -> Optional[TSchemaTables]: + super().update_stored_schema(only_tables, expected_update) applied_update: TSchemaTables = {} schema_info = self.get_stored_schema_by_hash(self.schema.stored_version_hash) if schema_info is None: diff --git a/dlt/destinations/impl/weaviate/exceptions.py b/dlt/destinations/impl/weaviate/exceptions.py index bff1b4cacc..ee798e4e76 100644 --- a/dlt/destinations/impl/weaviate/exceptions.py +++ b/dlt/destinations/impl/weaviate/exceptions.py @@ -1,4 +1,4 @@ -from dlt.common.exceptions import DestinationException, DestinationTerminalException +from dlt.common.destination.exceptions import DestinationException, DestinationTerminalException class WeaviateBatchError(DestinationException): diff --git a/dlt/destinations/impl/weaviate/weaviate_client.py b/dlt/destinations/impl/weaviate/weaviate_client.py index 2d23dc38f7..6486a75e6e 100644 --- a/dlt/destinations/impl/weaviate/weaviate_client.py +++ b/dlt/destinations/impl/weaviate/weaviate_client.py @@ -14,7 +14,7 @@ cast, ) -from dlt.common.exceptions import ( +from dlt.common.destination.exceptions import ( 
DestinationUndefinedEntity, DestinationTransientException, DestinationTerminalException, @@ -424,6 +424,7 @@ def _delete_sentinel_class(self) -> None: def update_stored_schema( self, only_tables: Iterable[str] = None, expected_update: TSchemaTables = None ) -> Optional[TSchemaTables]: + super().update_stored_schema(only_tables, expected_update) # Retrieve the schema from Weaviate applied_update: TSchemaTables = {} try: diff --git a/dlt/destinations/job_client_impl.py b/dlt/destinations/job_client_impl.py index 7896fa2cc4..ea0d10d11d 100644 --- a/dlt/destinations/job_client_impl.py +++ b/dlt/destinations/job_client_impl.py @@ -35,13 +35,13 @@ ) from dlt.common.storages import FileStorage from dlt.common.schema import TColumnSchema, Schema, TTableSchemaColumns, TSchemaTables +from dlt.common.schema.typing import LOADS_TABLE_NAME, VERSION_TABLE_NAME from dlt.common.destination.reference import ( StateInfo, StorageSchemaInfo, WithStateSync, DestinationClientConfiguration, DestinationClientDwhConfiguration, - DestinationClientDwhWithStagingConfiguration, NewLoadJob, WithStagingDataset, TLoadJobState, @@ -50,15 +50,10 @@ FollowupJob, CredentialsConfiguration, ) -from dlt.destinations.exceptions import ( - DatabaseUndefinedRelation, - DestinationSchemaTampered, - DestinationSchemaWillNotUpdate, -) + +from dlt.destinations.exceptions import DatabaseUndefinedRelation from dlt.destinations.job_impl import EmptyLoadJobWithoutFollowup, NewReferenceJob from dlt.destinations.sql_jobs import SqlMergeJob, SqlStagingCopyJob -from dlt.common.schema.typing import LOADS_TABLE_NAME, VERSION_TABLE_NAME - from dlt.destinations.typing import TNativeConn from dlt.destinations.sql_client import SqlClientBase @@ -539,10 +534,6 @@ def _replace_schema_in_storage(self, schema: Schema) -> None: self._update_schema_in_storage(schema) def _update_schema_in_storage(self, schema: Schema) -> None: - # make sure that schema being saved was not modified from the moment it was loaded from storage - version_hash = schema.version_hash - if version_hash != schema.stored_version_hash: - raise DestinationSchemaTampered(schema.name, version_hash, schema.stored_version_hash) # get schema string or zip schema_str = json.dumps(schema.to_dict()) # TODO: not all databases store data as utf-8 but this exception is mostly for redshift diff --git a/dlt/extract/decorators.py b/dlt/extract/decorators.py index 6e916ff6e1..e5525519ec 100644 --- a/dlt/extract/decorators.py +++ b/dlt/extract/decorators.py @@ -28,6 +28,7 @@ from dlt.common.configuration.specs.config_section_context import ConfigSectionContext from dlt.common.exceptions import ArgumentsOverloadException from dlt.common.pipeline import PipelineContext +from dlt.common.schema.utils import DEFAULT_WRITE_DISPOSITION from dlt.common.source import _SOURCES, SourceInfo from dlt.common.schema.schema import Schema from dlt.common.schema.typing import ( @@ -447,7 +448,7 @@ def make_resource( ) -> DltResource: table_template = make_hints( table_name, - write_disposition=write_disposition, + write_disposition=write_disposition or DEFAULT_WRITE_DISPOSITION, columns=columns, primary_key=primary_key, merge_key=merge_key, diff --git a/dlt/extract/extract.py b/dlt/extract/extract.py index 3b3d0704ea..2fc4fd77aa 100644 --- a/dlt/extract/extract.py +++ b/dlt/extract/extract.py @@ -76,8 +76,7 @@ def choose_schema() -> Schema: """Except of explicitly passed schema, use a clone that will get discarded if extraction fails""" if schema: schema_ = schema - # TODO: We should start with a new schema of the 
same name here ideally, but many tests fail - # because of this. So some investigation is needed. + # take pipeline schema to make newest version visible to the resources elif pipeline.default_schema_name: schema_ = pipeline.schemas[pipeline.default_schema_name].clone() else: diff --git a/dlt/extract/extractors.py b/dlt/extract/extractors.py index 52ecd66920..b8e615aae4 100644 --- a/dlt/extract/extractors.py +++ b/dlt/extract/extractors.py @@ -105,12 +105,16 @@ def write_items(self, resource: DltResource, items: TDataItems, meta: Any) -> No if isinstance(meta, HintsMeta): # update the resource with new hints, remove all caches so schema is recomputed # and contracts re-applied - resource.merge_hints(meta.hints) + resource.merge_hints(meta.hints, meta.create_table_variant) + # convert to table meta if created table variant so item is assigned to this table + if meta.create_table_variant: + # name in hints meta must be a string, otherwise merge_hints would fail + meta = TableNameMeta(meta.hints["name"]) # type: ignore[arg-type] self._reset_contracts_cache() if table_name := self._get_static_table_name(resource, meta): # write item belonging to table with static name - self._write_to_static_table(resource, table_name, items) + self._write_to_static_table(resource, table_name, items, meta) else: # table has name or other hints depending on data items self._write_to_dynamic_table(resource, items) @@ -157,30 +161,32 @@ def _write_to_dynamic_table(self, resource: DltResource, items: TDataItems) -> N if table_name in self._filtered_tables: continue if table_name not in self._table_contracts or resource._table_has_other_dynamic_hints: - item = self._compute_and_update_table(resource, table_name, item) + item = self._compute_and_update_table( + resource, table_name, item, TableNameMeta(table_name) + ) # write to storage with inferred table name if table_name not in self._filtered_tables: self._write_item(table_name, resource.name, item) def _write_to_static_table( - self, resource: DltResource, table_name: str, items: TDataItems + self, resource: DltResource, table_name: str, items: TDataItems, meta: Any ) -> None: if table_name not in self._table_contracts: - items = self._compute_and_update_table(resource, table_name, items) + items = self._compute_and_update_table(resource, table_name, items, meta) if table_name not in self._filtered_tables: self._write_item(table_name, resource.name, items) - def _compute_table(self, resource: DltResource, items: TDataItems) -> TTableSchema: + def _compute_table(self, resource: DltResource, items: TDataItems, meta: Any) -> TTableSchema: """Computes a schema for a new or dynamic table and normalizes identifiers""" - return self.schema.normalize_table_identifiers(resource.compute_table_schema(items)) + return self.schema.normalize_table_identifiers(resource.compute_table_schema(items, meta)) def _compute_and_update_table( - self, resource: DltResource, table_name: str, items: TDataItems + self, resource: DltResource, table_name: str, items: TDataItems, meta: Any ) -> TDataItems: """ Computes new table and does contract checks, if false is returned, the table may not be created and no items should be written """ - computed_table = self._compute_table(resource, items) + computed_table = self._compute_table(resource, items, meta) # overwrite table name (if coming from meta) computed_table["name"] = table_name # get or compute contract @@ -193,7 +199,7 @@ def _compute_and_update_table( computed_table["x-normalizer"] = {"evolve-columns-once": True} # type: 
ignore[typeddict-unknown-key] existing_table = self.schema._schema_tables.get(table_name, None) if existing_table: - diff_table = utils.diff_tables(existing_table, computed_table) + diff_table = utils.diff_table(existing_table, computed_table) else: diff_table = computed_table @@ -300,9 +306,11 @@ def _write_item( ] super()._write_item(table_name, resource_name, items, columns) - def _compute_table(self, resource: DltResource, items: TDataItems) -> TPartialTableSchema: + def _compute_table( + self, resource: DltResource, items: TDataItems, meta: Any + ) -> TPartialTableSchema: items = items[0] - computed_table = super()._compute_table(resource, items) + computed_table = super()._compute_table(resource, items, Any) # Merge the columns to include primary_key and other hints that may be set on the resource arrow_table = copy(computed_table) @@ -329,9 +337,9 @@ def _compute_table(self, resource: DltResource, items: TDataItems) -> TPartialTa return arrow_table def _compute_and_update_table( - self, resource: DltResource, table_name: str, items: TDataItems + self, resource: DltResource, table_name: str, items: TDataItems, meta: Any ) -> TDataItems: - items = super()._compute_and_update_table(resource, table_name, items) + items = super()._compute_and_update_table(resource, table_name, items, meta) # filter data item as filters could be updated in compute table items = [self._apply_contract_filters(item, resource, table_name) for item in items] return items diff --git a/dlt/extract/hints.py b/dlt/extract/hints.py index 54ce00a806..01a99a23fe 100644 --- a/dlt/extract/hints.py +++ b/dlt/extract/hints.py @@ -12,7 +12,8 @@ TTableFormat, TSchemaContract, ) -from dlt.common.schema.utils import DEFAULT_WRITE_DISPOSITION, merge_columns, new_column, new_table +from dlt.common import logger +from dlt.common.schema.utils import DEFAULT_WRITE_DISPOSITION, merge_column, new_column, new_table from dlt.common.typing import TDataItem, DictStrAny, DictStrStr from dlt.common.utils import update_dict_nested from dlt.common.validation import validate_dict_ignoring_xkeys @@ -21,7 +22,7 @@ InconsistentTableTemplate, ) from dlt.extract.incremental import Incremental -from dlt.extract.items import TFunHintTemplate, TTableHintTemplate, ValidateItem +from dlt.extract.items import TFunHintTemplate, TTableHintTemplate, TableNameMeta, ValidateItem from dlt.extract.utils import ensure_table_schema_columns, ensure_table_schema_columns_hint from dlt.extract.validation import create_item_validator @@ -43,12 +44,14 @@ class TResourceHints(TypedDict, total=False): class HintsMeta: - __slots__ = "hints" + __slots__ = ("hints", "create_table_variant") - hints: TResourceHints - - def __init__(self, hints: TResourceHints) -> None: + def __init__(self, hints: TResourceHints, create_table_variant: bool) -> None: self.hints = hints + self.create_table_variant = create_table_variant + + +NATURAL_CALLABLES = ["incremental", "validator", "original_columns"] def make_hints( @@ -105,8 +108,11 @@ def __init__(self, table_schema_template: TResourceHints = None): self._table_name_hint_fun: TFunHintTemplate[str] = None self._table_has_other_dynamic_hints: bool = False self._hints: TResourceHints = None + """Hints for the resource""" + self._hints_variants: Dict[str, TResourceHints] = {} + """Hints for tables emitted from resources""" if table_schema_template: - self.set_hints(table_schema_template) + self._set_hints(table_schema_template) @property def name(self) -> str: @@ -143,16 +149,23 @@ def columns(self) -> 
TTableHintTemplate[TTableSchemaColumns]: def schema_contract(self) -> TTableHintTemplate[TSchemaContract]: return self._hints.get("schema_contract") - def compute_table_schema(self, item: TDataItem = None) -> TTableSchema: - """Computes the table schema based on hints and column definitions passed during resource creation. `item` parameter is used to resolve table hints based on data.""" - if not self._hints: + def compute_table_schema(self, item: TDataItem = None, meta: Any = None) -> TTableSchema: + """Computes the table schema based on hints and column definitions passed during resource creation. + `item` parameter is used to resolve table hints based on data. + `meta` parameter is taken from Pipe and may further specify table name if variant is to be used + """ + if isinstance(meta, TableNameMeta): + # look for variant + table_template = self._hints_variants.get(meta.table_name, self._hints) + else: + table_template = self._hints + if not table_template: return new_table(self.name, resource=self.name) # resolve a copy of a held template - table_template = copy(self._hints) + table_template = self._clone_hints(table_template) if "name" not in table_template: table_template["name"] = self.name - table_template["columns"] = copy(self._hints["columns"]) # if table template present and has dynamic hints, the data item must be provided. if self._table_name_hint_fun and item is None: @@ -161,7 +174,7 @@ def compute_table_schema(self, item: TDataItem = None) -> TTableSchema: resolved_template: TResourceHints = { k: self._resolve_hint(item, v) for k, v in table_template.items() - if k not in ["incremental", "validator", "original_columns"] + if k not in NATURAL_CALLABLES } # type: ignore table_schema = self._merge_keys(resolved_template) table_schema["resource"] = self.name @@ -184,9 +197,14 @@ def apply_hints( schema_contract: TTableHintTemplate[TSchemaContract] = None, additional_table_hints: Optional[Dict[str, TTableHintTemplate[Any]]] = None, table_format: TTableHintTemplate[TTableFormat] = None, + create_table_variant: bool = False, ) -> None: """Creates or modifies existing table schema by setting provided hints. Accepts both static and dynamic hints based on data. + If `create_table_variant` is specified, the `table_name` must be a string and hints will be used to create a separate set of hints + for a particular `table_name`. Such hints may be retrieved via compute_table_schema(meta=TableNameMeta(table_name)). + Table variant hints may not contain dynamic hints. + This method accepts the same table hints arguments as `dlt.resource` decorator with the following additions. Skip the argument or pass None to leave the existing hint. Pass empty value (for a particular type i.e. "" for a string) to remove a hint. @@ -197,7 +215,24 @@ def apply_hints( Please note that for efficient incremental loading, the resource must be aware of the Incremental by accepting it as one if its arguments and then using are to skip already loaded data. In non-aware resources, `dlt` will filter out the loaded values, however, the resource will yield all the values again. 
""" - if not self._hints: + if create_table_variant: + if not isinstance(table_name, str): + raise ValueError( + "Please provide string table name if you want to create a table variant of" + " hints" + ) + # select hints variant + t = self._hints_variants.get(table_name, None) + if t is None: + # use resource hints as starting point + if self._hints: + t = self._clone_hints(self._hints) + # but remove callables + t = {n: h for n, h in t.items() if not callable(h)} # type: ignore[assignment] + else: + t = self._hints + + if t is None: # if there is no template yet, create and set a new one. default_wd = None if parent_table_name else DEFAULT_WRITE_DISPOSITION t = make_hints( @@ -211,8 +246,7 @@ def apply_hints( table_format, ) else: - # set single hints - t = self._clone_hints(self._hints) + t = self._clone_hints(t) if table_name is not None: if table_name: t["name"] = table_name @@ -279,20 +313,46 @@ def apply_hints( if incremental is not None: t["incremental"] = None if incremental is Incremental.EMPTY else incremental - self.set_hints(t) + self._set_hints(t, create_table_variant) - def set_hints(self, hints_template: TResourceHints) -> None: + def _set_hints( + self, hints_template: TResourceHints, create_table_variant: bool = False + ) -> None: DltResourceHints.validate_dynamic_hints(hints_template) - # if "name" is callable in the template, then the table schema requires data item to be inferred. - name_hint = hints_template.get("name") - self._table_name_hint_fun = name_hint if callable(name_hint) else None - # check if any other hints in the table template should be inferred from data. - self._table_has_other_dynamic_hints = any( - callable(v) for k, v in hints_template.items() if k != "name" - ) - self._hints = hints_template + if create_table_variant: + table_name: str = hints_template["name"] # type: ignore[assignment] + # incremental cannot be specified in variant + if hints_template.get("incremental"): + raise InconsistentTableTemplate( + f"You can specify incremental only for the resource `{self.name}` hints, not in" + f" table `{table_name}` variant-" + ) + if hints_template.get("validator"): + logger.warning( + f"A data item validator was created from column schema in {self.name} for a" + f" table `{table_name}` variant. Currently such validator is ignored." + ) + # dynamic hints will be ignored + for name, hint in hints_template.items(): + if callable(hint) and name not in NATURAL_CALLABLES: + raise InconsistentTableTemplate( + f"Table `{table_name}` variant hint is resource {self.name} cannot have" + f" dynamic hint but {name} does." + ) + self._hints_variants[table_name] = hints_template + else: + # if "name" is callable in the template, then the table schema requires data item to be inferred. + name_hint = hints_template.get("name") + self._table_name_hint_fun = name_hint if callable(name_hint) else None + # check if any other hints in the table template should be inferred from data. 
+ self._table_has_other_dynamic_hints = any( + callable(v) for k, v in hints_template.items() if k != "name" + ) + self._hints = hints_template - def merge_hints(self, hints_template: TResourceHints) -> None: + def merge_hints( + self, hints_template: TResourceHints, create_table_variant: bool = False + ) -> None: self.apply_hints( table_name=hints_template.get("name"), parent_table_name=hints_template.get("parent"), @@ -303,6 +363,7 @@ def merge_hints(self, hints_template: TResourceHints) -> None: incremental=hints_template.get("incremental"), schema_contract=hints_template.get("schema_contract"), table_format=hints_template.get("table_format"), + create_table_variant=create_table_variant, ) @staticmethod @@ -324,7 +385,7 @@ def _merge_key(hint: TColumnProp, keys: TColumnNames, partial: TPartialTableSche keys = [keys] for key in keys: if key in partial["columns"]: - merge_columns(partial["columns"][key], {hint: True}) # type: ignore + merge_column(partial["columns"][key], {hint: True}) # type: ignore else: partial["columns"][key] = new_column(key, nullable=False) partial["columns"][key][hint] = True @@ -347,9 +408,7 @@ def validate_dynamic_hints(template: TResourceHints) -> None: table_name = template.get("name") # if any of the hints is a function, then name must be as well. if any( - callable(v) - for k, v in template.items() - if k not in ["name", "incremental", "validator", "original_columns"] + callable(v) for k, v in template.items() if k not in ["name", *NATURAL_CALLABLES] ) and not callable(table_name): raise InconsistentTableTemplate( f"Table name {table_name} must be a function if any other table hint is a function" diff --git a/dlt/extract/items.py b/dlt/extract/items.py index c6e1f0a4b8..fec31e2846 100644 --- a/dlt/extract/items.py +++ b/dlt/extract/items.py @@ -81,10 +81,7 @@ class SourcePipeItem(NamedTuple): class DataItemWithMeta: - __slots__ = "meta", "data" - - meta: Any - data: TDataItems + __slots__ = ("meta", "data") def __init__(self, meta: Any, data: TDataItems) -> None: self.meta = meta @@ -92,9 +89,7 @@ def __init__(self, meta: Any, data: TDataItems) -> None: class TableNameMeta: - __slots__ = "table_name" - - table_name: str + __slots__ = ("table_name",) def __init__(self, table_name: str) -> None: self.table_name = table_name diff --git a/dlt/extract/resource.py b/dlt/extract/resource.py index 0fef502112..4776158bbb 100644 --- a/dlt/extract/resource.py +++ b/dlt/extract/resource.py @@ -63,13 +63,17 @@ def with_table_name(item: TDataItems, table_name: str) -> DataItemWithMeta: return DataItemWithMeta(TableNameMeta(table_name), item) -def with_hints(item: TDataItems, hints: TResourceHints) -> DataItemWithMeta: +def with_hints( + item: TDataItems, hints: TResourceHints, create_table_variant: bool = False +) -> DataItemWithMeta: """Marks `item` to update the resource with specified `hints`. + Will create a separate variant of hints for a table if `name` is provided in `hints` and `create_table_variant` is set. + Create `TResourceHints` with `make_hints`. 
Setting `table_name` will dispatch the `item` to a specified table, like `with_table_name` """ - return DataItemWithMeta(HintsMeta(hints), item) + return DataItemWithMeta(HintsMeta(hints, create_table_variant), item) class DltResource(Iterable[TDataItem], DltResourceHints): @@ -388,25 +392,29 @@ def add_step( self._pipe.insert_step(item_transform, insert_at) return self - def set_hints(self, table_schema_template: TResourceHints) -> None: - super().set_hints(table_schema_template) - incremental = self.incremental - # try to late assign incremental - if table_schema_template.get("incremental") is not None: - if incremental: - incremental._incremental = table_schema_template["incremental"] - else: - # if there's no wrapper add incremental as a transform - incremental = table_schema_template["incremental"] # type: ignore - self.add_step(incremental) + def _set_hints( + self, table_schema_template: TResourceHints, create_table_variant: bool = False + ) -> None: + super()._set_hints(table_schema_template, create_table_variant) + # validators and incremental apply only to resource hints + if not create_table_variant: + incremental = self.incremental + # try to late assign incremental + if table_schema_template.get("incremental") is not None: + if incremental: + incremental._incremental = table_schema_template["incremental"] + else: + # if there's no wrapper add incremental as a transform + incremental = table_schema_template["incremental"] # type: ignore + self.add_step(incremental) - if incremental: - primary_key = table_schema_template.get("primary_key", incremental.primary_key) - if primary_key is not None: - incremental.primary_key = primary_key + if incremental: + primary_key = table_schema_template.get("primary_key", incremental.primary_key) + if primary_key is not None: + incremental.primary_key = primary_key - if table_schema_template.get("validator") is not None: - self.validator = table_schema_template["validator"] + if table_schema_template.get("validator") is not None: + self.validator = table_schema_template["validator"] def bind(self, *args: Any, **kwargs: Any) -> "DltResource": """Binds the parametrized resource to passed arguments. Modifies resource pipe in place. 
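# --- Illustrative usage sketch (not part of the patch) ---------------------
# A minimal example of the table-variant hints introduced by this change. The
# import paths are assumptions based on where the helpers appear in this diff
# (`with_hints` in dlt/extract/resource.py, `make_hints` in the hints module);
# the variant mechanics (`create_table_variant=True` on `with_hints` and
# `apply_hints`) are the ones added above.
import dlt
from dlt.extract.hints import make_hints  # assumed import path
from dlt.extract.resource import with_hints


@dlt.resource(write_disposition="merge", primary_key="id")
def users():
    # regular items go to the "users" table and use the resource hints
    yield {"id": 1, "name": "alice"}
    # this item is dispatched to the "users_audit" table; with
    # create_table_variant=True the hints are stored as a per-table variant
    # instead of replacing the resource hints
    yield with_hints(
        {"id": 1, "op": "update"},
        make_hints(table_name="users_audit", write_disposition="append"),
        create_table_variant=True,
    )


# alternatively, declare the variant up front on the resource; this stores the
# hints in _hints_variants keyed by table name, leaving resource hints intact
users.apply_hints(
    table_name="users_audit", write_disposition="append", create_table_variant=True
)
# ---------------------------------------------------------------------------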
Does not evaluate generators or iterators.""" diff --git a/dlt/helpers/streamlit_app/utils.py b/dlt/helpers/streamlit_app/utils.py index 6b2dab495c..cf1728c33b 100644 --- a/dlt/helpers/streamlit_app/utils.py +++ b/dlt/helpers/streamlit_app/utils.py @@ -38,9 +38,7 @@ def render_with_pipeline(render_func: Callable[..., None]) -> None: render_func(pipeline) -def query_using_cache( - pipeline: dlt.Pipeline, ttl: int -) -> Callable[..., Optional[pd.DataFrame]]: +def query_using_cache(pipeline: dlt.Pipeline, ttl: int) -> Callable[..., Optional[pd.DataFrame]]: @st.cache_data(ttl=ttl) def do_query( # type: ignore[return] query: str, diff --git a/dlt/load/exceptions.py b/dlt/load/exceptions.py index 8a704660ce..e85dffd2e9 100644 --- a/dlt/load/exceptions.py +++ b/dlt/load/exceptions.py @@ -1,10 +1,8 @@ from typing import Sequence -from dlt.destinations.exceptions import DestinationTerminalException, DestinationTransientException - - -# class LoadException(DltException): -# def __init__(self, msg: str) -> None: -# super().__init__(msg) +from dlt.common.destination.exceptions import ( + DestinationTerminalException, + DestinationTransientException, +) class LoadClientJobFailed(DestinationTerminalException): diff --git a/dlt/load/load.py b/dlt/load/load.py index 23c3dea820..a0909fa2d0 100644 --- a/dlt/load/load.py +++ b/dlt/load/load.py @@ -21,15 +21,10 @@ from dlt.common.runners import TRunMetrics, Runnable, workermethod, NullExecutor from dlt.common.runtime.collector import Collector, NULL_COLLECTOR from dlt.common.runtime.logger import pretty_format_exception -from dlt.common.exceptions import ( - TerminalValueError, - DestinationTerminalException, - DestinationTransientException, -) +from dlt.common.exceptions import TerminalValueError from dlt.common.configuration.container import Container - +from dlt.common.configuration.specs.config_section_context import ConfigSectionContext from dlt.common.schema import Schema, TSchemaTables - from dlt.common.storages import LoadStorage from dlt.common.destination.reference import ( DestinationClientDwhConfiguration, @@ -44,7 +39,10 @@ SupportsStagingDestination, TDestination, ) -from dlt.common.configuration.specs.config_section_context import ConfigSectionContext +from dlt.common.destination.exceptions import ( + DestinationTerminalException, + DestinationTransientException, +) from dlt.destinations.job_impl import EmptyLoadJob diff --git a/dlt/normalize/items_normalizers.py b/dlt/normalize/items_normalizers.py index 8565a5d2b2..fc1e152ff2 100644 --- a/dlt/normalize/items_normalizers.py +++ b/dlt/normalize/items_normalizers.py @@ -199,7 +199,7 @@ def __call__( root_table_name, items, may_have_pua(line), skip_write=False ) schema_updates.append(partial_update) - logger.debug(f"Processed {line_no} lines from file {extracted_items_file}") + logger.debug(f"Processed {line_no+1} lines from file {extracted_items_file}") if line is None and root_table_name in self.schema.tables: # TODO: we should push the truncate jobs via package state # not as empty jobs. 
empty jobs should be reserved for diff --git a/dlt/normalize/normalize.py b/dlt/normalize/normalize.py index 18f8faaa25..4a17b9eef8 100644 --- a/dlt/normalize/normalize.py +++ b/dlt/normalize/normalize.py @@ -309,11 +309,16 @@ def spool_files( f"Table {table_name} has seen data for a first time with load id {load_id}" ) x_normalizer["seen-data"] = True - logger.info( - f"Saving schema {schema.name} with version {schema.stored_version}:{schema.version}" - ) # schema is updated, save it to schema volume - self.schema_storage.save_schema(schema) + if schema.is_modified: + logger.info( + f"Saving schema {schema.name} with version {schema.stored_version}:{schema.version}" + ) + self.schema_storage.save_schema(schema) + else: + logger.info( + f"Schema {schema.name} with version {schema.version} was not modified. Save skipped" + ) # save schema new package self.load_storage.new_packages.save_schema(load_id, schema) # save schema updates even if empty @@ -376,8 +381,9 @@ def run(self, pool: Optional[Executor]) -> TRunMetrics: schema = self.normalize_storage.extracted_packages.load_schema(load_id) # prefer schema from schema storage if it exists try: - # also import the schema - storage_schema = self.schema_storage.load_schema(schema.name) + # use live schema instance via getter if on live storage, it will also do import + # schema as live schemas are committed before calling normalize + storage_schema = self.schema_storage[schema.name] if schema.stored_version_hash != storage_schema.stored_version_hash: logger.warning( f"When normalizing package {load_id} with schema {schema.name}: the storage" diff --git a/dlt/pipeline/helpers.py b/dlt/pipeline/helpers.py index 7bba5f84e7..c242a26eaa 100644 --- a/dlt/pipeline/helpers.py +++ b/dlt/pipeline/helpers.py @@ -12,7 +12,6 @@ from dlt.common.schema.typing import TSimpleRegex from dlt.common.typing import REPattern from dlt.common.pipeline import ( - TSourceState, reset_resource_state, _sources_state, _delete_source_state_keys, @@ -26,6 +25,7 @@ PipelineStepFailed, PipelineHasPendingDataException, ) +from dlt.pipeline.state_sync import force_state_extract from dlt.pipeline.typing import TPipelineStep from dlt.pipeline import Pipeline @@ -122,7 +122,7 @@ def __init__( else: self.tables_to_drop = [] self.drop_tables = False # No tables to drop - self.drop_state = not not self.state_paths_to_drop + self.drop_state = not not self.state_paths_to_drop # obtain truth value self.drop_all = drop_all self.info: _DropInfo = dict( @@ -167,10 +167,11 @@ def _drop_destination_tables(self) -> None: with client.with_staging_dataset(): client.drop_tables(*table_names, replace_schema=True) - def _delete_pipeline_tables(self) -> None: + def _delete_schema_tables(self) -> None: for tbl in self.tables_to_drop: del self.schema_tables[tbl["name"]] - self.schema.bump_version() + # bump schema, we'll save later + self.schema._bump_version() def _list_state_paths(self, source_state: Dict[str, Any]) -> List[str]: return resolve_paths(self.state_paths_to_drop, source_state) @@ -197,7 +198,7 @@ def _create_modified_state(self) -> Dict[str, Any]: self.info["state_paths"].extend(f"{source_name}.{p}" for p in resolved_paths) return state # type: ignore[return-value] - def _drop_state_keys(self) -> None: + def _extract_state(self) -> None: state: Dict[str, Any] with self.pipeline.managed_state(extract_state=True) as state: # type: ignore[assignment] state.clear() @@ -216,12 +217,12 @@ def __call__(self) -> None: return # Nothing to drop if self.drop_tables: - 
self._delete_pipeline_tables() + self._delete_schema_tables() self._drop_destination_tables() if self.drop_tables: self.pipeline.schemas.save_schema(self.schema) if self.drop_state: - self._drop_state_keys() + self._extract_state() # Send updated state to destination self.pipeline.normalize() try: @@ -230,8 +231,7 @@ def __call__(self) -> None: # Clear extracted state on failure so command can run again self.pipeline.drop_pending_packages() with self.pipeline.managed_state() as state: - state["_local"].pop("_last_extracted_at", None) - state["_local"].pop("_last_extracted_hash", None) + force_state_extract(state) raise diff --git a/dlt/pipeline/pipeline.py b/dlt/pipeline/pipeline.py index d1d558b3b8..efb6ae078b 100644 --- a/dlt/pipeline/pipeline.py +++ b/dlt/pipeline/pipeline.py @@ -28,14 +28,14 @@ ) from dlt.common.configuration.specs.config_section_context import ConfigSectionContext from dlt.common.configuration.resolve import initialize_credentials -from dlt.common.exceptions import ( +from dlt.common.destination.exceptions import ( DestinationLoadingViaStagingNotSupported, DestinationLoadingWithoutStagingNotSupported, DestinationNoStagingMode, - MissingDependencyException, DestinationUndefinedEntity, DestinationIncompatibleLoaderFileFormatException, ) +from dlt.common.exceptions import MissingDependencyException from dlt.common.normalizers import explicit_normalizers, import_normalizers from dlt.common.runtime import signals, initialize_runtime from dlt.common.schema.typing import ( @@ -129,6 +129,7 @@ PIPELINE_STATE_ENGINE_VERSION, bump_pipeline_state_version_if_modified, load_pipeline_state_from_destination, + mark_state_extracted, migrate_pipeline_state, state_resource, json_encode_state, @@ -172,7 +173,7 @@ def _wrap(self: "Pipeline", *args: Any, **kwargs: Any) -> Any: for name in list(self._schema_storage.live_schemas.keys()): try: schema = self._schema_storage.load_schema(name) - self._schema_storage.update_live_schema(schema, can_create_new=False) + schema.replace_schema_content(schema, link_to_replaced_schema=False) except FileNotFoundError: # no storage schema yet so pop live schema (created in call to f) self._schema_storage.live_schemas.pop(name, None) @@ -182,9 +183,10 @@ def _wrap(self: "Pipeline", *args: Any, **kwargs: Any) -> Any: else: # save modified live schemas for name, schema in self._schema_storage.live_schemas.items(): - self._schema_storage.commit_live_schema(name) # also save import schemas only here self._schema_storage.save_import_schema_if_not_exists(schema) + # only now save the schema, already linked to itself if saved as import schema + self._schema_storage.commit_live_schema(name) # refresh list of schemas if any new schemas are added self.schema_names = self._list_schemas_sorted() return rv @@ -488,7 +490,6 @@ def normalize( ) from n_ex @with_runtime_trace(send_state=True) - @with_schemas_sync @with_state_sync() @with_config_section((known_sections.LOAD,)) def load( @@ -725,8 +726,7 @@ def sync_destination( # set the pipeline props from merged state self._state_to_props(state) # add that the state is already extracted - state["_local"]["_last_extracted_hash"] = state["_version_hash"] - state["_local"]["_last_extracted_at"] = pendulum.now() + mark_state_extracted(state, state["_version_hash"]) # on merge schemas are replaced so we delete all old versions self._schema_storage.clear_storage() for schema in restored_schemas: @@ -1054,15 +1054,11 @@ def _extract_source( # discover the existing pipeline schema try: # all live schemas are initially committed 
and during the extract will accumulate changes in memory - # if schema is committed try to take schema from storage - if self._schema_storage.is_live_schema_committed(source.schema.name): - # this will (1) save live schema if modified (2) look for import schema if present - # (3) load import schema an overwrite pipeline schema if import schema modified - # (4) load pipeline schema if no import schema is present - pipeline_schema = self.schemas.load_schema(source.schema.name) - else: - # if schema is not committed we know we are in process of extraction - pipeline_schema = self.schemas[source.schema.name] + # line below may create another live schema if source schema is not a part of storage + # this will (1) look for import schema if present + # (2) load import schema an overwrite pipeline schema if import schema modified + # (3) load pipeline schema if no import schema is present + pipeline_schema = self.schemas[source.schema.name] pipeline_schema = pipeline_schema.clone() # use clone until extraction complete # apply all changes in the source schema to pipeline schema # NOTE: we do not apply contracts to changes done programmatically @@ -1080,7 +1076,7 @@ def _extract_source( # self._schema_storage.save_import_schema_if_not_exists(source.schema) # update live schema but not update the store yet - self._schema_storage.update_live_schema(source.schema) + source.schema = self._schema_storage.set_live_schema(source.schema) # set as default if this is first schema in pipeline if not self.default_schema_name: @@ -1560,9 +1556,11 @@ def _bump_version_and_extract_state( extract_ = extract or Extract( self._schema_storage, self._normalize_storage_config(), original_data=data ) - self._extract_source(extract_, data_to_sources(data, self)[0], 1, 1) - state["_local"]["_last_extracted_at"] = pendulum.now() - state["_local"]["_last_extracted_hash"] = hash_ + self._extract_source( + extract_, data_to_sources(data, self, self.default_schema)[0], 1, 1 + ) + # set state to be extracted + mark_state_extracted(state, hash_) # commit only if we created storage if not extract: extract_.commit_packages() diff --git a/dlt/pipeline/state_sync.py b/dlt/pipeline/state_sync.py index 8c72a218a4..5366b9c46d 100644 --- a/dlt/pipeline/state_sync.py +++ b/dlt/pipeline/state_sync.py @@ -68,6 +68,21 @@ def bump_pipeline_state_version_if_modified(state: TPipelineState) -> Tuple[int, return bump_state_version_if_modified(state, exclude_attrs=["_local"]) +def mark_state_extracted(state: TPipelineState, hash_: str) -> None: + """Marks state as extracted by setting last extracted hash to hash_ (which is current version_hash) + + `_last_extracted_hash` is kept locally and never synced with the destination + """ + state["_local"]["_last_extracted_at"] = pendulum.now() + state["_local"]["_last_extracted_hash"] = hash_ + + +def force_state_extract(state: TPipelineState) -> None: + """Forces `state` to be extracted by removing local information on the most recent extraction""" + state["_local"].pop("_last_extracted_at", None) + state["_local"].pop("_last_extracted_hash", None) + + def migrate_pipeline_state( pipeline_name: str, state: DictStrAny, from_engine: int, to_engine: int ) -> TPipelineState: diff --git a/docs/tools/fix_grammar_gpt.py b/docs/tools/fix_grammar_gpt.py index 1e4cf748dd..051448a2d4 100644 --- a/docs/tools/fix_grammar_gpt.py +++ b/docs/tools/fix_grammar_gpt.py @@ -41,7 +41,10 @@ parser.add_argument( "-f", "--files", - help="Specify the file name. 
Grammar Checker will filter all .md files containing this string in the filepath.", + help=( + "Specify the file name. Grammar Checker will filter all .md files containing this" + " string in the filepath." + ), type=str, ) diff --git a/docs/tools/utils.py b/docs/tools/utils.py index 074b19b8e1..b7d401b893 100644 --- a/docs/tools/utils.py +++ b/docs/tools/utils.py @@ -5,12 +5,15 @@ DOCS_DIR = "../website/docs" +BLOG_DIR = "../website/blog" def collect_markdown_files(verbose: bool) -> List[str]: """ Discovers all docs markdown files """ + + # collect docs pages markdown_files: List[str] = [] for path, _, files in os.walk(DOCS_DIR): if "api_reference" in path: @@ -23,6 +26,14 @@ def collect_markdown_files(verbose: bool) -> List[str]: if verbose: fmt.echo(f"Discovered {os.path.join(path, file)}") + # collect blog pages + for path, _, files in os.walk(BLOG_DIR): + for file in files: + if file.endswith(".md"): + markdown_files.append(os.path.join(path, file)) + if verbose: + fmt.echo(f"Discovered {os.path.join(path, file)}") + if len(markdown_files) < 50: # sanity check fmt.error("Found too few files. Something went wrong.") exit(1) diff --git a/docs/website/blog/2023-06-14-dlthub-gpt-accelerated learning_01.md b/docs/website/blog/2023-06-14-dlthub-gpt-accelerated learning_01.md index 394504dc64..08180b379e 100644 --- a/docs/website/blog/2023-06-14-dlthub-gpt-accelerated learning_01.md +++ b/docs/website/blog/2023-06-14-dlthub-gpt-accelerated learning_01.md @@ -47,9 +47,11 @@ The code provided below demonstrates training a chat-oriented GPT model using th -```python -!python3 -m pip install --upgrade langchain deeplake openai tiktoken +```sh +python -m pip install --upgrade langchain deeplake openai tiktoken +``` +```py # Create accounts on platform.openai.com and deeplake.ai. After registering, retrieve the access tokens for both platforms and securely store them for use in the next step. Enter the access tokens grabbed in the last step and enter them when prompted import os @@ -65,7 +67,7 @@ embeddings = OpenAIEmbeddings(disallowed_special=()) #### 2. Create a directory to store the code for training the model. Clone the desired repositories into that. -```python +```sh # making a new directory named dlt-repo !mkdir dlt-repo # changing the directory to dlt-repo @@ -80,7 +82,7 @@ embeddings = OpenAIEmbeddings(disallowed_special=()) ``` #### 3. Load the files from the directory -```python +```py import os from langchain.document_loaders import TextLoader @@ -95,7 +97,7 @@ for dirpath, dirnames, filenames in os.walk(root_dir): pass ``` #### 4. Load the files from the directory -```python +```py import os from langchain.document_loaders import TextLoader @@ -111,7 +113,7 @@ for dirpath, dirnames, filenames in os.walk(root_dir): ``` #### 5. Splitting files to chunks -```python +```py # This code uses CharacterTextSplitter to split documents into smaller chunksbased on character count and store the resulting chunks in the texts variable. from langchain.text_splitter import CharacterTextSplitter @@ -119,7 +121,8 @@ text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0) texts = text_splitter.split_documents(docs) ``` #### 6. Create Deeplake dataset -```python + +```sh # Set up your deeplake dataset by replacing the username with your Deeplake account and setting the dataset name. 
For example if the deeplakes username is “your_name” and the dataset is “dlt-hub-dataset” username = "your_deeplake_username" # replace with your username from app.activeloop.ai @@ -138,7 +141,7 @@ retriever.search_kwargs['maximal_marginal_relevance'] = True retriever.search_kwargs['k'] = 10 ``` #### 7. Initialize the GPT model -```python +```py from langchain.chat_models import ChatOpenAI from langchain.chains import ConversationalRetrievalChain diff --git a/docs/website/blog/2023-08-14-dlt-motherduck-blog.md b/docs/website/blog/2023-08-14-dlt-motherduck-blog.md index 9f48d808a5..21aa7139f3 100644 --- a/docs/website/blog/2023-08-14-dlt-motherduck-blog.md +++ b/docs/website/blog/2023-08-14-dlt-motherduck-blog.md @@ -70,7 +70,7 @@ This is a perfect problem to test out my new super simple and highly customizabl `dlt init bigquery duckdb` This creates a folder with the directory structure - ``` + ```text ├── .dlt │ ├── config.toml │ └── secrets.toml diff --git a/docs/website/blog/2023-08-21-dlt-lineage-support.md b/docs/website/blog/2023-08-21-dlt-lineage-support.md index a76f89ed6a..90f6eb58aa 100644 --- a/docs/website/blog/2023-08-21-dlt-lineage-support.md +++ b/docs/website/blog/2023-08-21-dlt-lineage-support.md @@ -63,7 +63,7 @@ By combining row and column level lineage, you can have an easy overview of wher After a pipeline run, the schema evolution info gets stored in the load info. Load it back to the database to persist the column lineage: -```python +```py load_info = pipeline.run(data, write_disposition="append", table_name="users") diff --git a/docs/website/blog/2023-08-24-dlt-etlt.md b/docs/website/blog/2023-08-24-dlt-etlt.md index 3e27a21338..fb8215c9a0 100644 --- a/docs/website/blog/2023-08-24-dlt-etlt.md +++ b/docs/website/blog/2023-08-24-dlt-etlt.md @@ -83,7 +83,7 @@ This engine is configurable in both how it works and what it does, you can read more here: [Normaliser, schema settings](https://dlthub.com/docs/general-usage/schema#data-normalizer) Here is a usage example (it's built into the pipeline): -```python +```py import dlt @@ -119,7 +119,7 @@ Besides your own customisations, `dlt` also supports injecting your transform co Here is a code example of pseudonymisation, a common case where data needs to be transformed before loading: -```python +```py import dlt import hashlib @@ -168,7 +168,7 @@ load_info = pipeline.run(data_source) Finally, once you have clean data loaded, you will probably prefer to use SQL and one of the standard tools. `dlt` offers a dbt runner to get you started easily with your transformation package. -```python +```py pipeline = dlt.pipeline( pipeline_name='pipedrive', destination='bigquery', diff --git a/docs/website/blog/2023-09-05-mongo-etl.md b/docs/website/blog/2023-09-05-mongo-etl.md index 19e1f18682..0e4a3d83f2 100644 --- a/docs/website/blog/2023-09-05-mongo-etl.md +++ b/docs/website/blog/2023-09-05-mongo-etl.md @@ -139,21 +139,21 @@ Here's a code explanation of how it works under the hood: example of how this nested data could look: ```json - data = { - 'id': 1, - 'name': 'Alice', - 'job': { + { + "id": 1, + "name": "Alice", + "job": { "company": "ScaleVector", - "title": "Data Scientist", + "title": "Data Scientist" }, - 'children': [ + "children": [ { - 'id': 1, - 'name': 'Eve' + "id": 1, + "name": "Eve" }, { - 'id': 2, - 'name': 'Wendy' + "id": 2, + "name": "Wendy" } ] } @@ -161,7 +161,7 @@ Here's a code explanation of how it works under the hood: 1. 
We can load the data to a supported destination declaratively: - ```python + ```py import dlt pipeline = dlt.pipeline( diff --git a/docs/website/blog/2023-09-26-verba-dlt-zendesk.md b/docs/website/blog/2023-09-26-verba-dlt-zendesk.md index 1990a5df7f..f3825b4427 100644 --- a/docs/website/blog/2023-09-26-verba-dlt-zendesk.md +++ b/docs/website/blog/2023-09-26-verba-dlt-zendesk.md @@ -40,7 +40,7 @@ In this blog post, we'll guide you through the process of building a RAG applica Create a new folder for your project and install Verba: -```bash +```sh mkdir verba-dlt-zendesk cd verba-dlt-zendesk python -m venv venv @@ -50,7 +50,7 @@ pip install goldenverba To configure Verba, we need to set the following environment variables: -```bash +```sh VERBA_URL=https://your-cluster.weaviate.network # your Weaviate instance URL VERBA_API_KEY=F8...i4WK # the API key of your Weaviate instance OPENAI_API_KEY=sk-...R # your OpenAI API key @@ -61,13 +61,13 @@ You can put them in a `.env` file in the root of your project or export them in Let's test that Verba is installed correctly: -```bash +```sh verba start ``` You should see the following output: -```bash +```sh INFO: Uvicorn running on (Press CTRL+C to quit) ℹ Setting up client ✔ Client connected to Weaviate Cluster @@ -88,7 +88,7 @@ If you try to ask a question now, you'll get an error in return. That's because We get our data from Zendesk using dlt. Let's install it along with the Weaviate extra: -```bash +```sh pip install "dlt[weaviate]" ``` @@ -96,7 +96,7 @@ This also installs a handy CLI tool called `dlt`. It will help us initialize the Let's initialize the verified source: -```bash +```sh dlt init zendesk weaviate ``` @@ -104,7 +104,7 @@ dlt init zendesk weaviate To make things easier, we'll use the email address and password authentication method for Zendesk API. Let's add our credentials to `secrets.toml`: -```yaml +```toml [sources.zendesk.credentials] password = "your-password" subdomain = "your-subdomain" @@ -113,14 +113,13 @@ email = "your-email@example.com" We also need to specify the URL and the API key of our Weaviate instance. Copy the credentials for the Weaviate instance you created earlier and add them to `secrets.toml`: -```yaml +```toml [destination.weaviate.credentials] url = "https://your-cluster.weaviate.network" api_key = "F8.....i4WK" [destination.weaviate.credentials.additional_headers] X-OpenAI-Api-Key = "sk-....." - ``` All the components are now in place and configured. Let's set up a pipeline to import data from Zendesk. @@ -129,7 +128,7 @@ All the components are now in place and configured. Let's set up a pipeline to i Open your favorite text editor and create a file called `zendesk_verba.py`. Add the following code to it: -```python +```py import itertools import dlt @@ -217,13 +216,13 @@ Finally, we run the pipeline and print the load info. Let's run the pipeline: -```bash +```sh python zendesk_verba.py ``` You should see the following output: -```bash +```sh Pipeline zendesk_verba completed in 8.27 seconds 1 load package(s) were loaded to destination weaviate and into dataset None The weaviate destination used location to store data @@ -235,13 +234,13 @@ Verba is now populated with data from Zendesk Support. 
However there are a coupl Run the following command: -```bash +```sh verba init ``` You should see the following output: -```bash +```sh ===================== Creating Document and Chunk class ===================== ℹ Setting up client ✔ Client connected to Weaviate Cluster @@ -264,7 +263,7 @@ Document class already exists, do you want to overwrite it? (y/n): n We're almost there! Let's start Verba: -```bash +```sh verba start ``` diff --git a/docs/website/blog/2023-10-06-dlt-holistics.md b/docs/website/blog/2023-10-06-dlt-holistics.md index b2791bd2ec..c5e9b2ca46 100644 --- a/docs/website/blog/2023-10-06-dlt-holistics.md +++ b/docs/website/blog/2023-10-06-dlt-holistics.md @@ -92,7 +92,7 @@ In this section, we walk through how to set up a MongoDB data pipeline using `dl Use the command below to install `dlt`. -```bash +```sh pip3 install -U dlt ``` @@ -100,13 +100,13 @@ Consider setting up a virtual environment for your projects and installing the p Once we have `dlt` installed, we can go ahead and initialize a verified MongoDB pipeline with the destination set to Google BigQuery. First, create a project directory and then execute the command below: -```python +```sh dlt init mongodb bigquery ``` The above command will create a local ready-made pipeline that we can customize to our needs. After executing the command your project directory will look as follows: -```bash +```text . ├── .dlt │ ├── config.toml @@ -127,7 +127,7 @@ We also need to set up the GCP service account credentials to get permissions to Once all the credentials are set add them to the `secrets.toml` file. Your file should look something like this: -```bash +```toml # put your secret values and credentials here. do not share this file and do not push it to github [sources.mongodb] connection_url = "mongodb+srv://:@.cvanypn.mongodb.net" # please set me up! @@ -143,7 +143,7 @@ client_email = "@analytics.iam.gserviceaccount.com" # please set me up The `mongodb_pipeline.py` at the root of your project directory is the script that runs the pipeline. It contains many functions that provide different ways of loading the data. The selection of the function depends on your specific use case, but for this demo, we try to keep it simple and use the `load_entire_database` function. -```python +```py def load_entire_database(pipeline: Pipeline = None) -> LoadInfo: """Use the mongo source to completely load all collection in a database""" if pipeline is None: @@ -165,13 +165,13 @@ def load_entire_database(pipeline: Pipeline = None) -> LoadInfo: Before we execute the pipeline script let's install the dependencies for the pipeline by executing the `requirements.txt` file. -```bash +```sh pip install -r requirements.txt ``` Finally, we are ready to execute the script. In the main function uncomment the `load_entire_database` function call and run the script. -```bash +```sh python mongodb_pipeline.py ``` @@ -290,7 +290,7 @@ This is a typical way data is structured in a NoSQL database. The data is in a J The ddl (data definition language) for the movies table in BigQuery can be seen below: -```json +```sql CREATE TABLE `dlthub-analytics.mongo_database.movies` ( _id STRING NOT NULL, @@ -354,7 +354,7 @@ In Holistics, add a new data source click on the plus sign (+) on the top menu, Once the BigQuery source is added we are ready to import the schemas from BigQuery into Holistics. The schema(`dataset_name`) name under which dlt loaded the MongoDB data is defined in the `load_entire_database` function when we create the MongoDB pipeline. 
-```bash +```sh # Create a pipeline pipeline = dlt.pipeline( pipeline_name="local_mongo", @@ -399,13 +399,13 @@ The resulting relationship can seen As Code using the Holistics 4.0 Analytics as Previously, we created the relationship between the `cast` and the `movies` tables using GUI, now let’s add the relationship between the `directors` and `movies` tables using the Analytics as Code feature. In the `dataset.aml` file append the relationships block with the following line of code: -```python +```py relationship(model__mongo_database_movies_directors.dlt_parent_id > model__mongo_database_movies.dlt_id, true) ``` After the change, the `dataset.aml` file should look like this: -```python +```sh import '../Models/mongo_database_movies.model.aml' { mongo_database_movies as model__mongo_database_movies } diff --git a/docs/website/blog/2023-10-09-dlt-ops-startups.md b/docs/website/blog/2023-10-09-dlt-ops-startups.md index 94c1ff662b..dd21725f90 100644 --- a/docs/website/blog/2023-10-09-dlt-ops-startups.md +++ b/docs/website/blog/2023-10-09-dlt-ops-startups.md @@ -61,14 +61,14 @@ The `dlt` [init command](https://dlthub.com/docs/reference/command-line-interfac - Open `.dlt/secrets.toml` file on your laptop. - Enter the OpenAI secrets: - ``` + ```toml [sources.unstructured_data] openai_api_key = "openai_api_key" ``` - Enter your email account secrets in the same section `[sources.unstructured_data]`: - ``` + ```toml host = 'imap.example.com' email_account = "example@example.com" password = 'set me up!' @@ -78,7 +78,7 @@ The `dlt` [init command](https://dlthub.com/docs/reference/command-line-interfac - Enter the BigQuery secrets: - ``` + ```toml [destination.bigquery] location = "US" [destination.bigquery.credentials] @@ -96,7 +96,7 @@ This is the part where you can define what you’d like to see as an outcome. Queries example: -```python +```py INVOICE_QUERIES = { "recipient_company_name": "Who is the recipient of the invoice? Just return the name. If you don't know, then return None", "invoice_amount": "What is the total amount of the invoice? Just return the amount as decimal number, no currency or text. If you don't know, then return None", diff --git a/docs/website/blog/2023-10-16-first-data-warehouse.md b/docs/website/blog/2023-10-16-first-data-warehouse.md index 79186fd267..641751eb1d 100644 --- a/docs/website/blog/2023-10-16-first-data-warehouse.md +++ b/docs/website/blog/2023-10-16-first-data-warehouse.md @@ -75,7 +75,7 @@ For those new to pushing data via an API, it may seem intimidating. Let's simplify - sending data to an API endpoint for loading or updating an object is similar to making a `GET` request. Here's a straightforward example in Python: -```python +```py # Assuming data is in this format import requests # assume we have a table of contacts we want to push to Pipedrive. diff --git a/docs/website/blog/2023-10-19-dbt-runners.md b/docs/website/blog/2023-10-19-dbt-runners.md index 713815abb0..9eb22c050f 100644 --- a/docs/website/blog/2023-10-19-dbt-runners.md +++ b/docs/website/blog/2023-10-19-dbt-runners.md @@ -149,7 +149,7 @@ The Cloud runner we support can do the following: - Check the status of a dbt job in your account. Code example: -```python +```py from dlt.helpers.dbt_cloud import run_dbt_cloud_job # Trigger a job run with additional data @@ -179,7 +179,7 @@ The core runner does the following: - Execute the package and report the outcome. 
Code example: -```python +```py # Create a transformation on a new dataset called 'pipedrive_dbt' # we created a local dbt package # and added pipedrive_raw to its sources.yml @@ -210,7 +210,7 @@ for m in models: f"Model {m.model_name} materialized" + f"in {m.time}" + f"with status {m.status}" + - f"and message {m.message}" + f"and message {m.message}") ``` ## 4. A short demo on how to do that with dlt’s dbt runner. diff --git a/docs/website/blog/2023-10-23-arrow-loading.md b/docs/website/blog/2023-10-23-arrow-loading.md index 978586fa76..2f25511d73 100644 --- a/docs/website/blog/2023-10-23-arrow-loading.md +++ b/docs/website/blog/2023-10-23-arrow-loading.md @@ -18,13 +18,13 @@ Here we achieved ~30x speedups when loading data from (local) postgres database We’ll start with [ConnectorX library](https://github.com/sfu-db/connector-x) that creates Arrow tables from SQL queries on most of the popular database engines. -```python +```sh pip install connectorx ``` Lib has Rust inside, zero copy extraction and is amazingly fast. We’ll extract and normalize 10 000 000 [test rows](https://github.com/dlt-hub/verified-sources/blob/master/tests/sql_database/sql_source.py#L88) from local postgresql. The table **chat_message** looks like Slack messages dump. Messages have unique autoincrement **id** which we use to load in chunks: -```python +```py import connectorx as cx import dlt from dlt.sources.credentials import ConnectionStringCredentials @@ -49,7 +49,7 @@ chat_messages = dlt.resource( In this demo I just extract and normalize data and skip the loading step. -```python +```py pipeline = dlt.pipeline(destination="duckdb", full_refresh=True) # extract first pipeline.extract(chat_messages) @@ -78,7 +78,7 @@ Step normalize COMPLETED in 0.08 seconds. Here’s corresponding code working with **SqlAlchemy**. We process 10 000 000 rows, yielding in 100k rows packs and normalize to parquet in 3 parallel processes. -```python +```py from itertools import islice import dlt from sqlalchemy import create_engine diff --git a/docs/website/blog/2023-10-25-dlt-deepnote.md b/docs/website/blog/2023-10-25-dlt-deepnote.md index 864353a36d..2674ceae7d 100644 --- a/docs/website/blog/2023-10-25-dlt-deepnote.md +++ b/docs/website/blog/2023-10-25-dlt-deepnote.md @@ -37,7 +37,7 @@ likely than not, you spend more time fixing data pipelines or data formats then on ML algorithms or dashboard designs. We aren’t always lucky enough to get structured data to work with. Imagine a world where your training data is just this statement without no prior work: -```jsx +```sql select * from ``` diff --git a/docs/website/blog/2023-10-26-dlt-prefect.md b/docs/website/blog/2023-10-26-dlt-prefect.md index 8bd6321489..6e9caa3fea 100644 --- a/docs/website/blog/2023-10-26-dlt-prefect.md +++ b/docs/website/blog/2023-10-26-dlt-prefect.md @@ -82,8 +82,7 @@ It would take some effort to interpret even a simple response like this one for "updated": 1502138686, "is_app_user": false, "has_2fa": false - }, - // ... (more data) + } ] } ``` @@ -92,14 +91,14 @@ You can use dlt to build a Slack to BigQuery pipeline in just a few seconds with Seriously, it is that simple. In preparation, let’s make sure to install what we need: -```bash +```sh pip install dlt pip install prefect ```` Then just run a simple init command: -```bash +```sh dlt init slack bigquery ``` @@ -126,7 +125,7 @@ Note that we are redacting some of the code in the preview for brevity, to follow along completely navigate to the repo. 
-```python +```py # Pipeline to load Slack into BigQuery from typing import List @@ -190,14 +189,14 @@ that can make sure your pipelines aren’t causing you stress in the middle of t Make sure you’re logged in to Prefect Cloud by [signing up](https://app.prefect.cloud/?utm_source=dltblog) and using the following command: -```bash +```sh prefect cloud login ``` Luckily, Prefect is also incredibly Pythonic. Turning any pipeline into an observable, scheduled Prefect flow is as simple as adding decorators to your functions and `serving` it up. Here’s our `dlt` generated pipeline, scheduled daily: -```python +```py from typing import List import dlt diff --git a/docs/website/blog/2023-10-30-data-modelling-tools.md b/docs/website/blog/2023-10-30-data-modelling-tools.md index e5839ee66e..960d80a569 100644 --- a/docs/website/blog/2023-10-30-data-modelling-tools.md +++ b/docs/website/blog/2023-10-30-data-modelling-tools.md @@ -71,7 +71,7 @@ Our database is based on the data published by [LivWell](https://www.nature.com/ Sample input structure: -```jsx +```py [{"survey_id": "AM2000DHS", "country": "Armenia", "marriage_related": [{...}, {...}, ...], @@ -81,7 +81,7 @@ Sample input structure: "health_related": [{...}, {...}, ...], "age_related": [{...}, {...}, ...] }, - {...}, {...}, {...}, ...}] + {...}, {...}, {...}, {...}] ``` To break it up into proper tables representing the different sections of the surveys, we gave this data to **dlt** to unpack it into a flat relational structure into BigQuery. dlt automatically unpacked the original data into connected tables. The various child tables link to the parent table `wellness` using foreign keys. `Wellness` contains surveys identified by ID and country. The final setup of indicators broken up into different categories can be found below, as displayed by Power BI. This structured database has been used to experiment with all three dashboarding tools in this article. diff --git a/docs/website/blog/2023-11-01-dlt-dagster.md b/docs/website/blog/2023-11-01-dlt-dagster.md index 4da685be73..dc05a35bff 100644 --- a/docs/website/blog/2023-11-01-dlt-dagster.md +++ b/docs/website/blog/2023-11-01-dlt-dagster.md @@ -33,7 +33,7 @@ As we will be ingesting data into BigQuery we first need to create service accou Once we have the credentials we are ready to begin. Let’s first install Dagster and `dlt`. The below commands should install both. -```python +```sh pip install dlt pip install dagster dagster-webserver ``` @@ -42,13 +42,13 @@ pip install dagster dagster-webserver As a first step, we will create the GitHub issues pipeline using `dlt`. -```bash +```sh dlt init github_issues bigquery ``` This will generate a template for us to create a new pipeline. Under `.dlt/secrets.toml` add the service account credentials for BigQuery. Then in the `github_issues.py` delete the generated code and add the following: -```python +```py @dlt.resource(write_disposition="append") def github_issues_resource(api_secret_key=dlt.secrets.value): owner = 'dlt-hub' @@ -88,7 +88,7 @@ The above code creates a simple **github_issues** pipeline that gets the issues To run the pipeline execute the below commands: -```bash +```sh pip install -r requirements.txt python github_issues.py ``` @@ -103,7 +103,7 @@ We will need to adjust our pipeline a bit to orchestrate it using Dagster. 
- Create a new directory for your Dagster project and scaffold the basic structure: -```bash +```sh mkdir dagster_github_issues cd dagster_github_issues dagster project scaffold --name github-issues @@ -115,7 +115,7 @@ This will generate the default files for Dagster that we will use as a starting - Inside the `github-issues/github_issues` directory create the following folders: `assets`, `resources`, and `dlt`. -```bash +```sh . ├── README.md ├── github_issues @@ -143,7 +143,7 @@ This will generate the default files for Dagster that we will use as a starting - Define a `DltResource` class in `resources/__init__.py` as a Dagster configurable resource. This class allows you to reuse pipeline code inside an asset. -```python +```py from dagster import ConfigurableResource import dlt @@ -167,7 +167,7 @@ class DltResource(ConfigurableResource): - Define the asset, `issues_pipeline`, in `assets/__init__.py`. This asset uses the configurable resource to create a dlt pipeline and ingests data into BigQuery. -```python +```py from dagster import asset, get_dagster_logger from ..resources import DltResource from ..dlt import github_issues_resource @@ -188,12 +188,12 @@ The defined asset (**issues_pipeline**) takes as input the configurable resource - Add the schema evolution code to the asset to make our pipelines more resilient to changes. -```python +```py from dagster import AssetExecutionContext @asset def issues_pipeline(context: AssetExecutionContext, pipeline: DltResource): -... -md_content="" + ... + md_content="" for package in result.load_packages: for table_name, table in package.schema_update.items(): for column_name, column in table["columns"].items(): @@ -207,7 +207,7 @@ md_content="" - In the `__init.py__` under the **github_issues** folder add the definitions: -```python +```py all_assets = load_assets_from_modules([assets]) simple_pipeline = define_asset_job(name="simple_pipeline", selection= ['issues_pipeline']) @@ -255,20 +255,20 @@ One of the main strengths of `dlt` lies in its ability to extract, normalize, an - Start by creating a new Dagster project scaffold: -```python +```sh dagster project scaffold --name mongodb-dlt ``` - Follow the steps mentioned earlier and create an `assets`, and `resources` directory under `mongodb-dlt/mongodb_dlt`. - Initialize a `dlt` MongoDB pipeline in the same directory: -```python +```sh dlt init mongodb bigquery ``` This will create a template with all the necessary logic implemented for extracting data from MongoDB. After running the command your directory structure should be as follows: -```python +```text . ├── README.md ├── mongodb_dlt @@ -303,7 +303,7 @@ Next, create a `.env` file and add the BigQuery and MongoDB credentials to the f Create a `DltResouce` under the **resources** directory. 
Add the following code to the `__init__.py`: -```python +```py from dagster import ConfigurableResource import dlt @@ -335,7 +335,7 @@ In the `mongodb_pipeline.py` file, locate the `load_select_collection_hint_db` f In the `__init__.py` file under the **assets** directory, define the `dlt_asset_factory`: -```python +```py from ..mongodb import mongodb from ..resources import DltResource @@ -386,7 +386,7 @@ dlt_assets = dlt_asset_factory(DATABASE_COLLECTIONS) Add the definitions in the `__init__.py` in the root directory: -```python +```py from dagster import Definitions from .assets import dlt_assets diff --git a/docs/website/blog/2023-11-22-dlt-webhooks-event-based-ingestion.md b/docs/website/blog/2023-11-22-dlt-webhooks-event-based-ingestion.md index 292879fc95..aa433dc883 100644 --- a/docs/website/blog/2023-11-22-dlt-webhooks-event-based-ingestion.md +++ b/docs/website/blog/2023-11-22-dlt-webhooks-event-based-ingestion.md @@ -79,7 +79,7 @@ in-depth guide, please refer to the detailed documentation. 1. Click 'Create Function' in Cloud Functions, and select your region and environment setup. 1. Choose HTTP as the trigger, enable 'Allow unauthenticated invocations', save, and click 'Next'. 1. Set the environment to Python 3.10 and prepare to insert code into main.py: - ```python + ```py import dlt import json import time @@ -106,7 +106,7 @@ in-depth guide, please refer to the detailed documentation. dlt[bigquery] ``` 1. Post-deployment, a webhook URL is generated, typically following a specific format. - ```bash + ```sh https://{region]-{project-id}.cloudfunctions.net/{cloud-function-name} ``` @@ -140,7 +140,7 @@ Set up the webhook by creating a cloud function, using the same steps as for the 1. Here’s what `main.py` looks like: - ```python + ```py import dlt from flask import jsonify @@ -215,7 +215,7 @@ Set up the webhook by creating a cloud function, using the same steps as for the 1. Here’s what `main.py`looks like: - ```python + ```py import dlt from flask import jsonify @@ -227,7 +227,8 @@ Set up the webhook by creating a cloud function, using the same steps as for the # Initialize and configure the DLT pipeline pipeline = dlt.pipeline( - pipeline_name=ßigquery', # Destination service for the data + pipeline_name="hubspot", + destination='bigquery', # Destination service for the data dataset_name='hubspot_webhooks_dataset', # BigQuery dataset name ) diff --git a/docs/website/blog/2023-11-27-dlt-data-lineage.md b/docs/website/blog/2023-11-27-dlt-data-lineage.md index 233ef58800..d91659eb6b 100644 --- a/docs/website/blog/2023-11-27-dlt-data-lineage.md +++ b/docs/website/blog/2023-11-27-dlt-data-lineage.md @@ -42,7 +42,7 @@ The **load_info** produced by `dlt` for both pipelines is also populated into Bi To get started install `dlt` and dbt: -```jsx +```sh pip install dlt pip install dbt-bigquery ``` @@ -59,13 +59,13 @@ We use the following CSV files as our data sources for this demo: To get started we initialize a dlt pipeline and selecting BigQuery as our destination by running the following command: -```python +```sh dlt init data_lineage bigquery ``` This will create default scaffolding to build our pipeline. 
Install the dependencies by running the following command: -```python +```sh pip install -r requirements.txt ``` @@ -76,7 +76,7 @@ As a first step, we will load the sales data from the online and physical store In the `data_lineage.py` file remove the default code and add the following: -```python +```py FILEPATH = "data/supermarket_sales.csv" FILEPATH_SHOPIFY = "data/orders_export_1.csv" @@ -109,7 +109,7 @@ Any changes in the underlying data are captured by the dlt **load_info**. To sho We will add the **load_info** back to BigQuery to use in our Dashboard. The Dashboard will provide an overview data lineage for our ingested data. -```python +```py if __name__ == "__main__": data_store = pd.read_csv(FILEPATH) @@ -134,7 +134,7 @@ if __name__ == "__main__": dataset_name='sales_shopify' ) - load_a = pipeline_store.run_pipeline( + load_a = pipeline_store.run_pipeline( data=select_c_data_store, table_name='sales_info', write_disposition='replace' @@ -161,7 +161,7 @@ if __name__ == "__main__": To run the pipeline, execute the following command: -```python +```sh python data_lineage.py ``` @@ -175,7 +175,7 @@ Now that both the Shopify and Store data are available in BigQuery, we will use To get started initialize a dbt project in the root directory: -```python +```sh dbt init sales_dbt ``` @@ -244,7 +244,7 @@ In the query, we combine the **load_info** for both sources by doing a union ove In the `data_lineage.py` add the code to run the dbt package using `dlt`. -```python +```py pipeline_transform = dlt.pipeline( pipeline_name='pipeline_transform', destination='bigquery', @@ -271,7 +271,7 @@ for m in models: Next, run the pipeline using the following command: -```python +```sh python data_lineage.py ``` diff --git a/docs/website/blog/2023-12-01-dlt-kestra-demo.md b/docs/website/blog/2023-12-01-dlt-kestra-demo.md index da47384194..9f1d7acba2 100644 --- a/docs/website/blog/2023-12-01-dlt-kestra-demo.md +++ b/docs/website/blog/2023-12-01-dlt-kestra-demo.md @@ -78,7 +78,7 @@ In my scenario, the email data doesn't have nested structures, so there's no nee Here's how the pipeline is defined and subsequently run in the first task of the main flow in **`Kestra`**: -```python +```py # Run dlt pipeline to load email data from gmail to BigQuery pipeline = dlt.pipeline( pipeline_name="standard_inbox", diff --git a/docs/website/blog/2023-12-13-dlt-aws-taktile-blog.md b/docs/website/blog/2023-12-13-dlt-aws-taktile-blog.md index c819f90741..296d303dcb 100644 --- a/docs/website/blog/2023-12-13-dlt-aws-taktile-blog.md +++ b/docs/website/blog/2023-12-13-dlt-aws-taktile-blog.md @@ -46,13 +46,13 @@ SAM is a lightweight Infrastructure-As-Code framework provided by AWS. Using SAM 1. Install the SAM CLI [add link or command here] - ```bash + ```sh pip install aws-sam-cli ``` 2. Define your resources in a `template.yml` file - ```yaml + ```text AWSTemplateFormatVersion: "2010-09-09" Transform: AWS::Serverless-2016-10-31 @@ -86,7 +86,7 @@ SAM is a lightweight Infrastructure-As-Code framework provided by AWS. Using SAM Effect: Allow Action: - secretsmanager:GetSecretValue - Resource: !Sub arn:aws:secretsmanager:${AWS::Region}:${AWS::AccountId}:secret:DLT_* + Resource: !Sub "arn:aws:secretsmanager:${AWS::Region}:${AWS::AccountId}:secret:DLT_*" Metadata: DockerTag: dlt-aws DockerContext: . @@ -99,13 +99,13 @@ SAM is a lightweight Infrastructure-As-Code framework provided by AWS. Using SAM 3. Build a deployment package - ```bash + ```sh sam build ``` 4. 
Test your setup locally - ```bash + ```sh sam local start-api # in a second terminal window @@ -114,7 +114,7 @@ SAM is a lightweight Infrastructure-As-Code framework provided by AWS. Using SAM 5. Deploy your resources to AWS - ```bash + ```sh sam deploy --stack-name= --resolve-image-repos --resolve-s3 --capabilities CAPABILITY_IAM ``` diff --git a/docs/website/blog/2024-01-08-streaming-pubsub-json-gcp.md b/docs/website/blog/2024-01-08-streaming-pubsub-json-gcp.md index d31d9a7e3a..e6e7d2ba18 100644 --- a/docs/website/blog/2024-01-08-streaming-pubsub-json-gcp.md +++ b/docs/website/blog/2024-01-08-streaming-pubsub-json-gcp.md @@ -125,7 +125,7 @@ By using this micro-batch architecture, we strive to maintain a balance of datab insert efficiency (by writing multiple records at a time) with near real-time insertion (by keeping the window size around 5 seconds). -```python +```py pipeline = dlt.pipeline( pipeline_name="pubsub_dlt", diff --git a/docs/website/blog/2024-01-10-dlt-mode.md b/docs/website/blog/2024-01-10-dlt-mode.md index b92425184d..1d6bf8ca0e 100644 --- a/docs/website/blog/2024-01-10-dlt-mode.md +++ b/docs/website/blog/2024-01-10-dlt-mode.md @@ -123,13 +123,13 @@ With the model we just created, called Products, a chart can be instantly create In this demo, we’ll forego the authentication issues of connecting to a data warehouse, and choose the DuckDB destination to show how the Python environment within Mode can be used to initialize a data pipeline and dump normalized data into a destination. In order to see how it works, we first install dlt[duckdb] into the Python environment. -```python +```sh !pip install dlt[duckdb] ``` Next, we initialize the dlt pipeline: -```python +```py # initializing the dlt pipeline with your # data warehouse destination pipeline = dlt.pipeline( diff --git a/docs/website/blog/2024-01-15-dlt-dbt-runner-on-cloud-functions.md b/docs/website/blog/2024-01-15-dlt-dbt-runner-on-cloud-functions.md index b36748aed9..e21154d98e 100644 --- a/docs/website/blog/2024-01-15-dlt-dbt-runner-on-cloud-functions.md +++ b/docs/website/blog/2024-01-15-dlt-dbt-runner-on-cloud-functions.md @@ -86,7 +86,7 @@ We recommend setting up and testing dbt-core locally before using it in cloud fu 1. Next, modify the `main.py` as follows: - ```python + ```py import os import subprocess import logging @@ -191,9 +191,10 @@ To integrate dlt and dbt in cloud functions, use the dlt-dbt runner; here’s ho 1. Next, configure the `main.py` as follows: - ```python + ```py import dlt - import logging, json + import logging + import json from flask import jsonify from dlt.common.runtime.slack import send_slack_message @@ -306,7 +307,7 @@ To integrate dlt and dbt in cloud functions, use the dlt-dbt runner; here’s ho 1. Next, list runtime-installable modules in `requirements.txt`: - ``` + ```sh dbt-core dbt-bigquery ``` diff --git a/docs/website/blog/2024-01-16-dlt-dbt-semantic-layer.md b/docs/website/blog/2024-01-16-dlt-dbt-semantic-layer.md index e67e203caf..415a55f9b9 100644 --- a/docs/website/blog/2024-01-16-dlt-dbt-semantic-layer.md +++ b/docs/website/blog/2024-01-16-dlt-dbt-semantic-layer.md @@ -38,7 +38,7 @@ Here’s how a pipeline could look: The data being used is of a questionnaire, which includes questions, the options of those questions, respondents and responses. This data is contained within a nested json object, that we’ll pass as a raw source to `dlt` to structure, normalize and dump into a BigQuery destination. 
-```python +```py # initializing the dlt pipeline with your data warehouse destination pipeline = dlt.pipeline( pipeline_name="survey_pipeline", @@ -89,20 +89,20 @@ measures: - name: surveys_total description: The total surveys for each --dimension. agg: count - # if all rows need to be counted then expr = 1 + # if all rows need to be counted then expr = 1 expr: 1 # where in SQL you would: group by columns dimensions: - # default dbt requirement + # default dbt requirement - name: surveyed_at type: time type_params: time_granularity: day # count entry per answer - - name: people_per_color + - name: people_per_color type: categorical expr: answer - # count entry per question + # count entry per question - name: question type: categorical expr: question @@ -117,10 +117,10 @@ metrics: type: simple label: Favorite Colors type_params: - # reference of the measure created in the semantic model + # reference of the measure created in the semantic model measure: surveys_total - filter: | # adding a filter on the "question" column for asking about favorite color - {{ Dimension('id__question') }} = 'What is your favorite color?' + filter: | # adding a filter on the "question" column for asking about favorite color + {{ Dimension('id__question') }} = 'What is your favorite color?' ``` The DAG then looks like this: diff --git a/docs/website/blog/2024-02-21-pipelines-single-pane-of-glass.md b/docs/website/blog/2024-02-21-pipelines-single-pane-of-glass.md index 553284bc6f..ff54c463bd 100644 --- a/docs/website/blog/2024-02-21-pipelines-single-pane-of-glass.md +++ b/docs/website/blog/2024-02-21-pipelines-single-pane-of-glass.md @@ -42,7 +42,7 @@ Since “checking” things can be tedious, we rather forget about it and be not Here’s a gist of how to use it -```python +```py from dlt.common.runtime.slack import send_slack_message def run_pipeline_and_notify(pipeline, data): diff --git a/docs/website/blog/2024-03-07-openapi-generation-chargebee.md b/docs/website/blog/2024-03-07-openapi-generation-chargebee.md index 367f8db2ca..3d77c3ea4c 100644 --- a/docs/website/blog/2024-03-07-openapi-generation-chargebee.md +++ b/docs/website/blog/2024-03-07-openapi-generation-chargebee.md @@ -90,7 +90,7 @@ There were no great challenges. The most ~~difficult~~ tedious probably was to m 1) Authentication The provided Authentication was a bit off. The generated code assumed the using of a username and password but what was actually required was — an empty username + api_key as a password. So super easy fix was changing -```python +```py def to_http_params(self) -> CredentialsHttpParams: cred = f"{self.api_key}:{self.password}" if self.password else f"{self.username}" encoded = b64encode(f"{cred}".encode()).decode() @@ -99,9 +99,9 @@ def to_http_params(self) -> CredentialsHttpParams: to -```python +```py def to_http_params(self) -> CredentialsHttpParams: - encoded = b64encode(f"{self.api_key}".encode()).decode() + encoded = b64encode(f"{self.api_key}".encode()).decode() return dict(cookies={}, headers={"Authorization": "Basic " + encoded}, params={}) ``` @@ -111,13 +111,14 @@ Also I was pleasantly surprised that generator had several different authenticat For the code generator it’s hard to guess a pagination method by OpenAPI specification, so the generated code has no pagination 😞. 
So I had to replace a line -```python -yield _build_response(requests.request(**kwargs)) +```py +def f(): + yield _build_response(requests.request(**kwargs)) ``` with yielding form a 6-lines `get_page` function -```python +```py def get_pages(kwargs: Dict[str, Any], data_json_path): has_more = True while has_more: @@ -133,7 +134,7 @@ The downside — I had to do it for each resource. The code wouldn’t run because it wasn’t able to find some models. I found a commented line in generator script -```python +```py # self._build_models() ``` diff --git a/docs/website/blog/2024-03-11-moving-away-from-segment.md b/docs/website/blog/2024-03-11-moving-away-from-segment.md index f834e25060..4f4b7d0a80 100644 --- a/docs/website/blog/2024-03-11-moving-away-from-segment.md +++ b/docs/website/blog/2024-03-11-moving-away-from-segment.md @@ -67,7 +67,7 @@ Next, we focus on establishing the necessary permissions for our pipeline. A cru Please refer to the Google Cloud documentation [here](https://cloud.google.com/iam/docs/service-accounts-create#console) to set up a service account. Once created, it's important to assign the necessary permissions to the service account. The project [README](https://github.com/dlt-hub/dlt_pubsub_demo) lists the necessary permissions. Finally, generate a key for the created service account and download the JSON file. Pass the credentials as environment variables in the project root directory. -```bash +```sh export GOOGLE_APPLICATION_CREDENTIALS="/path/to/keyfile.json" ``` @@ -75,7 +75,7 @@ export GOOGLE_APPLICATION_CREDENTIALS="/path/to/keyfile.json" To set up our pipeline, start by cloning the [GitHub Repository](https://github.com/dlt-hub/dlt_pubsub_demo). The repository contains all the necessary components, structured as follows: -```bash +```sh . ├── README.md ├── cloud_functions @@ -102,7 +102,7 @@ Meanwhile, the **cloud_functions** folder includes the code for the Cloud Functi To begin, integrate the service account credentials with Terraform to enable authorization and resource management on Google Cloud. Edit the `terraform/main.tf` file to include the path to your service account's credentials file as follows: -```bash +```sh provider "google" { credentials = file("./../credentials.json") project = var.project_id @@ -114,7 +114,7 @@ provider "google" { Next, in the `terraform/variables.tf` define the required variables. These variables correspond to details within your `credentials.json` file and include your project's ID, the region for resource deployment, and any other parameters required by your Terraform configuration: -```bash +```sh variable "project_id" { type = string default = "Add Project ID" @@ -128,7 +128,6 @@ variable "region" { variable "service_account_email" { type = string default = "Add Service Account Email" - } ``` @@ -138,7 +137,7 @@ We are now ready to set up some cloud resources. To get started, navigate into t With the initialization complete, you're ready to proceed with the creation of your cloud resources. To do this, run the following Terraform commands in sequence. These commands instruct Terraform to plan and apply the configurations defined in your `.tf` files, setting up the infrastructure on Google Cloud as specified. -```bash +```sh terraform plan terraform apply ``` @@ -161,7 +160,7 @@ The following resources are created on Google Cloud once `terraform apply` comma Now that our cloud infrastructure is in place, it's time to activate the event publisher. Look for the `publisher.py` file in the project root directory. 
You'll need to provide specific details to enable the publisher to send events to the correct Pub/Sub topic. Update the file with the following: -```python +```py # TODO(developer) project_id = "Add GCP Project ID" topic_id = "telemetry_data_tera" @@ -169,7 +168,7 @@ topic_id = "telemetry_data_tera" The `publisher.py` script is designed to generate dummy events, simulating real-world data, and then sends these events to the specified Pub/Sub topic. This process is crucial for testing the end-to-end functionality of our event streaming pipeline, ensuring that data flows from the source (the publisher) to our intended destinations (BigQuery, via the Cloud Function and dlt). To run the publisher execute the following command: -```python +```sh python publisher.py ``` @@ -179,7 +178,7 @@ Once the publisher sends events to the Pub/Sub Topic, the pipeline is activated. The average completion time of the pipeline is approximately 12 minutes, accounting for the 10-minute time interval after which the subscriber pushes data to storage plus the Cloud Function execution time. The push interval of the subscriber can be adjusted by changing the **max_duration** in `pubsub.tf` -```bash +```sh cloud_storage_config { bucket = google_storage_bucket.tel_bucket_storage.name diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/scrapy.md b/docs/website/docs/dlt-ecosystem/verified-sources/scrapy.md new file mode 100644 index 0000000000..2e6b588c18 --- /dev/null +++ b/docs/website/docs/dlt-ecosystem/verified-sources/scrapy.md @@ -0,0 +1,189 @@ +--- +title: Scrapy +description: dlt verified source for scraping using Scrapy +keywords: [scraping, scraping verified source, scrapy] +--- + +# Scrapy + +This verified source utilizes Scrapy, an open-source and collaborative framework for web scraping. +Scrapy enables efficient extraction of required data from websites. + +## Setup Guide + +### Initialize the verified source + +To get started with your data pipeline, follow these steps: + +1. Enter the following command: + + ```sh + dlt init scraping duckdb + ``` + + [This command](../../reference/command-line-interface) will initialize + [the pipeline example](https://github.com/dlt-hub/verified-sources/blob/master/sources/scraping_pipeline.py) + with Scrapy as the [source](../../general-usage/source) and [duckdb](../destinations/duckdb.md) + as the [destination](../destinations). + +1. If you'd like to use a different destination, simply replace `duckdb` with the name of your + preferred [destination](../destinations). + +1. After running this command, a new directory will be created with the necessary files and + configuration settings to get started. + +For more information, read the guide on +[how to add a verified source](../../walkthroughs/add-a-verified-source). + +### Add credentials + +1. The `config.toml` looks like: + ```toml + # put your configuration values here + [sources.scraping] + start_urls = ["URL to be scraped"] # please set me up! + start_urls_file = "/path/to/urls.txt" # please set me up! + ``` + > When both `start_urls` and `start_urls_file` are provided, they will be merged and deduplicated + > to ensure Scrapy gets a unique set of start URLs. + +1. Inside the `.dlt` folder, you'll find a file called `secrets.toml`, which is where you can securely + store your access tokens and other sensitive information. It's important to handle this + file with care and keep it safe. + +1.
Next, follow the [destination documentation](../../dlt-ecosystem/destinations) instructions to + add credentials for your chosen destination, ensuring proper routing of your data to the final + destination. +For more information, read [Secrets and Configs](../../general-usage/credentials). + +## Run the pipeline + +In this section, we demonstrate how to use the `MySpider` class defined in "scraping_pipeline.py" to +scrape data from "https://quotes.toscrape.com/page/1/". + +1. Start with configuring the `config.toml` as follows: + + ```toml + [sources.scraping] + start_urls = ["https://quotes.toscrape.com/page/1/"] # please set me up! + ``` + + Additionally, set destination credentials in `secrets.toml`, as [discussed](#add-credentials). + +1. Before running the pipeline, ensure that you have installed all the necessary dependencies by + running the command: + + ```sh + pip install -r requirements.txt + ``` + +1. You're now ready to run the pipeline! To get started, run the following command: + + ```sh + python scraping_pipeline.py + ``` + +## Customization + +### Create your own pipeline + +If you wish to create your own data pipeline, follow these steps: + +1. The first step requires creating a spider class that scrapes data + from the website. For example, the class `MySpider` below scrapes data from + the URL "https://quotes.toscrape.com/page/1/". + + ```py + class MySpider(Spider): + def parse(self, response: Response, **kwargs: Any) -> Any: + # Iterate through each "next" page link found + for next_page in response.css("li.next a::attr(href)"): + if next_page: + yield response.follow(next_page.get(), self.parse) + + # Iterate through each quote block found on the page + for quote in response.css("div.quote"): + # Extract the quote details + result = { + "quote": { + "text": quote.css("span.text::text").get(), + "author": quote.css("small.author::text").get(), + "tags": quote.css("div.tags a.tag::text").getall(), + }, + } + yield result + + ``` + + > Define your own class tailored to the website you intend to scrape. + +1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: + + ```py + pipeline = dlt.pipeline( + pipeline_name="scrapy_pipeline", # Use a custom name if desired + destination="duckdb", # Choose the appropriate destination (e.g., bigquery, redshift) + dataset_name="scrapy_data", # Use a custom name if desired + ) + ``` + + To read more about pipeline configuration, please refer to our + [documentation](../../general-usage/pipeline). + +1. To run the pipeline with customized Scrapy settings: + + ```py + run_pipeline( + pipeline, + MySpider, + # you can pass scrapy settings overrides here + scrapy_settings={ + # How many sub pages to scrape + # https://docs.scrapy.org/en/latest/topics/settings.html#depth-limit + "DEPTH_LIMIT": 100, + "SPIDER_MIDDLEWARES": { + "scrapy.spidermiddlewares.depth.DepthMiddleware": 200, + "scrapy.spidermiddlewares.httperror.HttpErrorMiddleware": 300, + }, + "HTTPERROR_ALLOW_ALL": False, + }, + write_disposition="append", + ) + ``` + + In the above example, Scrapy settings are passed as a parameter. For more information about + Scrapy settings, please refer to the + [Scrapy documentation](https://docs.scrapy.org/en/latest/topics/settings.html). + +1. To limit the number of items processed, use the `on_before_start` function to set a limit on + the resources the pipeline processes. For instance, setting the resource limit to two allows + the pipeline to yield a maximum of two resources.
+ + ```py + def on_before_start(res: DltResource) -> None: + res.add_limit(2) + + run_pipeline( + pipeline, + MySpider, + batch_size=10, + scrapy_settings={ + "DEPTH_LIMIT": 100, + "SPIDER_MIDDLEWARES": { + "scrapy.spidermiddlewares.depth.DepthMiddleware": 200, + "scrapy.spidermiddlewares.httperror.HttpErrorMiddleware": 300, + } + }, + on_before_start=on_before_start, + write_disposition="append", + ) + ``` + +1. To create a pipeline using Scrapy host, use `create_pipeline_runner` defined in + `helpers.py`. As follows: + + ```py + scraping_host = create_pipeline_runner(pipeline, MySpider, batch_size=10) + scraping_host.pipeline_runner.scraping_resource.add_limit(2) + scraping_host.run(dataset_name="quotes", write_disposition="append") + ``` diff --git a/docs/website/docs/general-usage/schema-evolution.md b/docs/website/docs/general-usage/schema-evolution.md index dd3aa0bf8a..9e225fba01 100644 --- a/docs/website/docs/general-usage/schema-evolution.md +++ b/docs/website/docs/general-usage/schema-evolution.md @@ -97,7 +97,7 @@ What happened? - Removed column stopped loading: - New data to column `room` is not loaded. - Column stopped loading and new one was added: - - A new column `address__building` was added and now data will be loaded to that and stop loading in the column `address__main_block`. + - A new column `address__main_block` was added and now data will be loaded to that and stop loading in the column `address__building`. ## Alert schema changes to curate new data diff --git a/docs/website/sidebars.js b/docs/website/sidebars.js index a313367908..275c1f438a 100644 --- a/docs/website/sidebars.js +++ b/docs/website/sidebars.js @@ -69,6 +69,7 @@ const sidebars = { 'dlt-ecosystem/verified-sources/personio', 'dlt-ecosystem/verified-sources/pipedrive', 'dlt-ecosystem/verified-sources/salesforce', + 'dlt-ecosystem/verified-sources/scrapy', 'dlt-ecosystem/verified-sources/shopify', 'dlt-ecosystem/verified-sources/sql_database', 'dlt-ecosystem/verified-sources/slack', diff --git a/tests/cli/common/test_telemetry_command.py b/tests/cli/common/test_telemetry_command.py index 18bd67a5e0..1b6588c9c8 100644 --- a/tests/cli/common/test_telemetry_command.py +++ b/tests/cli/common/test_telemetry_command.py @@ -139,7 +139,6 @@ def test_instrumentation_wrappers() -> None: COMMAND_DEPLOY_REPO_LOCATION, DeploymentMethods, ) - from dlt.common.exceptions import UnknownDestinationModule with patch("dlt.common.runtime.segment.before_send", _mock_before_send): start_test_telemetry() diff --git a/tests/common/schema/test_merges.py b/tests/common/schema/test_merges.py index 0bb7818b31..fe9e4b1476 100644 --- a/tests/common/schema/test_merges.py +++ b/tests/common/schema/test_merges.py @@ -132,7 +132,7 @@ def test_new_incomplete_column() -> None: def test_merge_columns() -> None: # tab_b overrides non default - col_a = utils.merge_columns(copy(COL_1_HINTS), copy(COL_2_HINTS), merge_defaults=False) + col_a = utils.merge_column(copy(COL_1_HINTS), copy(COL_2_HINTS), merge_defaults=False) # nullable is False - tab_b has it as default and those are not merged assert col_a == { "name": "test_2", @@ -146,7 +146,7 @@ def test_merge_columns() -> None: "prop": None, } - col_a = utils.merge_columns(copy(COL_1_HINTS), copy(COL_2_HINTS), merge_defaults=True) + col_a = utils.merge_column(copy(COL_1_HINTS), copy(COL_2_HINTS), merge_defaults=True) # nullable is True and primary_key is present - default values are merged assert col_a == { "name": "test_2", @@ -173,10 +173,10 @@ def test_diff_tables() -> None: empty = 
utils.new_table("table") del empty["resource"] print(empty) - partial = utils.diff_tables(empty, deepcopy(table)) + partial = utils.diff_table(empty, deepcopy(table)) # partial is simply table assert partial == table - partial = utils.diff_tables(deepcopy(table), empty) + partial = utils.diff_table(deepcopy(table), empty) # partial is empty assert partial == empty @@ -184,7 +184,7 @@ def test_diff_tables() -> None: changed = deepcopy(table) changed["description"] = "new description" changed["name"] = "new name" - partial = utils.diff_tables(deepcopy(table), changed) + partial = utils.diff_table(deepcopy(table), changed) print(partial) assert partial == {"name": "new name", "description": "new description", "columns": {}} @@ -192,7 +192,7 @@ def test_diff_tables() -> None: existing = deepcopy(table) changed["write_disposition"] = "append" changed["schema_contract"] = "freeze" - partial = utils.diff_tables(deepcopy(existing), changed) + partial = utils.diff_table(deepcopy(existing), changed) assert partial == { "name": "new name", "description": "new description", @@ -202,14 +202,14 @@ def test_diff_tables() -> None: } existing["write_disposition"] = "append" existing["schema_contract"] = "freeze" - partial = utils.diff_tables(deepcopy(existing), changed) + partial = utils.diff_table(deepcopy(existing), changed) assert partial == {"name": "new name", "description": "new description", "columns": {}} # detect changed column existing = deepcopy(table) changed = deepcopy(table) changed["columns"]["test"]["cluster"] = True - partial = utils.diff_tables(existing, changed) + partial = utils.diff_table(existing, changed) assert "test" in partial["columns"] assert "test_2" not in partial["columns"] assert existing["columns"]["test"] == table["columns"]["test"] != partial["columns"]["test"] @@ -218,7 +218,7 @@ def test_diff_tables() -> None: existing = deepcopy(table) changed = deepcopy(table) changed["columns"]["test"]["foreign_key"] = False - partial = utils.diff_tables(existing, changed) + partial = utils.diff_table(existing, changed) assert "test" in partial["columns"] # even if not present in tab_a at all @@ -226,7 +226,7 @@ def test_diff_tables() -> None: changed = deepcopy(table) changed["columns"]["test"]["foreign_key"] = False del existing["columns"]["test"]["foreign_key"] - partial = utils.diff_tables(existing, changed) + partial = utils.diff_table(existing, changed) assert "test" in partial["columns"] @@ -242,7 +242,7 @@ def test_diff_tables_conflicts() -> None: other = utils.new_table("table_2") with pytest.raises(TablePropertiesConflictException) as cf_ex: - utils.diff_tables(table, other) + utils.diff_table(table, other) assert cf_ex.value.table_name == "table" assert cf_ex.value.prop_name == "parent" @@ -250,7 +250,7 @@ def test_diff_tables_conflicts() -> None: changed = deepcopy(table) changed["columns"]["test"]["data_type"] = "bigint" with pytest.raises(CannotCoerceColumnException): - utils.diff_tables(table, changed) + utils.diff_table(table, changed) def test_merge_tables() -> None: @@ -261,6 +261,7 @@ def test_merge_tables() -> None: "x-special": 128, "columns": {"test": COL_1_HINTS, "test_2": COL_2_HINTS}, } + print(table) changed = deepcopy(table) changed["x-special"] = 129 # type: ignore[typeddict-unknown-key] changed["description"] = "new description" @@ -269,7 +270,7 @@ def test_merge_tables() -> None: changed["new-prop-3"] = False # type: ignore[typeddict-unknown-key] # drop column so partial has it del table["columns"]["test"] - partial = utils.merge_tables(table, changed) + 
partial = utils.merge_table(table, changed) assert "test" in table["columns"] assert table["x-special"] == 129 # type: ignore[typeddict-item] assert table["description"] == "new description" @@ -281,3 +282,39 @@ def test_merge_tables() -> None: # one column in partial assert len(partial["columns"]) == 1 assert partial["columns"]["test"] == COL_1_HINTS + # still has incomplete column + assert table["columns"]["test_2"] == COL_2_HINTS + # check order, we dropped test so it is added at the end + print(table) + assert list(table["columns"].keys()) == ["test_2", "test"] + + +def test_merge_tables_incomplete_columns() -> None: + table: TTableSchema = { + "name": "table", + "columns": {"test_2": COL_2_HINTS, "test": COL_1_HINTS}, + } + changed = deepcopy(table) + # reverse order, this order we want to have at the end + changed["columns"] = deepcopy({"test": COL_1_HINTS, "test_2": COL_2_HINTS}) + # it is completed now + changed["columns"]["test_2"]["data_type"] = "bigint" + partial = utils.merge_table(table, changed) + assert list(partial["columns"].keys()) == ["test_2"] + # test_2 goes to the end, it was incomplete in table so it got dropped before update + assert list(table["columns"].keys()) == ["test", "test_2"] + + table = { + "name": "table", + "columns": {"test_2": COL_2_HINTS, "test": COL_1_HINTS}, + } + + changed = deepcopy(table) + # reverse order, this order we want to have at the end + changed["columns"] = deepcopy({"test": COL_1_HINTS, "test_2": COL_2_HINTS}) + # still incomplete but changed + changed["columns"]["test_2"]["nullable"] = False + partial = utils.merge_table(table, changed) + assert list(partial["columns"].keys()) == ["test_2"] + # incomplete -> incomplete stays in place + assert list(table["columns"].keys()) == ["test_2", "test"] diff --git a/tests/common/schema/test_schema.py b/tests/common/schema/test_schema.py index 653e9cc351..887b0aa9a0 100644 --- a/tests/common/schema/test_schema.py +++ b/tests/common/schema/test_schema.py @@ -84,15 +84,27 @@ def test_normalize_schema_name(schema: Schema) -> None: def test_new_schema(schema: Schema) -> None: assert schema.name == "event" + assert_is_new_schema(schema) + assert_new_schema_props(schema) + stored_schema = schema.to_dict() # version hash is present - assert len(stored_schema["version_hash"]) > 0 + assert stored_schema["version"] == 1 + assert stored_schema["version_hash"] is not None utils.validate_stored_schema(stored_schema) - assert_new_schema_values(schema) + + # to dict without bumping version should be used only internally + stored_schema = schema.to_dict(bump_version=False) + # version hash is present + assert stored_schema["version"] is None + assert stored_schema["version_hash"] is None + with pytest.raises(DictValidationException): + utils.validate_stored_schema(stored_schema) def test_new_schema_custom_normalizers(cn_schema: Schema) -> None: - assert_new_schema_values_custom_normalizers(cn_schema) + assert_is_new_schema(cn_schema) + assert_new_schema_props_custom_normalizers(cn_schema) def test_schema_config_normalizers(schema: Schema, schema_storage_no_import: SchemaStorage) -> None: @@ -222,8 +234,9 @@ def test_replace_schema_content() -> None: eth_v5: TStoredSchema = load_yml_case("schemas/eth/ethereum_schema_v5") eth_v5["imported_version_hash"] = "IMP_HASH" schema_eth = Schema.from_dict(eth_v5) # type: ignore[arg-type] - schema.replace_schema_content(schema_eth) + schema.replace_schema_content(schema_eth.clone()) assert schema_eth.stored_version_hash == schema.stored_version_hash + assert 
schema_eth.stored_version == schema.stored_version assert schema_eth.version == schema.version assert schema_eth.version_hash == schema.version_hash assert schema_eth._imported_version_hash == schema._imported_version_hash @@ -239,16 +252,52 @@ def test_replace_schema_content() -> None: # make sure we linked the replaced schema to the incoming schema = Schema("simple") + # generate version and hash + schema._bump_version() eth_v5 = load_yml_case("schemas/eth/ethereum_schema_v5") - schema_eth = Schema.from_dict(eth_v5, bump_version=False) # type: ignore[arg-type] - schema_eth.bump_version() + schema_eth = Schema.from_dict(eth_v5) # type: ignore[arg-type] + assert not schema_eth.is_modified # modify simple schema by adding a table schema.update_table(schema_eth.get_table("blocks")) - replaced_stored_hash = schema.stored_version_hash + replaced_stored_hash = schema.version_hash schema.replace_schema_content(schema_eth, link_to_replaced_schema=True) assert replaced_stored_hash in schema.previous_hashes - assert replaced_stored_hash == schema.stored_version_hash - assert schema.stored_version_hash != schema.version_hash + assert schema_eth.stored_version_hash == schema.stored_version_hash + assert schema_eth.stored_version == schema.stored_version + assert schema_eth.version_hash == schema.version_hash + assert schema_eth.version == schema.version + assert not schema.is_modified + + # incoming schema still modified after replace + schema = Schema("simple") + # generate version and hash + schema._bump_version() + eth_v5 = load_yml_case("schemas/eth/ethereum_schema_v5") + schema_eth = Schema.from_dict(eth_v5, bump_version=False) # type: ignore[arg-type] + assert schema_eth.is_modified + schema.replace_schema_content(schema_eth, link_to_replaced_schema=True) + assert schema.is_modified + + # replace content of new schema + schema = Schema("simple") + eth_v5 = load_yml_case("schemas/eth/ethereum_schema_v5") + schema_eth = Schema.from_dict(eth_v5, bump_version=False) # type: ignore[arg-type] + schema_eth._bump_version() + schema.replace_schema_content(schema_eth, link_to_replaced_schema=True) + # nothing got added to prev hashes + assert schema.to_dict() == schema_eth.to_dict() + + # replace content with new schema + schema = Schema("simple") + eth_v5 = load_yml_case("schemas/eth/ethereum_schema_v5") + schema_eth = Schema.from_dict(eth_v5, bump_version=False) # type: ignore[arg-type] + schema_eth.replace_schema_content(schema, link_to_replaced_schema=True) + # schema tracked + assert schema_eth.name == "simple" + assert Schema.from_dict(eth_v5, bump_version=False).version_hash in schema.previous_hashes # type: ignore[arg-type] + # but still new + assert schema_eth.is_new + assert schema_eth.is_modified # replace with self eth_v5 = load_yml_case("schemas/eth/ethereum_schema_v5") @@ -270,6 +319,40 @@ def test_replace_schema_content() -> None: assert schema_eth.version_hash not in schema_eth.previous_hashes +def test_clone(schema: Schema) -> None: + # set normalizers but ignore them when cloning + os.environ["SCHEMA__NAMING"] = "direct" + + cloned = schema.clone() + assert cloned.to_dict(bump_version=False) == schema.to_dict(bump_version=False) + # dicts are not shared + assert id(cloned._settings) != id(schema._settings) + assert id(cloned._schema_tables) != id(schema._schema_tables) + # make sure version didn't change + assert cloned._stored_version == schema._stored_version + + # clone with name + cloned = schema.clone(with_name="second") + assert cloned.name == "second" + assert cloned.is_new + 
assert cloned.is_modified + assert cloned._imported_version_hash is None + assert cloned.previous_hashes == [] + + # clone with normalizers update + cloned = schema.clone("second", update_normalizers=True) + assert cloned._normalizers_config != schema._normalizers_config + assert cloned._normalizers_config["names"] == "direct" + + # clone modified schema + simple = Schema("simple") + cloned = simple.clone() + assert cloned.to_dict(bump_version=False) == simple.to_dict(bump_version=False) + assert cloned.is_new + assert cloned.is_modified + assert cloned._normalizers_config["names"] == "direct" + + @pytest.mark.parametrize( "columns,hint,value", [ @@ -300,13 +383,15 @@ def test_new_schema_alt_name() -> None: def test_save_store_schema(schema: Schema, schema_storage: SchemaStorage) -> None: assert not schema_storage.storage.has_file(EXPECTED_FILE_NAME) saved_file_name = schema_storage.save_schema(schema) + assert schema.is_modified is False + assert schema.is_new is False # return absolute path assert saved_file_name == schema_storage.storage.make_full_path(EXPECTED_FILE_NAME) assert schema_storage.storage.has_file(EXPECTED_FILE_NAME) schema_copy = schema_storage.load_schema("event") assert schema.name == schema_copy.name assert schema.version == schema_copy.version - assert_new_schema_values(schema_copy) + assert_new_schema_props(schema_copy) def test_save_store_schema_custom_normalizers( @@ -314,7 +399,7 @@ def test_save_store_schema_custom_normalizers( ) -> None: schema_storage.save_schema(cn_schema) schema_copy = schema_storage.load_schema(cn_schema.name) - assert_new_schema_values_custom_normalizers(schema_copy) + assert_new_schema_props_custom_normalizers(schema_copy) def test_save_load_incomplete_column( @@ -707,7 +792,7 @@ def test_normalize_table_identifiers_merge_columns() -> None: } -def assert_new_schema_values_custom_normalizers(schema: Schema) -> None: +def assert_new_schema_props_custom_normalizers(schema: Schema) -> None: # check normalizers config assert schema._normalizers_config["names"] == "tests.common.normalizers.custom_normalizers" assert ( @@ -727,13 +812,19 @@ def assert_new_schema_values_custom_normalizers(schema: Schema) -> None: assert row[0] == (("a_table", None), {"bool": True}) -def assert_new_schema_values(schema: Schema) -> None: - assert schema.version == 1 - assert schema.stored_version == 1 - assert schema.stored_version_hash is not None - assert schema.version_hash is not None +def assert_is_new_schema(schema: Schema) -> None: + assert schema.stored_version is None + assert schema.stored_version_hash is None assert schema.ENGINE_VERSION == 9 assert schema._stored_previous_hashes == [] + assert schema.is_modified + assert schema.is_new + + +def assert_new_schema_props(schema: Schema) -> None: + assert schema.version == 1 + assert schema.version_hash is not None + assert len(schema.settings["default_hints"]) > 0 # check settings assert ( diff --git a/tests/common/schema/test_versioning.py b/tests/common/schema/test_versioning.py index dde05001e8..b67b028161 100644 --- a/tests/common/schema/test_versioning.py +++ b/tests/common/schema/test_versioning.py @@ -22,6 +22,9 @@ def test_content_hash() -> None: assert utils.generate_version_hash(eth_v4) == hash2 eth_v4["version_hash"] = "xxxx" assert utils.generate_version_hash(eth_v4) == hash2 + # import schema hash is also excluded + eth_v4["imported_version_hash"] = "xxxx" + assert utils.generate_version_hash(eth_v4) == hash2 # changing table order does not impact the hash loads_table = 
eth_v4["tables"].pop("_dlt_loads") # insert at the end: _dlt_loads was first originally @@ -65,22 +68,22 @@ def test_infer_column_bumps_version() -> None: _, new_table = schema.coerce_row("event_user", None, row) schema.update_table(new_table) # schema version will be recomputed - assert schema.version == 2 + assert schema.version == 1 assert schema.version_hash is not None version_hash = schema.version_hash # another table _, new_table = schema.coerce_row("event_bot", None, row) schema.update_table(new_table) - # version is still 2 (increment of 1) - assert schema.version == 2 + # version is still 1 (increment of 1) + assert schema.version == 1 # but the hash changed assert schema.version_hash != version_hash # save saved_schema = schema.to_dict() assert saved_schema["version_hash"] == schema.version_hash - assert saved_schema["version"] == 2 + assert saved_schema["version"] == 1 def test_preserve_version_on_load() -> None: diff --git a/tests/common/storages/test_schema_storage.py b/tests/common/storages/test_schema_storage.py index 0e04554649..6cb76fba9d 100644 --- a/tests/common/storages/test_schema_storage.py +++ b/tests/common/storages/test_schema_storage.py @@ -28,36 +28,43 @@ ) -@pytest.fixture -def storage() -> SchemaStorage: - return init_storage(SchemaStorageConfiguration()) +@pytest.fixture(params=[LiveSchemaStorage, SchemaStorage]) +def storage(request) -> SchemaStorage: + return init_storage(request.param, SchemaStorageConfiguration()) @pytest.fixture -def synced_storage() -> SchemaStorage: +def live_storage() -> LiveSchemaStorage: + return init_storage(LiveSchemaStorage, SchemaStorageConfiguration()) # type: ignore[return-value] + + +@pytest.fixture(params=[LiveSchemaStorage, SchemaStorage]) +def synced_storage(request) -> SchemaStorage: # will be created in /schemas return init_storage( + request.param, SchemaStorageConfiguration( import_schema_path=TEST_STORAGE_ROOT + "/import", export_schema_path=TEST_STORAGE_ROOT + "/import", - ) + ), ) -@pytest.fixture -def ie_storage() -> SchemaStorage: +@pytest.fixture(params=[LiveSchemaStorage, SchemaStorage]) +def ie_storage(request) -> SchemaStorage: # will be created in /schemas return init_storage( + request.param, SchemaStorageConfiguration( import_schema_path=TEST_STORAGE_ROOT + "/import", export_schema_path=TEST_STORAGE_ROOT + "/export", - ) + ), ) -def init_storage(C: SchemaStorageConfiguration) -> SchemaStorage: +def init_storage(cls, C: SchemaStorageConfiguration) -> SchemaStorage: # use live schema storage for test which must be backward compatible with schema storage - s = LiveSchemaStorage(C, makedirs=True) + s = cls(C, makedirs=True) assert C is s.config if C.export_schema_path: os.makedirs(C.export_schema_path, exist_ok=True) @@ -101,13 +108,17 @@ def test_import_overwrites_existing_if_modified( def test_skip_import_if_not_modified(synced_storage: SchemaStorage, storage: SchemaStorage) -> None: storage_schema = assert_schema_imported(synced_storage, storage) + assert not storage_schema.is_modified + initial_version = storage_schema.stored_version # stored_version = storage_schema.stored_version # stored_version_hash = storage_schema.stored_version_hash # evolve schema row = {"floatX": 78172.128, "confidenceX": 1.2, "strX": "STR"} _, new_table = storage_schema.coerce_row("event_user", None, row) storage_schema.update_table(new_table) + assert storage_schema.is_modified storage.save_schema(storage_schema) + assert not storage_schema.is_modified # now use synced storage to load schema again reloaded_schema = 
synced_storage.load_schema("ethereum") # the schema was not overwritten @@ -119,6 +130,7 @@ def test_skip_import_if_not_modified(synced_storage: SchemaStorage, storage: Sch # the import schema gets modified storage_schema.tables["_dlt_loads"]["write_disposition"] = "append" storage_schema.tables.pop("event_user") + # we save the import schema (using export method) synced_storage._export_schema(storage_schema, synced_storage.config.export_schema_path) # now load will import again reloaded_schema = synced_storage.load_schema("ethereum") @@ -130,8 +142,8 @@ def test_skip_import_if_not_modified(synced_storage: SchemaStorage, storage: Sch assert reloaded_schema._imported_version_hash == storage_schema.version_hash assert storage_schema.previous_hashes == reloaded_schema.previous_hashes - # but original version has increased - assert reloaded_schema.stored_version == storage_schema.version + 1 + # but original version has increased twice (because it was modified twice) + assert reloaded_schema.stored_version == storage_schema.version == initial_version + 2 def test_store_schema_tampered(synced_storage: SchemaStorage, storage: SchemaStorage) -> None: @@ -188,7 +200,7 @@ def test_remove_schema(storage: SchemaStorage) -> None: assert storage.list_schemas() == [] -def test_mapping_interface(storage: SchemaStorage) -> None: +def test_getter(storage: SchemaStorage) -> None: # empty storage assert len(storage) == 0 assert "ethereum" not in storage @@ -219,6 +231,34 @@ def test_mapping_interface(storage: SchemaStorage) -> None: assert set(i[0] for i in items) == set(["ethereum", "event"]) +def test_getter_with_import(ie_storage: SchemaStorage) -> None: + with pytest.raises(KeyError): + ie_storage["ethereum"] + prepare_import_folder(ie_storage) + # schema will be imported + schema = ie_storage["ethereum"] + assert schema.name == "ethereum" + version_hash = schema.version_hash + # the import schema gets modified + schema.tables["_dlt_loads"]["write_disposition"] = "append" + mod_version_hash = schema.version_hash + assert schema.is_modified + ie_storage.save_schema(schema) + assert not schema.is_modified + # now load via getter + schema_copy = ie_storage["ethereum"] + assert schema_copy.version_hash == schema_copy.stored_version_hash == mod_version_hash + assert schema_copy._imported_version_hash == version_hash + + # now save the schema as import + ie_storage._export_schema(schema, ie_storage.config.import_schema_path) + # if you get the schema, import hash will change + schema = ie_storage["ethereum"] + assert schema._imported_version_hash == mod_version_hash + # only true for live schema + # assert id(schema) == id(schema_copy) + + def test_save_store_schema_over_import(ie_storage: SchemaStorage) -> None: prepare_import_folder(ie_storage) # we have ethereum schema to be imported but we create new schema and save it @@ -269,7 +309,11 @@ def test_save_store_schema(storage: SchemaStorage) -> None: d_n = explicit_normalizers() d_n["names"] = "tests.common.normalizers.custom_normalizers" schema = Schema("column_event", normalizers=d_n) + assert schema.is_new + assert schema.is_modified storage.save_schema(schema) + assert not schema.is_new + assert not schema.is_modified assert storage.storage.has_file( SchemaStorage.NAMED_SCHEMA_FILE_PATTERN % ("column_event", "json") ) @@ -309,6 +353,118 @@ def test_schema_from_file() -> None: ) +def test_live_schema_instances(live_storage: LiveSchemaStorage) -> None: + schema = Schema("simple") + live_storage.save_schema(schema) + + # get schema via getter + 
getter_schema = live_storage["simple"] + # same id + assert id(getter_schema) == id(schema) + + # live schema is same as in storage + assert live_storage.is_live_schema_committed("simple") + # modify getter schema + getter_schema._schema_description = "this is getter schema" + assert getter_schema.is_modified + # getter is not committed + assert not live_storage.is_live_schema_committed("simple") + + # separate instance via load + load_schema = live_storage.load_schema("simple") + assert id(load_schema) != id(schema) + # changes not visible + assert load_schema._schema_description is None + + # bypass live schema to simulate 3rd party change + SchemaStorage.save_schema(live_storage, getter_schema) + # committed because hashes are matching with file + assert live_storage.is_live_schema_committed("simple") + getter_schema = live_storage["simple"] + assert id(getter_schema) == id(schema) + + SchemaStorage.save_schema(live_storage, load_schema) + # still committed + assert live_storage.is_live_schema_committed("simple") + # and aware of changes in storage + getter_schema = live_storage["simple"] + assert id(getter_schema) == id(schema) + assert getter_schema._schema_description is None + getter_schema_mod_hash = getter_schema.version_hash + + # create a new "simple" schema + second_simple = Schema("simple") + second_simple._schema_description = "Second simple" + live_storage.save_schema(second_simple) + # got saved + load_schema = live_storage.load_schema("simple") + assert load_schema._schema_description == "Second simple" + # live schema seamlessly updated + assert schema._schema_description == "Second simple" + assert not schema.is_modified + assert getter_schema_mod_hash in schema.previous_hashes + + +def test_commit_live_schema(live_storage: LiveSchemaStorage) -> None: + with pytest.raises(SchemaNotFoundError): + live_storage.commit_live_schema("simple") + # set live schema + schema = Schema("simple") + set_schema = live_storage.set_live_schema(schema) + assert id(set_schema) == id(schema) + assert "simple" in live_storage.live_schemas + assert not live_storage.is_live_schema_committed("simple") + # nothing in storage + with pytest.raises(SchemaNotFoundError): + SchemaStorage.__getitem__(live_storage, "simple") + with pytest.raises(SchemaNotFoundError): + live_storage.load_schema("simple") + assert not live_storage.is_live_schema_committed("simple") + + # commit + assert live_storage.commit_live_schema("simple") is not None + # schema in storage + live_storage.load_schema("simple") + assert live_storage.is_live_schema_committed("simple") + + # second commit does not save + assert live_storage.commit_live_schema("simple") is None + + # mod the schema + schema._schema_description = "mod the schema" + assert not live_storage.is_live_schema_committed("simple") + mod_hash = schema.version_hash + + # save another instance under the same name + schema_2 = Schema("simple") + schema_2._schema_description = "instance 2" + live_storage.save_schema(schema_2) + assert live_storage.is_live_schema_committed("simple") + # content replaces in place + assert schema._schema_description == "instance 2" + assert mod_hash in schema.previous_hashes + + +def test_live_schema_getter_when_committed(live_storage: LiveSchemaStorage) -> None: + # getter on committed is aware of changes to storage (also import) + schema = Schema("simple") + live_storage.set_live_schema(schema) + set_schema = live_storage["simple"] + live_storage.commit_live_schema("simple") + # change content in storage + cloned = set_schema.clone() + 
cloned._schema_description = "cloned" + SchemaStorage.save_schema(live_storage, cloned) + set_schema_2 = live_storage["simple"] + assert set_schema_2._schema_description == "cloned" + assert id(set_schema_2) == id(set_schema) + + +def test_new_live_schema_committed(live_storage: LiveSchemaStorage) -> None: + with pytest.raises(SchemaNotFoundError): + live_storage.is_live_schema_committed("simple") + + # def test_save_empty_schema_name(storage: SchemaStorage) -> None: # schema = Schema("") # schema.settings["schema_sealed"] = True diff --git a/tests/common/test_destination.py b/tests/common/test_destination.py index b93cb5b483..5240b889f3 100644 --- a/tests/common/test_destination.py +++ b/tests/common/test_destination.py @@ -2,7 +2,7 @@ from dlt.common.destination.reference import DestinationClientDwhConfiguration, Destination from dlt.common.destination import DestinationCapabilitiesContext -from dlt.common.exceptions import InvalidDestinationReference, UnknownDestinationModule +from dlt.common.destination.exceptions import InvalidDestinationReference, UnknownDestinationModule from dlt.common.schema import Schema from tests.utils import ACTIVE_DESTINATIONS diff --git a/tests/common/test_utils.py b/tests/common/test_utils.py index 456ef3cb91..229ce17085 100644 --- a/tests/common/test_utils.py +++ b/tests/common/test_utils.py @@ -3,7 +3,7 @@ import binascii import pytest from typing import Dict -from dlt.common.exceptions import IdentifierTooLongException, PipelineException, TerminalValueError +from dlt.common.exceptions import PipelineException, TerminalValueError from dlt.common.runners import Venv from dlt.common.utils import ( @@ -231,6 +231,8 @@ def test_extend_list_deduplicated() -> None: def test_exception_traces() -> None: + from dlt.common.destination.exceptions import IdentifierTooLongException + # bare exception without stack trace trace = get_exception_trace(Exception("Message")) assert trace["message"] == "Message" @@ -243,7 +245,7 @@ def test_exception_traces() -> None: raise IdentifierTooLongException("postgres", "table", "too_long_table", 8) except Exception as exc: trace = get_exception_trace(exc) - assert trace["exception_type"] == "dlt.common.exceptions.IdentifierTooLongException" + assert trace["exception_type"] == "dlt.common.destination.exceptions.IdentifierTooLongException" assert isinstance(trace["stack_trace"], list) assert trace["exception_attrs"] == { "destination_name": "postgres", @@ -262,6 +264,8 @@ def test_exception_traces() -> None: def test_exception_trace_chain() -> None: + from dlt.common.destination.exceptions import IdentifierTooLongException + try: raise TerminalValueError("Val") except Exception: @@ -276,7 +280,10 @@ def test_exception_trace_chain() -> None: # outer exception first assert len(traces) == 3 assert traces[0]["exception_type"] == "dlt.common.exceptions.PipelineException" - assert traces[1]["exception_type"] == "dlt.common.exceptions.IdentifierTooLongException" + assert ( + traces[1]["exception_type"] + == "dlt.common.destination.exceptions.IdentifierTooLongException" + ) assert traces[2]["exception_type"] == "dlt.common.exceptions.TerminalValueError" diff --git a/tests/destinations/test_custom_destination.py b/tests/destinations/test_custom_destination.py index 7b74e5406c..7280ec419b 100644 --- a/tests/destinations/test_custom_destination.py +++ b/tests/destinations/test_custom_destination.py @@ -12,16 +12,16 @@ from dlt.common.schema import TTableSchema from dlt.common.data_writers.writers import TLoaderFileFormat from 
dlt.common.destination.reference import Destination -from dlt.pipeline.exceptions import PipelineStepFailed -from dlt.common.utils import uniq_id -from dlt.common.exceptions import DestinationTerminalException, InvalidDestinationReference +from dlt.common.destination.exceptions import InvalidDestinationReference from dlt.common.configuration.exceptions import ConfigFieldMissingException from dlt.common.configuration.specs import ConnectionStringCredentials -from dlt.destinations.impl.destination.factory import _DESTINATIONS -from dlt.destinations.impl.destination.configuration import CustomDestinationClientConfiguration from dlt.common.configuration.inject import get_fun_spec from dlt.common.configuration.specs import BaseConfiguration +from dlt.destinations.impl.destination.factory import _DESTINATIONS +from dlt.destinations.impl.destination.configuration import CustomDestinationClientConfiguration +from dlt.pipeline.exceptions import PipelineStepFailed + from tests.load.utils import ( TABLE_ROW_ALL_DATA_TYPES, TABLE_UPDATE_COLUMNS_SCHEMA, @@ -462,7 +462,6 @@ class MyDestinationSpec(CustomDestinationClientConfiguration): def sink_func_with_spec( items: TDataItems, table: TTableSchema, my_predefined_val=dlt.config.value ) -> None: - # raise DestinationTerminalException("NEVER") pass wrapped_callable = sink_func_with_spec().config_params["destination_callable"] diff --git a/tests/extract/test_decorators.py b/tests/extract/test_decorators.py index 0f19239330..dca4c0be6e 100644 --- a/tests/extract/test_decorators.py +++ b/tests/extract/test_decorators.py @@ -45,6 +45,24 @@ from tests.common.utils import IMPORTED_VERSION_HASH_ETH_V9 +def test_default_resource() -> None: + @dlt.resource + def resource(): + yield [1, 2, 3] + + # simple generated table schema + assert resource().compute_table_schema() == { + "columns": {}, + "name": "resource", + "resource": "resource", + "write_disposition": "append", + } + assert resource().name == "resource" + assert resource._args_bound is False + assert resource.incremental is None + assert resource.write_disposition == "append" + + def test_none_returning_source() -> None: with pytest.raises(SourceNotAFunction): dlt.source("data")() # type: ignore[call-overload] diff --git a/tests/extract/test_extract.py b/tests/extract/test_extract.py index b86e198988..1879eaa9eb 100644 --- a/tests/extract/test_extract.py +++ b/tests/extract/test_extract.py @@ -15,6 +15,7 @@ from dlt.extract.extract import ExtractStorage, Extract from dlt.extract.hints import make_hints +from dlt.extract.items import TableNameMeta from tests.utils import clean_test_storage, TEST_STORAGE_ROOT from tests.extract.utils import expect_extracted_file @@ -164,6 +165,52 @@ def with_table_hints(): assert "pk" not in table["columns"] +def test_extract_hints_table_variant(extract_step: Extract) -> None: + os.environ["DATA_WRITER__DISABLE_COMPRESSION"] = "TRUE" + + @dlt.resource(primary_key="pk") + def with_table_hints(): + yield dlt.mark.with_hints( + {"id": 1, "pk": "A"}, + make_hints(table_name="table_a", columns=[{"name": "id", "data_type": "bigint"}]), + create_table_variant=True, + ) + # get the resource + resource = dlt.current.source().resources[dlt.current.resource_name()] + assert "table_a" in resource._hints_variants + # get table + table = resource.compute_table_schema(meta=TableNameMeta("table_a")) + assert "pk" in table["columns"] + assert "id" in table["columns"] + assert table["columns"]["pk"]["primary_key"] is True + assert table["columns"]["id"]["data_type"] == "bigint" + + schema = 
dlt.current.source_schema() + # table table_a will be created + assert "table_a" in schema.tables + schema_table = schema.tables["table_a"] + assert table == schema_table + + # dispatch to table b + yield dlt.mark.with_hints( + {"id": 2, "pk": "B"}, + make_hints(table_name="table_b", write_disposition="replace"), + create_table_variant=True, + ) + assert "table_b" in resource._hints_variants + # get table + table = resource.compute_table_schema(meta=TableNameMeta("table_b")) + assert table["write_disposition"] == "replace" + schema_table = schema.tables["table_b"] + assert table == schema_table + + # item to resource + yield {"id": 3, "pk": "C"} + + source = DltSource(dlt.Schema("hintable"), "module", [with_table_hints]) + extract_step.extract(source, 20, 1) + + # def test_extract_pipe_from_unknown_resource(): # pass diff --git a/tests/extract/test_sources.py b/tests/extract/test_sources.py index d9c73dfb20..6ff1a0bf5f 100644 --- a/tests/extract/test_sources.py +++ b/tests/extract/test_sources.py @@ -12,6 +12,7 @@ from dlt.common.typing import TDataItems from dlt.extract import DltResource, DltSource, Incremental +from dlt.extract.items import TableNameMeta from dlt.extract.source import DltResourceDict from dlt.extract.exceptions import ( DataItemRequiredForDynamicTableHints, @@ -1362,6 +1363,57 @@ def empty_gen(): assert table["columns"]["tags"] == {"name": "tags"} +def test_apply_hints_table_variants() -> None: + def empty_gen(): + yield [1, 2, 3] + + empty = DltResource.from_data(empty_gen) + + # table name must be a string + with pytest.raises(ValueError): + empty.apply_hints(write_disposition="append", create_table_variant=True) + with pytest.raises(ValueError): + empty.apply_hints( + table_name=lambda ev: ev["t"], write_disposition="append", create_table_variant=True + ) + + # table a with replace + empty.apply_hints(table_name="table_a", write_disposition="replace", create_table_variant=True) + table_a = empty.compute_table_schema(meta=TableNameMeta("table_a")) + assert table_a["name"] == "table_a" + assert table_a["write_disposition"] == "replace" + + # unknown table (without variant) - created out resource hints + table_unk = empty.compute_table_schema(meta=TableNameMeta("table_unk")) + assert table_unk["name"] == "empty_gen" + assert table_unk["write_disposition"] == "append" + + # resource hints are base for table variants + empty.apply_hints( + primary_key="id", + incremental=dlt.sources.incremental(cursor_path="x"), + columns=[{"name": "id", "data_type": "bigint"}], + ) + empty.apply_hints(table_name="table_b", write_disposition="merge", create_table_variant=True) + table_b = empty.compute_table_schema(meta=TableNameMeta("table_b")) + assert table_b["name"] == "table_b" + assert table_b["write_disposition"] == "merge" + assert len(table_b["columns"]) == 1 + assert table_b["columns"]["id"]["primary_key"] is True + # overwrite table_b, remove column def and primary_key + empty.apply_hints(table_name="table_b", columns=[], primary_key=(), create_table_variant=True) + table_b = empty.compute_table_schema(meta=TableNameMeta("table_b")) + assert table_b["name"] == "table_b" + assert table_b["write_disposition"] == "merge" + assert len(table_b["columns"]) == 0 + + # dyn hints not allowed + with pytest.raises(InconsistentTableTemplate): + empty.apply_hints( + table_name="table_b", write_disposition=lambda ev: ev["wd"], create_table_variant=True + ) + + def test_resource_no_template() -> None: empty = DltResource.from_data([1, 2, 3], name="table") assert empty.write_disposition == 
"append" diff --git a/tests/load/pipeline/test_drop.py b/tests/load/pipeline/test_drop.py index 8614af4734..afae1c22ca 100644 --- a/tests/load/pipeline/test_drop.py +++ b/tests/load/pipeline/test_drop.py @@ -56,7 +56,11 @@ def droppable_d( dlt.state()["data_from_d"] = {"foo1": {"bar": 1}, "foo2": {"bar": 2}} yield [dict(o=55), dict(o=22)] - return [droppable_a(), droppable_b(), droppable_c(), droppable_d()] + @dlt.resource(selected=True) + def droppable_no_state(): + yield [1, 2, 3] + + return [droppable_a(), droppable_b(), droppable_c(), droppable_d(), droppable_no_state] RESOURCE_TABLES = dict( @@ -64,8 +68,11 @@ def droppable_d( droppable_b=["droppable_b", "droppable_b__items"], droppable_c=["droppable_c", "droppable_c__items", "droppable_c__items__labels"], droppable_d=["droppable_d"], + droppable_no_state=["droppable_no_state"], ) +NO_STATE_RESOURCES = {"droppable_no_state"} + def assert_dropped_resources(pipeline: Pipeline, resources: List[str]) -> None: assert_dropped_resource_tables(pipeline, resources) @@ -95,7 +102,7 @@ def assert_dropped_resource_tables(pipeline: Pipeline, resources: List[str]) -> def assert_dropped_resource_states(pipeline: Pipeline, resources: List[str]) -> None: # Verify only requested resource keys are removed from state - all_resources = set(RESOURCE_TABLES.keys()) + all_resources = set(RESOURCE_TABLES.keys()) - NO_STATE_RESOURCES expected_keys = all_resources - set(resources) sources_state = pipeline.state["sources"] result_keys = set(sources_state["droppable"]["resources"].keys()) @@ -109,6 +116,8 @@ def assert_destination_state_loaded(pipeline: Pipeline) -> None: destination_state = state_sync.load_pipeline_state_from_destination( pipeline.pipeline_name, client ) + # current pipeline schema available in the destination + client.get_stored_schema_by_hash(pipeline.default_schema.version_hash) pipeline_state = dict(pipeline.state) del pipeline_state["_local"] assert pipeline_state == destination_state @@ -144,8 +153,7 @@ def test_drop_command_resources_and_state(destination_config: DestinationTestCon "destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name ) def test_drop_command_only_state(destination_config: DestinationTestConfiguration) -> None: - """Test the drop command with resource and state path options and - verify correct data is deleted from destination and locally""" + """Test drop command that deletes part of the state and syncs with destination""" source = droppable_source() pipeline = destination_config.setup_pipeline("drop_test_" + uniq_id(), full_refresh=True) pipeline.run(source) @@ -164,6 +172,28 @@ def test_drop_command_only_state(destination_config: DestinationTestConfiguratio assert_destination_state_loaded(pipeline) +@pytest.mark.parametrize( + "destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name +) +def test_drop_command_only_tables(destination_config: DestinationTestConfiguration) -> None: + """Test drop only tables and makes sure that schema and state are synced""" + source = droppable_source() + pipeline = destination_config.setup_pipeline("drop_test_" + uniq_id(), full_refresh=True) + pipeline.run(source) + sources_state = pipeline.state["sources"] + + attached = _attach(pipeline) + helpers.drop(attached, resources=["droppable_no_state"]) + + attached = _attach(pipeline) + + assert_dropped_resources(attached, ["droppable_no_state"]) + # source state didn't change + assert pipeline.state["sources"] == sources_state + + assert_destination_state_loaded(pipeline) 
+ + @pytest.mark.parametrize( "destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name ) @@ -202,7 +232,7 @@ def test_fail_after_drop_tables(destination_config: DestinationTestConfiguration attached = _attach(pipeline) with mock.patch.object( - helpers.DropCommand, "_drop_state_keys", side_effect=RuntimeError("Something went wrong") + helpers.DropCommand, "_extract_state", side_effect=RuntimeError("Something went wrong") ): with pytest.raises(RuntimeError): helpers.drop(attached, resources=("droppable_a", "droppable_b")) diff --git a/tests/load/pipeline/test_pipelines.py b/tests/load/pipeline/test_pipelines.py index 05c70e2f62..017bef2c01 100644 --- a/tests/load/pipeline/test_pipelines.py +++ b/tests/load/pipeline/test_pipelines.py @@ -6,13 +6,16 @@ import dlt -from dlt.common.pipeline import SupportsPipeline from dlt.common import json, sleep +from dlt.common.pipeline import SupportsPipeline from dlt.common.destination import Destination +from dlt.common.destination.exceptions import DestinationHasFailedJobs +from dlt.common.schema.exceptions import CannotCoerceColumnException from dlt.common.schema.schema import Schema from dlt.common.schema.typing import VERSION_TABLE_NAME from dlt.common.typing import TDataItem from dlt.common.utils import uniq_id + from dlt.destinations.exceptions import DatabaseUndefinedRelation from dlt.extract.exceptions import ResourceNameMissing from dlt.extract import DltSource @@ -21,8 +24,6 @@ PipelineConfigMissing, PipelineStepFailed, ) -from dlt.common.schema.exceptions import CannotCoerceColumnException -from dlt.common.exceptions import DestinationHasFailedJobs from tests.utils import TEST_STORAGE_ROOT, data_to_item_format, preserve_environ from tests.pipeline.utils import assert_data_table_counts, assert_load_info diff --git a/tests/load/pipeline/test_restore_state.py b/tests/load/pipeline/test_restore_state.py index 02da91cefe..e50654adcc 100644 --- a/tests/load/pipeline/test_restore_state.py +++ b/tests/load/pipeline/test_restore_state.py @@ -6,12 +6,12 @@ import dlt from dlt.common import pendulum -from dlt.common.schema.schema import Schema, utils -from dlt.common.utils import custom_environ, uniq_id -from dlt.common.exceptions import DestinationUndefinedEntity +from dlt.common.schema.schema import Schema +from dlt.common.utils import uniq_id +from dlt.common.destination.exceptions import DestinationUndefinedEntity + from dlt.load import Load from dlt.pipeline.exceptions import SqlClientNotAvailable - from dlt.pipeline.pipeline import Pipeline from dlt.pipeline.state_sync import ( STATE_TABLE_COLUMNS, @@ -207,6 +207,7 @@ def _make_dn_name(schema_name: str) -> str: job_client ) == default_schema.naming.normalize_table_identifier(dataset_name) schema_two = Schema("two") + schema_two._bump_version() with p._get_destination_clients(schema_two)[0] as job_client: # use the job_client to do that job_client.initialize_storage() diff --git a/tests/load/test_job_client.py b/tests/load/test_job_client.py index 63f9d3c28d..2e23086f81 100644 --- a/tests/load/test_job_client.py +++ b/tests/load/test_job_client.py @@ -111,7 +111,7 @@ def test_get_update_basic_schema(client: SqlJobClientBase) -> None: # modify schema schema.tables["event_slot"]["write_disposition"] = "replace" - schema.bump_version() + schema._bump_version() assert schema.version > this_schema.version # update in storage @@ -126,7 +126,7 @@ def test_get_update_basic_schema(client: SqlJobClientBase) -> None: # in that case the version will not change or go down 
first_schema = Schema.from_dict(json.loads(first_version_schema)) first_schema.tables["event_bot"]["write_disposition"] = "replace" - first_schema.bump_version() + first_schema._bump_version() assert first_schema.version == this_schema.version == 2 # wait to make load_newest_schema deterministic sleep(0.1) @@ -143,7 +143,7 @@ def test_get_update_basic_schema(client: SqlJobClientBase) -> None: # mock other schema in client and get the newest schema. it should not exist... client.schema = Schema("ethereum") assert client.get_stored_schema() is None - client.schema.bump_version() + client.schema._bump_version() schema_update = client.update_stored_schema() # no schema updates because schema has no tables assert schema_update == {} @@ -206,7 +206,7 @@ def test_schema_update_create_table_redshift(client: SqlJobClientBase) -> None: record_hash = schema._infer_column("_dlt_id", "m,i0392903jdlkasjdlk") assert record_hash["unique"] is True schema.update_table(new_table(table_name, columns=[timestamp, sender_id, record_hash])) - schema.bump_version() + schema._bump_version() schema_update = client.update_stored_schema() # check hints in schema update table_update = schema_update[table_name]["columns"] @@ -233,7 +233,7 @@ def test_schema_update_create_table_bigquery(client: SqlJobClientBase) -> None: # this will be not null record_hash = schema._infer_column("_dlt_id", "m,i0392903jdlkasjdlk") schema.update_table(new_table("event_test_table", columns=[timestamp, sender_id, record_hash])) - schema.bump_version() + schema._bump_version() schema_update = client.update_stored_schema() # check hints in schema update table_update = schema_update["event_test_table"]["columns"] @@ -259,7 +259,7 @@ def test_schema_update_alter_table(client: SqlJobClientBase) -> None: col1 = schema._infer_column("col1", "string") table_name = "event_test_table" + uniq_id() schema.update_table(new_table(table_name, columns=[col1])) - schema.bump_version() + schema._bump_version() schema_update = client.update_stored_schema() assert table_name in schema_update assert len(schema_update[table_name]["columns"]) == 1 @@ -267,7 +267,7 @@ def test_schema_update_alter_table(client: SqlJobClientBase) -> None: # with single alter table col2 = schema._infer_column("col2", 1) schema.update_table(new_table(table_name, columns=[col2])) - schema.bump_version() + schema._bump_version() schema_update = client.update_stored_schema() assert len(schema_update) == 1 assert len(schema_update[table_name]["columns"]) == 1 @@ -278,7 +278,7 @@ def test_schema_update_alter_table(client: SqlJobClientBase) -> None: col4 = schema._infer_column("col4", 182879721.182912) col4["data_type"] = "timestamp" schema.update_table(new_table(table_name, columns=[col3, col4])) - schema.bump_version() + schema._bump_version() schema_update = client.update_stored_schema() assert len(schema_update[table_name]["columns"]) == 2 assert schema_update[table_name]["columns"]["col3"]["data_type"] == "double" @@ -297,7 +297,7 @@ def test_drop_tables(client: SqlJobClientBase) -> None: # Add columns in all tables schema.tables["event_user"]["columns"] = dict(schema.tables["event_slot"]["columns"]) schema.tables["event_bot"]["columns"] = dict(schema.tables["event_slot"]["columns"]) - schema.bump_version() + schema._bump_version() client.update_stored_schema() # Create a second schema with 2 hashes @@ -312,10 +312,10 @@ def test_drop_tables(client: SqlJobClientBase) -> None: schema_2.tables[tbl_name + "_2"]["name"] = tbl_name + "_2" client.schema = schema_2 - 
client.schema.bump_version() + client.schema._bump_version() client.update_stored_schema() client.schema.tables["event_slot_2"]["columns"]["value"]["nullable"] = False - client.schema.bump_version() + client.schema._bump_version() client.update_stored_schema() # Drop tables from the first schema @@ -323,7 +323,7 @@ def test_drop_tables(client: SqlJobClientBase) -> None: tables_to_drop = ["event_slot", "event_user"] for tbl in tables_to_drop: del schema.tables[tbl] - schema.bump_version() + schema._bump_version() client.drop_tables(*tables_to_drop) if isinstance(client, WithStagingDataset): with contextlib.suppress(DatabaseUndefinedRelation): @@ -363,7 +363,7 @@ def test_get_storage_table_with_all_types(client: SqlJobClientBase) -> None: schema = client.schema table_name = "event_test_table" + uniq_id() schema.update_table(new_table(table_name, columns=TABLE_UPDATE)) - schema.bump_version() + schema._bump_version() schema_update = client.update_stored_schema() # we have all columns in the update table_update = schema_update[table_name]["columns"] @@ -407,7 +407,7 @@ def test_preserve_column_order(client: SqlJobClientBase) -> None: random.shuffle(columns) schema.update_table(new_table(table_name, columns=columns)) - schema.bump_version() + schema._bump_version() def _assert_columns_order(sql_: str) -> None: idx = 0 @@ -514,7 +514,7 @@ def test_load_with_all_types( table_name, write_disposition=write_disposition, columns=list(column_schemas.values()) ) ) - client.schema.bump_version() + client.schema._bump_version() client.update_stored_schema() if client.should_load_data_to_staging_dataset(client.schema.tables[table_name]): # type: ignore[attr-defined] @@ -569,7 +569,7 @@ def test_write_dispositions( client.schema.update_table( new_table(child_table, columns=TABLE_UPDATE, parent_table_name=table_name) ) - client.schema.bump_version() + client.schema._bump_version() client.update_stored_schema() if write_disposition == "merge": @@ -578,7 +578,7 @@ def test_write_dispositions( # create staging for merge dataset with client.with_staging_dataset(): # type: ignore[attr-defined] client.initialize_storage() - client.schema.bump_version() + client.schema._bump_version() client.update_stored_schema() for idx in range(2): # in the replace strategies, tables get truncated between loads @@ -728,7 +728,7 @@ def _load_something(_client: SqlJobClientBase, expected_rows: int) -> None: user_table = load_table("event_user")["event_user"] client.schema.update_table(new_table("event_user", columns=list(user_table.values()))) - client.schema.bump_version() + client.schema._bump_version() schema_update = client.update_stored_schema() assert len(schema_update) > 0 @@ -741,7 +741,7 @@ def _load_something(_client: SqlJobClientBase, expected_rows: int) -> None: event_2_schema = Schema.from_stored_schema(schema_dict) # swap schemas in client instance client.schema = event_2_schema - client.schema.bump_version() + client.schema._bump_version() schema_update = client.update_stored_schema() # no were detected - even if the schema is new. all the tables overlap assert schema_update == {} @@ -760,7 +760,7 @@ def _load_something(_client: SqlJobClientBase, expected_rows: int) -> None: event_3_schema.tables["event_user"]["columns"]["input_channel"]["nullable"] = False # swap schemas in client instance client.schema = event_3_schema - client.schema.bump_version() + client.schema._bump_version() schema_update = client.update_stored_schema() # no were detected - even if the schema is new. 
     assert schema_update == {}
@@ -771,7 +771,7 @@ def _load_something(_client: SqlJobClientBase, expected_rows: int) -> None:
     event_3_schema.tables["event_user"]["columns"]["mandatory_column"] = new_column(
         "mandatory_column", "text", nullable=False
     )
-    client.schema.bump_version()
+    client.schema._bump_version()
     with pytest.raises(DatabaseException) as py_ex:
         client.update_stored_schema()
     assert (
@@ -788,6 +788,6 @@ def prepare_schema(client: SqlJobClientBase, case: str) -> Tuple[List[Dict[str,
     table: TTableSchemaColumns = {k: client.schema._infer_column(k, v) for k, v in rows[0].items()}
     table_name = f"event_{case}_{uniq_id()}"
     client.schema.update_table(new_table(table_name, columns=list(table.values())))
-    client.schema.bump_version()
+    client.schema._bump_version()
     client.update_stored_schema()
     return rows, table_name
diff --git a/tests/load/test_sql_client.py b/tests/load/test_sql_client.py
index 026e481ede..d82925a7d3 100644
--- a/tests/load/test_sql_client.py
+++ b/tests/load/test_sql_client.py
@@ -5,17 +5,17 @@
 from time import sleep

 from dlt.common import pendulum, Decimal
-from dlt.common.exceptions import IdentifierTooLongException
+from dlt.common.destination.exceptions import IdentifierTooLongException
 from dlt.common.schema.typing import LOADS_TABLE_NAME, VERSION_TABLE_NAME
 from dlt.common.storages import FileStorage
-from dlt.common.utils import derives_from_class_of_name, uniq_id
+from dlt.common.utils import uniq_id
+
 from dlt.destinations.exceptions import (
     DatabaseException,
     DatabaseTerminalException,
     DatabaseTransientException,
     DatabaseUndefinedRelation,
 )
-
 from dlt.destinations.sql_client import DBApiCursor, SqlClientBase
 from dlt.destinations.job_client_impl import SqlJobClientBase
 from dlt.destinations.typing import TNativeConn
@@ -570,7 +570,7 @@ def test_max_column_identifier_length(client: SqlJobClientBase) -> None:
 def test_recover_on_explicit_tx(client: SqlJobClientBase) -> None:
     if client.capabilities.supports_transactions is False:
         pytest.skip("Destination does not support tx")
-    client.schema.bump_version()
+    client.schema._bump_version()
     client.update_stored_schema()
     version_table = client.sql_client.make_qualified_table_name("_dlt_version")
     # simple syntax error
diff --git a/tests/load/utils.py b/tests/load/utils.py
index 7b4cf72b47..8c5eda6d3b 100644
--- a/tests/load/utils.py
+++ b/tests/load/utils.py
@@ -459,7 +459,7 @@ def prepare_table(
     table_name: str = "event_user",
     make_uniq_table: bool = True,
 ) -> str:
-    client.schema.bump_version()
+    client.schema._bump_version()
     client.update_stored_schema()
     user_table = load_table(case_name)[table_name]
     if make_uniq_table:
@@ -467,7 +467,7 @@ def prepare_table(
     else:
         user_table_name = table_name
     client.schema.update_table(new_table(user_table_name, columns=list(user_table.values())))
-    client.schema.bump_version()
+    client.schema._bump_version()
     client.update_stored_schema()
     return user_table_name

diff --git a/tests/load/weaviate/test_weaviate_client.py b/tests/load/weaviate/test_weaviate_client.py
index 48153f7706..3f966c2330 100644
--- a/tests/load/weaviate/test_weaviate_client.py
+++ b/tests/load/weaviate/test_weaviate_client.py
@@ -76,7 +76,7 @@ def test_all_data_types(
     client.schema.update_table(
         new_table(class_name, write_disposition=write_disposition, columns=TABLE_UPDATE)
     )
-    client.schema.bump_version()
+    client.schema._bump_version()
     client.update_stored_schema()

     # write row
@@ -113,7 +113,7 @@ def test_case_sensitive_properties_create(client: WeaviateClient) -> None:
     client.schema.update_table(
         client.schema.normalize_table_identifiers(new_table(class_name, columns=table_create))
     )
-    client.schema.bump_version()
+    client.schema._bump_version()
     with pytest.raises(PropertyNameConflict):
         client.update_stored_schema()

@@ -128,7 +128,7 @@ def test_case_insensitive_properties_create(ci_client: WeaviateClient) -> None:
     ci_client.schema.update_table(
         ci_client.schema.normalize_table_identifiers(new_table(class_name, columns=table_create))
     )
-    ci_client.schema.bump_version()
+    ci_client.schema._bump_version()
     ci_client.update_stored_schema()
     _, table_columns = ci_client.get_storage_table("ColClass")
     # later column overwrites earlier one so: double
@@ -145,13 +145,13 @@ def test_case_sensitive_properties_add(client: WeaviateClient) -> None:
     client.schema.update_table(
         client.schema.normalize_table_identifiers(new_table(class_name, columns=table_create))
     )
-    client.schema.bump_version()
+    client.schema._bump_version()
     client.update_stored_schema()

     client.schema.update_table(
         client.schema.normalize_table_identifiers(new_table(class_name, columns=table_update))
     )
-    client.schema.bump_version()
+    client.schema._bump_version()
     with pytest.raises(PropertyNameConflict):
         client.update_stored_schema()

@@ -166,7 +166,7 @@ def test_load_case_sensitive_data(client: WeaviateClient, file_storage: FileStor
         "col1": {"name": "col1", "data_type": "bigint", "nullable": False}
     }
     client.schema.update_table(new_table(class_name, columns=[table_create["col1"]]))
-    client.schema.bump_version()
+    client.schema._bump_version()
     client.update_stored_schema()
     # prepare a data item where is name clash due to Weaviate being CI
     data_clash = {"col1": 72187328, "coL1": 726171}
@@ -185,7 +185,7 @@ def test_load_case_sensitive_data_ci(ci_client: WeaviateClient, file_storage: Fi
         "col1": {"name": "col1", "data_type": "bigint", "nullable": False}
     }
     ci_client.schema.update_table(new_table(class_name, columns=[table_create["col1"]]))
-    ci_client.schema.bump_version()
+    ci_client.schema._bump_version()
     ci_client.update_stored_schema()
     # prepare a data item where is name clash due to Weaviate being CI
     # but here we normalize the item
diff --git a/tests/pipeline/test_import_export_schema.py b/tests/pipeline/test_import_export_schema.py
index b1c2284f24..6f40e1d1eb 100644
--- a/tests/pipeline/test_import_export_schema.py
+++ b/tests/pipeline/test_import_export_schema.py
@@ -2,6 +2,7 @@

 from dlt.common.utils import uniq_id

+from tests.pipeline.utils import assert_load_info
 from tests.utils import TEST_STORAGE_ROOT
 from dlt.common.schema import Schema
 from dlt.common.storages.schema_storage import SchemaStorage
@@ -83,7 +84,17 @@ def test_import_schema_is_respected() -> None:
     p.run(EXAMPLE_DATA, table_name="person")
+    # initial schema + evolved in normalize == version 2
+    assert p.default_schema.stored_version == 2
     assert p.default_schema.tables["person"]["columns"]["id"]["data_type"] == "bigint"
+    # import schema got saved
+    import_schema = _get_import_schema(name)
+    assert "person" in import_schema.tables
+    # initial schema (after extract) got saved
+    assert import_schema.stored_version == 1
+    # import schema hash is set
+    assert p.default_schema._imported_version_hash == import_schema.version_hash
+    assert not p.default_schema.is_modified

     # take default schema, modify column type and save it to import folder
     modified_schema = p.default_schema.clone()
@@ -91,14 +102,12 @@ def test_import_schema_is_respected() -> None:
     with open(os.path.join(IMPORT_SCHEMA_PATH, name + ".schema.yaml"), "w", encoding="utf-8") as f:
         f.write(modified_schema.to_pretty_yaml())

-    # this will provoke a CannotCoerceColumnException
-    with pytest.raises(PipelineStepFailed) as exc:
-        p.run(EXAMPLE_DATA, table_name="person")
-    assert type(exc.value.exception) == CannotCoerceColumnException
-
-    # schema is changed
+    # import schema will be imported into pipeline
+    p.run(EXAMPLE_DATA, table_name="person")
+    # again: extract + normalize
+    assert p.default_schema.stored_version == 3
+    # change in pipeline schema
     assert p.default_schema.tables["person"]["columns"]["id"]["data_type"] == "text"
-    # import schema is not overwritten
     assert _get_import_schema(name).tables["person"]["columns"]["id"]["data_type"] == "text"
@@ -110,7 +119,15 @@ def test_import_schema_is_respected() -> None:
         export_schema_path=EXPORT_SCHEMA_PATH,
         full_refresh=True,
     )
-    p.run(EXAMPLE_DATA, table_name="person")
+    p.extract(EXAMPLE_DATA, table_name="person")
+    # starts with import schema v 1 that is dirty -> 2
+    assert p.default_schema.stored_version == 3
+    p.normalize()
+    assert p.default_schema.stored_version == 3
+    info = p.load()
+    assert_load_info(info)
+    assert p.default_schema.stored_version == 3
+    assert p.default_schema.tables["person"]["columns"]["id"]["data_type"] == "text"

     # import schema is not overwritten
diff --git a/tests/pipeline/test_pipeline.py b/tests/pipeline/test_pipeline.py
index 2f221ac8a0..37356c2b44 100644
--- a/tests/pipeline/test_pipeline.py
+++ b/tests/pipeline/test_pipeline.py
@@ -20,12 +20,12 @@
 from dlt.common.configuration.specs.gcp_credentials import GcpOAuthCredentials
 from dlt.common.destination import DestinationCapabilitiesContext
 from dlt.common.destination.reference import WithStateSync
-from dlt.common.exceptions import (
+from dlt.common.destination.exceptions import (
     DestinationHasFailedJobs,
     DestinationTerminalException,
-    PipelineStateNotAvailable,
     UnknownDestinationModule,
 )
+from dlt.common.exceptions import PipelineStateNotAvailable
 from dlt.common.pipeline import LoadInfo, PipelineContext
 from dlt.common.runtime.collector import LogCollector
 from dlt.common.schema.utils import new_column, new_table
@@ -441,6 +441,86 @@ def with_mark():
     assert p.default_schema.tables["spec_table"]["resource"] == "with_mark"

+
+def test_mark_hints_with_variant() -> None:
+    @dlt.resource(primary_key="pk")
+    def with_table_hints():
+        # dispatch to table a
+        yield dlt.mark.with_hints(
+            {"id": 1, "pk": "A"},
+            dlt.mark.make_hints(
+                table_name="table_a", columns=[{"name": "id", "data_type": "bigint"}]
+            ),
+            create_table_variant=True,
+        )
+
+        # dispatch to table b
+        yield dlt.mark.with_hints(
+            {"id": 2, "pk": "B"},
+            dlt.mark.make_hints(table_name="table_b", write_disposition="replace"),
+            create_table_variant=True,
+        )
+
+        # item to resource
+        yield {"id": 3, "pk": "C"}
+        # table a with table_hints
+        yield dlt.mark.with_table_name({"id": 4, "pk": "D"}, "table_a")
+        # table b with table_hints
+        yield dlt.mark.with_table_name({"id": 5, "pk": "E"}, "table_b")
+
+    pipeline_name = "pipe_" + uniq_id()
+    pipeline = dlt.pipeline(pipeline_name=pipeline_name, destination="duckdb")
+    info = pipeline.run(with_table_hints)
+    assert_load_info(info)
+    assert pipeline.last_trace.last_normalize_info.row_counts == {
+        "_dlt_pipeline_state": 1,
+        "table_a": 2,
+        "table_b": 2,
+        "with_table_hints": 1,
+    }
+    # check table counts
+    assert_data_table_counts(pipeline, {"table_a": 2, "table_b": 2, "with_table_hints": 1})
+
+
+def test_mark_hints_variant_dynamic_name() -> None:
+    @dlt.resource(table_name=lambda item: "table_" + item["tag"])
+    def with_table_hints():
+        # dispatch to table a
+        yield dlt.mark.with_hints(
+            {"id": 1, "pk": "A", "tag": "a"},
+            dlt.mark.make_hints(
+                table_name="table_a",
+                primary_key="pk",
+                columns=[{"name": "id", "data_type": "bigint"}],
+            ),
+            create_table_variant=True,
+        )
+
+        # dispatch to table b
+        yield dlt.mark.with_hints(
+            {"id": 2, "pk": "B", "tag": "b"},
+            dlt.mark.make_hints(table_name="table_b", write_disposition="replace"),
+            create_table_variant=True,
+        )
+
+        # dispatch by tag
+        yield {"id": 3, "pk": "C", "tag": "c"}
+        yield {"id": 4, "pk": "D", "tag": "a"}
+        yield {"id": 5, "pk": "E", "tag": "b"}
+
+    pipeline_name = "pipe_" + uniq_id()
+    pipeline = dlt.pipeline(pipeline_name=pipeline_name, destination="duckdb")
+    info = pipeline.run(with_table_hints)
+    assert_load_info(info)
+    assert pipeline.last_trace.last_normalize_info.row_counts == {
+        "_dlt_pipeline_state": 1,
+        "table_a": 2,
+        "table_b": 2,
+        "table_c": 1,
+    }
+    # check table counts
+    assert_data_table_counts(pipeline, {"table_a": 2, "table_b": 2, "table_c": 1})
+
+
 def test_restore_state_on_dummy() -> None:
     os.environ["COMPLETED_PROB"] = "1.0"  # make it complete immediately
@@ -952,6 +1032,73 @@ def reverse_order(item):
     ]


+def test_preserve_new_fields_order_on_append() -> None:
+    pipeline_name = "pipe_" + uniq_id()
+    p = dlt.pipeline(pipeline_name=pipeline_name, destination="dummy")
+
+    item = {"c1": 1, "c2": 2, "c3": "list"}
+    p.extract([item], table_name="order_1")
+    p.normalize()
+    assert list(p.default_schema.get_table_columns("order_1").keys()) == [
+        "c1",
+        "c2",
+        "c3",
+        "_dlt_load_id",
+        "_dlt_id",
+    ]
+
+    # add columns
+    item = {"c1": 1, "c4": 2.0, "c3": "list", "c5": {"x": 1}}
+    p.extract([item], table_name="order_1")
+    p.normalize()
+    assert list(p.default_schema.get_table_columns("order_1").keys()) == [
+        "c1",
+        "c2",
+        "c3",
+        "_dlt_load_id",
+        "_dlt_id",
+        "c4",
+        "c5__x",
+    ]
+
+
+def test_preserve_fields_order_incomplete_columns() -> None:
+    p = dlt.pipeline(pipeline_name="column_order", destination="dummy")
+    # incomplete columns (without data type) will be added in order of fields in data
+
+    @dlt.resource(columns={"c3": {"precision": 32}}, primary_key="c2")
+    def items():
+        yield {"c1": 1, "c2": 1, "c3": 1}
+
+    p.extract(items)
+    p.normalize()
+    assert list(p.default_schema.get_table_columns("items").keys()) == [
+        "c1",
+        "c2",
+        "c3",
+        "_dlt_load_id",
+        "_dlt_id",
+    ]
+
+    # complete columns preserve order in "columns"
+    p = p.drop()
+
+    @dlt.resource(columns={"c3": {"precision": 32, "data_type": "decimal"}}, primary_key="c1")
+    def items2():
+        yield {"c1": 1, "c2": 1, "c3": 1}
+
+    p.extract(items2)
+    p.normalize()
+    # c3 was first so goes first
+    assert list(p.default_schema.get_table_columns("items2").keys()) == [
+        "c3",
+        "c1",
+        "c2",
+        "_dlt_load_id",
+        "_dlt_id",
+    ]
+
+
 def test_pipeline_log_progress() -> None:
     os.environ["TIMEOUT"] = "3.0"
@@ -1269,7 +1416,7 @@ def test_drop_with_new_name() -> None:
     assert new_pipeline.pipeline_name == new_test_name


-def test_remove_autodetect() -> None:
+def test_schema_version_increase_and_source_update() -> None:
     now = pendulum.now()

     @dlt.source
@@ -1282,12 +1429,81 @@ def autodetect():
         )

     pipeline = dlt.pipeline(destination="duckdb")
+    # control version of the schema
+    auto_source = autodetect()
+    assert auto_source.schema.stored_version is None
+    pipeline.extract(auto_source)
+    # extract did a first save
+    assert pipeline.default_schema.stored_version == 1
+    # only one prev hash
+    assert len(pipeline.default_schema.previous_hashes) == 1
+    # source schema was updated in the pipeline
+    assert auto_source.schema.stored_version == 1
+    # source has pipeline schema
+    assert pipeline.default_schema is auto_source.schema
+
+    pipeline.normalize()
+    # columns added and schema was saved in between
+    assert pipeline.default_schema.stored_version == 2
+    assert len(pipeline.default_schema.previous_hashes) == 2
+    # source schema still updated
+    assert auto_source.schema.stored_version == 2
+    assert pipeline.default_schema is auto_source.schema
+    pipeline.load()
+    # nothing changed in load
+    assert pipeline.default_schema.stored_version == 2
+    assert pipeline.default_schema is auto_source.schema
+
+    # run same source again
+    pipeline.extract(auto_source)
+    assert pipeline.default_schema.stored_version == 2
+    assert pipeline.default_schema is auto_source.schema
+    pipeline.normalize()
+    assert pipeline.default_schema.stored_version == 2
+    pipeline.load()
+    assert pipeline.default_schema.stored_version == 2
+
+    # run another instance of the same source
     pipeline.run(autodetect())
+    assert pipeline.default_schema.stored_version == 2
+    assert pipeline.default_schema is auto_source.schema
+    assert "timestamp" in pipeline.default_schema.settings["detections"]
+
+    # data has compatible schema with "numbers" but schema is taken from pipeline
+    pipeline.run([1, 2, 3], table_name="numbers")
+    assert "timestamp" in pipeline.default_schema.settings["detections"]
+    assert pipeline.default_schema.stored_version == 2
+    assert pipeline.default_schema is auto_source.schema
+
+    # new table will evolve schema
+    pipeline.run([1, 2, 3], table_name="seq")
+    assert "timestamp" in pipeline.default_schema.settings["detections"]
+    assert pipeline.default_schema.stored_version == 4
+    assert pipeline.default_schema is auto_source.schema
+
+
+def test_remove_autodetect() -> None:
+    now = pendulum.now()
+
+    @dlt.source
+    def autodetect():
+        # add unix ts autodetection to current source schema
+        dlt.current.source_schema().add_type_detection("timestamp")
+        return dlt.resource(
+            [int(now.timestamp()), int(now.timestamp() + 1), int(now.timestamp() + 2)],
+            name="numbers",
+        )
+
+    pipeline = dlt.pipeline(destination="duckdb")
+    auto_source = autodetect()
+    pipeline.extract(auto_source)
+    pipeline.normalize()
     # unix ts recognized
     assert (
         pipeline.default_schema.get_table("numbers")["columns"]["value"]["data_type"]
         == "timestamp"
     )
+    pipeline.load()

     pipeline = pipeline.drop()
@@ -1388,8 +1604,13 @@ def test_pipeline_list_packages() -> None:
     )
     load_ids = pipeline.list_extracted_load_packages()
     assert len(load_ids) == 3
+    extracted_package = pipeline.get_load_package_info(load_ids[1])
+    assert extracted_package.schema_name == "airtable_emojis"
+    extracted_package = pipeline.get_load_package_info(load_ids[2])
+    assert extracted_package.schema_name == "emojis_2"
     extracted_package = pipeline.get_load_package_info(load_ids[0])
     assert extracted_package.state == "extracted"
+    assert extracted_package.schema_name == "airtable_emojis"
     # same load id continues till the end
     pipeline.normalize()
     load_ids_n = pipeline.list_normalized_load_packages()
diff --git a/tests/pipeline/test_pipeline_file_format_resolver.py b/tests/pipeline/test_pipeline_file_format_resolver.py
index 49a38c455b..588ad720a5 100644
--- a/tests/pipeline/test_pipeline_file_format_resolver.py
+++ b/tests/pipeline/test_pipeline_file_format_resolver.py
@@ -3,7 +3,7 @@
 import dlt
 import pytest

-from dlt.common.exceptions import (
+from dlt.common.destination.exceptions import (
     DestinationIncompatibleLoaderFileFormatException,
     DestinationLoadingViaStagingNotSupported,
     DestinationNoStagingMode,