SCD2 support #1168
Changes from 10 commits
File: dlt/common/schema/typing.py

```diff
@@ -7,7 +7,6 @@
     Optional,
     Sequence,
     Set,
-    Tuple,
     Type,
     TypedDict,
     NewType,
@@ -64,7 +63,6 @@
     "dedup_sort",
 ]
 """Known hints of a column used to declare hint regexes."""
-TWriteDisposition = Literal["skip", "append", "replace", "merge"]
 TTableFormat = Literal["iceberg", "parquet", "jsonl"]
 TTypeDetections = Literal[
     "timestamp", "iso_timestamp", "iso_date", "large_integer", "hexbytes_to_text", "wei_to_double"
@@ -86,7 +84,6 @@
         "root_key",
     ]
 )
-WRITE_DISPOSITIONS: Set[TWriteDisposition] = set(get_args(TWriteDisposition))


 class TColumnType(TypedDict, total=False):
@@ -155,6 +152,26 @@ class NormalizerInfo(TypedDict, total=True):
     new_table: bool


+TWriteDisposition = Literal["skip", "append", "replace", "merge"]
+TLoaderMergeStrategy = Literal["delete-insert", "scd2"]
+
+
+WRITE_DISPOSITIONS: Set[TWriteDisposition] = set(get_args(TWriteDisposition))
+MERGE_STRATEGIES: Set[TLoaderMergeStrategy] = set(get_args(TLoaderMergeStrategy))
+
+
+class TWriteDispositionDict(TypedDict):
+    mode: TWriteDisposition
+
+
+class TMergeDispositionDict(TWriteDispositionDict, total=False):
+    strategy: Optional[TLoaderMergeStrategy]
+    validity_column_names: Optional[List[str]]
```
Review comment: Should it allow duplicate column names?

Reply: It shouldn't, didn't think of that. Added validity column name checking in 30bb2e0. An exception is raised if a configured validity column name appears in the data.
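The check described in the reply might look roughly like this (a sketch with hypothetical function and message names; the actual implementation is in commit 30bb2e0):

```python
from typing import Sequence

def assert_no_validity_column_clash(
    validity_names: Sequence[str], data_columns: Sequence[str]
) -> None:
    # hypothetical sketch: reject duplicate configured validity column names
    # and configured names that already appear in the data
    if len(set(validity_names)) != len(list(validity_names)):
        raise ValueError("duplicate validity column names configured")
    clashes = set(validity_names) & set(data_columns)
    if clashes:
        raise ValueError(f"validity column name(s) {sorted(clashes)} found in data")
```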
```diff
+
+
+TWriteDispositionConfig = Union[TWriteDisposition, TWriteDispositionDict, TMergeDispositionDict]
+
+
 # TypedDict that defines properties of a table
```
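Based on the types above, a resource opting into `scd2` would pass a `TMergeDispositionDict`-shaped dict as its write disposition. A minimal sketch, assuming `dlt.resource` accepts the new `TWriteDispositionConfig` union and with illustrative column names:

```python
import dlt

@dlt.resource(
    write_disposition={
        "mode": "merge",
        "strategy": "scd2",
        "validity_column_names": ["valid_from", "valid_to"],
    }
)
def customers():
    # illustrative data; any iterable of records works
    yield [{"customer_id": 1, "name": "alice"}]
```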
File: dlt/destinations/sql_jobs.py

```diff
@@ -1,23 +1,33 @@
-from typing import Any, Callable, List, Sequence, Tuple, cast, TypedDict, Optional
+from typing import Any, List, Sequence, Tuple, cast, TypedDict, Optional

 import yaml
 from dlt.common.logger import pretty_format_exception

-from dlt.common.schema.typing import TTableSchema, TSortOrder
+from dlt.common import pendulum
+from dlt.common.schema.typing import (
+    TTableSchema,
+    TSortOrder,
+)
 from dlt.common.schema.utils import (
     get_columns_names_with_prop,
     get_first_column_name_with_prop,
     get_dedup_sort_tuple,
+    get_validity_column_names,
 )
 from dlt.common.storages.load_storage import ParsedLoadJobFileName
 from dlt.common.utils import uniq_id
 from dlt.common.destination.capabilities import DestinationCapabilitiesContext
 from dlt.destinations.exceptions import MergeDispositionException
 from dlt.destinations.job_impl import NewLoadJobImpl
 from dlt.destinations.sql_client import SqlClientBase
+from dlt.pipeline.current import load_package as current_load_package
+
+
+HIGH_TS = pendulum.datetime(9999, 12, 31)
+"""High timestamp used to indicate active records in `scd2` merge strategy."""


-class SqlJobParams(TypedDict):
+class SqlJobParams(TypedDict, total=False):
     replace: Optional[bool]
```
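The sentinel turns "is this record active?" into a plain equality test on the "valid to" column. A quick sketch (the column name here is hypothetical; the job reads the real names via `get_validity_column_names`):

```python
import pendulum

HIGH_TS = pendulum.datetime(9999, 12, 31)

def is_active(row: dict) -> bool:
    # active records carry the sentinel rather than a real retirement timestamp
    return row["valid_to"] == HIGH_TS.isoformat()

print(is_active({"valid_to": "9999-12-31T00:00:00+00:00"}))  # True
```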
```diff
@@ -40,7 +50,7 @@ def from_table_chain(

         The `table_chain` contains a list of table schemas with parent-child relationships, ordered by ancestry (the root of the tree is first on the list).
         """
-        params = cast(SqlJobParams, {**DEFAULTS, **(params or {})})  # type: ignore
+        params = cast(SqlJobParams, {**DEFAULTS, **(params or {})})
         top_table = table_chain[0]
         file_info = ParsedLoadJobFileName(
             top_table["name"], ParsedLoadJobFileName.new_file_id(), 0, "sql"
```
```diff
@@ -138,25 +148,16 @@ class SqlMergeJob(SqlBaseJob):
     failed_text: str = "Tried to generate a merge sql job for the following tables:"

     @classmethod
-    def generate_sql(
+    def generate_sql(  # type: ignore[return]
         cls,
         table_chain: Sequence[TTableSchema],
         sql_client: SqlClientBase[Any],
         params: Optional[SqlJobParams] = None,
     ) -> List[str]:
-        """Generates a list of sql statements that merge the data in the staging dataset with the data in the destination dataset.
-
-        The `table_chain` contains a list of table schemas with parent-child relationships, ordered by ancestry (the root of the tree is first on the list).
-        The root table is merged using the primary_key and merge_key hints, which can be compound and may both be specified. In that case an OR clause is generated.
-        The child tables are merged based on the propagated `root_key`, which is a type of foreign key that always leads to the root table.
-
-        First we store the root_keys of the root table elements to be deleted in a temp table. Then we use the temp table to delete records from the root and all child tables in the destination dataset.
-        At the end we copy the data from the staging dataset into the destination dataset.
-
-        If a hard_delete column is specified, records flagged as deleted are excluded from the copy into the destination dataset.
-        If a dedup_sort column is specified in conjunction with a primary key, records are sorted before deduplication so that the "latest" record remains.
-        """
-        return cls.gen_merge_sql(table_chain, sql_client)
+        if table_chain[0].get("x-merge-strategy") == "delete-insert":
+            return cls.gen_merge_sql(table_chain, sql_client)
+        elif table_chain[0].get("x-merge-strategy") == "scd2":
+            return cls.gen_scd2_sql(table_chain, sql_client)

     @classmethod
     def _gen_key_table_clauses(
```
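The dispatch keys off the root table's `x-merge-strategy` hint; the `# type: ignore[return]` reflects that no branch is taken when the hint is missing. A standalone sketch of the same logic, using hypothetical table dicts in place of `TTableSchema`:

```python
from typing import Any, Dict, Optional, Sequence

def pick_strategy(table_chain: Sequence[Dict[str, Any]]) -> Optional[str]:
    # mirror of the dispatch above: the root table's hint selects the generator
    strategy = table_chain[0].get("x-merge-strategy")
    if strategy in ("delete-insert", "scd2"):
        return strategy
    return None  # no hint means no generator (hence the type: ignore above)

print(pick_strategy([{"name": "customers", "x-merge-strategy": "scd2"}]))  # scd2
```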
```diff
@@ -333,6 +334,18 @@ def _to_temp_table(cls, select_sql: str, temp_table_name: str) -> str:
     def gen_merge_sql(
         cls, table_chain: Sequence[TTableSchema], sql_client: SqlClientBase[Any]
     ) -> List[str]:
+        """Generates a list of sql statements that merge the data in the staging dataset with the data in the destination dataset.
+
+        The `table_chain` contains a list of table schemas with parent-child relationships, ordered by ancestry (the root of the tree is first on the list).
+        The root table is merged using the primary_key and merge_key hints, which can be compound and may both be specified. In that case an OR clause is generated.
+        The child tables are merged based on the propagated `root_key`, which is a type of foreign key that always leads to the root table.
+
+        First we store the root_keys of the root table elements to be deleted in a temp table. Then we use the temp table to delete records from the root and all child tables in the destination dataset.
+        At the end we copy the data from the staging dataset into the destination dataset.
+
+        If a hard_delete column is specified, records flagged as deleted are excluded from the copy into the destination dataset.
+        If a dedup_sort column is specified in conjunction with a primary key, records are sorted before deduplication so that the "latest" record remains.
+        """
         sql: List[str] = []
         root_table = table_chain[0]
```
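The sort-before-dedup behavior mentioned in the docstring is typically rendered with a window function. A sketch of such a statement, with hypothetical table and column names rather than the exact SQL this job emits:

```python
# keep only the "latest" row per primary key, ordered by the dedup_sort column
dedup_select = """
    SELECT * FROM (
        SELECT *, ROW_NUMBER() OVER (
            PARTITION BY customer_id ORDER BY updated_at DESC
        ) AS _rn
        FROM staging_dataset.customers
    ) AS ranked
    WHERE _rn = 1;
"""
```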
```diff
@@ -478,3 +491,60 @@ def gen_merge_sql(

         sql.append(f"INSERT INTO {table_name}({col_str}) {select_sql};")
         return sql
+
+    @classmethod
+    def gen_scd2_sql(
+        cls, table_chain: Sequence[TTableSchema], sql_client: SqlClientBase[Any]
+    ) -> List[str]:
+        """Generates SQL statements for the `scd2` merge strategy.
+
+        The root table can be inserted into and updated.
+        Updates only take place when a record retires (because there is a new version
+        or it is deleted) and only affect the "valid to" column.
+        Child tables are insert-only.
+        """
+        sql: List[str] = []
+        root_table = table_chain[0]
+        root_table_name = sql_client.make_qualified_table_name(root_table["name"])
+        with sql_client.with_staging_dataset(staging=True):
+            staging_root_table_name = sql_client.make_qualified_table_name(root_table["name"])
+
+        # get validity column names
+        escape_id = sql_client.capabilities.escape_identifier
+        from_, to = list(map(escape_id, get_validity_column_names(root_table)))
+
+        # define values for validity columns
+        boundary_ts = current_load_package()["state"]["created_at"]
+        active_record_ts = HIGH_TS.isoformat()
+
+        # retire updated and deleted records
+        sql.append(f"""
+            UPDATE {root_table_name} SET {to} = '{boundary_ts}'
+            WHERE NOT EXISTS (
+                SELECT s._dlt_id FROM {staging_root_table_name} AS s
+                WHERE {root_table_name}._dlt_id = s._dlt_id
+            ) AND {to} = '{active_record_ts}';
+        """)
```

Review comment: let's add x-row-hash hint that you can attach to …

Reply: We could handle three user input cases: … I suppose we would need to restrict …

Reply: Added …
```diff
+
+        # insert new active records in root table
+        columns = map(escape_id, list(root_table["columns"].keys()))
+        col_str = ", ".join([c for c in columns if c not in (from_, to)])
+
+        sql.append(f"""
+            INSERT INTO {root_table_name} ({col_str}, {from_}, {to})
+            SELECT {col_str}, '{boundary_ts}' AS {from_}, '{active_record_ts}' AS {to}
+            FROM {staging_root_table_name} AS s
+            WHERE NOT EXISTS (SELECT s._dlt_id FROM {root_table_name} AS f WHERE f._dlt_id = s._dlt_id);
+        """)
```
```diff
+
+        # insert list elements for new active records in child tables
+        for table in table_chain[1:]:
+            table_name = sql_client.make_qualified_table_name(table["name"])
+            with sql_client.with_staging_dataset(staging=True):
+                staging_table_name = sql_client.make_qualified_table_name(table["name"])
+            sql.append(f"""
+                INSERT INTO {table_name}
+                SELECT *
+                FROM {staging_table_name} AS s
+                WHERE NOT EXISTS (SELECT s._dlt_id FROM {table_name} AS f WHERE f._dlt_id = s._dlt_id);
+            """)
+        return sql
```
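Putting the pieces together: for a chain with one child table, the returned list holds three statements executed in order. A schematic sketch with hypothetical table names and the SQL bodies elided:

```python
sql = [
    # 1. retire root-table rows that are absent from staging (UPDATE valid_to)
    "UPDATE dataset.customers SET valid_to = ...;",
    # 2. insert new active rows into the root table with fresh validity bounds
    "INSERT INTO dataset.customers (...) SELECT ...;",
    # 3. insert-only copy of new rows into each child table
    "INSERT INTO dataset.customers__orders SELECT * FROM ...;",
]
```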
Review comment (rudolfix): could you precompute a list of all "scd2" tables in the `_reset` method? This part of the schema remains constant during normalization, and this method is called for each normalized row, so it makes sense to optimize it.

Reply: Solved with caching as you suggested on Slack: 6b24378