From c4e9a856b4cc55fb7a5f79c9177c04f042bf8a77 Mon Sep 17 00:00:00 2001 From: rudolfix Date: Tue, 27 Aug 2024 00:20:06 +0200 Subject: [PATCH] Feat/1711 create with not exists dlt tables (#1740) * uses normalized column names when linking tables in relational * destination cap if create table if not exists supported * generates IF NOT EXISTS for dlt tables * adds logging for terminal and retry exception in run_managed of load job * passes schema update to be collected in trace in filesystem * fixes job log exception message --- dlt/common/destination/capabilities.py | 1 + dlt/common/destination/reference.py | 4 ++++ dlt/common/normalizers/json/relational.py | 12 +++++------- dlt/destinations/impl/athena/athena.py | 2 +- .../impl/filesystem/filesystem.py | 5 ++++- dlt/destinations/impl/mssql/factory.py | 1 + dlt/destinations/impl/synapse/factory.py | 4 ++++ dlt/destinations/job_client_impl.py | 19 ++++++++++++++----- .../parent_child_relationship.py | 9 ++++----- .../test_parent_child_relationship.py | 10 ++++------ tests/load/mssql/test_mssql_table_builder.py | 12 ++++++++++-- .../postgres/test_postgres_table_builder.py | 11 ++++++++++- tests/pipeline/test_pipeline_trace.py | 2 +- 13 files changed, 63 insertions(+), 29 deletions(-) diff --git a/dlt/common/destination/capabilities.py b/dlt/common/destination/capabilities.py index be71cb50e9..52e7d74833 100644 --- a/dlt/common/destination/capabilities.py +++ b/dlt/common/destination/capabilities.py @@ -76,6 +76,7 @@ class DestinationCapabilitiesContext(ContainerInjectableContext): # use naming convention in the schema naming_convention: TNamingConventionReferenceArg = None alter_add_multi_column: bool = True + supports_create_table_if_not_exists: bool = True supports_truncate_command: bool = True schema_supports_numeric_precision: bool = True timestamp_precision: int = 6 diff --git a/dlt/common/destination/reference.py b/dlt/common/destination/reference.py index b6c7041592..744cbbd1f5 100--- 
a/dlt/common/destination/reference.py +++ b/dlt/common/destination/reference.py @@ -383,9 +383,13 @@ def run_managed( except (DestinationTerminalException, TerminalValueError) as e: self._state = "failed" self._exception = e + logger.exception(f"Terminal exception in job {self.job_id()} in file {self._file_path}") except (DestinationTransientException, Exception) as e: self._state = "retry" self._exception = e + logger.exception( + f"Transient exception in job {self.job_id()} in file {self._file_path}" + ) finally: self._finished_at = pendulum.now() # sanity check diff --git a/dlt/common/normalizers/json/relational.py b/dlt/common/normalizers/json/relational.py index 8e296445eb..1dbcec4bff 100644 --- a/dlt/common/normalizers/json/relational.py +++ b/dlt/common/normalizers/json/relational.py @@ -184,11 +184,10 @@ def _get_child_row_hash(parent_row_id: str, child_table: str, list_idx: int) -> # and all child tables must be lists return digest128(f"{parent_row_id}_{child_table}_{list_idx}", DLT_ID_LENGTH_BYTES) - @staticmethod - def _link_row(row: DictStrAny, parent_row_id: str, list_idx: int) -> DictStrAny: + def _link_row(self, row: DictStrAny, parent_row_id: str, list_idx: int) -> DictStrAny: assert parent_row_id - row["_dlt_parent_id"] = parent_row_id - row["_dlt_list_idx"] = list_idx + row[self.c_dlt_parent_id] = parent_row_id + row[self.c_dlt_list_idx] = list_idx return row @@ -227,7 +226,7 @@ def _add_row_id( if row_id_type == "row_hash": row_id = DataItemNormalizer._get_child_row_hash(parent_row_id, table, pos) # link to parent table - DataItemNormalizer._link_row(flattened_row, parent_row_id, pos) + self._link_row(flattened_row, parent_row_id, pos) flattened_row[self.c_dlt_id] = row_id return row_id @@ -260,7 +259,6 @@ def _normalize_list( parent_row_id: Optional[str] = None, _r_lvl: int = 0, ) -> TNormalizedRowIterator: - v: DictStrAny = None table = self.schema.naming.shorten_fragments(*parent_path, *ident_path) for idx, v in enumerate(seq): @@ -285,7 
+283,7 @@ def _normalize_list( child_row_hash = DataItemNormalizer._get_child_row_hash(parent_row_id, table, idx) wrap_v = wrap_in_dict(v) wrap_v[self.c_dlt_id] = child_row_hash - e = DataItemNormalizer._link_row(wrap_v, parent_row_id, idx) + e = self._link_row(wrap_v, parent_row_id, idx) DataItemNormalizer._extend_row(extend, e) yield (table, self.schema.naming.shorten_fragments(*parent_path)), e diff --git a/dlt/destinations/impl/athena/athena.py b/dlt/destinations/impl/athena/athena.py index 1429b28240..0c90d171a3 100644 --- a/dlt/destinations/impl/athena/athena.py +++ b/dlt/destinations/impl/athena/athena.py @@ -452,7 +452,7 @@ def _get_table_update_sql( partition_clause = self._iceberg_partition_clause( cast(Optional[Dict[str, str]], table.get(PARTITION_HINT)) ) - sql.append(f"""CREATE TABLE {qualified_table_name} + sql.append(f"""{self._make_create_table(qualified_table_name, table)} ({columns}) {partition_clause} LOCATION '{location.rstrip('/')}' diff --git a/dlt/destinations/impl/filesystem/filesystem.py b/dlt/destinations/impl/filesystem/filesystem.py index 2e09871ba9..5445fd2ae9 100644 --- a/dlt/destinations/impl/filesystem/filesystem.py +++ b/dlt/destinations/impl/filesystem/filesystem.py @@ -303,6 +303,7 @@ def update_stored_schema( only_tables: Iterable[str] = None, expected_update: TSchemaTables = None, ) -> TSchemaTables: + applied_update = super().update_stored_schema(only_tables, expected_update) # create destination dirs for all tables table_names = only_tables or self.schema.tables.keys() dirs_to_create = self.get_table_dirs(table_names) @@ -316,7 +317,9 @@ def update_stored_schema( if not self.config.as_staging: self._store_current_schema() - return expected_update + # we assume that expected_update == applied_update so table schemas in dest were not + # externally changed + return applied_update def get_table_dir(self, table_name: str, remote: bool = False) -> str: # dlt tables do not respect layout (for now) diff --git 
a/dlt/destinations/impl/mssql/factory.py b/dlt/destinations/impl/mssql/factory.py index 85c94c21b7..f1a8bb136a 100644 --- a/dlt/destinations/impl/mssql/factory.py +++ b/dlt/destinations/impl/mssql/factory.py @@ -37,6 +37,7 @@ def _raw_capabilities(self) -> DestinationCapabilitiesContext: caps.max_text_data_type_length = 2**30 - 1 caps.is_max_text_data_type_length_in_bytes = False caps.supports_ddl_transactions = True + caps.supports_create_table_if_not_exists = False # IF NOT EXISTS not supported caps.max_rows_per_insert = 1000 caps.timestamp_precision = 7 caps.supported_merge_strategies = ["delete-insert", "upsert", "scd2"] diff --git a/dlt/destinations/impl/synapse/factory.py b/dlt/destinations/impl/synapse/factory.py index bb117e48d2..d5a0281bec 100644 --- a/dlt/destinations/impl/synapse/factory.py +++ b/dlt/destinations/impl/synapse/factory.py @@ -63,6 +63,10 @@ def _raw_capabilities(self) -> DestinationCapabilitiesContext: caps.supports_transactions = True caps.supports_ddl_transactions = False + caps.supports_create_table_if_not_exists = ( + False # IF NOT EXISTS on CREATE TABLE not supported + ) + # Synapse throws "Some part of your SQL statement is nested too deeply. Rewrite the query or break it up into smaller queries." # if number of records exceeds a certain number. Which exact number that is seems not deterministic: # in tests, I've seen a query with 12230 records run succesfully on one run, but fail on a subsequent run, while the query remained exactly the same. 
diff --git a/dlt/destinations/job_client_impl.py b/dlt/destinations/job_client_impl.py index 92132dd751..1d6403a2c8 100644 --- a/dlt/destinations/job_client_impl.py +++ b/dlt/destinations/job_client_impl.py @@ -522,22 +522,31 @@ def _make_add_column_sql( """Make one or more ADD COLUMN sql clauses to be joined in ALTER TABLE statement(s)""" return [f"ADD COLUMN {self._get_column_def_sql(c, table_format)}" for c in new_columns] + def _make_create_table(self, qualified_name: str, table: TTableSchema) -> str: + not_exists_clause = " " + if ( + table["name"] in self.schema.dlt_table_names() + and self.capabilities.supports_create_table_if_not_exists + ): + not_exists_clause = " IF NOT EXISTS " + return f"CREATE TABLE{not_exists_clause}{qualified_name}" + def _get_table_update_sql( self, table_name: str, new_columns: Sequence[TColumnSchema], generate_alter: bool ) -> List[str]: # build sql - canonical_name = self.sql_client.make_qualified_table_name(table_name) + qualified_name = self.sql_client.make_qualified_table_name(table_name) table = self.prepare_load_table(table_name) table_format = table.get("table_format") sql_result: List[str] = [] if not generate_alter: # build CREATE - sql = f"CREATE TABLE {canonical_name} (\n" + sql = self._make_create_table(qualified_name, table) + " (\n" sql += ",\n".join([self._get_column_def_sql(c, table_format) for c in new_columns]) sql += ")" sql_result.append(sql) else: - sql_base = f"ALTER TABLE {canonical_name}\n" + sql_base = f"ALTER TABLE {qualified_name}\n" add_column_statements = self._make_add_column_sql(new_columns, table_format) if self.capabilities.alter_add_multi_column: column_sql = ",\n" @@ -561,13 +570,13 @@ def _get_table_update_sql( if hint == "not_null": logger.warning( f"Column(s) {hint_columns} with NOT NULL are being added to existing" - f" table {canonical_name}. If there's data in the table the operation" + f" table {qualified_name}. If there's data in the table the operation" " will fail." 
) else: logger.warning( f"Column(s) {hint_columns} with hint {hint} are being added to existing" - f" table {canonical_name}. Several hint types may not be added to" + f" table {qualified_name}. Several hint types may not be added to" " existing tables." ) return sql_result diff --git a/docs/examples/parent_child_relationship/parent_child_relationship.py b/docs/examples/parent_child_relationship/parent_child_relationship.py index 39c9f577cc..6de00ffb28 100644 --- a/docs/examples/parent_child_relationship/parent_child_relationship.py +++ b/docs/examples/parent_child_relationship/parent_child_relationship.py @@ -22,6 +22,7 @@ from typing import List, Dict, Any, Generator import dlt + # Define a dlt resource with write disposition to 'merge' @dlt.resource(name="parent_with_children", write_disposition={"disposition": "merge"}) def data_source() -> Generator[List[Dict[str, Any]], None, None]: @@ -44,6 +45,7 @@ def data_source() -> Generator[List[Dict[str, Any]], None, None]: yield data + # Function to add parent_id to each child record within a parent record def add_parent_id(record: Dict[str, Any]) -> Dict[str, Any]: parent_id_key = "parent_id" @@ -51,6 +53,7 @@ def add_parent_id(record: Dict[str, Any]) -> Dict[str, Any]: child[parent_id_key] = record[parent_id_key] return record + if __name__ == "__main__": # Create and configure the dlt pipeline pipeline = dlt.pipeline( @@ -60,10 +63,6 @@ def add_parent_id(record: Dict[str, Any]) -> Dict[str, Any]: ) # Run the pipeline - load_info = pipeline.run( - data_source() - .add_map(add_parent_id), - primary_key="parent_id" - ) + load_info = pipeline.run(data_source().add_map(add_parent_id), primary_key="parent_id") # Output the load information after pipeline execution print(load_info) diff --git a/docs/examples/parent_child_relationship/test_parent_child_relationship.py b/docs/examples/parent_child_relationship/test_parent_child_relationship.py index f671040823..95d1bade97 100644 --- 
a/docs/examples/parent_child_relationship/test_parent_child_relationship.py +++ b/docs/examples/parent_child_relationship/test_parent_child_relationship.py @@ -1,4 +1,3 @@ - import pytest from tests.utils import skipifgithubfork @@ -29,6 +28,7 @@ from typing import List, Dict, Any, Generator import dlt + # Define a dlt resource with write disposition to 'merge' @dlt.resource(name="parent_with_children", write_disposition={"disposition": "merge"}) def data_source() -> Generator[List[Dict[str, Any]], None, None]: @@ -51,6 +51,7 @@ def data_source() -> Generator[List[Dict[str, Any]], None, None]: yield data + # Function to add parent_id to each child record within a parent record def add_parent_id(record: Dict[str, Any]) -> Dict[str, Any]: parent_id_key = "parent_id" @@ -58,6 +59,7 @@ def add_parent_id(record: Dict[str, Any]) -> Dict[str, Any]: child[parent_id_key] = record[parent_id_key] return record + @skipifgithubfork @pytest.mark.forked def test_parent_child_relationship(): @@ -69,10 +71,6 @@ def test_parent_child_relationship(): ) # Run the pipeline - load_info = pipeline.run( - data_source() - .add_map(add_parent_id), - primary_key="parent_id" - ) + load_info = pipeline.run(data_source().add_map(add_parent_id), primary_key="parent_id") # Output the load information after pipeline execution print(load_info) diff --git a/tests/load/mssql/test_mssql_table_builder.py b/tests/load/mssql/test_mssql_table_builder.py index d6cf3ec3e8..3f3896de6c 100644 --- a/tests/load/mssql/test_mssql_table_builder.py +++ b/tests/load/mssql/test_mssql_table_builder.py @@ -55,8 +55,8 @@ def test_alter_table(client: MsSqlJobClient) -> None: # existing table has no columns sql = client._get_table_update_sql("event_test_table", TABLE_UPDATE, True)[0] sqlfluff.parse(sql, dialect="tsql") - canonical_name = client.sql_client.make_qualified_table_name("event_test_table") - assert sql.count(f"ALTER TABLE {canonical_name}\nADD") == 1 + qualified_name = 
client.sql_client.make_qualified_table_name("event_test_table") + assert sql.count(f"ALTER TABLE {qualified_name}\nADD") == 1 assert "event_test_table" in sql assert '"col1" bigint NOT NULL' in sql assert '"col2" float NOT NULL' in sql @@ -75,3 +75,11 @@ def test_alter_table(client: MsSqlJobClient) -> None: assert '"col6_precision" decimal(6,2) NOT NULL' in sql assert '"col7_precision" varbinary(19)' in sql assert '"col11_precision" time(3) NOT NULL' in sql + + +def test_create_dlt_table(client: MsSqlJobClient) -> None: + # non existing table + sql = client._get_table_update_sql("_dlt_version", TABLE_UPDATE, False)[0] + sqlfluff.parse(sql, dialect="tsql") + qualified_name = client.sql_client.make_qualified_table_name("_dlt_version") + assert f"CREATE TABLE {qualified_name}" in sql diff --git a/tests/load/postgres/test_postgres_table_builder.py b/tests/load/postgres/test_postgres_table_builder.py index 86bd67db9a..28fd4eec9d 100644 --- a/tests/load/postgres/test_postgres_table_builder.py +++ b/tests/load/postgres/test_postgres_table_builder.py @@ -57,7 +57,8 @@ def test_create_table(client: PostgresClient) -> None: # non existing table sql = client._get_table_update_sql("event_test_table", TABLE_UPDATE, False)[0] sqlfluff.parse(sql, dialect="postgres") - assert "event_test_table" in sql + qualified_name = client.sql_client.make_qualified_table_name("event_test_table") + assert f"CREATE TABLE {qualified_name}" in sql assert '"col1" bigint NOT NULL' in sql assert '"col2" double precision NOT NULL' in sql assert '"col3" boolean NOT NULL' in sql @@ -173,3 +174,11 @@ def test_create_table_case_sensitive(cs_client: PostgresClient) -> None: # every line starts with "Col" for line in sql.split("\n")[1:]: assert line.startswith('"Col') + + +def test_create_dlt_table(client: PostgresClient) -> None: + # non existing table + sql = client._get_table_update_sql("_dlt_version", TABLE_UPDATE, False)[0] + sqlfluff.parse(sql, dialect="postgres") + qualified_name = 
client.sql_client.make_qualified_table_name("_dlt_version") + assert f"CREATE TABLE IF NOT EXISTS {qualified_name}" in sql diff --git a/tests/pipeline/test_pipeline_trace.py b/tests/pipeline/test_pipeline_trace.py index 69c0f01b8b..4e52d2aa29 100644 --- a/tests/pipeline/test_pipeline_trace.py +++ b/tests/pipeline/test_pipeline_trace.py @@ -551,7 +551,7 @@ def test_trace_telemetry() -> None: for item in SENTRY_SENT_ITEMS: # print(item) print(item["logentry"]["message"]) - assert len(SENTRY_SENT_ITEMS) == 2 + assert len(SENTRY_SENT_ITEMS) == 4 # trace with exception @dlt.resource