From 3387110b411d91c7a08baaab76e29caa3bcb3a28 Mon Sep 17 00:00:00 2001
From: skrydal
Date: Mon, 14 Oct 2024 11:35:34 +0200
Subject: [PATCH] fix(ingestion/redshift): Fix for Redshift COPY-based lineage
 (#11552)

---
 .../ingestion/source/redshift/lineage_v2.py   | 13 ++-
 .../ingestion/source/redshift/query.py        | 89 ++++++-------------
 .../sql_parsing/sql_parsing_aggregator.py     |  4 +-
 3 files changed, 41 insertions(+), 65 deletions(-)

diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage_v2.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage_v2.py
index 4b7f710beed08..4df64c80bad8a 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage_v2.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage_v2.py
@@ -334,19 +334,26 @@ def _process_view_lineage(self, lineage_row: LineageRow) -> None:
         )
 
     def _process_copy_command(self, lineage_row: LineageRow) -> None:
-        source = self._lineage_v1._get_sources(
+        logger.debug(f"Processing COPY command for lineage row: {lineage_row}")
+        sources = self._lineage_v1._get_sources(
             lineage_type=LineageCollectorType.COPY,
             db_name=self.database,
             source_schema=None,
             source_table=None,
             ddl=None,
             filename=lineage_row.filename,
-        )[0]
+        )
+        logger.debug(f"Recognized sources: {sources}")
+        source = sources[0]
         if not source:
+            logger.debug("Ignoring command since couldn't recognize proper source")
             return
         s3_urn = source[0].urn
-
+        logger.debug(f"Recognized s3 dataset urn: {s3_urn}")
         if not lineage_row.target_schema or not lineage_row.target_table:
+            logger.debug(
+                f"Didn't find target schema (found: {lineage_row.target_schema}) or target table (found: {lineage_row.target_table})"
+            )
             return
         target = self._make_filtered_target(lineage_row)
         if not target:
diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/query.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/query.py
index affbcd00b5107..39370b93b561c 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/redshift/query.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/query.py
@@ -283,6 +283,34 @@ def alter_table_rename_query(
         AND SYS.query_text ILIKE '%alter table % rename to %'
         """
 
+    @staticmethod
+    def list_copy_commands_sql(
+        db_name: str, start_time: datetime, end_time: datetime
+    ) -> str:
+        return """
+            select
+                distinct
+                    "schema" as target_schema,
+                    "table" as target_table,
+                    c.file_name as filename
+            from
+                SYS_QUERY_DETAIL as si
+            join SYS_LOAD_DETAIL as c on
+                si.query_id = c.query_id
+            join SVV_TABLE_INFO sti on
+                sti.table_id = si.table_id
+            where
+                database = '{db_name}'
+                and si.start_time >= '{start_time}'
+                and si.start_time < '{end_time}'
+            order by target_schema, target_table, si.start_time asc
+        """.format(
+            # We need the original database name for filtering
+            db_name=db_name,
+            start_time=start_time.strftime(redshift_datetime_format),
+            end_time=end_time.strftime(redshift_datetime_format),
+        )
+
     @staticmethod
     def additional_table_metadata_query() -> str:
         raise NotImplementedError
@@ -317,12 +345,6 @@ def list_insert_create_queries_sql(
     ) -> str:
         raise NotImplementedError
 
-    @staticmethod
-    def list_copy_commands_sql(
-        db_name: str, start_time: datetime, end_time: datetime
-    ) -> str:
-        raise NotImplementedError
-
 
 class RedshiftProvisionedQuery(RedshiftCommonQuery):
     @staticmethod
@@ -536,34 +558,6 @@ def list_insert_create_queries_sql(
             end_time=end_time.strftime(redshift_datetime_format),
         )
 
-    @staticmethod
-    def list_copy_commands_sql(
-        db_name: str, start_time: datetime, end_time: datetime
-    ) -> str:
-        return """
-            select
-                distinct
-                    "schema" as target_schema,
-                    "table" as target_table,
-                    filename
-            from
-                stl_insert as si
-            join stl_load_commits as c on
-                si.query = c.query
-            join SVV_TABLE_INFO sti on
-                sti.table_id = tbl
-            where
-                database = '{db_name}'
-                and si.starttime >= '{start_time}'
-                and si.starttime < '{end_time}'
-            order by target_schema, target_table, starttime asc
-        """.format(
-            # We need the original database name for filtering
-            db_name=db_name,
-            start_time=start_time.strftime(redshift_datetime_format),
-            end_time=end_time.strftime(redshift_datetime_format),
-        )
-
     @staticmethod
     def temp_table_ddl_query(start_time: datetime, end_time: datetime) -> str:
         start_time_str: str = start_time.strftime(redshift_datetime_format)
@@ -941,33 +935,6 @@ def list_insert_create_queries_sql(
     # when loading from s3 using prefix with a single file it produces 2 lines (for file and just directory) - also
     # behaves like this when run in the old way
 
-    @staticmethod
-    def list_copy_commands_sql(
-        db_name: str, start_time: datetime, end_time: datetime
-    ) -> str:
-        return """
-            select
-                distinct
-                    "schema" as target_schema,
-                    "table" as target_table,
-                    c.file_name
-            from
-                SYS_QUERY_DETAIL as si
-            join SYS_LOAD_DETAIL as c on
-                si.query_id = c.query_id
-            join SVV_TABLE_INFO sti on
-                sti.table_id = si.table_id
-            where
-                database = '{db_name}'
-                and si.start_time >= '{start_time}'
-                and si.start_time < '{end_time}'
-            order by target_schema, target_table, si.start_time asc
-        """.format(
-            # We need the original database name for filtering
-            db_name=db_name,
-            start_time=start_time.strftime(redshift_datetime_format),
-            end_time=end_time.strftime(redshift_datetime_format),
-        )
-
     # handles "create table IF ..." statements wrong probably - "create command" field contains only "create table if" in such cases
     # also similar happens if for example table name contains special characters quoted with " i.e. "test-table1"
diff --git a/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py b/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py
index 52934f9f72a70..5f2709fe42660 100644
--- a/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py
+++ b/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py
@@ -613,7 +613,9 @@ def add_known_lineage_mapping(
             upstream_urn: The upstream dataset URN.
             downstream_urn: The downstream dataset URN.
         """
-
+        logger.debug(
+            f"Adding lineage to the map, downstream: {downstream_urn}, upstream: {upstream_urn}"
+        )
         self.report.num_known_mapping_lineage += 1
 
         # We generate a fake "query" object to hold the lineage.
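
Reviewer note on the consolidated query builder: the patch moves list_copy_commands_sql onto the common query class and unifies it on the SYS_QUERY_DETAIL / SYS_LOAD_DETAIL system views, which exist on both provisioned and serverless Redshift. It also aliases c.file_name as filename; the removed serverless variant returned an unaliased file_name column, which did not match the filename key that the COPY lineage extractor reads. Below is a minimal sketch for sanity-checking the rendered SQL locally. It is illustrative only and not part of the diff above: the database name and time window are made-up placeholder values, and it assumes this branch's datahub package is importable.

    from datetime import datetime

    from datahub.ingestion.source.redshift.query import RedshiftCommonQuery

    # Render the unified COPY-lineage query for a one-day window.
    # "dev" and the timestamps are placeholders for illustration.
    sql = RedshiftCommonQuery.list_copy_commands_sql(
        db_name="dev",
        start_time=datetime(2024, 10, 13),
        end_time=datetime(2024, 10, 14),
    )

    # The file name must surface under the column name "filename", since
    # _process_copy_command passes lineage_row.filename to _get_sources().
    assert 'c.file_name as filename' in sql
    print(sql)

Because the query builder is a pure string-formatting static method, it can be checked this way without a live Redshift connection; only the filled-in database name and the strftime-rendered time bounds vary between runs.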