datahub-project · treff7es · Dec 16, 2024 · Dec 13, 2024 · Dec 13, 2024 · Dec 13, 2024
diff --git a/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_database_reader.py b/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_database_reader.py
@@ -147,6 +147,53 @@ def query(self) -> str:
             version
         """
 
+    def _get_rows(
+        self, from_createdon: datetime, stop_time: datetime
+    ) -> Iterable[Dict[str, Any]]:
+        # Cursor for MySql and PostgreSQL are by default client side cursor and we need to create server side cursor -> https://docs.sqlalchemy.org/en/20/core/connections.html#using-server-side-cursors-a-k-a-stream-results
+        # For MySQL, we need to use SSCursor to create server side cursor
+        # For PostgreSQL, we need to use stream_results=True to create server side cursor and need to be run in transaction
+        with self.engine.connect() as conn:
+            if self.engine.dialect.name == "postgresql":
+                with conn.begin():  # Transaction required for PostgreSQL server-side cursor
+                    conn = conn.execution_options(
+                        stream_results=True,
+                        yield_per=self.config.database_query_batch_size,
+                    )
+                    result = conn.execute(
+                        self.query,
+                        {
+                            "exclude_aspects": list(self.config.exclude_aspects),
+                            "since_createdon": from_createdon.strftime(DATETIME_FORMAT),
+                        },
+                    )
+                    for row in result:
+                        yield dict(row)
+            elif self.engine.dialect.name == "mysql":  # MySQL
+                import MySQLdb
+
+                with contextlib.closing(
+                    conn.connection.cursor(MySQLdb.cursors.SSCursor)
+                ) as cursor:
+                    logger.debug(f"Using Cursor type: {cursor.__class__.__name__}")
+                    cursor.execute(
+                        self.query,
+                        {
+                            "exclude_aspects": list(self.config.exclude_aspects),
+                            "since_createdon": from_createdon.strftime(DATETIME_FORMAT),
+                        },
+                    )
+
+                    columns = [desc[0] for desc in cursor.description]
+                    while True:
+                        rows = cursor.fetchmany(self.config.database_query_batch_size)
+                        if not rows:
+                            break  # Use break instead of return in generator
+                        for row in rows:
+                            yield dict(zip(columns, row))
+            else:
+                raise ValueError(f"Unsupported dialect: {self.engine.dialect.name}")
+
     def get_aspects(
         self, from_createdon: datetime, stop_time: datetime
     ) -> Iterable[Tuple[MetadataChangeProposalWrapper, datetime]]:
@@ -159,27 +206,6 @@ def get_aspects(
             if mcp:
                 yield mcp, row["createdon"]
 
-    def _get_rows(
-        self, from_createdon: datetime, stop_time: datetime
-    ) -> Iterable[Dict[str, Any]]:
-        with self.engine.connect() as conn:
-            with contextlib.closing(conn.connection.cursor()) as cursor:
-                cursor.execute(
-                    self.query,
-                    {
-                        "exclude_aspects": list(self.config.exclude_aspects),
-                        "since_createdon": from_createdon.strftime(DATETIME_FORMAT),
-                    },
-                )
-
-                columns = [desc[0] for desc in cursor.description]
-                while True:
-                    rows = cursor.fetchmany(self.config.database_query_batch_size)
-                    if not rows:
-                        return
-                    for row in rows:
-                        yield dict(zip(columns, row))
-
     def get_soft_deleted_rows(self) -> Iterable[Dict[str, Any]]:
         """
         Fetches all soft-deleted entities from the database.