MetaphorData · usefulalgorithm · Oct 28, 2024 · Oct 29, 2024 · Oct 30, 2024 · Oct 30, 2024
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -39,7 +39,7 @@ jobs:
 
       - name: Setup dev libs
         run: |
-          sudo apt-get install libsasl2-dev
+          sudo apt-get install libsasl2-dev libpq-dev
 
       - name: Lint & Type Check
         run: |

diff --git a/.gitignore b/.gitignore
@@ -144,5 +144,9 @@ poetry.toml
 # Minio system files
 tests/s3/data/.minio.sys
 
+# GX configs in tests
+tests/great_expectations/**/gx/
+tests/great_expectations/**/snowflake/config.yml
+
 # pytest-testmon metadata
 .testmondata*
diff --git a/README.md b/README.md
@@ -64,6 +64,7 @@ Each connector is placed under its own directory under [metaphor](./metaphor) an
 | [dbt.cloud](metaphor/dbt/cloud/)                                  | dbt model, test, lineage                 |
 | [fivetran](metaphor/fivetran/)                                    | Lineage, Pipeline                        |
 | [glue](metaphor/glue/)                                            | Schema, description                      |
+| [great_expectations](metaphor/great_expectations/)                | Data monitor                             |
 | [informatica](metaphor/informatica/)                              | Lineage, Pipeline                        |
 | [looker](metaphor/looker/)                                        | Looker view, explore, dashboard, lineage |
 | [kafka](metaphor/kafka/)                                          | Schema, description                      |

diff --git a/metaphor/glue/extractor.py b/metaphor/glue/extractor.py
@@ -99,7 +99,7 @@ def _get_tables(self, database: str):
                 )
                 table_type = table.get("TableType")
                 parameters = table.get("Parameters")
-                row_count = parameters.get("numRows") if parameters else None
+                row_count = int(parameters.get("numRows")) if parameters else 0
                 description = table.get("Description")
 
                 dataset = self._init_dataset(

diff --git a/metaphor/great_expectations/README.md b/metaphor/great_expectations/README.md
@@ -0,0 +1,47 @@
+# Great Expectations Connector
+
+This connector extracts technical metadata from Great Expectations using [GX Core](https://greatexpectations.io/gx-core).
+
+## Setup
+
+This connector runs by parsing existing Great Expectations run artifacts. To use this connector, make sure the Great Expectations data context has been persisted to disk, i.e. it was created as a file-backed context. For example:
+
+```python
+import great_expectations as gx
+
+ctx = gx.get_context(mode="file", project_root_dir="SOME_DIR") # This works: the artifacts are persisted to `SOME_DIR`, and the connector parses them to get the validation results.
+
+# ctx = gx.get_context() # XXX This does not work: it creates an in-memory context, so nothing is persisted
+```
+
+## Config File
+
+Create a YAML config file based on the following template.
+
+### Required Configurations
+
+```yaml
+project_root_dir: <PROJECT_ROOT_DIR> # The project root directory. This is the directory that contains your Great Expectations artifacts, i.e. where the `gx` directory lives.
+```
+
+### Optional Configurations
+
+```yaml
+snowflake_account: <SNOWFLAKE_ACCOUNT> # The Snowflake account to use if the Great Expectations run was targeted at Snowflake.
+```
+
+#### Output Destination
+
+See [Output Config](../common/docs/output.md) for more information.
+
+## Testing
+
+Follow the [Installation](../../README.md) instructions to install `metaphor-connectors` in your environment (or virtualenv).
+
+Run the following command to test the connector locally:
+
+```shell
+metaphor great_expectations <config_file>
+```
+
+Manually verify the output after the command finishes.
diff --git a/metaphor/great_expectations/__init__.py b/metaphor/great_expectations/__init__.py
@@ -0,0 +1,6 @@
+from metaphor.common.cli import cli_main
+from metaphor.great_expectations.extractor import GreatExpectationsExtractor
+
+
+def main(config_file: str) -> None:
+    """
+    Connector entry point: hands `GreatExpectationsExtractor` and
+    `config_file` over to `cli_main`.
+    """
+    cli_main(GreatExpectationsExtractor, config_file)
diff --git a/metaphor/great_expectations/config.py b/metaphor/great_expectations/config.py
@@ -0,0 +1,20 @@
+from typing import Optional
+
+from pydantic import DirectoryPath
+from pydantic.dataclasses import dataclass
+
+from metaphor.common.base_config import BaseConfig
+from metaphor.common.dataclass import ConnectorConfig
+
+
+@dataclass(config=ConnectorConfig)
+class GreatExpectationConfig(BaseConfig):
+    """
+    Settings for the Great Expectations connector, in addition to whatever
+    `BaseConfig` declares.
+    """
+
+    project_root_dir: DirectoryPath
+    """
+    The project root directory. This is the directory that contains your Great Expectations artifacts, i.e. where the `gx` directory lives.
+    """
+
+    snowflake_account: Optional[str] = None
+    """
+    The Snowflake account to use if the Great Expectations run was targeted at Snowflake.
+    """
diff --git a/metaphor/great_expectations/extractor.py b/metaphor/great_expectations/extractor.py
@@ -0,0 +1,279 @@
+from typing import Collection, Dict, List, Optional
+
+import great_expectations as gx
+from great_expectations.core.batch import LegacyBatchDefinition
+from great_expectations.core.batch_spec import BatchSpec
+from great_expectations.core.expectation_validation_result import (
+    ExpectationSuiteValidationResult,
+)
+from great_expectations.datasource.fluent import DataAsset
+from great_expectations.execution_engine import (
+    PandasExecutionEngine,
+    SparkDFExecutionEngine,
+    SqlAlchemyExecutionEngine,
+)
+from sqlalchemy import URL
+
+from metaphor.common.base_extractor import BaseExtractor
+from metaphor.common.entity_id import (
+    dataset_normalized_name,
+    parts_to_dataset_entity_id,
+)
+from metaphor.common.event_util import ENTITY_TYPES
+from metaphor.common.logger import get_logger
+from metaphor.great_expectations.config import GreatExpectationConfig
+from metaphor.models.crawler_run_metadata import Platform
+from metaphor.models.metadata_change_event import (
+    DataMonitor,
+    DataMonitorStatus,
+    DataMonitorTarget,
+    DataPlatform,
+    DataQualityProvider,
+    Dataset,
+    DatasetDataQuality,
+    DatasetLogicalID,
+)
+
+logger = get_logger()
+
+
+class GreatExpectationsExtractor(BaseExtractor):
+    """
+    Great Expectations metadata extractor. The extractor runs by
+    parsing an existing Great Expectations data context, so make sure the
+    context has been persisted. In other words, it will not
+    work if you get Great Expectations context like this in your script:
+    ```python
+    ctx = gx.get_context() # This creates a context in memory, and nothing is persisted
+    ```
+    """
+
+    _description = "Great Expectations metadata crawler"
+    _platform = Platform.GREAT_EXPECTATIONS
+
+    @staticmethod
+    def from_config_file(config_file: str) -> "GreatExpectationsExtractor":
+        """
+        Build an extractor from the YAML config file at `config_file`.
+        """
+        return GreatExpectationsExtractor(
+            GreatExpectationConfig.from_yaml_file(config_file)
+        )
+
+    def __init__(self, config: GreatExpectationConfig) -> None:
+        super().__init__(config)
+        self._config = config
+        # Datasets built so far, keyed by the value returned from `_get_dataset_key`.
+        self._datasets: Dict[str, Dataset] = {}
+
+    async def extract(self) -> Collection[ENTITY_TYPES]:
+        """
+        Open the file-mode context rooted at `project_root_dir`, parse every
+        stored `ExpectationSuiteValidationResult`, and return the datasets
+        collected from them.
+        """
+        self.context = gx.get_context(
+            project_root_dir=self._config.project_root_dir, mode="file"
+        )
+        for validation_result in self.context.validation_results_store.get_all():
+            # Entries of any other type in the store are ignored.
+            if isinstance(validation_result, ExpectationSuiteValidationResult):
+                self._parse_suite_result(validation_result)
+        return self._datasets.values()
+
+    @staticmethod
+    def _get_dataset_key(
+        platform: DataPlatform,
+        account: Optional[str],
+        database: Optional[str],
+        schema: Optional[str],
+        table: str,
+    ) -> str:
+        """
+        Return the string form of the dataset entity id built from the given
+        parts; used as the key into `self._datasets`.
+        """
+        return str(
+            parts_to_dataset_entity_id(platform, account, database, schema, table)
+        )
+
+    def _init_dataset(
+        self,
+        platform: DataPlatform,
+        account: Optional[str],
+        database: Optional[str],
+        schema: Optional[str],
+        table: str,
+    ) -> Dataset:
+        """
+        Return the dataset registered for the given parts, creating and
+        registering it on first use. The returned dataset always has a
+        `data_quality` aspect, so callers can append monitors to it.
+        """
+        key = self._get_dataset_key(platform, account, database, schema, table)
+        if key not in self._datasets:
+            # The normalized name is only needed when a new dataset is created.
+            self._datasets[key] = Dataset(
+                logical_id=DatasetLogicalID(
+                    account=account,
+                    name=dataset_normalized_name(database, schema, table),
+                    platform=platform,
+                ),
+            )
+
+        dataset = self._datasets[key]
+        if dataset.data_quality is None:
+            dataset.data_quality = DatasetDataQuality(
+                provider=DataQualityProvider.GREAT_EXPECTATIONS,
+                monitors=[],
+            )
+        return dataset
+
+    def _parse_suite_result(
+        self, validation_result: ExpectationSuiteValidationResult
+    ) -> None:
+        """
+        Look up the datasource named in the result's `active_batch_definition`
+        and, when its execution engine is a `SqlAlchemyExecutionEngine`, pass
+        the result on to `_parse_sql_execution_engine_result`. Results from any
+        other engine are logged and skipped.
+        """
+        logger.info(f"Parsing validation result: {validation_result.id}")
+        active_batch_definition: LegacyBatchDefinition = validation_result.meta[
+            "active_batch_definition"
+        ]
+
+        datasource = self.context.data_sources.get(
+            active_batch_definition["datasource_name"]
+        )
+        execution_engine = datasource.get_execution_engine()
+
+        # TODO: support PandasExecutionEngine
+        if isinstance(execution_engine, PandasExecutionEngine):
+            logger.warning(
+                "PandasExecutionEngine not supported, not parsing this validation result"
+            )
+            return
+
+        # TODO: support SparkDFExecutionEngine
+        if isinstance(execution_engine, SparkDFExecutionEngine):
+            logger.warning(
+                "SparkDFExecutionEngine not supported, not parsing this validation result"
+            )
+            return
+
+        if not isinstance(execution_engine, SqlAlchemyExecutionEngine):
+            logger.warning(
+                f"Cannot process execution engine: {execution_engine}, not parsing this validation result"
+            )
+            return
+
+        self._parse_sql_execution_engine_result(
+            validation_result,
+            execution_engine,
+            datasource.get_asset(active_batch_definition["data_asset_name"]),
+        )
+
+    def _parse_sql_execution_engine_result(
+        self,
+        validation_result: ExpectationSuiteValidationResult,
+        execution_engine: SqlAlchemyExecutionEngine,
+        data_asset: DataAsset,
+    ) -> None:
+        """
+        Turn one suite validation result from a SQLAlchemy-backed datasource
+        into a single `DataMonitor` on the dataset it targets.
+
+        The result is skipped (with a warning) when its batch spec is a runtime
+        query or runtime data spec, when it carries neither `schema_name` nor
+        `table_name`, or when the SQLAlchemy backend does not match any
+        `DataPlatform` value.
+        """
+        # batch_spec is always just a dict, using isinstance to get its type will not work
+        batch_spec: BatchSpec = validation_result.meta["batch_spec"]
+        logger.info(f"batch spec: {batch_spec}")
+
+        if "query" in batch_spec:
+            # This is a RuntimeQueryBatchSpec, we should parse the query and see what datasets
+            # are referenced in it.
+            logger.warning(
+                "RuntimeQueryBatchSpec not supported, not parsing this validation result"
+            )
+            return
+
+        if "batch_data" in batch_spec:
+            logger.warning(
+                "RuntimeDataBatchSpec not supported, not parsing this validation result"
+            )
+            return
+
+        if "schema_name" not in batch_spec and "table_name" not in batch_spec:
+            # At this point the only batch spec we care is SqlAlchemyDatasourceBatchSpec,
+            # anything else we are not parsing.
+            logger.warning(
+                f"Cannot parse batch spec {batch_spec}, ignoring this validation result"
+            )
+            return
+
+        url = execution_engine.engine.url
+        # Match the upper-cased SQLAlchemy backend name against the DataPlatform values.
+        backend = url.get_backend_name().upper()
+        platform = next((x for x in DataPlatform if x.value == backend), None)
+        if not platform:
+            logger.warning(
+                f"Unknown SqlAlchemy backend: {backend}, not parsing this validation result"
+            )
+            return
+
+        # The account only comes from the connector config, and only for Snowflake.
+        account = (
+            self._config.snowflake_account
+            if platform is DataPlatform.SNOWFLAKE
+            else None
+        )
+        database = self._extract_database_from_sqlalchemy_url(url, platform)
+        schema = batch_spec.get("schema_name")
+        table = batch_spec.get("table_name")
+
+        # Fall back to the data asset's name when the batch spec has no table name.
+        dataset = self._init_dataset(
+            platform,
+            account,
+            database,
+            schema,
+            table or data_asset.name,
+        )
+
+        assert dataset.data_quality and dataset.data_quality.monitors is not None
+
+        # Right now the whole suite is a single DataMonitor, so if one expectation fails
+        # the whole monitor fails.
+        # TODO: decide if we want to make a DataMonitor for each `validation_result.result`.
+        dataset.data_quality.monitors.append(
+            DataMonitor(
+                title=validation_result.suite_name,
+                status=(
+                    DataMonitorStatus.PASSED
+                    if validation_result.success
+                    else DataMonitorStatus.ERROR
+                ),
+                targets=self._parse_result_targets(validation_result, dataset),
+                url=validation_result.result_url,
+                exceptions=self._parse_result_exceptions(validation_result),
+            )
+        )
+
+    @staticmethod
+    def _parse_result_targets(
+        validation_result: ExpectationSuiteValidationResult, dataset: Dataset
+    ) -> Optional[List[DataMonitorTarget]]:
+        """
+        Return one target per expectation result whose expectation config has
+        a truthy `column` kwarg, or `None` when there is no such result.
+        """
+        assert dataset.logical_id and dataset.logical_id.name
+        targets = [
+            DataMonitorTarget(
+                dataset=dataset.logical_id.name,
+                column=result.expectation_config.kwargs["column"],
+            )
+            for result in validation_result.results
+            if result.expectation_config
+            and result.expectation_config.kwargs.get("column")
+        ]
+        return targets or None
+
+    @staticmethod
+    def _parse_result_exceptions(
+        validation_result: ExpectationSuiteValidationResult,
+    ) -> Optional[List[str]]:
+        """
+        Return the `exception_message` of every expectation result whose
+        `exception_info` has a truthy `raised_exception`, or `None` when no
+        result raised.
+        """
+        exceptions = [
+            result.exception_info["exception_message"]
+            for result in validation_result.results
+            if result.exception_info
+            and result.exception_info.get("raised_exception", False)
+        ]
+        return exceptions or None
+
+    @staticmethod
+    def _extract_database_from_sqlalchemy_url(url: URL, platform: DataPlatform) -> str:
+        """
+        Return the database name carried by `url`, or an empty string when the
+        url has no database part.
+
+        Reference:
+        https://docs.greatexpectations.io/docs/core/connect_to_data/sql_data/#procedure
+        """
+
+        if platform is DataPlatform.SNOWFLAKE:
+            # GX connect string for Snowflake looks like
+            # snowflake://<USER_NAME>:<PASSWORD>@<ACCOUNT_NAME>/<DATABASE_NAME>/<SCHEMA_NAME>?warehouse=<WAREHOUSE_NAME>&role=<ROLE_NAME>&application=great_expectations_oss
+            # And SQLAlchemy URL considers whatever is behind `ACCOUNT_NAME` the database of the url.
+            #
+            # We want to extract `DATABASE_NAME` from `DATABASE_NAME/SCHEMA_NAME`,
+            # i.e. everything before the last `/`.
+            return (url.database or "").rsplit("/", maxsplit=1)[0]
+
+        if platform is DataPlatform.POSTGRESQL:
+            # PostgreSQL connect string:
+            # postgresql+psycopg2://<USERNAME>:<PASSWORD>@<HOST>:<PORT>/<DATABASE>
+            return url.database or ""
+
+        # Any other platform: use the url's database as-is and log that this was done.
+        database = url.database or ""
+        logger.warning(f"Using {database} for platform = {platform.value}")
+        return database