From 27ea1a70ce0c39ce08a0bda9bb1a1a02ee1dbe0c Mon Sep 17 00:00:00 2001
From: Mars Lan <mars.th.lan@gmail.com>
Date: Fri, 13 Oct 2023 07:58:51 -0700
Subject: [PATCH] Fix dbt 1.6.6 manifest schema validation errors (#627)

---
 metaphor/dbt/manifest_parser.py |  25 ++++++-
 pyproject.toml                  |   2 +-
 tests/dbt/test_extractor.py     | 125 ++++++++++++++++++++++++++++++++
 3 files changed, 148 insertions(+), 4 deletions(-)

diff --git a/metaphor/dbt/manifest_parser.py b/metaphor/dbt/manifest_parser.py
index 4edb4ead..13cf89c8 100644
--- a/metaphor/dbt/manifest_parser.py
+++ b/metaphor/dbt/manifest_parser.py
@@ -213,14 +213,16 @@ def __init__(
         if self._account and platform == DataPlatform.SNOWFLAKE:
             self._account = normalize_snowflake_account(self._account)
 
-    def sanitize_manifest(self, manifest_json: Dict, schema_version: str) -> Dict:
+    @staticmethod
+    def sanitize_manifest(manifest_json: Dict, schema_version: str) -> Dict:
         manifest_json = deepcopy(manifest_json)
 
         # It's possible for dbt to generate "docs block" in the manifest that doesn't
         # conform to the JSON schema. Specifically, the "name" field can be None in
         # some cases. Since the field is not actually used, it's safe to clear it out to
         # avoid hitting any validation issues.
-        manifest_json["docs"] = {}
+        if manifest_json.get("docs") is not None:
+            manifest_json["docs"] = {}
 
         # dbt can erroneously generate null for test's "depends_on"
         # Filter these out for now
@@ -245,6 +247,23 @@ def sanitize_manifest(self, manifest_json: Dict, schema_version: str) -> Dict:
             for measure in semantic_model.get("measures", []):
                 measure.pop("label", None)
 
+        # Temporarily strip off all the extra "fill_nulls_with" &
+        # "join_to_teimspine" fields in metrics' "type_params" until
+        # https://github.com/dbt-labs/dbt-core/issues/8851 is fixed
+        for _, metric in manifest_json.get("metrics", {}).items():
+            type_params = metric.get("type_params", {})
+
+            measure = type_params.get("measure", {})
+            if measure is not None:
+                measure.pop("fill_nulls_with", None)
+                measure.pop("join_to_timespine", None)
+
+            input_measures = type_params.get("input_measures", [])
+            if input_measures is not None:
+                for input_measure in input_measures:
+                    input_measure.pop("fill_nulls_with", None)
+                    input_measure.pop("join_to_timespine", None)
+
         return manifest_json
 
     def parse(self, manifest_json: Dict) -> None:
@@ -257,7 +276,7 @@ def parse(self, manifest_json: Dict) -> None:
         )
         logger.info(f"parsing manifest.json {schema_version} ...")
 
-        manifest_json = self.sanitize_manifest(manifest_json, schema_version)
+        manifest_json = ManifestParser.sanitize_manifest(manifest_json, schema_version)
 
         dbt_manifest_class = dbt_version_manifest_class_map.get(schema_version)
         if dbt_manifest_class is None:
diff --git a/pyproject.toml b/pyproject.toml
index f1de8d48..c653cf7f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "metaphor-connectors"
-version = "0.13.4"
+version = "0.13.5"
 license = "Apache-2.0"
 description = "A collection of Python-based 'connectors' that extract metadata from various sources to ingest into the Metaphor app."
 authors = ["Metaphor <dev@metaphor.io>"]
diff --git a/tests/dbt/test_extractor.py b/tests/dbt/test_extractor.py
index 99d80314..584e9d78 100644
--- a/tests/dbt/test_extractor.py
+++ b/tests/dbt/test_extractor.py
@@ -4,6 +4,7 @@
 from metaphor.common.event_util import EventUtil
 from metaphor.dbt.config import DbtRunConfig, MetaOwnership, MetaTag
 from metaphor.dbt.extractor import DbtExtractor
+from metaphor.dbt.manifest_parser import ManifestParser
 from tests.test_utils import load_json
 
 
@@ -89,3 +90,127 @@ async def _test_project(
     events = [EventUtil.trim_event(e) for e in await extractor.extract()]
 
     assert events == load_json(expected)
+
+
+def test_sanitize_manifest_empty_docs(test_root_dir):
+    manifest = {
+        "docs": {"foo": "bar"},
+    }
+
+    assert ManifestParser.sanitize_manifest(manifest, "v10") == {
+        "docs": {},
+    }
+
+
+def test_sanitize_manifest_strip_null_tests_depends_on(test_root_dir):
+    manifest = {
+        "nodes": {
+            "test.example": {
+                "depends_on": {
+                    "macros": [None, "macro1"],
+                    "nodes": ["node1", None, "node2"],
+                }
+            }
+        }
+    }
+
+    assert ManifestParser.sanitize_manifest(manifest, "v10") == {
+        "nodes": {
+            "test.example": {
+                "depends_on": {
+                    "macros": ["macro1"],
+                    "nodes": ["node1", "node2"],
+                }
+            }
+        }
+    }
+
+
+def test_sanitize_manifest_strip_semantic_models_labels(test_root_dir):
+    manifest = {
+        "semantic_models": {
+            "model1": {
+                "label": "foo",
+                "entities": [
+                    {
+                        "name": "entity1",
+                        "label": "bar",
+                    }
+                ],
+                "dimensions": [
+                    {
+                        "name": "dim1",
+                        "label": "baz",
+                    }
+                ],
+                "measures": [
+                    {
+                        "name": "measure1",
+                        "label": "qux",
+                    }
+                ],
+            }
+        }
+    }
+
+    assert ManifestParser.sanitize_manifest(manifest, "v10") == {
+        "semantic_models": {
+            "model1": {
+                "entities": [
+                    {
+                        "name": "entity1",
+                    }
+                ],
+                "dimensions": [
+                    {
+                        "name": "dim1",
+                    }
+                ],
+                "measures": [
+                    {
+                        "name": "measure1",
+                    }
+                ],
+            }
+        }
+    }
+
+
+def test_sanitize_manifest_strip_metric_type_params_extra_fields(test_root_dir):
+    manifest = {
+        "metrics": {
+            "metric1": {
+                "type_params": {
+                    "measure": {
+                        "name": "measure1",
+                        "fill_nulls_with": "foo",
+                        "join_to_timespine": "bar",
+                    },
+                    "input_measures": [
+                        {
+                            "name": "measure2",
+                            "fill_nulls_with": "foo",
+                            "join_to_timespine": "bar",
+                        }
+                    ],
+                }
+            }
+        }
+    }
+
+    assert ManifestParser.sanitize_manifest(manifest, "v10") == {
+        "metrics": {
+            "metric1": {
+                "type_params": {
+                    "measure": {
+                        "name": "measure1",
+                    },
+                    "input_measures": [
+                        {
+                            "name": "measure2",
+                        }
+                    ],
+                }
+            }
+        }
+    }