From 27ea1a70ce0c39ce08a0bda9bb1a1a02ee1dbe0c Mon Sep 17 00:00:00 2001 From: Mars Lan Date: Fri, 13 Oct 2023 07:58:51 -0700 Subject: [PATCH] Fix dbt 1.6.6 manifest schema validation errors (#627) --- metaphor/dbt/manifest_parser.py | 25 ++++++- pyproject.toml | 2 +- tests/dbt/test_extractor.py | 125 ++++++++++++++++++++++++++++++++ 3 files changed, 148 insertions(+), 4 deletions(-) diff --git a/metaphor/dbt/manifest_parser.py b/metaphor/dbt/manifest_parser.py index 4edb4ead..13cf89c8 100644 --- a/metaphor/dbt/manifest_parser.py +++ b/metaphor/dbt/manifest_parser.py @@ -213,14 +213,16 @@ def __init__( if self._account and platform == DataPlatform.SNOWFLAKE: self._account = normalize_snowflake_account(self._account) - def sanitize_manifest(self, manifest_json: Dict, schema_version: str) -> Dict: + @staticmethod + def sanitize_manifest(manifest_json: Dict, schema_version: str) -> Dict: manifest_json = deepcopy(manifest_json) # It's possible for dbt to generate "docs block" in the manifest that doesn't # conform to the JSON schema. Specifically, the "name" field can be None in # some cases. Since the field is not actually used, it's safe to clear it out to # avoid hitting any validation issues. - manifest_json["docs"] = {} + if manifest_json.get("docs") is not None: + manifest_json["docs"] = {} # dbt can erroneously generate null for test's "depends_on" # Filter these out for now @@ -245,6 +247,23 @@ def sanitize_manifest(self, manifest_json: Dict, schema_version: str) -> Dict: for measure in semantic_model.get("measures", []): measure.pop("label", None) + # Temporarily strip off all the extra "fill_nulls_with" & + # "join_to_teimspine" fields in metrics' "type_params" until + # https://github.com/dbt-labs/dbt-core/issues/8851 is fixed + for _, metric in manifest_json.get("metrics", {}).items(): + type_params = metric.get("type_params", {}) + + measure = type_params.get("measure", {}) + if measure is not None: + measure.pop("fill_nulls_with", None) + measure.pop("join_to_timespine", None) + + input_measures = type_params.get("input_measures", []) + if input_measures is not None: + for input_measure in input_measures: + input_measure.pop("fill_nulls_with", None) + input_measure.pop("join_to_timespine", None) + return manifest_json def parse(self, manifest_json: Dict) -> None: @@ -257,7 +276,7 @@ def parse(self, manifest_json: Dict) -> None: ) logger.info(f"parsing manifest.json {schema_version} ...") - manifest_json = self.sanitize_manifest(manifest_json, schema_version) + manifest_json = ManifestParser.sanitize_manifest(manifest_json, schema_version) dbt_manifest_class = dbt_version_manifest_class_map.get(schema_version) if dbt_manifest_class is None: diff --git a/pyproject.toml b/pyproject.toml index f1de8d48..c653cf7f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "metaphor-connectors" -version = "0.13.4" +version = "0.13.5" license = "Apache-2.0" description = "A collection of Python-based 'connectors' that extract metadata from various sources to ingest into the Metaphor app." authors = ["Metaphor "] diff --git a/tests/dbt/test_extractor.py b/tests/dbt/test_extractor.py index 99d80314..584e9d78 100644 --- a/tests/dbt/test_extractor.py +++ b/tests/dbt/test_extractor.py @@ -4,6 +4,7 @@ from metaphor.common.event_util import EventUtil from metaphor.dbt.config import DbtRunConfig, MetaOwnership, MetaTag from metaphor.dbt.extractor import DbtExtractor +from metaphor.dbt.manifest_parser import ManifestParser from tests.test_utils import load_json @@ -89,3 +90,127 @@ async def _test_project( events = [EventUtil.trim_event(e) for e in await extractor.extract()] assert events == load_json(expected) + + +def test_sanitize_manifest_empty_docs(test_root_dir): + manifest = { + "docs": {"foo": "bar"}, + } + + assert ManifestParser.sanitize_manifest(manifest, "v10") == { + "docs": {}, + } + + +def test_sanitize_manifest_strip_null_tests_depends_on(test_root_dir): + manifest = { + "nodes": { + "test.example": { + "depends_on": { + "macros": [None, "macro1"], + "nodes": ["node1", None, "node2"], + } + } + } + } + + assert ManifestParser.sanitize_manifest(manifest, "v10") == { + "nodes": { + "test.example": { + "depends_on": { + "macros": ["macro1"], + "nodes": ["node1", "node2"], + } + } + } + } + + +def test_sanitize_manifest_strip_semantic_models_labels(test_root_dir): + manifest = { + "semantic_models": { + "model1": { + "label": "foo", + "entities": [ + { + "name": "entity1", + "label": "bar", + } + ], + "dimensions": [ + { + "name": "dim1", + "label": "baz", + } + ], + "measures": [ + { + "name": "measure1", + "label": "qux", + } + ], + } + } + } + + assert ManifestParser.sanitize_manifest(manifest, "v10") == { + "semantic_models": { + "model1": { + "entities": [ + { + "name": "entity1", + } + ], + "dimensions": [ + { + "name": "dim1", + } + ], + "measures": [ + { + "name": "measure1", + } + ], + } + } + } + + +def test_sanitize_manifest_strip_metric_type_params_extra_fields(test_root_dir): + manifest = { + "metrics": { + "metric1": { + "type_params": { + "measure": { + "name": "measure1", + "fill_nulls_with": "foo", + "join_to_timespine": "bar", + }, + "input_measures": [ + { + "name": "measure2", + "fill_nulls_with": "foo", + "join_to_timespine": "bar", + } + ], + } + } + } + } + + assert ManifestParser.sanitize_manifest(manifest, "v10") == { + "metrics": { + "metric1": { + "type_params": { + "measure": { + "name": "measure1", + }, + "input_measures": [ + { + "name": "measure2", + } + ], + } + } + } + }