From 0344fe27877661f5f2579d6abfec162798c0098d Mon Sep 17 00:00:00 2001 From: alyiwang Date: Mon, 21 Oct 2024 12:05:34 -0700 Subject: [PATCH] Snowflake profiler is not normalizing the [sc-29479] --- metaphor/bigquery/profile/extractor.py | 21 ++++++------ metaphor/common/fieldpath.py | 27 +++++++++++++++- metaphor/postgresql/profile/extractor.py | 18 +++++------ metaphor/snowflake/profile/extractor.py | 21 ++++++------ metaphor/unity_catalog/profile/extractor.py | 36 ++++++++++----------- pyproject.toml | 2 +- tests/snowflake/profile/expected.json | 30 ++++++++--------- 7 files changed, 89 insertions(+), 66 deletions(-) diff --git a/metaphor/bigquery/profile/extractor.py b/metaphor/bigquery/profile/extractor.py index 69c098a5..65860a7f 100644 --- a/metaphor/bigquery/profile/extractor.py +++ b/metaphor/bigquery/profile/extractor.py @@ -10,7 +10,6 @@ print("Please install metaphor[bigquery] extra\n") raise - from metaphor.bigquery.extractor import BigQueryExtractor from metaphor.bigquery.profile.config import BigQueryProfileRunConfig, SamplingConfig from metaphor.bigquery.utils import build_client, get_credentials @@ -18,6 +17,7 @@ from metaphor.common.column_statistics import ColumnStatistics from metaphor.common.entity_id import dataset_normalized_name from metaphor.common.event_util import ENTITY_TYPES +from metaphor.common.fieldpath import build_field_statistics from metaphor.common.filter import DatasetFilter from metaphor.common.logger import get_logger from metaphor.common.utils import safe_float @@ -28,7 +28,6 @@ DatasetFieldStatistics, DatasetLogicalID, DatasetSchema, - FieldStatistics, ) logger = get_logger() @@ -264,15 +263,15 @@ def _parse_result( index += 1 fields.append( - FieldStatistics( - field_path=field.field_path, - distinct_value_count=unique_count, - null_value_count=nulls, - nonnull_value_count=non_nulls, - min_value=min_value, - max_value=max_value, - average=avg, - std_dev=std_dev, + build_field_statistics( + field.field_path, + unique_count, + nulls, + non_nulls, + min_value, + max_value, + avg, + std_dev, ) ) diff --git a/metaphor/common/fieldpath.py b/metaphor/common/fieldpath.py index 9298cca2..6a00173e 100644 --- a/metaphor/common/fieldpath.py +++ b/metaphor/common/fieldpath.py @@ -1,7 +1,7 @@ from enum import Enum from typing import Optional -from metaphor.models.metadata_change_event import SchemaField +from metaphor.models.metadata_change_event import FieldStatistics, SchemaField class FieldDataType(Enum): @@ -59,3 +59,28 @@ def build_schema_field( nullable=nullable, precision=precision, ) + + +def build_field_statistics( + column: str, + unique_count: Optional[float] = None, + nulls: Optional[float] = None, + non_nulls: Optional[float] = None, + min_value: Optional[float] = None, + max_value: Optional[float] = None, + avg: Optional[float] = None, + std_dev: Optional[float] = None, +) -> FieldStatistics: + """ + Build field statistics for a column based on extracted column statistics. + """ + return FieldStatistics( + field_path=column.lower(), + distinct_value_count=unique_count, + null_value_count=nulls, + nonnull_value_count=non_nulls, + min_value=min_value, + max_value=max_value, + average=avg, + std_dev=std_dev, + ) diff --git a/metaphor/postgresql/profile/extractor.py b/metaphor/postgresql/profile/extractor.py index af660c8e..50b6a7bd 100644 --- a/metaphor/postgresql/profile/extractor.py +++ b/metaphor/postgresql/profile/extractor.py @@ -11,13 +11,13 @@ from metaphor.common.column_statistics import ColumnStatistics from metaphor.common.entity_id import dataset_normalized_name from metaphor.common.event_util import ENTITY_TYPES +from metaphor.common.fieldpath import build_field_statistics from metaphor.common.logger import get_logger from metaphor.common.sampling import SamplingConfig from metaphor.common.utils import safe_float from metaphor.models.metadata_change_event import ( Dataset, DatasetFieldStatistics, - FieldStatistics, MaterializationType, ) from metaphor.postgresql.extractor import PostgreSQLExtractor @@ -179,14 +179,14 @@ def _parse_result( index += 1 dataset.field_statistics.field_statistics.append( - FieldStatistics( - field_path=field.field_path, - distinct_value_count=unique_values, - null_value_count=nulls, - nonnull_value_count=(row_count - nulls), - min_value=min_value, - max_value=max_value, - average=avg, + build_field_statistics( + field.field_path, + unique_values, + nulls, + row_count - nulls, + min_value, + max_value, + avg, ) ) diff --git a/metaphor/snowflake/profile/extractor.py b/metaphor/snowflake/profile/extractor.py index 52c1cddf..680988d6 100644 --- a/metaphor/snowflake/profile/extractor.py +++ b/metaphor/snowflake/profile/extractor.py @@ -8,13 +8,13 @@ print("Please install metaphor[snowflake] extra\n") raise - from metaphor.common.base_extractor import BaseExtractor from metaphor.common.entity_id import ( dataset_normalized_name, normalize_full_dataset_name, ) from metaphor.common.event_util import ENTITY_TYPES +from metaphor.common.fieldpath import build_field_statistics from metaphor.common.logger import get_logger from metaphor.common.sampling import SamplingConfig from metaphor.common.snowflake import normalize_snowflake_account @@ -25,7 +25,6 @@ Dataset, DatasetFieldStatistics, DatasetLogicalID, - FieldStatistics, ) from metaphor.snowflake import auth from metaphor.snowflake.extractor import DEFAULT_FILTER, SnowflakeExtractor @@ -307,15 +306,15 @@ def _parse_profiling_result( index += 1 fields.append( - FieldStatistics( - field_path=column, - distinct_value_count=unique_count, - null_value_count=nulls, - nonnull_value_count=non_nulls, - min_value=min_value, - max_value=max_value, - average=avg, - std_dev=std_dev, + build_field_statistics( + column, + unique_count, + nulls, + non_nulls, + min_value, + max_value, + avg, + std_dev, ) ) diff --git a/metaphor/unity_catalog/profile/extractor.py b/metaphor/unity_catalog/profile/extractor.py index b4287c37..32909037 100644 --- a/metaphor/unity_catalog/profile/extractor.py +++ b/metaphor/unity_catalog/profile/extractor.py @@ -4,26 +4,18 @@ from queue import Queue from typing import Collection, List, Optional, Tuple -from databricks.sql.client import Connection - -from metaphor.unity_catalog.extractor import DEFAULT_FILTER -from metaphor.unity_catalog.profile.config import UnityCatalogProfileRunConfig -from metaphor.unity_catalog.utils import ( - create_api, - create_connection_pool, - escape_special_characters, -) - try: from databricks.sdk.service.catalog import ColumnTypeName, TableInfo, TableType except ImportError: print("Please install metaphor[unity_catalog] extra\n") raise +from databricks.sql.client import Connection from metaphor.common.base_extractor import BaseExtractor from metaphor.common.entity_id import normalize_full_dataset_name from metaphor.common.event_util import ENTITY_TYPES +from metaphor.common.fieldpath import build_field_statistics from metaphor.common.logger import get_logger from metaphor.common.utils import safe_float from metaphor.models.crawler_run_metadata import Platform @@ -33,7 +25,13 @@ DatasetFieldStatistics, DatasetLogicalID, DatasetStatistics, - FieldStatistics, +) +from metaphor.unity_catalog.extractor import DEFAULT_FILTER +from metaphor.unity_catalog.profile.config import UnityCatalogProfileRunConfig +from metaphor.unity_catalog.utils import ( + create_api, + create_connection_pool, + escape_special_characters, ) logger = get_logger() @@ -267,14 +265,16 @@ def get_value_from_row(key: str) -> Optional[float]: return safe_float(value) return value - stats = FieldStatistics( - distinct_value_count=get_value_from_row("distinct_count"), - field_path=numeric_column, - max_value=get_value_from_row("max"), - min_value=get_value_from_row("min"), - null_value_count=get_value_from_row("num_nulls"), + field_statistics.field_statistics.append( + build_field_statistics( + numeric_column, + get_value_from_row("distinct_count"), + get_value_from_row("num_nulls"), + None, + get_value_from_row("min"), + get_value_from_row("max"), + ) ) - field_statistics.field_statistics.append(stats) logger.info( f"Profiled {table_info.full_name} " diff --git a/pyproject.toml b/pyproject.toml index 497665e1..3812af4b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "metaphor-connectors" -version = "0.14.130" +version = "0.14.131" license = "Apache-2.0" description = "A collection of Python-based 'connectors' that extract metadata from various sources to ingest into the Metaphor app." authors = ["Metaphor "] diff --git a/tests/snowflake/profile/expected.json b/tests/snowflake/profile/expected.json index f0e34c0d..0d803484 100644 --- a/tests/snowflake/profile/expected.json +++ b/tests/snowflake/profile/expected.json @@ -3,17 +3,17 @@ "fieldStatistics": { "fieldStatistics": [ { - "fieldPath": "ID", + "fieldPath": "id", "nonnullValueCount": 4.0, "nullValueCount": 0.0 }, { - "fieldPath": "COMMON", + "fieldPath": "common", "nonnullValueCount": 4.0, "nullValueCount": 0.0 }, { - "fieldPath": "CLASS", + "fieldPath": "class", "nonnullValueCount": 4.0, "nullValueCount": 0.0 } @@ -29,32 +29,32 @@ "fieldStatistics": { "fieldStatistics": [ { - "fieldPath": "ID", + "fieldPath": "id", "nonnullValueCount": 0.0, "nullValueCount": 0.0 }, { - "fieldPath": "FOO", + "fieldPath": "foo", "nonnullValueCount": 0.0, "nullValueCount": 0.0 }, { - "fieldPath": "BAR", + "fieldPath": "bar", "nonnullValueCount": 0.0, "nullValueCount": 0.0 }, { - "fieldPath": "ID", + "fieldPath": "id", "nonnullValueCount": 0.0, "nullValueCount": 0.0 }, { - "fieldPath": "FoO", + "fieldPath": "foo", "nonnullValueCount": 0.0, "nullValueCount": 0.0 }, { - "fieldPath": "Phone Number", + "fieldPath": "phone number", "nonnullValueCount": 0.0, "nullValueCount": 0.0 } @@ -70,22 +70,22 @@ "fieldStatistics": { "fieldStatistics": [ { - "fieldPath": "D", + "fieldPath": "d", "nonnullValueCount": 5.0, "nullValueCount": 0.0 }, { - "fieldPath": "LOC", + "fieldPath": "loc", "nonnullValueCount": 5.0, "nullValueCount": 0.0 }, { - "fieldPath": "B_ID", + "fieldPath": "b_id", "nonnullValueCount": 5.0, "nullValueCount": 0.0 }, { - "fieldPath": "C", + "fieldPath": "c", "nonnullValueCount": 5.0, "nullValueCount": 0.0 } @@ -101,12 +101,12 @@ "fieldStatistics": { "fieldStatistics": [ { - "fieldPath": "ID", + "fieldPath": "id", "nonnullValueCount": 2.0, "nullValueCount": 0.0 }, { - "fieldPath": "NAME", + "fieldPath": "name", "nonnullValueCount": 2.0, "nullValueCount": 0.0 }