Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Snowflake profiler is not normalizing the [sc-29479] #1014

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 10 additions & 11 deletions metaphor/bigquery/profile/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,14 @@
print("Please install metaphor[bigquery] extra\n")
raise


from metaphor.bigquery.extractor import BigQueryExtractor
from metaphor.bigquery.profile.config import BigQueryProfileRunConfig, SamplingConfig
from metaphor.bigquery.utils import build_client, get_credentials
from metaphor.common.base_extractor import BaseExtractor
from metaphor.common.column_statistics import ColumnStatistics
from metaphor.common.entity_id import dataset_normalized_name
from metaphor.common.event_util import ENTITY_TYPES
from metaphor.common.fieldpath import build_field_statistics
from metaphor.common.filter import DatasetFilter
from metaphor.common.logger import get_logger
from metaphor.common.utils import safe_float
Expand All @@ -28,7 +28,6 @@
DatasetFieldStatistics,
DatasetLogicalID,
DatasetSchema,
FieldStatistics,
)

logger = get_logger()
Expand Down Expand Up @@ -264,15 +263,15 @@ def _parse_result(
index += 1

fields.append(
FieldStatistics(
field_path=field.field_path,
distinct_value_count=unique_count,
null_value_count=nulls,
nonnull_value_count=non_nulls,
min_value=min_value,
max_value=max_value,
average=avg,
std_dev=std_dev,
build_field_statistics(
field.field_path,
unique_count,
nulls,
non_nulls,
min_value,
max_value,
avg,
std_dev,
)
)

Expand Down
27 changes: 26 additions & 1 deletion metaphor/common/fieldpath.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from enum import Enum
from typing import Optional

from metaphor.models.metadata_change_event import SchemaField
from metaphor.models.metadata_change_event import FieldStatistics, SchemaField


class FieldDataType(Enum):
Expand Down Expand Up @@ -59,3 +59,28 @@ def build_schema_field(
nullable=nullable,
precision=precision,
)


def build_field_statistics(
column: str,
unique_count: Optional[float] = None,
nulls: Optional[float] = None,
non_nulls: Optional[float] = None,
min_value: Optional[float] = None,
max_value: Optional[float] = None,
avg: Optional[float] = None,
std_dev: Optional[float] = None,
) -> FieldStatistics:
"""
Build field statistics for a column based on extracted column statistics.
"""
return FieldStatistics(
field_path=column.lower(),
distinct_value_count=unique_count,
null_value_count=nulls,
nonnull_value_count=non_nulls,
min_value=min_value,
max_value=max_value,
average=avg,
std_dev=std_dev,
)
18 changes: 9 additions & 9 deletions metaphor/postgresql/profile/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,13 @@
from metaphor.common.column_statistics import ColumnStatistics
from metaphor.common.entity_id import dataset_normalized_name
from metaphor.common.event_util import ENTITY_TYPES
from metaphor.common.fieldpath import build_field_statistics
from metaphor.common.logger import get_logger
from metaphor.common.sampling import SamplingConfig
from metaphor.common.utils import safe_float
from metaphor.models.metadata_change_event import (
Dataset,
DatasetFieldStatistics,
FieldStatistics,
MaterializationType,
)
from metaphor.postgresql.extractor import PostgreSQLExtractor
Expand Down Expand Up @@ -179,14 +179,14 @@ def _parse_result(
index += 1

dataset.field_statistics.field_statistics.append(
FieldStatistics(
field_path=field.field_path,
distinct_value_count=unique_values,
null_value_count=nulls,
nonnull_value_count=(row_count - nulls),
min_value=min_value,
max_value=max_value,
average=avg,
build_field_statistics(
field.field_path,
unique_values,
nulls,
row_count - nulls,
min_value,
max_value,
avg,
)
)

Expand Down
21 changes: 10 additions & 11 deletions metaphor/snowflake/profile/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,13 @@
print("Please install metaphor[snowflake] extra\n")
raise


from metaphor.common.base_extractor import BaseExtractor
from metaphor.common.entity_id import (
dataset_normalized_name,
normalize_full_dataset_name,
)
from metaphor.common.event_util import ENTITY_TYPES
from metaphor.common.fieldpath import build_field_statistics
from metaphor.common.logger import get_logger
from metaphor.common.sampling import SamplingConfig
from metaphor.common.snowflake import normalize_snowflake_account
Expand All @@ -25,7 +25,6 @@
Dataset,
DatasetFieldStatistics,
DatasetLogicalID,
FieldStatistics,
)
from metaphor.snowflake import auth
from metaphor.snowflake.extractor import DEFAULT_FILTER, SnowflakeExtractor
Expand Down Expand Up @@ -307,15 +306,15 @@ def _parse_profiling_result(
index += 1

fields.append(
FieldStatistics(
field_path=column,
distinct_value_count=unique_count,
null_value_count=nulls,
nonnull_value_count=non_nulls,
min_value=min_value,
max_value=max_value,
average=avg,
std_dev=std_dev,
build_field_statistics(
column,
unique_count,
nulls,
non_nulls,
min_value,
max_value,
avg,
std_dev,
)
)

Expand Down
36 changes: 18 additions & 18 deletions metaphor/unity_catalog/profile/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,26 +4,18 @@
from queue import Queue
from typing import Collection, List, Optional, Tuple

from databricks.sql.client import Connection

from metaphor.unity_catalog.extractor import DEFAULT_FILTER
from metaphor.unity_catalog.profile.config import UnityCatalogProfileRunConfig
from metaphor.unity_catalog.utils import (
create_api,
create_connection_pool,
escape_special_characters,
)

try:
from databricks.sdk.service.catalog import ColumnTypeName, TableInfo, TableType
except ImportError:
print("Please install metaphor[unity_catalog] extra\n")
raise

from databricks.sql.client import Connection

from metaphor.common.base_extractor import BaseExtractor
from metaphor.common.entity_id import normalize_full_dataset_name
from metaphor.common.event_util import ENTITY_TYPES
from metaphor.common.fieldpath import build_field_statistics
from metaphor.common.logger import get_logger
from metaphor.common.utils import safe_float
from metaphor.models.crawler_run_metadata import Platform
Expand All @@ -33,7 +25,13 @@
DatasetFieldStatistics,
DatasetLogicalID,
DatasetStatistics,
FieldStatistics,
)
from metaphor.unity_catalog.extractor import DEFAULT_FILTER
from metaphor.unity_catalog.profile.config import UnityCatalogProfileRunConfig
from metaphor.unity_catalog.utils import (
create_api,
create_connection_pool,
escape_special_characters,
)

logger = get_logger()
Expand Down Expand Up @@ -267,14 +265,16 @@ def get_value_from_row(key: str) -> Optional[float]:
return safe_float(value)
return value

stats = FieldStatistics(
distinct_value_count=get_value_from_row("distinct_count"),
field_path=numeric_column,
max_value=get_value_from_row("max"),
min_value=get_value_from_row("min"),
null_value_count=get_value_from_row("num_nulls"),
field_statistics.field_statistics.append(
build_field_statistics(
numeric_column,
get_value_from_row("distinct_count"),
get_value_from_row("num_nulls"),
None,
get_value_from_row("min"),
get_value_from_row("max"),
)
)
field_statistics.field_statistics.append(stats)

logger.info(
f"Profiled {table_info.full_name} "
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "metaphor-connectors"
version = "0.14.130"
version = "0.14.131"
license = "Apache-2.0"
description = "A collection of Python-based 'connectors' that extract metadata from various sources to ingest into the Metaphor app."
authors = ["Metaphor <[email protected]>"]
Expand Down
30 changes: 15 additions & 15 deletions tests/snowflake/profile/expected.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,17 @@
"fieldStatistics": {
"fieldStatistics": [
{
"fieldPath": "ID",
"fieldPath": "id",
"nonnullValueCount": 4.0,
"nullValueCount": 0.0
},
{
"fieldPath": "COMMON",
"fieldPath": "common",
"nonnullValueCount": 4.0,
"nullValueCount": 0.0
},
{
"fieldPath": "CLASS",
"fieldPath": "class",
"nonnullValueCount": 4.0,
"nullValueCount": 0.0
}
Expand All @@ -29,32 +29,32 @@
"fieldStatistics": {
"fieldStatistics": [
{
"fieldPath": "ID",
"fieldPath": "id",
"nonnullValueCount": 0.0,
"nullValueCount": 0.0
},
{
"fieldPath": "FOO",
"fieldPath": "foo",
"nonnullValueCount": 0.0,
"nullValueCount": 0.0
},
{
"fieldPath": "BAR",
"fieldPath": "bar",
"nonnullValueCount": 0.0,
"nullValueCount": 0.0
},
{
"fieldPath": "ID",
"fieldPath": "id",
"nonnullValueCount": 0.0,
"nullValueCount": 0.0
},
{
"fieldPath": "FoO",
"fieldPath": "foo",
"nonnullValueCount": 0.0,
"nullValueCount": 0.0
},
{
"fieldPath": "Phone Number",
"fieldPath": "phone number",
"nonnullValueCount": 0.0,
"nullValueCount": 0.0
}
Expand All @@ -70,22 +70,22 @@
"fieldStatistics": {
"fieldStatistics": [
{
"fieldPath": "D",
"fieldPath": "d",
"nonnullValueCount": 5.0,
"nullValueCount": 0.0
},
{
"fieldPath": "LOC",
"fieldPath": "loc",
"nonnullValueCount": 5.0,
"nullValueCount": 0.0
},
{
"fieldPath": "B_ID",
"fieldPath": "b_id",
"nonnullValueCount": 5.0,
"nullValueCount": 0.0
},
{
"fieldPath": "C",
"fieldPath": "c",
"nonnullValueCount": 5.0,
"nullValueCount": 0.0
}
Expand All @@ -101,12 +101,12 @@
"fieldStatistics": {
"fieldStatistics": [
{
"fieldPath": "ID",
"fieldPath": "id",
"nonnullValueCount": 2.0,
"nullValueCount": 0.0
},
{
"fieldPath": "NAME",
"fieldPath": "name",
"nonnullValueCount": 2.0,
"nullValueCount": 0.0
}
Expand Down
Loading