From fd89af21b44b22a396cce7fa3494188d5e956957 Mon Sep 17 00:00:00 2001 From: William Shin Date: Thu, 14 Mar 2024 13:21:50 -0700 Subject: [PATCH] [FEATURE] MetricListMetricRetriever - 0.18.x (#9615) --- ...mn_descriptive_metrics_metric_retriever.py | 97 +-- .../metric_list_metric_retriever.py | 253 ++++++ .../metric_repository/metric_retriever.py | 72 ++ .../experimental/metric_repository/metrics.py | 9 +- .../test_metric_list_metric_retriever.py | 791 ++++++++++++++++++ ...etric_list_metric_retriever_integration.py | 287 +++++++ 6 files changed, 1417 insertions(+), 92 deletions(-) create mode 100644 great_expectations/experimental/metric_repository/metric_list_metric_retriever.py create mode 100644 tests/experimental/metric_repository/test_metric_list_metric_retriever.py create mode 100644 tests/experimental/metric_repository/test_metric_list_metric_retriever_integration.py diff --git a/great_expectations/experimental/metric_repository/column_descriptive_metrics_metric_retriever.py b/great_expectations/experimental/metric_repository/column_descriptive_metrics_metric_retriever.py index d4ca0040363e..ae8fa6dc4cf7 100644 --- a/great_expectations/experimental/metric_repository/column_descriptive_metrics_metric_retriever.py +++ b/great_expectations/experimental/metric_repository/column_descriptive_metrics_metric_retriever.py @@ -1,7 +1,7 @@ from __future__ import annotations from itertools import chain -from typing import TYPE_CHECKING, Any, List, Sequence +from typing import TYPE_CHECKING, List, Sequence from great_expectations.compatibility.typing_extensions import override from great_expectations.experimental.metric_repository.metric_retriever import ( @@ -10,16 +10,13 @@ from great_expectations.experimental.metric_repository.metrics import ( ColumnMetric, Metric, - TableMetric, ) if TYPE_CHECKING: from great_expectations.data_context import AbstractDataContext from great_expectations.datasource.fluent import BatchRequest from 
great_expectations.validator.metrics_calculator import ( - _AbortedMetricsInfoDict, _MetricKey, - _MetricsDict, ) @@ -31,7 +28,7 @@ def __init__(self, context: AbstractDataContext): @override def get_metrics(self, batch_request: BatchRequest) -> Sequence[Metric]: - table_metrics = self._get_table_metrics(batch_request) + table_metrics = self._calculate_table_metrics(batch_request) # We need to skip columns that do not report a type, because the metric computation # to determine semantic type will fail. @@ -68,96 +65,14 @@ def get_metrics(self, batch_request: BatchRequest) -> Sequence[Metric]: ) return bundled_list - def _get_table_metrics(self, batch_request: BatchRequest) -> Sequence[Metric]: - table_metric_names = ["table.row_count", "table.columns", "table.column_types"] - table_metric_configs = self._generate_table_metric_configurations( - table_metric_names - ) - batch_id, computed_metrics, aborted_metrics = self._compute_metrics( - batch_request, table_metric_configs - ) - + def _calculate_table_metrics(self, batch_request: BatchRequest) -> Sequence[Metric]: metrics = [ - self._get_table_row_count(batch_id, computed_metrics, aborted_metrics), - self._get_table_columns(batch_id, computed_metrics, aborted_metrics), - self._get_table_column_types(batch_id, computed_metrics, aborted_metrics), + self._get_table_row_count(batch_request), + self._get_table_columns(batch_request), + self._get_table_column_types(batch_request), ] - return metrics - def _get_table_row_count( - self, - batch_id: str, - computed_metrics: _MetricsDict, - aborted_metrics: _AbortedMetricsInfoDict, - ) -> Metric: - metric_name = "table.row_count" - value, exception = self._get_metric_from_computed_metrics( - metric_name=metric_name, - computed_metrics=computed_metrics, - aborted_metrics=aborted_metrics, - ) - return TableMetric[int]( - batch_id=batch_id, - metric_name=metric_name, - value=value, - exception=exception, - ) - - def _get_table_columns( - self, - batch_id: str, - 
computed_metrics: _MetricsDict, - aborted_metrics: _AbortedMetricsInfoDict, - ) -> Metric: - metric_name = "table.columns" - value, exception = self._get_metric_from_computed_metrics( - metric_name=metric_name, - computed_metrics=computed_metrics, - aborted_metrics=aborted_metrics, - ) - return TableMetric[List[str]]( - batch_id=batch_id, - metric_name=metric_name, - value=value, - exception=exception, - ) - - def _get_table_column_types( - self, - batch_id: str, - computed_metrics: _MetricsDict, - aborted_metrics: _AbortedMetricsInfoDict, - ) -> Metric: - metric_name = "table.column_types" - metric_lookup_key: _MetricKey = (metric_name, tuple(), "include_nested=True") - value, exception = self._get_metric_from_computed_metrics( - metric_name=metric_name, - metric_lookup_key=metric_lookup_key, - computed_metrics=computed_metrics, - aborted_metrics=aborted_metrics, - ) - raw_column_types: list[dict[str, Any]] = value - # If type is not found, don't add empty type field. This can happen if our db introspection fails. 
- column_types_converted_to_str: list[dict[str, str]] = [] - for raw_column_type in raw_column_types: - if raw_column_type.get("type"): - column_types_converted_to_str.append( - { - "name": raw_column_type["name"], - "type": str(raw_column_type["type"]), - } - ) - else: - column_types_converted_to_str.append({"name": raw_column_type["name"]}) - - return TableMetric[List[str]]( - batch_id=batch_id, - metric_name=metric_name, - value=column_types_converted_to_str, - exception=exception, - ) - def _get_numeric_column_metrics( self, batch_request: BatchRequest, column_list: List[str] ) -> Sequence[Metric]: diff --git a/great_expectations/experimental/metric_repository/metric_list_metric_retriever.py b/great_expectations/experimental/metric_repository/metric_list_metric_retriever.py new file mode 100644 index 000000000000..efbb08e00991 --- /dev/null +++ b/great_expectations/experimental/metric_repository/metric_list_metric_retriever.py @@ -0,0 +1,253 @@ +from __future__ import annotations + +from itertools import chain +from typing import TYPE_CHECKING, List, Optional, Sequence + +from great_expectations.compatibility.typing_extensions import override +from great_expectations.experimental.metric_repository.metric_retriever import ( + MetricRetriever, +) +from great_expectations.experimental.metric_repository.metrics import ( + ColumnMetric, + Metric, + MetricTypes, +) + +if TYPE_CHECKING: + from great_expectations.data_context import AbstractDataContext + from great_expectations.datasource.fluent.batch_request import BatchRequest + from great_expectations.validator.validator import ( + Validator, + ) + + +class MetricListMetricRetriever(MetricRetriever): + def __init__(self, context: AbstractDataContext): + super().__init__(context=context) + self._validator: Validator | None = None + + @override + def get_metrics( + self, + batch_request: BatchRequest, + metric_list: Optional[List[MetricTypes]] = None, + ) -> Sequence[Metric]: + metrics_result: List[Metric] = [] + + if 
not metric_list: + raise ValueError("metric_list cannot be empty") + + self._check_valid_metric_types(metric_list) + + table_metrics = self._calculate_table_metrics( + batch_request=batch_request, metric_list=metric_list + ) + metrics_result.extend(table_metrics) + + # exit early if only Table Metrics exist + if not self._column_metrics_in_metric_list(metric_list): + return metrics_result + + table_column_types = list( + filter( + lambda m: m.metric_name == MetricTypes.TABLE_COLUMN_TYPES, table_metrics + ) + )[0] + + # We need to skip columns that do not report a type, because the metric computation + # to determine semantic type will fail. + exclude_column_names = self._get_columns_to_exclude(table_column_types) + + numeric_column_names = self._get_numeric_column_names( + batch_request=batch_request, exclude_column_names=exclude_column_names + ) + timestamp_column_names = self._get_timestamp_column_names( + batch_request=batch_request, exclude_column_names=exclude_column_names + ) + numeric_column_metrics = self._get_numeric_column_metrics( + metric_list, batch_request, numeric_column_names + ) + timestamp_column_metrics = self._get_timestamp_column_metrics( + metric_list, batch_request, timestamp_column_names + ) + all_column_names: List[str] = self._get_all_column_names(table_metrics) + non_numeric_column_metrics = self._get_non_numeric_column_metrics( + metric_list, batch_request, all_column_names + ) + + bundled_list = list( + chain( + table_metrics, + numeric_column_metrics, + timestamp_column_metrics, + non_numeric_column_metrics, + ) + ) + + return bundled_list + + def _get_non_numeric_column_metrics( + self, + metrics_list: List[MetricTypes], + batch_request: BatchRequest, + column_list: List[str], + ) -> Sequence[Metric]: + """Calculate column metrics for non-numeric columns. + + Args: + metrics_list (List[MetricTypes]): list of metrics sent from Agent. + batch_request (BatchRequest): for current batch. 
+ column_list (List[str]): list of non-numeric columns. + + Returns: + Sequence[Metric]: List of metrics for non-numeric columns. + """ + # currently only the null-count is supported. If more metrics are added, this set will need to be updated. + column_metric_names = {MetricTypes.COLUMN_NULL_COUNT} + metrics: list[Metric] = [] + metrics_list_as_set = set(metrics_list) + metrics_to_calculate = sorted( + column_metric_names.intersection(metrics_list_as_set) + ) + + if not metrics_to_calculate: + return metrics + else: + return self._get_column_metrics( + batch_request=batch_request, + column_list=column_list, + column_metric_names=list(metrics_to_calculate), + column_metric_type=ColumnMetric[int], + ) + + def _get_numeric_column_metrics( + self, + metrics_list: List[MetricTypes], + batch_request: BatchRequest, + column_list: List[str], + ) -> Sequence[Metric]: + """Calculate column metrics for numeric columns. + + Args: + metrics_list (List[MetricTypes]): list of metrics sent from Agent. + batch_request (BatchRequest): for current batch. + column_list (List[str]): list of numeric columns. + + Returns: + Sequence[Metric]: List of metrics for numeric columns. + """ + metrics: list[Metric] = [] + column_metric_names = { + MetricTypes.COLUMN_MIN, + MetricTypes.COLUMN_MAX, + MetricTypes.COLUMN_MEAN, + MetricTypes.COLUMN_MEDIAN, + } + metrics_list_as_set = set(metrics_list) + metrics_to_calculate = sorted( + column_metric_names.intersection(metrics_list_as_set) + ) + if not metrics_to_calculate: + return metrics + + return self._get_column_metrics( + batch_request=batch_request, + column_list=column_list, + column_metric_names=list(metrics_to_calculate), + column_metric_type=ColumnMetric[float], + ) + + def _get_timestamp_column_metrics( + self, + metrics_list: List[MetricTypes], + batch_request: BatchRequest, + column_list: List[str], + ) -> Sequence[Metric]: + """Calculate column metrics for timestamp columns. 
+ + Args: + metrics_list (List[MetricTypes]): list of metrics sent from Agent. + batch_request (BatchRequest): for current batch. + column_list (List[str]): list of timestamp columns. + + Returns: + Sequence[Metric]: List of metrics for timestamp columns. + """ + metrics: list[Metric] = [] + column_metric_names = { + MetricTypes.COLUMN_MIN, + MetricTypes.COLUMN_MAX, + # MetricTypes.COLUMN_MEAN, # Currently not supported for timestamp in Snowflake + # MetricTypes.COLUMN_MEDIAN, # Currently not supported for timestamp in Snowflake + } + metrics_list_as_set = set(metrics_list) + metrics_to_calculate = sorted( + column_metric_names.intersection(metrics_list_as_set) + ) + if not metrics_to_calculate: + return metrics + + # Note: Timestamps are returned as strings for Snowflake, this may need to be adjusted + # when we support other datasources. For example in Pandas, timestamps can be returned as Timestamp(). + return self._get_column_metrics( + batch_request=batch_request, + column_list=column_list, + column_metric_names=list(metrics_to_calculate), + column_metric_type=ColumnMetric[str], + ) + + def _calculate_table_metrics( + self, batch_request: BatchRequest, metric_list: List[MetricTypes] + ) -> List[Metric]: + """Calculate table metrics, which include row_count, column names and types. + + Args: + metrics_list (List[MetricTypes]): list of metrics sent from Agent. + batch_request (BatchRequest): for current batch. + + Returns: + Sequence[Metric]: List of table metrics. 
+ """ + metrics: List[Metric] = [] + if MetricTypes.TABLE_ROW_COUNT in metric_list: + metrics.append(self._get_table_row_count(batch_request=batch_request)) + if MetricTypes.TABLE_COLUMNS in metric_list: + metrics.append(self._get_table_columns(batch_request=batch_request)) + if MetricTypes.TABLE_COLUMN_TYPES in metric_list: + metrics.append(self._get_table_column_types(batch_request=batch_request)) + return metrics + + def _check_valid_metric_types(self, metric_list: List[MetricTypes]) -> bool: + """Check whether all the metric types in the list are valid. + + Args: + metric_list (List[MetricTypes]): list of MetricTypes that are passed in to MetricListMetricRetriever. + + Returns: + bool: True if all the metric types in the list are valid, False otherwise. + """ + for metric in metric_list: + if metric not in MetricTypes: + return False + return True + + def _column_metrics_in_metric_list(self, metric_list: List[MetricTypes]) -> bool: + """Helper method to check whether any column metrics are present in the metric list. + + Args: + metric_list (List[MetricTypes]): list of MetricTypes that are passed in to MetricListMetricRetriever. + + Returns: + bool: True if any column metrics are present in the metric list, False otherwise. 
+ """ + column_metrics: List[MetricTypes] = [ + MetricTypes.COLUMN_MIN, + MetricTypes.COLUMN_MAX, + MetricTypes.COLUMN_MEDIAN, + MetricTypes.COLUMN_MEAN, + MetricTypes.COLUMN_NULL_COUNT, + ] + for metric in column_metrics: + if metric in metric_list: + return True + return False diff --git a/great_expectations/experimental/metric_repository/metric_retriever.py b/great_expectations/experimental/metric_repository/metric_retriever.py index 8ef508bae84b..7b1a472c28e1 100644 --- a/great_expectations/experimental/metric_repository/metric_retriever.py +++ b/great_expectations/experimental/metric_repository/metric_retriever.py @@ -15,6 +15,7 @@ ColumnMetric, MetricException, MetricTypes, + TableMetric, ) from great_expectations.rule_based_profiler.domain_builder import ColumnDomainBuilder from great_expectations.validator.exception_info import ExceptionInfo @@ -174,6 +175,25 @@ def _get_column_names_for_semantic_types( ) return column_names + def _get_table_metrics( + self, + batch_request: BatchRequest, + metric_name: MetricTypes | str, + metric_type: type[Metric], + ) -> Metric: + metric_configs = self._generate_table_metric_configurations([metric_name]) + batch_id, computed_metrics, aborted_metrics = self._compute_metrics( + batch_request, metric_configs + ) + value, exception = self._get_metric_from_computed_metrics( + metric_name=metric_name, + computed_metrics=computed_metrics, + aborted_metrics=aborted_metrics, + ) + return metric_type( + batch_id=batch_id, metric_name=metric_name, value=value, exception=exception + ) + def _get_column_metrics( self, batch_request: BatchRequest, @@ -189,6 +209,7 @@ def _get_column_metrics( ) # Convert computed_metrics + ColumnMetric.update_forward_refs() metrics: list[Metric] = [] metric_lookup_key: _MetricKey @@ -233,3 +254,54 @@ def _get_all_column_names(self, metrics: Sequence[Metric]) -> List[str]: if metric.metric_name == MetricTypes.TABLE_COLUMNS: column_list = metric.value return column_list + + def _get_table_row_count(self, 
batch_request: BatchRequest) -> Metric: + return self._get_table_metrics( + batch_request=batch_request, + metric_name=MetricTypes.TABLE_ROW_COUNT, + metric_type=TableMetric[int], + ) + + def _get_table_columns(self, batch_request: BatchRequest) -> Metric: + return self._get_table_metrics( + batch_request=batch_request, + metric_name=MetricTypes.TABLE_COLUMNS, + metric_type=TableMetric[List[str]], + ) + + def _get_table_column_types(self, batch_request: BatchRequest) -> Metric: + metric_name = MetricTypes.TABLE_COLUMN_TYPES + + metric_lookup_key: _MetricKey = (metric_name, tuple(), "include_nested=True") + table_metric_configs = self._generate_table_metric_configurations( + table_metric_names=[metric_name] + ) + batch_id, computed_metrics, aborted_metrics = self._compute_metrics( + batch_request, table_metric_configs + ) + value, exception = self._get_metric_from_computed_metrics( + metric_name=metric_name, + metric_lookup_key=metric_lookup_key, + computed_metrics=computed_metrics, + aborted_metrics=aborted_metrics, + ) + raw_column_types: list[dict[str, Any]] = value + # If type is not found, don't add empty type field. This can happen if our db introspection fails. 
+ column_types_converted_to_str: list[dict[str, str]] = [] + for raw_column_type in raw_column_types: + if raw_column_type.get("type"): + column_types_converted_to_str.append( + { + "name": raw_column_type["name"], + "type": str(raw_column_type["type"]), + } + ) + else: + column_types_converted_to_str.append({"name": raw_column_type["name"]}) + + return TableMetric[List[str]]( + batch_id=batch_id, + metric_name=metric_name, + value=column_types_converted_to_str, + exception=exception, + ) diff --git a/great_expectations/experimental/metric_repository/metrics.py b/great_expectations/experimental/metric_repository/metrics.py index 8886b3dbc22b..fb0a3b5a838f 100644 --- a/great_expectations/experimental/metric_repository/metrics.py +++ b/great_expectations/experimental/metric_repository/metrics.py @@ -24,7 +24,14 @@ AbstractSetIntStr = AbstractSet[Union[int, str]] -class MetricTypes(str, enum.Enum): +class MetricTypesMeta(enum.EnumMeta): + """Metaclass definition for MetricTypes that allows for membership checking.""" + + def __contains__(cls, item): + return item in cls.__members__.values() + + +class MetricTypes(str, enum.Enum, metaclass=MetricTypesMeta): """Represents Metric types in OSS that are used for ColumnDescriptiveMetrics and MetricRepository. More Metric types will be added in the future. 
diff --git a/tests/experimental/metric_repository/test_metric_list_metric_retriever.py b/tests/experimental/metric_repository/test_metric_list_metric_retriever.py new file mode 100644 index 000000000000..a6c0c3919262 --- /dev/null +++ b/tests/experimental/metric_repository/test_metric_list_metric_retriever.py @@ -0,0 +1,791 @@ +from typing import Dict, List + +import pytest + +from great_expectations.data_context import CloudDataContext +from great_expectations.datasource.fluent import BatchRequest +from great_expectations.datasource.fluent.interfaces import Batch +from great_expectations.experimental.metric_repository.metric_list_metric_retriever import ( + MetricListMetricRetriever, +) +from great_expectations.experimental.metric_repository.metrics import ( + ColumnMetric, + MetricException, + MetricTypes, + TableMetric, +) +from great_expectations.rule_based_profiler.domain_builder import ColumnDomainBuilder +from great_expectations.validator.exception_info import ExceptionInfo +from great_expectations.validator.validator import Validator + +pytestmark = pytest.mark.unit + +from pytest_mock import MockerFixture + + +def test_get_metrics_table_metrics_only(mocker: MockerFixture): + mock_context = mocker.Mock(spec=CloudDataContext) + mock_validator = mocker.Mock(spec=Validator) + mock_context.get_validator.return_value = mock_validator + computed_metrics = { + ("table.row_count", (), ()): 2, + ("table.columns", (), ()): ["col1", "col2"], + ("table.column_types", (), "include_nested=True"): [ + {"name": "col1", "type": "float"}, + {"name": "col2", "type": "float"}, + ], + } + table_metrics_list = [ + MetricTypes.TABLE_ROW_COUNT, + MetricTypes.TABLE_COLUMNS, + MetricTypes.TABLE_COLUMN_TYPES, + ] + aborted_metrics: Dict[str, str] = {} + mock_validator.compute_metrics.return_value = ( + computed_metrics, + aborted_metrics, + ) + mock_batch = mocker.Mock(spec=Batch) + mock_batch.id = "batch_id" + mock_validator.active_batch = mock_batch + + metric_retriever = 
MetricListMetricRetriever(context=mock_context) + + mock_batch_request = mocker.Mock(spec=BatchRequest) + + metrics = metric_retriever.get_metrics( + batch_request=mock_batch_request, + metric_list=table_metrics_list, + ) + assert metrics == [ + TableMetric[int]( + batch_id="batch_id", + metric_name="table.row_count", + value=2, + exception=None, + ), + TableMetric[List[str]]( + batch_id="batch_id", + metric_name="table.columns", + value=["col1", "col2"], + exception=None, + ), + TableMetric[List[str]]( + batch_id="batch_id", + metric_name="table.column_types", + value=[ + {"name": "col1", "type": "float"}, + {"name": "col2", "type": "float"}, + ], + exception=None, + ), + ] + + +def test_get_metrics_full_list(mocker: MockerFixture): + mock_context = mocker.Mock(spec=CloudDataContext) + mock_validator = mocker.Mock(spec=Validator) + mock_context.get_validator.return_value = mock_validator + computed_metrics = { + ("table.row_count", (), ()): 2, + ("table.columns", (), ()): ["col1", "col2"], + ("table.column_types", (), "include_nested=True"): [ + {"name": "col1", "type": "float"}, + {"name": "col2", "type": "float"}, + ], + ("column.min", "column=col1", ()): 2.5, + ("column.min", "column=col2", ()): 2.7, + ("column.max", "column=col1", ()): 5.5, + ("column.max", "column=col2", ()): 5.7, + ("column.mean", "column=col1", ()): 2.5, + ("column.mean", "column=col2", ()): 2.7, + ("column.median", "column=col1", ()): 2.5, + ("column.median", "column=col2", ()): 2.7, + ("column_values.null.count", "column=col1", ()): 1, + ("column_values.null.count", "column=col2", ()): 1, + } + cdm_metrics_list = [ + MetricTypes.TABLE_ROW_COUNT, + MetricTypes.TABLE_COLUMNS, + MetricTypes.TABLE_COLUMN_TYPES, + MetricTypes.COLUMN_MIN, + MetricTypes.COLUMN_MAX, + MetricTypes.COLUMN_MEAN, + MetricTypes.COLUMN_MEDIAN, + MetricTypes.COLUMN_NULL_COUNT, + ] + aborted_metrics: Dict[str, str] = {} + mock_validator.compute_metrics.return_value = ( + computed_metrics, + aborted_metrics, + ) + 
mock_batch = mocker.Mock(spec=Batch) + mock_batch.id = "batch_id" + mock_validator.active_batch = mock_batch + + metric_retriever = MetricListMetricRetriever(context=mock_context) + + mock_batch_request = mocker.Mock(spec=BatchRequest) + + mocker.patch( + f"{MetricListMetricRetriever.__module__}.{MetricListMetricRetriever.__name__}._get_numeric_column_names", + return_value=["col1", "col2"], + ) + mocker.patch( + f"{MetricListMetricRetriever.__module__}.{MetricListMetricRetriever.__name__}._get_timestamp_column_names", + return_value=[], + ) + metrics = metric_retriever.get_metrics( + batch_request=mock_batch_request, + metric_list=cdm_metrics_list, + ) + + assert metrics == [ + TableMetric[int]( + batch_id="batch_id", metric_name="table.row_count", value=2, exception=None + ), + TableMetric[List[str]]( + batch_id="batch_id", + metric_name="table.columns", + value=["col1", "col2"], + exception=None, + ), + TableMetric[List[str]]( + batch_id="batch_id", + metric_name="table.column_types", + value=[ + {"name": "col1", "type": "float"}, + {"name": "col2", "type": "float"}, + ], + exception=None, + ), + ColumnMetric[float]( + batch_id="batch_id", + metric_name="column.max", + value=5.5, + exception=None, + column="col1", + ), + ColumnMetric[float]( + batch_id="batch_id", + metric_name="column.max", + value=5.7, + exception=None, + column="col2", + ), + ColumnMetric[float]( + batch_id="batch_id", + metric_name="column.mean", + value=2.5, + exception=None, + column="col1", + ), + ColumnMetric[float]( + batch_id="batch_id", + metric_name="column.mean", + value=2.7, + exception=None, + column="col2", + ), + ColumnMetric[float]( + batch_id="batch_id", + metric_name="column.median", + value=2.5, + exception=None, + column="col1", + ), + ColumnMetric[float]( + batch_id="batch_id", + metric_name="column.median", + value=2.7, + exception=None, + column="col2", + ), + ColumnMetric[float]( + batch_id="batch_id", + metric_name="column.min", + value=2.5, + exception=None, + 
column="col1", + ), + ColumnMetric[float]( + batch_id="batch_id", + metric_name="column.min", + value=2.7, + exception=None, + column="col2", + ), + ColumnMetric[int]( + batch_id="batch_id", + metric_name="column_values.null.count", + value=1, + exception=None, + column="col1", + ), + ColumnMetric[int]( + batch_id="batch_id", + metric_name="column_values.null.count", + value=1, + exception=None, + column="col2", + ), + ] + + +def test_get_metrics_metrics_missing(mocker: MockerFixture): + """This test is meant to simulate metrics missing from the computed metrics.""" + mock_context = mocker.Mock(spec=CloudDataContext) + mock_validator = mocker.Mock(spec=Validator) + mock_context.get_validator.return_value = mock_validator + mock_computed_metrics = { + # ("table.row_count", (), ()): 2, # Missing table.row_count metric + ("table.columns", (), ()): ["col1", "col2"], + ("table.column_types", (), "include_nested=True"): [ + {"name": "col1", "type": "float"}, + {"name": "col2", "type": "float"}, + ], + # ("column.min", "column=col1", ()): 2.5, # Missing column.min metric for col1 + ("column.min", "column=col2", ()): 2.7, + } + + cdm_metrics_list: List[MetricTypes] = [ + MetricTypes.TABLE_ROW_COUNT, + MetricTypes.TABLE_COLUMNS, + MetricTypes.TABLE_COLUMN_TYPES, + MetricTypes.COLUMN_MIN, + ] + mock_aborted_metrics = {} + mock_validator.compute_metrics.return_value = ( + mock_computed_metrics, + mock_aborted_metrics, + ) + mock_batch = mocker.Mock(spec=Batch) + mock_batch.id = "batch_id" + mock_validator.active_batch = mock_batch + + metric_retriever = MetricListMetricRetriever(context=mock_context) + + mock_batch_request = mocker.Mock(spec=BatchRequest) + + mocker.patch( + f"{MetricListMetricRetriever.__module__}.{MetricListMetricRetriever.__name__}._get_numeric_column_names", + return_value=["col1", "col2"], + ) + mocker.patch( + f"{MetricListMetricRetriever.__module__}.{MetricListMetricRetriever.__name__}._get_timestamp_column_names", + return_value=[], + ) + metrics = 
metric_retriever.get_metrics( + batch_request=mock_batch_request, metric_list=cdm_metrics_list + ) + assert metrics == [ + TableMetric[int]( + batch_id="batch_id", + metric_name="table.row_count", + value=None, + exception=MetricException( + type="Not found", + message="Metric was not successfully computed but exception was not found.", + ), + ), + TableMetric[List[str]]( + batch_id="batch_id", + metric_name="table.columns", + value=["col1", "col2"], + exception=None, + ), + TableMetric[List[str]]( + batch_id="batch_id", + metric_name="table.column_types", + value=[ + {"name": "col1", "type": "float"}, + {"name": "col2", "type": "float"}, + ], + exception=None, + ), + ColumnMetric[float]( + batch_id="batch_id", + metric_name="column.min", + value=None, + exception=MetricException( + type="Not found", + message="Metric was not successfully computed but exception was not found.", + ), + column="col1", + ), + ColumnMetric[float]( + batch_id="batch_id", + metric_name="column.min", + value=2.7, + exception=None, + column="col2", + ), + ] + + +def test_get_metrics_with_exception(mocker: MockerFixture): + """This test is meant to simulate failed metrics in the computed metrics.""" + mock_context = mocker.Mock(spec=CloudDataContext) + mock_validator = mocker.Mock(spec=Validator) + mock_context.get_validator.return_value = mock_validator + + exception_info = ExceptionInfo( + exception_traceback="test exception traceback", + exception_message="test exception message", + raised_exception=True, + ) + aborted_metrics = { + ("table.row_count", (), ()): { + "metric_configuration": {}, # Leaving out for brevity + "num_failures": 3, + "exception_info": exception_info, + }, + ("column.min", "column=col1", ()): { + "metric_configuration": {}, # Leaving out for brevity + "num_failures": 3, + "exception_info": exception_info, + }, + } + computed_metrics = { + # ("table.row_count", (), ()): 2, # Error in table.row_count metric + ("table.columns", (), ()): ["col1", "col2"], + 
("table.column_types", (), "include_nested=True"): [
+            {"name": "col1", "type": "float"},
+            {"name": "col2", "type": "float"},
+        ],
+    }
+    mock_validator.compute_metrics.return_value = (
+        computed_metrics,
+        aborted_metrics,
+    )
+    mock_batch = mocker.Mock(spec=Batch)
+    mock_batch.id = "batch_id"
+    mock_validator.active_batch = mock_batch
+
+    cdm_metrics_list: List[MetricTypes] = [
+        MetricTypes.TABLE_ROW_COUNT,
+        MetricTypes.TABLE_COLUMNS,
+        MetricTypes.TABLE_COLUMN_TYPES,
+    ]
+
+    metric_retriever = MetricListMetricRetriever(context=mock_context)
+
+    mock_batch_request = mocker.Mock(spec=BatchRequest)
+
+    metrics = metric_retriever.get_metrics(
+        batch_request=mock_batch_request, metric_list=cdm_metrics_list
+    )
+
+    assert metrics == [
+        TableMetric[int](
+            batch_id="batch_id",
+            metric_name="table.row_count",
+            value=None,
+            exception=MetricException(type="Unknown", message="test exception message"),
+        ),
+        TableMetric[List[str]](
+            batch_id="batch_id",
+            metric_name="table.columns",
+            value=["col1", "col2"],
+            exception=None,
+        ),
+        TableMetric[List[str]](
+            batch_id="batch_id",
+            metric_name="table.column_types",
+            value=[
+                {"name": "col1", "type": "float"},
+                {"name": "col2", "type": "float"},
+            ],
+            exception=None,
+        ),
+    ]
+
+
+def test_get_metrics_with_column_type_missing(mocker: MockerFixture):
+    """Simulate failed metrics plus a column with no type in table.column_types."""
+    mock_context = mocker.Mock(spec=CloudDataContext)
+    mock_validator = mocker.Mock(spec=Validator)
+    mock_context.get_validator.return_value = mock_validator
+
+    exception_info = ExceptionInfo(
+        exception_traceback="test exception traceback",
+        exception_message="test exception message",
+        raised_exception=True,
+    )
+
+    aborted_metrics = {
+        ("table.row_count", (), ()): {
+            "metric_configuration": {},  # Leaving out for brevity
+            "num_failures": 3,
+            "exception_info": exception_info,
+        },
+        ("column.min", "column=col1", ()): {
+            "metric_configuration": {},  # Leaving out for brevity
+            "num_failures": 3,
+            "exception_info": exception_info,
+        },
+    }
+
+    computed_metrics = {
+        # ("table.row_count", (), ()): 2,  # Error in table.row_count metric
+        ("table.columns", (), ()): ["col1", "col2"],
+        ("table.column_types", (), "include_nested=True"): [
+            {"name": "col1", "type": "float"},
+            {
+                "name": "col2",
+            },  # Missing type for col2
+        ],
+        # ("column.min", "column=col1", ()): 2.5,  # Error in column.min metric for col1
+        ("column.min", "column=col2", ()): 2.7,
+    }
+    mock_validator.compute_metrics.return_value = (
+        computed_metrics,
+        aborted_metrics,
+    )
+    mock_batch = mocker.Mock(spec=Batch)
+    mock_batch.id = "batch_id"
+    mock_validator.active_batch = mock_batch
+
+    cdm_metrics_list: List[MetricTypes] = [
+        MetricTypes.TABLE_ROW_COUNT,
+        MetricTypes.TABLE_COLUMNS,
+        MetricTypes.TABLE_COLUMN_TYPES,
+        MetricTypes.COLUMN_MIN,
+    ]
+
+    metric_retriever = MetricListMetricRetriever(context=mock_context)
+
+    mock_batch_request = mocker.Mock(spec=BatchRequest)
+
+    mocker.patch(
+        f"{MetricListMetricRetriever.__module__}.{MetricListMetricRetriever.__name__}._get_numeric_column_names",
+        return_value=["col1", "col2"],
+    )
+    mocker.patch(
+        f"{MetricListMetricRetriever.__module__}.{MetricListMetricRetriever.__name__}._get_timestamp_column_names",
+        return_value=[],
+    )
+    metrics = metric_retriever.get_metrics(
+        batch_request=mock_batch_request, metric_list=cdm_metrics_list
+    )
+    assert metrics == [
+        TableMetric[int](
+            batch_id="batch_id",
+            metric_name="table.row_count",
+            value=None,
+            exception=MetricException(type="Unknown", message="test exception message"),
+        ),
+        TableMetric[List[str]](
+            batch_id="batch_id",
+            metric_name="table.columns",
+            value=["col1", "col2"],
+            exception=None,
+        ),
+        TableMetric[List[str]](
+            batch_id="batch_id",
+            metric_name="table.column_types",
+            value=[
+                {"name": "col1", "type": "float"},
+                {
+                    "name": "col2",
+                },  # Note: No type for col2
+            ],
+            exception=None,
+        ),
+        ColumnMetric[float](
+            batch_id="batch_id",
+            metric_name="column.min",
+            column="col1",
+            value=None,
+            exception=MetricException(type="Unknown", message="test exception message"),
+        ),
+        ColumnMetric[float](
+            batch_id="batch_id",
+            metric_name="column.min",
+            column="col2",
+            value=2.7,
+            exception=None,
+        ),
+    ]
+
+
+def test_get_metrics_with_timestamp_columns(mocker: MockerFixture):
+    mock_context = mocker.Mock(spec=CloudDataContext)
+    mock_validator = mocker.Mock(spec=Validator)
+    mock_context.get_validator.return_value = mock_validator
+    computed_metrics = {
+        ("table.row_count", (), ()): 2,
+        ("table.columns", (), ()): ["timestamp_col"],
+        ("table.column_types", (), "include_nested=True"): [
+            {"name": "timestamp_col", "type": "TIMESTAMP_NTZ"},
+        ],
+        ("column.min", "column=timestamp_col", ()): "2023-01-01T00:00:00",
+        ("column.max", "column=timestamp_col", ()): "2023-12-31T00:00:00",
+        ("column_values.null.count", "column=timestamp_col", ()): 1,
+    }
+    cdm_metrics_list: List[MetricTypes] = [
+        MetricTypes.TABLE_ROW_COUNT,
+        MetricTypes.TABLE_COLUMNS,
+        MetricTypes.TABLE_COLUMN_TYPES,
+        MetricTypes.COLUMN_MIN,
+        MetricTypes.COLUMN_MAX,
+        MetricTypes.COLUMN_NULL_COUNT,
+    ]
+    aborted_metrics = {}
+    mock_validator.compute_metrics.return_value = (
+        computed_metrics,
+        aborted_metrics,
+    )
+    mock_batch = mocker.Mock(spec=Batch)
+    mock_batch.id = "batch_id"
+    mock_validator.active_batch = mock_batch
+
+    metric_retriever = MetricListMetricRetriever(context=mock_context)
+
+    mock_batch_request = mocker.Mock(spec=BatchRequest)
+
+    mocker.patch(
+        f"{MetricListMetricRetriever.__module__}.{MetricListMetricRetriever.__name__}._get_numeric_column_names",
+        return_value=[],
+    )
+    mocker.patch(
+        f"{MetricListMetricRetriever.__module__}.{MetricListMetricRetriever.__name__}._get_timestamp_column_names",
+        return_value=["timestamp_col"],
+    )
+    metrics = metric_retriever.get_metrics(
+        batch_request=mock_batch_request, metric_list=cdm_metrics_list
+    )
+
+    assert metrics == [
+        TableMetric[int](
+            batch_id="batch_id",
+            metric_name="table.row_count",
+            value=2,
+            exception=None,
+        ),
+        TableMetric[List[str]](
+            batch_id="batch_id",
+            metric_name="table.columns",
+            value=["timestamp_col"],
+            exception=None,
+        ),
+        TableMetric[List[str]](
+            batch_id="batch_id",
+            metric_name="table.column_types",
+            value=[{"name": "timestamp_col", "type": "TIMESTAMP_NTZ"}],
+            exception=None,
+        ),
+        ColumnMetric[str](
+            batch_id="batch_id",
+            metric_name="column.max",
+            value="2023-12-31T00:00:00",
+            exception=None,
+            column="timestamp_col",
+        ),
+        ColumnMetric[str](
+            batch_id="batch_id",
+            metric_name="column.min",
+            value="2023-01-01T00:00:00",
+            exception=None,
+            column="timestamp_col",
+        ),
+        ColumnMetric[int](
+            batch_id="batch_id",
+            metric_name="column_values.null.count",
+            value=1,
+            exception=None,
+            column="timestamp_col",
+        ),
+    ]
+
+
+def test_get_metrics_only_gets_a_validator_once(mocker: MockerFixture):
+    mock_context = mocker.Mock(spec=CloudDataContext)
+    mock_validator = mocker.Mock(spec=Validator)
+    mock_context.get_validator.return_value = mock_validator
+
+    aborted_metrics = {}
+
+    computed_metrics = {
+        ("table.row_count", (), ()): 2,
+        ("table.columns", (), ()): ["col1", "col2"],
+        ("table.column_types", (), "include_nested=True"): [
+            {"name": "col1", "type": "float"},
+            {"name": "col2", "type": "float"},
+        ],
+    }
+    cdm_metrics_list: List[MetricTypes] = [
+        MetricTypes.TABLE_ROW_COUNT,
+        MetricTypes.TABLE_COLUMNS,
+        MetricTypes.TABLE_COLUMN_TYPES,
+    ]
+    mock_validator.compute_metrics.return_value = (
+        computed_metrics,
+        aborted_metrics,
+    )
+    mock_batch = mocker.Mock(spec=Batch)
+    mock_batch.id = "batch_id"
+    mock_validator.active_batch = mock_batch
+
+    metric_retriever = MetricListMetricRetriever(context=mock_context)
+
+    mock_batch_request = mocker.Mock(spec=BatchRequest)
+
+    mocker.patch(
+        f"{ColumnDomainBuilder.__module__}.{ColumnDomainBuilder.__name__}.get_effective_column_names",
+        return_value=["col1", "col2"],
+    )
+    metric_retriever.get_metrics(
+        batch_request=mock_batch_request, metric_list=cdm_metrics_list
+    )
+
+    mock_context.get_validator.assert_called_once_with(batch_request=mock_batch_request)
+
+
+def test_get_metrics_with_no_metrics(mocker: MockerFixture):
+    mock_context = mocker.Mock(spec=CloudDataContext)
+    mock_validator = mocker.Mock(spec=Validator)
+    mock_context.get_validator.return_value = mock_validator
+    computed_metrics = {}
+    cdm_metrics_list: List[MetricTypes] = []
+    aborted_metrics = {}
+    mock_validator.compute_metrics.return_value = (
+        computed_metrics,
+        aborted_metrics,
+    )
+    mock_batch = mocker.Mock(spec=Batch)
+    mock_batch.id = "batch_id"
+    mock_validator.active_batch = mock_batch
+
+    metric_retriever = MetricListMetricRetriever(context=mock_context)
+
+    mock_batch_request = mocker.Mock(spec=BatchRequest)
+
+    with pytest.raises(ValueError):
+        metric_retriever.get_metrics(
+            batch_request=mock_batch_request, metric_list=cdm_metrics_list
+        )
+
+
+def test_valid_metric_types_true(mocker: MockerFixture):
+    mock_context = mocker.Mock(spec=CloudDataContext)
+    metric_retriever = MetricListMetricRetriever(context=mock_context)
+
+    valid_metric_types = [
+        MetricTypes.TABLE_ROW_COUNT,
+        MetricTypes.TABLE_COLUMNS,
+        MetricTypes.TABLE_COLUMN_TYPES,
+        MetricTypes.COLUMN_MIN,
+        MetricTypes.COLUMN_MAX,
+        MetricTypes.COLUMN_MEAN,
+        MetricTypes.COLUMN_MEDIAN,
+        MetricTypes.COLUMN_NULL_COUNT,
+    ]
+    assert metric_retriever._check_valid_metric_types(valid_metric_types) is True
+
+
+def test_valid_metric_types_false(mocker: MockerFixture):
+    mock_context = mocker.Mock(spec=CloudDataContext)
+    metric_retriever = MetricListMetricRetriever(context=mock_context)
+
+    invalid_metric_type = ["I_am_invalid"]
+    assert metric_retriever._check_valid_metric_types(invalid_metric_type) is False
+
+
+def test_column_metrics_in_metrics_list_only_table_metrics(mocker: MockerFixture):
+    mock_context = mocker.Mock(spec=CloudDataContext)
+    metric_retriever = MetricListMetricRetriever(context=mock_context)
+    table_metrics_only = [
+        MetricTypes.TABLE_ROW_COUNT,
+        MetricTypes.TABLE_COLUMNS,
+        MetricTypes.TABLE_COLUMN_TYPES,
+    ]
+    assert metric_retriever._column_metrics_in_metric_list(table_metrics_only) is False
+
+
+def test_column_metrics_in_metrics_list_with_column_metrics(mocker: MockerFixture):
+    mock_context = mocker.Mock(spec=CloudDataContext)
+    metric_retriever = MetricListMetricRetriever(context=mock_context)
+    metrics_list_with_column_metrics = [
+        MetricTypes.TABLE_ROW_COUNT,
+        MetricTypes.TABLE_COLUMNS,
+        MetricTypes.TABLE_COLUMN_TYPES,
+        MetricTypes.COLUMN_MIN,
+    ]
+    assert (
+        metric_retriever._column_metrics_in_metric_list(
+            metrics_list_with_column_metrics
+        )
+        is True
+    )
+
+
+def test_get_table_column_types(mocker: MockerFixture):
+    mock_context = mocker.Mock(spec=CloudDataContext)
+    mock_validator = mocker.Mock(spec=Validator)
+    mock_context.get_validator.return_value = mock_validator
+    mock_batch_request = mocker.Mock(spec=BatchRequest)
+    column_types = [
+        {"name": "col1", "type": "float"},
+        {"name": "col2", "type": "float"},
+    ]
+    computed_metrics = {("table.column_types", (), "include_nested=True"): column_types}
+    mock_validator.compute_metrics.return_value = (computed_metrics, {})  # none aborted
+    mock_batch = mocker.Mock(spec=Batch)
+    mock_batch.id = "batch_id"
+    mock_validator.active_batch = mock_batch
+
+    metric_retriever = MetricListMetricRetriever(context=mock_context)
+    ret = metric_retriever._get_table_column_types(mock_batch_request)
+    assert ret == TableMetric[List[str]](
+        batch_id="batch_id",
+        metric_name="table.column_types",
+        value=column_types,
+        exception=None,
+    )
+
+
+def test_get_table_columns(mocker: MockerFixture):
+    mock_context = mocker.Mock(spec=CloudDataContext)
+    mock_validator = mocker.Mock(spec=Validator)
+    mock_context.get_validator.return_value = mock_validator
+    mock_batch_request = mocker.Mock(spec=BatchRequest)
+    computed_metrics = {
+        ("table.columns", (), ()): ["col1", "col2"],
+    }
+    aborted_metrics = {}
+    mock_validator.compute_metrics.return_value = (computed_metrics, aborted_metrics)
+    mock_batch = mocker.Mock(spec=Batch)
+    mock_batch.id = "batch_id"
+    mock_validator.active_batch = mock_batch
+
+    metric_retriever = MetricListMetricRetriever(context=mock_context)
+    ret = metric_retriever._get_table_columns(mock_batch_request)
+    assert ret == TableMetric[List[str]](
+        batch_id="batch_id",
+        metric_name="table.columns",
+        value=["col1", "col2"],
+        exception=None,
+    )
+
+
+def test_get_table_row_count(mocker: MockerFixture):
+    mock_context = mocker.Mock(spec=CloudDataContext)
+    mock_validator = mocker.Mock(spec=Validator)
+    mock_context.get_validator.return_value = mock_validator
+    mock_batch_request = mocker.Mock(spec=BatchRequest)
+    computed_metrics = {("table.row_count", (), ()): 2}
+    aborted_metrics = {}
+    mock_validator.compute_metrics.return_value = (computed_metrics, aborted_metrics)
+    mock_batch = mocker.Mock(spec=Batch)
+    mock_batch.id = "batch_id"
+    mock_validator.active_batch = mock_batch
+
+    metric_retriever = MetricListMetricRetriever(context=mock_context)
+    ret = metric_retriever._get_table_row_count(mock_batch_request)
+    assert ret == TableMetric[int](
+        batch_id="batch_id",
+        metric_name="table.row_count",
+        value=2,
+        exception=None,
+    )
diff --git a/tests/experimental/metric_repository/test_metric_list_metric_retriever_integration.py b/tests/experimental/metric_repository/test_metric_list_metric_retriever_integration.py
new file mode 100644
index 000000000000..7e0aea8c65a5
--- /dev/null
+++ b/tests/experimental/metric_repository/test_metric_list_metric_retriever_integration.py
@@ -0,0 +1,287 @@
+"""Test using actual sample data."""
+
+from __future__ import annotations
+
+from typing import List
+
+import pandas as pd
+import pytest
+from pandas import Timestamp
+
+from great_expectations.data_context import CloudDataContext
+from great_expectations.datasource.fluent.batch_request import BatchRequest
+from great_expectations.experimental.metric_repository.metric_list_metric_retriever import (
+    MetricListMetricRetriever,
+)
+from great_expectations.experimental.metric_repository.metrics import (
+    ColumnMetric,
+    MetricTypes,
+    TableMetric,
+)
+
+
+@pytest.fixture
+def cloud_context_and_batch_request_with_simple_dataframe(
+    empty_cloud_context_fluent: CloudDataContext,  # used as a fixture
+):
+    context = empty_cloud_context_fluent
+    datasource = context.sources.add_pandas(name="my_pandas_datasource")
+
+    d = {
+        "numeric_with_nulls_1": [1, 2, None],
+        "numeric_with_nulls_2": [3, 4, None],
+        "string": ["a", "b", "c"],
+        "string_with_nulls": ["a", "b", None],
+        "boolean": [True, False, True],
+        "datetime": [
+            pd.to_datetime("2020-01-01"),
+            pd.to_datetime("2020-01-02"),
+            pd.to_datetime("2020-01-03"),
+        ],
+    }
+    df = pd.DataFrame(data=d)
+
+    name = "dataframe"
+    data_asset = datasource.add_dataframe_asset(name=name)
+    batch_request = data_asset.build_batch_request(dataframe=df)
+    return context, batch_request
+
+
+@pytest.mark.cloud
+def test_get_metrics_table_metrics_only(
+    cloud_context_and_batch_request_with_simple_dataframe: tuple[
+        CloudDataContext, BatchRequest
+    ],
+):
+    context, batch_request = cloud_context_and_batch_request_with_simple_dataframe
+    table_metrics_list: List[MetricTypes] = [
+        MetricTypes.TABLE_ROW_COUNT,
+        MetricTypes.TABLE_COLUMNS,
+        MetricTypes.TABLE_COLUMN_TYPES,
+    ]
+    metric_retriever = MetricListMetricRetriever(context)
+    metrics = metric_retriever.get_metrics(
+        batch_request=batch_request, metric_list=table_metrics_list
+    )
+    validator = context.get_validator(batch_request=batch_request)
+    batch_id = validator.active_batch.id
+
+    expected_metrics = [
+        TableMetric[int](
+            batch_id=batch_id,
+            metric_name="table.row_count",
+            value=3,
+            exception=None,
+        ),
+        TableMetric[List[str]](
+            batch_id=batch_id,
+            metric_name="table.columns",
+            value=[
+                "numeric_with_nulls_1",
+                "numeric_with_nulls_2",
+                "string",
+                "string_with_nulls",
+                "boolean",
+                "datetime",
+            ],
+            exception=None,
+        ),
+        TableMetric[List[str]](
+            batch_id=batch_id,
+            metric_name="table.column_types",
+            value=[
+                {"name": "numeric_with_nulls_1", "type": "float64"},
+                {"name": "numeric_with_nulls_2", "type": "float64"},
+                {"name": "string", "type": "object"},
+                {"name": "string_with_nulls", "type": "object"},
+                {"name": "boolean", "type": "bool"},
+                {"name": "datetime", "type": "datetime64[ns]"},
+            ],
+            exception=None,
+        ),
+    ]
+
+    # Assert each metric so it is easier to see which one fails (instead of assert metrics == expected_metrics):
+    assert len(metrics) == len(expected_metrics)
+    for metric in metrics:
+        assert metric.dict() in [
+            expected_metric.dict() for expected_metric in expected_metrics
+        ]
+
+
+@pytest.mark.cloud
+def test_get_metrics_full_cdm(
+    cloud_context_and_batch_request_with_simple_dataframe: tuple[
+        CloudDataContext, BatchRequest
+    ],
+):
+    context, batch_request = cloud_context_and_batch_request_with_simple_dataframe
+    cdm_metrics_list: List[MetricTypes] = [
+        MetricTypes.TABLE_ROW_COUNT,
+        MetricTypes.TABLE_COLUMNS,
+        MetricTypes.TABLE_COLUMN_TYPES,
+        MetricTypes.COLUMN_MIN,
+        MetricTypes.COLUMN_MAX,
+        MetricTypes.COLUMN_MEAN,
+        MetricTypes.COLUMN_MEDIAN,
+        MetricTypes.COLUMN_NULL_COUNT,
+    ]
+    metric_retriever = MetricListMetricRetriever(context)
+    metrics = metric_retriever.get_metrics(
+        batch_request=batch_request, metric_list=cdm_metrics_list
+    )
+    validator = context.get_validator(batch_request=batch_request)
+    batch_id = validator.active_batch.id
+
+    expected_metrics = [
+        TableMetric[int](
+            batch_id=batch_id,
+            metric_name="table.row_count",
+            value=3,
+            exception=None,
+        ),
+        TableMetric[List[str]](
+            batch_id=batch_id,
+            metric_name="table.columns",
+            value=[
+                "numeric_with_nulls_1",
+                "numeric_with_nulls_2",
+                "string",
+                "string_with_nulls",
+                "boolean",
+                "datetime",
+            ],
+            exception=None,
+        ),
+        ColumnMetric[float](
+            batch_id=batch_id,
+            metric_name="column.min",
+            column="numeric_with_nulls_1",
+            value=1,
+            exception=None,
+        ),
+        ColumnMetric[float](
+            batch_id=batch_id,
+            metric_name="column.min",
+            column="numeric_with_nulls_2",
+            value=3,
+            exception=None,
+        ),
+        ColumnMetric[float](
+            batch_id=batch_id,
+            metric_name="column.max",
+            column="numeric_with_nulls_1",
+            value=2,
+            exception=None,
+        ),
+        ColumnMetric[float](
+            batch_id=batch_id,
+            metric_name="column.max",
+            column="numeric_with_nulls_2",
+            value=4,
+            exception=None,
+        ),
+        ColumnMetric[float](
+            batch_id=batch_id,
+            metric_name="column.mean",
+            column="numeric_with_nulls_1",
+            value=1.5,
+            exception=None,
+        ),
+        ColumnMetric[float](
+            batch_id=batch_id,
+            metric_name="column.mean",
+            column="numeric_with_nulls_2",
+            value=3.5,
+            exception=None,
+        ),
+        ColumnMetric[float](
+            batch_id=batch_id,
+            metric_name="column.median",
+            column="numeric_with_nulls_1",
+            value=1.5,
+            exception=None,
+        ),
+        ColumnMetric[float](
+            batch_id=batch_id,
+            metric_name="column.median",
+            column="numeric_with_nulls_2",
+            value=3.5,
+            exception=None,
+        ),
+        TableMetric[List[str]](
+            batch_id=batch_id,
+            metric_name="table.column_types",
+            value=[
+                {"name": "numeric_with_nulls_1", "type": "float64"},
+                {"name": "numeric_with_nulls_2", "type": "float64"},
+                {"name": "string", "type": "object"},
+                {"name": "string_with_nulls", "type": "object"},
+                {"name": "boolean", "type": "bool"},
+                {"name": "datetime", "type": "datetime64[ns]"},
+            ],
+            exception=None,
+        ),
+        ColumnMetric[int](
+            batch_id=batch_id,
+            metric_name="column_values.null.count",
+            column="numeric_with_nulls_1",
+            value=1,
+            exception=None,
+        ),
+        ColumnMetric[int](
+            batch_id=batch_id,
+            metric_name="column_values.null.count",
+            column="numeric_with_nulls_2",
+            value=1,
+            exception=None,
+        ),
+        ColumnMetric[int](
+            batch_id=batch_id,
+            metric_name="column_values.null.count",
+            column="string",
+            value=0,
+            exception=None,
+        ),
+        ColumnMetric[int](
+            batch_id=batch_id,
+            metric_name="column_values.null.count",
+            column="string_with_nulls",
+            value=1,
+            exception=None,
+        ),
+        ColumnMetric[int](
+            batch_id=batch_id,
+            metric_name="column_values.null.count",
+            column="boolean",
+            value=0,
+            exception=None,
+        ),
+        ColumnMetric[int](
+            batch_id=batch_id,
+            metric_name="column_values.null.count",
+            column="datetime",
+            value=0,
+            exception=None,
+        ),
+        ColumnMetric[str](
+            batch_id=batch_id,
+            metric_name="column.min",
+            value=Timestamp("2020-01-01 00:00:00"),
+            exception=None,
+            column="datetime",
+        ),
+        ColumnMetric[str](
+            batch_id=batch_id,
+            metric_name="column.max",
+            value=Timestamp("2020-01-03 00:00:00"),
+            exception=None,
+            column="datetime",
+        ),
+    ]
+
+    assert len(metrics) == len(expected_metrics)
+    for metric in metrics:
+        assert metric.dict() in [
+            expected_metric.dict() for expected_metric in expected_metrics
+        ]