[MAINTENANCE] Add tests around expectations (#10688)
tyler-hoffman authored Dec 1, 2024
1 parent 50ea23a commit 44382ef
Showing 72 changed files with 5,195 additions and 5,287 deletions.
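Most of the non-test changes in this commit follow one mechanical pattern: defaults passed positionally to pydantic.Field are rewritten with the explicit default= keyword, and the longer calls are wrapped across lines. A minimal sketch of that pattern, assuming a plain pydantic model; the class and description constant below are illustrative stand-ins, not taken from any changed file:

import pydantic

STRICT_MIN_DESCRIPTION = "If True, the lower bound is exclusive."  # illustrative text


class ExampleModel(pydantic.BaseModel):  # illustrative stand-in for an Expectation class
    # Before: default passed positionally
    #   strict_min: bool = pydantic.Field(False, description=STRICT_MIN_DESCRIPTION)
    # After: default passed explicitly by keyword
    strict_min: bool = pydantic.Field(default=False, description=STRICT_MIN_DESCRIPTION)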
@@ -327,9 +327,13 @@ class ExpectColumnKLDivergenceToBeLessThan(ColumnAggregateExpectation):
tail_weight_holdout: Union[float, None] = pydantic.Field(
default=0, ge=0, le=1, description=TAIL_WEIGHT_HOLDOUT_DESCRIPTION
)
- bucketize_data: bool = pydantic.Field(True, description=BUCKETIZE_DATA_DESCRIPTION)
- min_value: Optional[Comparable] = pydantic.Field(None, description=MIN_VALUE_DESCRIPTION)
- max_value: Optional[Comparable] = pydantic.Field(None, description=MAX_VALUE_DESCRIPTION)
+ bucketize_data: bool = pydantic.Field(default=True, description=BUCKETIZE_DATA_DESCRIPTION)
+ min_value: Optional[Comparable] = pydantic.Field(
+ default=None, description=MIN_VALUE_DESCRIPTION
+ )
+ max_value: Optional[Comparable] = pydantic.Field(
+ default=None, description=MAX_VALUE_DESCRIPTION
+ )

# This dictionary contains metadata for display in the public gallery
library_metadata: ClassVar[Dict[str, Union[str, list, bool]]] = {
@@ -177,7 +177,7 @@ class ExpectColumnMostCommonValueToBeInSet(ColumnAggregateExpectation):

value_set: ValueSetField
ties_okay: Union[bool, None] = pydantic.Field(
- None,
+ default=None,
description=TIES_OKAY_DESCRIPTION,
)

@@ -185,10 +185,14 @@ class ExpectColumnProportionOfUniqueValuesToBeBetween(ColumnAggregateExpectation
}}
""" # noqa: E501

- min_value: Optional[Comparable] = pydantic.Field(None, description=MIN_VALUE_DESCRIPTION)
- max_value: Optional[Comparable] = pydantic.Field(None, description=MAX_VALUE_DESCRIPTION)
- strict_min: bool = pydantic.Field(False, description=STRICT_MIN_DESCRIPTION)
- strict_max: bool = pydantic.Field(False, description=STRICT_MAX_DESCRIPTION)
+ min_value: Optional[Comparable] = pydantic.Field(
+ default=None, description=MIN_VALUE_DESCRIPTION
+ )
+ max_value: Optional[Comparable] = pydantic.Field(
+ default=None, description=MAX_VALUE_DESCRIPTION
+ )
+ strict_min: bool = pydantic.Field(default=False, description=STRICT_MIN_DESCRIPTION)
+ strict_max: bool = pydantic.Field(default=False, description=STRICT_MAX_DESCRIPTION)

# This dictionary contains metadata for display in the public gallery
library_metadata = {
@@ -236,7 +236,7 @@ class ExpectColumnQuantileValuesToBeBetween(ColumnAggregateExpectation):

quantile_ranges: QuantileRange = pydantic.Field(description=QUANTILE_RANGES_DESCRIPTION)
allow_relative_error: Union[bool, str] = pydantic.Field(
- False,
+ default=False,
description=ALLOW_RELATIVE_ERROR_DESCRIPTION,
)

@@ -176,10 +176,14 @@ class ExpectColumnStdevToBeBetween(ColumnAggregateExpectation):
}}
""" # noqa: E501

- min_value: Optional[Comparable] = pydantic.Field(None, description=MIN_VALUE_DESCRIPTION)
- max_value: Optional[Comparable] = pydantic.Field(None, description=MAX_VALUE_DESCRIPTION)
- strict_min: bool = pydantic.Field(False, description=STRICT_MIN_DESCRIPTION)
- strict_max: bool = pydantic.Field(False, description=STRICT_MAX_DESCRIPTION)
+ min_value: Optional[Comparable] = pydantic.Field(
+ default=None, description=MIN_VALUE_DESCRIPTION
+ )
+ max_value: Optional[Comparable] = pydantic.Field(
+ default=None, description=MAX_VALUE_DESCRIPTION
+ )
+ strict_min: bool = pydantic.Field(default=False, description=STRICT_MIN_DESCRIPTION)
+ strict_max: bool = pydantic.Field(default=False, description=STRICT_MAX_DESCRIPTION)

# This dictionary contains metadata for display in the public gallery
library_metadata: ClassVar[Dict[str, Union[str, list, bool]]] = {
@@ -164,10 +164,14 @@ class ExpectColumnSumToBeBetween(ColumnAggregateExpectation):
}}
""" # noqa: E501

- min_value: Optional[Comparable] = pydantic.Field(None, description=MIN_VALUE_DESCRIPTION)
- max_value: Optional[Comparable] = pydantic.Field(None, description=MAX_VALUE_DESCRIPTION)
- strict_min: bool = pydantic.Field(False, description=STRICT_MIN_DESCRIPTION)
- strict_max: bool = pydantic.Field(False, description=STRICT_MAX_DESCRIPTION)
+ min_value: Optional[Comparable] = pydantic.Field(
+ default=None, description=MIN_VALUE_DESCRIPTION
+ )
+ max_value: Optional[Comparable] = pydantic.Field(
+ default=None, description=MAX_VALUE_DESCRIPTION
+ )
+ strict_min: bool = pydantic.Field(default=False, description=STRICT_MIN_DESCRIPTION)
+ strict_max: bool = pydantic.Field(default=False, description=STRICT_MAX_DESCRIPTION)

# This dictionary contains metadata for display in the public gallery
library_metadata: ClassVar[Dict[str, Union[str, list, bool]]] = {
@@ -176,14 +176,18 @@ class ExpectColumnUniqueValueCountToBeBetween(ColumnAggregateExpectation):
}}
""" # noqa: E501

- min_value: Optional[Comparable] = pydantic.Field(None, description=MIN_VALUE_DESCRIPTION)
- max_value: Optional[Comparable] = pydantic.Field(None, description=MAX_VALUE_DESCRIPTION)
+ min_value: Optional[Comparable] = pydantic.Field(
+ default=None, description=MIN_VALUE_DESCRIPTION
+ )
+ max_value: Optional[Comparable] = pydantic.Field(
+ default=None, description=MAX_VALUE_DESCRIPTION
+ )
strict_min: bool = pydantic.Field(
- False,
+ default=False,
description=STRICT_MIN_DESCRIPTION,
)
strict_max: bool = pydantic.Field(
- False,
+ default=False,
description=STRICT_MAX_DESCRIPTION,
)

@@ -0,0 +1,4 @@
"""Tests around individual expectations.
Files here should have a 1:1 relationship with the expectations they test.
"""
@@ -0,0 +1,107 @@
from datetime import datetime

import pandas as pd
import pytest

import great_expectations.expectations as gxe
from great_expectations.core.result_format import ResultFormat
from great_expectations.datasource.fluent.interfaces import Batch
from tests.integration.conftest import parameterize_batch_for_data_sources
from tests.integration.data_sources_and_expectations.test_canonical_expectations import (
ALL_DATA_SOURCES,
DATA_SOURCES_THAT_SUPPORT_DATE_COMPARISONS,
JUST_PANDAS_DATA_SOURCES,
)

COL_NAME = "my_col"

ONES_AND_TWOS = pd.DataFrame({COL_NAME: [1, 2, 2, 2]})


@parameterize_batch_for_data_sources(data_source_configs=ALL_DATA_SOURCES, data=ONES_AND_TWOS)
def test_success_complete_results(batch_for_datasource: Batch) -> None:
expectation = gxe.ExpectColumnDistinctValuesToBeInSet(column=COL_NAME, value_set=[1, 2])
result = batch_for_datasource.validate(expectation, result_format=ResultFormat.COMPLETE)
assert result.success
assert result.to_json_dict()["result"] == {
"details": {
"value_counts": [
{"value": 1, "count": 1},
{"value": 2, "count": 3},
]
},
"observed_value": [1, 2],
}


@parameterize_batch_for_data_sources(
data_source_configs=ALL_DATA_SOURCES,
data=pd.DataFrame({COL_NAME: ["foo", "bar"]}),
)
def test_strings(batch_for_datasource: Batch) -> None:
expectation = gxe.ExpectColumnDistinctValuesToBeInSet(
column=COL_NAME, value_set=["foo", "bar", "baz"]
)
result = batch_for_datasource.validate(expectation)
assert result.success


@parameterize_batch_for_data_sources(
data_source_configs=DATA_SOURCES_THAT_SUPPORT_DATE_COMPARISONS,
data=pd.DataFrame({COL_NAME: [datetime(2024, 11, 19).date(), datetime(2024, 11, 20).date()]}), # noqa: DTZ001
)
def test_dates(batch_for_datasource: Batch) -> None:
expectation = gxe.ExpectColumnDistinctValuesToBeInSet(
column=COL_NAME,
value_set=[datetime(2024, 11, 19).date(), datetime(2024, 11, 20).date()], # noqa: DTZ001
)
result = batch_for_datasource.validate(expectation)
assert result.success


@parameterize_batch_for_data_sources(
data_source_configs=JUST_PANDAS_DATA_SOURCES, data=pd.DataFrame({COL_NAME: [1, 2, None]})
)
def test_ignores_nulls(batch_for_datasource: Batch) -> None:
expectation = gxe.ExpectColumnDistinctValuesToBeInSet(column=COL_NAME, value_set=[1, 2])
result = batch_for_datasource.validate(expectation)
assert result.success


@parameterize_batch_for_data_sources(
data_source_configs=JUST_PANDAS_DATA_SOURCES, data=ONES_AND_TWOS
)
def test_data_is_subset(batch_for_datasource: Batch) -> None:
expectation = gxe.ExpectColumnDistinctValuesToBeInSet(column=COL_NAME, value_set=[1, 2, 3])
result = batch_for_datasource.validate(expectation)
assert result.success


@pytest.mark.xfail(strict=True)
@parameterize_batch_for_data_sources(
data_source_configs=JUST_PANDAS_DATA_SOURCES, data=ONES_AND_TWOS
)
def test_empty_value_set(batch_for_datasource: Batch) -> None:
"""Failing test that seems like a (pretty minor) bug"""
expectation = gxe.ExpectColumnDistinctValuesToBeInSet(column=COL_NAME, value_set=[])
result = batch_for_datasource.validate(expectation)
assert not result.success


@parameterize_batch_for_data_sources(
data_source_configs=JUST_PANDAS_DATA_SOURCES, data=ONES_AND_TWOS
)
def test_value_set_is_none(batch_for_datasource: Batch) -> None:
# why do we even allow this?!?
expectation = gxe.ExpectColumnDistinctValuesToBeInSet(column=COL_NAME, value_set=None)
result = batch_for_datasource.validate(expectation)
assert result.success


@parameterize_batch_for_data_sources(
data_source_configs=JUST_PANDAS_DATA_SOURCES, data=ONES_AND_TWOS
)
def test_failure(batch_for_datasource: Batch) -> None:
expectation = gxe.ExpectColumnDistinctValuesToBeInSet(column=COL_NAME, value_set=[1])
result = batch_for_datasource.validate(expectation)
assert not result.success
@@ -0,0 +1,83 @@
from datetime import datetime

import pandas as pd

import great_expectations.expectations as gxe
from great_expectations.core.result_format import ResultFormat
from great_expectations.datasource.fluent.interfaces import Batch
from tests.integration.conftest import parameterize_batch_for_data_sources
from tests.integration.data_sources_and_expectations.test_canonical_expectations import (
ALL_DATA_SOURCES,
DATA_SOURCES_THAT_SUPPORT_DATE_COMPARISONS,
JUST_PANDAS_DATA_SOURCES,
)

COL_NAME = "my_col"

ONES_AND_TWOS = pd.DataFrame({COL_NAME: [1, 2, 2, 2]})


@parameterize_batch_for_data_sources(data_source_configs=ALL_DATA_SOURCES, data=ONES_AND_TWOS)
def test_success_complete_results(batch_for_datasource: Batch) -> None:
expectation = gxe.ExpectColumnDistinctValuesToContainSet(column=COL_NAME, value_set=[1, 2])
result = batch_for_datasource.validate(expectation, result_format=ResultFormat.COMPLETE)
assert result.success
assert result.to_json_dict()["result"] == {
"details": {
"value_counts": [
{"value": 1, "count": 1},
{"value": 2, "count": 3},
]
},
"observed_value": [1, 2],
}


@parameterize_batch_for_data_sources(
data_source_configs=ALL_DATA_SOURCES,
data=pd.DataFrame({COL_NAME: ["foo", "bar"]}),
)
def test_strings(batch_for_datasource: Batch) -> None:
expectation = gxe.ExpectColumnDistinctValuesToContainSet(column=COL_NAME, value_set=["foo"])
result = batch_for_datasource.validate(expectation)
assert result.success


@parameterize_batch_for_data_sources(
data_source_configs=DATA_SOURCES_THAT_SUPPORT_DATE_COMPARISONS,
data=pd.DataFrame({COL_NAME: [datetime(2024, 11, 19).date(), datetime(2024, 11, 20).date()]}), # noqa: DTZ001
)
def test_dates(batch_for_datasource: Batch) -> None:
expectation = gxe.ExpectColumnDistinctValuesToContainSet(
column=COL_NAME,
value_set=[datetime(2024, 11, 19).date()], # noqa: DTZ001
)
result = batch_for_datasource.validate(expectation)
assert result.success


@parameterize_batch_for_data_sources(
data_source_configs=JUST_PANDAS_DATA_SOURCES, data=pd.DataFrame({COL_NAME: [1, 2, None]})
)
def test_ignores_nulls(batch_for_datasource: Batch) -> None:
expectation = gxe.ExpectColumnDistinctValuesToContainSet(column=COL_NAME, value_set=[1, 2])
result = batch_for_datasource.validate(expectation)
assert result.success


@parameterize_batch_for_data_sources(
data_source_configs=JUST_PANDAS_DATA_SOURCES, data=pd.DataFrame({COL_NAME: [1, 2, None]})
)
def test_data_is_superset(batch_for_datasource: Batch) -> None:
expectation = gxe.ExpectColumnDistinctValuesToContainSet(column=COL_NAME, value_set=[1])
result = batch_for_datasource.validate(expectation)
assert result.success


@parameterize_batch_for_data_sources(
data_source_configs=JUST_PANDAS_DATA_SOURCES, data=ONES_AND_TWOS
)
def test_failure(batch_for_datasource: Batch) -> None:
expectation = gxe.ExpectColumnDistinctValuesToContainSet(column=COL_NAME, value_set=[1, 2, 3])
result = batch_for_datasource.validate(expectation)
assert not result.success
@@ -0,0 +1,81 @@
from datetime import datetime
from typing import Optional

import pandas as pd
import pytest

import great_expectations.expectations as gxe
from great_expectations.core.result_format import ResultFormat
from great_expectations.datasource.fluent.interfaces import Batch
from tests.integration.conftest import parameterize_batch_for_data_sources
from tests.integration.data_sources_and_expectations.test_canonical_expectations import (
ALL_DATA_SOURCES,
DATA_SOURCES_THAT_SUPPORT_DATE_COMPARISONS,
JUST_PANDAS_DATA_SOURCES,
)

COL_NAME = "my_col"

ONES_AND_TWOS = pd.DataFrame({COL_NAME: [1, 2, 2, 2]})


@parameterize_batch_for_data_sources(data_source_configs=ALL_DATA_SOURCES, data=ONES_AND_TWOS)
def test_success_complete_results(batch_for_datasource: Batch) -> None:
expectation = gxe.ExpectColumnDistinctValuesToEqualSet(column=COL_NAME, value_set=[1, 2])
result = batch_for_datasource.validate(expectation, result_format=ResultFormat.COMPLETE)
assert result.success
assert result.to_json_dict()["result"] == {
"details": {
"value_counts": [
{"value": 1, "count": 1},
{"value": 2, "count": 3},
]
},
"observed_value": [1, 2],
}


@parameterize_batch_for_data_sources(
data_source_configs=ALL_DATA_SOURCES,
data=pd.DataFrame({COL_NAME: ["foo", "bar"]}),
)
def test_strings(batch_for_datasource: Batch) -> None:
expectation = gxe.ExpectColumnDistinctValuesToEqualSet(
column=COL_NAME, value_set=["foo", "bar"]
)
result = batch_for_datasource.validate(expectation)
assert result.success


@parameterize_batch_for_data_sources(
data_source_configs=DATA_SOURCES_THAT_SUPPORT_DATE_COMPARISONS,
data=pd.DataFrame({COL_NAME: [datetime(2024, 11, 19).date(), datetime(2024, 11, 20).date()]}), # noqa: DTZ001
)
def test_dates(batch_for_datasource: Batch) -> None:
expectation = gxe.ExpectColumnDistinctValuesToEqualSet(
column=COL_NAME,
value_set=[datetime(2024, 11, 19).date(), datetime(2024, 11, 20).date()], # noqa: DTZ001
)
result = batch_for_datasource.validate(expectation)
assert result.success


@parameterize_batch_for_data_sources(
data_source_configs=JUST_PANDAS_DATA_SOURCES, data=pd.DataFrame({COL_NAME: [1, 2, None]})
)
def test_ignores_nulls(batch_for_datasource: Batch) -> None:
expectation = gxe.ExpectColumnDistinctValuesToEqualSet(column=COL_NAME, value_set=[1, 2])
result = batch_for_datasource.validate(expectation)
assert result.success


@pytest.mark.parametrize("value_set", [None, [], [1], [1, 4], [1, 2, 3]])
@parameterize_batch_for_data_sources(
data_source_configs=JUST_PANDAS_DATA_SOURCES, data=ONES_AND_TWOS
)
def test_fails_if_data_is_not_equal(
batch_for_datasource: Batch, value_set: Optional[list[int]]
) -> None:
expectation = gxe.ExpectColumnDistinctValuesToEqualSet(column=COL_NAME, value_set=value_set)
result = batch_for_datasource.validate(expectation)
assert not result.success