[MAINTENANCE] Add tests around expectations (#10688)
tyler-hoffman authored Dec 1, 2024
1 parent 50ea23a commit 44382ef
Showing 72 changed files with 5,195 additions and 5,287 deletions.
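Most of the non-test changes in this commit follow one mechanical pattern: defaults passed positionally to pydantic.Field are rewritten with the explicit default= keyword, and the longer calls are wrapped across lines. A minimal sketch of that pattern, assuming a plain pydantic model; the class and description constant below are illustrative stand-ins, not taken from any changed file:

import pydantic

STRICT_MIN_DESCRIPTION = "If True, the lower bound is exclusive."  # illustrative text


class ExampleModel(pydantic.BaseModel):  # illustrative stand-in for an Expectation class
    # Before: default passed positionally
    #   strict_min: bool = pydantic.Field(False, description=STRICT_MIN_DESCRIPTION)
    # After: default passed explicitly by keyword
    strict_min: bool = pydantic.Field(default=False, description=STRICT_MIN_DESCRIPTION)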
@@ -327,9 +327,13 @@ class ExpectColumnKLDivergenceToBeLessThan(ColumnAggregateExpectation):
tail_weight_holdout: Union[float, None] = pydantic.Field(
default=0, ge=0, le=1, description=TAIL_WEIGHT_HOLDOUT_DESCRIPTION
)
- bucketize_data: bool = pydantic.Field(True, description=BUCKETIZE_DATA_DESCRIPTION)
- min_value: Optional[Comparable] = pydantic.Field(None, description=MIN_VALUE_DESCRIPTION)
- max_value: Optional[Comparable] = pydantic.Field(None, description=MAX_VALUE_DESCRIPTION)
+ bucketize_data: bool = pydantic.Field(default=True, description=BUCKETIZE_DATA_DESCRIPTION)
+ min_value: Optional[Comparable] = pydantic.Field(
+ default=None, description=MIN_VALUE_DESCRIPTION
+ )
+ max_value: Optional[Comparable] = pydantic.Field(
+ default=None, description=MAX_VALUE_DESCRIPTION
+ )

# This dictionary contains metadata for display in the public gallery
library_metadata: ClassVar[Dict[str, Union[str, list, bool]]] = {
@@ -177,7 +177,7 @@ class ExpectColumnMostCommonValueToBeInSet(ColumnAggregateExpectation):

value_set: ValueSetField
ties_okay: Union[bool, None] = pydantic.Field(
- None,
+ default=None,
description=TIES_OKAY_DESCRIPTION,
)

@@ -185,10 +185,14 @@ class ExpectColumnProportionOfUniqueValuesToBeBetween(ColumnAggregateExpectation
}}
""" # noqa: E501

- min_value: Optional[Comparable] = pydantic.Field(None, description=MIN_VALUE_DESCRIPTION)
- max_value: Optional[Comparable] = pydantic.Field(None, description=MAX_VALUE_DESCRIPTION)
- strict_min: bool = pydantic.Field(False, description=STRICT_MIN_DESCRIPTION)
- strict_max: bool = pydantic.Field(False, description=STRICT_MAX_DESCRIPTION)
+ min_value: Optional[Comparable] = pydantic.Field(
+ default=None, description=MIN_VALUE_DESCRIPTION
+ )
+ max_value: Optional[Comparable] = pydantic.Field(
+ default=None, description=MAX_VALUE_DESCRIPTION
+ )
+ strict_min: bool = pydantic.Field(default=False, description=STRICT_MIN_DESCRIPTION)
+ strict_max: bool = pydantic.Field(default=False, description=STRICT_MAX_DESCRIPTION)

# This dictionary contains metadata for display in the public gallery
library_metadata = {
@@ -236,7 +236,7 @@ class ExpectColumnQuantileValuesToBeBetween(ColumnAggregateExpectation):

quantile_ranges: QuantileRange = pydantic.Field(description=QUANTILE_RANGES_DESCRIPTION)
allow_relative_error: Union[bool, str] = pydantic.Field(
- False,
+ default=False,
description=ALLOW_RELATIVE_ERROR_DESCRIPTION,
)

@@ -176,10 +176,14 @@ class ExpectColumnStdevToBeBetween(ColumnAggregateExpectation):
}}
""" # noqa: E501

- min_value: Optional[Comparable] = pydantic.Field(None, description=MIN_VALUE_DESCRIPTION)
- max_value: Optional[Comparable] = pydantic.Field(None, description=MAX_VALUE_DESCRIPTION)
- strict_min: bool = pydantic.Field(False, description=STRICT_MIN_DESCRIPTION)
- strict_max: bool = pydantic.Field(False, description=STRICT_MAX_DESCRIPTION)
+ min_value: Optional[Comparable] = pydantic.Field(
+ default=None, description=MIN_VALUE_DESCRIPTION
+ )
+ max_value: Optional[Comparable] = pydantic.Field(
+ default=None, description=MAX_VALUE_DESCRIPTION
+ )
+ strict_min: bool = pydantic.Field(default=False, description=STRICT_MIN_DESCRIPTION)
+ strict_max: bool = pydantic.Field(default=False, description=STRICT_MAX_DESCRIPTION)

# This dictionary contains metadata for display in the public gallery
library_metadata: ClassVar[Dict[str, Union[str, list, bool]]] = {
@@ -164,10 +164,14 @@ class ExpectColumnSumToBeBetween(ColumnAggregateExpectation):
}}
""" # noqa: E501

- min_value: Optional[Comparable] = pydantic.Field(None, description=MIN_VALUE_DESCRIPTION)
- max_value: Optional[Comparable] = pydantic.Field(None, description=MAX_VALUE_DESCRIPTION)
- strict_min: bool = pydantic.Field(False, description=STRICT_MIN_DESCRIPTION)
- strict_max: bool = pydantic.Field(False, description=STRICT_MAX_DESCRIPTION)
+ min_value: Optional[Comparable] = pydantic.Field(
+ default=None, description=MIN_VALUE_DESCRIPTION
+ )
+ max_value: Optional[Comparable] = pydantic.Field(
+ default=None, description=MAX_VALUE_DESCRIPTION
+ )
+ strict_min: bool = pydantic.Field(default=False, description=STRICT_MIN_DESCRIPTION)
+ strict_max: bool = pydantic.Field(default=False, description=STRICT_MAX_DESCRIPTION)

# This dictionary contains metadata for display in the public gallery
library_metadata: ClassVar[Dict[str, Union[str, list, bool]]] = {
@@ -176,14 +176,18 @@ class ExpectColumnUniqueValueCountToBeBetween(ColumnAggregateExpectation):
}}
""" # noqa: E501

- min_value: Optional[Comparable] = pydantic.Field(None, description=MIN_VALUE_DESCRIPTION)
- max_value: Optional[Comparable] = pydantic.Field(None, description=MAX_VALUE_DESCRIPTION)
+ min_value: Optional[Comparable] = pydantic.Field(
+ default=None, description=MIN_VALUE_DESCRIPTION
+ )
+ max_value: Optional[Comparable] = pydantic.Field(
+ default=None, description=MAX_VALUE_DESCRIPTION
+ )
strict_min: bool = pydantic.Field(
- False,
+ default=False,
description=STRICT_MIN_DESCRIPTION,
)
strict_max: bool = pydantic.Field(
- False,
+ default=False,
description=STRICT_MAX_DESCRIPTION,
)

@@ -0,0 +1,4 @@
"""Tests around individual expectations.
Files here should have a 1:1 relationship with the expectations they test.
"""
@@ -0,0 +1,107 @@
from datetime import datetime

import pandas as pd
import pytest

import great_expectations.expectations as gxe
from great_expectations.core.result_format import ResultFormat
from great_expectations.datasource.fluent.interfaces import Batch
from tests.integration.conftest import parameterize_batch_for_data_sources
from tests.integration.data_sources_and_expectations.test_canonical_expectations import (
ALL_DATA_SOURCES,
DATA_SOURCES_THAT_SUPPORT_DATE_COMPARISONS,
JUST_PANDAS_DATA_SOURCES,
)

COL_NAME = "my_col"

ONES_AND_TWOS = pd.DataFrame({COL_NAME: [1, 2, 2, 2]})


@parameterize_batch_for_data_sources(data_source_configs=ALL_DATA_SOURCES, data=ONES_AND_TWOS)
def test_success_complete_results(batch_for_datasource: Batch) -> None:
expectation = gxe.ExpectColumnDistinctValuesToBeInSet(column=COL_NAME, value_set=[1, 2])
result = batch_for_datasource.validate(expectation, result_format=ResultFormat.COMPLETE)
assert result.success
assert result.to_json_dict()["result"] == {
"details": {
"value_counts": [
{"value": 1, "count": 1},
{"value": 2, "count": 3},
]
},
"observed_value": [1, 2],
}


@parameterize_batch_for_data_sources(
data_source_configs=ALL_DATA_SOURCES,
data=pd.DataFrame({COL_NAME: ["foo", "bar"]}),
)
def test_strings(batch_for_datasource: Batch) -> None:
expectation = gxe.ExpectColumnDistinctValuesToBeInSet(
column=COL_NAME, value_set=["foo", "bar", "baz"]
)
result = batch_for_datasource.validate(expectation)
assert result.success


@parameterize_batch_for_data_sources(
data_source_configs=DATA_SOURCES_THAT_SUPPORT_DATE_COMPARISONS,
data=pd.DataFrame({COL_NAME: [datetime(2024, 11, 19).date(), datetime(2024, 11, 20).date()]}), # noqa: DTZ001
)
def test_dates(batch_for_datasource: Batch) -> None:
expectation = gxe.ExpectColumnDistinctValuesToBeInSet(
column=COL_NAME,
value_set=[datetime(2024, 11, 19).date(), datetime(2024, 11, 20).date()], # noqa: DTZ001
)
result = batch_for_datasource.validate(expectation)
assert result.success


@parameterize_batch_for_data_sources(
data_source_configs=JUST_PANDAS_DATA_SOURCES, data=pd.DataFrame({COL_NAME: [1, 2, None]})
)
def test_ignores_nulls(batch_for_datasource: Batch) -> None:
expectation = gxe.ExpectColumnDistinctValuesToBeInSet(column=COL_NAME, value_set=[1, 2])
result = batch_for_datasource.validate(expectation)
assert result.success


@parameterize_batch_for_data_sources(
data_source_configs=JUST_PANDAS_DATA_SOURCES, data=ONES_AND_TWOS
)
def test_data_is_subset(batch_for_datasource: Batch) -> None:
expectation = gxe.ExpectColumnDistinctValuesToBeInSet(column=COL_NAME, value_set=[1, 2, 3])
result = batch_for_datasource.validate(expectation)
assert result.success


@pytest.mark.xfail(strict=True)
@parameterize_batch_for_data_sources(
data_source_configs=JUST_PANDAS_DATA_SOURCES, data=ONES_AND_TWOS
)
def test_empty_value_set(batch_for_datasource: Batch) -> None:
"""Failing test that seems like a (pretty minor) bug"""
expectation = gxe.ExpectColumnDistinctValuesToBeInSet(column=COL_NAME, value_set=[])
result = batch_for_datasource.validate(expectation)
assert not result.success


@parameterize_batch_for_data_sources(
data_source_configs=JUST_PANDAS_DATA_SOURCES, data=ONES_AND_TWOS
)
def test_value_set_is_none(batch_for_datasource: Batch) -> None:
# why do we even allow this?!?
expectation = gxe.ExpectColumnDistinctValuesToBeInSet(column=COL_NAME, value_set=None)
result = batch_for_datasource.validate(expectation)
assert result.success


@parameterize_batch_for_data_sources(
data_source_configs=JUST_PANDAS_DATA_SOURCES, data=ONES_AND_TWOS
)
def test_failure(batch_for_datasource: Batch) -> None:
expectation = gxe.ExpectColumnDistinctValuesToBeInSet(column=COL_NAME, value_set=[1])
result = batch_for_datasource.validate(expectation)
assert not result.success
@@ -0,0 +1,83 @@
from datetime import datetime

import pandas as pd

import great_expectations.expectations as gxe
from great_expectations.core.result_format import ResultFormat
from great_expectations.datasource.fluent.interfaces import Batch
from tests.integration.conftest import parameterize_batch_for_data_sources
from tests.integration.data_sources_and_expectations.test_canonical_expectations import (
ALL_DATA_SOURCES,
DATA_SOURCES_THAT_SUPPORT_DATE_COMPARISONS,
JUST_PANDAS_DATA_SOURCES,
)

COL_NAME = "my_col"

ONES_AND_TWOS = pd.DataFrame({COL_NAME: [1, 2, 2, 2]})


@parameterize_batch_for_data_sources(data_source_configs=ALL_DATA_SOURCES, data=ONES_AND_TWOS)
def test_success_complete_results(batch_for_datasource: Batch) -> None:
expectation = gxe.ExpectColumnDistinctValuesToContainSet(column=COL_NAME, value_set=[1, 2])
result = batch_for_datasource.validate(expectation, result_format=ResultFormat.COMPLETE)
assert result.success
assert result.to_json_dict()["result"] == {
"details": {
"value_counts": [
{"value": 1, "count": 1},
{"value": 2, "count": 3},
]
},
"observed_value": [1, 2],
}


@parameterize_batch_for_data_sources(
data_source_configs=ALL_DATA_SOURCES,
data=pd.DataFrame({COL_NAME: ["foo", "bar"]}),
)
def test_strings(batch_for_datasource: Batch) -> None:
expectation = gxe.ExpectColumnDistinctValuesToContainSet(column=COL_NAME, value_set=["foo"])
result = batch_for_datasource.validate(expectation)
assert result.success


@parameterize_batch_for_data_sources(
data_source_configs=DATA_SOURCES_THAT_SUPPORT_DATE_COMPARISONS,
data=pd.DataFrame({COL_NAME: [datetime(2024, 11, 19).date(), datetime(2024, 11, 20).date()]}), # noqa: DTZ001
)
def test_dates(batch_for_datasource: Batch) -> None:
expectation = gxe.ExpectColumnDistinctValuesToContainSet(
column=COL_NAME,
value_set=[datetime(2024, 11, 19).date()], # noqa: DTZ001
)
result = batch_for_datasource.validate(expectation)
assert result.success


@parameterize_batch_for_data_sources(
data_source_configs=JUST_PANDAS_DATA_SOURCES, data=pd.DataFrame({COL_NAME: [1, 2, None]})
)
def test_ignores_nulls(batch_for_datasource: Batch) -> None:
expectation = gxe.ExpectColumnDistinctValuesToContainSet(column=COL_NAME, value_set=[1, 2])
result = batch_for_datasource.validate(expectation)
assert result.success


@parameterize_batch_for_data_sources(
data_source_configs=JUST_PANDAS_DATA_SOURCES, data=pd.DataFrame({COL_NAME: [1, 2, None]})
)
def test_data_is_superset(batch_for_datasource: Batch) -> None:
expectation = gxe.ExpectColumnDistinctValuesToContainSet(column=COL_NAME, value_set=[1])
result = batch_for_datasource.validate(expectation)
assert result.success


@parameterize_batch_for_data_sources(
data_source_configs=JUST_PANDAS_DATA_SOURCES, data=ONES_AND_TWOS
)
def test_failure(batch_for_datasource: Batch) -> None:
expectation = gxe.ExpectColumnDistinctValuesToContainSet(column=COL_NAME, value_set=[1, 2, 3])
result = batch_for_datasource.validate(expectation)
assert not result.success
@@ -0,0 +1,81 @@
from datetime import datetime
from typing import Optional

import pandas as pd
import pytest

import great_expectations.expectations as gxe
from great_expectations.core.result_format import ResultFormat
from great_expectations.datasource.fluent.interfaces import Batch
from tests.integration.conftest import parameterize_batch_for_data_sources
from tests.integration.data_sources_and_expectations.test_canonical_expectations import (
ALL_DATA_SOURCES,
DATA_SOURCES_THAT_SUPPORT_DATE_COMPARISONS,
JUST_PANDAS_DATA_SOURCES,
)

COL_NAME = "my_col"

ONES_AND_TWOS = pd.DataFrame({COL_NAME: [1, 2, 2, 2]})


@parameterize_batch_for_data_sources(data_source_configs=ALL_DATA_SOURCES, data=ONES_AND_TWOS)
def test_success_complete_results(batch_for_datasource: Batch) -> None:
expectation = gxe.ExpectColumnDistinctValuesToEqualSet(column=COL_NAME, value_set=[1, 2])
result = batch_for_datasource.validate(expectation, result_format=ResultFormat.COMPLETE)
assert result.success
assert result.to_json_dict()["result"] == {
"details": {
"value_counts": [
{"value": 1, "count": 1},
{"value": 2, "count": 3},
]
},
"observed_value": [1, 2],
}


@parameterize_batch_for_data_sources(
data_source_configs=ALL_DATA_SOURCES,
data=pd.DataFrame({COL_NAME: ["foo", "bar"]}),
)
def test_strings(batch_for_datasource: Batch) -> None:
expectation = gxe.ExpectColumnDistinctValuesToEqualSet(
column=COL_NAME, value_set=["foo", "bar"]
)
result = batch_for_datasource.validate(expectation)
assert result.success


@parameterize_batch_for_data_sources(
data_source_configs=DATA_SOURCES_THAT_SUPPORT_DATE_COMPARISONS,
data=pd.DataFrame({COL_NAME: [datetime(2024, 11, 19).date(), datetime(2024, 11, 20).date()]}), # noqa: DTZ001
)
def test_dates(batch_for_datasource: Batch) -> None:
expectation = gxe.ExpectColumnDistinctValuesToEqualSet(
column=COL_NAME,
value_set=[datetime(2024, 11, 19).date(), datetime(2024, 11, 20).date()], # noqa: DTZ001
)
result = batch_for_datasource.validate(expectation)
assert result.success


@parameterize_batch_for_data_sources(
data_source_configs=JUST_PANDAS_DATA_SOURCES, data=pd.DataFrame({COL_NAME: [1, 2, None]})
)
def test_ignores_nulls(batch_for_datasource: Batch) -> None:
expectation = gxe.ExpectColumnDistinctValuesToEqualSet(column=COL_NAME, value_set=[1, 2])
result = batch_for_datasource.validate(expectation)
assert result.success


@pytest.mark.parametrize("value_set", [None, [], [1], [1, 4], [1, 2, 3]])
@parameterize_batch_for_data_sources(
data_source_configs=JUST_PANDAS_DATA_SOURCES, data=ONES_AND_TWOS
)
def test_fails_if_data_is_not_equal(
batch_for_datasource: Batch, value_set: Optional[list[int]]
) -> None:
expectation = gxe.ExpectColumnDistinctValuesToEqualSet(column=COL_NAME, value_set=value_set)
result = batch_for_datasource.validate(expectation)
assert not result.success