diff --git a/nannyml/drift/univariate/methods.py b/nannyml/drift/univariate/methods.py index a635c831..1c88eefe 100644 --- a/nannyml/drift/univariate/methods.py +++ b/nannyml/drift/univariate/methods.py @@ -29,7 +29,7 @@ from scipy.stats import chi2_contingency, ks_2samp, wasserstein_distance from nannyml._typing import Self -from nannyml.base import _column_is_categorical, _remove_nans +from nannyml.base import _remove_nans from nannyml.chunk import Chunker from nannyml.exceptions import InvalidArgumentsException, NotFittedException from nannyml.thresholds import Threshold, calculate_threshold_values @@ -44,6 +44,7 @@ def __init__( column_name: str, chunker: Chunker, threshold: Threshold, + feature_type: FeatureType, computation_params: Optional[Dict[str, Any]] = None, upper_threshold_limit: Optional[float] = None, lower_threshold_limit: Optional[float] = None, @@ -73,6 +74,7 @@ def __init__( self.column_name = column_name self.threshold = threshold + self.feature_type = feature_type self.upper_threshold_value: Optional[float] = None self.lower_threshold_value: Optional[float] = None self.lower_threshold_value_limit: Optional[float] = lower_threshold_limit @@ -203,6 +205,8 @@ def create(cls, key: str, feature_type: FeatureType, **kwargs) -> Method: if kwargs is None: kwargs = {} + kwargs['feature_type'] = feature_type + method_class = cls.registry[key][feature_type] return method_class(**kwargs) @@ -272,23 +276,21 @@ def __init__(self, **kwargs) -> None: lower_threshold_limit : float, default=0 An optional lower threshold for the performance metric. """ - self._treat_as_type: str self._bins: np.ndarray self._reference_proba_in_bins: np.ndarray def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None): reference_data = _remove_nans(reference_data) - if _column_is_categorical(reference_data): - treat_as_type = 'cat' - else: + + if self.feature_type == FeatureType.CONTINUOUS: n_unique_values = len(np.unique(reference_data)) len_reference = len(reference_data) - if n_unique_values > 50 or n_unique_values / len_reference > 0.1: - treat_as_type = 'cont' - else: - treat_as_type = 'cat' + if not (n_unique_values > 50 or n_unique_values / len_reference > 0.1): + self._logger.warning( + "Continuous feature has few unique values. Consider including in `treat_as_categorical`." + ) - if treat_as_type == 'cont': + if self.feature_type == FeatureType.CONTINUOUS: bins = np.histogram_bin_edges(reference_data, bins='doane') reference_proba_in_bins = np.histogram(reference_data, bins=bins)[0] / len_reference self._bins = bins @@ -299,8 +301,6 @@ def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None self._bins = reference_unique self._reference_proba_in_bins = reference_proba_per_unique - self._treat_as_type = treat_as_type - return self def _calculate(self, data: pd.Series): @@ -308,14 +308,14 @@ def _calculate(self, data: pd.Series): data = _remove_nans(data) if data.empty: return np.nan - if self._treat_as_type == 'cont': + if self.feature_type == FeatureType.CONTINUOUS: len_data = len(data) data_proba_in_bins = np.histogram(data, bins=self._bins)[0] / len_data else: data_unique, data_counts = np.unique(data, return_counts=True) data_counts_dic = dict(zip(data_unique, data_counts)) - data_count_on_ref_bins = [data_counts_dic[key] if key in data_counts_dic else 0 for key in self._bins] + data_count_on_ref_bins = [data_counts_dic.get(key, 0) for key in self._bins] data_proba_in_bins = np.array(data_count_on_ref_bins) / len(data) leftover = 1 - np.sum(data_proba_in_bins) @@ -693,23 +693,20 @@ def __init__(self, **kwargs) -> None: An optional lower threshold for the performance metric. """ - self._treat_as_type: str self._bins: np.ndarray self._reference_proba_in_bins: np.ndarray def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None) -> Self: reference_data = _remove_nans(reference_data) - if _column_is_categorical(reference_data): - treat_as_type = 'cat' - else: + if self.feature_type == FeatureType.CONTINUOUS: n_unique_values = len(np.unique(reference_data)) len_reference = len(reference_data) - if n_unique_values > 50 or n_unique_values / len_reference > 0.1: - treat_as_type = 'cont' - else: - treat_as_type = 'cat' + if not (n_unique_values > 50 or n_unique_values / len_reference > 0.1): + self._logger.warning( + "Continuous feature has few unique values. Consider including in `treat_as_categorical`." + ) - if treat_as_type == 'cont': + if self.feature_type == FeatureType.CONTINUOUS: bins = np.histogram_bin_edges(reference_data, bins='doane') reference_proba_in_bins = np.histogram(reference_data, bins=bins)[0] / len_reference self._bins = bins @@ -720,8 +717,6 @@ def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None self._bins = reference_unique self._reference_proba_in_bins = reference_proba_per_unique - self._treat_as_type = treat_as_type - return self def _calculate(self, data: pd.Series): @@ -729,14 +724,14 @@ def _calculate(self, data: pd.Series): if data.empty: return np.nan reference_proba_in_bins = copy(self._reference_proba_in_bins) - if self._treat_as_type == 'cont': + if self.feature_type == FeatureType.CONTINUOUS: len_data = len(data) data_proba_in_bins = np.histogram(data, bins=self._bins)[0] / len_data else: data_unique, data_counts = np.unique(data, return_counts=True) data_counts_dic = dict(zip(data_unique, data_counts)) - data_count_on_ref_bins = [data_counts_dic[key] if key in data_counts_dic else 0 for key in self._bins] + data_count_on_ref_bins = [data_counts_dic.get(key, 0) for key in self._bins] data_proba_in_bins = np.array(data_count_on_ref_bins) / len(data) leftover = 1 - np.sum(data_proba_in_bins) diff --git a/tests/drift/test_univariate_drift_methods.py b/tests/drift/test_univariate_drift_methods.py index 50d742c8..cc3f09cf 100644 --- a/tests/drift/test_univariate_drift_methods.py +++ b/tests/drift/test_univariate_drift_methods.py @@ -1,4 +1,5 @@ """Unit tests for the UnivariateDriftCalculator methods.""" + import numpy as np import pandas as pd import pytest @@ -10,6 +11,7 @@ KolmogorovSmirnovStatistic, LInfinityDistance, WassersteinDistance, + FeatureType, ) from nannyml.thresholds import ConstantThreshold @@ -22,7 +24,7 @@ def test_js_for_0_distance(): # noqa: D103 np.random.seed(1) reference = pd.Series(np.random.choice(np.linspace(0, 2, 6), 10_000), name='A') - js = JensenShannonDistance(chunker=chunker, threshold=threshold) + js = JensenShannonDistance(chunker=chunker, threshold=threshold, feature_type=FeatureType.CONTINUOUS) js.fit(reference) distance = js.calculate(reference) assert distance == 0 @@ -32,7 +34,7 @@ def test_js_for_both_continuous(): # noqa: D103 np.random.seed(1) reference = pd.Series(np.random.normal(0, 1, 10_000), name='A') analysis = pd.Series(np.random.normal(0, 1, 1000), name='A') - js = JensenShannonDistance(chunker=chunker, threshold=threshold) + js = JensenShannonDistance(chunker=chunker, threshold=threshold, feature_type=FeatureType.CONTINUOUS) js.fit(reference) distance = js.calculate(analysis) assert np.round(distance, 2) == 0.05 @@ -42,7 +44,7 @@ def test_js_for_quasi_continuous(): # noqa: D103 np.random.seed(1) reference = pd.Series(np.random.choice(np.linspace(0, 2, 6), 10_000), name='A') analysis = pd.Series(np.random.choice(np.linspace(0, 2, 3), 1000), name='A') - js = JensenShannonDistance(chunker=chunker, threshold=threshold) + js = JensenShannonDistance(chunker=chunker, threshold=threshold, feature_type=FeatureType.CONTINUOUS) js.fit(reference) distance = js.calculate(analysis) assert np.round(distance, 2) == 0.73 @@ -52,7 +54,7 @@ def test_js_for_categorical(): # noqa: D103 np.random.seed(1) reference = pd.Series(np.random.choice(['a', 'b', 'c', 'd'], 10_000), name='A') analysis = pd.Series(np.random.choice(['a', 'b', 'c', 'e'], 1000), name='A') - js = JensenShannonDistance(chunker=chunker, threshold=threshold) + js = JensenShannonDistance(chunker=chunker, threshold=threshold, feature_type=FeatureType.CATEGORICAL) js.fit(reference) distance = js.calculate(analysis) assert np.round(distance, 2) == 0.5 @@ -61,7 +63,7 @@ def test_js_for_categorical(): # noqa: D103 def test_l_infinity_for_new_category(): # noqa: D103 reference = pd.Series(['a', 'a', 'b', 'b', 'c', 'c'], name='A') analysis = pd.Series(['a', 'a', 'b', 'b', 'c', 'c', 'd'], name='A') - infnorm = LInfinityDistance(chunker=chunker, threshold=threshold) + infnorm = LInfinityDistance(chunker=chunker, threshold=threshold, feature_type=FeatureType.CATEGORICAL) infnorm.fit(reference) distance = infnorm.calculate(analysis) assert np.round(distance, 2) == 0.14 @@ -70,7 +72,7 @@ def test_l_infinity_for_new_category(): # noqa: D103 def test_l_infinity_for_no_change(): # noqa: D103 reference = pd.Series(['a', 'a', 'b', 'b', 'c', 'c'], name='A') analysis = pd.Series(['a', 'a', 'b', 'b', 'c', 'c'], name='A') - infnorm = LInfinityDistance(chunker=chunker, threshold=threshold) + infnorm = LInfinityDistance(chunker=chunker, threshold=threshold, feature_type=FeatureType.CATEGORICAL) infnorm.fit(reference) distance = infnorm.calculate(analysis) assert np.round(distance, 2) == 0.0 @@ -79,7 +81,7 @@ def test_l_infinity_for_no_change(): # noqa: D103 def test_l_infinity_for_total_change(): # noqa: D103 reference = pd.Series(['a', 'a', 'b', 'b', 'c', 'c'], name='A') analysis = pd.Series(['b', 'b', 'b', 'b', 'b'], name='A') - infnorm = LInfinityDistance(chunker=chunker, threshold=threshold) + infnorm = LInfinityDistance(chunker=chunker, threshold=threshold, feature_type=FeatureType.CATEGORICAL) infnorm.fit(reference) distance = infnorm.calculate(analysis) assert np.round(distance, 2) == 0.67 @@ -92,7 +94,7 @@ def test_wasserstein_both_continuous_0_distance(): # noqa: D103 np.random.seed(1) reference = pd.Series(np.random.normal(0, 1, 10_000), name='A') analysis = reference - wass_dist = WassersteinDistance(chunker=chunker, threshold=threshold) + wass_dist = WassersteinDistance(chunker=chunker, threshold=threshold, feature_type=FeatureType.CONTINUOUS) wass_dist = wass_dist.fit(reference).calculate(analysis) wass_dist = np.round(wass_dist, 2) assert wass_dist == 0 @@ -102,7 +104,7 @@ def test_wasserstein_both_continuous_positive_means_small_drift(): # noqa: D103 np.random.seed(1) reference = pd.Series(np.random.normal(0, 1, 10000), name='A') analysis = pd.Series(np.random.normal(1, 1, 1000), name='A') - wass_dist = WassersteinDistance(chunker=chunker, threshold=threshold) + wass_dist = WassersteinDistance(chunker=chunker, threshold=threshold, feature_type=FeatureType.CONTINUOUS) wass_dist = wass_dist.fit(reference).calculate(analysis) wass_dist = np.round(wass_dist, 2) assert wass_dist == 1.01 @@ -112,7 +114,7 @@ def test_wasserstein_both_continuous_analysis_with_neg_mean_medium_drift(): # n np.random.seed(1) reference = pd.Series(np.random.normal(0, 1, 100000), name='A') analysis = pd.Series(np.random.normal(-4, 1, 1000), name='A') - wass_dist = WassersteinDistance(chunker=chunker, threshold=threshold) + wass_dist = WassersteinDistance(chunker=chunker, threshold=threshold, feature_type=FeatureType.CONTINUOUS) wass_dist = wass_dist.fit(reference).calculate(analysis) wass_dist = np.round(wass_dist, 2) assert wass_dist == 3.99 @@ -122,7 +124,7 @@ def test_wasserstein_both_continuous_analysis_estimate_with_out_of_reference_dri np.random.seed(1) reference = pd.Series(np.random.normal(0, 1, 15_000), name='A') analysis = pd.Series(np.random.normal(0, 10, 1_000_000), name='A') - wass_dist = WassersteinDistance(chunker=chunker, threshold=threshold) + wass_dist = WassersteinDistance(chunker=chunker, threshold=threshold, feature_type=FeatureType.CONTINUOUS) wass_dist = wass_dist.fit(reference).calculate(analysis) wass_dist = np.round(wass_dist, 3) assert wass_dist == 7.180 @@ -135,7 +137,11 @@ def test_hellinger_complete_overlap(): # noqa: D103 np.random.seed(1) reference = pd.Series(np.random.normal(0, 1, 10_000), name='A') analysis = reference - hell_dist = HellingerDistance(chunker=chunker, threshold=threshold).fit(reference).calculate(analysis) + hell_dist = ( + HellingerDistance(chunker=chunker, threshold=threshold, feature_type=FeatureType.CONTINUOUS) + .fit(reference) + .calculate(analysis) + ) hell_dist = np.round(hell_dist, 2) assert hell_dist == 0 @@ -144,7 +150,11 @@ def test_hellinger_no_overlap(): # noqa: D103 np.random.seed(1) reference = pd.Series(np.random.normal(0, 1, 10_000), name='A') analysis = pd.Series(np.random.normal(7, 1, 10_000), name='A') - hell_dist = HellingerDistance(chunker=chunker, threshold=threshold).fit(reference).calculate(analysis) + hell_dist = ( + HellingerDistance(chunker=chunker, threshold=threshold, feature_type=FeatureType.CONTINUOUS) + .fit(reference) + .calculate(analysis) + ) hell_dist = np.round(hell_dist, 2) assert hell_dist == 1 @@ -153,7 +163,11 @@ def test_hellinger_both_continuous_analysis_with_small_drift(): # noqa: D103 np.random.seed(1) reference = pd.Series(np.random.normal(0, 1, 10_000), name='A') analysis = pd.Series(np.random.normal(-2, 1, 10_000), name='A') - hell_dist = HellingerDistance(chunker=chunker, threshold=threshold).fit(reference).calculate(analysis) + hell_dist = ( + HellingerDistance(chunker=chunker, threshold=threshold, feature_type=FeatureType.CONTINUOUS) + .fit(reference) + .calculate(analysis) + ) hell_dist = np.round(hell_dist, 2) assert hell_dist == 0.63 @@ -162,7 +176,7 @@ def test_hellinger_for_quasi_continuous(): # noqa: D103 np.random.seed(1) reference = pd.Series(np.random.choice(np.linspace(0, 2, 6), 10_000), name='A') analysis = pd.Series(np.random.choice(np.linspace(0, 2, 3), 1000), name='A') - hell_dist = HellingerDistance(chunker=chunker, threshold=threshold) + hell_dist = HellingerDistance(chunker=chunker, threshold=threshold, feature_type=FeatureType.CONTINUOUS) hell_dist.fit(reference) distance = hell_dist.calculate(analysis) assert np.round(distance, 2) == 0.72 @@ -172,7 +186,7 @@ def test_hellinger_for_categorical(): # noqa: D103 np.random.seed(1) reference = pd.Series(np.random.choice(['a', 'b', 'c', 'd'], 10_000), name='A') analysis = pd.Series(np.random.choice(['a', 'b', 'c', 'e'], 1000), name='A') - hell_dist = HellingerDistance(chunker=chunker, threshold=threshold) + hell_dist = HellingerDistance(chunker=chunker, threshold=threshold, feature_type=FeatureType.CATEGORICAL) hell_dist.fit(reference) distance = hell_dist.calculate(analysis) assert np.round(distance, 2) == 0.5 @@ -181,11 +195,31 @@ def test_hellinger_for_categorical(): # noqa: D103 @pytest.mark.parametrize( 'method', [ - KolmogorovSmirnovStatistic(chunker=DefaultChunker(), threshold=ConstantThreshold(lower=-1, upper=None)), - LInfinityDistance(chunker=DefaultChunker(), threshold=ConstantThreshold(lower=-1, upper=None)), - JensenShannonDistance(chunker=DefaultChunker(), threshold=ConstantThreshold(lower=-1, upper=None)), - WassersteinDistance(chunker=DefaultChunker(), threshold=ConstantThreshold(lower=-1, upper=None)), - HellingerDistance(chunker=DefaultChunker(), threshold=ConstantThreshold(lower=-1, upper=None)), + KolmogorovSmirnovStatistic( + chunker=DefaultChunker(), + threshold=ConstantThreshold(lower=-1, upper=None), + feature_type=FeatureType.CONTINUOUS, + ), + LInfinityDistance( + chunker=DefaultChunker(), + threshold=ConstantThreshold(lower=-1, upper=None), + feature_type=FeatureType.CATEGORICAL, + ), + JensenShannonDistance( + chunker=DefaultChunker(), + threshold=ConstantThreshold(lower=-1, upper=None), + feature_type=FeatureType.CONTINUOUS, + ), + WassersteinDistance( + chunker=DefaultChunker(), + threshold=ConstantThreshold(lower=-1, upper=None), + feature_type=FeatureType.CONTINUOUS, + ), + HellingerDistance( + chunker=DefaultChunker(), + threshold=ConstantThreshold(lower=-1, upper=None), + feature_type=FeatureType.CONTINUOUS, + ), ], ) def test_method_logs_warning_when_lower_threshold_is_overridden_by_metric_limits(caplog, method): # noqa: D103 @@ -202,7 +236,9 @@ def test_method_logs_warning_when_lower_threshold_is_overridden_by_metric_limits @pytest.mark.parametrize( 'method', [ - KolmogorovSmirnovStatistic(chunker=DefaultChunker(), threshold=ConstantThreshold(upper=2)), + KolmogorovSmirnovStatistic( + chunker=DefaultChunker(), threshold=ConstantThreshold(upper=2), feature_type=FeatureType.CONTINUOUS + ), ], ) def test_method_logs_warning_when_upper_threshold_is_overridden_by_metric_limits(caplog, method): # noqa: D103