NannyML · Duncan-Hunter · Jun 17, 2024
@@ -29,7 +29,7 @@
 from scipy.stats import chi2_contingency, ks_2samp, wasserstein_distance
 
 from nannyml._typing import Self
-from nannyml.base import _column_is_categorical, _remove_nans
+from nannyml.base import _remove_nans
 from nannyml.chunk import Chunker
 from nannyml.exceptions import InvalidArgumentsException, NotFittedException
 from nannyml.thresholds import Threshold, calculate_threshold_values
@@ -44,6 +44,7 @@ def __init__(
         column_name: str,
         chunker: Chunker,
         threshold: Threshold,
+        feature_type: FeatureType,
         computation_params: Optional[Dict[str, Any]] = None,
         upper_threshold_limit: Optional[float] = None,
         lower_threshold_limit: Optional[float] = None,
@@ -73,6 +74,7 @@ def __init__(
         self.column_name = column_name
 
         self.threshold = threshold
+        self.feature_type = feature_type
         self.upper_threshold_value: Optional[float] = None
         self.lower_threshold_value: Optional[float] = None
         self.lower_threshold_value_limit: Optional[float] = lower_threshold_limit
@@ -203,6 +205,8 @@ def create(cls, key: str, feature_type: FeatureType, **kwargs) -> Method:
         if kwargs is None:
             kwargs = {}
 
+        kwargs['feature_type'] = feature_type
+
         method_class = cls.registry[key][feature_type]
         return method_class(**kwargs)
 
@@ -272,23 +276,21 @@ def __init__(self, **kwargs) -> None:
         lower_threshold_limit : float, default=0
             An optional lower threshold for the performance metric.
         """
-        self._treat_as_type: str
         self._bins: np.ndarray
         self._reference_proba_in_bins: np.ndarray
 
     def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None):
         reference_data = _remove_nans(reference_data)
-        if _column_is_categorical(reference_data):
-            treat_as_type = 'cat'
-        else:
+
+        if self.feature_type == FeatureType.CONTINUOUS:
             n_unique_values = len(np.unique(reference_data))
             len_reference = len(reference_data)
-            if n_unique_values > 50 or n_unique_values / len_reference > 0.1:
-                treat_as_type = 'cont'
-            else:
-                treat_as_type = 'cat'
+            if not (n_unique_values > 50 or n_unique_values / len_reference > 0.1):
+                self._logger.warning(
+                    "Continuous feature has few unique values. Consider including in `treat_as_categorical`."
+                )
 
-        if treat_as_type == 'cont':
+        if self.feature_type == FeatureType.CONTINUOUS:
             bins = np.histogram_bin_edges(reference_data, bins='doane')
             reference_proba_in_bins = np.histogram(reference_data, bins=bins)[0] / len_reference
             self._bins = bins
@@ -299,23 +301,21 @@ def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None
             self._bins = reference_unique
             self._reference_proba_in_bins = reference_proba_per_unique
 
-        self._treat_as_type = treat_as_type
-
         return self
 
     def _calculate(self, data: pd.Series):
         reference_proba_in_bins = copy(self._reference_proba_in_bins)
         data = _remove_nans(data)
         if data.empty:
             return np.nan
-        if self._treat_as_type == 'cont':
+        if self.feature_type == FeatureType.CONTINUOUS:
             len_data = len(data)
             data_proba_in_bins = np.histogram(data, bins=self._bins)[0] / len_data
 
         else:
             data_unique, data_counts = np.unique(data, return_counts=True)
             data_counts_dic = dict(zip(data_unique, data_counts))
-            data_count_on_ref_bins = [data_counts_dic[key] if key in data_counts_dic else 0 for key in self._bins]
+            data_count_on_ref_bins = [data_counts_dic.get(key, 0) for key in self._bins]
             data_proba_in_bins = np.array(data_count_on_ref_bins) / len(data)
 
         leftover = 1 - np.sum(data_proba_in_bins)
@@ -693,23 +693,20 @@ def __init__(self, **kwargs) -> None:
             An optional lower threshold for the performance metric.
         """
 
-        self._treat_as_type: str
         self._bins: np.ndarray
         self._reference_proba_in_bins: np.ndarray
 
     def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None) -> Self:
         reference_data = _remove_nans(reference_data)
-        if _column_is_categorical(reference_data):
-            treat_as_type = 'cat'
-        else:
+        if self.feature_type == FeatureType.CONTINUOUS:
             n_unique_values = len(np.unique(reference_data))
             len_reference = len(reference_data)
-            if n_unique_values > 50 or n_unique_values / len_reference > 0.1:
-                treat_as_type = 'cont'
-            else:
-                treat_as_type = 'cat'
+            if not (n_unique_values > 50 or n_unique_values / len_reference > 0.1):
+                self._logger.warning(
+                    "Continuous feature has few unique values. Consider including in `treat_as_categorical`."
+                )
 
-        if treat_as_type == 'cont':
+        if self.feature_type == FeatureType.CONTINUOUS:
             bins = np.histogram_bin_edges(reference_data, bins='doane')
             reference_proba_in_bins = np.histogram(reference_data, bins=bins)[0] / len_reference
             self._bins = bins
@@ -720,23 +717,21 @@ def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None
             self._bins = reference_unique
             self._reference_proba_in_bins = reference_proba_per_unique
 
-        self._treat_as_type = treat_as_type
-
         return self
 
     def _calculate(self, data: pd.Series):
         data = _remove_nans(data)
         if data.empty:
             return np.nan
         reference_proba_in_bins = copy(self._reference_proba_in_bins)
-        if self._treat_as_type == 'cont':
+        if self.feature_type == FeatureType.CONTINUOUS:
             len_data = len(data)
             data_proba_in_bins = np.histogram(data, bins=self._bins)[0] / len_data
 
         else:
             data_unique, data_counts = np.unique(data, return_counts=True)
             data_counts_dic = dict(zip(data_unique, data_counts))
-            data_count_on_ref_bins = [data_counts_dic[key] if key in data_counts_dic else 0 for key in self._bins]
+            data_count_on_ref_bins = [data_counts_dic.get(key, 0) for key in self._bins]
             data_proba_in_bins = np.array(data_count_on_ref_bins) / len(data)
 
         leftover = 1 - np.sum(data_proba_in_bins)

@@ -1,4 +1,5 @@
 """Unit tests for the UnivariateDriftCalculator methods."""
+
 import numpy as np
 import pandas as pd
 import pytest
@@ -10,6 +11,7 @@
     KolmogorovSmirnovStatistic,
     LInfinityDistance,
     WassersteinDistance,
+    FeatureType,
 )
 from nannyml.thresholds import ConstantThreshold
 
@@ -22,7 +24,7 @@
 def test_js_for_0_distance():  # noqa: D103
     np.random.seed(1)
     reference = pd.Series(np.random.choice(np.linspace(0, 2, 6), 10_000), name='A')
-    js = JensenShannonDistance(chunker=chunker, threshold=threshold)
+    js = JensenShannonDistance(chunker=chunker, threshold=threshold, feature_type=FeatureType.CONTINUOUS)
     js.fit(reference)
     distance = js.calculate(reference)
     assert distance == 0
@@ -32,7 +34,7 @@ def test_js_for_both_continuous():  # noqa: D103
     np.random.seed(1)
     reference = pd.Series(np.random.normal(0, 1, 10_000), name='A')
     analysis = pd.Series(np.random.normal(0, 1, 1000), name='A')
-    js = JensenShannonDistance(chunker=chunker, threshold=threshold)
+    js = JensenShannonDistance(chunker=chunker, threshold=threshold, feature_type=FeatureType.CONTINUOUS)
     js.fit(reference)
     distance = js.calculate(analysis)
     assert np.round(distance, 2) == 0.05
@@ -42,7 +44,7 @@ def test_js_for_quasi_continuous():  # noqa: D103
     np.random.seed(1)
     reference = pd.Series(np.random.choice(np.linspace(0, 2, 6), 10_000), name='A')
     analysis = pd.Series(np.random.choice(np.linspace(0, 2, 3), 1000), name='A')
-    js = JensenShannonDistance(chunker=chunker, threshold=threshold)
+    js = JensenShannonDistance(chunker=chunker, threshold=threshold, feature_type=FeatureType.CONTINUOUS)
     js.fit(reference)
     distance = js.calculate(analysis)
     assert np.round(distance, 2) == 0.73
@@ -52,7 +54,7 @@ def test_js_for_categorical():  # noqa: D103
     np.random.seed(1)
     reference = pd.Series(np.random.choice(['a', 'b', 'c', 'd'], 10_000), name='A')
     analysis = pd.Series(np.random.choice(['a', 'b', 'c', 'e'], 1000), name='A')
-    js = JensenShannonDistance(chunker=chunker, threshold=threshold)
+    js = JensenShannonDistance(chunker=chunker, threshold=threshold, feature_type=FeatureType.CATEGORICAL)
     js.fit(reference)
     distance = js.calculate(analysis)
     assert np.round(distance, 2) == 0.5
@@ -61,7 +63,7 @@ def test_js_for_categorical():  # noqa: D103
 def test_l_infinity_for_new_category():  # noqa: D103
     reference = pd.Series(['a', 'a', 'b', 'b', 'c', 'c'], name='A')
     analysis = pd.Series(['a', 'a', 'b', 'b', 'c', 'c', 'd'], name='A')
-    infnorm = LInfinityDistance(chunker=chunker, threshold=threshold)
+    infnorm = LInfinityDistance(chunker=chunker, threshold=threshold, feature_type=FeatureType.CATEGORICAL)
     infnorm.fit(reference)
     distance = infnorm.calculate(analysis)
     assert np.round(distance, 2) == 0.14
@@ -70,7 +72,7 @@ def test_l_infinity_for_new_category():  # noqa: D103
 def test_l_infinity_for_no_change():  # noqa: D103
     reference = pd.Series(['a', 'a', 'b', 'b', 'c', 'c'], name='A')
     analysis = pd.Series(['a', 'a', 'b', 'b', 'c', 'c'], name='A')
-    infnorm = LInfinityDistance(chunker=chunker, threshold=threshold)
+    infnorm = LInfinityDistance(chunker=chunker, threshold=threshold, feature_type=FeatureType.CATEGORICAL)
     infnorm.fit(reference)
     distance = infnorm.calculate(analysis)
     assert np.round(distance, 2) == 0.0
@@ -79,7 +81,7 @@ def test_l_infinity_for_no_change():  # noqa: D103
 def test_l_infinity_for_total_change():  # noqa: D103
     reference = pd.Series(['a', 'a', 'b', 'b', 'c', 'c'], name='A')
     analysis = pd.Series(['b', 'b', 'b', 'b', 'b'], name='A')
-    infnorm = LInfinityDistance(chunker=chunker, threshold=threshold)
+    infnorm = LInfinityDistance(chunker=chunker, threshold=threshold, feature_type=FeatureType.CATEGORICAL)
     infnorm.fit(reference)
     distance = infnorm.calculate(analysis)
     assert np.round(distance, 2) == 0.67
@@ -92,7 +94,7 @@ def test_wasserstein_both_continuous_0_distance():  # noqa: D103
     np.random.seed(1)
     reference = pd.Series(np.random.normal(0, 1, 10_000), name='A')
     analysis = reference
-    wass_dist = WassersteinDistance(chunker=chunker, threshold=threshold)
+    wass_dist = WassersteinDistance(chunker=chunker, threshold=threshold, feature_type=FeatureType.CONTINUOUS)
     wass_dist = wass_dist.fit(reference).calculate(analysis)
     wass_dist = np.round(wass_dist, 2)
     assert wass_dist == 0
@@ -102,7 +104,7 @@ def test_wasserstein_both_continuous_positive_means_small_drift():  # noqa: D103
     np.random.seed(1)
     reference = pd.Series(np.random.normal(0, 1, 10000), name='A')
     analysis = pd.Series(np.random.normal(1, 1, 1000), name='A')
-    wass_dist = WassersteinDistance(chunker=chunker, threshold=threshold)
+    wass_dist = WassersteinDistance(chunker=chunker, threshold=threshold, feature_type=FeatureType.CONTINUOUS)
     wass_dist = wass_dist.fit(reference).calculate(analysis)
     wass_dist = np.round(wass_dist, 2)
     assert wass_dist == 1.01
@@ -112,7 +114,7 @@ def test_wasserstein_both_continuous_analysis_with_neg_mean_medium_drift():  # n
     np.random.seed(1)
     reference = pd.Series(np.random.normal(0, 1, 100000), name='A')
     analysis = pd.Series(np.random.normal(-4, 1, 1000), name='A')
-    wass_dist = WassersteinDistance(chunker=chunker, threshold=threshold)
+    wass_dist = WassersteinDistance(chunker=chunker, threshold=threshold, feature_type=FeatureType.CONTINUOUS)
     wass_dist = wass_dist.fit(reference).calculate(analysis)
     wass_dist = np.round(wass_dist, 2)
     assert wass_dist == 3.99
@@ -122,7 +124,7 @@ def test_wasserstein_both_continuous_analysis_estimate_with_out_of_reference_dri
     np.random.seed(1)
     reference = pd.Series(np.random.normal(0, 1, 15_000), name='A')
     analysis = pd.Series(np.random.normal(0, 10, 1_000_000), name='A')
-    wass_dist = WassersteinDistance(chunker=chunker, threshold=threshold)
+    wass_dist = WassersteinDistance(chunker=chunker, threshold=threshold, feature_type=FeatureType.CONTINUOUS)
     wass_dist = wass_dist.fit(reference).calculate(analysis)
     wass_dist = np.round(wass_dist, 3)
     assert wass_dist == 7.180
@@ -135,7 +137,11 @@ def test_hellinger_complete_overlap():  # noqa: D103
     np.random.seed(1)
     reference = pd.Series(np.random.normal(0, 1, 10_000), name='A')
     analysis = reference
-    hell_dist = HellingerDistance(chunker=chunker, threshold=threshold).fit(reference).calculate(analysis)
+    hell_dist = (
+        HellingerDistance(chunker=chunker, threshold=threshold, feature_type=FeatureType.CONTINUOUS)
+        .fit(reference)
+        .calculate(analysis)
+    )
     hell_dist = np.round(hell_dist, 2)
     assert hell_dist == 0
 
@@ -144,7 +150,11 @@ def test_hellinger_no_overlap():  # noqa: D103
     np.random.seed(1)
     reference = pd.Series(np.random.normal(0, 1, 10_000), name='A')
     analysis = pd.Series(np.random.normal(7, 1, 10_000), name='A')
-    hell_dist = HellingerDistance(chunker=chunker, threshold=threshold).fit(reference).calculate(analysis)
+    hell_dist = (
+        HellingerDistance(chunker=chunker, threshold=threshold, feature_type=FeatureType.CONTINUOUS)
+        .fit(reference)
+        .calculate(analysis)
+    )
     hell_dist = np.round(hell_dist, 2)
     assert hell_dist == 1
 
@@ -153,7 +163,11 @@ def test_hellinger_both_continuous_analysis_with_small_drift():  # noqa: D103
     np.random.seed(1)
     reference = pd.Series(np.random.normal(0, 1, 10_000), name='A')
     analysis = pd.Series(np.random.normal(-2, 1, 10_000), name='A')
-    hell_dist = HellingerDistance(chunker=chunker, threshold=threshold).fit(reference).calculate(analysis)
+    hell_dist = (
+        HellingerDistance(chunker=chunker, threshold=threshold, feature_type=FeatureType.CONTINUOUS)
+        .fit(reference)
+        .calculate(analysis)
+    )
     hell_dist = np.round(hell_dist, 2)
     assert hell_dist == 0.63
 
@@ -162,7 +176,7 @@ def test_hellinger_for_quasi_continuous():  # noqa: D103
     np.random.seed(1)
     reference = pd.Series(np.random.choice(np.linspace(0, 2, 6), 10_000), name='A')
     analysis = pd.Series(np.random.choice(np.linspace(0, 2, 3), 1000), name='A')
-    hell_dist = HellingerDistance(chunker=chunker, threshold=threshold)
+    hell_dist = HellingerDistance(chunker=chunker, threshold=threshold, feature_type=FeatureType.CONTINUOUS)
     hell_dist.fit(reference)
     distance = hell_dist.calculate(analysis)
     assert np.round(distance, 2) == 0.72
@@ -172,7 +186,7 @@ def test_hellinger_for_categorical():  # noqa: D103
     np.random.seed(1)
     reference = pd.Series(np.random.choice(['a', 'b', 'c', 'd'], 10_000), name='A')
     analysis = pd.Series(np.random.choice(['a', 'b', 'c', 'e'], 1000), name='A')
-    hell_dist = HellingerDistance(chunker=chunker, threshold=threshold)
+    hell_dist = HellingerDistance(chunker=chunker, threshold=threshold, feature_type=FeatureType.CATEGORICAL)
     hell_dist.fit(reference)
     distance = hell_dist.calculate(analysis)
     assert np.round(distance, 2) == 0.5
@@ -181,11 +195,31 @@ def test_hellinger_for_categorical():  # noqa: D103
 @pytest.mark.parametrize(
     'method',
     [
-        KolmogorovSmirnovStatistic(chunker=DefaultChunker(), threshold=ConstantThreshold(lower=-1, upper=None)),
-        LInfinityDistance(chunker=DefaultChunker(), threshold=ConstantThreshold(lower=-1, upper=None)),
-        JensenShannonDistance(chunker=DefaultChunker(), threshold=ConstantThreshold(lower=-1, upper=None)),
-        WassersteinDistance(chunker=DefaultChunker(), threshold=ConstantThreshold(lower=-1, upper=None)),
-        HellingerDistance(chunker=DefaultChunker(), threshold=ConstantThreshold(lower=-1, upper=None)),
+        KolmogorovSmirnovStatistic(
+            chunker=DefaultChunker(),
+            threshold=ConstantThreshold(lower=-1, upper=None),
+            feature_type=FeatureType.CONTINUOUS,
+        ),
+        LInfinityDistance(
+            chunker=DefaultChunker(),
+            threshold=ConstantThreshold(lower=-1, upper=None),
+            feature_type=FeatureType.CATEGORICAL,
+        ),
+        JensenShannonDistance(
+            chunker=DefaultChunker(),
+            threshold=ConstantThreshold(lower=-1, upper=None),
+            feature_type=FeatureType.CONTINUOUS,
+        ),
+        WassersteinDistance(
+            chunker=DefaultChunker(),
+            threshold=ConstantThreshold(lower=-1, upper=None),
+            feature_type=FeatureType.CONTINUOUS,
+        ),
+        HellingerDistance(
+            chunker=DefaultChunker(),
+            threshold=ConstantThreshold(lower=-1, upper=None),
+            feature_type=FeatureType.CONTINUOUS,
+        ),
     ],
 )
 def test_method_logs_warning_when_lower_threshold_is_overridden_by_metric_limits(caplog, method):  # noqa: D103
@@ -202,7 +236,9 @@ def test_method_logs_warning_when_lower_threshold_is_overridden_by_metric_limits
 @pytest.mark.parametrize(
     'method',
     [
-        KolmogorovSmirnovStatistic(chunker=DefaultChunker(), threshold=ConstantThreshold(upper=2)),
+        KolmogorovSmirnovStatistic(
+            chunker=DefaultChunker(), threshold=ConstantThreshold(upper=2), feature_type=FeatureType.CONTINUOUS
+        ),
     ],
 )
 def test_method_logs_warning_when_upper_threshold_is_overridden_by_metric_limits(caplog, method):  # noqa: D103