Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

propose solution to remove hidden feature type conversion in Univariate Drift #398

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 22 additions & 27 deletions nannyml/drift/univariate/methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
from scipy.stats import chi2_contingency, ks_2samp, wasserstein_distance

from nannyml._typing import Self
from nannyml.base import _column_is_categorical, _remove_nans
from nannyml.base import _remove_nans
from nannyml.chunk import Chunker
from nannyml.exceptions import InvalidArgumentsException, NotFittedException
from nannyml.thresholds import Threshold, calculate_threshold_values
Expand All @@ -44,6 +44,7 @@ def __init__(
column_name: str,
chunker: Chunker,
threshold: Threshold,
feature_type: FeatureType,
computation_params: Optional[Dict[str, Any]] = None,
upper_threshold_limit: Optional[float] = None,
lower_threshold_limit: Optional[float] = None,
Expand Down Expand Up @@ -73,6 +74,7 @@ def __init__(
self.column_name = column_name

self.threshold = threshold
self.feature_type = feature_type
self.upper_threshold_value: Optional[float] = None
self.lower_threshold_value: Optional[float] = None
self.lower_threshold_value_limit: Optional[float] = lower_threshold_limit
Expand Down Expand Up @@ -203,6 +205,8 @@ def create(cls, key: str, feature_type: FeatureType, **kwargs) -> Method:
if kwargs is None:
kwargs = {}

kwargs['feature_type'] = feature_type

method_class = cls.registry[key][feature_type]
return method_class(**kwargs)

Expand Down Expand Up @@ -272,23 +276,21 @@ def __init__(self, **kwargs) -> None:
lower_threshold_limit : float, default=0
An optional lower threshold for the performance metric.
"""
self._treat_as_type: str
self._bins: np.ndarray
self._reference_proba_in_bins: np.ndarray

def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None):
reference_data = _remove_nans(reference_data)
if _column_is_categorical(reference_data):
treat_as_type = 'cat'
else:

if self.feature_type == FeatureType.CONTINUOUS:
n_unique_values = len(np.unique(reference_data))
len_reference = len(reference_data)
if n_unique_values > 50 or n_unique_values / len_reference > 0.1:
treat_as_type = 'cont'
else:
treat_as_type = 'cat'
if not (n_unique_values > 50 or n_unique_values / len_reference > 0.1):
self._logger.warning(
"Continuous feature has few unique values. Consider including in `treat_as_categorical`."
)

if treat_as_type == 'cont':
if self.feature_type == FeatureType.CONTINUOUS:
bins = np.histogram_bin_edges(reference_data, bins='doane')
reference_proba_in_bins = np.histogram(reference_data, bins=bins)[0] / len_reference
self._bins = bins
Expand All @@ -299,23 +301,21 @@ def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None
self._bins = reference_unique
self._reference_proba_in_bins = reference_proba_per_unique

self._treat_as_type = treat_as_type

return self

def _calculate(self, data: pd.Series):
reference_proba_in_bins = copy(self._reference_proba_in_bins)
data = _remove_nans(data)
if data.empty:
return np.nan
if self._treat_as_type == 'cont':
if self.feature_type == FeatureType.CONTINUOUS:
len_data = len(data)
data_proba_in_bins = np.histogram(data, bins=self._bins)[0] / len_data

else:
data_unique, data_counts = np.unique(data, return_counts=True)
data_counts_dic = dict(zip(data_unique, data_counts))
data_count_on_ref_bins = [data_counts_dic[key] if key in data_counts_dic else 0 for key in self._bins]
data_count_on_ref_bins = [data_counts_dic.get(key, 0) for key in self._bins]
data_proba_in_bins = np.array(data_count_on_ref_bins) / len(data)

leftover = 1 - np.sum(data_proba_in_bins)
Expand Down Expand Up @@ -693,23 +693,20 @@ def __init__(self, **kwargs) -> None:
An optional lower threshold for the performance metric.
"""

self._treat_as_type: str
self._bins: np.ndarray
self._reference_proba_in_bins: np.ndarray

def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None) -> Self:
reference_data = _remove_nans(reference_data)
if _column_is_categorical(reference_data):
treat_as_type = 'cat'
else:
if self.feature_type == FeatureType.CONTINUOUS:
n_unique_values = len(np.unique(reference_data))
len_reference = len(reference_data)
if n_unique_values > 50 or n_unique_values / len_reference > 0.1:
treat_as_type = 'cont'
else:
treat_as_type = 'cat'
if not (n_unique_values > 50 or n_unique_values / len_reference > 0.1):
self._logger.warning(
"Continuous feature has few unique values. Consider including in `treat_as_categorical`."
)

if treat_as_type == 'cont':
if self.feature_type == FeatureType.CONTINUOUS:
bins = np.histogram_bin_edges(reference_data, bins='doane')
reference_proba_in_bins = np.histogram(reference_data, bins=bins)[0] / len_reference
self._bins = bins
Expand All @@ -720,23 +717,21 @@ def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None
self._bins = reference_unique
self._reference_proba_in_bins = reference_proba_per_unique

self._treat_as_type = treat_as_type

return self

def _calculate(self, data: pd.Series):
data = _remove_nans(data)
if data.empty:
return np.nan
reference_proba_in_bins = copy(self._reference_proba_in_bins)
if self._treat_as_type == 'cont':
if self.feature_type == FeatureType.CONTINUOUS:
len_data = len(data)
data_proba_in_bins = np.histogram(data, bins=self._bins)[0] / len_data

else:
data_unique, data_counts = np.unique(data, return_counts=True)
data_counts_dic = dict(zip(data_unique, data_counts))
data_count_on_ref_bins = [data_counts_dic[key] if key in data_counts_dic else 0 for key in self._bins]
data_count_on_ref_bins = [data_counts_dic.get(key, 0) for key in self._bins]
data_proba_in_bins = np.array(data_count_on_ref_bins) / len(data)

leftover = 1 - np.sum(data_proba_in_bins)
Expand Down
80 changes: 58 additions & 22 deletions tests/drift/test_univariate_drift_methods.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Unit tests for the UnivariateDriftCalculator methods."""

import numpy as np
import pandas as pd
import pytest
Expand All @@ -10,6 +11,7 @@
KolmogorovSmirnovStatistic,
LInfinityDistance,
WassersteinDistance,
FeatureType,
)
from nannyml.thresholds import ConstantThreshold

Expand All @@ -22,7 +24,7 @@
def test_js_for_0_distance(): # noqa: D103
np.random.seed(1)
reference = pd.Series(np.random.choice(np.linspace(0, 2, 6), 10_000), name='A')
js = JensenShannonDistance(chunker=chunker, threshold=threshold)
js = JensenShannonDistance(chunker=chunker, threshold=threshold, feature_type=FeatureType.CONTINUOUS)
js.fit(reference)
distance = js.calculate(reference)
assert distance == 0
Expand All @@ -32,7 +34,7 @@ def test_js_for_both_continuous(): # noqa: D103
np.random.seed(1)
reference = pd.Series(np.random.normal(0, 1, 10_000), name='A')
analysis = pd.Series(np.random.normal(0, 1, 1000), name='A')
js = JensenShannonDistance(chunker=chunker, threshold=threshold)
js = JensenShannonDistance(chunker=chunker, threshold=threshold, feature_type=FeatureType.CONTINUOUS)
js.fit(reference)
distance = js.calculate(analysis)
assert np.round(distance, 2) == 0.05
Expand All @@ -42,7 +44,7 @@ def test_js_for_quasi_continuous(): # noqa: D103
np.random.seed(1)
reference = pd.Series(np.random.choice(np.linspace(0, 2, 6), 10_000), name='A')
analysis = pd.Series(np.random.choice(np.linspace(0, 2, 3), 1000), name='A')
js = JensenShannonDistance(chunker=chunker, threshold=threshold)
js = JensenShannonDistance(chunker=chunker, threshold=threshold, feature_type=FeatureType.CONTINUOUS)
js.fit(reference)
distance = js.calculate(analysis)
assert np.round(distance, 2) == 0.73
Expand All @@ -52,7 +54,7 @@ def test_js_for_categorical(): # noqa: D103
np.random.seed(1)
reference = pd.Series(np.random.choice(['a', 'b', 'c', 'd'], 10_000), name='A')
analysis = pd.Series(np.random.choice(['a', 'b', 'c', 'e'], 1000), name='A')
js = JensenShannonDistance(chunker=chunker, threshold=threshold)
js = JensenShannonDistance(chunker=chunker, threshold=threshold, feature_type=FeatureType.CATEGORICAL)
js.fit(reference)
distance = js.calculate(analysis)
assert np.round(distance, 2) == 0.5
Expand All @@ -61,7 +63,7 @@ def test_js_for_categorical(): # noqa: D103
def test_l_infinity_for_new_category(): # noqa: D103
reference = pd.Series(['a', 'a', 'b', 'b', 'c', 'c'], name='A')
analysis = pd.Series(['a', 'a', 'b', 'b', 'c', 'c', 'd'], name='A')
infnorm = LInfinityDistance(chunker=chunker, threshold=threshold)
infnorm = LInfinityDistance(chunker=chunker, threshold=threshold, feature_type=FeatureType.CATEGORICAL)
infnorm.fit(reference)
distance = infnorm.calculate(analysis)
assert np.round(distance, 2) == 0.14
Expand All @@ -70,7 +72,7 @@ def test_l_infinity_for_new_category(): # noqa: D103
def test_l_infinity_for_no_change(): # noqa: D103
reference = pd.Series(['a', 'a', 'b', 'b', 'c', 'c'], name='A')
analysis = pd.Series(['a', 'a', 'b', 'b', 'c', 'c'], name='A')
infnorm = LInfinityDistance(chunker=chunker, threshold=threshold)
infnorm = LInfinityDistance(chunker=chunker, threshold=threshold, feature_type=FeatureType.CATEGORICAL)
infnorm.fit(reference)
distance = infnorm.calculate(analysis)
assert np.round(distance, 2) == 0.0
Expand All @@ -79,7 +81,7 @@ def test_l_infinity_for_no_change(): # noqa: D103
def test_l_infinity_for_total_change(): # noqa: D103
reference = pd.Series(['a', 'a', 'b', 'b', 'c', 'c'], name='A')
analysis = pd.Series(['b', 'b', 'b', 'b', 'b'], name='A')
infnorm = LInfinityDistance(chunker=chunker, threshold=threshold)
infnorm = LInfinityDistance(chunker=chunker, threshold=threshold, feature_type=FeatureType.CATEGORICAL)
infnorm.fit(reference)
distance = infnorm.calculate(analysis)
assert np.round(distance, 2) == 0.67
Expand All @@ -92,7 +94,7 @@ def test_wasserstein_both_continuous_0_distance(): # noqa: D103
np.random.seed(1)
reference = pd.Series(np.random.normal(0, 1, 10_000), name='A')
analysis = reference
wass_dist = WassersteinDistance(chunker=chunker, threshold=threshold)
wass_dist = WassersteinDistance(chunker=chunker, threshold=threshold, feature_type=FeatureType.CONTINUOUS)
wass_dist = wass_dist.fit(reference).calculate(analysis)
wass_dist = np.round(wass_dist, 2)
assert wass_dist == 0
Expand All @@ -102,7 +104,7 @@ def test_wasserstein_both_continuous_positive_means_small_drift(): # noqa: D103
np.random.seed(1)
reference = pd.Series(np.random.normal(0, 1, 10000), name='A')
analysis = pd.Series(np.random.normal(1, 1, 1000), name='A')
wass_dist = WassersteinDistance(chunker=chunker, threshold=threshold)
wass_dist = WassersteinDistance(chunker=chunker, threshold=threshold, feature_type=FeatureType.CONTINUOUS)
wass_dist = wass_dist.fit(reference).calculate(analysis)
wass_dist = np.round(wass_dist, 2)
assert wass_dist == 1.01
Expand All @@ -112,7 +114,7 @@ def test_wasserstein_both_continuous_analysis_with_neg_mean_medium_drift(): # n
np.random.seed(1)
reference = pd.Series(np.random.normal(0, 1, 100000), name='A')
analysis = pd.Series(np.random.normal(-4, 1, 1000), name='A')
wass_dist = WassersteinDistance(chunker=chunker, threshold=threshold)
wass_dist = WassersteinDistance(chunker=chunker, threshold=threshold, feature_type=FeatureType.CONTINUOUS)
wass_dist = wass_dist.fit(reference).calculate(analysis)
wass_dist = np.round(wass_dist, 2)
assert wass_dist == 3.99
Expand All @@ -122,7 +124,7 @@ def test_wasserstein_both_continuous_analysis_estimate_with_out_of_reference_dri
np.random.seed(1)
reference = pd.Series(np.random.normal(0, 1, 15_000), name='A')
analysis = pd.Series(np.random.normal(0, 10, 1_000_000), name='A')
wass_dist = WassersteinDistance(chunker=chunker, threshold=threshold)
wass_dist = WassersteinDistance(chunker=chunker, threshold=threshold, feature_type=FeatureType.CONTINUOUS)
wass_dist = wass_dist.fit(reference).calculate(analysis)
wass_dist = np.round(wass_dist, 3)
assert wass_dist == 7.180
Expand All @@ -135,7 +137,11 @@ def test_hellinger_complete_overlap(): # noqa: D103
np.random.seed(1)
reference = pd.Series(np.random.normal(0, 1, 10_000), name='A')
analysis = reference
hell_dist = HellingerDistance(chunker=chunker, threshold=threshold).fit(reference).calculate(analysis)
hell_dist = (
HellingerDistance(chunker=chunker, threshold=threshold, feature_type=FeatureType.CONTINUOUS)
.fit(reference)
.calculate(analysis)
)
hell_dist = np.round(hell_dist, 2)
assert hell_dist == 0

Expand All @@ -144,7 +150,11 @@ def test_hellinger_no_overlap(): # noqa: D103
np.random.seed(1)
reference = pd.Series(np.random.normal(0, 1, 10_000), name='A')
analysis = pd.Series(np.random.normal(7, 1, 10_000), name='A')
hell_dist = HellingerDistance(chunker=chunker, threshold=threshold).fit(reference).calculate(analysis)
hell_dist = (
HellingerDistance(chunker=chunker, threshold=threshold, feature_type=FeatureType.CONTINUOUS)
.fit(reference)
.calculate(analysis)
)
hell_dist = np.round(hell_dist, 2)
assert hell_dist == 1

Expand All @@ -153,7 +163,11 @@ def test_hellinger_both_continuous_analysis_with_small_drift(): # noqa: D103
np.random.seed(1)
reference = pd.Series(np.random.normal(0, 1, 10_000), name='A')
analysis = pd.Series(np.random.normal(-2, 1, 10_000), name='A')
hell_dist = HellingerDistance(chunker=chunker, threshold=threshold).fit(reference).calculate(analysis)
hell_dist = (
HellingerDistance(chunker=chunker, threshold=threshold, feature_type=FeatureType.CONTINUOUS)
.fit(reference)
.calculate(analysis)
)
hell_dist = np.round(hell_dist, 2)
assert hell_dist == 0.63

Expand All @@ -162,7 +176,7 @@ def test_hellinger_for_quasi_continuous(): # noqa: D103
np.random.seed(1)
reference = pd.Series(np.random.choice(np.linspace(0, 2, 6), 10_000), name='A')
analysis = pd.Series(np.random.choice(np.linspace(0, 2, 3), 1000), name='A')
hell_dist = HellingerDistance(chunker=chunker, threshold=threshold)
hell_dist = HellingerDistance(chunker=chunker, threshold=threshold, feature_type=FeatureType.CONTINUOUS)
hell_dist.fit(reference)
distance = hell_dist.calculate(analysis)
assert np.round(distance, 2) == 0.72
Expand All @@ -172,7 +186,7 @@ def test_hellinger_for_categorical(): # noqa: D103
np.random.seed(1)
reference = pd.Series(np.random.choice(['a', 'b', 'c', 'd'], 10_000), name='A')
analysis = pd.Series(np.random.choice(['a', 'b', 'c', 'e'], 1000), name='A')
hell_dist = HellingerDistance(chunker=chunker, threshold=threshold)
hell_dist = HellingerDistance(chunker=chunker, threshold=threshold, feature_type=FeatureType.CATEGORICAL)
hell_dist.fit(reference)
distance = hell_dist.calculate(analysis)
assert np.round(distance, 2) == 0.5
Expand All @@ -181,11 +195,31 @@ def test_hellinger_for_categorical(): # noqa: D103
@pytest.mark.parametrize(
'method',
[
KolmogorovSmirnovStatistic(chunker=DefaultChunker(), threshold=ConstantThreshold(lower=-1, upper=None)),
LInfinityDistance(chunker=DefaultChunker(), threshold=ConstantThreshold(lower=-1, upper=None)),
JensenShannonDistance(chunker=DefaultChunker(), threshold=ConstantThreshold(lower=-1, upper=None)),
WassersteinDistance(chunker=DefaultChunker(), threshold=ConstantThreshold(lower=-1, upper=None)),
HellingerDistance(chunker=DefaultChunker(), threshold=ConstantThreshold(lower=-1, upper=None)),
KolmogorovSmirnovStatistic(
chunker=DefaultChunker(),
threshold=ConstantThreshold(lower=-1, upper=None),
feature_type=FeatureType.CONTINUOUS,
),
LInfinityDistance(
chunker=DefaultChunker(),
threshold=ConstantThreshold(lower=-1, upper=None),
feature_type=FeatureType.CATEGORICAL,
),
JensenShannonDistance(
chunker=DefaultChunker(),
threshold=ConstantThreshold(lower=-1, upper=None),
feature_type=FeatureType.CONTINUOUS,
),
WassersteinDistance(
chunker=DefaultChunker(),
threshold=ConstantThreshold(lower=-1, upper=None),
feature_type=FeatureType.CONTINUOUS,
),
HellingerDistance(
chunker=DefaultChunker(),
threshold=ConstantThreshold(lower=-1, upper=None),
feature_type=FeatureType.CONTINUOUS,
),
],
)
def test_method_logs_warning_when_lower_threshold_is_overridden_by_metric_limits(caplog, method): # noqa: D103
Expand All @@ -202,7 +236,9 @@ def test_method_logs_warning_when_lower_threshold_is_overridden_by_metric_limits
@pytest.mark.parametrize(
'method',
[
KolmogorovSmirnovStatistic(chunker=DefaultChunker(), threshold=ConstantThreshold(upper=2)),
KolmogorovSmirnovStatistic(
chunker=DefaultChunker(), threshold=ConstantThreshold(upper=2), feature_type=FeatureType.CONTINUOUS
),
],
)
def test_method_logs_warning_when_upper_threshold_is_overridden_by_metric_limits(caplog, method): # noqa: D103
Expand Down
Loading