Skip to content

Commit

Permalink
Adjusting some comments and general linting stuff
Browse files Browse the repository at this point in the history
Adjusting some comments and general linting stuff
  • Loading branch information
Niels Nuyttens committed Jul 19, 2024
1 parent 6100ab8 commit c8409db
Show file tree
Hide file tree
Showing 6 changed files with 43 additions and 21 deletions.
2 changes: 1 addition & 1 deletion nannyml/data_quality/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,4 @@

from .missing import MissingValuesCalculator
from .unseen import UnseenValuesCalculator
from .range import NumericalRangeCalculator
from .range import NumericalRangeCalculator
31 changes: 20 additions & 11 deletions nannyml/data_quality/range/calculator.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
#
# License: Apache Software License 2.0

"""Continous numerical variable range monitor to ensure range supplied is within training bounds."""
"""Continuous numerical variable range monitor to ensure range supplied is within training bounds."""

from typing import Any, Dict, List, Optional, Union

Expand All @@ -15,7 +15,6 @@
from nannyml.exceptions import InvalidArgumentsException
from nannyml.thresholds import Threshold, calculate_threshold_values, ConstantThreshold
from nannyml.usage_logging import UsageEvent, log_usage

from .result import Result

"""
Expand All @@ -24,7 +23,7 @@


class NumericalRangeCalculator(AbstractCalculator):
"""NumericalRangeCalculator implementation to ensure inference data numerical ranges match training."""
"""NumericalRangeCalculator ensures the monitoring data set numerical ranges match the reference data set ones."""

def __init__(
self,
Expand Down Expand Up @@ -68,7 +67,8 @@ def __init__(
--------
>>> import nannyml as nml
>>> reference_df, analysis_df, _ = nml.load_synthetic_car_price_dataset()
>>> feature_column_names = [col for col in reference_df.columns if col not in ['fuel','transmission','timestamp', 'y_pred', 'y_true']]
>>> feature_column_names = [col for col in reference_df.columns if col not in [
... 'fuel','transmission','timestamp', 'y_pred', 'y_true']]
>>> calc = nml.NumericalRangeCalculator(
... column_names=feature_column_names,
... timestamp_column_name='timestamp',
Expand Down Expand Up @@ -117,7 +117,7 @@ def __init__(
def _calculate_out_of_range_stats(self, data: pd.Series, lower_bound: float, upper_bound: float):
"""Count the values in ``data`` that fall outside ``[lower_bound, upper_bound]``.

A value is counted when it is strictly smaller than ``lower_bound`` or
strictly greater than ``upper_bound``; values equal to a bound are not counted.

Parameters
----------
data : pd.Series
    The values to check.
lower_bound : float
    Smallest value still considered in range.
upper_bound : float
    Largest value still considered in range.

Returns
-------
The number of out-of-range values, or, when ``self.normalize`` is truthy,
that number divided by the total number of rows in ``data``.
"""
# TODO (original author's note): make this calc out of range stats
# Total number of rows, used as the denominator when normalizing.
count_tot = data.shape[0]
count_out_of_range = ((data < lower_bound) | (data > upper_bound)).sum()
count_out_of_range = ((data < lower_bound) | (data > upper_bound)).sum()
if self.normalize:
# Turn the absolute count into a fraction of all rows.
count_out_of_range = count_out_of_range / count_tot
return count_out_of_range
Expand All @@ -138,7 +138,7 @@ def _fit(self, reference_data: pd.DataFrame, *args, **kwargs):
f"Specified columns_names for NumericalRangeCalculator must all be continuous.\n"
f"Categorical columns found:\n{categorical_column_names}"
)

for col in self.column_names:
self._reference_value_ranges[col] = [reference_data[col].min(), reference_data[col].max()]

Expand Down Expand Up @@ -212,7 +212,10 @@ def _calculate_for_column(self, data: pd.DataFrame, column_name: str) -> Dict[st

def _set_metric_thresholds(self, result_data: pd.DataFrame):
for column_name in self.column_names:
self._lower_alert_thresholds[column_name], self._upper_alert_thresholds[column_name] = calculate_threshold_values( # noqa: E501
(
self._lower_alert_thresholds[column_name],
self._upper_alert_thresholds[column_name],
) = calculate_threshold_values( # noqa: E501
threshold=self.threshold,
data=result_data.loc[:, (column_name, 'value')],
lower_threshold_value_limit=self.lower_threshold_value_limit,
Expand All @@ -227,11 +230,17 @@ def _populate_alert_thresholds(self, result_data: pd.DataFrame) -> pd.DataFrame:
result_data[(column_name, 'alert')] = result_data.apply(
lambda row: True
if (
row[(column_name, 'value')] > (
np.inf if row[(column_name, 'upper_threshold')] is None else row[(column_name, 'upper_threshold')] # noqa: E501
row[(column_name, 'value')]
> (
np.inf
if row[(column_name, 'upper_threshold')] is None
else row[(column_name, 'upper_threshold')] # noqa: E501
)
or row[(column_name, 'value')] < (
-np.inf if row[(column_name, 'lower_threshold')] is None else row[(column_name, 'lower_threshold')] # noqa: E501
or row[(column_name, 'value')]
< (
-np.inf
if row[(column_name, 'lower_threshold')] is None
else row[(column_name, 'lower_threshold')] # noqa: E501
)
)
else False,
Expand Down
3 changes: 2 additions & 1 deletion nannyml/data_quality/range/result.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,8 @@ def plot(
--------
>>> import nannyml as nml
>>> reference, analysis, _ = nml.load_synthetic_car_price_dataset()
>>> column_names = [col for col in reference.columns if col not in ['car_age', 'km_driven', 'price_new', 'accident_count', 'door_count','timestamp', 'y_pred', 'y_true']]
>>> column_names = [col for col in reference.columns if col not in [
... 'car_age', 'km_driven', 'price_new', 'accident_count', 'door_count','timestamp', 'y_pred', 'y_true']]
>>> calc = nml.UnseenValuesCalculator(
... column_names=column_names,
... timestamp_column_name='timestamp',
Expand Down
23 changes: 17 additions & 6 deletions nannyml/data_quality/unseen/calculator.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@

from nannyml.base import AbstractCalculator, _list_missing, _split_features_by_type
from nannyml.chunk import Chunker

# from nannyml.data_quality.base import _add_alert_flag
from nannyml.exceptions import InvalidArgumentsException
from nannyml.thresholds import ConstantThreshold, Threshold, calculate_threshold_values
Expand Down Expand Up @@ -69,7 +70,8 @@ def __init__(
--------
>>> import nannyml as nml
>>> reference, analysis, _ = nml.load_synthetic_car_price_dataset()
>>> column_names = [col for col in reference.columns if col not in ['car_age', 'km_driven', 'price_new', 'accident_count', 'door_count','timestamp', 'y_pred', 'y_true']]
>>> column_names = [col for col in reference.columns if col not in [
... 'car_age', 'km_driven', 'price_new', 'accident_count', 'door_count','timestamp', 'y_pred', 'y_true']]
>>> calc = nml.UnseenValuesCalculator(
... column_names=column_names,
... timestamp_column_name='timestamp',
Expand Down Expand Up @@ -217,7 +219,10 @@ def _calculate_for_column(self, data: pd.DataFrame, column_name: str) -> Dict[st

def _set_metric_thresholds(self, result_data: pd.DataFrame):
for column_name in self.column_names:
self._lower_alert_thresholds[column_name], self._upper_alert_thresholds[column_name] = calculate_threshold_values( # noqa: E501
(
self._lower_alert_thresholds[column_name],
self._upper_alert_thresholds[column_name],
) = calculate_threshold_values( # noqa: E501
threshold=self.threshold,
data=result_data.loc[:, (column_name, 'value')],
lower_threshold_value_limit=self.lower_threshold_value_limit,
Expand All @@ -232,11 +237,17 @@ def _populate_alert_thresholds(self, result_data: pd.DataFrame) -> pd.DataFrame:
result_data[(column_name, 'alert')] = result_data.apply(
lambda row: True
if (
row[(column_name, 'value')] > (
np.inf if row[(column_name, 'upper_threshold')] is None else row[(column_name, 'upper_threshold')] # noqa: E501
row[(column_name, 'value')]
> (
np.inf
if row[(column_name, 'upper_threshold')] is None
else row[(column_name, 'upper_threshold')] # noqa: E501
)
or row[(column_name, 'value')] < (
-np.inf if row[(column_name, 'lower_threshold')] is None else row[(column_name, 'lower_threshold')] # noqa: E501
or row[(column_name, 'value')]
< (
-np.inf
if row[(column_name, 'lower_threshold')] is None
else row[(column_name, 'lower_threshold')] # noqa: E501
)
)
else False,
Expand Down
3 changes: 2 additions & 1 deletion nannyml/data_quality/unseen/result.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,8 @@ def plot(
--------
>>> import nannyml as nml
>>> reference, analysis, _ = nml.load_synthetic_car_price_dataset()
>>> column_names = [col for col in reference.columns if col not in ['car_age', 'km_driven', 'price_new', 'accident_count', 'door_count','timestamp', 'y_pred', 'y_true']]
>>> column_names = [col for col in reference.columns if col not in [
... 'car_age', 'km_driven', 'price_new', 'accident_count', 'door_count','timestamp', 'y_pred', 'y_true']]
>>> calc = nml.UnseenValuesCalculator(
... column_names=column_names,
... timestamp_column_name='timestamp',
Expand Down
2 changes: 1 addition & 1 deletion nannyml/usage_logging.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ class UsageEvent(str, Enum):

DQ_CALC_VALUES_OUT_OF_RANGE_FIT = "Data Quality Calculator Values Out Of Range fit"
DQ_CALC_VALUES_OUT_OF_RANGE_RUN = "Data Quality Calculator Values Out Of Range run"
DQ_CALC_VALUES_OUT_OF_RANGE_PLOT = "Data Quality Calculator Values Out Of Range Plot"
DQ_CALC_VALUES_OUT_OF_RANGE_PLOT = "Data Quality Calculator Values Out Of Range Plot"

UNIVAR_DRIFT_CALC_FIT = "Univariate drift calculator fit"
UNIVAR_DRIFT_CALC_RUN = "Univariate drift calculator run"
Expand Down

0 comments on commit c8409db

Please sign in to comment.