Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update Missing Values Handling #378

Merged
merged 29 commits into from
May 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
ab7e79d
variable code fixes
nikml Apr 12, 2024
dd12310
bc auroc sampling error calculation
nikml Apr 12, 2024
b7a566b
refactor auroc sampling error nan handling
nikml Apr 16, 2024
d19353d
auroc realized perf nan handling
nikml Apr 16, 2024
da9cf05
realized ap missing value handling
nikml Apr 17, 2024
4fd75fe
upd bc metrics missing value handling
nikml Apr 19, 2024
588abe7
upd realized perf mc missing value handling
nikml Apr 19, 2024
47c0367
upd realized perf regr missing value handling and fixes
nikml Apr 19, 2024
a7a92d9
upd dle missing value handling
nikml Apr 19, 2024
a1d180f
wip update CBPE missing value handling
nikml Apr 19, 2024
aed2c48
update CBPE BC missing value handling
nikml Apr 22, 2024
3418be0
update MC CBPE missing value handling
nikml Apr 22, 2024
a86b638
remove redundant methods/classes
nikml Apr 23, 2024
d12d0e5
linting updates
nikml Apr 23, 2024
8b2d978
linting for DLE
nikml Apr 23, 2024
cf1878e
performance calculation linting updates
nikml Apr 23, 2024
68e6954
remove unneeded import
nikml Apr 23, 2024
c5ceabd
mypy fixes
nikml Apr 23, 2024
518162f
more mypy fixes
nikml Apr 23, 2024
b653620
mypy updates
nikml Apr 24, 2024
ee2b1c8
cbpe lingint
nikml Apr 24, 2024
1f1dbe8
sampling error update
nikml Apr 24, 2024
22eb707
ap fix
nikml Apr 24, 2024
c664133
code fixes wip
nikml Apr 24, 2024
782869a
nan code updates
nikml Apr 24, 2024
a67f196
Removed some superfluous comments
nnansters May 6, 2024
2e03ba6
Remove exception re-raise as it causes the "fallback scenario" to be …
nnansters May 6, 2024
332e9da
Merge branch 'refs/heads/main' into fix_nan_issue
nnansters May 7, 2024
f30e875
mypy and linting
nnansters May 7, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions nannyml/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -614,3 +614,33 @@
"\tLog-based metrics are not supported for negative target values.\n"
f"\tCheck '{column.name}' at rows {str(negative_item_indices)}."
)


def common_nan_removal(data: pd.DataFrame, selected_columns: List[str]) -> Tuple[pd.DataFrame, bool]:
    """Remove rows of a dataframe that contain NaN values in the selected columns.

    Parameters
    ----------
    data: pd.DataFrame
        Pandas dataframe containing data.
    selected_columns: List[str]
        Names of the columns that are checked for missing values.

    Returns
    -------
    df: pd.DataFrame
        New dataframe without the rows that have a NaN in any of ``selected_columns``. All columns of the
        original dataframe are returned, the index is reset and object dtypes are re-inferred.
    empty: bool
        ``True`` when the resulting dataframe has no rows left, ``False`` otherwise.

    Raises
    ------
    InvalidArgumentsException
        When not all of ``selected_columns`` are present in the columns of ``data``.
    """
    # Every selected name must be an actual column of `data`; anything else (presumably including a
    # `None` placeholder passed when no target column is available) is rejected here instead of
    # being silently ignored.
    # NOTE(review): Codecov reported this raise as not covered by tests.
    if not set(selected_columns) <= set(data.columns):
        raise InvalidArgumentsException(
            f"Selected columns: {selected_columns} not all present in provided data columns {list(data.columns)}"
        )
    df = data.dropna(axis=0, how='any', inplace=False, subset=selected_columns).reset_index(drop=True).infer_objects()
    # Only the row count matters: a frame that kept its columns but lost all rows counts as empty.
    empty = df.shape[0] == 0
    return df, empty
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

"""

from typing import List, Optional, Tuple, Union, Dict
from typing import Dict, List, Optional, Tuple, Union

import numpy as np
import pandas as pd
Expand Down
7 changes: 2 additions & 5 deletions nannyml/performance_calculation/calculator.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,11 +111,9 @@ def __init__(
When it is not given, only the ROC AUC and Average Precision metrics are supported.
problem_type: Union[str, ProblemType]
Determines which method to use. Allowed values are:

- 'regression'
- 'classification_binary'
- 'classification_multiclass'

y_pred_proba: ModelOutputsType, default=None
Name(s) of the column(s) containing your model output.
Pass a single string when there is only a single model output column, e.g. in binary classification cases.
Expand All @@ -124,7 +122,6 @@ def __init__(
timestamp_column_name: str, default=None
The name of the column containing the timestamp of the model prediction.
thresholds: dict

The default values are::

{
Expand Down Expand Up @@ -158,7 +155,7 @@ def __init__(
chunk_period: str, default=None
Splits the data according to the given period.
Only one of `chunk_size`, `chunk_number` or `chunk_period` should be given.
chunker : Chunker, default=None
chunker: Chunker, default=None
The `Chunker` used to split the data sets into a list of chunks.
normalize_confusion_matrix: str, default=None
Determines how the confusion matrix will be normalized. Allowed values are None, 'all', 'true' and
Expand Down Expand Up @@ -311,7 +308,7 @@ def _calculate(self, data: pd.DataFrame, *args, **kwargs) -> Result:
data = data.copy(deep=True)

# Setup for target completeness rate
data['NML_TARGET_INCOMPLETE'] = data[self.y_true].isna().astype(np.int16)
data[TARGET_COMPLETENESS_RATE_COLUMN_NAME] = data[self.y_true].isna().astype(np.int16)

# Generate chunks
if self.chunker is None:
Expand Down
9 changes: 8 additions & 1 deletion nannyml/performance_calculation/metrics/base.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Author: Niels Nuyttens <[email protected]>
#
# License: Apache Software License 2.0
"""Base Classes for performance calculation."""
nnansters marked this conversation as resolved.
Show resolved Hide resolved
import abc
import logging
from logging import Logger
Expand Down Expand Up @@ -134,7 +135,6 @@ def sampling_error(self, data: pd.DataFrame):

Returns
-------

sampling_error: float
The expected sampling error.

Expand All @@ -153,6 +153,7 @@ def alert(self, value: float) -> bool:
----------
value: float
Value of a calculated metric.

Returns
-------
bool: bool
Expand Down Expand Up @@ -206,18 +207,22 @@ def get_chunk_record(self, chunk_data: pd.DataFrame) -> Dict:

@property
def display_name(self) -> str:
"""Get metric display name, which is the value of the metric's ``name`` attribute."""
return self.name

@property
def column_name(self) -> str:
"""Get metric column name: the second item of the first entry in ``self.components``."""
return self.components[0][1]

@property
def display_names(self) -> List[str]:
"""Get metric display names: the first item of every entry in ``self.components``."""
return [c[0] for c in self.components]

@property
def column_names(self) -> List[str]:
"""Get metric column names: the second item of every entry in ``self.components``."""
return [c[1] for c in self.components]


Expand Down Expand Up @@ -256,6 +261,8 @@ def create(cls, key: str, use_case: ProblemType, **kwargs) -> Metric:

@classmethod
def register(cls, metric: str, use_case: ProblemType) -> Callable:
"""Register performance metric class in MetricFactory."""

def inner_wrapper(wrapped_class: Type[Metric]) -> Type[Metric]:
if metric in cls.registry:
if use_case in cls.registry[metric]:
Expand Down
Loading
Loading