Skip to content
This repository has been archived by the owner on Dec 18, 2023. It is now read-only.

Commit

Permalink
Merge pull request #3 from credo-ai/bugfix/json_nans
Browse files Browse the repository at this point in the history
Add helper function to container base class to replace NaNs with NoneType to accommodate JSON outputs
  • Loading branch information
esherman-credo authored Nov 28, 2022
2 parents 93f6528 + a344080 commit c816b2c
Show file tree
Hide file tree
Showing 7 changed files with 101 additions and 11 deletions.
2 changes: 1 addition & 1 deletion connect/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,6 @@
Credo AI Connect package
"""

__version__ = "0.0.3"
__version__ = "0.0.4"

__all__ = ["governance", "evidence", "utils"]
14 changes: 11 additions & 3 deletions connect/evidence/containers.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

import pandas as pd

from connect.utils import ValidationError
from connect.utils import Scrubber, ValidationError

from .evidence import MetricEvidence, TableEvidence

Expand Down Expand Up @@ -48,6 +48,10 @@ def __init__(self, evidence_class, data, labels=None, metadata=None):
def data(self):
return self._data

@property
def scrubbed_data(self):
return Scrubber.remove_NaNs(self._data)

@abstractmethod
def to_evidence(self):
pass
Expand All @@ -70,7 +74,7 @@ def __init__(self, data: pd.DataFrame, labels: dict = None, metadata: dict = Non

def to_evidence(self, **metadata):
evidence = []
for _, data in self._data.iterrows():
for _, data in self.scrubbed_data.iterrows():
evidence.append(
self.evidence_class(
additional_labels=self.labels, **data, **self.metadata, **metadata
Expand All @@ -96,7 +100,11 @@ def __init__(self, data: pd.DataFrame, labels: dict = None, metadata: dict = Non
def to_evidence(self, **metadata):
return [
self.evidence_class(
self._data.name, self._data, self.labels, **self.metadata, **metadata
self.scrubbed_data.name,
self.scrubbed_data,
self.labels,
**self.metadata,
**metadata,
)
]

Expand Down
10 changes: 7 additions & 3 deletions connect/evidence/deepchecks_evidence/containers.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,15 +22,19 @@ def __init__(
super().__init__(DeepchecksEvidence, data, labels, metadata)
self.name = name

@property
def scrubbed_data(self):
return self._data

def to_evidence(self, **metadata):
checks_2_df = {"Check_Name": list(), "Status": list()}
for check in self._data.get_not_passed_checks():
for check in self.scrubbed_data.get_not_passed_checks():
checks_2_df["Check_Name"].append(check.header)
checks_2_df["Status"].append("Not Passed")
for check in self._data.get_passed_checks():
for check in self.scrubbed_data.get_passed_checks():
checks_2_df["Check_Name"].append(check.header)
checks_2_df["Status"].append("Passed")
for check in self._data.get_not_ran_checks():
for check in self.scrubbed_data.get_not_ran_checks():
checks_2_df["Check_Name"].append(check.header)
checks_2_df["Status"].append("Not Run")

Expand Down
14 changes: 11 additions & 3 deletions connect/evidence/lens_evidence/containers.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import numpy as np

from connect.evidence import EvidenceContainer
from connect.utils import ValidationError
from connect.utils import Scrubber, ValidationError

from .evidence import DataProfilerEvidence, ModelProfilerEvidence
from .utils import get_pandas_profile_type
Expand All @@ -13,10 +15,14 @@ class DataProfilerContainer(EvidenceContainer):
def __init__(self, data, labels: dict = None, metadata: dict = None):
super().__init__(DataProfilerEvidence, data, labels, metadata)

@property
def scrubbed_data(self):
return Scrubber.remove_NaNs(self.data.get_description())

def to_evidence(self, **metadata):
return [
self.evidence_class(
self._data.get_description(), self.labels, **self.metadata, **metadata
self.scrubbed_data, self.labels, **self.metadata, **metadata
)
]

Expand All @@ -38,7 +44,9 @@ def __init__(self, data, labels=None, metadata=None):

def to_evidence(self, **metadata):
return [
self.evidence_class(self._data, self.labels, **self.metadata, **metadata)
self.evidence_class(
self.scrubbed_data, self.labels, **self.metadata, **metadata
)
]

def _validate(self, data):
Expand Down
1 change: 1 addition & 0 deletions connect/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,5 @@
"""

from .common import *
from .data_scrubbing import Scrubber
from .logging import *
2 changes: 1 addition & 1 deletion connect/utils/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ def default(self, obj):

def json_dumps(obj):
"""Custom json dumps with encoder"""
return json.dumps(obj, cls=CredoEncoder, indent=2)
return json.dumps(obj, cls=CredoEncoder, indent=2, default=str)


def dict_hash(dictionary: Dict[str, Any]) -> str:
Expand Down
69 changes: 69 additions & 0 deletions connect/utils/data_scrubbing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
from copy import deepcopy

import numpy as np
import pandas as pd


class Scrubber:
    """Replace NaN values with ``None`` so data can be serialized as JSON.

    All helpers are static; the class only serves as a namespace. Inputs are
    never mutated: every helper returns a new object.
    """

    @staticmethod
    def remove_NaNs(obj):
        """Return a copy of ``obj`` in which NaN values are replaced by ``None``.

        Parameters
        ----------
        obj : pd.DataFrame, np.ndarray, dict, list or any other object
            DataFrames, arrays, dicts and lists are scrubbed (dicts and lists
            recursively). A scalar float NaN becomes ``None``.

        Returns
        -------
        The scrubbed copy. Objects of any other type are returned unchanged
        (same object, not a copy) instead of being silently turned into
        ``None``.
        """
        if isinstance(obj, pd.DataFrame):
            return Scrubber._df_remove_NaNs(obj)
        if isinstance(obj, np.ndarray):
            return Scrubber._array_remove_NaNs(obj)
        if isinstance(obj, dict):
            return Scrubber._dict_remove_NaNs(obj)
        if isinstance(obj, list):
            return Scrubber._list_remove_NaNs(obj)
        if Scrubber._is_nan(obj):
            return None
        # Unsupported type: nothing to scrub, so hand the data back untouched.
        return obj

    @staticmethod
    def _is_nan(value) -> bool:
        """Tell whether ``value`` is a scalar float NaN.

        A value check is used rather than ``value is np.nan``: NaNs created by
        ``float("nan")``, NumPy scalars or arithmetic are distinct objects and
        would not be caught by an identity comparison.
        """
        return isinstance(value, (float, np.floating)) and bool(np.isnan(value))

    @staticmethod
    def _df_remove_NaNs(data: pd.DataFrame):
        """Return a copy of ``data`` with missing values replaced by ``None``.

        Assumes the DataFrame is well-formed: it does not contain lists,
        DataFrames, or other complex objects. A ``name`` attribute set on the
        input frame is carried over to the result.
        """
        # fillna/replace return new objects --> no deep copy concern
        name = getattr(data, "name", None)
        scrubbed = data.fillna(np.nan).replace([np.nan], [None])
        # When a column is literally called "name", attribute access yields that
        # column (a Series) rather than a table label; it must not be
        # truth-tested or assigned back over the scrubbed column.
        if name is not None and "name" not in data.columns:
            scrubbed.name = name
        return scrubbed

    @staticmethod
    def _list_remove_NaNs(data: list):
        """Return a deep copy of ``data`` with NaNs replaced at every nesting level."""
        # Copy once at the top, then scrub the copy in place; nested containers
        # are not copied again during recursion.
        return Scrubber._scrub_in_place(deepcopy(data))

    @staticmethod
    def _array_remove_NaNs(data: np.ndarray):
        """Return a new array with missing entries replaced by ``None``.

        Assumes the array is well-formed: it does not contain lists or other
        complex objects.
        """
        # pd.isna (unlike np.isnan) also accepts object and string arrays.
        # np.where builds a new array --> no deep copy concern
        return np.where(pd.isna(data), None, data)

    @staticmethod
    def _dict_remove_NaNs(data: dict):
        """Return a deep copy of ``data`` with NaNs replaced at every nesting level."""
        return Scrubber._scrub_in_place(deepcopy(data))

    @staticmethod
    def _scrub_in_place(container):
        """Scrub an already-copied list or dict in place and return it.

        Nested lists and dicts are scrubbed recursively. DataFrames and arrays
        are swapped for scrubbed copies. Other values are assumed not to be
        iterables that could hide NaNs and are left as they are.
        """
        if isinstance(container, dict):
            entries = list(container.items())
        else:
            entries = list(enumerate(container))
        for key, val in entries:
            if isinstance(val, pd.DataFrame):
                container[key] = Scrubber._df_remove_NaNs(val)
            elif isinstance(val, np.ndarray):
                container[key] = Scrubber._array_remove_NaNs(val)
            elif isinstance(val, (dict, list)):
                Scrubber._scrub_in_place(val)
            elif Scrubber._is_nan(val):
                container[key] = None
        return container

0 comments on commit c816b2c

Please sign in to comment.