Merge pull request #28 from cparmet/more-assert-methods

Add more validation methods
cparmet · Jun 25, 2024 · ff91abb
2 parents 4b7ff72 + e19dcde
commit ff91abb
Show file tree

Hide file tree

Showing 19 changed files with 2,206 additions and 250 deletions.
diff --git a/.gitignore b/.gitignore
@@ -11,3 +11,4 @@ dev.ipynb
 dev_script.py
 
 dist/*
+site/*
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -13,9 +13,8 @@ repos:
     - id: debug-statements
     - id: detect-private-key
     - id: end-of-file-fixer
-    - id: trailing-whitespace # Disabled because some markdown styles require trailing whitespace
+    - id: trailing-whitespace
       exclude: md
-      # args: [--markdown-linebreak-ext=md]
 
 -   repo: https://github.com/psf/black
     rev: 22.12.0

diff --git a/README.md b/README.md
diff --git a/docs/API reference/DataFrameVet.md b/docs/API reference/DataFrameChecks.md
diff --git a/docs/API reference/SeriesVet.md b/docs/API reference/SeriesChecks.md
diff --git a/docs/usage.md b/docs/usage.md
diff --git a/pandas_checks/DataFrameChecks.py b/pandas_checks/DataFrameChecks.py
diff --git a/pandas_checks/SeriesChecks.py b/pandas_checks/SeriesChecks.py
diff --git a/pandas_checks/display.py b/pandas_checks/display.py
@@ -113,7 +113,7 @@ def _render_text(
             )
             display(
                 Markdown(
-                    f"<{tag} style='text-align: left'>{lead_in_rendered + ' ' if lead_in_rendered else ''}<span 'color:{text_color};' 'background-color:{text_background_color}'>{_filter_emojis(text)}</span></{tag}>"
+                    f"<{tag} style='text-align: left'>{lead_in_rendered + ' ' if lead_in_rendered else ''}<span style='color:{text_color}; background-color:{text_background_color}'>{_filter_emojis(text)}</span></{tag}>"
                 )
             )
 
@@ -301,7 +301,7 @@ def _lead_in(lead_in: Union[str, None], foreground: str, background: str) -> str
         The formatted lead-in text.
     """
     return (
-        f"<span style='color:{foreground}; background-color:{background}'>{_filter_emojis(lead_in).strip()}:</span>"
+        f"<span style='color:{foreground}; background-color:{background}'>{_filter_emojis(lead_in).strip()}</span>:"
         if lead_in
         else ""
     )

diff --git a/pandas_checks/run_checks.py b/pandas_checks/run_checks.py
@@ -2,8 +2,6 @@
 
 from typing import Any, Callable, List, Union
 
-import pandas as pd
-
 from .display import _display_check
 from .options import get_mode
 

diff --git a/pandas_checks/utils.py b/pandas_checks/utils.py
@@ -1,10 +1,14 @@
 """
 Utility functions for the pandas_checks package.
 """
+from datetime import datetime, timedelta
 from inspect import getsourcelines
-from typing import Callable
+from typing import Any, Callable, Type, Union
 
 import pandas as pd
+from pandas.core.groupby.groupby import DataError
+
+from .display import _display_line
 
 
 def _lambda_to_string(lambda_func: Callable) -> str:
@@ -22,3 +26,63 @@ def _lambda_to_string(lambda_func: Callable) -> str:
             Try other ways to get just the argument we want.
     """
     return "".join(getsourcelines(lambda_func)[0]).lstrip(" .")
+
+
+def _has_nulls(
+    data: Union[pd.DataFrame, pd.Series],
+    fail_message: str,
+    raise_exception: bool = True,
+    exception_to_raise: Type[BaseException] = DataError,
+) -> bool:
+    """Utility function to check for nulls as part of a larger check"""
+    if isinstance(data, pd.DataFrame):
+        has_nulls = data.isna().any().any()
+    elif isinstance(data, pd.Series):
+        has_nulls = data.isna().any()
+    else:
+        raise AttributeError(f"Unexpected data type in _has_nulls(): {type(data)}")
+
+    if has_nulls:
+        if raise_exception:
+            raise exception_to_raise(
+                f"{fail_message}: Nulls present (to disable, pass `assert_not_null=False`)"
+            )
+        else:
+            _display_line(
+                lead_in=fail_message,
+                line="Nulls present (to disable, pass `assert_not_null=False`)",
+                colors={
+                    "lead_in_text_color": pd.get_option(
+                        "pdchecks.fail_message_fg_color"
+                    ),
+                    "lead_in_background_color": pd.get_option(
+                        "pdchecks.fail_message_bg_color"
+                    ),
+                },
+            )
+    return has_nulls
+
+
+def _series_is_type(s: pd.Series, dtype: Type[Any]) -> bool:
+    """Utility function to check if a series has an expected type.
+    Includes special handling for strings, since 'object' type in Pandas
+    may not mean a string"""
+    if dtype in [str, "str"]:
+        return pd.api.types.is_string_dtype(s)
+    elif dtype in [datetime, "datetime", "date"]:
+        return pd.api.types.is_datetime64_any_dtype(
+            s
+        ) or pd.api.types.is_datetime64tz_dtype(s)
+    elif dtype in [timedelta, "timedelta"]:
+        return pd.api.types.is_timedelta64_dtype(s)
+    else:
+        return s.dtypes == dtype
+
+
+def _is_type(data: pd.DataFrame, dtype: Type[Any]) -> bool:
+    """Utility function to check if a dataframe's columns or one series has an expected type.
+    Includes special handling for strings, since 'object' type in Pandas
+    may not mean a string"""
+    if isinstance(data, pd.Series):
+        return _series_is_type(data, dtype)
+    return all([_series_is_type(data[col], dtype) for col in data.columns])
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "pandas-checks"
-version = "0.1.8"
+version = "0.2.0"
 description = "Non-invasive health checks for Pandas method chains"
 authors = ["Chad Parmet <[email protected]>"]
 readme = "README.md"

diff --git a/tests/cases_dataframechecks.py b/tests/cases_dataframechecks.py
@@ -1,4 +1,6 @@
-"""Dataframe methods to test in batch"""
+"""DataframeCheck methods to test in batch
+that they don't change the actual dataframe
+in the method chain """
 
 import pandas_checks as pdc
 
@@ -15,6 +17,100 @@ def method_assert_data():
     )
 
 
+def method_assert_datetime():
+    return lambda df, args: df.check.assert_datetime(
+        subset=args["first_num_col"],
+        raise_exception=False,
+    )
+
+
+def method_assert_float():
+    return lambda df, args: df.check.assert_float(
+        subset=args["first_num_col"],
+        raise_exception=False,
+    )
+
+
+def method_assert_int():
+    return lambda df, args: df.check.assert_int(
+        subset=args["first_num_col"],
+        raise_exception=False,
+    )
+
+
+def method_assert_less_than():
+    return lambda df, args: df.check.assert_less_than(
+        max=1000,
+        subset=args["first_num_col"],
+        raise_exception=False,
+    )
+
+
+def method_assert_greater_than():
+    return lambda df, args: df.check.assert_greater_than(
+        min=-1000,
+        subset=args["first_num_col"],
+        raise_exception=False,
+    )
+
+
+def method_assert_negative():
+    return lambda df, args: df.check.assert_negative(
+        subset=args["first_num_col"],
+        raise_exception=False,
+    )
+
+
+def method_assert_not_null():
+    return lambda df, args: df.check.assert_not_null(
+        subset=args["first_num_col"],
+        raise_exception=False,
+    )
+
+
+def method_assert_null():
+    return lambda df, args: df.check.assert_null(
+        subset=args["first_num_col"],
+        raise_exception=False,
+    )
+
+
+def method_assert_positive():
+    return lambda df, args: df.check.assert_positive(
+        subset=args["first_num_col"],
+        raise_exception=False,
+    )
+
+
+def method_assert_str():
+    return lambda df, args: df.check.assert_str(
+        subset=args["first_num_col"],
+        raise_exception=False,
+    )
+
+
+def method_assert_timedelta():
+    return lambda df, args: df.check.assert_timedelta(
+        subset=args["first_num_col"],
+        raise_exception=False,
+    )
+
+
+def method_assert_type():
+    return lambda df, args: df.check.assert_type(
+        dtype=float,
+        subset=args["first_num_col"],
+        raise_exception=False,
+    )
+
+
+def method_assert_unique():
+    return lambda df, args: df.check.assert_unique(
+        subset=args["first_num_col"],
+        raise_exception=False,
+    )
+
+
 def method_columns():
     return lambda df, _: df.check.columns(fn=lambda df: df.dropna(), check_name="Test")
diff --git a/.gitignore b/.gitignore
@@ -11,3 +11,4 @@ dev.ipynb
 dev_script.py
 
 dist/*
+site/*