Added failsafe for when subset size is greater than data size, plus a test case

CaderIdris · May 7, 2024 · 56e064d · 56e064d
1 parent cd31faf
commit 56e064d
Show file tree

Hide file tree

Showing 2 changed files with 36 additions and 9 deletions.
diff --git a/src/calidhayte/calibrate.py b/src/calidhayte/calibrate.py
@@ -369,11 +369,14 @@ def setup(
             error_string = f"{target} does not exist in both columns."
             raise ValueError(error_string)
         if subsample_data is not None:
-            x_data = cls.subsample_df(
-                x_data,
-                target,
-                subsample_data
-            )
+            try:
+                x_data = cls.subsample_df(
+                    x_data,
+                    target,
+                    subsample_data
+                )
+            except ValueError:
+                logger.warning('Subset size larger than dataset size')
         join_index = (
             x_data.join(y_data, how="inner", lsuffix="x", rsuffix="y")
             .dropna()
@@ -2474,7 +2477,7 @@ def subsample_df(
     ) -> pd.DataFrame:
         """Create stratified k-folds on continuous variable.
         """
-        _df = df.copy()
+        _df = df.copy().dropna(subset=target_var)
         _df["Group"] = pd.qcut(
             _df.loc[:, target_var], strat_groups, labels=False
         )

diff --git a/tests/test_calibrate.py b/tests/test_calibrate.py
@@ -49,7 +49,7 @@ def test_data_split(full_data, folds):
     """
     Tests whether data is split properly
     """
-    tests = dict()
+    tests = {}
     print(full_data["x"])
     print(full_data["y"])
     coeff_inst = Calibrate.setup(
@@ -85,7 +85,7 @@ def test_skl_cals(full_data, polynomial_degree, vif_bound, time_col):
     Combines all possible multivariate key combos with each skl calibration
     method except omp which needs at least 1 mv key
     """
-    tests = dict()
+    tests = {}
     funcs: List[Callable[..., None]] = [
         Calibrate.bayesian_ard,
         Calibrate.bayesian_ridge,
@@ -186,7 +186,7 @@ def test_skl_cals(full_data, polynomial_degree, vif_bound, time_col):
 )
 def test_subsample(full_data, proportion):
     """Test setting subsample of data."""
-    tests = dict()
+    tests = {}
     df_size = full_data['x'].shape[0]
     coeff_inst = Calibrate.setup(
         x_data=full_data["x"],
@@ -210,3 +210,27 @@ def test_subsample(full_data, proportion):
         if not result:
             print(f"{test}: {result}")
     assert all(tests.values())
+
+
+@pytest.mark.cal()
+def test_subsample_failsafe(full_data):
+    """Test that an oversized subsample falls back to the full data."""
+    tests = {}
+    df_size = full_data['x'].shape[0]
+    coeff_inst = Calibrate.setup(
+        x_data=full_data["x"],
+        y_data=full_data["y"],
+        target="x",
+        subsample_data=10000,
+    )
+    measurements = coeff_inst.return_measurements()
+
+    tests['Same size'] = (
+            measurements['x'].shape[0] == measurements['y'].shape[0]
+    )
+    tests['Correct size'] = measurements['x'].shape[0] == df_size
+
+    for test, result in tests.items():
+        if not result:
+            print(f"{test}: {result}")
+    assert all(tests.values())