Skip to content

Commit

Permalink
Added failsafe for when the subset size is greater than the data size, plus a test case
Browse files Browse the repository at this point in the history
  • Loading branch information
CaderIdris committed May 7, 2024
1 parent cd31faf commit 56e064d
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 9 deletions.
15 changes: 9 additions & 6 deletions src/calidhayte/calibrate.py
Original file line number Diff line number Diff line change
Expand Up @@ -369,11 +369,14 @@ def setup(
error_string = f"{target} does not exist in both columns."
raise ValueError(error_string)
if subsample_data is not None:
x_data = cls.subsample_df(
x_data,
target,
subsample_data
)
try:
x_data = cls.subsample_df(
x_data,
target,
subsample_data
)
except ValueError:
logger.warning('Subset size larger than dataset size')
join_index = (
x_data.join(y_data, how="inner", lsuffix="x", rsuffix="y")
.dropna()
Expand Down Expand Up @@ -2474,7 +2477,7 @@ def subsample_df(
) -> pd.DataFrame:
"""Create stratified k-folds on continuous variable.
"""
_df = df.copy()
_df = df.copy().dropna(subset=target_var)
_df["Group"] = pd.qcut(
_df.loc[:, target_var], strat_groups, labels=False
)
Expand Down
30 changes: 27 additions & 3 deletions tests/test_calibrate.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def test_data_split(full_data, folds):
"""
Tests whether data is split properly
"""
tests = dict()
tests = {}
print(full_data["x"])
print(full_data["y"])
coeff_inst = Calibrate.setup(
Expand Down Expand Up @@ -85,7 +85,7 @@ def test_skl_cals(full_data, polynomial_degree, vif_bound, time_col):
Combines all possible multivariate key combos with each skl calibration
method except omp which needs at least 1 mv key
"""
tests = dict()
tests = {}
funcs: List[Callable[..., None]] = [
Calibrate.bayesian_ard,
Calibrate.bayesian_ridge,
Expand Down Expand Up @@ -186,7 +186,7 @@ def test_skl_cals(full_data, polynomial_degree, vif_bound, time_col):
)
def test_subsample(full_data, proportion):
"""Test setting subsample of data."""
tests = dict()
tests = {}
df_size = full_data['x'].shape[0]
coeff_inst = Calibrate.setup(
x_data=full_data["x"],
Expand All @@ -210,3 +210,27 @@ def test_subsample(full_data, proportion):
if not result:
print(f"{test}: {result}")
assert all(tests.values())


@pytest.mark.cal()
def test_subsample_failsafe(full_data):
    """Check that an oversized subsample request keeps the full dataset.

    ``Calibrate.setup`` is called with ``subsample_data=10000``, which is
    presumably larger than the ``full_data`` fixture (TODO confirm against
    the fixture definition). The test then asserts that the returned ``x``
    and ``y`` measurements have the same number of rows, and that this
    number equals the row count of the original ``x`` input.
    """
    # Row count of the input, captured before setup runs.
    expected_rows = full_data['x'].shape[0]
    calibrator = Calibrate.setup(
        x_data=full_data["x"],
        y_data=full_data["y"],
        target="x",
        subsample_data=10000,
    )
    measurements = calibrator.return_measurements()
    x_rows = measurements['x'].shape[0]
    y_rows = measurements['y'].shape[0]

    tests = {
        'Same size': x_rows == y_rows,
        'Correct size': x_rows == expected_rows,
    }

    # Print each failed check by name before the combined assertion.
    failures = {name: ok for name, ok in tests.items() if not ok}
    for test, result in failures.items():
        print(f"{test}: {result}")
    assert all(tests.values())

0 comments on commit 56e064d

Please sign in to comment.