Merge pull request #275 from juaml/fix/invert_cv_order
Invert order of CV so it runs the largest first
synchon authored Oct 17, 2024
2 parents d0fcf31 + ffca079 commit 2007273
Showing 6 changed files with 25 additions and 23 deletions.
1 change: 1 addition & 0 deletions docs/changes/newsfragments/275.enh
@@ -0,0 +1 @@
+Place the final model CV split at the beginning instead of the end of the CV iterator wrapper by `Fede Raimondo`_
8 changes: 4 additions & 4 deletions julearn/api.py
@@ -594,13 +594,13 @@ def run_cross_validation(
         )
 
     if include_final_model:
-        # If we include the final model, we need to remove the last item in
+        # If we include the final model, we need to remove the first item in
         # the scores as this is the final model
-        pipeline = scores["estimator"][-1]
+        pipeline = scores["estimator"][0]
         if return_estimator == "final":
             scores.pop("estimator")
-        scores = {k: v[:-1] for k, v in scores.items()}
-        fold_sizes = fold_sizes[:-1]
+        scores = {k: v[1:] for k, v in scores.items()}
+        fold_sizes = fold_sizes[1:]
 
     n_repeats = getattr(cv_outer, "n_repeats", 1)
     n_folds = len(scores["fit_time"]) // n_repeats
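Because the final-model split now comes first, the slicing above drops index 0 rather than index -1 from every scores entry. A minimal sketch of that bookkeeping with made-up numbers (the dict shape mirrors scikit-learn's cross_validate output; the values are illustrative only):

# Made-up cross_validate-style results; entry 0 is assumed to be the
# full-data (final model) fit, which now comes first in the fold order.
scores = {
    "fit_time": [9.0, 1.0, 1.1, 0.9],
    "test_score": [1.0, 0.8, 0.7, 0.9],
}
final_fit_time = scores["fit_time"][0]
scores = {k: v[1:] for k, v in scores.items()}  # keep only the real CV folds
assert len(scores["test_score"]) == 3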
5 changes: 3 additions & 2 deletions julearn/model_selection/final_model_cv.py
@@ -68,11 +68,12 @@ def split(
             profitting for joblib calls.
         """
-        yield from self.cv.split(X, y, groups)
+        # For the first fold, train on all samples and return only 2 for test
         all_inds = np.arange(len(X))
-        # For the last fold, train on all samples and return only 2 for testing
         yield all_inds, all_inds[:2]
+
+        yield from self.cv.split(X, y, groups)
 
     def get_n_splits(self) -> int:
         """Get the number of splits.
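For reference, a self-contained sketch of the split order this hunk produces; the class name FinalModelCVSketch and the KFold inner splitter are stand-ins, not julearn's actual wrapper:

import numpy as np
from sklearn.model_selection import KFold

class FinalModelCVSketch:
    """Stand-in wrapper: prepend one train-on-everything split."""

    def __init__(self, cv):
        self.cv = cv

    def split(self, X, y=None, groups=None):
        all_inds = np.arange(len(X))
        # First fold: fit on all samples; a token 2-sample test set keeps
        # the scoring machinery happy while that "test" result is discarded.
        yield all_inds, all_inds[:2]
        # Then the wrapped splitter's regular folds.
        yield from self.cv.split(X, y, groups)

splits = list(FinalModelCVSketch(KFold(n_splits=5)).split(np.zeros((20, 3))))
assert len(splits) == 6
assert len(splits[0][0]) == 20 and len(splits[0][1]) == 2

Yielding the biggest job first lets joblib start it immediately instead of leaving it as the straggler at the end of the queue, which is the scheduling win the commit message refers to.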
12 changes: 6 additions & 6 deletions julearn/model_selection/tests/test_final_model_cv.py
@@ -31,13 +31,13 @@ def test_final_model_cv() -> None:
     all_sk = list(sklearn_cv.split(X, y))
 
     assert len(all_ju) == len(all_sk) + 1
-    for i in range(10):
-        assert_array_equal(all_ju[i][0], all_sk[i][0])
-        assert_array_equal(all_ju[i][1], all_sk[i][1])
+    for i in range(1, 11):
+        assert_array_equal(all_ju[i][0], all_sk[i - 1][0])
+        assert_array_equal(all_ju[i][1], all_sk[i - 1][1])
 
-    assert all_ju[-1][0].shape[0] == n_samples
-    assert all_ju[-1][1].shape[0] == 2
-    assert_array_equal(all_ju[-1][0], np.arange(n_samples))
+    assert all_ju[0][0].shape[0] == n_samples
+    assert all_ju[0][1].shape[0] == 2
+    assert_array_equal(all_ju[0][0], np.arange(n_samples))
 
 
 def test_final_model_cv_mdsum() -> None:
2 changes: 1 addition & 1 deletion julearn/models/tests/test_models.py
@@ -189,7 +189,7 @@ def test_naive_bayes_estimators(
                 "estimator": DecisionTreeClassifier(random_state=42),
             },
         ),
-        ("gradientboost", GradientBoostingClassifier, {}),
+        ("gradientboost", GradientBoostingClassifier, {"random_state": 42}),
     ],
 )
 def test_classificationestimators(
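The fix above pins GradientBoostingClassifier's seed so repeated fits train identical models, which the test comparison presumably relies on once the fold order changed. A quick illustrative check of that property (dataset and sizes are arbitrary):

from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier

X, y = make_classification(n_samples=100, random_state=0)
preds_a = GradientBoostingClassifier(random_state=42).fit(X, y).predict(X)
preds_b = GradientBoostingClassifier(random_state=42).fit(X, y).predict(X)
assert (preds_a == preds_b).all()  # same seed -> identical models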
20 changes: 10 additions & 10 deletions julearn/tests/test_api.py
@@ -415,8 +415,8 @@ def test_tune_hyperparam_gridsearch(df_iris: pd.DataFrame) -> None:
     scoring = "accuracy"
 
     np.random.seed(42)
-    cv_outer = RepeatedKFold(n_splits=3, n_repeats=2)
-    cv_inner = RepeatedKFold(n_splits=3, n_repeats=2)
+    cv_outer = RepeatedKFold(n_splits=3, n_repeats=2, random_state=9)
+    cv_inner = RepeatedKFold(n_splits=3, n_repeats=2, random_state=10)
 
     model_params = {"svm__C": [0.01, 0.001]}
     search_params = {"cv": cv_inner}
@@ -438,8 +438,8 @@ def test_tune_hyperparam_gridsearch(df_iris: pd.DataFrame) -> None:
 
     # Now do the same with scikit-learn
     np.random.seed(42)
-    cv_outer = RepeatedKFold(n_splits=3, n_repeats=2)
-    cv_inner = RepeatedKFold(n_splits=3, n_repeats=2)
+    cv_outer = RepeatedKFold(n_splits=3, n_repeats=2, random_state=9)
+    cv_inner = RepeatedKFold(n_splits=3, n_repeats=2, random_state=10)
 
     clf = make_pipeline(SVC())
     gs = GridSearchCV(
@@ -672,8 +672,8 @@ def test_tune_hyperparams_multiple_grid(df_iris: pd.DataFrame) -> None:
     scoring = "accuracy"
 
     np.random.seed(42)
-    cv_outer = RepeatedKFold(n_splits=2, n_repeats=1)
-    cv_inner = RepeatedKFold(n_splits=2, n_repeats=1)
+    cv_outer = RepeatedKFold(n_splits=2, n_repeats=1, random_state=9)
+    cv_inner = RepeatedKFold(n_splits=2, n_repeats=1, random_state=10)
 
     search_params = {"cv": cv_inner}
     actual1, actual_estimator1 = run_cross_validation(
@@ -701,8 +701,8 @@ def test_tune_hyperparams_multiple_grid(df_iris: pd.DataFrame) -> None:
     )
 
     np.random.seed(42)
-    cv_outer = RepeatedKFold(n_splits=2, n_repeats=1)
-    cv_inner = RepeatedKFold(n_splits=2, n_repeats=1)
+    cv_outer = RepeatedKFold(n_splits=2, n_repeats=1, random_state=9)
+    cv_inner = RepeatedKFold(n_splits=2, n_repeats=1, random_state=10)
     search_params = {"cv": cv_inner}
     actual2, actual_estimator2 = run_cross_validation(
         X=X,
@@ -718,8 +718,8 @@ def test_tune_hyperparams_multiple_grid(df_iris: pd.DataFrame) -> None:
 
     # Now do the same with scikit-learn
     np.random.seed(42)
-    cv_outer = RepeatedKFold(n_splits=2, n_repeats=1)
-    cv_inner = RepeatedKFold(n_splits=2, n_repeats=1)
+    cv_outer = RepeatedKFold(n_splits=2, n_repeats=1, random_state=9)
+    cv_inner = RepeatedKFold(n_splits=2, n_repeats=1, random_state=10)
 
     clf = make_pipeline(SVC())
     grid = [
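All of these test updates follow one pattern: a RepeatedKFold built without random_state draws from NumPy's global RNG at split time, so reordering the folds (as this PR does) changes when that RNG is consumed, and the julearn and scikit-learn runs stop seeing identical splits. A small demonstration that a pinned splitter is immune to global RNG state (the seed 9 matches the diff above; the data is arbitrary):

import numpy as np
from sklearn.model_selection import RepeatedKFold

X = np.arange(24).reshape(12, 2)
splits_a = list(RepeatedKFold(n_splits=2, n_repeats=1, random_state=9).split(X))
np.random.rand(1000)  # consume the global RNG, as an extra fit would
splits_b = list(RepeatedKFold(n_splits=2, n_repeats=1, random_state=9).split(X))
for (tr_a, te_a), (tr_b, te_b) in zip(splits_a, splits_b):
    assert (tr_a == tr_b).all() and (te_a == te_b).all()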
