Merge pull request #275 from juaml/fix/invert_cv_order
Invert order of CV so it runs the largest first
synchon authored Oct 17, 2024
2 parents d0fcf31 + ffca079 commit 2007273
Showing 6 changed files with 25 additions and 23 deletions.
1 change: 1 addition & 0 deletions docs/changes/newsfragments/275.enh
@@ -0,0 +1 @@
+Place the final model CV split at the beginning instead of the end of the CV iterator wrapper by `Fede Raimondo`_
8 changes: 4 additions & 4 deletions julearn/api.py
@@ -594,13 +594,13 @@ def run_cross_validation(
         )
 
     if include_final_model:
-        # If we include the final model, we need to remove the last item in
+        # If we include the final model, we need to remove the first item in
         # the scores as this is the final model
-        pipeline = scores["estimator"][-1]
+        pipeline = scores["estimator"][0]
         if return_estimator == "final":
             scores.pop("estimator")
-        scores = {k: v[:-1] for k, v in scores.items()}
-        fold_sizes = fold_sizes[:-1]
+        scores = {k: v[1:] for k, v in scores.items()}
+        fold_sizes = fold_sizes[1:]
 
     n_repeats = getattr(cv_outer, "n_repeats", 1)
     n_folds = len(scores["fit_time"]) // n_repeats
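Because the final-model split now comes first, the slicing above drops index 0 rather than index -1 from every scores entry. A minimal sketch of that bookkeeping with made-up numbers (the dict shape mirrors scikit-learn's cross_validate output; the values are illustrative only):

# Made-up cross_validate-style results; entry 0 is assumed to be the
# full-data (final model) fit, which now comes first in the fold order.
scores = {
    "fit_time": [9.0, 1.0, 1.1, 0.9],
    "test_score": [1.0, 0.8, 0.7, 0.9],
}
final_fit_time = scores["fit_time"][0]
scores = {k: v[1:] for k, v in scores.items()}  # keep only the real CV folds
assert len(scores["test_score"]) == 3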
5 changes: 3 additions & 2 deletions julearn/model_selection/final_model_cv.py
@@ -68,11 +68,12 @@ def split(
             profitting for joblib calls.
         """
-        yield from self.cv.split(X, y, groups)
+        # For the first fold, train on all samples and return only 2 for test
         all_inds = np.arange(len(X))
-        # For the last fold, train on all samples and return only 2 for testing
         yield all_inds, all_inds[:2]
+
+        yield from self.cv.split(X, y, groups)
 
     def get_n_splits(self) -> int:
         """Get the number of splits.
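For reference, a self-contained sketch of the split order this hunk produces; the class name FinalModelCVSketch and the KFold inner splitter are stand-ins, not julearn's actual wrapper:

import numpy as np
from sklearn.model_selection import KFold

class FinalModelCVSketch:
    """Stand-in wrapper: prepend one train-on-everything split."""

    def __init__(self, cv):
        self.cv = cv

    def split(self, X, y=None, groups=None):
        all_inds = np.arange(len(X))
        # First fold: fit on all samples; a token 2-sample test set keeps
        # the scoring machinery happy while that "test" result is discarded.
        yield all_inds, all_inds[:2]
        # Then the wrapped splitter's regular folds.
        yield from self.cv.split(X, y, groups)

splits = list(FinalModelCVSketch(KFold(n_splits=5)).split(np.zeros((20, 3))))
assert len(splits) == 6
assert len(splits[0][0]) == 20 and len(splits[0][1]) == 2

Yielding the biggest job first lets joblib start it immediately instead of leaving it as the straggler at the end of the queue, which is the scheduling win the commit message refers to.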
12 changes: 6 additions & 6 deletions julearn/model_selection/tests/test_final_model_cv.py
@@ -31,13 +31,13 @@ def test_final_model_cv() -> None:
     all_sk = list(sklearn_cv.split(X, y))
 
     assert len(all_ju) == len(all_sk) + 1
-    for i in range(10):
-        assert_array_equal(all_ju[i][0], all_sk[i][0])
-        assert_array_equal(all_ju[i][1], all_sk[i][1])
+    for i in range(1, 11):
+        assert_array_equal(all_ju[i][0], all_sk[i - 1][0])
+        assert_array_equal(all_ju[i][1], all_sk[i - 1][1])
 
-    assert all_ju[-1][0].shape[0] == n_samples
-    assert all_ju[-1][1].shape[0] == 2
-    assert_array_equal(all_ju[-1][0], np.arange(n_samples))
+    assert all_ju[0][0].shape[0] == n_samples
+    assert all_ju[0][1].shape[0] == 2
+    assert_array_equal(all_ju[0][0], np.arange(n_samples))
 
 
 def test_final_model_cv_mdsum() -> None:
2 changes: 1 addition & 1 deletion julearn/models/tests/test_models.py
@@ -189,7 +189,7 @@ def test_naive_bayes_estimators(
                 "estimator": DecisionTreeClassifier(random_state=42),
             },
         ),
-        ("gradientboost", GradientBoostingClassifier, {}),
+        ("gradientboost", GradientBoostingClassifier, {"random_state": 42}),
     ],
 )
 def test_classificationestimators(
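The fix above pins GradientBoostingClassifier's seed so repeated fits train identical models, which the test comparison presumably relies on once the fold order changed. A quick illustrative check of that property (dataset and sizes are arbitrary):

from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier

X, y = make_classification(n_samples=100, random_state=0)
preds_a = GradientBoostingClassifier(random_state=42).fit(X, y).predict(X)
preds_b = GradientBoostingClassifier(random_state=42).fit(X, y).predict(X)
assert (preds_a == preds_b).all()  # same seed -> identical models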
20 changes: 10 additions & 10 deletions julearn/tests/test_api.py
@@ -415,8 +415,8 @@ def test_tune_hyperparam_gridsearch(df_iris: pd.DataFrame) -> None:
     scoring = "accuracy"
 
     np.random.seed(42)
-    cv_outer = RepeatedKFold(n_splits=3, n_repeats=2)
-    cv_inner = RepeatedKFold(n_splits=3, n_repeats=2)
+    cv_outer = RepeatedKFold(n_splits=3, n_repeats=2, random_state=9)
+    cv_inner = RepeatedKFold(n_splits=3, n_repeats=2, random_state=10)
 
     model_params = {"svm__C": [0.01, 0.001]}
     search_params = {"cv": cv_inner}
@@ -438,8 +438,8 @@ def test_tune_hyperparam_gridsearch(df_iris: pd.DataFrame) -> None:
 
     # Now do the same with scikit-learn
     np.random.seed(42)
-    cv_outer = RepeatedKFold(n_splits=3, n_repeats=2)
-    cv_inner = RepeatedKFold(n_splits=3, n_repeats=2)
+    cv_outer = RepeatedKFold(n_splits=3, n_repeats=2, random_state=9)
+    cv_inner = RepeatedKFold(n_splits=3, n_repeats=2, random_state=10)
 
     clf = make_pipeline(SVC())
     gs = GridSearchCV(
@@ -672,8 +672,8 @@ def test_tune_hyperparams_multiple_grid(df_iris: pd.DataFrame) -> None:
     scoring = "accuracy"
 
     np.random.seed(42)
-    cv_outer = RepeatedKFold(n_splits=2, n_repeats=1)
-    cv_inner = RepeatedKFold(n_splits=2, n_repeats=1)
+    cv_outer = RepeatedKFold(n_splits=2, n_repeats=1, random_state=9)
+    cv_inner = RepeatedKFold(n_splits=2, n_repeats=1, random_state=10)
 
     search_params = {"cv": cv_inner}
     actual1, actual_estimator1 = run_cross_validation(
@@ -701,8 +701,8 @@ def test_tune_hyperparams_multiple_grid(df_iris: pd.DataFrame) -> None:
     )
 
     np.random.seed(42)
-    cv_outer = RepeatedKFold(n_splits=2, n_repeats=1)
-    cv_inner = RepeatedKFold(n_splits=2, n_repeats=1)
+    cv_outer = RepeatedKFold(n_splits=2, n_repeats=1, random_state=9)
+    cv_inner = RepeatedKFold(n_splits=2, n_repeats=1, random_state=10)
     search_params = {"cv": cv_inner}
     actual2, actual_estimator2 = run_cross_validation(
         X=X,
@@ -718,8 +718,8 @@ def test_tune_hyperparams_multiple_grid(df_iris: pd.DataFrame) -> None:
 
     # Now do the same with scikit-learn
     np.random.seed(42)
-    cv_outer = RepeatedKFold(n_splits=2, n_repeats=1)
-    cv_inner = RepeatedKFold(n_splits=2, n_repeats=1)
+    cv_outer = RepeatedKFold(n_splits=2, n_repeats=1, random_state=9)
+    cv_inner = RepeatedKFold(n_splits=2, n_repeats=1, random_state=10)
 
     clf = make_pipeline(SVC())
     grid = [
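All of these test updates follow one pattern: a RepeatedKFold built without random_state draws from NumPy's global RNG at split time, so reordering the folds (as this PR does) changes when that RNG is consumed, and the julearn and scikit-learn runs stop seeing identical splits. A small demonstration that a pinned splitter is immune to global RNG state (the seed 9 matches the diff above; the data is arbitrary):

import numpy as np
from sklearn.model_selection import RepeatedKFold

X = np.arange(24).reshape(12, 2)
splits_a = list(RepeatedKFold(n_splits=2, n_repeats=1, random_state=9).split(X))
np.random.rand(1000)  # consume the global RNG, as an extra fit would
splits_b = list(RepeatedKFold(n_splits=2, n_repeats=1, random_state=9).split(X))
for (tr_a, te_a), (tr_b, te_b) in zip(splits_a, splits_b):
    assert (tr_a == tr_b).all() and (te_a == te_b).all()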
