From 5b538f94653841d86c2f08349b1874f1c2882aa8 Mon Sep 17 00:00:00 2001 From: thivyanth Date: Tue, 1 Oct 2024 11:05:37 -0400 Subject: [PATCH 1/5] implement partial evaluation for missing splits --- mteb/evaluation/MTEB.py | 39 ++++++++++++++++++++++++++++++++------- 1 file changed, 32 insertions(+), 7 deletions(-) diff --git a/mteb/evaluation/MTEB.py b/mteb/evaluation/MTEB.py index 70f3e21ca8..7189b813e5 100644 --- a/mteb/evaluation/MTEB.py +++ b/mteb/evaluation/MTEB.py @@ -339,14 +339,26 @@ def run( # skip evaluation if results folder exists and overwrite_results is False if output_path: save_path = output_path / f"{task.metadata.name}{task.save_suffix}.json" - if save_path.exists() and not overwrite_results: - logger.info( - f"{task.metadata.name} results already exists. Loading results from disk. Set overwrite_results=True to overwrite." + existing_results = self.load_existing_results(save_path) + if existing_results and not overwrite_results: + task_eval_splits = ( + eval_splits if eval_splits is not None else task.eval_splits ) - mteb_results = MTEBResults.from_disk(save_path) - evaluation_results.append(mteb_results) - del self.tasks[0] # empty memory - continue + missing_splits = self.compare_splits_and_subsets(existing_results, task_eval_splits) + if not missing_splits: + logger.info( + f"{task.metadata.name} results already exists. Loading results from disk. Set overwrite_results=True to overwrite." + ) + mteb_results = MTEBResults.from_disk(save_path) + evaluation_results.append(mteb_results) + del self.tasks[0] # empty memory + continue + else: + logger.info( + f"{task.metadata.name} results exist but missing splits: {missing_splits}. Running evaluation for missing splits." + ) + task_eval_splits = missing_splits + try: task_eval_splits = ( eval_splits if eval_splits is not None else task.eval_splits @@ -488,3 +500,16 @@ def _save_model_metadata(model_meta: ModelMeta, output_folder: Path) -> None: with save_path.open("w") as f: json.dump(model_meta.to_dict(), f) + + def load_existing_results(self, save_path): + if save_path.exists(): + with open(save_path, "r") as f: + return json.load(f) + return None + + def compare_splits_and_subsets(self, existing_results, task_eval_splits): + missing_splits = [] + for split in task_eval_splits: + if split not in existing_results: + missing_splits.append(split) + return missing_splits \ No newline at end of file From 7cec0d7533e902024ddb36f531b2558b7a1debe0 Mon Sep 17 00:00:00 2001 From: thivyanth Date: Tue, 1 Oct 2024 11:09:11 -0400 Subject: [PATCH 2/5] lint --- mteb/evaluation/MTEB.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/mteb/evaluation/MTEB.py b/mteb/evaluation/MTEB.py index 7189b813e5..65f919e0b3 100644 --- a/mteb/evaluation/MTEB.py +++ b/mteb/evaluation/MTEB.py @@ -344,7 +344,9 @@ def run( task_eval_splits = ( eval_splits if eval_splits is not None else task.eval_splits ) - missing_splits = self.compare_splits_and_subsets(existing_results, task_eval_splits) + missing_splits = self.compare_splits_and_subsets( + existing_results, task_eval_splits + ) if not missing_splits: logger.info( f"{task.metadata.name} results already exists. Loading results from disk. Set overwrite_results=True to overwrite." 
@@ -503,7 +505,7 @@ def _save_model_metadata(model_meta: ModelMeta, output_folder: Path) -> None: def load_existing_results(self, save_path): if save_path.exists(): - with open(save_path, "r") as f: + with open(save_path) as f: return json.load(f) return None @@ -512,4 +514,4 @@ def compare_splits_and_subsets(self, existing_results, task_eval_splits): for split in task_eval_splits: if split not in existing_results: missing_splits.append(split) - return missing_splits \ No newline at end of file + return missing_splits From 00d92b8f81e399dc017104854b7d3d09db4f9322 Mon Sep 17 00:00:00 2001 From: thivyanth Date: Fri, 4 Oct 2024 09:55:38 -0400 Subject: [PATCH 3/5] requested changes done from scratch --- mteb/evaluation/MTEB.py | 134 ++++++++++++++++++++++++++-------------- 1 file changed, 87 insertions(+), 47 deletions(-) diff --git a/mteb/evaluation/MTEB.py b/mteb/evaluation/MTEB.py index 65f919e0b3..32632a03a1 100644 --- a/mteb/evaluation/MTEB.py +++ b/mteb/evaluation/MTEB.py @@ -13,6 +13,7 @@ import datasets from sentence_transformers import SentenceTransformer +from mteb.abstasks.AbsTask import ScoresDict from mteb.encoder_interface import Encoder from mteb.model_meta import ModelMeta from mteb.models import model_meta_from_sentence_transformers @@ -273,6 +274,59 @@ def _run_eval( tock = time() return results, tick, tock + @staticmethod + def _get_missing_splits( + existing_results: MTEBResults | None, task_eval_splits: list[str], task: AbsTask + ) -> list[str]: + if existing_results is None: + return task_eval_splits + + missing_splits = [] + for split in task_eval_splits: + if split not in existing_results.scores: + missing_splits.append(split) + elif not existing_results.scores[ + split + ]: # Check if the split has any scores + missing_splits.append(split) + + return missing_splits + + @staticmethod + def _merge_results( + existing_results: MTEBResults, new_results: MTEBResults + ) -> MTEBResults: + merged_scores = existing_results.scores.copy() + + for split, scores in new_results.scores.items(): + if split in merged_scores: + merged_scores[split] = MTEB._merge_split_scores( + merged_scores[split], scores + ) + else: + merged_scores[split] = scores + + merged_results = MTEBResults( + dataset_revision=existing_results.dataset_revision, + task_name=existing_results.task_name, + mteb_version=existing_results.mteb_version, + scores=merged_scores, + evaluation_time=existing_results.evaluation_time + + new_results.evaluation_time, + kg_co2_emissions=existing_results.kg_co2_emissions, + ) + + return merged_results + + @staticmethod + def _merge_split_scores( + existing_scores: list[ScoresDict], new_scores: list[ScoresDict] + ) -> list[ScoresDict]: + merged = {score["hf_subset"]: score for score in existing_scores} + for score in new_scores: + merged[score["hf_subset"]] = score + return list(merged.values()) + def run( self, model: SentenceTransformer | Encoder, @@ -336,38 +390,36 @@ def run( f"\n\n********************** Evaluating {task.metadata.name} **********************" ) - # skip evaluation if results folder exists and overwrite_results is False if output_path: save_path = output_path / f"{task.metadata.name}{task.save_suffix}.json" - existing_results = self.load_existing_results(save_path) - if existing_results and not overwrite_results: - task_eval_splits = ( - eval_splits if eval_splits is not None else task.eval_splits - ) - missing_splits = self.compare_splits_and_subsets( - existing_results, task_eval_splits - ) - if not missing_splits: - logger.info( - f"{task.metadata.name} 
results already exists. Loading results from disk. Set overwrite_results=True to overwrite." - ) - mteb_results = MTEBResults.from_disk(save_path) - evaluation_results.append(mteb_results) - del self.tasks[0] # empty memory - continue - else: - logger.info( - f"{task.metadata.name} results exist but missing splits: {missing_splits}. Running evaluation for missing splits." - ) - task_eval_splits = missing_splits + existing_results = None + if save_path.exists() and not overwrite_results: + try: + existing_results = MTEBResults.from_disk(save_path) + except Exception as e: + logger.warning(f"Error loading existing results: {e}") - try: task_eval_splits = ( eval_splits if eval_splits is not None else task.eval_splits ) + missing_splits = self._get_missing_splits( + existing_results, task_eval_splits, task + ) + + if not missing_splits and existing_results: + logger.info( + f"{task.metadata.name} results already exist. Loading results from disk." + ) + evaluation_results.append(existing_results) + del self.tasks[0] + continue + + if missing_splits: + logger.info( + f"Running evaluation for missing splits: {missing_splits}" + ) - # load data - logger.info(f"Loading dataset for {task.metadata_dict['name']}") + try: task.check_if_dataset_is_superseeded() task.load_data(eval_splits=task_eval_splits, **kwargs) @@ -375,7 +427,8 @@ def run( task_results = {} evaluation_time = 0 kg_co2_emissions: int | None = 0 if co2_tracker else None - for split in task_eval_splits: + + for split in missing_splits: if co2_tracker: try: from codecarbon import EmissionsTracker @@ -418,21 +471,22 @@ def run( if verbosity >= 1: logger.info(f"Scores: {results}") - mteb_task_result = MTEBResults.from_task_results( + new_results = MTEBResults.from_task_results( task, task_results, evaluation_time=evaluation_time, kg_co2_emissions=kg_co2_emissions, ) - # save results + if existing_results: + merged_results = self._merge_results(existing_results, new_results) + else: + merged_results = new_results + if output_path: - with open(save_path, "w") as f_out: - json.dump( - mteb_task_result.to_dict(), f_out, indent=2, sort_keys=True - ) + merged_results.to_disk(save_path) - evaluation_results.append(mteb_task_result) + evaluation_results.append(merged_results) except Exception as e: logger.error( @@ -451,7 +505,6 @@ def run( # empty memory del self.tasks[0] - # restore original tasks self.tasks = original_tasks return evaluation_results @@ -502,16 +555,3 @@ def _save_model_metadata(model_meta: ModelMeta, output_folder: Path) -> None: with save_path.open("w") as f: json.dump(model_meta.to_dict(), f) - - def load_existing_results(self, save_path): - if save_path.exists(): - with open(save_path) as f: - return json.load(f) - return None - - def compare_splits_and_subsets(self, existing_results, task_eval_splits): - missing_splits = [] - for split in task_eval_splits: - if split not in existing_results: - missing_splits.append(split) - return missing_splits From 6ec791cd4cf53a6840db8a6e62a523732db612db Mon Sep 17 00:00:00 2001 From: thivyanth Date: Fri, 4 Oct 2024 15:47:57 -0400 Subject: [PATCH 4/5] test for missing split evaluation added --- mteb/evaluation/MTEB.py | 18 +++- .../test_evaluation/test_split_evaluation.py | 89 +++++++++++++++++++ 2 files changed, 106 insertions(+), 1 deletion(-) create mode 100644 tests/test_evaluation/test_split_evaluation.py diff --git a/mteb/evaluation/MTEB.py b/mteb/evaluation/MTEB.py index 32632a03a1..9c4f2e8f2a 100644 --- a/mteb/evaluation/MTEB.py +++ b/mteb/evaluation/MTEB.py @@ -4,7 +4,7 @@ 
import logging import os import traceback -from copy import copy +from copy import copy, deepcopy from datetime import datetime from pathlib import Path from time import time @@ -77,6 +77,8 @@ def __init__( self._version = version self.err_logs_path = err_logs_path + self.last_evaluated_splits = {} + self.select_tasks(**kwargs) def deprecation_warning( @@ -384,6 +386,7 @@ def run( original_tasks = ( self.tasks.copy() ) # save them in case we re-use the object (e.g. for reranking) + self.last_evaluated_splits = {} while len(self.tasks) > 0: task = self.tasks[0] logger.info( @@ -411,6 +414,7 @@ def run( f"{task.metadata.name} results already exist. Loading results from disk." ) evaluation_results.append(existing_results) + self.last_evaluated_splits[task.metadata.name] = [] # Add this line del self.tasks[0] continue @@ -471,6 +475,10 @@ def run( if verbosity >= 1: logger.info(f"Scores: {results}") + if task.metadata_dict["name"] not in self.last_evaluated_splits: + self.last_evaluated_splits[task.metadata_dict["name"]] = set() + self.last_evaluated_splits[task.metadata_dict["name"]].add(split) + new_results = MTEBResults.from_task_results( task, task_results, @@ -555,3 +563,11 @@ def _save_model_metadata(model_meta: ModelMeta, output_folder: Path) -> None: with save_path.open("w") as f: json.dump(model_meta.to_dict(), f) + + def get_last_evaluated_splits(self): + """Returns a dictionary of tasks and their evaluated splits from the most recent run. + Tasks with empty lists indicate that results already existed and no splits were evaluated. + """ + return deepcopy( + {task: list(splits) for task, splits in self.last_evaluated_splits.items()} + ) diff --git a/tests/test_evaluation/test_split_evaluation.py b/tests/test_evaluation/test_split_evaluation.py new file mode 100644 index 0000000000..eb5f484645 --- /dev/null +++ b/tests/test_evaluation/test_split_evaluation.py @@ -0,0 +1,89 @@ +from __future__ import annotations + +import pytest +from sentence_transformers import SentenceTransformer + +import mteb +from mteb import MTEB + + +@pytest.fixture +def model(): + return SentenceTransformer("all-MiniLM-L6-v2") + + +@pytest.fixture +def nfcorpus_tasks(): + return mteb.get_tasks(tasks=["NFCorpus"], languages=["eng"]) + + +# def test_all_splits_evaluated(model, nfcorpus_tasks, tmp_path): +# evaluation = MTEB(tasks=nfcorpus_tasks) +# evaluation.run( +# model, +# eval_splits=["train", "test"], +# save_predictions=True, +# output_folder=str(tmp_path / "testcase1"), +# verbosity=2, +# ) +# last_evaluated_splits = evaluation.get_last_evaluated_splits() +# print(last_evaluated_splits) +# assert "NFCorpus" in last_evaluated_splits +# assert set(last_evaluated_splits["NFCorpus"]) == {"train", "test"} +# assert len(last_evaluated_splits["NFCorpus"]) == 2 + + +def test_one_missing_split(model, nfcorpus_tasks, tmp_path): + evaluation = MTEB(tasks=nfcorpus_tasks) + evaluation.run( + model, + eval_splits=["train"], + save_predictions=True, + output_folder=str(tmp_path / "testcase2"), + verbosity=2, + ) + + # Get model and tasks again + model = SentenceTransformer("all-MiniLM-L6-v2") + nfcorpus_tasks = mteb.get_tasks(tasks=["NFCorpus"], languages=["eng"]) + + evaluation_2 = MTEB(tasks=nfcorpus_tasks) + evaluation_2.run( + model, + eval_splits=["train", "test"], + save_predictions=True, + output_folder=str(tmp_path / "testcase2"), + verbosity=2, + ) + + last_evaluated_splits = evaluation_2.get_last_evaluated_splits() + + print(last_evaluated_splits) + assert "NFCorpus" in last_evaluated_splits + assert 
set(last_evaluated_splits["NFCorpus"]) == {"test"} + assert len(last_evaluated_splits["NFCorpus"]) == 1 + + +def test_no_missing_splits(model, nfcorpus_tasks, tmp_path): + evaluation_1 = MTEB(tasks=nfcorpus_tasks) + evaluation_1.run( + model, + eval_splits=["train", "test"], + save_predictions=True, + output_folder=str(tmp_path / "testcase3"), + verbosity=2, + ) + + evaluation_2 = MTEB(tasks=nfcorpus_tasks) + evaluation_2.run( + model, + eval_splits=["train", "test"], + save_predictions=True, + output_folder=str(tmp_path / "testcase3"), + verbosity=2, + ) + + last_evaluated_splits = evaluation_2.get_last_evaluated_splits() + + assert "NFCorpus" in last_evaluated_splits + assert len(last_evaluated_splits["NFCorpus"]) == 0 From 3987e3ef80257b23d155ce6600b0efcaee255740 Mon Sep 17 00:00:00 2001 From: thivyanth Date: Fri, 4 Oct 2024 15:56:00 -0400 Subject: [PATCH 5/5] uncomment test --- .../test_evaluation/test_split_evaluation.py | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/tests/test_evaluation/test_split_evaluation.py b/tests/test_evaluation/test_split_evaluation.py index eb5f484645..419055d0ef 100644 --- a/tests/test_evaluation/test_split_evaluation.py +++ b/tests/test_evaluation/test_split_evaluation.py @@ -17,20 +17,20 @@ def nfcorpus_tasks(): return mteb.get_tasks(tasks=["NFCorpus"], languages=["eng"]) -# def test_all_splits_evaluated(model, nfcorpus_tasks, tmp_path): -# evaluation = MTEB(tasks=nfcorpus_tasks) -# evaluation.run( -# model, -# eval_splits=["train", "test"], -# save_predictions=True, -# output_folder=str(tmp_path / "testcase1"), -# verbosity=2, -# ) -# last_evaluated_splits = evaluation.get_last_evaluated_splits() -# print(last_evaluated_splits) -# assert "NFCorpus" in last_evaluated_splits -# assert set(last_evaluated_splits["NFCorpus"]) == {"train", "test"} -# assert len(last_evaluated_splits["NFCorpus"]) == 2 +def test_all_splits_evaluated(model, nfcorpus_tasks, tmp_path): + evaluation = MTEB(tasks=nfcorpus_tasks) + evaluation.run( + model, + eval_splits=["train", "test"], + save_predictions=True, + output_folder=str(tmp_path / "testcase1"), + verbosity=2, + ) + last_evaluated_splits = evaluation.get_last_evaluated_splits() + print(last_evaluated_splits) + assert "NFCorpus" in last_evaluated_splits + assert set(last_evaluated_splits["NFCorpus"]) == {"train", "test"} + assert len(last_evaluated_splits["NFCorpus"]) == 2 def test_one_missing_split(model, nfcorpus_tasks, tmp_path):