From 2cf92e1f5d24ea6f3174a2c75eee0461fbb10c9c Mon Sep 17 00:00:00 2001 From: sam021313 <40773225+sam021313@users.noreply.github.com> Date: Fri, 17 Jan 2025 14:45:35 +0100 Subject: [PATCH 01/11] update install for tests --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index ac6e9fa60..1e5233488 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ install: install-for-tests: @echo "--- ๐Ÿš€ Installing project dependencies for test ---" @echo "This ensures that the project is not installed in editable mode" - pip install ".[dev,speedtask]" + pip install ".[dev,speedtask,bm25s,pylate]" lint: @echo "--- ๐Ÿงน Running linters ---" From 8323369837ed1384567e0b0ece7556140ad46d81 Mon Sep 17 00:00:00 2001 From: sam021313 <40773225+sam021313@users.noreply.github.com> Date: Fri, 17 Jan 2025 15:37:31 +0100 Subject: [PATCH 02/11] use tmp dir for tests --- tests/test_benchmark/test_models.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/tests/test_benchmark/test_models.py b/tests/test_benchmark/test_models.py index ee5bed091..80232df52 100644 --- a/tests/test_benchmark/test_models.py +++ b/tests/test_benchmark/test_models.py @@ -1,5 +1,7 @@ from __future__ import annotations +from pathlib import Path + import pytest import mteb @@ -11,7 +13,7 @@ @pytest.mark.parametrize("model", ["colbert-ir/colbertv2.0"]) @pytest.mark.parametrize("task", [MockRetrievalTask()]) -def test_colbert_model_e2e(task: AbsTask, model: str): +def test_colbert_model_e2e(task: AbsTask, model: str, tmp_path: Path): pytest.importorskip("pylate", reason="pylate not installed") eval_splits = ["test"] model = mteb.get_model(model) @@ -21,13 +23,14 @@ def test_colbert_model_e2e(task: AbsTask, model: str): model, eval_splits=eval_splits, corpus_chunk_size=500, + output_folder=str(tmp_path), ) result = results[0] assert result.scores["test"][0]["ndcg_at_1"] == 1.0 -def test_bm25s_e2e(): +def test_bm25s_e2e(tmp_path: Path): # fails for dataset smaller then 1000 pytest.importorskip("bm25s", reason="bm25s not installed") pytest.importorskip("Stemmer", reason="PyStemmer not installed") @@ -38,7 +41,9 @@ def test_bm25s_e2e(): evaluation = MTEB(tasks=tasks) - results = evaluation.run(model, eval_splits=eval_splits) + results = evaluation.run( + model, eval_splits=eval_splits, output_folder=str(tmp_path) + ) result = results[0] assert result.scores["test"][0]["ndcg_at_1"] == 0.42879 From d0191ab9e5e6c80d69325c7c0bf0a30900cacc84 Mon Sep 17 00:00:00 2001 From: sam021313 <40773225+sam021313@users.noreply.github.com> Date: Fri, 17 Jan 2025 16:06:40 +0100 Subject: [PATCH 03/11] ref: use tmp_path for output_folder --- tests/test_benchmark/test_benchmark.py | 45 +++++++++++++------------- 1 file changed, 22 insertions(+), 23 deletions(-) diff --git a/tests/test_benchmark/test_benchmark.py b/tests/test_benchmark/test_benchmark.py index 1393d46f1..1288642a3 100644 --- a/tests/test_benchmark/test_benchmark.py +++ b/tests/test_benchmark/test_benchmark.py @@ -56,7 +56,9 @@ def test_mulitple_mteb_tasks(tasks: list[AbsTask], model: mteb.Encoder, tmp_path MockTorchbf16Encoder(), ], ) -def test_benchmark_encoders_on_task(task: str | AbsTask, model: mteb.Encoder): +def test_benchmark_encoders_on_task( + task: str | AbsTask, model: mteb.Encoder, tmp_path: Path +): """Test that a task can be fetched and run using a variety of encoders""" if isinstance(task, str): tasks = mteb.get_tasks(tasks=[task]) @@ -64,7 +66,7 @@ def test_benchmark_encoders_on_task(task: str | AbsTask, model: 
mteb.Encoder): tasks = [task] eval = mteb.MTEB(tasks=tasks) - eval.run(model, output_folder="tests/results", overwrite_results=True) + eval.run(model, output_folder=str(tmp_path)) @pytest.mark.parametrize("task", [MockMultilingualRetrievalTask()]) @@ -72,7 +74,9 @@ def test_benchmark_encoders_on_task(task: str | AbsTask, model: mteb.Encoder): "model", [MockSentenceTransformer()], ) -def test_run_eval_without_co2_tracking(task: str | AbsTask, model: mteb.Encoder): +def test_run_eval_without_co2_tracking( + task: str | AbsTask, model: mteb.Encoder, tmp_path: Path +): """Test that a task can be fetched and run without CO2 tracking""" if isinstance(task, str): tasks = mteb.get_tasks(tasks=[task]) @@ -80,9 +84,7 @@ def test_run_eval_without_co2_tracking(task: str | AbsTask, model: mteb.Encoder) tasks = [task] eval = mteb.MTEB(tasks=tasks) - eval.run( - model, output_folder="tests/results", overwrite_results=True, co2_tracker=False - ) + eval.run(model, output_folder=str(tmp_path), co2_tracker=False) @pytest.mark.parametrize("task", MOCK_TASK_TEST_GRID[:1]) @@ -108,7 +110,7 @@ def test_reload_results(task: str | AbsTask, model: mteb.Encoder, tmp_path: Path @pytest.mark.parametrize("task_name", MOCK_TASK_TEST_GRID) -def test_prompt_name_passed_to_all_encodes(task_name: str | AbsTask): +def test_prompt_name_passed_to_all_encodes(task_name: str | AbsTask, tmp_path: Path): """Test that all tasks correctly pass down the prompt_name to the encoder which supports it, and that the encoder which does not support it does not receive it. """ @@ -141,7 +143,7 @@ def encode(self, sentences, **kwargs): eval.run( model, - output_folder="tests/results", + output_folder=str(tmp_path), overwrite_results=True, ) # Test that the task_name is not passed down to the encoder @@ -151,7 +153,7 @@ def encode(self, sentences, **kwargs): @pytest.mark.parametrize("task_name", MOCK_TASK_TEST_GRID) -def test_encode_kwargs_passed_to_all_encodes(task_name: str | AbsTask): +def test_encode_kwargs_passed_to_all_encodes(task_name: str | AbsTask, tmp_path: Path): """Test that all tasks correctly pass down the encode_kwargs to the encoder.""" my_encode_kwargs = {"no_one_uses_this_args": "but_its_here"} @@ -175,7 +177,7 @@ def encode(self, sentences, task_name: str | None = None, **kwargs): model = MockEncoderWithKwargs() eval.run( model, - output_folder="tests/results", + output_folder=str(tmp_path), overwrite_results=True, encode_kwargs=my_encode_kwargs, ) @@ -195,16 +197,14 @@ def test_run_using_benchmark(model: mteb.Encoder): @pytest.mark.parametrize("model", [MockNumpyEncoder()]) -def test_run_using_list_of_benchmark(model: mteb.Encoder): +def test_run_using_list_of_benchmark(model: mteb.Encoder, tmp_path: Path): """Test that a list of benchmark objects can be run using the MTEB class.""" bench = [ Benchmark(name="test_bench", tasks=mteb.get_tasks(tasks=["STS12", "SummEval"])) ] eval = mteb.MTEB(tasks=bench) - eval.run( - model, output_folder="tests/results", overwrite_results=True - ) # we just want to test that it runs + eval.run(model, output_folder=str(tmp_path)) # we just want to test that it runs def test_benchmark_names_must_be_unique(): @@ -229,7 +229,7 @@ def test_get_benchmark(name): @pytest.mark.parametrize("task", MOCK_TASK_TEST_GRID) @pytest.mark.parametrize("is_task_name", [True, False]) def test_prompt_name_passed_to_all_encodes_with_prompts( - task: AbsTask | str, is_task_name: bool + task: AbsTask | str, is_task_name: bool, tmp_path: Path ): """Test that all tasks and task_types correctly pass down the 
prompt_name to the encoder with prompts.""" _task_name = task.metadata.name if isinstance(task, AbsTask) else task @@ -258,8 +258,7 @@ def encode(self, sentences, prompt_name: str | None = None, **kwargs): ) eval.run( model, - output_folder="tests/results", - overwrite_results=True, + output_folder=str(tmp_path), ) class MockEncoderWithExistingPrompts(mteb.Encoder): @@ -275,7 +274,7 @@ def encode(self, sentences, prompt_name: str | None = None, **kwargs): model = MockSentenceTransformerWrapper(MockEncoderWithExistingPrompts()) eval.run( model, - output_folder="tests/results", + output_folder=str(tmp_path), overwrite_results=True, ) @@ -292,7 +291,9 @@ def encode(self, sentences, prompt_name: str | None = None, **kwargs): ], ) @pytest.mark.parametrize("is_task_name", [True, False]) -def test_model_query_passage_prompts_task_type(task: AbsTask | str, is_task_name: bool): +def test_model_query_passage_prompts_task_type( + task: AbsTask | str, is_task_name: bool, tmp_path: Path +): """Test that the model with prompts is correctly called.""" tasks = [task] @@ -331,8 +332,7 @@ def encode(self, sentences, prompt_name: str | None = None, *args, **kwargs): eval.run( model, model_prompts=prompt_list, - output_folder="tests/results", - overwrite_results=True, + output_folder=str(tmp_path), ) model = MockSentenceTransformerWrapper( MockSentenceEncoderWithPrompts(), model_prompts=prompt_list @@ -341,6 +341,5 @@ def encode(self, sentences, prompt_name: str | None = None, *args, **kwargs): eval.run( model, model_prompts=prompt_list, - output_folder="tests/results", - overwrite_results=True, + output_folder=str(tmp_path), ) From 5c585a4c57eabe98e5330aa9578bc7f7483b63ff Mon Sep 17 00:00:00 2001 From: sam021313 <40773225+sam021313@users.noreply.github.com> Date: Fri, 17 Jan 2025 16:17:45 +0100 Subject: [PATCH 04/11] ref: clean up tests --- tests/test_benchmark/test_benchmark.py | 6 ++--- ...est_benchmark_integration_with_datasets.py | 5 ++-- ...k_integration_with_sentencetransformers.py | 7 ++++-- tests/test_cli.py | 7 +++--- tests/test_reproducible_workflow.py | 7 ++++-- tests/test_tasks/test_mteb_rerank.py | 25 +++++++++++-------- 6 files changed, 33 insertions(+), 24 deletions(-) diff --git a/tests/test_benchmark/test_benchmark.py b/tests/test_benchmark/test_benchmark.py index 1288642a3..4418e9296 100644 --- a/tests/test_benchmark/test_benchmark.py +++ b/tests/test_benchmark/test_benchmark.py @@ -149,7 +149,7 @@ def encode(self, sentences, **kwargs): # Test that the task_name is not passed down to the encoder model = EncoderWithoutInstructions("average_word_embeddings_levy_dependency") assert model.prompts == {}, "The encoder should not have any prompts" - eval.run(model, output_folder="tests/results", overwrite_results=True) + eval.run(model, output_folder=tmp_path.as_posix(), overwrite_results=True) @pytest.mark.parametrize("task_name", MOCK_TASK_TEST_GRID) @@ -184,7 +184,7 @@ def encode(self, sentences, task_name: str | None = None, **kwargs): @pytest.mark.parametrize("model", [MockNumpyEncoder()]) -def test_run_using_benchmark(model: mteb.Encoder): +def test_run_using_benchmark(model: mteb.Encoder, tmp_path: Path): """Test that a benchmark object can be run using the MTEB class.""" bench = Benchmark( name="test_bench", tasks=mteb.get_tasks(tasks=["STS12", "SummEval"]) @@ -192,7 +192,7 @@ def test_run_using_benchmark(model: mteb.Encoder): eval = mteb.MTEB(tasks=[bench]) eval.run( - model, output_folder="tests/results", overwrite_results=True + model, output_folder=tmp_path.as_posix(), 
overwrite_results=True ) # we just want to test that it runs diff --git a/tests/test_benchmark/test_benchmark_integration_with_datasets.py b/tests/test_benchmark/test_benchmark_integration_with_datasets.py index 81d4c6b67..8288680c3 100644 --- a/tests/test_benchmark/test_benchmark_integration_with_datasets.py +++ b/tests/test_benchmark/test_benchmark_integration_with_datasets.py @@ -3,6 +3,7 @@ from __future__ import annotations import logging +from pathlib import Path import pytest @@ -18,7 +19,7 @@ @pytest.mark.parametrize("task", TASK_TEST_GRID) @pytest.mark.parametrize("model", [MockNumpyEncoder()]) -def test_benchmark_datasets(task: str | AbsTask, model: mteb.Encoder): +def test_benchmark_datasets(task: str | AbsTask, model: mteb.Encoder, tmp_path: Path): """Test that a task can be fetched and run""" eval = MTEB(tasks=[task]) - eval.run(model, output_folder="tests/results", overwrite_results=True) + eval.run(model, output_folder=tmp_path.as_posix(), overwrite_results=True) diff --git a/tests/test_benchmark/test_benchmark_integration_with_sentencetransformers.py b/tests/test_benchmark/test_benchmark_integration_with_sentencetransformers.py index 4ca0056cd..e79515be5 100644 --- a/tests/test_benchmark/test_benchmark_integration_with_sentencetransformers.py +++ b/tests/test_benchmark/test_benchmark_integration_with_sentencetransformers.py @@ -3,6 +3,7 @@ from __future__ import annotations import logging +from pathlib import Path import pytest from sentence_transformers import SentenceTransformer @@ -22,9 +23,11 @@ "average_word_embeddings_levy_dependency", ], ) -def test_benchmark_sentence_transformer(task: str | AbsTask, model_name: str): +def test_benchmark_sentence_transformer( + task: str | AbsTask, model_name: str, tmp_path: Path +): """Test that a task can be fetched and run""" if isinstance(model_name, str): model = SentenceTransformer(model_name) eval = MTEB(tasks=[task]) - eval.run(model, output_folder="tests/results", overwrite_results=True) + eval.run(model, output_folder=tmp_path.as_posix(), overwrite_results=True) diff --git a/tests/test_cli.py b/tests/test_cli.py index 7c71528f0..130c71bb5 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -50,12 +50,13 @@ def test_run_task( model_name: str, task_name: str, model_revision: str, + tmp_path: Path, ): args = Namespace( model=model_name, tasks=[task_name], model_revision=model_revision, - output_folder="tests/results/test_model", + output_folder=tmp_path.as_posix(), verbosity=3, device=None, categories=None, @@ -71,9 +72,7 @@ def test_run_task( run(args) model_name_as_path = model_name.replace("/", "__").replace(" ", "_") - results_path = Path( - f"tests/results/test_model/{model_name_as_path}/{model_revision}" - ) + results_path = tmp_path / {model_name_as_path} / {model_revision} assert results_path.exists(), "Output folder not created" assert "model_meta.json" in [ f.name for f in list(results_path.glob("*.json")) diff --git a/tests/test_reproducible_workflow.py b/tests/test_reproducible_workflow.py index 1c7536076..1973072ba 100644 --- a/tests/test_reproducible_workflow.py +++ b/tests/test_reproducible_workflow.py @@ -1,6 +1,7 @@ from __future__ import annotations import logging +from pathlib import Path import pytest @@ -18,7 +19,9 @@ @pytest.mark.parametrize("task_name", ["BornholmBitextMining"]) @pytest.mark.parametrize("model_name", ["sentence-transformers/all-MiniLM-L6-v2"]) @pytest.mark.parametrize("model_revision", ["8b3219a92973c328a8e22fadcfa821b5dc75636a"]) -def test_reproducibility_workflow(task_name: str, 
model_name: str, model_revision: str): +def test_reproducibility_workflow( + task_name: str, model_name: str, model_revision: str, tmp_path: Path +): """Test that a model and a task can be fetched and run in a reproducible fashion.""" model_meta = mteb.get_model_meta(model_name, revision=model_revision) task = mteb.get_task(task_name) @@ -30,7 +33,7 @@ def test_reproducibility_workflow(task_name: str, model_name: str, model_revisio assert isinstance(model, Encoder) eval = MTEB(tasks=[task]) - eval.run(model, output_folder="tests/results", overwrite_results=True) + eval.run(model, output_folder=tmp_path.as_posix(), overwrite_results=True) @pytest.mark.parametrize( diff --git a/tests/test_tasks/test_mteb_rerank.py b/tests/test_tasks/test_mteb_rerank.py index 565b00e22..9d3366faa 100644 --- a/tests/test_tasks/test_mteb_rerank.py +++ b/tests/test_tasks/test_mteb_rerank.py @@ -339,7 +339,7 @@ def test_mteb_rerank(tmp_path: Path): eval.run( model, # type: ignore - output_folder="tests/results", + output_folder=tmp_path.as_posix(), overwrite_results=True, eval_splits=["test"], top_k=2, @@ -358,7 +358,7 @@ def test_mteb_rerank(tmp_path: Path): assert "18670" in results["1"] -def test_reranker_same_ndcg1(): +def test_reranker_same_ndcg1(tmp_path: Path): de_name = "average_word_embeddings_komninos" revision = "21eec43590414cb8e3a6f654857abed0483ae36e" de = SentenceTransformer(de_name, revision=revision) @@ -372,32 +372,35 @@ def test_reranker_same_ndcg1(): release_date="2021-04-15", ) eval = MTEB(tasks=mteb.get_tasks(["SciFact"])) + stage1_path = tmp_path / "stage1" eval.run( de, - output_folder="tests/results/stage1", + output_folder=stage1_path.as_posix(), overwrite_results=True, save_predictions=True, eval_splits=["test"], ) + stage2_path = tmp_path / "stage2" eval.run( ce, # type: ignore - output_folder="tests/results/stage2", + output_folder=stage2_path.as_posix(), overwrite_results=True, - previous_results="tests/results/stage1/SciFact_default_predictions.json", + previous_results=(stage1_path / "SciFact_default_predictions.json"), save_predictions=False, eval_splits=["test"], top_k=1, # don't allow it to rerank more than 1 so we can check for top_1 being the same ) # read in stage 1 and stage two and check ndcg@1 is the same - with open( - f"tests/results/stage1/sentence-transformers__{de_name}/{revision}/SciFact.json" - ) as f: + with ( + stage1_path / f"sentence-transformers__{de_name}/{revision}/SciFact.json" + ).open() as f: stage1 = json.load(f) - with open( - f"tests/results/stage2/cross-encoder__ms-marco-TinyBERT-L-2-v2/{ce_revision}/SciFact.json" - ) as f: + with ( + stage2_path + / f"cross-encoder__ms-marco-TinyBERT-L-2-v2/{ce_revision}/SciFact.json" + ).open() as f: stage2 = json.load(f) assert ( From 1e37876e7e25f7838dd18b7d5da182867d2bb7f3 Mon Sep 17 00:00:00 2001 From: sam021313 <40773225+sam021313@users.noreply.github.com> Date: Fri, 17 Jan 2025 16:22:41 +0100 Subject: [PATCH 05/11] skip test for pylate python < 3.10 --- tests/test_benchmark/test_models.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_benchmark/test_models.py b/tests/test_benchmark/test_models.py index 80232df52..996f633ff 100644 --- a/tests/test_benchmark/test_models.py +++ b/tests/test_benchmark/test_models.py @@ -1,5 +1,6 @@ from __future__ import annotations +import sys from pathlib import Path import pytest @@ -11,6 +12,7 @@ from .mock_tasks import MockRetrievalTask +@pytest.mark.skipif(sys.version_info < (3, 10), reason="Requires Python 3.10 or higher") @pytest.mark.parametrize("model", 
["colbert-ir/colbertv2.0"]) @pytest.mark.parametrize("task", [MockRetrievalTask()]) def test_colbert_model_e2e(task: AbsTask, model: str, tmp_path: Path): From 623d043b8feea6ccfd0dca7bae3e0ad4c843ffeb Mon Sep 17 00:00:00 2001 From: sam021313 <40773225+sam021313@users.noreply.github.com> Date: Fri, 17 Jan 2025 16:38:45 +0100 Subject: [PATCH 06/11] fix: tests --- tests/test_cli.py | 2 +- tests/test_tasks/test_mteb_rerank.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/test_cli.py b/tests/test_cli.py index 130c71bb5..518d3c411 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -72,7 +72,7 @@ def test_run_task( run(args) model_name_as_path = model_name.replace("/", "__").replace(" ", "_") - results_path = tmp_path / {model_name_as_path} / {model_revision} + results_path = tmp_path / model_name_as_path / model_revision assert results_path.exists(), "Output folder not created" assert "model_meta.json" in [ f.name for f in list(results_path.glob("*.json")) diff --git a/tests/test_tasks/test_mteb_rerank.py b/tests/test_tasks/test_mteb_rerank.py index 9d3366faa..dd8275e58 100644 --- a/tests/test_tasks/test_mteb_rerank.py +++ b/tests/test_tasks/test_mteb_rerank.py @@ -346,10 +346,11 @@ def test_mteb_rerank(tmp_path: Path): previous_results=tmp_file, save_predictions=True, ) - tmp_file.unlink() # read in the results - with open("tests/results/SciFact_default_predictions.json") as f: + with ( + tmp_path / "SciFact_cross-encoder__ms-marco-TinyBERT-L-2-v2.json" + ).open() as f: results = json.load(f) # check that only the top two results are re-orderd From 1e23caa95827be290b2dda3ec6d76033d0771138 Mon Sep 17 00:00:00 2001 From: sam021313 <40773225+sam021313@users.noreply.github.com> Date: Fri, 17 Jan 2025 21:27:33 +0100 Subject: [PATCH 07/11] fix: tests --- tests/test_tasks/test_mteb_rerank.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/test_tasks/test_mteb_rerank.py b/tests/test_tasks/test_mteb_rerank.py index dd8275e58..9707f203b 100644 --- a/tests/test_tasks/test_mteb_rerank.py +++ b/tests/test_tasks/test_mteb_rerank.py @@ -348,9 +348,7 @@ def test_mteb_rerank(tmp_path: Path): ) # read in the results - with ( - tmp_path / "SciFact_cross-encoder__ms-marco-TinyBERT-L-2-v2.json" - ).open() as f: + with (tmp_path / "SciFact_default_predictions.json").open() as f: results = json.load(f) # check that only the top two results are re-orderd From ea8f6a05bbec46eeb18a44bbbfad4519cfc116e6 Mon Sep 17 00:00:00 2001 From: sam021313 <40773225+sam021313@users.noreply.github.com> Date: Fri, 17 Jan 2025 22:19:12 +0100 Subject: [PATCH 08/11] fix: model meta CrossEncoder --- mteb/evaluation/MTEB.py | 10 ++++++++-- mteb/models/__init__.py | 2 ++ mteb/models/overview.py | 37 ++++++++++++++++++++++++++++++++++++- 3 files changed, 46 insertions(+), 3 deletions(-) diff --git a/mteb/evaluation/MTEB.py b/mteb/evaluation/MTEB.py index 3c94f2478..30d88a521 100644 --- a/mteb/evaluation/MTEB.py +++ b/mteb/evaluation/MTEB.py @@ -20,7 +20,10 @@ from mteb.abstasks.AbsTask import ScoresDict from mteb.encoder_interface import Encoder from mteb.model_meta import ModelMeta -from mteb.models import model_meta_from_sentence_transformers +from mteb.models import ( + model_meta_from_cross_encoder, + model_meta_from_sentence_transformers, +) from ..abstasks.AbsTask import AbsTask from ..load_results.task_results import TaskResult @@ -495,7 +498,10 @@ def create_model_meta(model: Encoder) -> ModelMeta: meta = model.mteb_model_meta # type: ignore else: try: - meta = 
model_meta_from_sentence_transformers(model) # type: ignore + if isinstance(model, CrossEncoder): + meta = model_meta_from_cross_encoder(model) + else: + meta = model_meta_from_sentence_transformers(model) # type: ignore except AttributeError: logger.warning( "Could not find model metadata. Please set the model.mteb_model_meta attribute or if you are using " diff --git a/mteb/models/__init__.py b/mteb/models/__init__.py index 1c70b528c..1389e2398 100644 --- a/mteb/models/__init__.py +++ b/mteb/models/__init__.py @@ -6,6 +6,7 @@ get_model, get_model_meta, get_model_metas, + model_meta_from_cross_encoder, model_meta_from_sentence_transformers, ) from mteb.models.sentence_transformer_wrapper import SentenceTransformerWrapper @@ -17,5 +18,6 @@ "get_model_meta", "get_model_metas", "model_meta_from_sentence_transformers", + "model_meta_from_cross_encoder", "SentenceTransformerWrapper", ] diff --git a/mteb/models/overview.py b/mteb/models/overview.py index e9774cacd..c637e7e7f 100644 --- a/mteb/models/overview.py +++ b/mteb/models/overview.py @@ -6,7 +6,7 @@ from typing import Any from huggingface_hub import ModelCard -from sentence_transformers import SentenceTransformer +from sentence_transformers import CrossEncoder, SentenceTransformer from mteb.abstasks.AbsTask import AbsTask from mteb.encoder_interface import Encoder @@ -164,6 +164,11 @@ def get_model(model_name: str, revision: str | None = None, **kwargs: Any) -> En if not meta.similarity_fn_name: meta.similarity_fn_name = _meta.similarity_fn_name + elif isinstance(model, CrossEncoder): + _meta = model_meta_from_cross_encoder(model.model) + if meta.revision is None: + meta.revision = _meta.revision if _meta.revision else meta.revision + model.mteb_model_meta = meta # type: ignore return model @@ -226,6 +231,36 @@ def model_meta_from_hf_hub(model_name: str) -> ModelMeta: ) +def model_meta_from_cross_encoder(model: CrossEncoder) -> ModelMeta: + try: + name = model.model.name_or_path + # languages = ( + # [model.model_card_data.language] + # if isinstance(model.model_card_data.language, str) + # else model.model_card_data.language + # ) + + meta = ModelMeta( + name=name, + revision=model.config._commit_hash, + release_date=None, + languages=None, + framework=["Sentence Transformers"], + similarity_fn_name=None, + ) + except AttributeError as e: + logger.warning( + f"Failed to extract metadata from model: {e}. Upgrading to sentence-transformers v3.0.0 or above is recommended." 
+ ) + meta = ModelMeta( + name=None, + revision=None, + languages=None, + release_date=None, + ) + return meta + + def model_meta_from_sentence_transformers(model: SentenceTransformer) -> ModelMeta: try: name = ( From 7f6a12d61b24100a1975650ba9dc379f9cb816f1 Mon Sep 17 00:00:00 2001 From: sam021313 <40773225+sam021313@users.noreply.github.com> Date: Fri, 17 Jan 2025 22:19:23 +0100 Subject: [PATCH 09/11] test: model meta --- tests/test_model_meta/test_model_meta.py | 46 ++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 tests/test_model_meta/test_model_meta.py diff --git a/tests/test_model_meta/test_model_meta.py b/tests/test_model_meta/test_model_meta.py new file mode 100644 index 000000000..d4252902b --- /dev/null +++ b/tests/test_model_meta/test_model_meta.py @@ -0,0 +1,46 @@ +from __future__ import annotations + +from pathlib import Path + +import pytest +from sentence_transformers import CrossEncoder, SentenceTransformer + +from mteb import MTEB +from mteb.abstasks import AbsTask +from tests.test_benchmark.mock_tasks import MockRetrievalTask + + +def test_create_model_meta_from_sentence_transformers(tmp_path: Path): + model_name = "sentence-transformers/average_word_embeddings_levy_dependency" + model = SentenceTransformer(model_name) + + meta = MTEB.create_model_meta(model) + + assert meta.name == model_name + assert meta.revision == model.model_card_data.base_model_revision + + +def test_create_model_meta_from_cross_encoder(tmp_path: Path): + model_name = "cross-encoder/ms-marco-TinyBERT-L-2-v2" + + model = CrossEncoder(model_name) + + meta = MTEB.create_model_meta(model) + # model.name_or_path + # _commit_hash + assert meta.name == model_name + assert meta.revision == model.config._commit_hash + + return meta + + +@pytest.mark.parametrize("task", [MockRetrievalTask()]) +def test_output_folder_model_meta(task: AbsTask, tmp_path: Path): + mteb = MTEB(tasks=[task]) + model_name = "cross-encoder/ms-marco-TinyBERT-L-2-v2" + model = CrossEncoder(model_name) + meta = mteb.create_model_meta(model) + output_path = mteb.create_output_folder( + model_meta=meta, output_folder=tmp_path.as_posix() + ) + assert Path(output_path).exists() From cdf4f1bdbd38b8ac264b3c99253721119c1ff6e0 Mon Sep 17 00:00:00 2001 From: sam021313 <40773225+sam021313@users.noreply.github.com> Date: Fri, 17 Jan 2025 22:33:59 +0100 Subject: [PATCH 10/11] update path test --- tests/test_model_meta/test_model_meta.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/test_model_meta/test_model_meta.py b/tests/test_model_meta/test_model_meta.py index d4252902b..993f9dc8c 100644 --- a/tests/test_model_meta/test_model_meta.py +++ b/tests/test_model_meta/test_model_meta.py @@ -10,7 +10,7 @@ from tests.test_benchmark.mock_tasks import MockRetrievalTask -def test_create_model_meta_from_sentence_transformers(tmp_path: Path): +def test_create_model_meta_from_sentence_transformers(): model_name = "sentence-transformers/average_word_embeddings_levy_dependency" model = SentenceTransformer(model_name) @@ -20,7 +20,7 @@ def test_create_model_meta_from_sentence_transformers(tmp_path: Path): assert meta.revision == model.model_card_data.base_model_revision -def test_create_model_meta_from_cross_encoder(tmp_path: Path): +def test_create_model_meta_from_cross_encoder(): model_name = "cross-encoder/ms-marco-TinyBERT-L-2-v2" model = CrossEncoder(model_name) @@ -44,3 +44,8 @@ def test_output_folder_model_meta(task: AbsTask, tmp_path: Path): model_meta=meta, output_folder=tmp_path.as_posix() 
) assert Path(output_path).exists() + assert Path(output_path).is_dir() + assert Path(output_path).name == model.config._commit_hash + assert Path(output_path).parent.name == "cross-encoder__ms-marco-TinyBERT-L-2-v2" + assert Path(output_path).parent.parent == tmp_path + From 2f39f84ac5e72a2d782e004415f45927419570bd Mon Sep 17 00:00:00 2001 From: sam021313 <40773225+sam021313@users.noreply.github.com> Date: Fri, 17 Jan 2025 22:36:54 +0100 Subject: [PATCH 11/11] lint --- tests/test_model_meta/test_model_meta.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_model_meta/test_model_meta.py b/tests/test_model_meta/test_model_meta.py index 993f9dc8c..bd682701e 100644 --- a/tests/test_model_meta/test_model_meta.py +++ b/tests/test_model_meta/test_model_meta.py @@ -48,4 +48,3 @@ def test_output_folder_model_meta(task: AbsTask, tmp_path: Path): assert Path(output_path).name == model.config._commit_hash assert Path(output_path).parent.name == "cross-encoder__ms-marco-TinyBERT-L-2-v2" assert Path(output_path).parent.parent == tmp_path -
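
---

Usage note (illustrative, not one of the patches above): the CrossEncoder
metadata path added in PATCH 08/11 and exercised by the tests in PATCH 09/11
and 10/11 can be driven as sketched below. The model name and the calls simply
mirror the added tests (`tests/test_model_meta/test_model_meta.py`); treat this
as a sketch of the new behavior, not an additional commit.

    from sentence_transformers import CrossEncoder

    from mteb import MTEB

    # MTEB.create_model_meta now detects CrossEncoder instances and dispatches
    # to model_meta_from_cross_encoder (see the PATCH 08/11 change to
    # mteb/evaluation/MTEB.py) instead of the SentenceTransformer path.
    model = CrossEncoder("cross-encoder/ms-marco-TinyBERT-L-2-v2")
    meta = MTEB.create_model_meta(model)

    # As asserted in the new tests: the name comes from model.name_or_path and
    # the revision from the Hugging Face commit hash on the model config.
    assert meta.name == "cross-encoder/ms-marco-TinyBERT-L-2-v2"
    assert meta.revision == model.config._commit_hash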