From 2cf92e1f5d24ea6f3174a2c75eee0461fbb10c9c Mon Sep 17 00:00:00 2001 From: sam021313 <40773225+sam021313@users.noreply.github.com> Date: Fri, 17 Jan 2025 14:45:35 +0100 Subject: [PATCH 01/11] update install for tests --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index ac6e9fa60..1e5233488 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ install: install-for-tests: @echo "--- ๐Ÿš€ Installing project dependencies for test ---" @echo "This ensures that the project is not installed in editable mode" - pip install ".[dev,speedtask]" + pip install ".[dev,speedtask,bm25s,pylate]" lint: @echo "--- ๐Ÿงน Running linters ---" From 8323369837ed1384567e0b0ece7556140ad46d81 Mon Sep 17 00:00:00 2001 From: sam021313 <40773225+sam021313@users.noreply.github.com> Date: Fri, 17 Jan 2025 15:37:31 +0100 Subject: [PATCH 02/11] use tmp dir for tests --- tests/test_benchmark/test_models.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/tests/test_benchmark/test_models.py b/tests/test_benchmark/test_models.py index ee5bed091..80232df52 100644 --- a/tests/test_benchmark/test_models.py +++ b/tests/test_benchmark/test_models.py @@ -1,5 +1,7 @@ from __future__ import annotations +from pathlib import Path + import pytest import mteb @@ -11,7 +13,7 @@ @pytest.mark.parametrize("model", ["colbert-ir/colbertv2.0"]) @pytest.mark.parametrize("task", [MockRetrievalTask()]) -def test_colbert_model_e2e(task: AbsTask, model: str): +def test_colbert_model_e2e(task: AbsTask, model: str, tmp_path: Path): pytest.importorskip("pylate", reason="pylate not installed") eval_splits = ["test"] model = mteb.get_model(model) @@ -21,13 +23,14 @@ def test_colbert_model_e2e(task: AbsTask, model: str): model, eval_splits=eval_splits, corpus_chunk_size=500, + output_folder=str(tmp_path), ) result = results[0] assert result.scores["test"][0]["ndcg_at_1"] == 1.0 -def test_bm25s_e2e(): +def test_bm25s_e2e(tmp_path: Path): # fails for dataset smaller then 1000 pytest.importorskip("bm25s", reason="bm25s not installed") pytest.importorskip("Stemmer", reason="PyStemmer not installed") @@ -38,7 +41,9 @@ def test_bm25s_e2e(): evaluation = MTEB(tasks=tasks) - results = evaluation.run(model, eval_splits=eval_splits) + results = evaluation.run( + model, eval_splits=eval_splits, output_folder=str(tmp_path) + ) result = results[0] assert result.scores["test"][0]["ndcg_at_1"] == 0.42879 From d0191ab9e5e6c80d69325c7c0bf0a30900cacc84 Mon Sep 17 00:00:00 2001 From: sam021313 <40773225+sam021313@users.noreply.github.com> Date: Fri, 17 Jan 2025 16:06:40 +0100 Subject: [PATCH 03/11] ref: use tmp_path for output_folder --- tests/test_benchmark/test_benchmark.py | 45 +++++++++++++------------- 1 file changed, 22 insertions(+), 23 deletions(-) diff --git a/tests/test_benchmark/test_benchmark.py b/tests/test_benchmark/test_benchmark.py index 1393d46f1..1288642a3 100644 --- a/tests/test_benchmark/test_benchmark.py +++ b/tests/test_benchmark/test_benchmark.py @@ -56,7 +56,9 @@ def test_mulitple_mteb_tasks(tasks: list[AbsTask], model: mteb.Encoder, tmp_path MockTorchbf16Encoder(), ], ) -def test_benchmark_encoders_on_task(task: str | AbsTask, model: mteb.Encoder): +def test_benchmark_encoders_on_task( + task: str | AbsTask, model: mteb.Encoder, tmp_path: Path +): """Test that a task can be fetched and run using a variety of encoders""" if isinstance(task, str): tasks = mteb.get_tasks(tasks=[task]) @@ -64,7 +66,7 @@ def test_benchmark_encoders_on_task(task: str | AbsTask, model: 
mteb.Encoder): tasks = [task] eval = mteb.MTEB(tasks=tasks) - eval.run(model, output_folder="tests/results", overwrite_results=True) + eval.run(model, output_folder=str(tmp_path)) @pytest.mark.parametrize("task", [MockMultilingualRetrievalTask()]) @@ -72,7 +74,9 @@ def test_benchmark_encoders_on_task(task: str | AbsTask, model: mteb.Encoder): "model", [MockSentenceTransformer()], ) -def test_run_eval_without_co2_tracking(task: str | AbsTask, model: mteb.Encoder): +def test_run_eval_without_co2_tracking( + task: str | AbsTask, model: mteb.Encoder, tmp_path: Path +): """Test that a task can be fetched and run without CO2 tracking""" if isinstance(task, str): tasks = mteb.get_tasks(tasks=[task]) @@ -80,9 +84,7 @@ def test_run_eval_without_co2_tracking(task: str | AbsTask, model: mteb.Encoder) tasks = [task] eval = mteb.MTEB(tasks=tasks) - eval.run( - model, output_folder="tests/results", overwrite_results=True, co2_tracker=False - ) + eval.run(model, output_folder=str(tmp_path), co2_tracker=False) @pytest.mark.parametrize("task", MOCK_TASK_TEST_GRID[:1]) @@ -108,7 +110,7 @@ def test_reload_results(task: str | AbsTask, model: mteb.Encoder, tmp_path: Path @pytest.mark.parametrize("task_name", MOCK_TASK_TEST_GRID) -def test_prompt_name_passed_to_all_encodes(task_name: str | AbsTask): +def test_prompt_name_passed_to_all_encodes(task_name: str | AbsTask, tmp_path: Path): """Test that all tasks correctly pass down the prompt_name to the encoder which supports it, and that the encoder which does not support it does not receive it. """ @@ -141,7 +143,7 @@ def encode(self, sentences, **kwargs): eval.run( model, - output_folder="tests/results", + output_folder=str(tmp_path), overwrite_results=True, ) # Test that the task_name is not passed down to the encoder @@ -151,7 +153,7 @@ def encode(self, sentences, **kwargs): @pytest.mark.parametrize("task_name", MOCK_TASK_TEST_GRID) -def test_encode_kwargs_passed_to_all_encodes(task_name: str | AbsTask): +def test_encode_kwargs_passed_to_all_encodes(task_name: str | AbsTask, tmp_path: Path): """Test that all tasks correctly pass down the encode_kwargs to the encoder.""" my_encode_kwargs = {"no_one_uses_this_args": "but_its_here"} @@ -175,7 +177,7 @@ def encode(self, sentences, task_name: str | None = None, **kwargs): model = MockEncoderWithKwargs() eval.run( model, - output_folder="tests/results", + output_folder=str(tmp_path), overwrite_results=True, encode_kwargs=my_encode_kwargs, ) @@ -195,16 +197,14 @@ def test_run_using_benchmark(model: mteb.Encoder): @pytest.mark.parametrize("model", [MockNumpyEncoder()]) -def test_run_using_list_of_benchmark(model: mteb.Encoder): +def test_run_using_list_of_benchmark(model: mteb.Encoder, tmp_path: Path): """Test that a list of benchmark objects can be run using the MTEB class.""" bench = [ Benchmark(name="test_bench", tasks=mteb.get_tasks(tasks=["STS12", "SummEval"])) ] eval = mteb.MTEB(tasks=bench) - eval.run( - model, output_folder="tests/results", overwrite_results=True - ) # we just want to test that it runs + eval.run(model, output_folder=str(tmp_path)) # we just want to test that it runs def test_benchmark_names_must_be_unique(): @@ -229,7 +229,7 @@ def test_get_benchmark(name): @pytest.mark.parametrize("task", MOCK_TASK_TEST_GRID) @pytest.mark.parametrize("is_task_name", [True, False]) def test_prompt_name_passed_to_all_encodes_with_prompts( - task: AbsTask | str, is_task_name: bool + task: AbsTask | str, is_task_name: bool, tmp_path: Path ): """Test that all tasks and task_types correctly pass down the 
prompt_name to the encoder with prompts.""" _task_name = task.metadata.name if isinstance(task, AbsTask) else task @@ -258,8 +258,7 @@ def encode(self, sentences, prompt_name: str | None = None, **kwargs): ) eval.run( model, - output_folder="tests/results", - overwrite_results=True, + output_folder=str(tmp_path), ) class MockEncoderWithExistingPrompts(mteb.Encoder): @@ -275,7 +274,7 @@ def encode(self, sentences, prompt_name: str | None = None, **kwargs): model = MockSentenceTransformerWrapper(MockEncoderWithExistingPrompts()) eval.run( model, - output_folder="tests/results", + output_folder=str(tmp_path), overwrite_results=True, ) @@ -292,7 +291,9 @@ def encode(self, sentences, prompt_name: str | None = None, **kwargs): ], ) @pytest.mark.parametrize("is_task_name", [True, False]) -def test_model_query_passage_prompts_task_type(task: AbsTask | str, is_task_name: bool): +def test_model_query_passage_prompts_task_type( + task: AbsTask | str, is_task_name: bool, tmp_path: Path +): """Test that the model with prompts is correctly called.""" tasks = [task] @@ -331,8 +332,7 @@ def encode(self, sentences, prompt_name: str | None = None, *args, **kwargs): eval.run( model, model_prompts=prompt_list, - output_folder="tests/results", - overwrite_results=True, + output_folder=str(tmp_path), ) model = MockSentenceTransformerWrapper( MockSentenceEncoderWithPrompts(), model_prompts=prompt_list @@ -341,6 +341,5 @@ def encode(self, sentences, prompt_name: str | None = None, *args, **kwargs): eval.run( model, model_prompts=prompt_list, - output_folder="tests/results", - overwrite_results=True, + output_folder=str(tmp_path), ) From 5c585a4c57eabe98e5330aa9578bc7f7483b63ff Mon Sep 17 00:00:00 2001 From: sam021313 <40773225+sam021313@users.noreply.github.com> Date: Fri, 17 Jan 2025 16:17:45 +0100 Subject: [PATCH 04/11] ref: clean up tests --- tests/test_benchmark/test_benchmark.py | 6 ++--- ...est_benchmark_integration_with_datasets.py | 5 ++-- ...k_integration_with_sentencetransformers.py | 7 ++++-- tests/test_cli.py | 7 +++--- tests/test_reproducible_workflow.py | 7 ++++-- tests/test_tasks/test_mteb_rerank.py | 25 +++++++++++-------- 6 files changed, 33 insertions(+), 24 deletions(-) diff --git a/tests/test_benchmark/test_benchmark.py b/tests/test_benchmark/test_benchmark.py index 1288642a3..4418e9296 100644 --- a/tests/test_benchmark/test_benchmark.py +++ b/tests/test_benchmark/test_benchmark.py @@ -149,7 +149,7 @@ def encode(self, sentences, **kwargs): # Test that the task_name is not passed down to the encoder model = EncoderWithoutInstructions("average_word_embeddings_levy_dependency") assert model.prompts == {}, "The encoder should not have any prompts" - eval.run(model, output_folder="tests/results", overwrite_results=True) + eval.run(model, output_folder=tmp_path.as_posix(), overwrite_results=True) @pytest.mark.parametrize("task_name", MOCK_TASK_TEST_GRID) @@ -184,7 +184,7 @@ def encode(self, sentences, task_name: str | None = None, **kwargs): @pytest.mark.parametrize("model", [MockNumpyEncoder()]) -def test_run_using_benchmark(model: mteb.Encoder): +def test_run_using_benchmark(model: mteb.Encoder, tmp_path: Path): """Test that a benchmark object can be run using the MTEB class.""" bench = Benchmark( name="test_bench", tasks=mteb.get_tasks(tasks=["STS12", "SummEval"]) @@ -192,7 +192,7 @@ def test_run_using_benchmark(model: mteb.Encoder): eval = mteb.MTEB(tasks=[bench]) eval.run( - model, output_folder="tests/results", overwrite_results=True + model, output_folder=tmp_path.as_posix(), 
overwrite_results=True ) # we just want to test that it runs diff --git a/tests/test_benchmark/test_benchmark_integration_with_datasets.py b/tests/test_benchmark/test_benchmark_integration_with_datasets.py index 81d4c6b67..8288680c3 100644 --- a/tests/test_benchmark/test_benchmark_integration_with_datasets.py +++ b/tests/test_benchmark/test_benchmark_integration_with_datasets.py @@ -3,6 +3,7 @@ from __future__ import annotations import logging +from pathlib import Path import pytest @@ -18,7 +19,7 @@ @pytest.mark.parametrize("task", TASK_TEST_GRID) @pytest.mark.parametrize("model", [MockNumpyEncoder()]) -def test_benchmark_datasets(task: str | AbsTask, model: mteb.Encoder): +def test_benchmark_datasets(task: str | AbsTask, model: mteb.Encoder, tmp_path: Path): """Test that a task can be fetched and run""" eval = MTEB(tasks=[task]) - eval.run(model, output_folder="tests/results", overwrite_results=True) + eval.run(model, output_folder=tmp_path.as_posix(), overwrite_results=True) diff --git a/tests/test_benchmark/test_benchmark_integration_with_sentencetransformers.py b/tests/test_benchmark/test_benchmark_integration_with_sentencetransformers.py index 4ca0056cd..e79515be5 100644 --- a/tests/test_benchmark/test_benchmark_integration_with_sentencetransformers.py +++ b/tests/test_benchmark/test_benchmark_integration_with_sentencetransformers.py @@ -3,6 +3,7 @@ from __future__ import annotations import logging +from pathlib import Path import pytest from sentence_transformers import SentenceTransformer @@ -22,9 +23,11 @@ "average_word_embeddings_levy_dependency", ], ) -def test_benchmark_sentence_transformer(task: str | AbsTask, model_name: str): +def test_benchmark_sentence_transformer( + task: str | AbsTask, model_name: str, tmp_path: Path +): """Test that a task can be fetched and run""" if isinstance(model_name, str): model = SentenceTransformer(model_name) eval = MTEB(tasks=[task]) - eval.run(model, output_folder="tests/results", overwrite_results=True) + eval.run(model, output_folder=tmp_path.as_posix(), overwrite_results=True) diff --git a/tests/test_cli.py b/tests/test_cli.py index 7c71528f0..130c71bb5 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -50,12 +50,13 @@ def test_run_task( model_name: str, task_name: str, model_revision: str, + tmp_path: Path, ): args = Namespace( model=model_name, tasks=[task_name], model_revision=model_revision, - output_folder="tests/results/test_model", + output_folder=tmp_path.as_posix(), verbosity=3, device=None, categories=None, @@ -71,9 +72,7 @@ def test_run_task( run(args) model_name_as_path = model_name.replace("/", "__").replace(" ", "_") - results_path = Path( - f"tests/results/test_model/{model_name_as_path}/{model_revision}" - ) + results_path = tmp_path / {model_name_as_path} / {model_revision} assert results_path.exists(), "Output folder not created" assert "model_meta.json" in [ f.name for f in list(results_path.glob("*.json")) diff --git a/tests/test_reproducible_workflow.py b/tests/test_reproducible_workflow.py index 1c7536076..1973072ba 100644 --- a/tests/test_reproducible_workflow.py +++ b/tests/test_reproducible_workflow.py @@ -1,6 +1,7 @@ from __future__ import annotations import logging +from pathlib import Path import pytest @@ -18,7 +19,9 @@ @pytest.mark.parametrize("task_name", ["BornholmBitextMining"]) @pytest.mark.parametrize("model_name", ["sentence-transformers/all-MiniLM-L6-v2"]) @pytest.mark.parametrize("model_revision", ["8b3219a92973c328a8e22fadcfa821b5dc75636a"]) -def test_reproducibility_workflow(task_name: str, 
model_name: str, model_revision: str): +def test_reproducibility_workflow( + task_name: str, model_name: str, model_revision: str, tmp_path: Path +): """Test that a model and a task can be fetched and run in a reproducible fashion.""" model_meta = mteb.get_model_meta(model_name, revision=model_revision) task = mteb.get_task(task_name) @@ -30,7 +33,7 @@ def test_reproducibility_workflow(task_name: str, model_name: str, model_revisio assert isinstance(model, Encoder) eval = MTEB(tasks=[task]) - eval.run(model, output_folder="tests/results", overwrite_results=True) + eval.run(model, output_folder=tmp_path.as_posix(), overwrite_results=True) @pytest.mark.parametrize( diff --git a/tests/test_tasks/test_mteb_rerank.py b/tests/test_tasks/test_mteb_rerank.py index 565b00e22..9d3366faa 100644 --- a/tests/test_tasks/test_mteb_rerank.py +++ b/tests/test_tasks/test_mteb_rerank.py @@ -339,7 +339,7 @@ def test_mteb_rerank(tmp_path: Path): eval.run( model, # type: ignore - output_folder="tests/results", + output_folder=tmp_path.as_posix(), overwrite_results=True, eval_splits=["test"], top_k=2, @@ -358,7 +358,7 @@ def test_mteb_rerank(tmp_path: Path): assert "18670" in results["1"] -def test_reranker_same_ndcg1(): +def test_reranker_same_ndcg1(tmp_path: Path): de_name = "average_word_embeddings_komninos" revision = "21eec43590414cb8e3a6f654857abed0483ae36e" de = SentenceTransformer(de_name, revision=revision) @@ -372,32 +372,35 @@ def test_reranker_same_ndcg1(): release_date="2021-04-15", ) eval = MTEB(tasks=mteb.get_tasks(["SciFact"])) + stage1_path = tmp_path / "stage1" eval.run( de, - output_folder="tests/results/stage1", + output_folder=stage1_path.as_posix(), overwrite_results=True, save_predictions=True, eval_splits=["test"], ) + stage2_path = tmp_path / "stage2" eval.run( ce, # type: ignore - output_folder="tests/results/stage2", + output_folder=stage2_path.as_posix(), overwrite_results=True, - previous_results="tests/results/stage1/SciFact_default_predictions.json", + previous_results=(stage1_path / "SciFact_default_predictions.json"), save_predictions=False, eval_splits=["test"], top_k=1, # don't allow it to rerank more than 1 so we can check for top_1 being the same ) # read in stage 1 and stage two and check ndcg@1 is the same - with open( - f"tests/results/stage1/sentence-transformers__{de_name}/{revision}/SciFact.json" - ) as f: + with ( + stage1_path / f"sentence-transformers__{de_name}/{revision}/SciFact.json" + ).open() as f: stage1 = json.load(f) - with open( - f"tests/results/stage2/cross-encoder__ms-marco-TinyBERT-L-2-v2/{ce_revision}/SciFact.json" - ) as f: + with ( + stage2_path + / f"cross-encoder__ms-marco-TinyBERT-L-2-v2/{ce_revision}/SciFact.json" + ).open() as f: stage2 = json.load(f) assert ( From 1e37876e7e25f7838dd18b7d5da182867d2bb7f3 Mon Sep 17 00:00:00 2001 From: sam021313 <40773225+sam021313@users.noreply.github.com> Date: Fri, 17 Jan 2025 16:22:41 +0100 Subject: [PATCH 05/11] skip test for pylate python < 3.10 --- tests/test_benchmark/test_models.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_benchmark/test_models.py b/tests/test_benchmark/test_models.py index 80232df52..996f633ff 100644 --- a/tests/test_benchmark/test_models.py +++ b/tests/test_benchmark/test_models.py @@ -1,5 +1,6 @@ from __future__ import annotations +import sys from pathlib import Path import pytest @@ -11,6 +12,7 @@ from .mock_tasks import MockRetrievalTask +@pytest.mark.skipif(sys.version_info < (3, 10), reason="Requires Python 3.10 or higher") @pytest.mark.parametrize("model", 
["colbert-ir/colbertv2.0"]) @pytest.mark.parametrize("task", [MockRetrievalTask()]) def test_colbert_model_e2e(task: AbsTask, model: str, tmp_path: Path): From 623d043b8feea6ccfd0dca7bae3e0ad4c843ffeb Mon Sep 17 00:00:00 2001 From: sam021313 <40773225+sam021313@users.noreply.github.com> Date: Fri, 17 Jan 2025 16:38:45 +0100 Subject: [PATCH 06/11] fix: tests --- tests/test_cli.py | 2 +- tests/test_tasks/test_mteb_rerank.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/test_cli.py b/tests/test_cli.py index 130c71bb5..518d3c411 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -72,7 +72,7 @@ def test_run_task( run(args) model_name_as_path = model_name.replace("/", "__").replace(" ", "_") - results_path = tmp_path / {model_name_as_path} / {model_revision} + results_path = tmp_path / model_name_as_path / model_revision assert results_path.exists(), "Output folder not created" assert "model_meta.json" in [ f.name for f in list(results_path.glob("*.json")) diff --git a/tests/test_tasks/test_mteb_rerank.py b/tests/test_tasks/test_mteb_rerank.py index 9d3366faa..dd8275e58 100644 --- a/tests/test_tasks/test_mteb_rerank.py +++ b/tests/test_tasks/test_mteb_rerank.py @@ -346,10 +346,11 @@ def test_mteb_rerank(tmp_path: Path): previous_results=tmp_file, save_predictions=True, ) - tmp_file.unlink() # read in the results - with open("tests/results/SciFact_default_predictions.json") as f: + with ( + tmp_path / "SciFact_cross-encoder__ms-marco-TinyBERT-L-2-v2.json" + ).open() as f: results = json.load(f) # check that only the top two results are re-orderd From 1e23caa95827be290b2dda3ec6d76033d0771138 Mon Sep 17 00:00:00 2001 From: sam021313 <40773225+sam021313@users.noreply.github.com> Date: Fri, 17 Jan 2025 21:27:33 +0100 Subject: [PATCH 07/11] fix: tests --- tests/test_tasks/test_mteb_rerank.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/test_tasks/test_mteb_rerank.py b/tests/test_tasks/test_mteb_rerank.py index dd8275e58..9707f203b 100644 --- a/tests/test_tasks/test_mteb_rerank.py +++ b/tests/test_tasks/test_mteb_rerank.py @@ -348,9 +348,7 @@ def test_mteb_rerank(tmp_path: Path): ) # read in the results - with ( - tmp_path / "SciFact_cross-encoder__ms-marco-TinyBERT-L-2-v2.json" - ).open() as f: + with (tmp_path / "SciFact_default_predictions.json").open() as f: results = json.load(f) # check that only the top two results are re-orderd From ea8f6a05bbec46eeb18a44bbbfad4519cfc116e6 Mon Sep 17 00:00:00 2001 From: sam021313 <40773225+sam021313@users.noreply.github.com> Date: Fri, 17 Jan 2025 22:19:12 +0100 Subject: [PATCH 08/11] fix: model meta CrossEncoder --- mteb/evaluation/MTEB.py | 10 ++++++++-- mteb/models/__init__.py | 2 ++ mteb/models/overview.py | 37 ++++++++++++++++++++++++++++++++++++- 3 files changed, 46 insertions(+), 3 deletions(-) diff --git a/mteb/evaluation/MTEB.py b/mteb/evaluation/MTEB.py index 3c94f2478..30d88a521 100644 --- a/mteb/evaluation/MTEB.py +++ b/mteb/evaluation/MTEB.py @@ -20,7 +20,10 @@ from mteb.abstasks.AbsTask import ScoresDict from mteb.encoder_interface import Encoder from mteb.model_meta import ModelMeta -from mteb.models import model_meta_from_sentence_transformers +from mteb.models import ( + model_meta_from_cross_encoder, + model_meta_from_sentence_transformers, +) from ..abstasks.AbsTask import AbsTask from ..load_results.task_results import TaskResult @@ -495,7 +498,10 @@ def create_model_meta(model: Encoder) -> ModelMeta: meta = model.mteb_model_meta # type: ignore else: try: - meta = 
model_meta_from_sentence_transformers(model) # type: ignore + if isinstance(model, CrossEncoder): + meta = model_meta_from_cross_encoder(model) + else: + meta = model_meta_from_sentence_transformers(model) # type: ignore except AttributeError: logger.warning( "Could not find model metadata. Please set the model.mteb_model_meta attribute or if you are using " diff --git a/mteb/models/__init__.py b/mteb/models/__init__.py index 1c70b528c..1389e2398 100644 --- a/mteb/models/__init__.py +++ b/mteb/models/__init__.py @@ -6,6 +6,7 @@ get_model, get_model_meta, get_model_metas, + model_meta_from_cross_encoder, model_meta_from_sentence_transformers, ) from mteb.models.sentence_transformer_wrapper import SentenceTransformerWrapper @@ -17,5 +18,6 @@ "get_model_meta", "get_model_metas", "model_meta_from_sentence_transformers", + "model_meta_from_cross_encoder", "SentenceTransformerWrapper", ] diff --git a/mteb/models/overview.py b/mteb/models/overview.py index e9774cacd..c637e7e7f 100644 --- a/mteb/models/overview.py +++ b/mteb/models/overview.py @@ -6,7 +6,7 @@ from typing import Any from huggingface_hub import ModelCard -from sentence_transformers import SentenceTransformer +from sentence_transformers import CrossEncoder, SentenceTransformer from mteb.abstasks.AbsTask import AbsTask from mteb.encoder_interface import Encoder @@ -164,6 +164,11 @@ def get_model(model_name: str, revision: str | None = None, **kwargs: Any) -> En if not meta.similarity_fn_name: meta.similarity_fn_name = _meta.similarity_fn_name + elif isinstance(model, CrossEncoder): + _meta = model_meta_from_cross_encoder(model.model) + if meta.revision is None: + meta.revision = _meta.revision if _meta.revision else meta.revision + model.mteb_model_meta = meta # type: ignore return model @@ -226,6 +231,36 @@ def model_meta_from_hf_hub(model_name: str) -> ModelMeta: ) +def model_meta_from_cross_encoder(model: CrossEncoder) -> ModelMeta: + try: + name = model.model.name_or_path + # languages = ( + # [model.model_card_data.language] + # if isinstance(model.model_card_data.language, str) + # else model.model_card_data.language + # ) + + meta = ModelMeta( + name=name, + revision=model.config._commit_hash, + release_date=None, + languages=None, + framework=["Sentence Transformers"], + similarity_fn_name=None, + ) + except AttributeError as e: + logger.warning( + f"Failed to extract metadata from model: {e}. Upgrading to sentence-transformers v3.0.0 or above is recommended." 
+ ) + meta = ModelMeta( + name=None, + revision=None, + languages=None, + release_date=None, + ) + return meta + + def model_meta_from_sentence_transformers(model: SentenceTransformer) -> ModelMeta: try: name = ( From 7f6a12d61b24100a1975650ba9dc379f9cb816f1 Mon Sep 17 00:00:00 2001 From: sam021313 <40773225+sam021313@users.noreply.github.com> Date: Fri, 17 Jan 2025 22:19:23 +0100 Subject: [PATCH 09/11] test: model meta --- tests/test_model_meta/test_model_meta.py | 46 ++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 tests/test_model_meta/test_model_meta.py diff --git a/tests/test_model_meta/test_model_meta.py b/tests/test_model_meta/test_model_meta.py new file mode 100644 index 000000000..d4252902b --- /dev/null +++ b/tests/test_model_meta/test_model_meta.py @@ -0,0 +1,46 @@ +from __future__ import annotations + +from pathlib import Path + +import pytest +from sentence_transformers import CrossEncoder, SentenceTransformer + +from mteb import MTEB +from mteb.abstasks import AbsTask +from tests.test_benchmark.mock_tasks import MockRetrievalTask + + +def test_create_model_meta_from_sentence_transformers(tmp_path: Path): + model_name = "sentence-transformers/average_word_embeddings_levy_dependency" + model = SentenceTransformer(model_name) + + meta = MTEB.create_model_meta(model) + + assert meta.name == model_name + assert meta.revision == model.model_card_data.base_model_revision + + +def test_create_model_meta_from_cross_encoder(tmp_path: Path): + model_name = "cross-encoder/ms-marco-TinyBERT-L-2-v2" + + model = CrossEncoder(model_name) + + meta = MTEB.create_model_meta(model) + # model.name_or_path + # _commit_hash + assert meta.name == model_name + assert meta.revision == model.config._commit_hash + + return meta + + +@pytest.mark.parametrize("task", [MockRetrievalTask()]) +def test_output_folder_model_meta(task: AbsTask, tmp_path: Path): + mteb = MTEB(tasks=[task]) + model_name = "cross-encoder/ms-marco-TinyBERT-L-2-v2" + model = CrossEncoder(model_name) + meta = mteb.create_model_meta(model) + output_path = mteb.create_output_folder( + model_meta=meta, output_folder=tmp_path.as_posix() + ) + assert Path(output_path).exists() From cdf4f1bdbd38b8ac264b3c99253721119c1ff6e0 Mon Sep 17 00:00:00 2001 From: sam021313 <40773225+sam021313@users.noreply.github.com> Date: Fri, 17 Jan 2025 22:33:59 +0100 Subject: [PATCH 10/11] update path test --- tests/test_model_meta/test_model_meta.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/test_model_meta/test_model_meta.py b/tests/test_model_meta/test_model_meta.py index d4252902b..993f9dc8c 100644 --- a/tests/test_model_meta/test_model_meta.py +++ b/tests/test_model_meta/test_model_meta.py @@ -10,7 +10,7 @@ from tests.test_benchmark.mock_tasks import MockRetrievalTask -def test_create_model_meta_from_sentence_transformers(tmp_path: Path): +def test_create_model_meta_from_sentence_transformers(): model_name = "sentence-transformers/average_word_embeddings_levy_dependency" model = SentenceTransformer(model_name) @@ -20,7 +20,7 @@ def test_create_model_meta_from_sentence_transformers(tmp_path: Path): assert meta.revision == model.model_card_data.base_model_revision -def test_create_model_meta_from_cross_encoder(tmp_path: Path): +def test_create_model_meta_from_cross_encoder(): model_name = "cross-encoder/ms-marco-TinyBERT-L-2-v2" model = CrossEncoder(model_name) @@ -44,3 +44,8 @@ def test_output_folder_model_meta(task: AbsTask, tmp_path: Path): model_meta=meta, output_folder=tmp_path.as_posix() 
) assert Path(output_path).exists() + assert Path(output_path).is_dir() + assert Path(output_path).name == model.config._commit_hash + assert Path(output_path).parent.name == "cross-encoder__ms-marco-TinyBERT-L-2-v2" + assert Path(output_path).parent.parent == tmp_path + From 2f39f84ac5e72a2d782e004415f45927419570bd Mon Sep 17 00:00:00 2001 From: sam021313 <40773225+sam021313@users.noreply.github.com> Date: Fri, 17 Jan 2025 22:36:54 +0100 Subject: [PATCH 11/11] lint --- tests/test_model_meta/test_model_meta.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_model_meta/test_model_meta.py b/tests/test_model_meta/test_model_meta.py index 993f9dc8c..bd682701e 100644 --- a/tests/test_model_meta/test_model_meta.py +++ b/tests/test_model_meta/test_model_meta.py @@ -48,4 +48,3 @@ def test_output_folder_model_meta(task: AbsTask, tmp_path: Path): assert Path(output_path).name == model.config._commit_hash assert Path(output_path).parent.name == "cross-encoder__ms-marco-TinyBERT-L-2-v2" assert Path(output_path).parent.parent == tmp_path -
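
---

Usage note (illustrative, not one of the patches above): the CrossEncoder
metadata path added in PATCH 08/11 and exercised by the tests in PATCH 09/11
and 10/11 can be driven as sketched below. The model name and the calls simply
mirror the added tests (`tests/test_model_meta/test_model_meta.py`); treat this
as a sketch of the new behavior, not an additional commit.

    from sentence_transformers import CrossEncoder

    from mteb import MTEB

    # MTEB.create_model_meta now detects CrossEncoder instances and dispatches
    # to model_meta_from_cross_encoder (see the PATCH 08/11 change to
    # mteb/evaluation/MTEB.py) instead of the SentenceTransformer path.
    model = CrossEncoder("cross-encoder/ms-marco-TinyBERT-L-2-v2")
    meta = MTEB.create_model_meta(model)

    # As asserted in the new tests: the name comes from model.name_or_path and
    # the revision from the Hugging Face commit hash on the model config.
    assert meta.name == "cross-encoder/ms-marco-TinyBERT-L-2-v2"
    assert meta.revision == model.config._commit_hash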