ci: run bm25 and ColBERT test in ci #1829

Open
wants to merge 11 commits into base: v2.0.0
2 changes: 1 addition & 1 deletion Makefile
@@ -5,7 +5,7 @@ install:
install-for-tests:
@echo "--- 🚀 Installing project dependencies for test ---"
@echo "This ensures that the project is not installed in editable mode"
pip install ".[dev,speedtask]"
pip install ".[dev,speedtask,bm25s,pylate]"

lint:
@echo "--- 🧹 Running linters ---"
10 changes: 8 additions & 2 deletions mteb/evaluation/MTEB.py
@@ -20,7 +20,10 @@
from mteb.abstasks.AbsTask import ScoresDict
from mteb.encoder_interface import Encoder
from mteb.model_meta import ModelMeta
from mteb.models import model_meta_from_sentence_transformers
from mteb.models import (
model_meta_from_cross_encoder,
model_meta_from_sentence_transformers,
)

from ..abstasks.AbsTask import AbsTask
from ..load_results.task_results import TaskResult
@@ -495,7 +498,10 @@ def create_model_meta(model: Encoder) -> ModelMeta:
meta = model.mteb_model_meta # type: ignore
else:
try:
meta = model_meta_from_sentence_transformers(model) # type: ignore
if isinstance(model, CrossEncoder):
meta = model_meta_from_cross_encoder(model)
else:
meta = model_meta_from_sentence_transformers(model) # type: ignore
except AttributeError:
logger.warning(
"Could not find model metadata. Please set the model.mteb_model_meta attribute or if you are using "
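For context, a minimal sketch (not part of the diff) of the dispatch that create_model_meta now performs; the checkpoint names below are only examples, and both helpers are assumed to be importable from mteb.models as exported by this PR.

from sentence_transformers import CrossEncoder, SentenceTransformer

from mteb.models import (
    model_meta_from_cross_encoder,
    model_meta_from_sentence_transformers,
)

reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
bi_encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# CrossEncoder instances are routed to the new helper ...
print(model_meta_from_cross_encoder(reranker).name)
# ... while SentenceTransformer instances keep the existing code path.
print(model_meta_from_sentence_transformers(bi_encoder).name)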
2 changes: 2 additions & 0 deletions mteb/models/__init__.py
@@ -6,6 +6,7 @@
get_model,
get_model_meta,
get_model_metas,
model_meta_from_cross_encoder,
model_meta_from_sentence_transformers,
)
from mteb.models.sentence_transformer_wrapper import SentenceTransformerWrapper
@@ -17,5 +18,6 @@
"get_model_meta",
"get_model_metas",
"model_meta_from_sentence_transformers",
"model_meta_from_cross_encoder",
"SentenceTransformerWrapper",
]
37 changes: 36 additions & 1 deletion mteb/models/overview.py
@@ -6,7 +6,7 @@
from typing import Any

from huggingface_hub import ModelCard
from sentence_transformers import SentenceTransformer
from sentence_transformers import CrossEncoder, SentenceTransformer

from mteb.abstasks.AbsTask import AbsTask
from mteb.encoder_interface import Encoder
@@ -164,6 +164,11 @@ def get_model(model_name: str, revision: str | None = None, **kwargs: Any) -> En
if not meta.similarity_fn_name:
meta.similarity_fn_name = _meta.similarity_fn_name

elif isinstance(model, CrossEncoder):
_meta = model_meta_from_cross_encoder(model.model)
if meta.revision is None:
meta.revision = _meta.revision if _meta.revision else meta.revision

model.mteb_model_meta = meta # type: ignore
return model

@@ -226,6 +231,36 @@ def model_meta_from_hf_hub(model_name: str) -> ModelMeta:
)


def model_meta_from_cross_encoder(model: CrossEncoder) -> ModelMeta:
try:
name = model.model.name_or_path
# languages = (
# [model.model_card_data.language]
# if isinstance(model.model_card_data.language, str)
# else model.model_card_data.language
# )

meta = ModelMeta(
name=name,
revision=model.config._commit_hash,
release_date=None,
languages=None,
framework=["Sentence Transformers"],
similarity_fn_name=None,
)
except AttributeError as e:
logger.warning(
f"Failed to extract metadata from model: {e}. Upgrading to sentence-transformers v3.0.0 or above is recommended."
)
meta = ModelMeta(
name=None,
revision=None,
languages=None,
release_date=None,
)
return meta


def model_meta_from_sentence_transformers(model: SentenceTransformer) -> ModelMeta:
try:
name = (
51 changes: 25 additions & 26 deletions tests/test_benchmark/test_benchmark.py
@@ -56,33 +56,35 @@ def test_mulitple_mteb_tasks(tasks: list[AbsTask], model: mteb.Encoder, tmp_path
MockTorchbf16Encoder(),
],
)
def test_benchmark_encoders_on_task(task: str | AbsTask, model: mteb.Encoder):
def test_benchmark_encoders_on_task(
task: str | AbsTask, model: mteb.Encoder, tmp_path: Path
):
"""Test that a task can be fetched and run using a variety of encoders"""
if isinstance(task, str):
tasks = mteb.get_tasks(tasks=[task])
else:
tasks = [task]

eval = mteb.MTEB(tasks=tasks)
eval.run(model, output_folder="tests/results", overwrite_results=True)
eval.run(model, output_folder=str(tmp_path))


@pytest.mark.parametrize("task", [MockMultilingualRetrievalTask()])
@pytest.mark.parametrize(
"model",
[MockSentenceTransformer()],
)
def test_run_eval_without_co2_tracking(task: str | AbsTask, model: mteb.Encoder):
def test_run_eval_without_co2_tracking(
task: str | AbsTask, model: mteb.Encoder, tmp_path: Path
):
"""Test that a task can be fetched and run without CO2 tracking"""
if isinstance(task, str):
tasks = mteb.get_tasks(tasks=[task])
else:
tasks = [task]

eval = mteb.MTEB(tasks=tasks)
eval.run(
model, output_folder="tests/results", overwrite_results=True, co2_tracker=False
)
eval.run(model, output_folder=str(tmp_path), co2_tracker=False)


@pytest.mark.parametrize("task", MOCK_TASK_TEST_GRID[:1])
@@ -108,7 +110,7 @@ def test_reload_results(task: str | AbsTask, model: mteb.Encoder, tmp_path: Path


@pytest.mark.parametrize("task_name", MOCK_TASK_TEST_GRID)
def test_prompt_name_passed_to_all_encodes(task_name: str | AbsTask):
def test_prompt_name_passed_to_all_encodes(task_name: str | AbsTask, tmp_path: Path):
"""Test that all tasks correctly pass down the prompt_name to the encoder which supports it, and that the encoder which does not support it does not
receive it.
"""
@@ -141,17 +143,17 @@ def encode(self, sentences, **kwargs):

eval.run(
model,
output_folder="tests/results",
output_folder=str(tmp_path),
overwrite_results=True,
)
# Test that the task_name is not passed down to the encoder
model = EncoderWithoutInstructions("average_word_embeddings_levy_dependency")
assert model.prompts == {}, "The encoder should not have any prompts"
eval.run(model, output_folder="tests/results", overwrite_results=True)
eval.run(model, output_folder=tmp_path.as_posix(), overwrite_results=True)


@pytest.mark.parametrize("task_name", MOCK_TASK_TEST_GRID)
def test_encode_kwargs_passed_to_all_encodes(task_name: str | AbsTask):
def test_encode_kwargs_passed_to_all_encodes(task_name: str | AbsTask, tmp_path: Path):
"""Test that all tasks correctly pass down the encode_kwargs to the encoder."""
my_encode_kwargs = {"no_one_uses_this_args": "but_its_here"}

@@ -175,36 +177,34 @@ def encode(self, sentences, task_name: str | None = None, **kwargs):
model = MockEncoderWithKwargs()
eval.run(
model,
output_folder="tests/results",
output_folder=str(tmp_path),
overwrite_results=True,
encode_kwargs=my_encode_kwargs,
)


@pytest.mark.parametrize("model", [MockNumpyEncoder()])
def test_run_using_benchmark(model: mteb.Encoder):
def test_run_using_benchmark(model: mteb.Encoder, tmp_path: Path):
"""Test that a benchmark object can be run using the MTEB class."""
bench = Benchmark(
name="test_bench", tasks=mteb.get_tasks(tasks=["STS12", "SummEval"])
)

eval = mteb.MTEB(tasks=[bench])
eval.run(
model, output_folder="tests/results", overwrite_results=True
model, output_folder=tmp_path.as_posix(), overwrite_results=True
) # we just want to test that it runs


@pytest.mark.parametrize("model", [MockNumpyEncoder()])
def test_run_using_list_of_benchmark(model: mteb.Encoder):
def test_run_using_list_of_benchmark(model: mteb.Encoder, tmp_path: Path):
"""Test that a list of benchmark objects can be run using the MTEB class."""
bench = [
Benchmark(name="test_bench", tasks=mteb.get_tasks(tasks=["STS12", "SummEval"]))
]

eval = mteb.MTEB(tasks=bench)
eval.run(
model, output_folder="tests/results", overwrite_results=True
) # we just want to test that it runs
eval.run(model, output_folder=str(tmp_path)) # we just want to test that it runs


def test_benchmark_names_must_be_unique():
@@ -229,7 +229,7 @@ def test_get_benchmark(name):
@pytest.mark.parametrize("task", MOCK_TASK_TEST_GRID)
@pytest.mark.parametrize("is_task_name", [True, False])
def test_prompt_name_passed_to_all_encodes_with_prompts(
task: AbsTask | str, is_task_name: bool
task: AbsTask | str, is_task_name: bool, tmp_path: Path
):
"""Test that all tasks and task_types correctly pass down the prompt_name to the encoder with prompts."""
_task_name = task.metadata.name if isinstance(task, AbsTask) else task
@@ -258,8 +258,7 @@ def encode(self, sentences, prompt_name: str | None = None, **kwargs):
)
eval.run(
model,
output_folder="tests/results",
overwrite_results=True,
output_folder=str(tmp_path),
)

class MockEncoderWithExistingPrompts(mteb.Encoder):
@@ -275,7 +274,7 @@ def encode(self, sentences, prompt_name: str | None = None, **kwargs):
model = MockSentenceTransformerWrapper(MockEncoderWithExistingPrompts())
eval.run(
model,
output_folder="tests/results",
output_folder=str(tmp_path),
overwrite_results=True,
)

@@ -292,7 +291,9 @@ def encode(self, sentences, prompt_name: str | None = None, **kwargs):
],
)
@pytest.mark.parametrize("is_task_name", [True, False])
def test_model_query_passage_prompts_task_type(task: AbsTask | str, is_task_name: bool):
def test_model_query_passage_prompts_task_type(
task: AbsTask | str, is_task_name: bool, tmp_path: Path
):
"""Test that the model with prompts is correctly called."""
tasks = [task]

@@ -331,8 +332,7 @@ def encode(self, sentences, prompt_name: str | None = None, *args, **kwargs):
eval.run(
model,
model_prompts=prompt_list,
output_folder="tests/results",
overwrite_results=True,
output_folder=str(tmp_path),
)
model = MockSentenceTransformerWrapper(
MockSentenceEncoderWithPrompts(), model_prompts=prompt_list
@@ -341,6 +341,5 @@
eval.run(
model,
model_prompts=prompt_list,
output_folder="tests/results",
overwrite_results=True,
output_folder=str(tmp_path),
)
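The change applied throughout this file follows one pattern, sketched below under the assumption that the mock helpers used above (MockNumpyEncoder, MockRetrievalTask) are importable from the neighbouring mock modules: pytest's built-in tmp_path fixture gives every test its own fresh directory, so the shared tests/results folder and the overwrite_results=True flag are no longer needed.

from pathlib import Path

import mteb

from .mock_models import MockNumpyEncoder  # assumed local mock module
from .mock_tasks import MockRetrievalTask  # assumed local mock module


def test_runs_in_isolated_folder(tmp_path: Path):
    evaluation = mteb.MTEB(tasks=[MockRetrievalTask()])
    # Each (parametrized) test invocation receives a distinct tmp_path, so
    # results from one case cannot shadow another and no cleanup is required.
    evaluation.run(MockNumpyEncoder(), output_folder=str(tmp_path))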
@@ -3,6 +3,7 @@
from __future__ import annotations

import logging
from pathlib import Path

import pytest

@@ -18,7 +19,7 @@

@pytest.mark.parametrize("task", TASK_TEST_GRID)
@pytest.mark.parametrize("model", [MockNumpyEncoder()])
def test_benchmark_datasets(task: str | AbsTask, model: mteb.Encoder):
def test_benchmark_datasets(task: str | AbsTask, model: mteb.Encoder, tmp_path: Path):
"""Test that a task can be fetched and run"""
eval = MTEB(tasks=[task])
eval.run(model, output_folder="tests/results", overwrite_results=True)
eval.run(model, output_folder=tmp_path.as_posix(), overwrite_results=True)
@@ -3,6 +3,7 @@
from __future__ import annotations

import logging
from pathlib import Path

import pytest
from sentence_transformers import SentenceTransformer
@@ -22,9 +23,11 @@
"average_word_embeddings_levy_dependency",
],
)
def test_benchmark_sentence_transformer(task: str | AbsTask, model_name: str):
def test_benchmark_sentence_transformer(
task: str | AbsTask, model_name: str, tmp_path: Path
):
"""Test that a task can be fetched and run"""
if isinstance(model_name, str):
model = SentenceTransformer(model_name)
eval = MTEB(tasks=[task])
eval.run(model, output_folder="tests/results", overwrite_results=True)
eval.run(model, output_folder=tmp_path.as_posix(), overwrite_results=True)
13 changes: 10 additions & 3 deletions tests/test_benchmark/test_models.py
@@ -1,5 +1,8 @@
from __future__ import annotations

import sys
from pathlib import Path

import pytest

import mteb
@@ -9,9 +12,10 @@
from .mock_tasks import MockRetrievalTask


@pytest.mark.skipif(sys.version_info < (3, 10), reason="Requires Python 3.10 or higher")
@pytest.mark.parametrize("model", ["colbert-ir/colbertv2.0"])
@pytest.mark.parametrize("task", [MockRetrievalTask()])
def test_colbert_model_e2e(task: AbsTask, model: str):
def test_colbert_model_e2e(task: AbsTask, model: str, tmp_path: Path):
pytest.importorskip("pylate", reason="pylate not installed")
eval_splits = ["test"]
model = mteb.get_model(model)
@@ -21,13 +25,14 @@ def test_colbert_model_e2e(task: AbsTask, model: str):
model,
eval_splits=eval_splits,
corpus_chunk_size=500,
output_folder=str(tmp_path),
)
result = results[0]

assert result.scores["test"][0]["ndcg_at_1"] == 1.0


def test_bm25s_e2e():
def test_bm25s_e2e(tmp_path: Path):
# fails for dataset smaller then 1000
pytest.importorskip("bm25s", reason="bm25s not installed")
pytest.importorskip("Stemmer", reason="PyStemmer not installed")
@@ -38,7 +43,9 @@ def test_bm25s_e2e():

evaluation = MTEB(tasks=tasks)

results = evaluation.run(model, eval_splits=eval_splits)
results = evaluation.run(
model, eval_splits=eval_splits, output_folder=str(tmp_path)
)
result = results[0]

assert result.scores["test"][0]["ndcg_at_1"] == 0.42879
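The gating used for these optional-backend tests is summarised in the sketch below; the retrieval task name is illustrative (the real tests use a mock task), while the model name and skip conditions mirror the diff above. importorskip turns a missing extra into a skipped test rather than a CI failure.

import sys
from pathlib import Path

import pytest

import mteb


@pytest.mark.skipif(sys.version_info < (3, 10), reason="Requires Python 3.10 or higher")
def test_optional_backend(tmp_path: Path):
    pytest.importorskip("pylate", reason="pylate not installed")  # skip, don't fail

    model = mteb.get_model("colbert-ir/colbertv2.0")
    evaluation = mteb.MTEB(tasks=mteb.get_tasks(tasks=["NFCorpus"]))
    # Results land in a per-test directory, keeping CI runs independent.
    evaluation.run(model, eval_splits=["test"], output_folder=str(tmp_path))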
7 changes: 3 additions & 4 deletions tests/test_cli.py
@@ -50,12 +50,13 @@ def test_run_task(
model_name: str,
task_name: str,
model_revision: str,
tmp_path: Path,
):
args = Namespace(
model=model_name,
tasks=[task_name],
model_revision=model_revision,
output_folder="tests/results/test_model",
output_folder=tmp_path.as_posix(),
verbosity=3,
device=None,
categories=None,
@@ -71,9 +72,7 @@
run(args)

model_name_as_path = model_name.replace("/", "__").replace(" ", "_")
results_path = Path(
f"tests/results/test_model/{model_name_as_path}/{model_revision}"
)
results_path = tmp_path / model_name_as_path / model_revision
assert results_path.exists(), "Output folder not created"
assert "model_meta.json" in [
f.name for f in list(results_path.glob("*.json"))