From 61a0a08895ffb28a81c50cdcf840b10a2396fe54 Mon Sep 17 00:00:00 2001
From: Simon Edwardsson
Date: Fri, 28 Jul 2023 12:04:12 +0100
Subject: [PATCH] added PipeEvaluator to run fast and cheap evaluators first
 with expensive ones as backup

---
 README.md                      |  6 ++++
 benchllm/cli/utils.py          |  6 ++++
 benchllm/evaluator/__init__.py |  9 +++++-
 benchllm/evaluator/pipe.py     | 19 ++++++++++++
 test/evaulator/test_pipe.py    | 53 ++++++++++++++++++++++++++++++++++
 5 files changed, 92 insertions(+), 1 deletion(-)
 create mode 100644 benchllm/evaluator/pipe.py
 create mode 100644 test/evaulator/test_pipe.py

diff --git a/README.md b/README.md
index a4a1672..2b1c88a 100644
--- a/README.md
+++ b/README.md
@@ -115,6 +115,12 @@ The non interactive evaluators also supports `--workers N` to run in the evaluat
 $ bench run --evaluator string-match --workers 5
 ```
 
+It is also possible to combine multiple evaluators, e.g. first run string-match and, only if that does not match, fall back to semantic similarity. This speeds up the evaluation process and can help save on API calls.
+
+```bash
+$ bench run --evaluator string-match,embedding,semantic
+```
+
 To accelerate the evaluation process, BenchLLM uses a cache. If a (prediction, expected) pair has been evaluated in the past and a cache was used, the evaluation output will be saved for future evaluations. There are several types of caches:
 
 - `memory`, only caches output values during the current run. This is particularly useful when running with `--retry-count N`
diff --git a/benchllm/cli/utils.py b/benchllm/cli/utils.py
index e5f550d..03a3600 100644
--- a/benchllm/cli/utils.py
+++ b/benchllm/cli/utils.py
@@ -6,6 +6,7 @@
 from benchllm.evaluator import (
     EmbeddingEvaluator,
     Evaluator,
+    PipeEvaluator,
     SemanticEvaluator,
     StringMatchEvaluator,
 )
@@ -24,6 +25,11 @@ def output_dir_factory() -> Path:
 
 
 def get_evaluator(evaluator_name: str, model: str, workers: int) -> Evaluator:
+    if "," in evaluator_name:
+        evaluator_names = evaluator_name.split(",")
+        return PipeEvaluator(
+            workers=workers, evaluators=[get_evaluator(name, model, workers) for name in evaluator_names]
+        )
     if evaluator_name == "semantic":
         return SemanticEvaluator(model=model, workers=workers)
     elif evaluator_name == "interactive":
diff --git a/benchllm/evaluator/__init__.py b/benchllm/evaluator/__init__.py
index b1c3edc..e99e842 100644
--- a/benchllm/evaluator/__init__.py
+++ b/benchllm/evaluator/__init__.py
@@ -1,5 +1,12 @@
-from benchllm.evaluator.evaluator import Evaluator  # noqa # Adding an empty comment to force import order to avoid circular imports
+from benchllm.evaluator.evaluator import Evaluator  # noqa
+
+
+def noop():
+    pass
+
+
 from benchllm.evaluator.embedding import EmbeddingEvaluator  # noqa
+from benchllm.evaluator.pipe import PipeEvaluator
 from benchllm.evaluator.semantic import SemanticEvaluator  # noqa
 from benchllm.evaluator.string_match import StringMatchEvaluator  # noqa
 
diff --git a/benchllm/evaluator/pipe.py b/benchllm/evaluator/pipe.py
new file mode 100644
index 0000000..c554352
--- /dev/null
+++ b/benchllm/evaluator/pipe.py
@@ -0,0 +1,19 @@
+from benchllm.data_types import Prediction
+from benchllm.evaluator import Evaluator
+
+
+class PipeEvaluator(Evaluator):
+    def __init__(self, evaluators: list[Evaluator], workers: int = 1):
+        super().__init__(workers=workers)
+        self._evaluators: list[Evaluator] = evaluators
+
+    def evaluate_prediction(self, prediction: Prediction) -> list[Evaluator.Candidate]:
+        candidates = []
+        for evaluator in self._evaluators:
+            candidates = evaluator.evaluate_prediction(prediction)
+            # only return passed candidates so negative predictions don't get cached,
+            # since they might still pass further down the chain
+            passed_candidates = [candidate for candidate in candidates if candidate.passed]
+            if passed_candidates:
+                return passed_candidates
+        return candidates
diff --git a/test/evaulator/test_pipe.py b/test/evaulator/test_pipe.py
new file mode 100644
index 0000000..904e4f9
--- /dev/null
+++ b/test/evaulator/test_pipe.py
@@ -0,0 +1,53 @@
+from benchllm import Prediction, Test
+from benchllm.data_types import FunctionID
+from benchllm.evaluator import Evaluator, PipeEvaluator
+
+PREDICTION_EXAMPLE = Prediction(
+    test=Test(input="Who are you?", expected=["Yoda I am.", "Yoda"]),
+    output="I am Yoda.",
+    time_elapsed=0,
+    function_id=FunctionID.default(),
+)
+
+
+class HardCoded(Evaluator):
+    def __init__(self, passes: list[bool]):
+        super().__init__(workers=1)
+        self.passes = passes
+
+    def evaluate_prediction(self, prediction: Prediction) -> list[Evaluator.Candidate]:
+        assert len(self.passes) == len(prediction.test.expected)
+        candidates = []
+        for passed, expected in zip(self.passes, prediction.test.expected):
+            candidates.append(
+                Evaluator.Candidate(
+                    prediction=prediction.output, expected=expected, score=1.0 if passed else 0, passed=passed
+                )
+            )
+        return candidates
+
+
+def test_evaluator_runs_all_evaluators():
+    pipe = PipeEvaluator([HardCoded([False, False]), HardCoded([True, False])])
+    pipe.load([PREDICTION_EXAMPLE])
+    evaluations = pipe.run()
+    assert len(evaluations) == 1
+    assert evaluations[0].passed
+
+
+def test_evaluator_runs_all_evaluators_with_all_failures():
+    pipe = PipeEvaluator([HardCoded([False, False]), HardCoded([False, False])])
+    pipe.load([PREDICTION_EXAMPLE])
+
+    evaluations = pipe.run()
+    assert len(evaluations) == 1
+    assert not evaluations[0].passed
+
+
+def test_evaluator_runs_all_evaluators_with_early_stopping():
+    pipe = PipeEvaluator([HardCoded([False, True]), HardCoded([])])
+    pipe.load([PREDICTION_EXAMPLE])
+
+    evaluations = pipe.run()
+    assert len(evaluations) == 1
+    assert evaluations[0].passed
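
For reviewers who want to exercise the new evaluator outside the CLI, below is a minimal sketch of chaining a cheap evaluator with a more expensive one programmatically. It only reuses names that appear in this patch (`PipeEvaluator`, `StringMatchEvaluator`, `SemanticEvaluator`, `Prediction`, `Test`, `FunctionID`, `load`, `run`); the `StringMatchEvaluator()` default constructor and the `model` value passed to `SemanticEvaluator` are illustrative assumptions, not part of the patch.

```python
# Sketch only: chain a cheap evaluator with an expensive one, mirroring what
# `bench run --evaluator string-match,semantic` builds via get_evaluator().
from benchllm import Prediction, Test
from benchllm.data_types import FunctionID
from benchllm.evaluator import PipeEvaluator, SemanticEvaluator, StringMatchEvaluator

# Same example prediction used in test/evaulator/test_pipe.py.
prediction = Prediction(
    test=Test(input="Who are you?", expected=["Yoda I am.", "Yoda"]),
    output="I am Yoda.",
    time_elapsed=0,
    function_id=FunctionID.default(),
)

pipe = PipeEvaluator(
    evaluators=[
        StringMatchEvaluator(),  # fast, free check runs first (default constructor assumed)
        SemanticEvaluator(model="gpt-3", workers=1),  # model name is illustrative
    ],
    workers=1,
)
pipe.load([prediction])
evaluations = pipe.run()
print(evaluations[0].passed)
```

Because `evaluate_prediction` returns as soon as an evaluator yields a passing candidate, the semantic (API-backed) evaluator is only invoked for predictions the string matcher rejects, which is where the API-call savings mentioned in the README addition come from.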