From 61a0a08895ffb28a81c50cdcf840b10a2396fe54 Mon Sep 17 00:00:00 2001
From: Simon Edwardsson
Date: Fri, 28 Jul 2023 12:04:12 +0100
Subject: [PATCH] added PipeEvaluator to run fast and cheap evaluators first
 with expensive ones as backup

---
 README.md                      |  6 ++++
 benchllm/cli/utils.py          |  6 ++++
 benchllm/evaluator/__init__.py |  9 +++++-
 benchllm/evaluator/pipe.py     | 19 ++++++++++++
 test/evaulator/test_pipe.py    | 53 ++++++++++++++++++++++++++++++++++
 5 files changed, 92 insertions(+), 1 deletion(-)
 create mode 100644 benchllm/evaluator/pipe.py
 create mode 100644 test/evaulator/test_pipe.py

diff --git a/README.md b/README.md
index a4a1672..2b1c88a 100644
--- a/README.md
+++ b/README.md
@@ -115,6 +115,12 @@ The non interactive evaluators also supports `--workers N` to run in the evaluat
 $ bench run --evaluator string-match --workers 5
 ```
 
+It is also possible to combine multiple evaluators, e.g. first run string-match and, only if that does not match, fall back to semantic similarity. This speeds up the evaluation process and can help save on API calls.
+
+```bash
+$ bench run --evaluator string-match,embedding,semantic
+```
+
 To accelerate the evaluation process, BenchLLM uses a cache. If a (prediction, expected) pair has been evaluated in the past and a cache was used, the evaluation output will be saved for future evaluations. There are several types of caches:
 
 - `memory`, only caches output values during the current run. This is particularly useful when running with `--retry-count N`
diff --git a/benchllm/cli/utils.py b/benchllm/cli/utils.py
index e5f550d..03a3600 100644
--- a/benchllm/cli/utils.py
+++ b/benchllm/cli/utils.py
@@ -6,6 +6,7 @@
 from benchllm.evaluator import (
     EmbeddingEvaluator,
     Evaluator,
+    PipeEvaluator,
     SemanticEvaluator,
     StringMatchEvaluator,
 )
@@ -24,6 +25,11 @@ def output_dir_factory() -> Path:
 
 
 def get_evaluator(evaluator_name: str, model: str, workers: int) -> Evaluator:
+    if "," in evaluator_name:
+        evaluator_names = evaluator_name.split(",")
+        return PipeEvaluator(
+            workers=workers, evaluators=[get_evaluator(name, model, workers) for name in evaluator_names]
+        )
     if evaluator_name == "semantic":
         return SemanticEvaluator(model=model, workers=workers)
     elif evaluator_name == "interactive":
diff --git a/benchllm/evaluator/__init__.py b/benchllm/evaluator/__init__.py
index b1c3edc..e99e842 100644
--- a/benchllm/evaluator/__init__.py
+++ b/benchllm/evaluator/__init__.py
@@ -1,5 +1,12 @@
-from benchllm.evaluator.evaluator import Evaluator  # noqa # Adding an empty comment to force import order to avoid circular imports
+from benchllm.evaluator.evaluator import Evaluator  # noqa
+
+
+def noop():
+    pass
+
+
 from benchllm.evaluator.embedding import EmbeddingEvaluator  # noqa
+from benchllm.evaluator.pipe import PipeEvaluator
 from benchllm.evaluator.semantic import SemanticEvaluator  # noqa
 from benchllm.evaluator.string_match import StringMatchEvaluator  # noqa
 
diff --git a/benchllm/evaluator/pipe.py b/benchllm/evaluator/pipe.py
new file mode 100644
index 0000000..c554352
--- /dev/null
+++ b/benchllm/evaluator/pipe.py
@@ -0,0 +1,19 @@
+from benchllm.data_types import Prediction
+from benchllm.evaluator import Evaluator
+
+
+class PipeEvaluator(Evaluator):
+    def __init__(self, evaluators: list[Evaluator], workers: int = 1):
+        super().__init__(workers=workers)
+        self._evaluators: list[Evaluator] = evaluators
+
+    def evaluate_prediction(self, prediction: Prediction) -> list[Evaluator.Candidate]:
+        candidates = []
+        for evaluator in self._evaluators:
+            candidates = evaluator.evaluate_prediction(prediction)
+            # only return passed candidates so negative predictions don't get cached,
+            # since they might still pass further down the chain
+            passed_candidates = [candidate for candidate in candidates if candidate.passed]
+            if passed_candidates:
+                return passed_candidates
+        return candidates
diff --git a/test/evaulator/test_pipe.py b/test/evaulator/test_pipe.py
new file mode 100644
index 0000000..904e4f9
--- /dev/null
+++ b/test/evaulator/test_pipe.py
@@ -0,0 +1,53 @@
+from benchllm import Prediction, Test
+from benchllm.data_types import FunctionID
+from benchllm.evaluator import Evaluator, PipeEvaluator
+
+PREDICTION_EXAMPLE = Prediction(
+    test=Test(input="Who are you?", expected=["Yoda I am.", "Yoda"]),
+    output="I am Yoda.",
+    time_elapsed=0,
+    function_id=FunctionID.default(),
+)
+
+
+class HardCoded(Evaluator):
+    def __init__(self, passes: list[bool]):
+        super().__init__(workers=1)
+        self.passes = passes
+
+    def evaluate_prediction(self, prediction: Prediction) -> list[Evaluator.Candidate]:
+        assert len(self.passes) == len(prediction.test.expected)
+        candidates = []
+        for passed, expected in zip(self.passes, prediction.test.expected):
+            candidates.append(
+                Evaluator.Candidate(
+                    prediction=prediction.output, expected=expected, score=1.0 if passed else 0, passed=passed
+                )
+            )
+        return candidates
+
+
+def test_evaluator_runs_all_evaluators():
+    pipe = PipeEvaluator([HardCoded([False, False]), HardCoded([True, False])])
+    pipe.load([PREDICTION_EXAMPLE])
+    evaluations = pipe.run()
+    assert len(evaluations) == 1
+    assert evaluations[0].passed
+
+
+def test_evaluator_runs_all_evaluators_with_all_failures():
+    pipe = PipeEvaluator([HardCoded([False, False]), HardCoded([False, False])])
+    pipe.load([PREDICTION_EXAMPLE])
+
+    evaluations = pipe.run()
+    assert len(evaluations) == 1
+    assert not evaluations[0].passed
+
+
+def test_evaluator_runs_all_evaluators_with_early_stopping():
+    pipe = PipeEvaluator([HardCoded([False, True]), HardCoded([])])
+    pipe.load([PREDICTION_EXAMPLE])
+
+    evaluations = pipe.run()
+    assert len(evaluations) == 1
+    assert evaluations[0].passed
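
For reviewers who want to exercise the new evaluator outside the CLI, below is a minimal sketch of chaining a cheap evaluator with a more expensive one programmatically. It only reuses names that appear in this patch (`PipeEvaluator`, `StringMatchEvaluator`, `SemanticEvaluator`, `Prediction`, `Test`, `FunctionID`, `load`, `run`); the `StringMatchEvaluator()` default constructor and the `model` value passed to `SemanticEvaluator` are illustrative assumptions, not part of the patch.

```python
# Sketch only: chain a cheap evaluator with an expensive one, mirroring what
# `bench run --evaluator string-match,semantic` builds via get_evaluator().
from benchllm import Prediction, Test
from benchllm.data_types import FunctionID
from benchllm.evaluator import PipeEvaluator, SemanticEvaluator, StringMatchEvaluator

# Same example prediction used in test/evaulator/test_pipe.py.
prediction = Prediction(
    test=Test(input="Who are you?", expected=["Yoda I am.", "Yoda"]),
    output="I am Yoda.",
    time_elapsed=0,
    function_id=FunctionID.default(),
)

pipe = PipeEvaluator(
    evaluators=[
        StringMatchEvaluator(),  # fast, free check runs first (default constructor assumed)
        SemanticEvaluator(model="gpt-3", workers=1),  # model name is illustrative
    ],
    workers=1,
)
pipe.load([prediction])
evaluations = pipe.run()
print(evaluations[0].passed)
```

Because `evaluate_prediction` returns as soon as an evaluator yields a passing candidate, the semantic (API-backed) evaluator is only invoked for predictions the string matcher rejects, which is where the API-call savings mentioned in the README addition come from.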