Adds PipeEvaluator #8

Open · wants to merge 1 commit into base: main
6 changes: 6 additions & 0 deletions README.md
@@ -115,6 +115,12 @@ The non interactive evaluators also supports `--workers N` to run in the evaluat
$ bench run --evaluator string-match --workers 5
```

It is also possible to combine multiple evaluators, e.g. first do string-match, and only if that does not match, fall back to semantic similarity. This speeds up the evaluation process and can help save on API calls.

```bash
$ bench run --evaluator string-match,embedding,semantic
```

To accelerate the evaluation process, BenchLLM uses a cache. If a (prediction, expected) pair has been evaluated in the past and a cache was used, the evaluation output will be saved for future evaluations. There are several types of caches:

- `memory`, only caches output values during the current run. This is particularly useful when running with `--retry-count N`
6 changes: 6 additions & 0 deletions benchllm/cli/utils.py
@@ -6,6 +6,7 @@
from benchllm.evaluator import (
EmbeddingEvaluator,
Evaluator,
PipeEvaluator,
SemanticEvaluator,
StringMatchEvaluator,
)
@@ -24,6 +25,11 @@ def output_dir_factory() -> Path:


def get_evaluator(evaluator_name: str, model: str, workers: int) -> Evaluator:
    if "," in evaluator_name:
        evaluator_names = evaluator_name.split(",")
        return PipeEvaluator(
            workers=workers, evaluators=[get_evaluator(name, model, workers) for name in evaluator_names]
        )
    if evaluator_name == "semantic":
        return SemanticEvaluator(model=model, workers=workers)
    elif evaluator_name == "interactive":
9 changes: 8 additions & 1 deletion benchllm/evaluator/__init__.py
@@ -1,5 +1,12 @@
from benchllm.evaluator.evaluator import Evaluator # noqa
# Adding an empty comment to force import order to avoid circular imports
from benchllm.evaluator.evaluator import Evaluator # noqa


def noop():
    pass
Comment on lines +5 to +6

Member

Is this used anywhere else? If this is to prevent auto-ordering these lines, we have used the `# Adding an empty comment to force import order to avoid circular imports` comment above; we could potentially use that again?

from benchllm.evaluator.embedding import EmbeddingEvaluator # noqa
from benchllm.evaluator.pipe import PipeEvaluator
from benchllm.evaluator.semantic import SemanticEvaluator # noqa
from benchllm.evaluator.string_match import StringMatchEvaluator # noqa
19 changes: 19 additions & 0 deletions benchllm/evaluator/pipe.py
@@ -0,0 +1,19 @@
from benchllm.data_types import Prediction
from benchllm.evaluator import Evaluator


class PipeEvaluator(Evaluator):
    def __init__(self, evaluators: list[Evaluator], workers: int = 1):
        super().__init__(workers=workers)
        self._evaluators: list[Evaluator] = evaluators

    def evaluate_prediction(self, prediction: Prediction) -> list[Evaluator.Candidate]:
        candidates = []
        for evaluator in self._evaluators:
            candidates = evaluator.evaluate_prediction(prediction)
            # only return passed candidates so negative predictions don't get cached,
            # since they might still pass further down the chain
            passed_candidates = [candidate for candidate in candidates if candidate.passed]
            if passed_candidates:
                return passed_candidates
        return candidates
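
For readers of the diff, here is a minimal usage sketch (not part of this PR) of how `PipeEvaluator` chains evaluators programmatically, equivalent to the `string-match,semantic` CLI form in the README change above. The `StringMatchEvaluator()` and `SemanticEvaluator(model=...)` constructor calls and the model name are assumptions inferred from `get_evaluator` in `benchllm/cli/utils.py`.

```python
# Hedged sketch, not part of the diff: chain a cheap exact-match check before
# the LLM-backed semantic check, so the semantic evaluator (and its API calls)
# is only consulted when string matching fails.
# Constructor signatures and the model name are assumptions based on
# get_evaluator() in benchllm/cli/utils.py.
from benchllm.evaluator import PipeEvaluator, SemanticEvaluator, StringMatchEvaluator

pipe = PipeEvaluator(
    evaluators=[
        StringMatchEvaluator(workers=2),
        SemanticEvaluator(model="gpt-3", workers=2),
    ],
    workers=2,
)

# Same Evaluator API as the tests below:
# pipe.load(predictions)
# evaluations = pipe.run()
```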
53 changes: 53 additions & 0 deletions test/evaulator/test_pipe.py
@@ -0,0 +1,53 @@
from benchllm import Prediction, Test
from benchllm.data_types import FunctionID
from benchllm.evaluator import Evaluator, PipeEvaluator

PREDICTION_EXAMPLE = Prediction(
    test=Test(input="Who are you?", expected=["Yoda I am.", "Yoda"]),
    output="I am Yoda.",
    time_elapsed=0,
    function_id=FunctionID.default(),
)


class HardCoded(Evaluator):
    def __init__(self, passes: list[bool]):
        super().__init__(workers=1)
        self.passes = passes

    def evaluate_prediction(self, prediction: Prediction) -> list[Evaluator.Candidate]:
        assert len(self.passes) == len(prediction.test.expected)
        candidates = []
        for passed, expected in zip(self.passes, prediction.test.expected):
            candidates.append(
                Evaluator.Candidate(
                    prediction=prediction.output, expected=expected, score=1.0 if passed else 0, passed=passed
                )
            )
        return candidates


def test_evaluator_runs_all_evaluators():
    pipe = PipeEvaluator([HardCoded([False, False]), HardCoded([True, False])])
    pipe.load([PREDICTION_EXAMPLE])
    evaluations = pipe.run()
    assert len(evaluations) == 1
    assert evaluations[0].passed


def test_evaluator_runs_all_evaluators_with_all_failures():
    pipe = PipeEvaluator([HardCoded([False, False]), HardCoded([False, False])])
    pipe.load([PREDICTION_EXAMPLE])

    evaluations = pipe.run()
    assert len(evaluations) == 1
    assert not evaluations[0].passed


def test_evaluator_runs_all_evaluators_with_early_stopping():
    pipe = PipeEvaluator([HardCoded([False, True]), HardCoded([])])
    pipe.load([PREDICTION_EXAMPLE])

    evaluations = pipe.run()
    assert len(evaluations) == 1
    assert evaluations[0].passed