diff --git a/scripts/tool_benchmarks.py b/scripts/tool_benchmarks.py
index 0f97e2b..918d88f 100644
--- a/scripts/tool_benchmarks.py
+++ b/scripts/tool_benchmarks.py
@@ -161,13 +161,13 @@
     for model_name, model in tests[:-1]:
         rate_limiter = RateLimiter(requests_per_second=1)
 
-    print(f"Benchmarking {task.name} with model: {model_name}")
-    eval_config = task.get_eval_config()
+        print(f"Benchmarking {task.name} with model: {model_name}")
+        eval_config = task.get_eval_config()
 
-    for prompt, prompt_name in prompts:
-        agent_factory = StandardAgentFactory(
-            task, llm, prompt, rate_limiter=rate_limiter
-        )
+        for prompt, prompt_name in prompts:
+            agent_factory = StandardAgentFactory(
+                task, model, prompt, rate_limiter=rate_limiter
+            )
 
             client.run_on_dataset(
                 dataset_name=dataset_name,