diff --git a/scripts/tool_benchmarks.py b/scripts/tool_benchmarks.py
index 9da8e8c..b913cf0 100644
--- a/scripts/tool_benchmarks.py
+++ b/scripts/tool_benchmarks.py
@@ -10,13 +10,17 @@
 from langchain_core.prompts.few_shot import FewShotChatMessagePromptTemplate
 from langchain_openai import OpenAIEmbeddings
 from langsmith.client import Client
-from langchain_benchmarks import __version__, registry
+#from langchain_benchmarks import __version__, registry
 from langchain_benchmarks.rate_limiting import RateLimiter
 from langchain_benchmarks.tool_usage.agents import StandardAgentFactory
+import sys
+sys.path.append("./..")
+from langchain_benchmarks import __version__, registry
 from langchain_benchmarks.tool_usage.tasks.multiverse_math import *
 from langchain.chat_models import init_chat_model
 from langsmith.evaluation import evaluate
 from langchain.agents import AgentExecutor, create_tool_calling_agent
+from langchain.tools import tool
 
 tests = [
     ("claude-3-haiku-20240307","anthropic",),
@@ -28,8 +32,8 @@
     ("gpt-4o-mini","openai"),
     ("llama3-groq-70b-8192-tool-use-preview","groq"),
     ("llama3-groq-8b-8192-tool-use-preview","groq"),
-    ("gemini-1.5-pro","google_vertexai"),
-    ("gemini-1.5-flash","google_vertexai")
+    ("gemini-1.5-pro","google_genai"),
+    ("gemini-1.5-flash","google_genai")
 ]
 
 client = Client() # Launch langsmith client for cloning datasets
@@ -150,9 +154,13 @@ def get_prompts(task_name, **kwargs):
 
 def predict_from_callable(callable,instructions):
     def predict(run):
-        return callable.invoke({"question":run['question'],"instructions":instructions})['output']
+        return callable.invoke({"question":run['question'],"instructions":instructions})
     return predict
 
+def pi(a: float) -> float:
+    """Returns a precise value of PI for this alternate universe."""
+    return math.e
+
experiment_uuid = uuid.uuid4().hex[:4]
 today = datetime.date.today().isoformat()
 for task in registry.tasks:
@@ -165,7 +173,7 @@ def predict(run):
         few_shot_str, few_shot_three_str = get_few_shot_str_from_messages(few_shot_messages,few_shot_three_messages)
         prompts = get_prompts(task.name,examples=examples,few_shot_three_messages=few_shot_three_messages,few_shot_three_str=few_shot_three_str)
 
-    for model_name, model_provider in tests:
+    for model_name, model_provider in tests[9:]:
         model = init_chat_model(model_name,model_provider=model_provider)
         rate_limiter = RateLimiter(requests_per_second=1)
 
@@ -175,9 +183,14 @@ def predict(run):
         for prompt, prompt_name in prompts[:-1]:
 
             tools = task.create_environment().tools
+            if "google" in model_provider:
+                tools[9] = tool(pi)
             agent = create_tool_calling_agent(model, tools, prompt)
-            agent_executor = AgentExecutor(agent=agent, tools=tools)
+            agent_executor = AgentExecutor(agent=agent, tools=tools, return_intermediate_steps=True)
+
+            '''
+            # Legacy way of running, migrate to evaluate
 
             agent_factory = StandardAgentFactory(
                 task, model, prompt, rate_limiter=rate_limiter
             )
@@ -197,11 +210,10 @@ def predict(run):
                 },
             )
             '''
-            print(agent_executor.invoke({"question":"placeholder question","instructions":task.instructions}))
             evaluate(
                 predict_from_callable(agent_executor,task.instructions),
                 data=dataset_name,
-                evaluators=eval_config.evaluators,
+                evaluators=eval_config.custom_evaluators,
                 max_concurrency=5,
                 metadata={
                     "model": model_name,
@@ -210,6 +222,7 @@ def predict(run):
                     "date": today,
                     "langchain_benchmarks_version": __version__,
                 },
+                experiment_prefix=f"{model_name}-{task.name}-{prompt_name}"
             )
 
-    
+    
\ No newline at end of file
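
Note on the tools[9] = tool(pi) override in the patch: it swaps the environment's zero-argument pi tool for one that accepts a dummy float, and since the override only fires for the google_* providers, the likely reason is that those backends reject tool schemas that declare no parameters. Below is a minimal standalone sketch of the swapped-in tool, assuming multiverse_math keeps pi at index 9 as the diff implies; it is illustrative, not part of the patch.

import math
from langchain.tools import tool

def pi(a: float) -> float:
    """Returns a precise value of PI for this alternate universe."""
    # The argument is intentionally unused: it keeps the generated JSON
    # schema non-empty for providers that reject zero-parameter tools.
    return math.e

pi_tool = tool(pi)                 # wrap the plain function as a tool
print(pi_tool.name)                # -> "pi"
print(pi_tool.invoke({"a": 0.0}))  # -> 2.718281828459045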
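
Similarly, the predict_from_callable change (dropping ['output']) pairs with return_intermediate_steps=True on the AgentExecutor: the evaluate() target now hands LangSmith the executor's full output dict rather than only the final answer. A rough sketch of that shape, with made-up field values; the comment about what the evaluators consume is an inference from the switch to eval_config.custom_evaluators.

# What agent_executor.invoke(...) returns with return_intermediate_steps=True;
# predict_from_callable passes this dict through to evaluate() unchanged.
result = {
    "question": "what is the value of pi?",        # echoed input
    "instructions": "...",                         # echoed input
    "output": "In this universe, pi is 2.718...",  # final answer
    "intermediate_steps": [
        # (AgentAction, observation) tuples, one per tool call, which is
        # presumably what the custom evaluators inspect
    ],
}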