From 782dc78d094a7121dfb940f3976204661eda08e2 Mon Sep 17 00:00:00 2001 From: isaac hershenson Date: Tue, 16 Jul 2024 11:31:35 -0700 Subject: [PATCH] edits --- langchain_benchmarks/model_registration.py | 22 +- .../tool_usage/tasks/extraction_query.py | 480 +++++++++++++----- scripts/extract_task.py | 150 ++++-- 3 files changed, 492 insertions(+), 160 deletions(-) diff --git a/langchain_benchmarks/model_registration.py b/langchain_benchmarks/model_registration.py index 1edcdb9..9450293 100644 --- a/langchain_benchmarks/model_registration.py +++ b/langchain_benchmarks/model_registration.py @@ -162,9 +162,27 @@ "model": "gpt-4-32k-0314", }, ), + RegisteredModel( + provider="openai", + name="gpt-4o", + description="GPT-4o base model", + type="chat", + params={ + "model": "gpt-4o", + }, + ), ] _FIREWORKS_MODELS = [ + RegisteredModel( + provider="fireworks", + name="firefunction-v2", + type="chat", + description="Fireworks function calling model", + params={ + "model": "accounts/fireworks/models/firefunction-v2", + }, + ), RegisteredModel( provider="fireworks", name="llama-v2-7b-chat-fw", @@ -207,7 +225,9 @@ description="Mistral MoE 8x7B Instruct v0.1 model with Sparse " "Mixture of Experts. Fine tuned for instruction following", type="llm", - params={"model": "accounts/fireworks/models/mixtral-8x7b-instruct"}, + params={ + "model": "accounts/fireworks/models/mixtral-8x7b-instruct" + }, ), ] diff --git a/langchain_benchmarks/tool_usage/tasks/extraction_query.py b/langchain_benchmarks/tool_usage/tasks/extraction_query.py index 286d47a..dbc0aad 100644 --- a/langchain_benchmarks/tool_usage/tasks/extraction_query.py +++ b/langchain_benchmarks/tool_usage/tasks/extraction_query.py @@ -1,64 +1,65 @@ -''' -Notes: - -Model needs to be aware that "today" means 2024-01-01 -Need to provide it with a lot of context about what langchain/smith/graph are used for -''' - -from typing import List, Literal, Optional, Union, cast +from datetime import datetime +from typing import List, Literal, Union, cast -from langchain.tools import BaseTool, tool from langchain.pydantic_v1 import BaseModel, Field -from langchain_benchmarks.schema import ToolUsageEnvironment, ToolUsageTask -from datetime import datetime +from langchain.tools import BaseTool, tool from langchain_core.messages import HumanMessage -from langsmith.schemas import Example, Run -from langsmith.evaluation.evaluator import ( - EvaluationResult, - EvaluationResults, - RunEvaluator, -) - - - +from langchain_benchmarks.schema import ToolUsageEnvironment, ToolUsageTask class DocQuery(BaseModel): """Query against documentation""" - query: str = Field(...,description="The question to answer") - source: Literal["langchain", "langsmith", "langgraph"] = Field(...,description="The documentation source to search against. Should be one of 'langchain', 'langsmith', or " - "'langgraph' depending on which one product the user question pertains to") + query: str = Field(..., description="The question to answer") + source: Literal["langchain", "langsmith", "langgraph"] = Field( + ..., + description="The documentation source to search against. 
Should be one of 'langchain', 'langsmith', or "
+        "'langgraph' depending on which product the user question pertains to",
+    )
+

 class TweetQuery(BaseModel):
     """Query against tweets"""

-    subject: str = Field(...,description="Subject to search for")
-    min_likes: Union[int, None] = Field(None,description="Minimum amount of likes on the tweet")
-    max_likes: Union[int, None] = Field(None,description="Maximum amount of likes on the tweet")
-    start_date: Union[datetime, None] = Field(None, description="Earliest date to start pulling tweets from")
-    end_date: Union[datetime, None] = Field(None,description="Latest date to pull tweets from, None if pulling up to the present")
-    has_link: bool = Field(False,description="Whether to query for tweets that have a link.")
+    subject: str = Field(..., description="Subject to search for")
+    min_likes: Union[int, None] = Field(
+        None, description="Minimum number of likes on the tweet"
+    )
+    max_likes: Union[int, None] = Field(
+        None, description="Maximum number of likes on the tweet"
+    )
+    start_date: Union[datetime, None] = Field(
+        None, description="Earliest date to start pulling tweets from"
+    )
+    end_date: Union[datetime, None] = Field(
+        None,
+        description="Latest date to pull tweets from, None if pulling up to the present",
+    )
+    has_link: bool = Field(
+        False, description="Whether to query for tweets that have a link."
+    )
+

 class BlogQuery(BaseModel):
     """Query against blog posts"""

-    subject: Union[str, None] = Field(...,description="Subject to search for")
-    authors: Union[None, str, list[str]] = Field(None,description="Authors to search for. None if not searching for a speific author, list if searching for more than one.")
-    start_date: Union[datetime, None] = Field(None, description="Earliest date to start pulling blog posts from")
-    end_date: Union[datetime, None] = Field(None,description="Latest date to pull blog posts from")
+    subject: Union[str, None] = Field(..., description="Subject to search for")
+    authors: Union[None, str, list[str]] = Field(
+        None,
+        description="Authors to search for. 
None if not searching for a speific author, list if searching for more than one.", + ) + start_date: Union[datetime, None] = Field( + None, description="Earliest date to start pulling blog posts from" + ) + end_date: Union[datetime, None] = Field( + None, description="Latest date to pull blog posts from" + ) + def get_environment() -> ToolUsageEnvironment: """Create an environment.""" tools = cast( List[BaseTool], - [ - tool(func) - for func in [ - TweetQuery, - DocQuery, - BlogQuery - ] - ], + [tool(func) for func in [TweetQuery, DocQuery, BlogQuery]], ) return ToolUsageEnvironment( tools=tools, @@ -68,132 +69,360 @@ def get_environment() -> ToolUsageEnvironment: DOC_DATASET = [ { - "question":[HumanMessage("How do I use the langgraph Send method?")], - "tool_calls":[{'name': 'DocQuery', - 'args': {'query': 'Send method','source':'langgraph'}}] + "question": [HumanMessage("How do I use the langgraph Send method?")], + "tool_calls": [ + { + "name": "DocQuery", + "args": {"query": "Send method", "source": "langgraph"}, + } + ], }, { - "question":[HumanMessage("How do you chain a prompt with a model?")], - "tool_calls":[{'name': 'DocQuery', - 'args': {'query': 'chaining prompt and model','source':'langchain'}}] + "question": [HumanMessage("How do you chain a prompt with a model?")], + "tool_calls": [ + { + "name": "DocQuery", + "args": {"query": "chaining prompt and model", "source": "langchain"}, + } + ], }, { - "question":[HumanMessage("How do you run a pairwise experiment in langsmith?")], - "tool_calls":[{'name': 'DocQuery', - 'args': {'query': 'pairwise experiment','source':'langsmith'}}] + "question": [ + HumanMessage("How do you run a pairwise experiment in langsmith?") + ], + "tool_calls": [ + { + "name": "DocQuery", + "args": {"query": "pairwise experiment", "source": "langsmith"}, + } + ], }, { - "question":[HumanMessage("What is a tool node?")], - "tool_calls":[{'name': 'DocQuery', - 'args': {'query': 'tool node','source':'langgraph'}}] + "question": [HumanMessage("What is a tool node?")], + "tool_calls": [ + {"name": "DocQuery", "args": {"query": "tool node", "source": "langgraph"}} + ], }, { - "question":[HumanMessage("How do I get the log probabilities of my chat model?")], - "tool_calls":[{'name': 'DocQuery', - 'args': {'query': 'log probabilities','source':'langchain'}}] + "question": [ + HumanMessage("How do I get the log probabilities of my chat model?") + ], + "tool_calls": [ + { + "name": "DocQuery", + "args": {"query": "log probabilities", "source": "langchain"}, + } + ], }, { - "question":[HumanMessage("How can I build my own custom evaluator?")], - "tool_calls":[{'name': 'DocQuery', - 'args': {'query': 'custom evaluator','source':'langsmith'}}] + "question": [HumanMessage("How can I build my own custom evaluator?")], + "tool_calls": [ + { + "name": "DocQuery", + "args": {"query": "custom evaluator", "source": "langsmith"}, + } + ], }, { - "question":[HumanMessage("How do I use a tool in a routing function?")], - "tool_calls":[{'name': 'DocQuery', - 'args': {'query': 'tool in routing function','source':'langgraph'}}] + "question": [HumanMessage("How do I use a tool in a routing function?")], + "tool_calls": [ + { + "name": "DocQuery", + "args": {"query": "tool in routing function", "source": "langgraph"}, + } + ], }, { - "question":[HumanMessage("How do use Pinecone as a vectorstore for few shot prompting?")], - "tool_calls":[{'name': 'DocQuery', - 'args': {'query': 'Pinecone for few shot prompting','source':'langchain'}}] + "question": [ + HumanMessage("How do use 
Pinecone as a vectorstore for few shot prompting?") + ], + "tool_calls": [ + { + "name": "DocQuery", + "args": { + "query": "Pinecone for few shot prompting", + "source": "langchain", + }, + } + ], }, { - "question":[HumanMessage("How do I prevent personal data from being logged in my traces?")], - "tool_calls":[{'name': 'DocQuery', - 'args': {'query': 'personal data logging','source':'langsmith'}}] + "question": [ + HumanMessage( + "How do I prevent personal data from being logged in my traces?" + ) + ], + "tool_calls": [ + { + "name": "DocQuery", + "args": {"query": "personal data logging", "source": "langsmith"}, + } + ], }, { - "question":[HumanMessage("How do you use a nested graph? Can you stream messages from inside them?")], - "tool_calls":[{'name': 'DocQuery', - 'args': {'query': 'nested graph','source':'langgraph'}}, - {'name': 'DocQuery', - 'args': {'query': 'stream messages nested graph','source':'langgraph'}}] + "question": [ + HumanMessage( + "How do you use a nested graph? Can you stream messages from inside them?" + ) + ], + "tool_calls": [ + { + "name": "DocQuery", + "args": {"query": "nested graph", "source": "langgraph"}, + }, + { + "name": "DocQuery", + "args": { + "query": "stream messages nested graph", + "source": "langgraph", + }, + }, + ], }, { - "question":[HumanMessage("How do I extract text from PDF data for my retrieval chain? Can I combine image and text in a prompt?")], - "tool_calls":[{'name': 'DocQuery', - 'args': {'query': 'PDF extraction for chain','source':'langchain'}}, - {'name': 'DocQuery', - 'args': {'query': 'multimodal prompt','source':'langchain'}}] + "question": [ + HumanMessage( + "How do I extract text from PDF data for my retrieval chain? Can I combine image and text in a prompt?" + ) + ], + "tool_calls": [ + { + "name": "DocQuery", + "args": {"query": "PDF extraction for chain", "source": "langchain"}, + }, + { + "name": "DocQuery", + "args": {"query": "multimodal prompt", "source": "langchain"}, + }, + ], }, { - "question":[HumanMessage("How do I setup automation rules for my traces? How do I view logs for those rules?")], - "tool_calls":[{'name': 'DocQuery', - 'args': {'query': 'automation rules for traces','source':'langsmith'}}, - {'name': 'DocQuery', - 'args': {'query': 'automation rules logs','source':'langsmith'}}] + "question": [ + HumanMessage( + "How do I setup automation rules for my traces? How do I view logs for those rules?" + ) + ], + "tool_calls": [ + { + "name": "DocQuery", + "args": {"query": "automation rules for traces", "source": "langsmith"}, + }, + { + "name": "DocQuery", + "args": {"query": "automation rules logs", "source": "langsmith"}, + }, + ], }, ] TWEET_DATASET = [ { - "question":[HumanMessage("Did we have any tweets about agents with more than 1000 likes that also included a link?")], - "tool_calls":[{'name': 'TweetQuery', - 'args': {'subject': 'agents','min_likes':1000,'max_likes':None,"start_date":None,"end_date":None,"has_link":True}}] + "question": [ + HumanMessage( + "Did we have any tweets about agents with more than 1000 likes that also included a link?" 
+ ) + ], + "tool_calls": [ + { + "name": "TweetQuery", + "args": { + "subject": "agents", + "min_likes": 1000, + "max_likes": None, + "start_date": None, + "end_date": None, + "has_link": True, + }, + } + ], }, { - "question":[HumanMessage("Are there any tweets about evaluators by langchain with less than 100 likes?")], - "tool_calls":[{'name': 'TweetQuery', - 'args': {'subject': 'evaluators','min_likes':None,'max_likes':100,"start_date":None,"end_date":None,"has_link":False}}] + "question": [ + HumanMessage( + "Are there any tweets about evaluators by langchain with less than 100 likes?" + ) + ], + "tool_calls": [ + { + "name": "TweetQuery", + "args": { + "subject": "evaluators", + "min_likes": None, + "max_likes": 100, + "start_date": None, + "end_date": None, + "has_link": False, + }, + } + ], }, { - "question":[HumanMessage("Are there any tweets that link to the anthropic website in the last year?")], - "tool_calls":[{'name': 'TweetQuery', - 'args': {'subject': 'anthropic','min_likes':None,'max_likes':None,"start_date":datetime(2023,1,1),"end_date":None,"has_link":True}}] + "question": [ + HumanMessage( + "Are there any tweets that link to the anthropic website in the last year?" + ) + ], + "tool_calls": [ + { + "name": "TweetQuery", + "args": { + "subject": "anthropic", + "min_likes": None, + "max_likes": None, + "start_date": datetime(2023, 1, 1), + "end_date": None, + "has_link": True, + }, + } + ], }, { - "question":[HumanMessage("In Q2 2023 did we tweet anything about LangSmith?")], - "tool_calls":[{'name': 'TweetQuery', - 'args': {'subject': 'LangSmith','min_likes':None,'max_likes':None,"start_date":datetime(2023,3,1),"end_date":datetime(2023,6,1),"has_link":False}}] + "question": [HumanMessage("In Q2 2023 did we tweet anything about LangSmith?")], + "tool_calls": [ + { + "name": "TweetQuery", + "args": { + "subject": "LangSmith", + "min_likes": None, + "max_likes": None, + "start_date": datetime(2023, 3, 1), + "end_date": datetime(2023, 6, 1), + "has_link": False, + }, + } + ], }, { - "question":[HumanMessage("Were there any social media posts with triple digit likes about few shot prompting?")], - "tool_calls":[{'name': 'TweetQuery', - 'args': {'subject': 'few shot prompting','min_likes':100,'max_likes':999,"start_date":None,"end_date":None,"has_link":False}}] + "question": [ + HumanMessage( + "Were there any social media posts with triple digit likes about few shot prompting?" + ) + ], + "tool_calls": [ + { + "name": "TweetQuery", + "args": { + "subject": "few shot prompting", + "min_likes": 100, + "max_likes": 999, + "start_date": None, + "end_date": None, + "has_link": False, + }, + } + ], }, { - "question":[HumanMessage("Are there any posts aout LangServe before June 2023 that have more than 2000 likes and include a link?")], - "tool_calls":[{'name': 'TweetQuery', - 'args': {'subject': 'LangServe','min_likes':2000,'max_likes':None,"start_date":None,"end_date":datetime(2023,5,31),"has_link":True}}] + "question": [ + HumanMessage( + "Are there any posts aout LangServe before June 2023 that have more than 2000 likes and include a link?" 
+ ) + ], + "tool_calls": [ + { + "name": "TweetQuery", + "args": { + "subject": "LangServe", + "min_likes": 2000, + "max_likes": None, + "start_date": None, + "end_date": datetime(2023, 5, 31), + "has_link": True, + }, + } + ], }, ] BLOG_DATASET = [ { - "question":[HumanMessage("what are some blog posts in the past year about agents?")], - "tool_calls":[{'name': 'BlogQuery', - 'args': {'subject': 'agents','authors':None,"start_date":datetime(2023,1,1),"end_date":None}}] + "question": [ + HumanMessage("what are some blog posts in the past year about agents?") + ], + "tool_calls": [ + { + "name": "BlogQuery", + "args": { + "subject": "agents", + "authors": None, + "start_date": datetime(2023, 1, 1), + "end_date": None, + }, + } + ], }, { - "question":[HumanMessage("how many blogs mentioned chat-gpt in the month after October 2023?")], - "tool_calls":[{'name': 'BlogQuery', - 'args': {'subject': 'chat gpt','authors':None,"start_date":datetime(2023,11,1),"end_date":datetime(2023,11,30)}}] + "question": [ + HumanMessage( + "how many blogs mentioned chat-gpt in the month after October 2023?" + ) + ], + "tool_calls": [ + { + "name": "BlogQuery", + "args": { + "subject": "chat gpt", + "authors": None, + "start_date": datetime(2023, 11, 1), + "end_date": datetime(2023, 11, 30), + }, + } + ], }, { - "question":[HumanMessage("what has Bagatur written about universal configurable models?")], - "tool_calls":[{'name': 'BlogQuery', - 'args': {'subject': 'universal configurable model','authors':"Bagatur","start_date":None,"end_date":None}}] + "question": [ + HumanMessage( + "what has Bagatur written about universal configurable models?" + ) + ], + "tool_calls": [ + { + "name": "BlogQuery", + "args": { + "subject": "universal configurable model", + "authors": "Bagatur", + "start_date": None, + "end_date": None, + }, + } + ], }, { - "question":[HumanMessage("Have Harrison or Bagatur written anything about passing in runnables as tools in the last week?")], - "tool_calls":[{'name': 'BlogQuery', - 'args': {'subject': 'runnables as tools','authors':["Harrison","Bagatur"],"start_date":datetime(2023,12,24),"end_date":None}}] + "question": [ + HumanMessage( + "Have Harrison or Bagatur written anything about passing in runnables as tools in the last week?" + ) + ], + "tool_calls": [ + { + "name": "BlogQuery", + "args": { + "subject": "runnables as tools", + "authors": ["Harrison", "Bagatur"], + "start_date": datetime(2023, 12, 24), + "end_date": None, + }, + } + ], }, { - "question":[HumanMessage("Did Harrison write anything about LangGraph in the time frame up to the end of Q1 2023?")], - "tool_calls":[{'name': 'BlogQuery', - 'args': {'subject': 'LangGraph','authors':"Harrison","start_date":None,"end_date":datetime(2023,3,1)}}] - } + "question": [ + HumanMessage( + "Did Harrison write anything about LangGraph in the time frame up to the end of Q1 2023?" + ) + ], + "tool_calls": [ + { + "name": "BlogQuery", + "args": { + "subject": "LangGraph", + "authors": "Harrison", + "start_date": None, + "end_date": datetime(2023, 3, 1), + }, + } + ], + }, ] DATASET = DOC_DATASET + TWEET_DATASET + BLOG_DATASET @@ -202,8 +431,14 @@ def get_environment() -> ToolUsageEnvironment: name="Extraction Task", dataset_id="https://smith.langchain.com/public/594f9f60-30a0-49bf-b075-f44beabf546a/d", create_environment=get_environment, - instructions=( - "You are requested to solve math questions in an alternate" + instructions=(""" + You are requested to generate queries for searching either through tweets, docs, or blog entries. 
+    Inside the docs there are three different sources that you may wish to query: LangGraph, LangSmith, or LangChain.
+    LangGraph is a library for building stateful, multi-actor applications with LLMs, used to create agent and multi-agent workflows.
+    LangSmith is an all-in-one developer platform for every step of the LLM-powered application lifecycle.
+    It helps you debug, evaluate, test, and monitor your LLM applications.
+    LangChain is a framework for building with LLMs by chaining interoperable components.
+    Finally, some questions will ask about date ranges; remember that today's date is 2024-01-01.
+    """
     ),
     description=(
         """\
@@ -221,6 +456,7 @@ def get_environment() -> ToolUsageEnvironment:
     },
 )
 
+
 def _create_dataset() -> None:
     """Create a dataset with the langsmith client."""
     from langsmith.client import Client
@@ -238,8 +474,4 @@ def _create_dataset() -> None:
             "reference": example["tool_calls"],
         },
         dataset_id=dataset_id,
-    )
-
-if __name__=="__main__":
-    #_create_dataset()
-    pass
\ No newline at end of file
+    )
\ No newline at end of file
diff --git a/scripts/extract_task.py b/scripts/extract_task.py
index 029da94..0f83f6a 100644
--- a/scripts/extract_task.py
+++ b/scripts/extract_task.py
@@ -1,30 +1,57 @@
-from langsmith.evaluation import evaluate
-from langchain_benchmarks.tool_usage.tasks.extraction_query import *
-from langchain_openai import ChatOpenAI
-from langchain_core.prompts import ChatPromptTemplate
-from langchain_core.output_parsers import StrOutputParser
 import json
+from langchain_benchmarks.tool_usage.tasks.extraction_query import *
+from datetime import datetime
+from langchain_core.output_parsers import StrOutputParser
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_openai import ChatOpenAI
+from langsmith.evaluation import evaluate
+from langchain_benchmarks.model_registration import model_registry
+from langsmith.evaluation.evaluator import (
+    EvaluationResult,
+    EvaluationResults,
+)
+from langsmith.schemas import Example, Run
+from typing import Optional
+
+
 llm = ChatOpenAI(model="gpt-4o")
+# TODO: system prompt
 llm_judge = ChatOpenAI(model="gpt-4o")
-judge_prompt = ChatPromptTemplate.from_messages([
-    ("system","You are an llm tasked with determining if the subject extracted by another LLM is an accurate "
-    "representation of the correct answer. You are to check for general semantic similarity since the words might not "
-    "match up perfectly but the meaning might still be the same. Return YES if the answers match, and NO otherwise. "
-    "Never return anything other than YES or NO."),
-    ("human","Is this query: {run_query} very similar to this reference query: {reference_query}")
-])
+judge_prompt = ChatPromptTemplate.from_messages(
+    [
+        (
+            "system",
+            "You are an LLM tasked with determining if the subject extracted by another LLM is an accurate "
+            "representation of the correct answer. You are to check for general semantic similarity since the words might not "
+            "match up perfectly but the meaning might still be the same. Return YES if the answers match, and NO otherwise. 
" + "Never return anything other than YES or NO.", + ), + ( + "human", + "Is this query: {run_query} very similar to this reference query: {reference_query}", + ), + ] +) judge_chain = judge_prompt | llm_judge | StrOutputParser() -tools = [DocQuery,TweetQuery,BlogQuery] +tools = [DocQuery, TweetQuery, BlogQuery] llm = llm.bind_tools(tools) def compare_outputs(run_outputs: dict, example_outputs: dict) -> EvaluationResults: # Chose the correct tool - correct_tool_score = int([tool['name'] for tool in example_outputs['reference']] == [tool['function']["name"] for tool in run_outputs['response'].additional_kwargs['tool_calls']]) + correct_tool_score = int( + [tool["name"] for tool in example_outputs["reference"]] + == [ + tool["function"]["name"] + for tool in run_outputs["response"].additional_kwargs["tool_calls"] + ] + ) # Has the correct determenistic args determinstic_score = 0 @@ -33,18 +60,44 @@ def compare_outputs(run_outputs: dict, example_outputs: dict) -> EvaluationResul if correct_tool_score == 1: determinstic_score, underministic_score = 1, 1 - for tool in example_outputs['reference']: - corresponding_response_tool = json.loads([t['function'] for t in run_outputs['response'].additional_kwargs['tool_calls'] if t['function']["name"]==tool["name"]][0]['arguments']) - for arg in tool['args']: - if arg in ['query','subject']: - ans = judge_chain.invoke({"run_query":corresponding_response_tool[arg],"reference_query":tool['args'][arg]}) + for tool in example_outputs["reference"]: + corresponding_response_tool = json.loads( + [ + t["function"] + for t in run_outputs["response"].additional_kwargs["tool_calls"] + if t["function"]["name"] == tool["name"] + ][0]["arguments"] + ) + for arg in tool["args"]: + if arg in ["query", "subject"]: + ans = judge_chain.invoke( + { + "run_query": corresponding_response_tool[arg], + "reference_query": tool["args"][arg], + } + ) underministic_score = 1 if ans == "YES" else 0 else: - if (tool['args'][arg] and arg not in corresponding_response_tool) or (tool['args'][arg] and not (tool['args'][arg] == corresponding_response_tool[arg]) and \ - not (isinstance(tool['args'][arg],datetime) and datetime.fromisoformat((corresponding_response_tool[arg])).replace(tzinfo=None) == tool['args'][arg])): + if ( + tool["args"][arg] and arg not in corresponding_response_tool + ) or ( + tool["args"][arg] + and not (tool["args"][arg] == corresponding_response_tool[arg]) + and not ( + isinstance(tool["args"][arg], datetime) + and datetime.fromisoformat( + (corresponding_response_tool[arg]) + ).replace(tzinfo=None) + == tool["args"][arg] + ) + ): determinstic_score = 0 # Overall correctness - overall_score = int(bool(correct_tool_score) and bool(determinstic_score) and bool(underministic_score)) + overall_score = int( + bool(correct_tool_score) + and bool(determinstic_score) + and bool(underministic_score) + ) results = [ EvaluationResult( key="Correct tool", @@ -61,24 +114,51 @@ def compare_outputs(run_outputs: dict, example_outputs: dict) -> EvaluationResul EvaluationResult( key="Overall correctness", score=overall_score, - ) + ), ] - return {"results":results} + return {"results": results} + -def evaluate_run( - run: Run, example: Optional[Example] = None - ) -> EvaluationResults: - return compare_outputs( - run.outputs, - example.outputs) +def evaluate_run(run: Run, example: Optional[Example] = None) -> EvaluationResults: + return compare_outputs(run.outputs, example.outputs) -def predict(example: dict): - return {"response":llm.invoke(example['question'])} +instructions = 
"" +ChatPromptTemplate.from_messages( + [ + ("system", "{instructions}"), + ("human", "{input}"), + ] +) + +prompt = ChatPromptTemplate.from_messages( + [ + ("system", "{instructions}"), + ("human", "{input}"), + ] +) +def predict_for_model(model,instructions): + chain = prompt | model.bind_tools(tools) + def predict(example: dict): + return {"response": chain.invoke({"input":example["question"],"instructions":instructions})} + return predict +''' experiment_results = evaluate( - predict, + predict, + data=EXTRACTION_TASK.name, + evaluators=[evaluate_run], + experiment_prefix="test-single-tool", +)''' + +models = ["gpt-4o", "firefunction-v2"] + +for model_name in models: + model = model_registry[model_name].get_model() + evaluate( + predict_for_model(model,EXTRACTION_TASK.instructions), data=EXTRACTION_TASK.name, evaluators=[evaluate_run], - experiment_prefix="test-single-tool", - ) \ No newline at end of file + experiment_prefix=f"test-{model_name}", + ) +